A dataset of movie reviews, each labeled as either positive or negative.
# Install the course helper library (provides the `shapes` utility) from GitHub.
!pip install git+https://github.com/netbrainml/nbml.git
from nbml.tools import *
from IPython.display import clear_output
clear_output()
from keras.datasets import imdb
import numpy as np
# Workaround: newer numpy defaults np.load to allow_pickle=False, which breaks
# keras' imdb.load_data. Temporarily patch np.load, load, then restore it.
old = np.load
np.load = lambda *a,**k: old(*a,**k,allow_pickle=True)
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=1000) #This gets the most common 1000 words
np.load = old
del(old)
# Print the array shapes (helper imported from nbml.tools).
shapes(X_train, y_train, X_test, y_test)
For NLP (Natural Language Processing), we need a vocabulary that maps words to integer indices, and keras provides one for us. However, we also need to reserve a few special tokens — for example, one marking the start of a review and one for unknown words. This is common practice.
# Word -> index vocabulary from keras, with every index shifted up by 3 so
# the first four slots are free for the special tokens added below.
dictv = {word: index + 3 for word, index in imdb.get_word_index().items()}
dictv.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})
dictv
We need a way to decode reviews back into text, so we invert the dictionary.
# Index -> word lookup table: the inverse mapping of dictv.
reverse_word_index = {index: word for word, index in dictv.items()}

def decode_review(text):
    """Translate a sequence of word indices back into a readable string."""
    words = (reverse_word_index.get(index, '?') for index in text)
    return ' '.join(words)

decode_review(X_train[0]), y_train[0]
We also need to pad our sentences with the pad token so that they all have the same length.
from keras.preprocessing import sequence

# Post-pad every review with <PAD> out to the length of the longest
# training review, so all sequences share one length.
pad_ = max(len(review) for review in X_train)
X_train_pd = sequence.pad_sequences(X_train, value=dictv["<PAD>"],
                                    maxlen=pad_, padding='post')
X_test_pd = sequence.pad_sequences(X_test, value=dictv["<PAD>"],
                                   maxlen=pad_, padding='post')
shapes(X_train_pd, X_test_pd)
Each sequence is too long. For now, we slice each sentence into smaller sequences of a fixed length.
from tqdm import tqdm

def sliceton(X, Y, n):
    """Slice each review in X into consecutive chunks of length n.

    The trailing (short) chunk of each review is post-padded with <PAD>
    to exactly n tokens. Every chunk inherits the label of the review it
    came from.

    Args:
        X: iterable of reviews, each a sequence of word indices.
        Y: labels, one per review in X.
        n: chunk length.

    Returns:
        (chunks, labels): a 2-D array of shape (num_chunks, n) and a 1-D
        float array with one label per chunk.
    """
    out = []
    # Accumulate labels in a plain list: the original np.append-in-a-loop
    # reallocates the whole array each time, which is quadratic overall.
    ys = []
    for i, review in enumerate(tqdm(X)):
        for idx in range(0, len(review), n):
            chunk = review[idx:idx + n]
            # Same boundary condition as before: the final chunk (even a
            # full-length one) goes through the padding path, then we stop.
            if idx + n > len(review) - 1:
                chunk = sequence.pad_sequences(
                    np.array(chunk)[None, :],
                    value=dictv["<PAD>"], maxlen=n,
                    padding='post').squeeze()
                out.append(np.array(chunk))
                ys.append(Y[i])
                break
            out.append(chunk)
            ys.append(Y[i])
    # dtype=float matches the original np.array([]) accumulator (float64).
    return np.array(out), np.array(ys, dtype=float)
# Slice both splits into 128-token chunks; each chunk keeps its review's label.
X_train_pds, y_train_pds = sliceton(X_train, y_train, 128)
X_test_pds, y_test_pds = sliceton(X_test, y_test, 128)
shapes(X_train_pds, y_train_pds, X_test_pds, y_test_pds)
from tensorflow import keras

# Baseline: a plain RNN over the raw (un-embedded) index values. The added
# axis makes each input (1, 128), i.e. one timestep of 128 features.
rnn_model = keras.Sequential()
rnn_model.add(keras.layers.SimpleRNN(64))
rnn_model.add(keras.layers.Dense(1, activation='sigmoid'))
rnn_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
rnn_model.fit(X_train_pds[:, None].astype(float), y_train_pds,
              validation_data=(X_test_pds[:, None].astype(float), y_test_pds),
              epochs=3, batch_size=2048)
An embedding layer learns a matrix that maps and encodes words as real-valued vectors, where similarity
between words in terms of meaning translates to closeness in the vector space.
So each word is now represented by a vector whose values are learnable.
evl = 500  # embedding vector length

# Same SimpleRNN, but with a learned embedding in front: indices 0..999
# are mapped to evl-dimensional vectors before the recurrent layer.
rnnE_model = keras.Sequential()
rnnE_model.add(keras.layers.Embedding(1000, evl, input_length=128))
rnnE_model.add(keras.layers.SimpleRNN(64))
rnnE_model.add(keras.layers.Dense(1, activation='sigmoid'))
rnnE_model.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
rnnE_model.fit(X_train_pds, y_train_pds,
               validation_data=(X_test_pds, y_test_pds),
               epochs=5, batch_size=2048)
from tensorflow import keras

evl = 500  # embedding vector length

# Same architecture as the embedded RNN, with the recurrent layer
# swapped out for an LSTM.
lstm_model = keras.Sequential()
lstm_model.add(keras.layers.Embedding(1000, evl, input_length=128))
lstm_model.add(keras.layers.LSTM(64))
lstm_model.add(keras.layers.Dense(1, activation='sigmoid'))
lstm_model.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
lstm_model.fit(X_train_pds, y_train_pds,
               validation_data=(X_test_pds, y_test_pds),
               epochs=5, batch_size=2048)
# Same architecture again, with a GRU as the recurrent layer.
gru_model = keras.Sequential()
gru_model.add(keras.layers.Embedding(1000, evl, input_length=128))
gru_model.add(keras.layers.GRU(64))
gru_model.add(keras.layers.Dense(1, activation='sigmoid'))
gru_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
gru_model.fit(X_train_pds, y_train_pds,
              validation_data=(X_test_pds, y_test_pds),
              epochs=5, batch_size=2048)
# Two held-out, hand-picked reviews for a qualitative sanity check:
# a positive one (Titanic) and a negative one (Gotti).
titanic_review = """James Cameron's 'Titanic' shares a similar motto to Marmite,
"you either love it or hate it", I for one love this film, yes
I know it's got a drawn out romance story, but there's just
something about the 3-hour fill of the film that makes its such a
spectacularly emotional and beautiful movie. I saw this a lot when
I was growing up, this was one of the films of my childhood, it is
truly a powerfully resonant and visually stunning movie of epic proportions.
Personally I favour the British original 'A Night to Remember', but this is a
pretty close contender. Winner of 11 Oscars, James Cameron's romantic-disaster
epic is a triumph of cinema that boasts perfect chemistry between Kate and Leo as
the lovers bound for tragedy. Many people disregard this film nowadays solely
because it's become the most popular film ever made alongside Cameron's other epic
'Avatar', and whilst 'Titanic' is definitely not one of my favourite films, it's
just so powerfully amazing and no doubt at all it has once brought a tear to everyone's
eyes. The main aspect I love in this film is James Horner's haunting score that was a key
ingredient in the film's success, it is simply perfect, too bad Celine Dion had to close
this on her awful pop version. Nonetheless, 'Titanic' is a modern classic and a
beautifully spectacular film that will live on."""
gotti_review = """I'd rather wake up next to a severed horse head than ever watch 'Gotti' again.
The worst movie of the year so far, the long-awaited biopic about the Gambino crime boss' rise from
made man to top dog took four directors, 44 producers and eight years to make. It shows. The finished
product belongs in a cement bucket at the bottom of the river."""
def getSent(review, model, num_words=1000, unk=2):
    """Predict the sentiment of a raw-text review with a trained model.

    The review is split on any whitespace (the sample reviews span
    multiple lines, so splitting only on " " left newline-joined tokens),
    then each token is lowercased and stripped of surrounding punctuation
    before the vocabulary lookup -- the keras word index appears to be
    lowercase and unpunctuated (TODO: confirm against
    imdb.get_word_index()). Words missing from the vocabulary, or whose
    index is >= num_words, are mapped to the <UNK> index: the models were
    trained with num_words=1000, so their Embedding layers only have 1000
    rows and a larger index would be out of range.

    Args:
        review: raw review text.
        model: trained keras model expecting a (1, 128) batch of indices.
        num_words: vocabulary size the model was trained with (default 1000).
        unk: index of the <UNK> token (default 2, set up earlier in the file).

    Side effects:
        Prints the padded index array, its shape, and the model prediction.
    """
    import string  # local import: keeps this notebook cell self-contained

    review_ar = []
    for token in review.split():
        word = token.strip(string.punctuation).lower()
        if not word:
            continue
        val = dictv.get(word, unk)
        if val >= num_words:
            # Out-of-vocabulary for the trained Embedding; treat as unknown.
            val = unk
        review_ar.append(val)
    review_ar = sequence.pad_sequences(np.array(review_ar)[None, :],
                                       maxlen=128, padding='post')
    print(review_ar, review_ar.shape)
    print(model.predict(review_ar))
# Qualitative check on all three models: the Titanic review should score
# near 1 (positive) and the Gotti review near 0 (negative).
getSent(titanic_review,rnnE_model), getSent(gotti_review,rnnE_model)
getSent(titanic_review,lstm_model), getSent(gotti_review,lstm_model)
getSent(titanic_review,gru_model), getSent(gotti_review,gru_model)