Working with Keras on the IMDB Dataset

A dataset of movie reviews, each labeled as positive or negative (25,000 reviews for training and 25,000 for testing).


In [1]:
!pip install git+https://github.com/netbrainml/nbml.git
from nbml.tools import *
from IPython.display import clear_output
clear_output()
In [2]:
from keras.datasets import imdb
import numpy as np

# Recent NumPy versions default np.load to allow_pickle=False, which makes this version of
# imdb.load_data fail, so temporarily patch np.load to allow pickled arrays.
old = np.load
np.load = lambda *a, **k: old(*a, **k, allow_pickle=True)

# Keep only the 1,000 most frequent words; rarer words become the unknown token.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=1000)

# Restore the original np.load.
np.load = old
del(old)

shapes(X_train, y_train, X_test, y_test)
Using TensorFlow backend.
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
17465344/17464789 [==============================] - 2s 0us/step
arg_0: (25000,)
arg_1: (25000,)
arg_2: (25000,)
arg_3: (25000,)

For NLP (Natural Language Processing), we need a vocabulary that maps words to integer indices, and Keras provides the IMDB word index directly. We also reserve a few special tokens for padding, the start of a review, and unknown words, which is common practice; shifting every index up by 3 matches the offset that imdb.load_data applies by default.

In [3]:
dictv = imdb.get_word_index()
dictv = {k:(v+3) for k,v in dictv.items()}
dictv["<PAD>"] = 0
dictv["<START>"] = 1
dictv["<UNK>"] = 2
dictv["<UNUSED>"] = 3
dictv
Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 1s 1us/step
Out[3]:
{'fawn': 34704,
 'tsukino': 52009,
 'nunnery': 52010,
 'sonja': 16819,
 'vani': 63954,
 'woods': 1411,
 'spiders': 16118,
 'hanging': 2348,
 'woody': 2292,
 'trawling': 52011,
 "hold's": 52012,
 'comically': 11310,
 ...}
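To make the mapping concrete, here is a small sketch of how a raw sentence could be encoded with this word index. It is not part of the original notebook: the helper name encode_review is illustrative, and it follows the same defaults imdb.load_data uses (a <START> token first, <UNK> for unknown words, and only the 1,000 most frequent words kept).

# Illustrative helper (assumption): encode a raw sentence with the shifted word index above.
def encode_review(text, word_index=dictv, num_words=1000):
    tokens = [word_index["<START>"]]
    for w in text.lower().split():
        idx = word_index.get(w, word_index["<UNK>"])
        # Words outside the top num_words are also mapped to <UNK>, mirroring num_words=1000 above.
        tokens.append(idx if idx < num_words else word_index["<UNK>"])
    return tokens

encode_review("this film was just brilliant")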

To turn the integer sequences back into readable text, we invert the dictionary so that it maps indices back to words.

In [4]:
reverse_word_index = dict([(value, key) for (key, value) in dictv.items()])
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
decode_review(X_train[0]), y_train[0]
Out[4]:
("<START> this film was just brilliant casting <UNK> <UNK> story direction <UNK> really <UNK> the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same <UNK> <UNK> as myself so i loved the fact there was a real <UNK> with this film the <UNK> <UNK> throughout the film were great it was just brilliant so much that i <UNK> the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the <UNK> <UNK> was amazing really <UNK> at the end it was so sad and you know what they say if you <UNK> at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of <UNK> and paul they were just brilliant children are often left out of the <UNK> <UNK> i think because the stars that play them all <UNK> up are such a big <UNK> for the whole film but these children are amazing and should be <UNK> for what they have done don't you think the whole story was so <UNK> because it was true and was <UNK> life after all that was <UNK> with us all",
 1)

We also need to pad the sequences with the <PAD> token so that they all have the same length.

In [5]:
from keras.preprocessing import sequence

pad_ = max([len(x) for x in X_train])  # length of the longest training review
X_train_pd = sequence.pad_sequences(X_train, value=dictv["<PAD>"], maxlen=pad_, padding='post')
X_test_pd = sequence.pad_sequences(X_test, value=dictv["<PAD>"], maxlen=pad_, padding='post')

shapes(X_train_pd, X_test_pd)
arg_0: (25000, 2494)
arg_1: (25000, 2494)

Padding every review to the longest one makes each sequence 2,494 tokens long, which is unwieldy. For now, we slice each review into chunks of 128 tokens, pad the final chunk, and give every chunk the label of the review it came from.

In [6]:
from tqdm import tqdm

def sliceton(X, Y, n):
    """Slice each review in X into chunks of length n; every chunk keeps its review's label."""
    out = []
    ys = np.array([])
    for i, review in enumerate(tqdm(X)):
        for idx in range(0, len(review), n):
            if idx + n > len(review) - 1:
                # Last (possibly short) chunk: pad it up to length n with <PAD>.
                out.append(np.array(sequence.pad_sequences(np.array(review[idx: idx+n])[None, :],
                                                           value=dictv["<PAD>"], maxlen=n,
                                                           padding='post')).squeeze())
                ys = np.append(ys, Y[i])
                break
            # Full-length chunk: keep it as-is.
            out.append(review[idx: idx+n])
            ys = np.append(ys, Y[i])
    return np.array(out), ys

X_train_pds, y_train_pds = sliceton(X_train, y_train, 128)
X_test_pds, y_test_pds = sliceton(X_test, y_test, 128)

shapes(X_train_pds, y_train_pds, X_test_pds, y_test_pds)
100%|██████████| 25000/25000 [00:02<00:00, 10752.65it/s]
100%|██████████| 25000/25000 [00:02<00:00, 12340.60it/s]
arg_0: (59324, 128)
arg_1: (59324,)
arg_2: (57755, 128)
arg_3: (57755,)

Recurrent Network

As a baseline, we first feed the raw integer chunks into a SimpleRNN with no embedding layer.

In [7]:
from tensorflow import keras
rnn_model = keras.Sequential([
    keras.layers.SimpleRNN(64),
    keras.layers.Dense(1, activation='sigmoid')])

rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The [:, None] reshape gives the data shape (samples, 1, 128): each chunk of 128 raw word
# indices is treated as a single timestep of 128 "features", with no word representation at all.
rnn_model.fit(X_train_pds[:, None].astype(float), y_train_pds,
              validation_data=(X_test_pds[:, None].astype(float), y_test_pds),
              epochs=3, batch_size=2048)
Train on 59324 samples, validate on 57755 samples
Epoch 1/3
59324/59324 [==============================] - 1s 15us/sample - loss: 0.8330 - acc: 0.5004 - val_loss: 0.8049 - val_acc: 0.5014
Epoch 2/3
59324/59324 [==============================] - 0s 4us/sample - loss: 0.7803 - acc: 0.5041 - val_loss: 0.7695 - val_acc: 0.5008
Epoch 3/3
59324/59324 [==============================] - 0s 5us/sample - loss: 0.7490 - acc: 0.5078 - val_loss: 0.7493 - val_acc: 0.4988
Out[7]:
<tensorflow.python.keras.callbacks.History at 0x7fe9b0b63780>

What are Embeddings?

The plain RNN on raw word indices barely beats chance (about 50% accuracy) because the integer index assigned to a word carries no information about its meaning. An embedding layer learns a matrix that maps each word index to a real-valued vector, so that words that are similar in meaning end up close together in the vector space.

So now each word is represented by a vector whose values are learned during training.
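As a minimal sketch of what this means (not from the notebook; the shapes shown assume TensorFlow with eager execution), an Embedding layer is just a trainable lookup table of shape (vocab_size, embedding_dim) indexed by word id:

import numpy as np
from tensorflow import keras

emb = keras.layers.Embedding(input_dim=1000, output_dim=8)  # 1,000 words -> 8-d vectors
vectors = emb(np.array([[1, 14, 22]]))                      # a batch holding one 3-word "review"
print(vectors.shape)                                        # (1, 3, 8): one learnable vector per word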


In [8]:
evl = 500  # embedding vector length: the dimension of each learned word vector
rnnE_model = keras.Sequential([
    keras.layers.Embedding(1000, evl, input_length=128),
    keras.layers.SimpleRNN(64),
    keras.layers.Dense(1, activation='sigmoid')])

rnnE_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

rnnE_model.fit(X_train_pds, y_train_pds, validation_data=(X_test_pds, y_test_pds), epochs=5, batch_size=2048)
Train on 59324 samples, validate on 57755 samples
Epoch 1/5
59324/59324 [==============================] - 5s 88us/sample - loss: 0.6836 - acc: 0.5478 - val_loss: 0.6583 - val_acc: 0.6002
Epoch 2/5
59324/59324 [==============================] - 5s 81us/sample - loss: 0.5899 - acc: 0.7015 - val_loss: 0.5568 - val_acc: 0.7288
Epoch 3/5
59324/59324 [==============================] - 5s 79us/sample - loss: 0.5118 - acc: 0.7570 - val_loss: 0.5060 - val_acc: 0.7636
Epoch 4/5
59324/59324 [==============================] - 5s 80us/sample - loss: 0.4772 - acc: 0.7775 - val_loss: 0.5044 - val_acc: 0.7493
Epoch 5/5
59324/59324 [==============================] - 5s 82us/sample - loss: 0.4552 - acc: 0.7892 - val_loss: 0.4946 - val_acc: 0.7703
Out[8]:
<tensorflow.python.keras.callbacks.History at 0x7fe9af69ed68>

LSTM and GRU

LSTM and GRU cells add gating to the recurrent step, which helps them retain information over longer spans than a plain SimpleRNN.

In [9]:
from tensorflow import keras
evl = 500

lstm_model = keras.Sequential([
    keras.layers.Embedding(1000, evl,input_length = 128),
    keras.layers.LSTM(64),
    keras.layers.Dense(1, activation='sigmoid')])

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

lstm_model.fit(X_train_pds, y_train_pds, validation_data=(X_test_pds, y_test_pds), epochs=5, batch_size=2048)
Train on 59324 samples, validate on 57755 samples
Epoch 1/5
59324/59324 [==============================] - 10s 169us/sample - loss: 0.6792 - acc: 0.5521 - val_loss: 0.6116 - val_acc: 0.6809
Epoch 2/5
59324/59324 [==============================] - 9s 157us/sample - loss: 0.5374 - acc: 0.7436 - val_loss: 0.4972 - val_acc: 0.7691
Epoch 3/5
59324/59324 [==============================] - 9s 156us/sample - loss: 0.4743 - acc: 0.7791 - val_loss: 0.4638 - val_acc: 0.7734
Epoch 4/5
59324/59324 [==============================] - 9s 154us/sample - loss: 0.4529 - acc: 0.7874 - val_loss: 0.4578 - val_acc: 0.7822
Epoch 5/5
59324/59324 [==============================] - 9s 156us/sample - loss: 0.4487 - acc: 0.7862 - val_loss: 0.4478 - val_acc: 0.7883
Out[9]:
<tensorflow.python.keras.callbacks.History at 0x7fe9a986b160>
In [10]:
gru_model = keras.Sequential([
    keras.layers.Embedding(1000, evl,input_length = 128),
    keras.layers.GRU(64),
    keras.layers.Dense(1, activation='sigmoid')])

gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

gru_model.fit(X_train_pds, y_train_pds, validation_data=(X_test_pds, y_test_pds), epochs=5, batch_size=2048)
Train on 59324 samples, validate on 57755 samples
Epoch 1/5
59324/59324 [==============================] - 9s 152us/sample - loss: 0.6891 - acc: 0.5278 - val_loss: 0.6812 - val_acc: 0.5541
Epoch 2/5
59324/59324 [==============================] - 8s 140us/sample - loss: 0.5885 - acc: 0.6744 - val_loss: 0.4948 - val_acc: 0.7708
Epoch 3/5
59324/59324 [==============================] - 9s 151us/sample - loss: 0.4738 - acc: 0.7790 - val_loss: 0.4694 - val_acc: 0.7790
Epoch 4/5
59324/59324 [==============================] - 8s 140us/sample - loss: 0.4558 - acc: 0.7893 - val_loss: 0.4578 - val_acc: 0.7856
Epoch 5/5
59324/59324 [==============================] - 8s 139us/sample - loss: 0.4561 - acc: 0.7877 - val_loss: 0.4608 - val_acc: 0.7834
Out[10]:
<tensorflow.python.keras.callbacks.History at 0x7fe974bc19b0>
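All three embedding-based models land in roughly the same place after five epochs on these 128-token chunks. A quick way to compare their sizes (a sketch, assuming the three models trained above are still in memory) is their trainable parameter counts; most parameters sit in the 1000x500 embedding, with the LSTM and GRU cells carrying roughly four and three times the recurrent weights of the SimpleRNN:

for name, m in [("SimpleRNN + Embedding", rnnE_model), ("LSTM", lstm_model), ("GRU", gru_model)]:
    print(name, m.count_params())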
In [11]:
titanic_review = """James Cameron's 'Titanic' shares a similar motto to Marmite,
            "you either love it or hate it", I for one love this film, yes 
            I know it's got a drawn out romance story, but there's just
            something about the 3-hour fill of the film that makes its such a
            spectacularly emotional and beautiful movie. I saw this a lot when 
            I was growing up, this was one of the films of my childhood, it is 
            truly a powerfully resonant and visually stunning movie of epic proportions. 
            Personally I favour the British original 'A Night to Remember', but this is a 
            pretty close contender. Winner of 11 Oscars, James Cameron's romantic-disaster
            epic is a triumph of cinema that boasts perfect chemistry between Kate and Leo as 
            the lovers bound for tragedy. Many people disregard this film nowadays solely
            because it's become the most popular film ever made alongside Cameron's other epic 
            'Avatar', and whilst 'Titanic' is definitely not one of my favourite films, it's 
            just so powerfully amazing and no doubt at all it has once brought a tear to everyone's 
            eyes. The main aspect I love in this film is James Horner's haunting score that was a key
            ingredient in the film's success, it is simply perfect, too bad Celine Dion had to close
            this on her awful pop version. Nonetheless, 'Titanic' is a modern classic and a 
            beautifully spectacular film that will live on."""

gotti_review = """I'd rather wake up next to a severed horse head than ever watch 'Gotti' again. 
            The worst movie of the year so far, the long-awaited biopic about the Gambino crime boss' rise from
            made man to top dog took four directors, 44 producers and eight years to make. It shows. The finished
            product belongs in a cement bucket at the bottom of the river."""
In [12]:
def getSent(review, model):
    review_ar = []
    for word in review.split(" "):
        # Look the word up in the vocabulary; words that are not found (for example
        # because of capitalisation or attached punctuation) are simply skipped.
        val = dictv.get(word)
        if val is not None:
            review_ar.append(val)
    # Pad to the 128-token length the models were trained on; longer reviews are
    # truncated (pad_sequences drops tokens from the beginning by default). Note that
    # indices of 1000 or more fall outside the Embedding layer's 1000-word vocabulary,
    # so mapping them to <UNK> would be safer than passing them through.
    review_ar = sequence.pad_sequences(np.array(review_ar)[None, :], maxlen=128, padding='post')
    print(review_ar, review_ar.shape)
    print(model.predict(review_ar))
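Because the word index keys are lowercase and carry no surrounding punctuation, a slightly more careful tokenizer would match many more of a review's words. This is a sketch, not part of the notebook; the regex and the helper name tokenize are illustrative:

import re

def tokenize(review):
    # Lowercase and keep words plus simple contractions (e.g. "cameron's"),
    # dropping surrounding quotes and other punctuation.
    return re.findall(r"[a-z]+(?:'[a-z]+)?", review.lower())

# getSent could then iterate over tokenize(review) instead of review.split(" ").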
In [13]:
getSent(titanic_review,rnnE_model), getSent(gotti_review,rnnE_model)
[[  108     7    61    12     9   371     6 10354 19724     5  2009  1380
     20     7  1711  5260     4   204     8    21    14     9     6   184
    491     7  1502  1711     9     6  3823     7   438    15  6091   404
   1175   200     5    17     4  1846  2725    18    84  8112    14    22
   2889    88    45   413     4    91  1063    22   126    93  4648    85
   1711     5  1864     9   407    24    31     7    61  1640    45    43
     38 10354   480     5    57   824    33    32    12    47   280   839
      6  3325     8  4468   293  1251   119    11    14    22     9  2299
    603    15    16     6 10918    11     4   598    12     9   331    99
     78    69     8    14    23    41   373  1719     9     6   682   356
      5     6  1293  2093    22    15    80   412]] (1, 128)
[[0.50999177]]
[[  247  3290    56   375     8     6  6429  1817   419    74   126   106
    249    20     7     4   291    38     4  7421    44     4   823 23097
   2200    93   132     8   350   912   562   689 13560  1180     5  2310
    153     8  2220  3246    11     6 11812  9219    33     4  1324     7
      4     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]] (1, 128)
[[0.28082561]]
Out[13]:
(None, None)
In [14]:
getSent(titanic_review,lstm_model), getSent(gotti_review,lstm_model)
[[  108     7    61    12     9   371     6 10354 19724     5  2009  1380
     20     7  1711  5260     4   204     8    21    14     9     6   184
    491     7  1502  1711     9     6  3823     7   438    15  6091   404
   1175   200     5    17     4  1846  2725    18    84  8112    14    22
   2889    88    45   413     4    91  1063    22   126    93  4648    85
   1711     5  1864     9   407    24    31     7    61  1640    45    43
     38 10354   480     5    57   824    33    32    12    47   280   839
      6  3325     8  4468   293  1251   119    11    14    22     9  2299
    603    15    16     6 10918    11     4   598    12     9   331    99
     78    69     8    14    23    41   373  1719     9     6   682   356
      5     6  1293  2093    22    15    80   412]] (1, 128)
[[0.8580209]]
[[  247  3290    56   375     8     6  6429  1817   419    74   126   106
    249    20     7     4   291    38     4  7421    44     4   823 23097
   2200    93   132     8   350   912   562   689 13560  1180     5  2310
    153     8  2220  3246    11     6 11812  9219    33     4  1324     7
      4     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]] (1, 128)
[[0.3208993]]
Out[14]:
(None, None)
In [15]:
getSent(titanic_review,gru_model), getSent(gotti_review,gru_model)
[[  108     7    61    12     9   371     6 10354 19724     5  2009  1380
     20     7  1711  5260     4   204     8    21    14     9     6   184
    491     7  1502  1711     9     6  3823     7   438    15  6091   404
   1175   200     5    17     4  1846  2725    18    84  8112    14    22
   2889    88    45   413     4    91  1063    22   126    93  4648    85
   1711     5  1864     9   407    24    31     7    61  1640    45    43
     38 10354   480     5    57   824    33    32    12    47   280   839
      6  3325     8  4468   293  1251   119    11    14    22     9  2299
    603    15    16     6 10918    11     4   598    12     9   331    99
     78    69     8    14    23    41   373  1719     9     6   682   356
      5     6  1293  2093    22    15    80   412]] (1, 128)
[[0.5956694]]
[[  247  3290    56   375     8     6  6429  1817   419    74   126   106
    249    20     7     4   291    38     4  7421    44     4   823 23097
   2200    93   132     8   350   912   562   689 13560  1180     5  2310
    153     8  2220  3246    11     6 11812  9219    33     4  1324     7
      4     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]] (1, 128)
[[0.18565017]]
Out[15]:
(None, None)