Introduction¶

This notebook investigates a dataset of IMDB reviews with the goal of building accessible tools to better understand the text, its vocabulary, and the sentiment it conveys.
The process begins with an initial exploration of the data, followed by cleaning and preprocessing steps such as lemmatization and n-gram extraction.

The analysis is organized into two complementary parts:

  1. Knowledge extraction

    • Word2Vec is applied to learn semantic relationships between words and concepts in the dataset.
    • Latent Dirichlet Allocation (LDA) is used to discover overarching themes across reviews.
  2. Sentiment prediction

    • A classical baseline model (Logistic Regression) is trained on the processed text.
    • A transformer-based model (DistilBERT) is fine-tuned for classification.
    • The performance of both approaches is compared, emphasizing their differences in accuracy, interpretability, and computational cost.

Through these steps, the notebook combines exploratory linguistic analysis with predictive modeling, offering insights into how movie reviews can be examined at both the lexical and semantic level, and how sentiment can be effectively captured using different approaches.

1. Preprocess Data¶

1.1 Data load and overview¶

Thanks to GeeksforGeeks for sharing this dataset:
https://www.geeksforgeeks.org/dataset-for-sentiment-analysis/#1-imdb-reviews-dataset

In [1]:
import pandas as pd
df = pd.read_csv('IMDB-Dataset.csv')
df.head()
Out[1]:
review sentiment
0 One of the other reviewers has mentioned that ... positive
1 A wonderful little production. <br /><br />The... positive
2 I thought this was a wonderful way to spend ti... positive
3 Basically there's a family where a little boy ... negative
4 Petter Mattei's "Love in the Time of Money" is... positive

The following cell shows that there are around 400 duplicate reviews (50,000 rows vs. 49,582 unique) and a very good balance between positive and negative opinions (~50% each), which helps prevent bias in the model's classification. The dataset size is 50k rows by 2 columns.

In [2]:
df.describe()
Out[2]:
review sentiment
count 50000 50000
unique 49582 2
top Loved today's show!!! It was a variety and not... positive
freq 5 25000

Let's see how many words the reviews usually have.

In [3]:
print("Min words: ", df['review'].str.split().apply(len).min())
print("Max words: ", df['review'].str.split().apply(len).max())

df['review'].str.split().apply(len).plot(kind='hist', bins=100).set_xlabel('"Words" per review')
Min words:  4
Max words:  2470
Out[3]:
Text(0.5, 0, '"Words" per review')
[Figure: histogram of "words" per review]

The output shows that reviews vary in length from 4 to 2,470 words. However, most reviews tend to fall between 20 and 500 words, with reviews exceeding 1,000 words being extremely rare.

Due to memory limitations in both Google Colab and my local machine, I had to drop reviews longer than 250 words, regardless of whether these outliers should ideally be removed or not.

Additionally, we might question the value of keeping reviews with fewer than 10 words. Let's examine what these extremely short reviews look like.

In [4]:
df[df['review'].str.split().apply(len) <= 10]
Out[4]:
review sentiment
11926 I wouldn't rent this one even on dollar rental... negative
13109 More suspenseful, more subtle, much, much more... negative
18400 Brilliant and moving performances by Tom Court... positive
19874 This movie is terrible but it has some good ef... negative
27521 Read the book, forget the movie! negative
28920 Primary plot!Primary direction!Poor interpreta... negative
31072 What a script, what a story, what a mess! negative
40817 I hope this group of film-makers never re-unites. negative

Since these are meaningful reviews, there is no reason to delete them. However, they reveal a recurring error: words glued together without spaces, as in "direction!Poor interpreta...". Note this down as a cleaning task!
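
As a minimal sketch of a fix for that pattern (a hypothetical example, assuming the glue is always sentence punctuation followed by an uppercase letter), a regex can reinsert the missing space:

import re
# e.g., "direction!Poor" -> "direction! Poor"
sample = "Primary plot!Primary direction!Poor interpretation"
print(re.sub(r'([a-z])([.!?])([A-Z])', r'\1\2 \3', sample))
# -> Primary plot! Primary direction! Poor interpretation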

Another common issue in this kind of data is the presence of misspelled words. To examine this, we can inspect the words that appear fewer than 3 times in the dataset:

In [5]:
from collections import Counter

words = Counter()
df['review'].str.split().apply(words.update)
words = pd.Series(words)
print("Number of words (repeated less than 3 times): ", len(words[words < 3]))
words[words < 3].head(10)
# Export to get an idea of why these words are uncommon
#pd.DataFrame(words[words < 3]).to_csv('words.csv')
Number of words (repeated less than 3 times):  322327
Out[5]:
0
GO. 2
Penitentary. 1
inwards, 2
many..Aryans, 1
gangstas, 1
more....so 1
scuffles, 1
romance...OZ 1
(crooked 1
nickel, 2

After exporting the least frequent words and briefly reviewing them in VS Code, only a few misspellings were identified. While some typos do exist (e.g., "schoiol"), the vast majority of uncommon words are caused by punctuation being attached to words.

There are also a few domain-specific expressions like "re-watchable" and proper nouns such as "Kahlua" or "Heffernan". Additionally, a large number of occurrences are due to words appearing in uppercase.

In addition, it is important to identify those elements of digital communication that differ from natural language: e-mails, links, HTML tags, etc.

In [6]:
import re

html_tags = set()
for review in df['review']:
    # find text inside HTML-like tags, i.e., "<...>"
    matches = re.findall(r"<[^>]+>", review)
    html_tags.update(matches)

print("--------HTML uniques--------")
print(html_tags, "\n")

hash_words = set()
for review in df['review']:
   # match "#" followed by one or more characters
    matches = re.findall(r"#\w+", review)
    hash_words.update(matches)

print("--------# uniques--------")
print(hash_words, "\n")

at_words = set()
for review in df['review']:
  # Same regex but to find mentions or emails, e.g., "@Kelly knows this"
    matches = re.findall(r"@\w+", review)
    at_words.update(matches)

print("--------@ uniques--------")
print(at_words, "\n")

word_com = set()
for review in df['review']:
  # words ending in ".com"
    matches = re.findall(r"\w+\.com", review)
    word_com.update(matches)

print("--------.com uniques--------")
print(word_com)
--------HTML uniques--------
{'<<<<< ......... ......... ........................ ................ ..................... .................. .............. ............ ................<br />', '<<<sigh>', '< who was to be a victim, but woman-power trumps evil scientist every time.<br />', '< and make notes for my partial "review" to show how foolish the movie is. "Resident Evil" (horror) or "Dude, Where\'s My Car?" (comedy) I can watch over and over again and still enjoy! But this...!<br />', '<< Review posted at FilmDailies.com>', '< Cough >', '<i>', '<<<<<br />', "< $600 per screen its opening weekend, and just over $400 each, after its month's theater run in latter 2002. Overall gross was $261K, which I'd doubt could cover cast and crew's hotel and food for a week on location.<br />", '<p>', '< than 30 minutes of watching, being bored and irritated. <br />', '</em>', '<http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=ANSWERMAN>', '<SPOILER>', '<grin>', '<3 <br />', '<sp?) classic "Romeo & Juliet". Guess I\'ll have to rent that next.<br />', '<em>', '<-----Minor Spoilers!---->', '< YES >', '<hr>', '<33<br />', '<< controversial.<br />', '<..>', '<<<<<<br />', '</SPOILER>', '<br />', '<=8.<br />', '</i>', '<-----Minor Spoilers!----->', '< Cough , cough >'} 

--------# uniques--------
{'#91', '#one', '#k', '#Major', '#100', '#o', '#3', '#ck', '#8', '#yawws', '#4', '#216', '#183', '#t', '#16', '#2', '#7', '#269', '#comment', '#8230', '#hole', '#17', '#SPOILERS', '#85', '#K', '#10', '#8217', '#232', '#305', '#240', '#15', '#18', '#1', '#s', '#cking', '#mn', '#26', '#61514', '#40', '#58', '#32', '#502', '#really', '#62', '#the', '#stein', '#608', '#29', '#5', '#9', '#WK00817', '#23', '#12', '#racist', '#97', '#an', '#terrible', '#ers', '#38', '#80', '#345', '#88', '#21', '#6', '#27', '#64', '#cheesy', '#Bad', '#81', '#KING', '#ingwasted', '#good', '#Western', '#ing', '#14', '#0', '#22', '#Q', '#11', '#35', '#701', '#13', '#police'} 

--------@ uniques--------
{'@hotmail', '@midohio', '@earthlink', '@rn', '@verizon', '@AOL', '@adelphia', '@_', '@K', '@ckin', '@itv', '@aol', '@let', '@g', '@ing', '@ppy', '@talkamerica', '@sidwell', '@googlemail', '@ERS', '@Yahoo', '@nk', '@k', '@pipinternet', '@aim', '@pfest', '@ss', '@ck', '@bbe', '@hot', '@rt', '@the', '@night', '@p', '@yahoo', '@YAHOO', '@netzero', '@fightrunner', '@t', '@mn', '@ks'} 

--------.com uniques--------
{'ifeng.com', 'www.com', 'peterkurth.com', 'lovetrapmovie.com', 'play.com', 'Tv.com', 'PetitionOnline.com', 'threestooges.com', 'poffysmoviemania.com', 'thenewamerican.com', 'filmcritic.com', 'Amazon.com', 'FilmDailies.com', 'jumpedtheshark.com', 'mybluray.com', 'mail.com', 'amargosaoperahouse.com', 'ResidentHazard.com', 'globalpublicmedia.com', 'thinkgeek.com', 'dvdtalk.com', 'HULU.com', 'list.com', 'mmmyeah.com', 'TheCoffeeCoaster.com', 'bleedmedry.com', 'flixer.com', 'carlylegroup.com', 'Startup.com', 'fwfr.com', 'higherpraise.com', 'hbo.com', 'Dry.com', 'weatherpaparazzi.com', 'mikeandvicki.com', 'fosteronfilm.com', 'happierabroad.com', 'rateyourmusic.com', 'screendaily.com', 'macrophile.com', 'petitiononline.com', 'theindependent.com', 'nvogel.com', 'cinemablend.com', 'uglypeople.com', 'MissCastaway.com', 'troma.com', 'IMDB.com', 'hotmail.com', 'Dreadcentral.com', 'razzies.com', 'AOL.com', 'Flixter.com', 'aol.com', 'community.com', 'Oprah.com', 'google.com', 'REV3.com', 'imdb.com', 'Veoh.com', 'Brainiacs.com', 'sublymonal.com', 'zonadvd.com', 'eloquentbooks.com', 'thestuffblag.com', 'mysoju.com', 'thehollywoodnews.com', 'friderwaves.com', 'mysteriesofcanada.com', 'ioffer.com', 'softfordigging.com', 'helium.com', 'LDSSingles.com', 'amazon.com', 'oldies.com', 'half.com', 'loveearth.com', 'MySpace.com', 'wholovesthesun.com', 'thefilmstage.com', 'blogspot.com', 'freedomofmind.com', 'Yahoo.com', 'RuthlessReviews.com', 'obsessedwithfilm.com', 'itv.com', 'Kamera.com', 'moviemusereviews.com', 'kennyhotz.com', 'myspace.com', 'whitepages.com', 'MTV.com', 'walmart.com', 'fullmoondirect.com', 'beyondhollywood.com', 'channel101.com', 'ebay.com', 'tinyurl.com', 'youtube.com', 'yahoo.com', 'joseiturbi.com', 'gitwisters.com', 'deviantart.com', 'angelfire.com', 'Newgrounds.com', 'mediasickness.com', 'intuitor.com', 'piczo.com', 'collider.com', 'pathtofreedom.com', 'archives.com', 'Detstar.com', 'blogs.com', 'Blogspot.com', 'warnerbros.com', 'playitforwardoz.com', 'filmcow.com', 'dot.com', 'peterhenderson.com', 'metacritic.com', 'IndependentCritics.com', 'treasureflix.com', 'googlemail.com', 'tccandler.com', 'britannica.com', 'ernestfunclub.com', 'aim.com', 'Jezebel.com', 'johntopping.com', 'geocities.com', 'aetv.com', 'multiply.com', 'Genforum.com', 'iCarly.com', 'Monthly.com', 'Reel.com', 'cinemademerde.com', 'Dolemite.com', 'badpuppy.com', 'answers.com', 'go.com', 'RottenTomatoes.com', 'davidlynch.com', 'Youtube.com', 'Letters.com', 'YouTube.com', 'angels.com', 'nixflix.com', 'suntimes.com', 'Half.com', 'spatulamadness.com', 'Salon.com', 'IMDb.com', 'whipped.com', 'ABC.com', 'dvdbeaver.com', 'HorrorYearbook.com', 'wazi.com', 'thepetitionsite.com'}

Thus, it is concluded that:

  • There are various HTML tags, but also other uses of "<>".

  • There are many uses of "@". However, a very unscientific Ctrl+F revealed only 181 instances, most of them in censored insults (e.g., $$$#@%@! or F@ck). Therefore, it's preferable to remove the affected rows entirely, given their low relevance.

  • There are many links that should be removed.

1.2 Data cleaning¶

At this point, the cleanup tasks that have arisen during the exploration and some complementary tasks are carried out.

First, a supplementary column is created to review the transformations.

In [7]:
df['text'] = df['review']

Remove: duplicates and long reviews (due to the memory issues mentioned earlier).

In [8]:
df = df.drop_duplicates(subset='review')
df = df[df['review'].str.split().apply(len) <= 250]

Remove: "@".

In [9]:
df = df[~df['review'].str.contains(r'@', regex=True)]

Remove: links.

In [10]:
# Words ending in "example.com + optional final dot" (where dot must be preserved)
# Complex link: example.com/.../... | Simple link example.com
df['text'] = df['text'].str.replace(r'\S*\.com/\S*\.', '.', regex=True) # Complex + "."
df['text'] = df['text'].str.replace(r'\S*\.com/\S*', ' ', regex=True) # Complex
df['text'] = df['text'].str.replace(r'\S*\.com\.', '.', regex=True) # Simple + "."
df['text'] = df['text'].str.replace(r'\S*\.com\b', ' ', regex=True) # Simple
pd.set_option('display.max_colwidth', 700)
df[df['review'].str.contains(r'http://www.myspace.com/62229249.', regex=True)]
Out[10]:
review sentiment text
31431 The mere fact that I still think of the movie a decade later is what really speaks volumes about the film. To me this substantiates Grand Canyon as a film that will touch you in one way or another. I truly believe that before the movie Crash there was Grand Canyon. The major difference between the two films in my opinion is the timing of their release. I'm not going to argue which one is better, but I will contend to the idea that they share the same message. I'd love to hear from those that have an opinion on this subject. I will start a commentary which you can find at http://www.myspace.com/62229249. You may also find me there to post any other topics about movies that we may share, b... positive The mere fact that I still think of the movie a decade later is what really speaks volumes about the film. To me this substantiates Grand Canyon as a film that will touch you in one way or another. I truly believe that before the movie Crash there was Grand Canyon. The major difference between the two films in my opinion is the timing of their release. I'm not going to argue which one is better, but I will contend to the idea that they share the same message. I'd love to hear from those that have an opinion on this subject. I will start a commentary which you can find at . You may also find me there to post any other topics about movies that we may share, because i have a true love for f...

Remove: HTML tags.

In [11]:
# It is understood that <SPOILER> is an imdb-specific tag
html_tags = ["<i>", "</i>", "<p>", "</p>", "<br>", "<br />", "<em>", "</em>", "<SPOILER>", "</SPOILER>"] # Note: the reviews use the self-closing form <br />, never </br>
for tag in html_tags:
    df['text'] = df['text'].str.replace(tag, " ", regex=False)
df[df['review'].str.contains(r'\.<br /><br />It is a quirky flick', regex=True)]
Out[11]:
review sentiment text
90 "Down Periscope" has been in our library since it first arrived in VHS. Since then, we have acquired the DVD and a digital from Cinema Now.<br /><br />It is a quirky flick that does not go militarily overboard as either pro or con. It is first and foremost a comedy and as a vehicle for the main characters, I am quite surprised that a sequel has never been offered.<br /><br />The movie has gained a following that borders on a cult obsession, even among the very young. I became aware of this while visiting the USS Drum in Mobile, Alabama in 2002. A group of Cub Scouts, my grandson among them, had all taken up the roles from the movie and planned to relive it during their overnighter on boa... positive "Down Periscope" has been in our library since it first arrived in VHS. Since then, we have acquired the DVD and a digital from Cinema Now. It is a quirky flick that does not go militarily overboard as either pro or con. It is first and foremost a comedy and as a vehicle for the main characters, I am quite surprised that a sequel has never been offered. The movie has gained a following that borders on a cult obsession, even among the very young. I became aware of this while visiting the USS Drum in Mobile, Alabama in 2002. A group of Cub Scouts, my grandson among them, had all taken up the roles from the movie and planned to relive it during their overnighter on board. It is a fun rom...

Remove: #.

In [12]:
# With possible non-space characters \S* before or after
df['text'] = df['text'].str.replace(r"\S*#\S*", " ", regex=True)
df[df['review'].str.contains(r'"Sorte Nula" is the #1 Box')]
Out[12]:
review sentiment text
9941 "Sorte Nula" is the #1 Box Office Portuguese movie of 2004. This extreme low budget production (estimated USD$150,000) opened during Christmas opposite American Blockbusters like National Treasure, Polar Express, The Incredibles and Alexander but rapidly caught the adulation of the Portuguese moviegoers. Despite the harsh competition, the small film did surprisingly well, topping all other Portuguese films of the past two years in its first weeks. The film is a mystery/murder with a humorous tone cleverly written and directed by Fernando Fragata who has become a solid reference in the European independent film arena. Did I like the film? Oh, yes! positive "Sorte Nula" is the Box Office Portuguese movie of 2004. This extreme low budget production (estimated USD$150,000) opened during Christmas opposite American Blockbusters like National Treasure, Polar Express, The Incredibles and Alexander but rapidly caught the adulation of the Portuguese moviegoers. Despite the harsh competition, the small film did surprisingly well, topping all other Portuguese films of the past two years in its first weeks. The film is a mystery/murder with a humorous tone cleverly written and directed by Fernando Fragata who has become a solid reference in the European independent film arena. Did I like the film? Oh, yes!

For a more accurate analysis, rows containing words that mix letters and numbers will be removed, since such tokens are usually typos or meaningless strings. Examples: addicted2you, tt0117979.

Although some could be normalized (e.g., "25yrs" to "years"), the dataset is large enough, and memory tight enough, to justify simply dropping these rows, as long as not too much data is lost.

In [13]:
print("ºRows previous to drop ->", df.shape[0], "\n")

words_letters_numbers = set()
for review in df['text']:
  # Regex by @Roberto, from the first answer at:
  # https://stackoverflow.com/questions/44187078/regex-to-get-words-containing-letters-and-numbers-certain-special-but-not-o
    matches = re.findall(r"([A-Za-z]+[\d@]+[\w@]*|[\d@]+[A-Za-z]+[\w@]*)", review)
    words_letters_numbers.update(matches)
print(words_letters_numbers, "\n")

def is_alpha_numeric(text):
    return bool(re.search(r"([A-Za-z]+[\d@]+[\w@]*|[\d@]+[A-Za-z]+[\w@]*)", text))

df = df[~df['text'].apply(is_alpha_numeric)]
print("ºRows after ->", df.shape[0])
ºRows previous to drop -> 34791 

{'h3ll', 'NUMB3RS', '1930ies', 'Dogma95', 'BruceV3', '14th', '2am', 'H2EFW', '2004s', '1980s', 'JP3', '5seconds', '6yrs', '1860s', 'recommanded1', '2h', '1d', '17yo', '1950ies', '230mph', 'ZB2', 'n64', '30pm', '80ish', '1980ies', '5ive', 'part7', '90s', '10yr', '4out5', '60ies', '58minutes', '100ft', 'ZB1', '1st', '2D', 'A1', '5yo', 'A666333', '2point4', '00Schneider', '50usd', '200th', 'AvP1', '4X', 'F16', 'K2', 'SAT1', '9pm', '31st', '1990ies', '89or', '40th', '1h45', 'RR7', 'd1', 'RH3', '80yr', 'G4', '4K', '4pm', 'OldWereWolf56', '12mm', '7eventy', '18s', '3x', '2min', '9mm', '20mins', 'WW1', '13TH', 'THX1138', 'F903', '45rpm', '3DVD', 'x4', '2MORE', '1500s', 'ejames6342', '1080p', 'suck3d', 'spt11', 'R2', 'sysnuk3r', 's01', '50Ft', '3rds', '20m', '80s', 'SE7EN', '1960s', '3th', 'addicted2you', '15am', 'dcreasy2001', 'U2', '6am', 'piss3d', '17million', '38K', 'THPS3', '7ft', 'd8', '500th', 'MSTK3000', '2s', '8P', '100mph', 'ps3', '5m', 'the13th', '80S', 'hostel2', 'Y2K', '17th', 'ps1', '3M', 'MI4', '18year', 'ITV1', 'drss1942', '00s', '30something', '57th', 'MST3K', 'ou812', '110mph', '15Apr08', 'T2', '480m', '125m', '100m', '19K', '0f', '2pac', '372nd', '10yo', '40yr', '300mln', '16th', '40mins', '8MM', 'vh1', '90mins', 'BI2', 'GE007', 'sp4ectacle', '66p', '12M', '10ish', '69th', '5min', 'Gundam0079', '15ft', '00pm', '3x5', 'RS1', '7c', '9s', '32x', '60min', '4M', '99cents', '3K', 'producer9and', '18A', '197something', 'PS4', '22h45', 'DS9', '1h30', 'O11', '30th', '1tv', 'BBC4', 'DD1', 'Junagadh75', 'MDogg20', 'H2O', 'Wolf3D', '42m', '20mn', '225mins', 'WW2', 'MP5', 'bbc1', '1and', '42nd', 'F18', '19th', '07B', '180d', '15pm', 'data7', 'C3PO', '9Is', 'magellan33', 'BBC2', '12th', 'M80', 'Mst3k', 'Matt2', 'mst3k', '15PA', 'cr5eate', 'FP701', 'line4s', '5mins', '20ties', '2x4', '360Remake', 'WO2', 'Bond2a', '00AM', 'slight1y', 'season3', '230am', '37449ing', '13th', 'S500', 'PS1', 'A5zo', 'RTL7', '1o', 'PS2', '150k', 'Mandy62', '90min', '20yrs', '2cops', '2furious', 'R1', '79th', 'gr8', '1870s', '7ish', '90ish', '1h53', '1v', '27th', '10th', 'besties4lyf', 'C2', '50ies', '30MM', 'cr4p', '11M', 'X2', 'm3', '3colours', 'of5', '1840s', '1970s', '19thC', '1900s', 'b4', 'cough2Fast2Furiouscough', '33BC', '22mins', '90minutes', 'ZombieKilla81', '6th', 'CB4', '3who', 'ED2', '2d', 'm203', '00am', '30am', 'K3g', '1hour', 'k11', '51b', '98minutes', 'kiddypr0n', 'N64', 'P45', 'HOTD2', '4x', '45pm', '4Kids', '30PM', '70s', '4th', 'oo7', '113minutes', '30i', '1950s', 'sh1t', '2CVs', '74th', 'V8', '1h40', 'MSF2000', '2nd', 'R18', 'JP1', 'TomReynolds2004', '16mm', '1850s', 'C1', '401K', 'O12', '500lbs', 'Se7en', '1am', '5kph', 'l946', 'taptieg24', '1mln', 'HBO2', '30mins', 'ST1100', '2AM', '700K', '22nd', '330am', '40s', '50th', '1800Mph', '2370BCE', '10PM', '480p', 'CI2', '25min', '000DM', 'NC17', 'OB101', 'PG13', 'golden70', 'sh17', '36th', 'f16', 'V2', '11th', 'Zoey101', '14A', 'XBOX360', 'p3n1', '35th', 'P615', 'Vh1', 'H5N1', '1600s', 'P61', 'S01E01', '51st', '85mins', 'G7', 'PS3', '27x41', 'MFT3K', '23rd', '26th', '16A', '100times', '20c', '2fast', '5c', 'Shai6an', 'hi8', 'JP2', '150th', '49th', 'algernon4', '3d', '9as', '45mins', 'ED1', 'H2', '4am', '16K', '4EVA', '1880s', 'OSS117', 'BBC1', 'h1t', '1ch', '39D', 'M16s', '60ties', 'GF1', 'DW3', 'R2D2', '28th', '1M', 'AriesGemini100', '2Dimensional', '1ton', 'D2', 'ee03128', '80min', 'P90', '20ies', '60s', '100miles', '29th', '6Hours', 'B5', '157th', 'Zeoy101', '18th', 'x5', '6pm', '100Bt', '38k', '20s', '08th', '4ward', 'mgs4', '1000s', '55th', 
'1800s', 'MP3', '3lbs', 'f14', 'ITV4', '330mins', '0and', '3PO', '49er', 'T3', '70p', '1800hrs', 'MST3', '99cent', 'c1', '9PM', 'BI1', '9th', 'any1', 'badger1970', '30ties', '10s', '5s', 'THHE2', '30ft', '100k', '235th', '1990s', '12p', '50s', '24yr', '3D', 'h1', 'Pig2', 'WW11', '24Mar2001', '15mins', 'MI3', '45am', 'HP3', '32lb', 'H3', '1hr', '3m', '33m', '30k', '9lbs', '4Ever', 'part1', '176th', '6yo', '336th', 'Sim0ne', '40something', '3mins', '5th', '2X', '30yr', 'F86', 'f18', '2hours', '20ft', 'K9PI', 'CKY2K', 'G3A3', 'e04', '1500B', '99p', '4Q2', '60isms', '56th', '75m', '16s', '3AM', 'More4', '00o', '2000s', 'MSFT3000', '2gether', '1h40m', 'F5', 'ff07', 'ADGTH2', 'K3G', '40min', '50m', 'Critters4', '66th', '1min', 'Rd1', '101st', 'season2', '100x', '7th', '44mb', '24P', '10mil', 'MI5', 'DK3', 'r1313', 'FF7', '42nd_Street', 'MeatMarket2', '20X', 'C4', '540i', '500db', 'MST3000', 'N1', '4kids', 'Dogme95', 'Fett1138', 'P9FOS', 'MVC2', 'i8n', 'TV8', 'f117', 'SG1', '12x', '8mm', 'SEASON2', 'sh1tty', 'AVP2', '35s', '10mins', 'f00l', '1200F', 'ITV2', '4TH', 'MA2412', 'seen1', '442nd', 'T4', '20th', 'x3', '1920ies', '7mm', 'Lore60', 'H20', '1s', '10lines', 'MST3k', 'BBC3', '1hour17min', '80ies', '198os', '102nd', '24years', '0s', '1984ish', '3h', 'CO2', '2inch', 'str8', '8pm', '83mins', '150m', 'M1', '50min', '24th', '8O', 'Ring2', '531st', 'm61', 'MSTK3', '100s', '2PM', '84th', '10minutes', 'K98k', 'TLK3', '45th', '8yrs', '5years', '12yrs', 'HoTD2', 'Larryjoe76', '12A', 'charliesangel415', '25yrs', '1h', '4H', '1970ish', '15mts', 'ps2', '1910s', '0r', '16B', '35mm', '1970ies', '110min', 'x2', 'fi9lm', '9of10', 'johnl3d', 'HRGD002', '37C', '25th', '78rpm', 'season1', 'E3', '20year', '7days', '2x', 'Ft13th', '4got', 'XL5', '6TH', 'gr88', 'formula4', '20minutes', '16MM', '21st', '25mins', '3rd', '3yrs', 'AH56A', '50Cr', '8FTDF', '100min', 'dan7', '10p', '70ies', 'F0rs4k3n', '100mins', '2K', 'film4', 'jamrom4', '200mph', 'Mst3king', 'S1', '12yr', 'F4', 'S1m0ne', '11yr', 'channel4', '300C', '94th', '100th', '30s', 'PS5', '10yrs', 'F13th', 'T1000', 'PM7', '85min', '06th', 'MI6', '7even', '5hrs', 'pg13', '9ya', '4x4', 'FF2', '48hrs', '24p', '150K', '10x', '55yr', '25million', 'M16', '29s', '34th', '1100ad', '35pm', '1920s', '35mins', '8th', '70th', '12nd', 'ZB3', '3am', 'C3', 'S10', '100yards', 'P614', '15th', 'K9', 'ff7', '14s', 'MOLEY75', 'F117', 'st00pid', 'H6', '2hr', '30min', 'SM64', '240Z', '50k', '106min', '2hrs', 'l950', 'P3', 'LV2', '1890s', '1940s', '1700s', '1790s', 'ANT1', '4ever', 'MK2', '4yr', '10am', '145th', 'Y12', '01pm', 'C3P0', '10pm', 'F1', 'woody7739', '50p', 'ww2', 'BruceV13', 'PS6', '1930s', 'Aliens3', '5yrs', 'VH1', '88min', '24h', 'LV1', '000s'} 

Rows after -> 31687

Remove: numbers.

In [14]:
df['text'] = df['text'].str.replace(r"\d+", " ", regex=True)
df[df['review'].str.contains(r'This movie made it into one of my top 10 most awful movies')]
Out[14]:
review sentiment text
17 This movie made it into one of my top 10 most awful movies. Horrible. <br /><br />There wasn't a continuous minute where there wasn't a fight with one monster or another. There was no chance for any character development, they were too busy running from one sword fight to another. I had no emotional attachment (except to the big bad machine that wanted to destroy them) <br /><br />Scenes were blatantly stolen from other movies, LOTR, Star Wars and Matrix. <br /><br />Examples<br /><br />>The ghost scene at the end was stolen from the final scene of the old Star Wars with Yoda, Obee One and Vader. <br /><br />>The spider machine in the beginning was exactly like Frodo being attacked by th... negative This movie made it into one of my top most awful movies. Horrible. There wasn't a continuous minute where there wasn't a fight with one monster or another. There was no chance for any character development, they were too busy running from one sword fight to another. I had no emotional attachment (except to the big bad machine that wanted to destroy them) Scenes were blatantly stolen from other movies, LOTR, Star Wars and Matrix. Examples >The ghost scene at the end was stolen from the final scene of the old Star Wars with Yoda, Obee One and Vader. >The spider machine in the beginning was exactly like Frodo being attacked by the spider in Return of the Kings. (Elijah Wood is th...

Remove: symbols/punctuation.

In [15]:
punctuation = r";:,¡!\"#$~%&\()*©+-=<>/@[\]ªº^_`{|}~?¿" # Note that (') & (.) are not included
df['text'] = df['text'].replace(f"[{re.escape(punctuation)}]", " ", regex=True)
df[df['review'].str.contains(r'"has got all the polari"')]
Out[15]:
review sentiment text
1 A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard whi... positive A wonderful little production. The filming technique is very unassuming very old time BBC fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece. The actors are extremely well chosen Michael Sheen not only has got all the polari but he has all the voices down pat too You can truly see the seamless editing guided by the references to Williams' diary entries not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. The realism really comes home with the little things the fantasy of the guard which rather than use the tradit...

Correction of periods.

In [16]:
# multi-dot patterns with or without spaces
df['text'] = df['text'].replace(r"(?:\.\s*){2,}", ". ", regex=True)
df['text'] = df['text'].replace(r"\s*\.\s*", ". ", regex=True)
df[df['review'].str.contains(r'. . . er, sorry . . .')]
Out[16]:
review sentiment text
16989 OK, so I know better than to watch movies on SciFi . . . er, sorry . . . SyFy. Or shifafa. Or whatever it is now. So sue me. I spent my whole Saturday doing advisory-board brainstorming for a nonprofit. I can be forgiven for flopping into my armchair and wanting to watch some movie I'd never seen, rather than read Proust in the original or learn how to play the oud.<br /><br />Which is to say, I didn't deserve Open Graves. Of which I saw none, incidentally. Were there any? Did I fall asleep? Why is it called this?<br /><br />Some icky visuals. Not many scares. As with too many films in modern horror films, no reasons are given--apart from shared humanity--to care about any of these peopl... negative OK so I know better than to watch movies on SciFi. er sorry. SyFy. Or shifafa. Or whatever it is now. So sue me. I spent my whole Saturday doing advisory board brainstorming for a nonprofit. I can be forgiven for flopping into my armchair and wanting to watch some movie I'd never seen rather than read Proust in the original or learn how to play the oud. Which is to say I didn't deserve Open Graves. Of which I saw none incidentally. Were there any Did I fall asleep Why is it called this Some icky visuals. Not many scares. As with too many films in modern horror films no reasons are given apart from shared humanity to care about any of these people. Half a point though for th...

To lowercase.

In [17]:
df['text'] = df['text'].str.lower()
df.head(1)
Out[17]:
review sentiment text
1 A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard whi... positive a wonderful little production. the filming technique is very unassuming very old time bbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece. the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams' diary entries not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master's of comedy and his life. the realism really comes home with the little things the fantasy of the guard which rather than use the traditional ...

Removal of misspelled words.

Although word correction can be made as long and complex a task as desired, here it is limited to dropping reviews that contain words with 3 or more consecutive repeated letters. This means that words such as "grr" will slip through. However, I consider the quality after all this preprocessing to be good enough (garbage in, garbage out).

In [18]:
print("Previous to drop:")
print(df[df['text'].str.contains(r'stooooooopiddddd', regex=True)]['review'])
df = df[~df['text'].str.contains(r'([a-z])\1{2,}', regex=True, na=False)]
print("\nAfter drop:")
print(df[df['text'].str.contains(r'stooooooopiddddd', regex=True)]['review'])
Previous to drop:
45877    BOOOOOOOORRRRRINNGGGGGGGG and STOOOOOOOPIDDDDD. Kept falling asleep. If you want to see Miles O'Keefe loping around in a furry Speedo by all means rent this movie. If not please don't bother... Rife with anachronisms. Was this supposed to be set in the Ice Age, the Iron Age, the Steel Age or the Age of Reason? What was the reason for the black nylon wig on the guy dressed up as Genghis Khan? Was that really supposed to be Genghis Khan? If Ator had access to so much advanced technology and science why did we have wait another 1000 years for Leonardo? It's never clear where Ator comes from or if he's supposed to be some superior sort of being. You wonder if it was all explained in the firs...
Name: review, dtype: object
/tmp/ipython-input-1431963179.py:3: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  df = df[~df['text'].str.contains(r'([a-z])\1{2,}', regex=True, na=False)]
After drop:
Series([], Name: review, dtype: object)

Correction of extra spaces.

In [19]:
df['text'] = df['text'].replace(r'\s{2,}', ' ', regex=True)
df['text'] = df['text'].str.strip()

Expansion of contractions.

In [20]:
%%capture
!pip install contractions
In [21]:
import contractions

df['text']=df['text'].apply(lambda x:contractions.fix(x))
df[df['review'].str.contains(r"While I've")][:1] # -> while i have
Out[21]:
review sentiment text
2 I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I'd laughed at one of Woody's comedies in years (dare I say a decade?). While I've never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and ju... positive i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air conditioned theater and watching a light hearted comedy. the plot is simplistic but the dialogue is witty and the characters are likable even the well bread suspected serial killer. while some may be disappointed when they realize this is not match point risk addiction i thought it was proof that woody allen is still fully in control of the style many of us have grown to love. this was the most i would laughed at one of woody's comedies in years dare i say a decade. while i have never been impressed with scarlet johanson in this she managed to tone down her sexy image and jumped right into a a...

How did the dataset turn out? Is it large enough and well balanced?

In [22]:
df = df.sample(frac=0.4, random_state=1) # reduced to 12k rows due to resource constraints
df.describe()
Out[22]:
review sentiment text
count 12396 12396 12396
unique 12396 2 12396
top this is one of the greatest documentaries i've ever seen along with "Dark Days". I have skated for maybe an hour my entire life, and I still love this movie. Peralta and his excellent editor captured the feeling and atmosphere perfectly, helped in part with some incredible archival footage. Tony Alva is one of the coolest individuals in existence. Love those knee high striped sport socks, you rock Tony!<br /><br />Not only is this movie a visual feast, but the soundtrack has to be one of the best in history, if you're into 70's rock. Buy the DVD, you won't regret it. positive this is one of the greatest documentaries i have ever seen along with dark days. i have skated for maybe an hour my entire life and i still love this movie. peralta and his excellent editor captured the feeling and atmosphere perfectly helped in part with some incredible archival footage. tony alva is one of the coolest individuals in existence. love those knee high striped sport socks you rock tony not only is this movie a visual feast but the soundtrack has to be one of the best in history if you are into 's rock. buy the dvd you will not regret it.
freq 1 6214 1
In [23]:
df = df.reset_index(drop=True)
print("Dups ->", df['text'].duplicated().any())
print("Size ->", df.shape) # Enough size
df['sentiment'].value_counts().plot(kind='bar'); # Well balanced
Dups -> False
Size -> (12396, 3)
[Figure: bar chart of sentiment class counts]

2. Final data preprocessing¶

With reasonably clean text, it's time to transform it for use: creating n-grams, tokenization, etc.

First, stopwords are defined. Omitting these terms facilitates analysis and significantly reduces computational cost by eliminating the noise they introduce and their low relevance.

In [24]:
%%capture
import nltk
from nltk import pos_tag, word_tokenize
from nltk.collocations import *
nltk.download('all')
In [25]:
stopwords =  ["would", "'s", "ll"]
stopwords = stopwords + nltk.corpus.stopwords.words('english')
stopwords[:15]
Out[25]:
['would',
 "'s",
 'll',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any']

Secondly, it is extremely useful to label words according to their grammatical role (PoS tagging), which broadens the ways the text can be worked with.

In [26]:
opinions = " ".join(df['text'])
opinions[:250]
Out[26]:
'a gritty look at new york city and dock workers. this is a classic film realistic brutal at times always believable. it was originally shown live on tv also starring sidney poitier. john cassavetes was a fantastic director and actor. i doubt whoever '
In [27]:
tokens = word_tokenize(opinions)
pos_tags = pos_tag(tokens)
print(pos_tags[:10])
[('a', 'DT'), ('gritty', 'JJ'), ('look', 'NN'), ('at', 'IN'), ('new', 'JJ'), ('york', 'NN'), ('city', 'NN'), ('and', 'CC'), ('dock', 'NN'), ('workers', 'NNS')]

Thirdly, n-gram extraction (specifically, bigrams and trigrams) allows relevant sequences of words to be captured, which often convey more meaning than isolated terms. This step helps to identify common expressions, linguistic patterns and contextual relationships that may be useful for further analysis or feature engineering.

In this case, the aim is to identify n-grams that could be collocations, applying a PoS-tag filter.

In [28]:
# Load statistical association measures for bigrams and trigrams
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

ngrams_num = 300 # top n-grams to extract

# Extract n-grams (2, 3) removing irrelevant characters such as punctuation
bigram_finder = BigramCollocationFinder.from_words(tokens)
bigram_finder.apply_word_filter(lambda w: (re.match(r'\W', w)))
trigram_finder = TrigramCollocationFinder.from_words(tokens)
trigram_finder.apply_word_filter(lambda w: (re.match(r'\W', w)))

# Best by PMI and Likelihood Ratio
best_bigrams_pmi = bigram_finder.nbest(bigram_measures.pmi, ngrams_num)
best_trigrams_pmi = trigram_finder.nbest(trigram_measures.pmi, ngrams_num)
best_bigrams_lr = bigram_finder.nbest(bigram_measures.likelihood_ratio, ngrams_num)
best_trigrams_lr = trigram_finder.nbest(trigram_measures.likelihood_ratio, ngrams_num)

# Set of Part-of-Speech (PoS) tags to exclude at n-gram boundaries
# Adverbs are kept to retain meaningful expressions
avoid_pos = {"DT", "IN", "PRP", "RP", "CC", "CD", "MD"}
# Filter n-grams: remove those starting or ending with stopwords or undesired PoS tags
def filter_ngrams(ngrams):
    filtered = []
    for ng in ngrams:
        first_word, last_word = ng[0], ng[-1]
        first_tag = pos_tag([first_word])[0][1]
        last_tag = pos_tag([last_word])[0][1]

        if first_word not in stopwords and last_word not in stopwords \
           and first_tag not in avoid_pos and last_tag not in avoid_pos:
            filtered.append(ng)
    return filtered

best_bigrams_pmi_filtered = filter_ngrams(best_bigrams_pmi)
best_trigrams_pmi_filtered = filter_ngrams(best_trigrams_pmi)
best_bigrams_lr_filtered = filter_ngrams(best_bigrams_lr)
best_trigrams_lr_filtered = filter_ngrams(best_trigrams_lr)
best_pmi_filtered = best_bigrams_pmi_filtered + best_trigrams_pmi_filtered
best_lr_filtered = best_bigrams_lr_filtered + best_trigrams_lr_filtered
In [29]:
print("BEST BIGRAMS AND TRIGRAMS PMI:")
print(best_pmi_filtered[:20])
print("\nBEST BIGRAMS AND TRIGRAMS LIKEHOOD RATIO:")
print(best_lr_filtered[:20])
print("\n\nSince the trigrams are not shown, let's see some of them")
print("LR",best_trigrams_lr_filtered[:5])
print("PMI",best_trigrams_pmi_filtered[:5])
BEST BIGRAMS AND TRIGRAMS PMI:
[('aake', 'sandgren'), ('aatish', 'kapadia'), ('abderrahmane', 'sissako'), ('abortive', 'stillborn'), ('absorbent', 'undergarments'), ('ach', 'jodel'), ('acp', 'anbuselvan'), ('adjoining', 'homesteads'), ('aeneid', 'eneide'), ('afrika', 'korps'), ('agnus', 'schrim'), ('ahmed', 'sellam'), ('akim', 'tamiroff'), ('akosua', 'busia'), ('akshaye', 'khanna'), ('alannis', 'morisette'), ('alexei', 'sayle'), ('alfonso', 'cuarón'), ('alik', 'shahadah'), ('alisha', 'seton')]

BEST BIGRAMS AND TRIGRAMS LIKELIHOOD RATIO:
[('ever', 'seen'), ('special', 'effects'), ('low', 'budget'), ('sci', 'fi'), ('year', 'old'), ('years', 'ago'), ('high', 'school'), ('new', 'york'), ('main', 'character'), ('highly', 'recommend'), ('much', 'better'), ('worth', 'watching'), ('story', 'line'), ('kung', 'fu'), ('production', 'values'), ('real', 'life'), ('martial', 'arts'), ('well', 'done'), ('pretty', 'good'), ('saw', 'this', 'movie')]


Since the trigrams are not shown, let's see some of them
LR [('saw', 'this', 'movie'), ('watch', 'this', 'movie'), ('watching', 'this', 'movie'), ('recommend', 'this', 'movie'), ('see', 'this', 'movie')]
PMI [('ach', 'jodel', 'mir'), ('almedia', 'oksana', 'akinshina'), ('andrés', 'gertrúdix', 'geli'), ('arik', 'calismani', 'istemiyorum'), ('ashoka', 'mangal', 'pandey')]

There is a clear difference between the bigrams and trigrams extracted using both metrics. On the one hand, those obtained using PMI tend to be more unique and specific, and often capture proper names or rare combinations that are unlikely to appear in multiple reviews, for instance: (abderrahmane, sissako). On the other hand, the n-grams extracted using the Likelihood Ratio tend to be more frequent and representative of common patterns across texts, such as: (special, effects) or (sci, fi).

This difference likely stems from the way these metrics are calculated. While the LR highlights word pairs that co-occur frequently, PMI penalizes combinations that are expected to appear often by chance, focusing instead on how unexpectedly frequent the joint occurrence is. Therefore, each metric offers a different yet complementary perspective for text analysis.
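
As a back-of-the-envelope illustration of this difference (the counts below are hypothetical, not taken from this corpus), PMI compares the observed bigram probability with the one expected under independence:

import math

# PMI(a, b) = log2( P(a, b) / (P(a) * P(b)) ), estimated from raw counts over n tokens
def pmi(count_ab, count_a, count_b, n):
    return math.log2((count_ab / n) / ((count_a / n) * (count_b / n)))

# A rare pair that always co-occurs (e.g., a director's name) scores far higher
# than a frequent, genuinely common collocation
print(pmi(count_ab=3, count_a=3, count_b=3, n=1_000_000))         # ~18.3
print(pmi(count_ab=900, count_a=2000, count_b=3000, n=1_000_000)) # ~7.2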

Now let's look at the noun phrases that arise with the use of PoS tags. Although noun phrases are not limited to the patterns below, combinations of nouns and adjectives are used here.

In [30]:
def extract_nominal_phrases(pos_tags):
    nominal_phrases = []
    noun_tags = ["NN", "NNS"]
    valid_tags = ["JJ", "NN", "NNS"]

    # Bigrams
    for i in range(len(pos_tags) - 1):
        word1, tag1 = pos_tags[i]
        word2, tag2 = pos_tags[i + 1]
        # Since text has been transformed into lowercase, the PoS for "i" may be incorrect
        if "i" in (word1, word2):
            continue

        if (tag1 in valid_tags and tag2 in valid_tags) and \
            (tag1 in noun_tags or tag2 in noun_tags):
            nominal_phrases.append(f"{word1}-{word2}")

    # Trigrams
    for i in range(len(pos_tags) - 2):
        word1, tag1 = pos_tags[i]
        word2, tag2 = pos_tags[i + 1]
        word3, tag3 = pos_tags[i + 2]
        if "i" in (word1, word2, word3):
            continue
        if (tag1 in valid_tags and tag2 in valid_tags and tag3 in valid_tags) and \
           (tag1 in noun_tags or tag2 in noun_tags or tag3 in noun_tags):
            nominal_phrases.append(f"{word1}-{word2}-{word3}")

    return nominal_phrases

nominal_phrases = extract_nominal_phrases(pos_tags)
print("First 20 Bigrams:")
print(nominal_phrases[:20])
print("\nLast 20 Trigrams")
print(nominal_phrases[-20:])
First 20 Bigrams:
['gritty-look', 'new-york', 'york-city', 'dock-workers', 'classic-film', 'film-realistic', 'realistic-brutal', 'sidney-poitier', 'john-cassavetes', 'fantastic-director', 'read-mansfield', 'mansfield-park', 'sir-thomas', 'aunt-norris', 'first-person', 'person-narrative', 'entertaining-heroine', 'movie-version', 'version-fanny', 'fanny-flirts']

Last 20 Trigrams
['film-festival-beard', 'acclaimed-german-director', 'german-director-leni', 'director-leni-riefenstahl', 'lavish-engagement-party', 'current-party-system', 'talented-beautiful-actresses', 'single-memorable-line', 'cheap-black-wig', 'drag-queen-costume', 'queen-costume-shop', 'actor-peter-coyote', 'student-cafeteria-buffets', 'philbin-harrison-surf', 'harrison-surf-scenes', 'watch-aloha-summer', 'brave-little-boy', 'new-york-jewish', 'york-jewish-center', 'incredible-archival-footage']

This method has made it possible to find noun phrases that can be much more semantically significant for determining collocations or for thematic analysis. For example, phrases such as 'current-party-system', 'single-memorable-line' or 'talented-beautiful-actresses' are immediately interpretable.

To improve the quality of text features for subsequent modeling, the Gensim phrase detection algorithm is now used. Unlike basic n-gram extraction, this approach captures statistically significant multi-word expressions directly from the corpus.
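
For reference, gensim's default scorer is the "original_scorer" from Mikolov et al.; the sketch below reproduces it with purely hypothetical counts, just to illustrate how min_count and threshold interact:

# score(a, b) = (count(a, b) - min_count) / (count(a) * count(b)) * vocab_size
def phrase_score(count_ab, count_a, count_b, vocab_size, min_count=6):
    return (count_ab - min_count) / (count_a * count_b) * vocab_size

# With threshold=10, this hypothetical pair would be accepted as a phrase (~123 > 10)
print(phrase_score(count_ab=80, count_a=120, count_b=150, vocab_size=30000))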

In [31]:
# Phrases object requires "iterable of list of str"
opinions_string = " ".join(df['text'])
opinion_sentences = opinions_string.split('. ')
opinion_sentences[:5]
Out[31]:
['a gritty look at new york city and dock workers',
 'this is a classic film realistic brutal at times always believable',
 'it was originally shown live on tv also starring sidney poitier',
 'john cassavetes was a fantastic director and actor',
 'i doubt whoever wrote this screenplay has ever actually read mansfield park']
In [32]:
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in opinion_sentences]
In [33]:
%%capture
!pip install gensim
In [34]:
from gensim.models.phrases import Phraser
from gensim.models import Phrases

# min_count: ignores all words with a lower collected count
# threshold: "A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold" "(higher means fewer phrases)"
phrase_model  = Phrases(tokenized_sentences, min_count=6, threshold=10, delimiter='_')
phraser = Phraser(phrase_model)

# Apply & filter
opinion_phrases_no_stopwords = []
for sentence in tokenized_sentences:
    phrases_in_sentence = phraser[sentence]
    for phrase in phrases_in_sentence:
        # If it is an n-gram, ensure that it does not begin or end with a stopword
        if '_' in phrase:
            parts = phrase.split('_')
            if phrase not in stopwords and parts[0] not in stopwords and parts[-1] not in stopwords:
                opinion_phrases_no_stopwords.append(phrase)
        # or a stopword itself
        else:
            if phrase not in stopwords:
                opinion_phrases_no_stopwords.append(phrase)

opinion_phrases_no_stopwords[:20]
Out[34]:
['gritty',
 'new_york',
 'city',
 'dock',
 'workers',
 'classic',
 'film',
 'realistic',
 'brutal',
 'always',
 'believable',
 'originally',
 'shown',
 'live',
 'tv',
 'also_starring',
 'sidney_poitier',
 'john_cassavetes',
 'fantastic',
 'director']
In [35]:
# Fix spaces, just in case
opinion_phrases_stripped_no_stopwords = [c.strip() for c in opinion_phrases_no_stopwords]
opinion_phrases_stripped_no_stopwords[:10]
Out[35]:
['gritty',
 'new_york',
 'city',
 'dock',
 'workers',
 'classic',
 'film',
 'realistic',
 'brutal',
 'always']

With the results obtained from Gensim, the detected phrases must be inserted back into the sentences as single tokens in order to work with them.

In [36]:
import gensim

# Insert the phrases detected as collocations into the original review
collocation_phrases = {phrase for phrase in opinion_phrases_stripped_no_stopwords if '_' in phrase}

def transform_sentence(sentence, collocation_phrases):
    words = word_tokenize(sentence)
    for i in range(len(words) - 1):
        bigram = f"{words[i]} {words[i + 1]}"
        bigram_underscore = f"{words[i]}_{words[i + 1]}"
        if bigram_underscore in collocation_phrases:
            sentence = re.sub(rf"\b{re.escape(bigram)}\b", bigram_underscore, sentence)  # escape tokens that contain regex metacharacters
    return sentence

opinion_sentences_transformed = [transform_sentence(os, collocation_phrases) for os in opinion_sentences]
for sentence in opinion_sentences_transformed:
    if "kung_fu" in sentence:
        print("Test -> Passed")
        break
Test -> Passed

Before generating any model, it is usually advisable to lemmatize text. This reduces words to their basic or dictionary form (lemma), which helps to unify variations of the same word, improving text analysis by reducing noise and making token comparison more meaningful. For example, "running", "ran", and "runs" are all reduced to "run".
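
A quick sanity check (a minimal sketch) shows why passing the PoS tag matters: WordNet's lemmatizer assumes nouns by default, so verbs are only reduced correctly when tagged as such.

from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
print(lem.lemmatize("running"))           # 'running' (treated as a noun)
print(lem.lemmatize("running", pos="v"))  # 'run'
print(lem.lemmatize("ran", pos="v"))      # 'run'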

In [37]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize

lemmatizer = WordNetLemmatizer()

# Map NLTK PoS tags to WordNet PoS tags
def map_pos_to_wordnet(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def lemmatize_token(token, pos):
    # If it's a collocation, lemmatize it without a PoS tag (WordNet defaults to noun)
    if '_' in token:
        return lemmatizer.lemmatize(token)
    return lemmatizer.lemmatize(token, pos=map_pos_to_wordnet(pos))

# Filter out stopwords and phrases starting/ending with stopwords
def is_valid_token(token, stopwords):
    if '_' in token:
        parts = token.split('_')
        return parts[0] not in stopwords and parts[-1] not in stopwords
    return token not in stopwords

# Process each sentence: tokenize, POS tag, lemmatize and filter
lemmatized_sentences = []
for sentence in opinion_sentences_transformed:
    tokens = word_tokenize(sentence)
    pos_tags = pos_tag(tokens)
    lemmatized = [
        lemmatize_token(token.lower(), pos)
        for token, pos in pos_tags
        if is_valid_token(token.lower(), stopwords)
    ]
    if lemmatized:
        lemmatized_sentences.append(lemmatized)

all_lemmatized_tokens = [token for sentence in lemmatized_sentences for token in sentence]
print(all_lemmatized_tokens[:10])
['gritty', 'look', 'new_york', 'city', 'dock', 'worker', 'classic', 'film', 'realistic', 'brutal']
In [38]:
print("Lemmatized sentences")
print(lemmatized_sentences[1])
print(lemmatized_sentences[2], "\n")
print("Comparation vs 'original'")
print(opinion_sentences_transformed[1])
print(opinion_sentences_transformed[2])
Lemmatized sentences
['classic', 'film', 'realistic', 'brutal', 'time', 'always', 'believable']
['originally', 'show', 'live', 'tv', 'also_starring', 'sidney_poitier'] 

Comparison vs 'original'
this is a classic film realistic brutal at times always believable
it was originally shown live on tv also_starring sidney_poitier

Furthermore, given how the Word2Vec algorithm works (it relies on surrounding context), sentences with very few words can be eliminated.

In [39]:
lemmatized_sentences = [sentence for sentence in lemmatized_sentences if len(sentence) > 2]

Finally, ready to get started!

3. Knowledge extraction models¶

3.1 Relationships between concepts/words¶

The following model uses the CBOW architecture. During training, a context window of up to 5 words on each side is considered for each term. Based on word co-occurrence, the network's weights are adjusted so that each word ends up with a vector that captures semantic and syntactic patterns.

In [40]:
from gensim.models import Word2Vec

w2v_opinions = gensim.models.Word2Vec(
    # Source: https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
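    sg=0, # CBOW architecture (gensim's default; sg=1 would select skip-gram)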
    vector_size=150, # "Dimensionality of the word vectors"
    window=5, # "Maximum distance between the current and predicted word within a sentence"
    min_count=5, # "Ignores all words with total frequency lower than this"
    workers=4,
    seed=1
)
w2v_opinions.build_vocab(lemmatized_sentences)
w2v_opinions.train(lemmatized_sentences, total_examples=len(lemmatized_sentences), epochs=15)
print(w2v_opinions.corpus_count)
89269

It is now possible to use this model to find the terms that are semantically closest to a given term. The following function uses cosine similarity between word vectors to rank the closest matches, which is useful for exploring related concepts in the data.

In [41]:
def most_similar_terms(target_term, model=w2v_opinions, topn=10):
  return pd.DataFrame(model.wv.most_similar(target_term, topn=topn), columns=["term", 'score'])

most_similar_terms("camera")
Out[41]:
term score
0 zoom 0.722666
1 hand_held 0.705170
2 shot 0.674524
3 frame 0.620062
4 randomly 0.611334
5 movement 0.610686
6 shaky 0.606696
7 slow_motion 0.600896
8 speed 0.599681
9 prop 0.597222
In [42]:
most_similar_terms("amazing")
Out[42]:
term score
0 outstanding 0.794955
1 brilliant 0.771096
2 fantastic 0.749010
3 incredible 0.746010
4 remarkable 0.735514
5 terrific 0.733990
6 magnificent 0.726378
7 exceptional 0.713707
8 fabulous 0.711457
9 wonderful 0.702281

In addition, let's examine how the semantic relationship between two terms compares according to WordNet and Word2Vec.

In [43]:
from nltk.corpus import wordnet as wn

def wu_palmer_similarity(term1, term2):
    synsets1 = wn.synsets(term1, pos=wn.NOUN)
    synsets2 = wn.synsets(term2, pos=wn.NOUN)
    synset1 = synsets1[0]
    synset2 = synsets2[0]
    similarity = synset1.wup_similarity(synset2)
    return similarity

term1 = "camera"
term2 = "shot"

wn_similarity = wu_palmer_similarity(term1, term2)
print(f"Wordnet (first try): {term1} & {term2}: {wn_similarity:.4f}\n")

w2v_similarity = w2v_opinions.wv.similarity(term1, term2)
print(f"word2vec: {term1} & {term2}: {w2v_similarity:.4f}")
Wordnet (first try): camera & shot: 0.1250

word2vec: camera & shot: 0.6745

It should be noted that the Wu-Palmer similarity measures the distance between concepts in WordNet's predefined semantic hierarchy. In this example, the low score is due to the first (default) synset chosen for "shot", which does not match the intended film-related meaning.

Although the Wu-Palmer score can be maximized by searching over all synset pairs, there is no guarantee that the senses found are the ones actually intended in the corpus. Still, it could be useful for finding concepts, not just words (such as "camera shot"), if these tend to appear together in the corpus.
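
Such an exhaustive search is straightforward (a sketch over all noun synset pairs; wup_similarity may return None for unrelated pairs, hence the "or 0"):

from nltk.corpus import wordnet as wn

# Find the sense pair of "camera" and "shot" with the highest Wu-Palmer score
best = max(
    ((s1.wup_similarity(s2) or 0), s1.name(), s2.name())
    for s1 in wn.synsets("camera", pos=wn.NOUN)
    for s2 in wn.synsets("shot", pos=wn.NOUN)
)
print(best)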

On the other hand, Word2Vec uses cosine similarity between vectors learned from the specific corpus, making it more useful here for capturing relationships that reflect the actual language usage in our dataset.
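
To make "cosine similarity" concrete, the score above can be reproduced by hand from the learned vectors (a minimal sketch):

import numpy as np

v1, v2 = w2v_opinions.wv['camera'], w2v_opinions.wv['shot']
cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(round(float(cosine), 4))  # matches w2v_opinions.wv.similarity('camera', 'shot')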

3.2 Discovering general topics (LDA models)¶

For this purpose, only nouns and collocations will be used, assuming that these carry the core of the topics.

In [44]:
lem = WordNetLemmatizer()

def get_noun_and_collocation(sentence):
   nouns_and_collocations = []
   noun_tags = ['NN', 'NNS']
   tokens_pos_tagged = pos_tag(word_tokenize(sentence))
   for tpos in tokens_pos_tagged:
       lemma = lem.lemmatize(tpos[0]).lower()
       if '_' in lemma:
           nouns_and_collocations.append(lemma)
       elif tpos[1] in noun_tags and tpos[0] not in stopwords:
           nouns_and_collocations.append(lemma)
   return nouns_and_collocations

# List of lists (required format), avoiding empty or one-word opinions
candidates = (get_noun_and_collocation(opinion) for opinion in opinion_sentences_transformed)
noun_and_collocation_stream = [x for x in candidates if len(x) > 1]
In [45]:
import gensim.corpora as corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

dictionary = corpora.Dictionary(noun_and_collocation_stream)
print("Global dictionary | Unique tokens (NOT filtered):", len(dictionary))
# Filter tokens that appear in fewer than 3 documents or in more than 90% of them
dictionary.filter_extremes(no_below = 3, no_above= .9)
print("Global dictionary | Unique tokens (-filtered-):", len(dictionary))
corpus = [dictionary.doc2bow(text) for text in noun_and_collocation_stream]

# Keep only the 3000 most frequent tokens (excluding those in more than 80% of documents)
most_freq_dictionary = corpora.Dictionary(noun_and_collocation_stream)
most_freq_dictionary.filter_extremes(no_above = 0.8, keep_n=3000)
print("\nMost frequent dictionary | Unique tokens:", len(most_freq_dictionary))
reduced_corpus = [most_freq_dictionary.doc2bow(text) for text in noun_and_collocation_stream]
Global dictionary | Unique tokens (NOT filtered): 28848
Global dictionary | Unique tokens (-filtered-): 12699

Most frequent dictionary | Unique tokens: 3000
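To make the bag-of-words format concrete: doc2bow turns each review's token list into sparse (token_id, count) pairs, which is exactly what LDA consumes.

example = noun_and_collocation_stream[0]
print(example[:5])                      # first nouns/collocations of the first review
print(dictionary.doc2bow(example)[:5])  # e.g. [(0, 1), (1, 1), (2, 2), ...]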

Now the LDA model can be trained. This probabilistic topic algorithm assumes that each document is a mixture of topics and each topic is a probability distribution over words. During training, LDA iteratively adjusts the topic assignment of each word to maximize the probability of the observed data, estimating both the distribution of topics per document and the distribution of words per topic.

Please note that, given the corpus size and the available computing capabilities (Colab), it is not feasible to run many experiments, since each run takes several minutes. Let's try two topic counts for each corpus/dictionary combination.

In [46]:
# If possible, increase 'passes' (full sweeps through the corpus) during training
def lda_model(corpus, dictionary, num_topics, passes=70, random_state=1):
    return LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        passes=passes
    )

lda10 = lda_model(corpus, dictionary, 10)
lda10.print_topics(num_topics=10, num_words=7)
Out[46]:
[(0,
  '0.035*"film" + 0.033*"woman" + 0.028*"year" + 0.020*"dvd" + 0.015*"song" + 0.013*"today" + 0.012*"job"'),
 (1,
  '0.338*"movie" + 0.022*"thing" + 0.021*"time" + 0.016*"people" + 0.015*"anyone" + 0.015*"line" + 0.012*"something"'),
 (2,
  '0.137*"film" + 0.027*"guy" + 0.026*"story" + 0.025*"director" + 0.018*"love" + 0.017*"way" + 0.016*"man"'),
 (3,
  '0.052*"film" + 0.048*"time" + 0.030*"kind" + 0.027*"music" + 0.018*"look" + 0.018*"course" + 0.018*"comedy"'),
 (4,
  '0.029*"book" + 0.028*"minute" + 0.023*"fact" + 0.019*"piece" + 0.018*"night" + 0.017*"part" + 0.016*"dialogue"'),
 (5,
  '0.061*"lot" + 0.036*"film" + 0.023*"problem" + 0.017*"production" + 0.014*"cast" + 0.013*"tv" + 0.013*"play"'),
 (6,
  '0.091*"character" + 0.043*"show" + 0.031*"one" + 0.029*"scene" + 0.027*"life" + 0.018*"family" + 0.017*"episode"'),
 (7,
  '0.037*"people" + 0.033*"end" + 0.025*"kid" + 0.024*"friend" + 0.023*"world" + 0.022*"girl" + 0.020*"child"'),
 (8,
  '0.098*"movie" + 0.052*"film" + 0.030*"nothing" + 0.026*"fan" + 0.022*"reason" + 0.022*"action" + 0.019*"horror"'),
 (9,
  '0.057*"actor" + 0.036*"film" + 0.032*"performance" + 0.031*"acting" + 0.027*"role" + 0.026*"script" + 0.024*"plot"')]
In [47]:
lda20 = lda_model(corpus, dictionary, 20)
lda20.print_topics(num_topics=15, num_words=7)
Out[47]:
[(14,
  '0.269*"time" + 0.095*"movie" + 0.028*"laugh" + 0.024*"others" + 0.021*"even_though" + 0.019*"gore" + 0.014*"dream"'),
 (7,
  '0.195*"people" + 0.047*"someone" + 0.044*"sense" + 0.041*"viewer" + 0.039*"person" + 0.031*"mother" + 0.031*"home"'),
 (8,
  '0.153*"way" + 0.132*"plot" + 0.116*"movie" + 0.074*"nothing" + 0.044*"ever_seen" + 0.025*"heart" + 0.020*"set"'),
 (16,
  '0.128*"woman" + 0.066*"course" + 0.057*"place" + 0.040*"quality" + 0.028*"sequence" + 0.028*"romance" + 0.026*"b"'),
 (5,
  '0.115*"lot" + 0.099*"movie" + 0.061*"reason" + 0.054*"horror" + 0.048*"star" + 0.043*"problem" + 0.032*"flick"'),
 (12,
  '0.104*"end" + 0.090*"guy" + 0.074*"world" + 0.059*"anything" + 0.037*"house" + 0.035*"picture" + 0.034*"boy"'),
 (11,
  '0.511*"film" + 0.038*"year" + 0.037*"director" + 0.024*"money" + 0.021*"song" + 0.020*"name" + 0.017*"job"'),
 (19,
  '0.198*"scene" + 0.104*"comedy" + 0.038*"sort" + 0.037*"style" + 0.035*"comment" + 0.029*"sex" + 0.028*"genre"'),
 (3,
  '0.102*"movie" + 0.061*"kind" + 0.059*"day" + 0.054*"music" + 0.045*"tv" + 0.032*"night" + 0.031*"eye"'),
 (0,
  '0.098*"fact" + 0.062*"joke" + 0.045*"rest" + 0.044*"father" + 0.036*"video" + 0.027*"town" + 0.025*"matter"'),
 (1,
  '0.439*"movie" + 0.092*"thing" + 0.039*"fan" + 0.034*"girl" + 0.028*"series" + 0.016*"game" + 0.014*"opinion"'),
 (2,
  '0.073*"idea" + 0.035*"brother" + 0.032*"camera" + 0.031*"hero" + 0.030*"voice" + 0.027*"john" + 0.027*"killer"'),
 (18,
  '0.140*"actor" + 0.083*"movie" + 0.077*"performance" + 0.059*"friend" + 0.050*"child" + 0.040*"audience" + 0.039*"hour"'),
 (6,
  '0.216*"character" + 0.101*"show" + 0.057*"family" + 0.049*"action" + 0.041*"line" + 0.041*"episode" + 0.034*"version"'),
 (15,
  '0.068*"point" + 0.049*"shot" + 0.046*"couple" + 0.040*"screen" + 0.032*"crap" + 0.030*"relationship" + 0.029*"thriller"')]
In [48]:
lda15_reduced = lda_model(reduced_corpus, most_freq_dictionary, 15)
lda15_reduced.print_topics(num_topics=15, num_words=7)
Out[48]:
[(0,
  '0.523*"film" + 0.034*"series" + 0.026*"point" + 0.024*"problem" + 0.018*"picture" + 0.017*"job" + 0.016*"heart"'),
 (1,
  '0.138*"life" + 0.058*"kind" + 0.057*"year" + 0.052*"work" + 0.051*"reason" + 0.049*"child" + 0.041*"dvd"'),
 (2,
  '0.202*"time" + 0.065*"man" + 0.047*"woman" + 0.041*"action" + 0.034*"episode" + 0.032*"star" + 0.031*"sense"'),
 (3,
  '0.118*"movie" + 0.053*"fan" + 0.044*"script" + 0.040*"money" + 0.040*"horror" + 0.038*"tv" + 0.037*"line"'),
 (4,
  '0.170*"character" + 0.106*"actor" + 0.080*"show" + 0.060*"end" + 0.054*"nothing" + 0.043*"love" + 0.036*"minute"'),
 (5,
  '0.149*"thing" + 0.047*"anything" + 0.040*"everything" + 0.033*"plot" + 0.028*"death" + 0.028*"production" + 0.028*"sort"'),
 (6,
  '0.216*"story" + 0.034*"viewer" + 0.032*"person" + 0.029*"night" + 0.029*"direction" + 0.027*"dialogue" + 0.026*"today"'),
 (7,
  '0.133*"people" + 0.125*"movie" + 0.063*"director" + 0.060*"one" + 0.055*"guy" + 0.033*"ever_seen" + 0.029*"course"'),
 (8,
  '0.149*"way" + 0.053*"cast" + 0.037*"look" + 0.029*"rest" + 0.027*"men" + 0.026*"boy" + 0.026*"head"'),
 (9,
  '0.062*"acting" + 0.061*"day" + 0.055*"girl" + 0.042*"moment" + 0.040*"everyone" + 0.028*"home" + 0.028*"war"'),
 (10,
  '0.064*"role" + 0.059*"kid" + 0.059*"family" + 0.051*"music" + 0.032*"wife" + 0.031*"song" + 0.030*"eye"'),
 (11,
  '0.538*"movie" + 0.048*"something" + 0.029*"bit" + 0.023*"anyone" + 0.013*"writer" + 0.012*"theater" + 0.012*"plot"'),
 (12,
  '0.097*"lot" + 0.091*"part" + 0.061*"fact" + 0.051*"book" + 0.048*"idea" + 0.041*"fun" + 0.041*"someone"'),
 (13,
  '0.135*"scene" + 0.074*"performance" + 0.071*"comedy" + 0.037*"hour" + 0.032*"shot" + 0.031*"piece" + 0.029*"mind"'),
 (14,
  '0.075*"world" + 0.052*"audience" + 0.041*"place" + 0.040*"hollywood" + 0.039*"word" + 0.039*"drama" + 0.035*"type"')]
In [49]:
lda30_reduced = lda_model(reduced_corpus, most_freq_dictionary, 30)
lda30_reduced.print_topics(num_topics=15, num_words=7)
Out[49]:
[(14,
  '0.093*"work" + 0.092*"music" + 0.074*"film" + 0.070*"moment" + 0.054*"place" + 0.053*"hollywood" + 0.045*"head"'),
 (17,
  '0.322*"people" + 0.087*"anything" + 0.061*"couple" + 0.043*"film" + 0.022*"sci_fi" + 0.021*"language" + 0.020*"conclusion"'),
 (7,
  '0.191*"guy" + 0.099*"version" + 0.067*"side" + 0.059*"theater" + 0.045*"level" + 0.038*"location" + 0.036*"god"'),
 (3,
  '0.086*"reason" + 0.082*"child" + 0.076*"money" + 0.068*"film" + 0.045*"special_effect" + 0.044*"writer" + 0.044*"men"'),
 (1,
  '0.216*"life" + 0.124*"man" + 0.079*"book" + 0.042*"death" + 0.031*"stuff" + 0.030*"age" + 0.027*"film"'),
 (21,
  '0.143*"one" + 0.085*"horror" + 0.061*"film" + 0.060*"song" + 0.053*"dialogue" + 0.052*"screen" + 0.051*"today"'),
 (4,
  '0.113*"nothing" + 0.096*"fact" + 0.074*"point" + 0.069*"tv" + 0.067*"film" + 0.059*"everyone" + 0.048*"night"'),
 (2,
  '0.091*"course" + 0.066*"type" + 0.058*"film" + 0.055*"car" + 0.054*"quality" + 0.051*"art" + 0.039*"much_better"'),
 (20,
  '0.160*"acting" + 0.126*"fan" + 0.047*"brother" + 0.047*"others" + 0.039*"low_budget" + 0.037*"gore" + 0.035*"film"'),
 (10,
  '0.246*"plot" + 0.059*"eye" + 0.050*"cinema" + 0.050*"job" + 0.044*"film" + 0.037*"set" + 0.037*"experience"'),
 (13,
  '0.111*"episode" + 0.058*"actress" + 0.056*"camera" + 0.052*"film" + 0.034*"team" + 0.032*"story_line" + 0.030*"period"'),
 (24,
  '0.114*"kid" + 0.107*"love" + 0.100*"girl" + 0.080*"ever_seen" + 0.072*"sense" + 0.059*"name" + 0.051*"mother"'),
 (9,
  '0.099*"day" + 0.097*"world" + 0.071*"fun" + 0.061*"viewer" + 0.056*"piece" + 0.055*"film" + 0.046*"sort"'),
 (28,
  '0.081*"house" + 0.066*"hand" + 0.063*"group" + 0.057*"situation" + 0.053*"school" + 0.049*"country" + 0.043*"emotion"'),
 (27,
  '0.078*"style" + 0.076*"case" + 0.075*"boy" + 0.073*"talent" + 0.058*"voice" + 0.056*"cinematography" + 0.034*"space"')]

Regarding the results, note that each value shown (only the top 7 words are displayed) is the word's probability within its topic, not a measure of its relationship to the other words in the group. Since this probability mass is spread over a large vocabulary, it is rare for any word to exceed a weight of 0.1 (10%).

It is clear that not all models return equally useful results. Future improvements may include:

  1. Removing low-information words such as thing, version, anyone, etc. (see the sketch after this list).
  2. Performing more iterations and exploring a wider range of hyperparameter combinations (dictionary filtering, number of topics, etc.).
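A minimal sketch of the first improvement (the word list below is illustrative, and a copy is used so the dictionaries above stay untouched):

import copy

# Drop hand-picked low-information nouns, then rebuild the corpus
low_info = ['thing', 'version', 'anyone', 'something', 'lot', 'nothing']
pruned = copy.deepcopy(dictionary)
pruned.filter_tokens(bad_ids=[pruned.token2id[t] for t in low_info if t in pruned.token2id])
pruned_corpus = [pruned.doc2bow(text) for text in noun_and_collocation_stream]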

In any case, let's try to identify the best model using two metrics:

  • Perplexity: lower values indicate better generalization (gensim's log_perplexity reports a per-word log-likelihood bound, so values closer to zero correspond to lower perplexity).
  • Coherence: higher values indicate that the generated topics are interpretable and distinct from each other.
In [50]:
def coherence_lda_model(model, texts, dictionary):
    # 'c_v' coherence scores topics by co-occurrence statistics of their top words
    coherence_model = CoherenceModel(
        model=model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()

print("Model with 10 Topics")
print("Coherence:", coherence_lda_model(lda10, noun_and_collocation_stream, dictionary))
print("Perplexity:", lda10.log_perplexity(corpus))
print("\nModel with 20 Topics")
print("Coherence:", coherence_lda_model(lda20, noun_and_collocation_stream, dictionary))
print("Perplexity:", lda20.log_perplexity(corpus))
print("\nModel (Reduced) with 15 Topics")
print("Coherence:", coherence_lda_model(lda15_reduced, noun_and_collocation_stream, most_freq_dictionary))
print("Perplexity:", lda15_reduced.log_perplexity(reduced_corpus))
print("\nModel (Reduced) with 30 Topics")
print("Coherence:", coherence_lda_model(lda30_reduced, noun_and_collocation_stream, most_freq_dictionary))
print("Perplexity:", lda30_reduced.log_perplexity(reduced_corpus))
Model with 10 Topics
Coherence: 0.2405938157320638
Perplexity: -8.679757318807164

Model with 20 Topics
Coherence: 0.3300805148333529
Perplexity: -13.258408576218134

Model (Reduced) with 15 Topics
Coherence: 0.24212758381551266
Perplexity: -9.147166342383676

Model (Reduced) with 30 Topics
Coherence: 0.4113756058587291
Perplexity: -11.10965329760329

Based on the evaluation metrics, the 'Reduced Model with 30 Topics' is selected as the best option. It achieves by far the highest coherence score (0.411), indicating that its topics are more interpretable and distinct, while its perplexity bound (-11.11) sits in the middle of the observed range.
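As a quick sanity check of the "mixture of topics" assumption, the topic distribution the selected model infers for a single review can be inspected directly:

# Topic mixture inferred for the first review under the selected model
bow = most_freq_dictionary.doc2bow(noun_and_collocation_stream[0])
doc_topics = lda30_reduced.get_document_topics(bow, minimum_probability=0.05)
for topic_id, prob in sorted(doc_topics, key=lambda t: -t[1]):
    print(topic_id, round(float(prob), 3))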

Let's take a closer look at this model using pyLDAvis, which ranks terms by relevance —a combination of their weights in the topic and their exclusivity— resulting in some words differing from those shown by print_topics.
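For reference, Sievert & Shirley (2014) define relevance(w, t | λ) = λ·log p(w|t) + (1−λ)·log(p(w|t) / p(w)). A rough hand-rolled version for one topic (my p(w) approximation below is a plain average over topics, whereas pyLDAvis weights topics by their prevalence, so rankings may differ slightly):

import numpy as np

lam = 0.5
topic_term = lda30_reduced.get_topics()   # shape (num_topics, vocab_size); each row sums to 1
p_w = topic_term.mean(axis=0)             # crude marginal p(w)
relevance = lam * np.log(topic_term[0]) + (1 - lam) * np.log(topic_term[0] / p_w)
print([most_freq_dictionary[i] for i in np.argsort(relevance)[::-1][:10]])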

In [51]:
%%capture
!pip install pyLDAvis
In [52]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda30_reduced, reduced_corpus, most_freq_dictionary, R=15)
Out[52]:

Given the number of topics, some are clearly well-defined while others are less coherent or dominated by generic terms. For instance, with λ=0.50:

  • (15) Family context: kid, love, girl, mother, son, play and street suggest a clear focus on family and relationships.

  • (1) ?: movie, director, anyone, zombie, years_ago and others are harder to categorize.

Meanwhile, if only probability is used (previous print_topics), some can be highlighted such as:

  • (0) Production and special effects: effect, soundtrack, violence, animation and computer clearly point to the technical and production aspects of a movie.

  • (8) Film Appreciation: cast, masterpiece, hope, favorite, atmosphere and genius are clearly related to the appreciation of a film's cast, atmosphere, and overall quality.

  • (25) Script and character development: character, comedy, script, minute, line, main_character and dialogue strongly suggest a focus on the technical aspects of screenwriting, character creation and comedic elements.

4. Sentiment prediction: traditional vs. transformers¶

This section compares a traditional machine learning approach — Logistic Regression (LR) — with a modern Transformer-based model, DistilBERT.

Traditional models such as LR rely on bag-of-words or similar sparse representations, capturing word frequency but losing word order and broader context. In contrast, Transformers process the full text as a sequence, attending to relationships between words across the entire input. DistilBERT, with an encoder-only architecture, reads tokens in both directions and builds contextual embeddings, enabling the model to capture meaning beyond individual words.

4.1 Logistic Regression (yes, nowadays)¶

LR will be trained with preprocessed text from the first section (1.2 Data cleaning), including the removal of stop words, but without lemmatization. While lemmatization could improve performance by reducing morphological variability, preprocessing is intentionally kept minimal to allow a fair comparison with the Transformer.

Although traditional models have clear disadvantages (such as not capturing irony), they remain highly useful and sometimes even deliver better results. In this case, LR is used both to predict sentiment and to identify which words are most strongly associated with each sentiment class.

In [53]:
opinions = df['text'].tolist()
print(opinions[:2])
['a gritty look at new york city and dock workers. this is a classic film realistic brutal at times always believable. it was originally shown live on tv also starring sidney poitier. john cassavetes was a fantastic director and actor.', "i doubt whoever wrote this screenplay has ever actually read mansfield park. or if they have it was not very well. none of the characters are what they should be fanny is lively and conscious of her mistreatment while sir thomas who treated her very well seems to have accidentally fallen into aunt norris' personality. additionally a first person narrative by fanny is highly inappropriate to both the story and her character. fanny is not an entertaining heroine and i would contend that she is not meant to be. additionally in the movie version fanny flirts shamelessly with edmund from the very beginning when they have been raised as brother and sister austen's fanny would have shrank from flirtation of any sort and the novel paints the fanny edmund pairing as highly uncomfortable. as it should be. unlike some other jane austen novels p p emma mansfield park does not rest on the strength of its female protagonist. it is a very different sort of novel than the others it is not meant to be a love story. i watched this movie because i have just now finished reading mansfield park and i am absolutely horrified by what i see miss austen is rolling in her grave."]
In [54]:
labels = df['sentiment'].tolist()
labels = [0 if label == 'negative' else 1 for label in labels]
print(labels[:20])
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0]
In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Remove stopwords and noise
vectorizer = TfidfVectorizer(analyzer='word', min_df=2, stop_words=stopwords, max_df=0.95)
X = vectorizer.fit_transform(opinions)
M = X.toarray()  # dense copy for convenience; LogisticRegression also accepts the sparse X
In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import time

X_train, X_test, y_train, y_test = train_test_split(M, labels, train_size=0.8, random_state=1)
log_reg = LogisticRegression()

start_time = time.time()
log_reg.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time
print(f"Runtime: {training_time:.4f} seconds")
Runtime: 36.9630 seconds
In [57]:
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Report
print("\nClassification Report:\n===============")
y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC curve
probabilities = log_reg.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, probabilities[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], color='lightgray', lw=1.5, linestyle='--')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve - Logistic Regression', fontsize=14, pad=15)
plt.legend(loc="lower right", fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
Classification Report:
===============
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      1269
           1       0.87      0.89      0.88      1211

    accuracy                           0.88      2480
   macro avg       0.88      0.88      0.88      2480
weighted avg       0.88      0.88      0.88      2480

No description has been provided for this image

The LR model's predictions are thus quite accurate, with errors distributed evenly across both classes.

Now, given the internal functioning of the model (based on independent tokens), it is possible to check which words have the greatest influence on the classification.

In [58]:
# Explainability
class_labels = log_reg.classes_
feature_names = vectorizer.get_feature_names_out()
# Largest coefficients push towards class 1 (positive), smallest towards class 0 (negative)
coef_features = sorted(zip(log_reg.coef_[0], feature_names))
negative = coef_features[:15]
positive = coef_features[-15:]
print("MOST INFLUENTIAL\n")
print("--------Positive--------")
for coef, feat in positive:
    print(class_labels[1], coef.round(3), feat)
print("\n--------Negative--------")
for coef, feat in reversed(negative):
    print(class_labels[0], coef.round(3), feat)
MOST INFLUENTIAL

--------Positive--------
1 2.635 hilarious
1 2.796 superb
1 2.797 highly
1 2.8 today
1 2.929 well
1 2.994 perfect
1 3.119 love
1 3.148 loved
1 3.171 favorite
1 3.171 enjoyed
1 3.175 wonderful
1 3.245 amazing
1 3.951 best
1 4.603 excellent
1 6.504 great

--------Negative--------
0 -2.831 worse
0 -2.843 script
0 -2.908 horrible
0 -2.911 crap
0 -3.086 dull
0 -3.257 even
0 -3.313 minutes
0 -4.009 terrible
0 -4.063 poor
0 -4.614 nothing
0 -4.749 boring
0 -4.79 waste
0 -5.099 awful
0 -6.383 bad
0 -7.227 worst

Surprisingly, the words "script" and "minutes" carry weights as negative as "horrible" and "crap". These coefficients suggest that reviewers routinely invoke such concepts when criticizing films (a weak script, counting the minutes until the end).
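This interpretability extends to single predictions: because the model is linear, each word's contribution to a review's score is simply its TF-IDF value multiplied by its coefficient. A minimal sketch (the example sentence is made up):

import numpy as np

# Per-review explanation: contribution of word i = tfidf_i * coef_i
row = vectorizer.transform(["the script was terrible but the cast was great"]).toarray()[0]
contributions = row * log_reg.coef_[0]
for i in np.argsort(np.abs(contributions))[::-1][:5]:
    if contributions[i]:
        print(f"{feature_names[i]:>10} {contributions[i]:+.3f}")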

4.2 Transformers: BERT-based model¶

In this case, no additional preprocessing is applied beyond the cleaning steps described in Section 1.2. Stopwords are kept, and the most frequent words (e.g. top 0.05) are not removed. This choice is motivated by the fact that Transformer-based models operate directly on tokenized text and leverage self-attention to capture contextual relationships. As a result, traditional preprocessing steps like stopword removal or frequency-based filtering are less critical, since the model can learn to downweight irrelevant or ubiquitous tokens during training.

In [59]:
%%capture
!pip install transformers datasets torch scikit-learn accelerate
In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import AutoTokenizer

# Prepare dataset with stratified partitioning
labels = df['sentiment'].tolist()
df['label'] = [0 if label == 'negative' else 1 for label in labels]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=1, stratify=df['label'])
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    # Pad/truncate every review to the model's maximum input length (512 tokens)
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)
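A side note on the tokenizer just loaded: WordPiece decomposes rare or unseen words into known sub-pieces rather than mapping them to an unknown token, which is the property behind the vocabulary-handling advantage discussed in the comparison later (exact split illustrative):

print(tokenizer.tokenize("unwatchable"))  # e.g. ['un', '##watch', '##able']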
In [61]:
%%capture
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load Model & Define the training process
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    report = classification_report(labels, predictions, output_dict=True)
    return {"accuracy": report["accuracy"], "f1": report["macro avg"]["f1-score"]}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics,
)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [ ]:
trainer.train() # From now on, GPU is required
[1860/1860 27:56, Epoch 3/3]
Epoch Training Loss Validation Loss Accuracy F1
1 0.369000 0.225610 0.905645 0.905409
2 0.187200 0.207101 0.925806 0.925802
3 0.075100 0.310638 0.924194 0.924193

Out[ ]:
TrainOutput(global_step=1860, training_loss=0.21042110073950984, metrics={'train_runtime': 1679.1105, 'train_samples_per_second': 17.717, 'train_steps_per_second': 1.108, 'total_flos': 3940640175218688.0, 'train_loss': 0.21042110073950984, 'epoch': 3.0})
In [ ]:
from google.colab import drive
drive.mount('/content/drive')

# Save best model (epoch 2) & tokenizer
model.save_pretrained('/content/drive/MyDrive/distilbert_sentiment')
tokenizer.save_pretrained('/content/drive/MyDrive/distilbert_sentiment')
Mounted at /content/drive
Out[ ]:
('/content/drive/MyDrive/distilbert_sentiment/tokenizer_config.json',
 '/content/drive/MyDrive/distilbert_sentiment/special_tokens_map.json',
 '/content/drive/MyDrive/distilbert_sentiment/vocab.txt',
 '/content/drive/MyDrive/distilbert_sentiment/added_tokens.json',
 '/content/drive/MyDrive/distilbert_sentiment/tokenizer.json')
In [ ]:
from google.colab import drive
# Execute in a new session
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer

# Load trained model
model_path = "/content/drive/MyDrive/distilbert_sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=64,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_tokenized
)
In [ ]:
from scipy.special import softmax
from sklearn.metrics import classification_report, roc_curve, auc

# Predict
predictions_output = trainer.predict(test_tokenized)
predicted_labels = np.argmax(predictions_output.predictions, axis=1)
true_labels = predictions_output.label_ids
probabilities = softmax(predictions_output.predictions, axis=1)
positive_scores = probabilities[:, 1]

# Report
print("Classification Report:\n===============")
class_names = ['negative', 'positive']
print(classification_report(true_labels, predicted_labels, target_names=class_names))

# AUC
fpr, tpr, thresholds = roc_curve(true_labels, positive_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], color='lightgray', lw=1.5, linestyle='--')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve - Transformer Model (DistilBERT)', fontsize=14, pad=15)
plt.legend(loc="lower right", fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
Classification Report:
===============
              precision    recall  f1-score   support

    negative       0.93      0.92      0.93      1237
    positive       0.92      0.93      0.93      1243

    accuracy                           0.93      2480
   macro avg       0.93      0.93      0.93      2480
weighted avg       0.93      0.93      0.93      2480

No description has been provided for this image

4.3 Model Comparison: LR vs. DistilBERT¶

In [ ]:
test_review = "The movie had a great story and fantastic setting, but the acting ruined it."

# LR
start_time = time.time()
test_vectorized = vectorizer.transform([test_review])
log_reg_pred = log_reg.predict(test_vectorized)
log_reg_prob = log_reg.predict_proba(test_vectorized)
end_time = time.time()
log_reg_inference_time = end_time - start_time

print("Logistic Regression Prediction:", "positive" if log_reg_pred[0] == 1 else "negative")
print("Probabilities:", log_reg_prob)
print(f"Inference time: {log_reg_inference_time:.6f} seconds\n")

# DistilBERT
from datasets import Dataset
test_dataset_single = Dataset.from_dict({"text": [test_review]})
test_tokenized_single = test_dataset_single.map(tokenize_function)

start_time = time.time()
predictions_output = trainer.predict(test_tokenized_single)
probs = softmax(predictions_output.predictions, axis=1)
bert_pred = np.argmax(probs, axis=1)[0]
end_time = time.time()
bert_inference_time = end_time - start_time

print("DistilBERT Prediction:", "positive" if bert_pred == 1 else "negative")
print("Probabilities:", probs)
print(f"Inference time: {bert_inference_time:.6f} seconds")
Logistic Regression Prediction: positive
Probabilities: [[0.1140709 0.8859291]]
Inference time: 0.010610 seconds

DistilBERT Prediction: negative
Probabilities: [[0.97370726 0.02629277]]
Inference time: 0.050776 seconds

The single review above already illustrates the difference: LR, scoring independent TF-IDF terms, is swayed by the positive words ("great", "fantastic") and predicts positive, while DistilBERT picks up the contrastive "but the acting ruined it" and confidently predicts negative.

Across the full test set, the results show a consistent advantage for the transformer-based model (DistilBERT) over the LR baseline on all evaluation metrics:

  • LR model: Accuracy = 0.88, F1 = 0.88, AUC = 0.95
  • DistilBERT: Accuracy = 0.93, F1 = 0.93, AUC = 0.98

These values indicate a small but clear improvement for DistilBERT, while maintaining balanced performance across both classes.

Practical Considerations¶

  • Generalization and Vocabulary Handling: DistilBERT shows superior generalization due to its subword tokenizer, which can process words not seen during training by breaking them into known sub-tokens. Conversely, the LR model relies on a fixed vocabulary from the training data. While it won't fail on new words, it simply ignores them, leading to a potential loss of important information and making it less robust.

  • Computational Efficiency: There is a distinct trade-off in computational cost.

    • Training: LR is highly efficient, training in well under a minute here (~37 s), whereas the far larger DistilBERT required roughly 28 minutes on a Colab GPU.
    • Inference: For single predictions on a CPU, LR is faster (~0.01 s) because it reduces to a sparse vector-matrix multiplication. DistilBERT requires a full forward pass through its transformer layers, taking ~0.05 s here. Transformer inference can be heavily optimized with specialized hardware (GPUs) and batch processing (see the sketch after this list), but LR holds a distinct advantage in low-latency applications on standard CPUs.
  • Model Complexity: LR is a simple and transparent model, making it easier to interpret and cheaper to deploy in low-resource environments. DistilBERT is more powerful but a complex "black box" that demands significantly more computational resources for deployment and maintenance.
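A minimal sketch of the batched-inference point (reusing trainer and tokenize_function from above; the batch size is illustrative):

from datasets import Dataset
import time

# Batching amortizes per-call transformer overhead across many reviews
batch = Dataset.from_dict({"text": [test_review] * 64}).map(tokenize_function, batched=True)
start = time.time()
trainer.predict(batch)
print(f"{(time.time() - start) / 64:.6f} s per review within a batch of 64")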

Conclusion¶

DistilBERT delivers superior predictive performance and more robust generalization, thanks to its ability to understand context and handle novel vocabulary. This advantage comes at the cost of significantly higher training time and greater computational requirements.

The LR model remains a strong baseline, proving to be a practical alternative in scenarios where limited resources, rapid training cycles, or minimal-latency CPU inference are prioritized. The final choice between the models depends on the specific balance required between state-of-the-art accuracy and the operational constraints of the project.