Key Word(s): NLP
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
np.random.seed(0)
Movie Review Classifier 🍿📽️
In this exercise we'll be training a model to classify movie reviews as 'good' or 'bad.' The data consists of 50,000 real movie reviews from IMDB. Obligatory Disclaimer: This is real-world data and so it's possible that it contains language or topics that some may find offensive. 🙈
We'll load the data, which is hosted on the course Github repo as a zipped csv (it's too big to upload to Ed). Notice that pd.read_csv() can take a URL as the path argument, and that we can read in a compressed file without first expanding it if we specify the compression format!
data_url = 'https://github.com/Harvard-IACS/2021-CS109A/raw/master/content/lectures/lecture23/data/movie_reviews.zip'
df = pd.read_csv(data_url, compression='zip')
df.head()
df.shape
df.label.unique()
We see that the dataset consists of text reviews and binary labels. Intuitively, the positive class is "good" while the negative is "bad."
Here are two examples from the dataset:
labels = {0: 'bad', 1: 'good'}
seen = {'bad': False, 'good': False}
for i in range(df.shape[0]):
    label = df.loc[i, 'label']
    if not seen[labels[label]]:
        # display/print combination used to appease Ed's strange output behavior
        display(df.loc[i, 'text'])
        print()
        display(f"label: {labels[label]}")
        print()
        seen[labels[label]] = True
    if all(seen.values()):
        break
Some Preprocessing
In the 2nd example, we can see some html tags inside the review text.
Complete the remove_br() function by providing its call to re.sub() with a regex that removes those pesky "<br />" tags from an input string, x. Specifically, we should replace 2 consecutive occurrences of "<br />" with a single space (can you see why?).
Hint: It is good practice to use a 'raw' string when writing regular expressions to ensure that special characters are treated correctly. Raw strings are prefixed with an 'r' like this: r'this is a raw string'
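If re.sub() is new to you, here's a tiny, self-contained illustration (deliberately not the exercise answer) of substituting a raw-string pattern:
import re
# Illustrative only: collapse runs of 2 or more '!' characters into one.
# The raw string r'!{2,}' means "two or more exclamation marks".
print(re.sub(r'!{2,}', '!', 'Great movie!!! Loved it!!'))  # Great movie! Loved it!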
### edTest(test_remove_br) ###
# fill in the regular expression
remove_br = lambda x: re.sub(___, ' ', x)
Use the dataframe's apply() method to apply remove_br to each review in the dataset.
df['text'] = df.text.apply(___)
And we can see that the tags have been removed!
df.loc[4,'text']
Don't worry about any newline characters or backslashes you may see before apostrophes in the examples above. This is just a quirk of how Jupyter displays strings by default. We don't see these characters if we explicitly print the string.
example_str = df.loc[4,'text']
print(example_str)
Next we'll continue our preprocessing by removing punctuation. But first, let's keep a copy of the data with punctuation. This will be useful at the end of the notebook when we want to display the original text of specific observations.
# store copy of data with punctuation
df_raw = df.copy()
The next regex we need is a bit more involved. It should match any character that is neither whitespace nor alphanumeric, as well as underscores (strangely, underscores are not covered by the first 2 conditions).
Hints:
- \w matches alphanumeric characters
- \s matches whitespace
- [] can be used to denote a set of characters. ex: r'[ab]' will match on 'a' or 'b'
- ^ at the beginning of a character set denotes negation. ex: r'[^0-9]' will match any non-digit
- | is the logical or operator. ex: r'cat|dog' will match the strings 'cat' or 'dog'
- There are many helpful sites online for testing regexes.
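To see how a negated character set behaves, here's a small illustration (not the exercise answer):
import re
# Illustrative only: r'[^a-z ]' matches any character that is NOT a
# lowercase letter or a space, so substituting '' strips everything else
print(re.sub(r'[^a-z ]', '', 'Hello, World! 123'))  # 'ello orld '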
### edTest(test_punc_regex) ###
# create a regex that will match the characters described above
punc_regex = ___
Here we'll use an alternative to the apply approach we saw above. Pandas has its own set of built-in string methods, which includes a version of replace. But unlike Python's str.replace(), this one can actually use regexes!
df['text'] = df.text.str.replace(punc_regex, '', regex=True) # remove punctuation
If all went well we can see that punctuation has been removed from our dataset.
example_str = df.loc[4,'text']
print(example_str)
Train/Test Split
Rather than splitting the data directly with train_test_split, we'll instead use it to generate indices for the train and test data. This may seem strange, but there is a good reason for it: these indices will later allow us to recover the original, unprocessed text from df_raw for any given training or test observation.
Notice too that we are stratifying on the label. This will help ensure that good and bad reviews appear in the same proportions in both train and test.
# generate indices to designate train and test observations
train_idx, test_idx = train_test_split(range(df.shape[0]), test_size=0.2, random_state=0, stratify=df['label'])
# Separate the predictor from the response
x = df.text.values
y = df.label.values
# Create train and test sets using the generated indices
x_train = x[train_idx]
y_train = y[train_idx]
x_test = x[test_idx]
y_test = y[test_idx]
Building the Classifier Pipeline
Step 1: Vectorizer
It's true that there are still several preprocessing steps to be done, such as converting to lowercase and tokenizing the reviews, but these can be done for us using sklearn's TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
Instantiate a TfidfVectorizer
with parameters such that it will:
- set all reviews to lowercase
- remove english stopwords
- exclude words that occur in fewer than 1 review in 10,000
- exclude words that occur in more than 90% of reviews
Hint: Reading the documentation, you'll see the arguments you need are lowercase, stop_words, min_df, and max_df.
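If the min_df and max_df semantics feel fuzzy, here's a tiny illustration on made-up documents (the values below are arbitrary, not the exercise answer). An int is an absolute document count, while a float is a proportion of documents.
# Illustrative only: min_df=2 drops words appearing in fewer than 2 documents;
# max_df=0.75 drops words appearing in more than 75% of documents
demo_docs = ['red apple', 'red banana', 'green banana', 'green grape']
demo_vec = TfidfVectorizer(min_df=2, max_df=0.75)
demo_vec.fit(demo_docs)
print(demo_vec.get_feature_names())  # ['banana', 'green', 'red']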
### edTest(test_tfidf) ###
vec = TfidfVectorizer(___)
Step 2: Classifier
We'll use logistic regression with l2 regularization as our classifier model. The LogisticRegressionCV object allows us to easily tune for the best regularization parameter.
from sklearn.linear_model import LogisticRegressionCV
With 40,000 training observations and each word in the vectorizer's vocabulary acting as a predictor, training could be slow. This issue is exacerbated when using cross-validation, as we need to fit the model multiple times! We'll set our classifier's CV parameters so as to help keep the training time down to around 30 seconds or so.
- l2 penalty (i.e., Ridge)
- 10 iterations per fit (remember, logistic regression has no closed form solution for the betas!)
- 5-fold CV
- random state of 0 (the fitting can be stochastic)
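If you haven't used LogisticRegressionCV before, here's a minimal sketch of the API on a toy dataset (the argument values are placeholders for illustration, not the exercise answer):
# Illustrative only: LogisticRegressionCV tunes the regularization strength C
# internally via cross-validation; C_ stores the winning value per class
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV
X_demo, y_demo = make_classification(n_samples=200, random_state=0)
demo_clf = LogisticRegressionCV(cv=3, max_iter=100, random_state=0)
demo_clf.fit(X_demo, y_demo)
print(demo_clf.C_)  # best C found by cross-validation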
### edTest(test_clf) ###
# Instantiate our Classifier
clf = LogisticRegressionCV(___)
Step 3: Pipeline
Any text data going into our classifier will have to first be converted to numerical data by our vectorizer. One way to do this would be to:
1. fit the vectorizer on the training data
2. transform a dataset with the fitted vectorizer
3. pass the transformed data to the classifier

(1) only needs to be done once, but (2) & (3) would need to be done manually for train, test, and any other data we want to give the model. This would be tedious! Luckily, sklearn's Pipeline object allows us to connect one or more 'transformers' (such as a scaler or vectorizer) with a model.
from sklearn.pipeline import make_pipeline
Use make_pipeline() to connect the vectorizer, vec, and our classifier, clf, into a single pipeline.
Hint: You can set verbose=True to see the individual steps during the fit process later.
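To get a feel for what make_pipeline() produces, here's a small sketch with stand-in steps (a scaler and a plain logistic regression, purely for illustration):
# Illustrative only: make_pipeline chains transformers with a final estimator
# and auto-names each step after its lowercased class name
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
demo_pipe = make_pipeline(StandardScaler(), LogisticRegression())
print(demo_pipe.steps)  # [('standardscaler', ...), ('logisticregression', ...)]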
### edTest(test_pipeline) ###
# Construct the pipeline
pipe = make_pipeline(___)
Step 4: Fitting
When it comes to fitting, we can treat the pipeline object as if it were the classifier object itself,
and simply call fit
on the pipeline.
# For the sake of time, we are fitting quickly and we may not converge
# We'll suppress those pesky warnings
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# We also ignore FutureWarnings due to version issues on Ed
simplefilter("ignore", category=(ConvergenceWarning, FutureWarning))
### edTest(test_fit) ###
# Fit the model via the pipeline
pipe.___(___,___)
We can inspect the steps of the pipeline.
pipe.get_params()['steps']
By default the steps are named using the all-lowercase class name of each object. We can use these names to access the fitted objects inside. Here we see the size of our vectorizer's vocabulary.
features = pipe.get_params()['tfidfvectorizer'].get_feature_names()  # renamed get_feature_names_out() in newer sklearn versions
print('# of features:', len(features))
There are too many to print, but we can peek at a random sample.
sample_size = 40
feature_sample_idx = np.random.choice(len(features), size=sample_size, replace=False)
print(np.array(features)[feature_sample_idx])
Similarly, we can access the fitted logistic model and see what regularization parameter was used.
best_C = pipe.get_params()['logisticregressioncv'].C_[0]
print(f'Best C from cross-validation: {best_C:.4f}')
Step 5: Prediction
Just like we did when fitting, we can treat the pipeline object as the classifier when making predictions. Predict on the test data to get:
- class labels
- probabilities of being the positive class (i.e., 'good' reviews)
- test accuracy
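As a reminder of predict_proba's output shape, here's a standalone illustration with made-up numbers:
import numpy as np
# Illustrative only: predict_proba returns one column per class, ordered by
# the classifier's classes_ attribute; for 0/1 labels, column 1 is the
# positive class
demo_proba = np.array([[0.8, 0.2],
                       [0.1, 0.9]])  # shape (n_samples, n_classes)
print(demo_proba[:, 1])  # P(label 1) for each sample -> [0.2 0.9]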
### edTest(test_pred) ###
# Predict class labels on test data
y_pred = pipe.___(___)
# Predict probabilities of the positive on the test data
y_pred_proba = pipe.___(___)[___,___]
# Calculate test accuracy (there are several ways to do this)
test_acc = ___
print(f"test accuracy: {test_acc:0.3f}")
Can you get better than 0.896 by tweaking the preprocessing, or the vectorizer and classifier parameters? Perhaps inspecting how our model makes its predictions may help us decide how we might improve the model in the future.
Step 6: Interpretation
Below we'll use the eli5
library we saw in Model Interpretation Lab (#11) to have some fun
interpreting what is driving our model's predictions on specific test observations.
# For interpretation
import eli5
# for parsing/formatting eli5's HTML output
from bs4 import BeautifulSoup
# for displaying formatted HTML output
from IPython.display import HTML
Here are the words driving positive class predictions.
eli5.show_weights(clf, vec=vec, top=25)
Hmm, those digits like 710, 810, and 410 driving predictions seem strange. What might they represent?
We'll use the 'raw' data with punctuation when inspecting the data (See! It is coming in handy!)
x_train_raw = df_raw.text[train_idx].values
x_test_raw = df_raw.text[test_idx].values
df_raw[df.text.str.contains(' 710 ')].iloc[0].text
These are actually numerical ratings embedded in the reviews! Looking at the text without the punctuation made it hard for us to see this at first.
Here's a helper function used to remove some extraneous things from eli5's output. We just want to see the highlighted text. You don't need to read through the function, but it is here as a nice resource/example. 🤓
def eli5_html(clf, vec, observation):
    """
    helper function for nicely formatting and displaying eli5 output
    """
    # Get info on what is driving a given observation's predictions
    eli5_results = eli5.show_prediction(estimator=clf, doc=observation, vec=vec, targets=[True], target_names=['bad', 'good'])
    # Convert eli5's HTML data to a BS object for parsing/formatting
    soup = BeautifulSoup(eli5_results.data, 'html.parser')
    # Remove a table we don't want
    soup.table.decompose()
    # Remove the first <p> tag with unwanted text
    soup.p.decompose()
    # Display the newly formatted HTML!
    display(HTML(str(soup)))
Now all you need to do is find the specific observations requested. You'll need your y_pred_proba values for this section to find which elements from x_test_raw to select.
Hint: np.argsort(), np.flip(), and np.abs() may be useful here.
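In case those NumPy helpers are rusty, here's a tiny illustration of picking the k smallest and largest entries by index (made-up numbers):
import numpy as np
# Illustrative only: np.argsort returns indices ordered smallest to largest
demo_scores = np.array([0.9, 0.1, 0.5, 0.3])
order = np.argsort(demo_scores)
print(order[:2])           # indices of the 2 smallest -> [1 3]
print(np.flip(order)[:2])  # indices of the 2 largest  -> [0 2]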
What are the 5 worst movie reviews in the test set according to your model? 🍅
# Find indices of 5 worst reviews
worst5 = x_test_raw[___]
for i, review in enumerate(worst5):
    style = 'background-color:black;color:white;font-weight:bold;padding:4px'
    display(HTML(f"<p style={style}>Bad Movie #{i+1} 🍅</p>"))
    eli5_html(clf, vec, review)
What are the 5 best movie reviews in the test set according to your model? 🏆
# Find indices of 5 best reviews
best5 = x_test_raw[___]
for i, review in enumerate(best5):
    display(HTML(f"<p style={style}>Good Movie #{i+1} 🏆</p>"))
    eli5_html(clf, vec, review)
What are the 5 most 'meh' movie reviews in the test set according to your model? 😐
That is, which reviews are the most neutral according to your model? Upon reading some of these reviews you may find their sentiment is actually not very ambiguous. What might be confusing our model?
# Find indices of the 5 most neutral reviews
meh5 = x_test_raw[___]
for i, review in enumerate(meh5):
    display(HTML(f"<p style={style}>'Meh' Movie #{i+1} 😐</p>"))
    eli5_html(clf, vec, review)
Despite some difficulties with a few of the 'meh' movies, our model is actually pretty good! In fact, it works so well you can actually use it to find mistakes in the manually labeled data! This can be done by inspecting which training observation predictions differ the most from the provided labels. (But if you do decide to explore this, just remember the disclaimer at the top of the notebook!)
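If you're curious, here's a minimal sketch of that mislabel-hunting idea, reusing the fitted pipe and the training arrays from above (the top-5 cutoff is an arbitrary choice):
# Sketch only: flag training reviews whose predicted probability disagrees
# most with the provided label -- these are candidate labeling mistakes
train_proba = pipe.predict_proba(x_train)[:, 1]
disagreement = np.abs(train_proba - y_train)
suspect_idx = np.flip(np.argsort(disagreement))[:5]
for i in suspect_idx:
    print(f"label: {y_train[i]}, predicted P(good): {train_proba[i]:.3f}")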
Write your own review
Finally, you can try writing a review of your own and see what your model does with it!
my_review = """
your review here
"""
# Remove punctuation using your regex from earlier
my_review = re.sub(punc_regex, '', my_review)
# Remove leading & trailing whitespace
# and put into a numpy array (which the model expects)
my_review = np.array([my_review.strip()])
my_review
my_review_proba = pipe.predict_proba(my_review)[:,1][0]
my_review_label = pipe.predict(my_review)[0]
print('predicted class:', my_review_label)
print('predicted probability:', my_review_proba)
display(HTML(f"<p style={style}>My Review 🍿</p>"))
eli5_html(clf, vec, my_review[0])