When you have a large number of movie reviews, how can you tell whether they are compliments or criticisms? Since the dataset is large, you cannot annotate the reviews one by one; instead, you need natural language processing tools to classify the sentiment of the text. In Python, powerful packages like nltk and scikit-learn can help us do this text classification. In this project, I perform a sentiment analysis of the IMDb movie reviews in the UCI Machine Learning Repository's Sentiment Labelled Sentences Data Set.
In this dataset, there are 1000 movie reviews, including 500 positive (compliments) and 500 negative (criticisms). For example, 'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.' is marked as a negative review.
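Before any preprocessing, you can peek at the raw file with a sketch like the one below (it assumes the tab-separated imdb_labelled.txt file used later, with one review and its 0/1 label per line):
from collections import Counter

with open('imdb_labelled.txt', encoding='utf-8') as file:
    pairs = [line.strip('\n').split('\t') for line in file]
print(pairs[0])                              # the first review and its label
print(Counter(label for _, label in pairs))  # how many positive ('1') and negative ('0') reviews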
However, the raw text contains many tokens that carry no useful semantics. Numbers and punctuation appear frequently but do not express positive or negative sentiment, so we strip them out.
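For instance, here is a minimal sketch of this stripping step applied to the first review, using the same replace-with-space approach as the full preprocessing code further below:
from string import punctuation

review = 'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.'
# replace every punctuation mark and digit with a space
for ch in punctuation + '0123456789':
    review = review.replace(ch, ' ')
print(review)   # the commas, the hyphen and the final period are now spaces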
To further clean the data, we stem the words so that different inflections of the same word are counted as the same token, since they convey the same semantics; e.g. 'distress' and 'distressed' are both stemmed to 'distress'.
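A quick illustration with nltk's PorterStemmer:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# different inflections collapse to the same (lowercased) stem
print(stemmer.stem('distress'), stemmer.stem('distressed'), stemmer.stem('drifting'))
# -> distress distress drift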
After text preprocessing, we have two lists of data: 'labels' holds the targets of our classification, and 'preprocessed' holds the cleaned sentences that will become the features. The sentences in 'preprocessed' look almost unreadable. For example, 'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.' is preprocessed as 'a veri veri veri slow move aimless movi about a distress drift young man'.
You may wonder: if we mangle the raw text like this so that each remaining token conveys important semantics, why not also strip the stopwords, such as 'a' and 'about', which are very frequent but convey little meaning? In the feature extraction section below, TF-IDF will take care of those stopwords.
After text preprocessing, we extract features from the cleaned data. We will use a TF-IDF vectorizer to turn each review into a normalized numeric vector.
TF-IDF stands for term frequency-inverse document frequency. It evaluates how important a token is to a document within a corpus. TF-IDF makes the data suitable for our models because it normalizes the term frequency (the raw word count) and reduces the noise of stopwords.
We are going to use a variant of TF-IDF in this case. The regular TF-IDF weight of a term t in a document d is tf-idf(t, d) = tf(t, d) × idf(t), where tf(t, d) is the number of times t occurs in d; in scikit-learn's default smoothed form, idf(t) = ln((1 + n) / (1 + df(t))) + 1, with n the number of documents and df(t) the number of documents containing t, and the resulting vectors are L2-normalized. Unlike the original TF-IDF, we set sublinear_tf, which replaces TF with WF = 1 + log(TF). This variant addresses the problem that "twenty occurrences of a term in a document do not truly carry twenty times the significance of a single occurrence." In our first review, 'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.', the word 'very' appears three times. Applying sublinear TF scaling to our dataset drastically improves the accuracy of our models' predictions later.
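To see the effect, here is a small sketch (not part of the pipeline) that compares the weight of 'veri' in the preprocessed first review with and without sublinear scaling. The second document is made up purely for contrast, the exact numbers depend on the corpus, and get_feature_names_out needs a reasonably recent scikit-learn:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['a veri veri veri slow move aimless movi about a distress drift young man',
        'a good movi with a veri touch stori']   # hypothetical second document for contrast
for sublinear in (False, True):
    vec = TfidfVectorizer(stop_words='english', sublinear_tf=sublinear)
    X = vec.fit_transform(docs)
    idx = list(vec.get_feature_names_out()).index('veri')
    print(f"sublinear_tf={sublinear}: weight of 'veri' in the first review = {X[0, idx]:.3f}")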
After feature extraction, we have a TF-IDF-weighted document-term matrix stored in Compressed Sparse Row (CSR) format. Each target is the sentiment of a sentence: '1' means positive and '0' means negative. To evaluate our models fairly, we also need to split the data into training and testing sets. Scikit-learn's train_test_split randomly shuffles the data and splits it; in this case I use 1/5 of the dataset as the testing set and the remaining 4/5 as the training set. Here is the code for the whole preprocessing process:
from nltk.stem import PorterStemmer # stem the words
from nltk.tokenize import word_tokenize # tokenize the sentences into tokens
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer # vectorize the texts
from sklearn.model_selection import train_test_split # split the testing and training sets
def preprocess(path):
    '''generate the cleaned dataset

    Args:
        path (string): the path of the labelled data file

    Returns:
        X_train (sparse matrix): the TF-IDF features of the training data
        X_test (sparse matrix): the TF-IDF features of the testing data
        y_train (list): the targets of the training data (1 or 0)
        y_test (list): the targets of the testing data (1 or 0)
    '''
    # text preprocessing: iterate through the original file and
    # record every cleaned sentence and its label
    labels = []
    preprocessed = []
    stemmer = PorterStemmer()
    with open(path, encoding='utf-8') as file:
        for line in file:
            # get the sentence and its label
            sentence, label = line.strip('\n').split('\t')
            labels.append(int(label))
            # remove punctuation and numbers
            for ch in punctuation + '0123456789':
                sentence = sentence.replace(ch, ' ')
            # tokenize the sentence and stem the words
            words = [stemmer.stem(w) for w in word_tokenize(sentence)]
            preprocessed.append(' '.join(words))
    # vectorize the texts
    vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)
    X = vectorizer.fit_transform(preprocessed)
    # split the testing and training sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = preprocess('imdb_labelled.txt')
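If you want to confirm what preprocess returns, a quick inspection might look like this (with 1000 reviews and a 1/5 test split there should be 800 training targets and 200 testing targets; the vocabulary size depends on the corpus):
print(repr(X_train))               # a TF-IDF-weighted document-term matrix in CSR format
print(len(y_train), len(y_test))   # 800 200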
We can now train the models on the training set, let them classify the testing set, and rate each model's performance by its accuracy score and time cost. Here is the code for the classification:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib import pyplot as plt
from time import time
def classify(clf, todense=False):
    '''classify the data using the given machine learning model

    Args:
        clf: the model chosen to analyze the data
        todense (bool): whether to convert the sparse feature matrix to a dense array
    '''
    global X_train, X_test, y_train, y_test
    t = time()
    if todense:
        # some models cannot handle sparse input
        clf.fit(X_train.toarray(), y_train)
        y_pred = clf.predict(X_test.toarray())
    else:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
    print(f'Time cost of {str(clf)}: {round(time()-t,2)}s\n'
          f'The accuracy of {str(clf)}: {accuracy_score(y_test,y_pred)}\n')
Because the target is categorical and dichotomous and the features have no assumed distribution, the models we can use for text classification are Logistic Regression, the Stochastic Gradient Descent classifier (SGDClassifier), the Support Vector Classifier (SVC) and a neural network (MLPClassifier). Because our feature matrix is sparse, SVC and SGD are particularly useful. Among the three types of Naive Bayes classifiers (Bernoulli, Multinomial and Gaussian), we choose Multinomial, because the features are TF-IDF-normalized term frequencies and fit neither a Gaussian nor a Bernoulli distribution.
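If you want to verify this choice empirically, you could compare the three Naive Bayes variants with the classify helper defined above; GaussianNB cannot handle sparse input, so it needs todense=True, and the results will vary with the random split:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

# compare the three Naive Bayes variants on the same train/test split
classify(BernoulliNB())
classify(MultinomialNB())
classify(GaussianNB(), todense=True)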
I will examine the performance of each selected model below. In this part, I will not tune the parameters of the models; I will do that in the future.
We could technically use Linear Discriminant Analysis too. However, it is computationally expensive on sparse data like our feature matrix, since it requires converting it to a dense one, and its accuracy is also low. Therefore, we will not consider LDA this time. Here is the performance of LDA:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
classify(LinearDiscriminantAnalysis(),todense=True)
Time cost of LinearDiscriminantAnalysis(): 0.79s
The accuracy of LinearDiscriminantAnalysis(): 0.71
Here are the performances of the selected models:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
for model in [LogisticRegression(), MultinomialNB(), SVC(), SGDClassifier(), MLPClassifier()]:
    classify(model)
Time cost of LogisticRegression(): 0.03s
The accuracy of LogisticRegression(): 0.825

Time cost of MultinomialNB(): 0.0s
The accuracy of MultinomialNB(): 0.825

Time cost of SVC(): 0.09s
The accuracy of SVC(): 0.835

Time cost of SGDClassifier(): 0.0s
The accuracy of SGDClassifier(): 0.82

Time cost of MLPClassifier(): 3.47s
The accuracy of MLPClassifier(): 0.81
While we want to improve the accuracy of our models' predictions, we also want to avoid overfitting, so that the models generalize to other datasets. Building an ensemble is one solution to this problem. For each review, we let every selected model vote with its own prediction and take the mode of the votes as the ensemble's prediction. The selected models are Logistic Regression, MultinomialNB, SVC and SGD. Because neural networks need complicated tuning and are time-consuming, I will not include MLPClassifier in this ensemble. From the accuracy score and the confusion matrix below, we can see that, although the time cost increases, the performance of the ensemble model is satisfactory.
from statistics import mode
def ensemble(models):
    '''ensemble the models and classify the data based on each model's vote

    Args:
        models: the list of models chosen to analyze the data
    '''
    global X_train, X_test, y_train, y_test
    t = time()
    # iterate through all the models and collect their predictions
    y_preds = []
    for clf in models:
        clf.fit(X_train, y_train)
        y_preds.append(clf.predict(X_test))
    # count the votes and take the mode of each column as the ensemble's decision
    y_pred = []
    for i in range(len(y_preds[0])):
        y_pred.append(mode([y[i] for y in y_preds]))
    print(f'Time cost: {round(time()-t,2)}s\nAccuracy: {accuracy_score(y_test,y_pred)}\n')
    # plot the confusion matrix of the ensemble's predictions (0 = negative, 1 = positive)
    disp = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred),
                                  display_labels=['negative', 'positive'])
    disp.plot(values_format='d')
ensemble([LogisticRegression(),MultinomialNB(),SVC(),SGDClassifier()])
Time cost: 0.12s
Accuracy: 0.83