Sentiment Analysis of Movie Reviews pt.3 -- n-gram

--by Charlie Chengrui Zheng 01/25/2021

N-gram

In Part 1's text preprocessing step, we tokenized the reviews word by word. For example, 'Very boring movie' is tokenized as ['very', 'boring', 'movie'].

This kind of model is called a unigram model, because we take only one token at a time. However, an n-gram model can tokenize the text differently: it takes a sequence of n consecutive words as a single token. For example, in a bigram (2-gram) model, 'Very boring movie' is tokenized as ['very boring', 'boring movie'].

In a trigram (3-gram) model, 'Very boring movie' is tokenized as the single token 'very boring movie'.
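To make this concrete, here is a minimal sketch of n-gram generation in plain Python. The `ngrams` helper is hypothetical, written only for illustration; later we let Scikit-learn do this for us:

```python
def ngrams(tokens, n):
    """Return the n-grams of a token list as space-joined strings."""
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

tokens = ['very', 'boring', 'movie']
print(ngrams(tokens, 2))  # ['very boring', 'boring movie']
print(ngrams(tokens, 3))  # ['very boring movie']
```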

N-gram models are helpful in our sentiment analysis because sequences of words may carry semantics that matter for classification. For example, the unigram 'very' contains no sentiment per se, and 'boring' tells us the reviewer dislikes the movie. However, 'very boring' conveys that the reviewer really hates the movie, so it should be treated differently from 'boring' because it carries a stronger sentiment. Therefore, we need to find a good n-gram model for our sentiment analysis.

Parameter Tuning

In Scikit-learn's TfidfVectorizer, we can choose the n-gram model by passing in the ngram_range parameter, a tuple of the minimum n and maximum n. For example, (1,1) means we use only the unigram model, since the minimum and maximum n are both 1. (1,3) means we use the unigram, bigram, and trigram models together: 'Very boring movie' is tokenized as ['very', 'boring', 'movie', 'very boring', 'boring movie', 'very boring movie'].
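We can verify this with the vectorizer itself. A quick check, assuming scikit-learn 1.0+ (where the vocabulary is exposed via get_feature_names_out; note the features come back in alphabetical order):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit on a single review to see which n-grams (1,3) produces
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectorizer.fit(['Very boring movie'])
print(vectorizer.get_feature_names_out())
# ['boring' 'boring movie' 'movie' 'very' 'very boring'
#  'very boring movie']
```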

Therefore, we can refine the preprocess-and-classify function from Part 1 to accept an n-gram range, as below:
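A minimal sketch of the refined function, assuming the Part 1 setup of review texts split into train and test sets; the function and variable names here are placeholders, not necessarily those used in Part 1:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

def preprocess_and_classify(classifier, train_texts, train_labels,
                            test_texts, test_labels, ngram_range=(1, 1)):
    """Vectorize the reviews with the given n-gram range, train the
    classifier, and return its accuracy on the test set."""
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    classifier.fit(X_train, train_labels)
    return accuracy_score(test_labels, classifier.predict(X_test))
```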

Naive Bayes Classifier

From Part 1, the Multinomial Naive Bayes classifier was fast and accurate, so we will use MultinomialNB as our baseline model and tune its parameters. We can pass different ngram_range tuples, from (1,1) to (3,3), to the classifier and record the performance in a Pandas dataframe, as below:
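A sketch of that sweep, reusing the preprocess_and_classify function above; train_texts, train_labels, test_texts, and test_labels are assumed to be the train/test split from Part 1:

```python
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

# Sweep every (min n, max n) pair up to trigrams and collect the scores
ngram_ranges = [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
results = pd.DataFrame(
    [(r, preprocess_and_classify(MultinomialNB(),
                                 train_texts, train_labels,
                                 test_texts, test_labels,
                                 ngram_range=r))
     for r in ngram_ranges],
    columns=['ngram_range', 'accuracy'])
print(results)
```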

We can see that we must include unigrams, because (1,1), (1,2), and (1,3) all achieve great results. (2,2)'s performance is mediocre, and the accuracy of (2,3) and (3,3) drops to 0.5, no better than random guessing on a binary positive/negative task, so they are useless.

Smoothing

In the MultinomialNB model, we can tune the smoothing parameter $\alpha$ of Laplace smoothing to explore whether it yields a better result. For a more detailed introduction to Laplace smoothing, please refer to this article. We can choose $\alpha$ from the list [0.1, 0.5, 1, 1.5, 2, 2.5] and the n-gram model from (1,1), (1,2), and (1,3), then run the sentiment analysis and record the accuracy in a Pandas dataframe. In this way, we can find the best pair of parameters.
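A sketch of that grid, again reusing the preprocess_and_classify function and the assumed train/test variables from above; alpha maps directly to MultinomialNB's smoothing parameter:

```python
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

alphas = [0.1, 0.5, 1, 1.5, 2, 2.5]
ngram_ranges = [(1, 1), (1, 2), (1, 3)]

# Accuracy for every (alpha, ngram_range) pair, one row per alpha
grid = pd.DataFrame(index=alphas,
                    columns=[str(r) for r in ngram_ranges], dtype=float)
for alpha in alphas:
    for r in ngram_ranges:
        grid.loc[alpha, str(r)] = preprocess_and_classify(
            MultinomialNB(alpha=alpha),
            train_texts, train_labels, test_texts, test_labels,
            ngram_range=r)
print(grid)
# The best pair of parameters is the cell with the highest accuracy
```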