Remember that in Part 1, I performed sentiment analysis on a dataset of movie reviews from IMDb. I first preprocessed the text of all reviews into lowercase stemmed tokens, with numbers and punctuation stripped. Then I used TF-IDF as the word embedding to vectorize the words into a sparse matrix. Finally, I ran several selected machine-learning models on those features (the sparse matrix) and evaluated their performance.
However, this was just the basics of sentiment analysis. Because we had a relatively small dataset (1000 entries), we did not need to worry about the dimensionality of the features, which was 2317 last time, as shown below:
X_train.get_shape()[1]
2317
However, what if we had a large dataset, say a million entries? A 1,000,000 x 2,317 sparse matrix is huge: it takes substantial computing power to store and process, and training on it can be exceedingly time-consuming. Therefore, we want to reduce the dimensionality of the features and speed up the machine-learning process, at the cost of only a small loss of accuracy. In this part, I will apply LSA to the sentiment analysis.
Latent Semantic Analysis (LSA) is a common word-embedding method used in topic modelling, but it is also useful for text classification. In short, LSA performs Singular Value Decomposition (SVD) on the matrix produced by TF-IDF vectorization. If you are not familiar with TF-IDF, please refer to Part 1 of this study. SVD is a powerful matrix decomposition method widely used in Natural Language Processing, where the features often have a huge number of dimensions. We can use SVD to keep only a small number of those dimensions and obtain a truncated matrix for machine learning. (Here is an informative tutorial on SVD.)
To briefly explain SVD, we can decompose the matrix A as
$A = USV^T$
The original matrix $A$ is an $n \times p$ matrix. $U$ is an $n \times n$ matrix containing the $n$ left singular vectors. $V$ is a $p \times p$ matrix containing the $p$ right singular vectors. $S$ is an $n \times p$ diagonal matrix with the singular values on its diagonal; all off-diagonal elements are 0. The singular values are sorted in descending order: $\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_k$.
Among all those singular values, we can keep only a small number of them. For example, in this case I will keep the first 100 of the 2317 singular values in order to significantly reduce the dimensionality. We retain the first 100 left singular vectors in $U_k$, the first 100 singular values in the diagonal matrix $S_k$, and the first 100 right singular vectors in $V_k$, and multiply them to get the truncated matrix $A_k$:
$U_k S_k V_k^T = A_k$
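To make the decomposition concrete, here is a small NumPy sketch on a made-up toy matrix (not the review data), checking the shapes, the reconstruction $A = USV^T$, and the rank-k truncation:
import numpy as np
# toy 4 x 3 matrix (n=4, p=3), purely for illustration
A = np.array([[1., 0., 2.],
              [0., 3., 0.],
              [4., 0., 0.],
              [0., 1., 5.]])
# full SVD: U is n x n, Vt is p x p, s holds the singular values in descending order
U, s, Vt = np.linalg.svd(A, full_matrices=True)
# build the n x p diagonal matrix S from the singular values
S = np.zeros(A.shape)
S[:len(s), :len(s)] = np.diag(s)
print(np.allclose(U @ S @ Vt, A))   # True: A = U S V^T
# keep only the k largest singular values/vectors to get the truncated A_k
k = 2
A_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
print(A_k.shape)                    # (4, 3): same shape as A, but only rank k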
In Python, we can use scikit-learn to compute the truncated SVD and then min-max scale the result to transform the feature matrix. Here is the code:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
# perform SVD, keeping 100 components
svd = TruncatedSVD(100)
# perform LSA: SVD followed by min-max scaling
lsa = make_pipeline(svd, MinMaxScaler())
X_lsa = lsa.fit_transform(X)
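As an optional sanity check (here X is the TF-IDF matrix from Part 1), TruncatedSVD exposes how much of the original variance the 100 retained components capture:
# fraction of the TF-IDF variance captured by the 100 components
# (the exact value depends on the data; this is only a rough quality check)
print(svd.explained_variance_ratio_.sum())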
We can integrate this snippet into the preprocess function from Part 1 and transform the dataset. Here is version 2.0 of the preprocess function:
from nltk.stem import PorterStemmer # stem the words
from nltk.tokenize import word_tokenize # tokenize the sentences into tokens
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer # vectorize the texts
from sklearn.model_selection import train_test_split # split the testing and training sets
from sklearn.decomposition import TruncatedSVD # SVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
def preprocess(path):
    '''generate the cleaned dataset split into training and test sets

    Args:
        path (string): the path of the data file

    Returns:
        X_train (array): the features of the training data
        X_test (array): the features of the test data
        y_train (list): the targets of the training data (1 or 0)
        y_test (list): the targets of the test data (1 or 0)
    '''
    # text preprocessing: iterate through the original file
    with open(path, encoding='utf-8') as file:
        # record all cleaned sentences and their labels
        labels = []
        preprocessed = []
        for line in file:
            # get sentence and label
            sentence, label = line.strip('\n').split('\t')
            labels.append(int(label))
            # remove punctuation and numbers
            for ch in punctuation + '0123456789':
                sentence = sentence.replace(ch, ' ')
            # tokenize the words and stem them
            words = []
            for w in word_tokenize(sentence):
                words.append(PorterStemmer().stem(w))
            preprocessed.append(' '.join(words))
    # vectorize the texts with TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)
    X_tfidf = vectorizer.fit_transform(preprocessed)
    # perform LSA: truncated SVD followed by min-max scaling
    svd = TruncatedSVD(100)
    lsa = make_pipeline(svd, MinMaxScaler())
    X_lsa = lsa.fit_transform(X_tfidf)
    # split the testing and training sets
    X_train, X_test, y_train, y_test = train_test_split(X_lsa, labels, test_size=0.2)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = preprocess('imdb_labelled.txt')
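As a quick check (assuming the same 1000-review dataset as in Part 1), the transformed training features now have only 100 columns instead of 2317:
print(X_train.shape)   # (800, 100): 80% of the 1000 reviews, 100 LSA components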
We can now feed the LSA-transformed data to the machine-learning models and monitor the change in their performance. Remember that in Part 1, the result for Linear Discriminant Analysis was:
Time cost of LinearDiscriminantAnalysis(): 0.79s
The accuracy of LinearDiscriminantAnalysis(): 0.71
Now that the dimensionality of the data is significantly reduced, we should expect the time cost to drop drastically. Also, we do not even need to convert the data to a dense matrix this time, since TruncatedSVD already outputs a dense array!
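For reference, here is a minimal sketch of what the classify helper from Part 1 does; the real implementation is in Part 1, and this assumed version simply times the fit on X_train/y_train and prints the test accuracy:
from time import time

def classify(model):
    '''sketch of the Part 1 helper: fit the model, then report time cost and test accuracy'''
    # note: this is an assumed reconstruction, not the exact code from Part 1
    start = time()
    model.fit(X_train, y_train)
    print(f'Time cost of {model}: {round(time() - start, 2)}s')
    print(f'The accuracy of {model}: {model.score(X_test, y_test)}')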
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
classify(LinearDiscriminantAnalysis())
Time cost of LinearDiscriminantAnalysis(): 0.05s
The accuracy of LinearDiscriminantAnalysis(): 0.73
We just reduced the time cost from 0.79s to 0.05s. That is a giant leap in speed!
Remember that last time, we also tried Logistic Regression, MultinomialNB, SVC, SGD and MLP Classifiers. The performance was:
Time cost of LogisticRegression(): 0.03s
The accuracy of LogisticRegression(): 0.825
Time cost of MultinomialNB(): 0.0s
The accuracy of MultinomialNB(): 0.825
Time cost of SVC(): 0.09s
The accuracy of SVC(): 0.835
Time cost of SGDClassifier(): 0.0s
The accuracy of SGDClassifier(): 0.82
Time cost of MLPClassifier(): 3.47s
The accuracy of MLPClassifier(): 0.81
We can try all those models and observe their performance:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
for model in [LogisticRegression(), MultinomialNB(), SVC(), SGDClassifier(), MLPClassifier()]:
    classify(model)
Time cost of LogisticRegression(): 0.08s
The accuracy of LogisticRegression(): 0.76
Time cost of MultinomialNB(): 0.0s
The accuracy of MultinomialNB(): 0.775
Time cost of SVC(): 0.08s
The accuracy of SVC(): 0.745
Time cost of SGDClassifier(): 0.01s
The accuracy of SGDClassifier(): 0.75
Time cost of MLPClassifier(): 0.68s
The accuracy of MLPClassifier(): 0.77
We can see that even though the accuracy decreases a little for all these models, they all run faster. The time cost is reduced most dramatically for the more complicated MLP classifier. Therefore, the MLP classifier and Linear Discriminant Analysis can now be included in the ensemble classifier.
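For reference, here is a minimal sketch of an ensemble helper in the spirit of Part 1; the real implementation is in Part 1, and this assumed version simply wraps the models in a hard-voting VotingClassifier and reports the time cost and test accuracy:
from time import time
from sklearn.ensemble import VotingClassifier

def ensemble(models):
    '''sketch of the Part 1 helper: combine the models by majority voting'''
    # note: this is an assumed reconstruction, not the exact code from Part 1
    voting = VotingClassifier([(str(i), m) for i, m in enumerate(models)], voting='hard')
    start = time()
    voting.fit(X_train, y_train)
    print(f'Time cost: {round(time() - start, 2)}s')
    print(f'Accuracy: {voting.score(X_test, y_test)}')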
ensemble([LinearDiscriminantAnalysis(),LogisticRegression(),MultinomialNB(),SVC(),SGDClassifier(),MLPClassifier()])
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
Time cost: 1.03s
Accuracy: 0.78