Anda di halaman 1dari 11

NavieBayes- Revised

October 10, 2018

1 Assigment - 4
1.1 Apply Naive Bayes to Amazon reviews
Note: Used 5000 review 2500 positive and 2500 negative

In [1]: %matplotlib inline


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


import seaborn as sn

from sklearn.naive_bayes import MultinomialNB


from sklearn.metrics import accuracy_score
# from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
#from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,precision_score,recall_score

import warnings
warnings.filterwarnings('ignore')

In [2]: df=pd.read_pickle('./Amazon_5000.pkl')
#sorting values TBS
df = df.sort_values(['Time'],ascending=True)

In [3]: #Creating Train and test data (80-20 split, Since i ahve small dataset i need more data
# therefore converted in 80-20 split instead of 70-30)
## 80% of sorted data will = Total number of rows (sorted) *0.8
X_train = df.iloc[:int(len(df)*.8),1]
Y_train = df.iloc[:int(len(df)*.8),-1]
# len(Y_train)== len(X_train)
X_test = df.iloc[int(len(df)*.8)+1:,1]
Y_test = df.iloc[int(len(df)*.8)+1:,-1]
# len(Y_test)== len(X_test)

1
In [4]: # Label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Ytrain = le.fit_transform(Y_train)
Ytest = le.transform(Y_test)

2 BoW
In [5]: # #BoW without bigram
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Xtrain1 = vectorizer.fit_transform(X_train.values)
Xtest1 = vectorizer.transform(X_test.values)
# Xtest.shape

In [6]: score_1 = []
alpha = np.arange(0.001,10,0.1)
# alpha.shape
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, Xtrain1, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))

score_1.append(scores.mean())

In [17]: # changing to misclassification error


MSE_1 = [1 - x for x in score_1]
Training_Error1=round(1-max(score_1),4)
alpha_BoW = round(alpha[MSE_1.index(min(MSE_1))],3)

In [8]: clf1=MultinomialNB(alpha=alpha_BoW)
clf1.fit( Xtrain1, Ytrain)
pred_BoW=clf1.predict(Xtest1)
Accuracy1=round(accuracy_score(Ytest,pred_BoW,normalize=True),4)
testError1 = 1-Accuracy1

In [11]: #confusion matrix


cm1=confusion_matrix(Ytest, pred_BoW)
p1=precision_score(Ytest, pred_BoW, average=None)
r1=recall_score(Ytest, pred_BoW, average=None)

f1_1= (f1_score(Ytest, pred_BoW, average=None))


df_cm1 = pd.DataFrame(cm1,columns=[le.inverse_transform(0),le.inverse_transform(1)]
,index=[le.inverse_transform(0),le.inverse_transform(1)])

In [12]: #BoW Feature importance


features = vectorizer.get_feature_names()
feature_count = clf1.feature_count_

2
class_count = clf1.class_count_
pos_points_prob_sort = clf1.feature_log_prob_[1, :].argsort()
neg_points_prob_sort = clf1.feature_log_prob_[0, :].argsort()
log_prob = clf1.feature_log_prob_
feature_prob= pd.DataFrame(log_prob, columns=features).T
feature_prob.columns = [le.inverse_transform(0),le.inverse_transform(1)]
top_positive_BoW = feature_prob['positive'].sort_values(ascending=False)[:10]
top_negative_BoW = feature_prob['negative'].sort_values(ascending=False)[:10]

positive_BoW= pd.Series.to_frame(top_positive_BoW)
positive_BoW.reset_index(level=0, inplace=True)
positive_BoW.columns = ['BoW_Words_Pos','BoW_Prob']

negative_BoW= pd.Series.to_frame(top_negative_BoW)
negative_BoW.reset_index(level=0, inplace=True)
negative_BoW.columns = ['BoW_Words_neg','BoW_Prob']
BoW_FI= pd.concat([positive_BoW,negative_BoW], axis=1)
print('Feature importance - BoW')
BoW_FI

Feature importance - BoW

Out[12]: BoW_Words_Pos BoW_Prob BoW_Words_neg BoW_Prob


0 like -4.821223 tast -4.583463
1 tast -4.936946 like -4.628531
2 flavor -5.010554 product -4.847117
3 good -5.031487 one -5.157214
4 one -5.109834 tri -5.158664
5 love -5.115570 flavor -5.174748
6 great -5.130054 would -5.201639
7 coffe -5.191737 coffe -5.295225
8 use -5.199527 food -5.311992
9 tea -5.282650 good -5.459290

3 Tf-IDF
In [13]: from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer()
Xtrain_tf_idf = tf_idf_vect.fit_transform(X_train.values)
Xtest_tf_idf = tf_idf_vect.transform(X_test.values)

In [15]: score_2 = []
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, Xtrain_tf_idf, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_2.append(scores.mean())

3
In [18]: # changing to misclassification error
MSE_2 = [1 - x for x in score_2]
Training_Error2=round(1-max(score_2),4)
alpha_tfidf = round(alpha[MSE_2.index(min(MSE_2))],3)

In [24]: clf2=MultinomialNB(alpha=alpha_tfidf)
clf2.fit( Xtrain_tf_idf, Ytrain)
pred_tfidf=clf2.predict(Xtest_tf_idf)
Accuracy2=round(accuracy_score(Ytest,pred_tfidf,normalize=True),4)
testError2 = 1-Accuracy2

In [27]: #confusion matrix


cm2=confusion_matrix(Ytest, pred_tfidf)
p2=precision_score(Ytest, pred_tfidf, average=None)
r2=recall_score(Ytest, pred_tfidf, average=None)

f1_2= (f1_score(Ytest, pred_tfidf, average=None))


df_cm2 = pd.DataFrame(cm2,columns=[le.inverse_transform(0),le.inverse_transform(1)]
,index=[le.inverse_transform(0),le.inverse_transform(1)])

In [34]: #BoW Feature importance


features = tf_idf_vect.get_feature_names()
feature_count = clf2.feature_count_
class_count = clf2.class_count_
pos_points_prob_sort = clf2.feature_log_prob_[1, :].argsort()
neg_points_prob_sort = clf2.feature_log_prob_[0, :].argsort()
log_prob = clf2.feature_log_prob_
feature_prob= pd.DataFrame(log_prob, columns=features).T
feature_prob.columns = [le.inverse_transform(0),le.inverse_transform(1)]
top_positive_tfidf = feature_prob['positive'].sort_values(ascending=False)[:10]
top_negative_tfidf = feature_prob['negative'].sort_values(ascending=False)[:10]

positive= pd.Series.to_frame(top_positive_tfidf)
positive.reset_index(level=0, inplace=True)
positive.columns = ['Tf-IDF_Words_Pos','Tf-IDF_Prob']

negative= pd.Series.to_frame(top_negative_tfidf)
negative.reset_index(level=0, inplace=True)
negative.columns = ['TFIDF_Words_neg','TFIDF_Prob']
TfIDF_FI= pd.concat([positbive,negative], axis=1)
print('Feature importance - TF-IDF')
TfIDF_FI

Feature importance - TF-IDF

Out[34]: Tf-IDF_Words_Pos Tf-IDF_Prob TFIDF_Words_neg TFIDF_Prob


0 great -6.242255 tast -6.107194
1 love -6.315906 like -6.216595

4
2 coffe -6.365689 product -6.316280
3 good -6.376578 coffe -6.465138
4 flavor -6.430549 would -6.574775
5 like -6.436484 flavor -6.585915
6 tea -6.450763 tri -6.613772
7 tast -6.501234 one -6.643812
8 one -6.600750 dog -6.728935
9 use -6.628035 buy -6.749567

4 Avg W2V
In [35]: from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=

In [36]: def list_of_sent(df):


i=0
list_sent=[]
for sent in df.values:
filtered_sentence=[]
for w in sent.split():
for cleaned_words in w.split():
if(cleaned_words.isalpha()):
filtered_sentence.append(cleaned_words.lower())
else:
continue

list_sent.append(filtered_sentence)

return list_sent

In [56]: W2VTraining_list = list_of_sent(X_train)


W2VTest_list = list_of_sent(X_test)
# W2VTraining_list
list_all_words = list_of_sent(df['WoutStopWords'])
w2v_model = Word2Vec(list_all_words,min_count=5,size=50, workers=4)
words = list(w2v_model.wv.vocab)
# print(len(words))
Traing_vectors = []# the avg-w2v for each sentence/review is stored in this list
for sent in W2VTraining_list: # for each review/sentence
sent_vec = np.zeros(50) # as word vectors are of zero length
cnt_words =0 # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
sent_vec += vec

5
cnt_words += 1
except:
pass
sent_vec /= cnt_words
Traing_vectors.append(sent_vec)

# scaling training vector


xtrain=np.array(Traing_vectors)+2
# x.min()

In [51]: score_3 = []
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, xtrain, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_3.append(scores.mean())

In [52]: # changing to misclassification error


MSE_3 = [1 - x for x in score_3]
Training_Error3=round(1-max(score_3),4)
alpha_W2V = round(alpha[MSE_3.index(min(MSE_3))],3)

In [55]: Test_vectors = []# the avg-w2v for each sentence/review is stored in this list
for sent in W2VTest_list: # for each review/sentence
i=0
sent_vec = np.zeros(50) # as word vectors are of zero length
cnt_words =1 # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
sent_vec += vec
cnt_words += 1
except:
pass
# i+=1
sent_vec /= cnt_words

# print(i,'\t',cnt_words)
Test_vectors.append(sent_vec)
xtest=np.array(Test_vectors)+2
# xtest.min()

Out[55]: 0.8316740879887028

In [63]: clf3=MultinomialNB(alpha=alpha_W2V)
clf3.fit( xtrain, Ytrain)
pred_W2V=clf3.predict(xtest)
Accuracy3=round(accuracy_score(Ytest,pred_W2V,normalize=True),4)
testError3 = 1-Accuracy3

6
In [65]: #confusion matrix
cm3=confusion_matrix(Ytest, pred_W2V)
p3=precision_score(Ytest, pred_W2V, average=None)
r3=recall_score(Ytest, pred_W2V, average=None)

f1_3= (f1_score(Ytest, pred_W2V, average=None))


df_cm3 = pd.DataFrame(cm3,columns=[le.inverse_transform(0),le.inverse_transform(1)]
,index=[le.inverse_transform(0),le.inverse_transform(1)])

5 TF-IDF W2V
In [67]: tfidf_feat = tf_idf_vect.get_feature_names()
# tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_train_vectors = []; # the tfidf-w2v for each sentence/review is stored in this li


row=0;
for sent in W2VTraining_list: # for each review/sentence
sent_vec = np.zeros(50) # as word vectors are of zero length
weight_sum =0; # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = Xtrain_tf_idf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)

weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_train_vectors.append(sent_vec)
row += 1
Xtrain =np.array(tfidf_train_vectors)+2

In [68]: score_4 = []
for Alpha in alpha:
clf=MultinomialNB(alpha=Alpha)
scores = cross_val_score(clf, Xtrain, Ytrain, cv=10, scoring='accuracy')
# print('Aplha =',round(Alpha,5),'\t\t\tScore =',round(scores.mean(),5))
score_4.append(scores.mean())

In [70]: # changing to misclassification error

7
MSE_4 = [1 - x for x in score_4]
Training_Error4=round(1-max(score_4),4)
alpha_tfW2V = round(alpha[MSE_4.index(min(MSE_4))],3)

In [81]: tfidf_test_vectors = []; # the tfidf-w2v for each sentence/review is stored in this lis
row=0;
for sent in W2VTest_list: # for each review/sentence
sent_vec = np.zeros(50) # as word vectors are of zero length
weight_sum =1; # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = Xtrain_tf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)

weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_test_vectors.append(sent_vec)
row += 1
Xtest=np.array(tfidf_test_vectors)+2

In [82]: clf4=MultinomialNB(alpha=alpha_tfW2V)
clf4.fit( Xtrain, Ytrain)
pred_tfW2V=clf4.predict(Xtest)
Accuracy4=round(accuracy_score(Ytest,pred_tfW2V,normalize=True),4)
testError4 = round(1-Accuracy4,4)

In [83]: #confusion matrix


cm4=confusion_matrix(Ytest, pred_tfW2V)
p4=precision_score(Ytest, pred_tfW2V, average=None)
r4=recall_score(Ytest, pred_tfW2V, average=None)

f1_4= (f1_score(Ytest, pred_tfW2V, average=None))


df_cm4 = pd.DataFrame(cm4,columns=[le.inverse_transform(0),le.inverse_transform(1)]
,index=[le.inverse_transform(0),le.inverse_transform(1)])

6 Conclusion Results
In [84]: print('\tTable 1 - Comparing Value of alpha, Train error and test error')
print('\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

8
print('\n')
print("\t\t BoW\t\t TF-ID\t\t Avg W2V\t TF-ID W2V")
print("\t\t ---\t\t -----\t\t -------\t ---------")

print("Optimal Alpha\t",alpha_BoW,"\t\t",alpha_tfidf,"\t\t",alpha_W2V,"\t\t",alpha_tfW2
print("Training Error\t",Training_Error1,"\t",Training_Error2,"\t\t",Training_Error3,"\
print("Test Error\t",testError1,"\t",testError2,"\t",testError3,"\t",testError4)

Table 1 - Comparing Value of alpha, Train error and test error


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

BoW TF-ID Avg W2V TF-ID W2V


--- ----- ------- ---------
Optimal Alpha 4.401 3.101 6.001 0.001
Training Error 0.1732 0.166 0.4615 0.4619
Test Error 0.1622 0.1602 0.4334 0.4955

In [86]: print('\t\t\tTable 2 - Perfomance Measurements')


print('\t\t\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print('\n')
print("\t\t\t BoW\t\t TF-ID\t\t Avg W2V\t TF-ID W2V")
print("\t\t\t ---\t\t -----\t\t -------\t ---------")

print("Accuracy\t\t ",Accuracy1*100,"\t",Accuracy2*100,"\t\t",Accuracy3*100,"\t\t",Accu
print('\n')
print("f1 Score-a)Positive\t ",round(f1_1[1],4),"\t",round(f1_2[1],4),'\t',round(f1_3[1
print(" b)Negative\t ",round(f1_1[0],4),"\t",round(f1_2[0],4),'\t',round(f1_3[0
print('\n')
print("PrecisionScore-a)Positive",round(p1[1],4),"\t",round(p2[1],4),'\t',round(p3[1],4
print(" b)Negative",round(p1[0],4),"\t",round(p2[0],4),'\t',round(p3[0],4
print('\n')
print("RecallScore-a)Positive\t ",round(r1[1],4),"\t",round(r2[1],4),'\t',round(r3[1],4
print(" b)Negative\t ",round(r1[0],4),"\t",round(r2[0],4),'\t',round(r3[0],4
print('\n')

Table 2 - Perfomance Measurements


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

BoW TF-ID Avg W2V TF-ID W2V


--- ----- ------- ---------
Accuracy 83.78 83.98 56.66 83.78

f1 Score-a)Positive 0.8396 0.8447 0.5595 0.0


b)Negative 0.836 0.8347 0.5734 0.6707

9
PrecisionScore-a)Positive 0.8233 0.8131 0.5635 0.0
b)Negative 0.8533 0.8707 0.5695 0.5045

RecallScore-a)Positive 0.8566 0.8788 0.5556 0.0


b)Negative 0.8194 0.8016 0.5774 1.0

In [80]: # Confusion Matrix


# fig, axs = plt.subplots(2,2, figsize=(8,8), sharey=True)
fig = plt.figure(figsize=(15,14))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

sn.set(font_scale=1.4)#for label size


sn.heatmap(df_cm1, annot=True,annot_kws={"size": 12},ax=ax1)# font size
ax1.set_title('BoW - Confusion Matrix')

sn.set(font_scale=1.4)#for label size


sn.heatmap(df_cm2, annot=True,annot_kws={"size": 12},ax=ax2)# font size
ax2.set_title('TF-IDF - Confusion Matrix')

sn.set(font_scale=1.4)#for label size


sn.heatmap(df_cm3, annot=True,annot_kws={"size": 12},ax=ax3)# font size
ax3.set_title('Avg W2V - Confusion Matrix')

sn.set(font_scale=1.4)#for label size


sn.heatmap(df_cm4, annot=True,annot_kws={"size": 12},ax=ax4)# font size
ax4.set_title('Tf-IDF - Confusion Matrix')

Out[80]: Text(0.5,1,'Tf-IDF - Confusion Matrix')

10
11