Klasifikasi Dengan W2V.ipynb - Colaboratory
pip install sastrawi
## for data
import pandas as pd
import numpy as np
## for preprocessing
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
## for bag-of-words
from sklearn import model_selection, preprocessing, metrics, manifold
from sklearn.ensemble import RandomForestClassifier
## for word embedding
import gensim
# Load the labeled DKI-election tweet sentiment dataset
# (columns used later: 'Text Tweet', 'Pasangan Calon').
df = pd.read_csv('data sentimen dki.csv')
df.head()
2 3 negative Agus-Sylvi Kalau aku sih gak nunggu hasil akhir QC tp lag...
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory  # Indonesian stemming module
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory  # Indonesian stopword list
algstem = StemmerFactory()
stemmer = algstem.create_stemmer()  # instantiate the Sastrawi stemming algorithm
stpw = StopWordRemoverFactory()
stplist = stpw.get_stop_words()  # stopword list shipped with the library
cleantext = "[^A-Za-z0-9]"  # regex: any character that is NOT alphanumeric
https://colab.research.google.com/drive/1T_sh7Q8xMGJ2pB2c3sITCzy6dKk2oBhj?authuser=1#printMode=true 1/5
6/13/22, 10:19 AM klasifikasi dengan w2v.ipynb - Colaboratory
def preparasi(teks):
    """Normalize a tweet: strip noise, lowercase, stem, drop stopwords.

    Returns the surviving stemmed tokens re-joined into a single
    space-separated string.
    """
    # Replace every non-alphanumeric character with a space and lowercase
    normalized = re.sub(cleantext, ' ', str(teks).lower()).strip()
    # Stem each token with Sastrawi, then keep only non-stopword stems
    stemmed = (stemmer.stem(word) for word in normalized.split())
    kept = [token for token in stemmed if token not in stplist]
    return " ".join(kept)
# Run the full cleaning/stemming/stopword pipeline over every tweet
df['preprocessing'] = df['Text Tweet'].apply(lambda x:preparasi(x))
df.head()
1 2 negative Agus-Sylvi #agussilvy bicara apa kasihan yaa...lap itu ai... agu
2 3 negative Agus-Sylvi Kalau aku sih gak nunggu hasil akhir QC tp lag... kala
3 4 negative Agus-Sylvi Kasian oh kasian dengan peluru 1milyar untuk t... kas
# Keep only the label column and the preprocessed text
df_baru = df[['Pasangan Calon','preprocessing']]
df_baru.head()
1 Agus-Sylvi agussilvy bicara apa kasihan yaa lap air mata ...
# 70/30 train/test split (no random_state, so the split varies per run)
df_train, df_test = model_selection.train_test_split(df_baru, test_size=0.3)
x_train = df_train['preprocessing']  # preprocessed text, training
x_test = df_test['preprocessing']  # preprocessed text, testing
y_train = df_train['Pasangan Calon'].values  # target labels (candidate pair), training
y_test = df_test['Pasangan Calon'].values  # target labels (candidate pair), testing
https://colab.research.google.com/drive/1T_sh7Q8xMGJ2pB2c3sITCzy6dKk2oBhj?authuser=1#printMode=true 2/5
6/13/22, 10:19 AM klasifikasi dengan w2v.ipynb - Colaboratory
## Build unigram token lists in the list-of-lists format gensim expects,
## for both the training and the test texts.
# FIX (idiom): the original appended `" ".join(lst_words[i:i+1])` for each
# step-1 index — an identity transform, since every slice holds exactly one
# word. Splitting on whitespace yields the same unigram list directly.
lst_corpus = [string.split() for string in x_train]        # training corpus
lst_corpus_test = [string.split() for string in x_test]    # test corpus
## fit w2v
# Train Word2Vec on the unigram corpus. NOTE(review): `size=` is the
# gensim 3.x name for the embedding dimensionality (renamed `vector_size=`
# in gensim 4.0). This line appears truncated by the notebook export —
# TODO confirm the remaining keyword arguments.
w2v_model = gensim.models.word2vec.Word2Vec(lst_corpus, size = 50, window=5, min_count=2,
print (w2v_model.wv['anies'])
# Generate aggregated sentence vectors based on the word vectors for each word in the sent
# Replace the words in each text message with the learned word vector
# Vocabulary the model actually learned (gensim 3.x attribute; it is
# `index_to_key` in gensim 4.x). Words below min_count are absent.
words = set(w2v_model.wv.index2word )
# Ragged arrays: one matrix of word vectors per sentence; sentences with no
# in-vocabulary words become empty arrays (this ragged structure is what
# triggers NumPy's VisibleDeprecationWarning seen in the output).
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
for ls in lst_corpus])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
for ls in lst_corpus_test])
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: VisibleDeprecationWar
"""
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:7: VisibleDeprecationWar
import sys
# Compute sentence vectors by averaging the word vectors of each sentence.
# BUG FIX: the original used np.zeros(100) for sentences with no
# in-vocabulary words, but the Word2Vec model was trained with
# 50-dimensional vectors (size=50) — a dimension mismatch that would crash
# the classifier whenever such a sentence occurs. Derive the dimension
# from the model itself so the fallback always matches.
dim = w2v_model.wv.vector_size
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(dim, dtype=float))
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(dim, dtype=float))
## Fit a random-forest classifier on the averaged sentence vectors.
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train)

# Predict labels for the held-out test set.
y_pred = rf_model.predict(X_test_vect_avg)
y_pred

## Report overall accuracy plus the per-class precision/recall/F1 table.
print("Accuracy:", round(metrics.accuracy_score(y_test, y_pred), 2))
print("Detail:")
print(metrics.classification_report(y_test, y_pred))
Accuracy: 0.77
Detail:
# Build a DataFrame of new, unlabeled texts to classify.
# NOTE(review): the second string literal is truncated by the notebook export.
d = {'teks': ['aku sih mendukung calon yang amanah seperti anies', 'kasian si anies, sudah
df_nolabel = pd.DataFrame(data=d)
# Apply the exact same preprocessing pipeline used on the training data
df_nolabel['preprocessing'] = df_nolabel['teks'].apply(lambda x:preparasi(x))
df_nolabel.head()
teks preprocessing
0 aku sih mendukung calon yang amanah seperti anies aku sih dukung calon amanah anies
1 kasian si anies, sudah dipecat jadi menteri ma... kasi si anies pecat jadi menteri gila jabat
# Tokenize the new (unlabeled) texts into gensim's list-of-lists format.
# FIX (idiom): the original's `" ".join(lst_words[i:i+1])` append loop was
# an identity transform over single-word slices — `split()` alone produces
# the same unigram list.
x_baru = df_nolabel['preprocessing']
lst_corpus_new = [string.split() for string in x_baru]
https://colab.research.google.com/drive/1T_sh7Q8xMGJ2pB2c3sITCzy6dKk2oBhj?authuser=1#printMode=true 4/5
6/13/22, 10:19 AM klasifikasi dengan w2v.ipynb - Colaboratory
# Map each new sentence to the matrix of its in-vocabulary word vectors
# (ragged, hence the VisibleDeprecationWarning in the output)
X_baru_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
for ls in lst_corpus_new])
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: VisibleDeprecationWar
# Average each new sentence's word vectors, then predict its label.
# BUG FIX: the original fallback np.zeros(100) did not match the model's
# 50-dimensional vectors (size=50); an out-of-vocabulary-only sentence
# would have crashed the predict call. Use the model's own dimensionality.
dim = w2v_model.wv.vector_size
X_baru_vect_avg = []
for v in X_baru_vect:
    if v.size:
        X_baru_vect_avg.append(v.mean(axis=0))
    else:
        X_baru_vect_avg.append(np.zeros(dim, dtype=float))
rf_model.predict(X_baru_vect_avg)  # predicted candidate-pair labels for the new texts
https://colab.research.google.com/drive/1T_sh7Q8xMGJ2pB2c3sITCzy6dKk2oBhj?authuser=1#printMode=true 5/5