Anda di halaman 1 dari 15

19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [4]:

# Step 1. Enable inline plots, import the libraries and load the data set


# run_line_magic is the supported replacement for the deprecated .magic() call
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the iris measurements from the CSV file next to the notebook
iris = pd.read_csv('iris.csv')
# Last expression of the cell: shows the column names
iris.columns
Out[4]:

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width',


'variety'],
dtype='object')

In [5]:

# Step 2. Display the data (first five rows)


iris.head(n=5)

Out[5]:

sepal.length sepal.width petal.length petal.width variety

0 5.1 3.5 1.4 0.2 Setosa

1 4.9 3.0 1.4 0.2 Setosa

2 4.7 3.2 1.3 0.2 Setosa

3 4.6 3.1 1.5 0.2 Setosa

4 5.0 3.6 1.4 0.2 Setosa

In [6]:

# Step 3. Look at the dimensions of the data
# NOTE(review): the step title refers to the data dimensions, but the cell
# calls head(), which only previews rows; iris.shape would report the
# row/column counts -- confirm which one was intended.


iris.head()

Out[6]:

sepal.length sepal.width petal.length petal.width variety

0 5.1 3.5 1.4 0.2 Setosa

1 4.9 3.0 1.4 0.2 Setosa

2 4.7 3.2 1.3 0.2 Setosa

3 4.6 3.1 1.5 0.2 Setosa

4 5.0 3.6 1.4 0.2 Setosa

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 1/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [7]:

# Step 4. Count the rows in each class

# Identifies how many varieties there are and how many samples each one has


iris.groupby(by='variety').size()

Out[7]:

variety
Setosa 50
Versicolor 50
Virginica 50
dtype: int64

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 2/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [8]:

# Step 5. Show the distribution of every feature, grouped by variety


# One grid of histograms is drawn per variety
iris.groupby('variety').hist(figsize=(9, 9))

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 3/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

Out[8]:

variety
Setosa [[AxesSubplot(0.125,0.551739;0.336957x0.328261...
Versicolor [[AxesSubplot(0.125,0.551739;0.336957x0.328261...
Virginica [[AxesSubplot(0.125,0.551739;0.336957x0.328261...
dtype: object

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 4/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 5/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 6/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [9]:

# Step 6. Check for missing data; the result shows that there are no empty
# values in any column
# isnull() is an alias of isna(), so a single call is enough
iris.isna().sum()

Out[9]:

sepal.length 0
sepal.width 0
petal.length 0
petal.width 0
variety 0
dtype: int64

In [18]:

# Names of the four measurement columns used as model inputs
namafaktor = ["sepal.length", "sepal.width", "petal.length", "petal.width"]
# Feature matrix: every row, only the measurement columns
X = iris.loc[:, namafaktor]
# Target: the variety label of each row
y = iris.loc[:, 'variety']

In [10]:

# Step 14. Import the classification algorithms

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

C:\Users\Acer\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boostin
g.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy m
odule and should not be imported. It will be removed in a future NumPy rel
ease.
from numpy.core.umath_tests import inner1d

In [11]:

# Step 15. Initialise the classification models with their default
# parameters and collect them in a list of (short name, estimator) pairs

models = [
    ('KNN', KNeighborsClassifier()),
    ('SVC', SVC()),
    ('LR', LogisticRegression()),
    ('DT', DecisionTreeClassifier()),
    ('GNB', GaussianNB()),
    ('RF', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
]

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 7/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [12]:

#Langkah 16. Mengimport fungsi-fungsi yang diperlukan

from sklearn.model_selection import train_test_split


from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold # ini ditambahkan krn tidk ada tetapi perlu

In [19]:

# Split into train and test sets, stratified on the variety column so each
# class keeps its share in both parts; random_state=0 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=iris.variety, random_state=0)

In [20]:

# Fit every model on the training split and record its accuracy on the test split
names = []
scores = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

# One row per model: 'Nama' = model short name, 'Nilai' = test accuracy
tr_split = pd.DataFrame({'Nama': names, 'Nilai': scores})
print(tr_split)

Nama Nilai
0 KNN 1.000000
1 SVC 1.000000
2 LR 0.973684
3 DT 0.973684
4 GNB 0.973684
5 RF 0.973684
6 GB 0.973684

In [21]:

# Mean 10-fold cross-validation accuracy of every model on the full data set
names = []
scores = []
# random_state only matters when shuffle=True; recent scikit-learn raises a
# ValueError if it is set without shuffling, so it is omitted here. The
# splitter does not depend on the model, so it is created once.
# NOTE(review): without shuffling the folds follow the file order; if iris.csv
# is sorted by variety, consider StratifiedKFold or shuffle=True -- confirm.
kfold = KFold(n_splits=10)
for name, model in models:
    score = cross_val_score(model, X, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)

# One row per model: 'Nama' = model short name, 'Nilai' = mean CV accuracy
kf_cross_val = pd.DataFrame({'Nama': names, 'Nilai': scores})
print(kf_cross_val)

Nama Nilai
0 KNN 0.933333
1 SVC 0.953333
2 LR 0.880000
3 DT 0.946667
4 GNB 0.946667
5 RF 0.953333
6 GB 0.926667

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 8/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [22]:

# Bar chart of the cross-validation accuracy per classifier
axis = sns.barplot(x='Nama', y='Nilai', data=kf_cross_val)
axis.set(xlabel='Klasifikasi', ylabel='Akurasi')

# Write each bar's value just above it, centred horizontally on the bar
for p in axis.patches:
    tinggi = p.get_height()
    axis.text(p.get_x() + p.get_width() / 2, tinggi + 0.005,
              '{:1.4f}'.format(tinggi), ha="center")

plt.show()

In [23]:

# Importing libraries
import pandas as pd
import numpy as np
import math
import operator

In [24]:

# Start of STEP 1
# Load the iris measurements used by the hand-written KNN
data = pd.read_csv("iris.csv")
# End of STEP 1

# Preview the first five rows
data.head(n=5)

Out[24]:

sepal.length sepal.width petal.length petal.width variety

0 5.1 3.5 1.4 0.2 Setosa

1 4.9 3.0 1.4 0.2 Setosa

2 4.7 3.2 1.3 0.2 Setosa

3 4.6 3.1 1.5 0.2 Setosa

4 5.0 3.6 1.4 0.2 Setosa

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 9/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [25]:

# Defining a function which calculates euclidean distance between two data points
def euclideanDistance(data1, data2, length):
    """Return the Euclidean distance between two data points.

    Parameters
    ----------
    data1, data2 : indexable
        The two points; element ``x`` of each is read with ``data[x]`` for
        ``x`` in ``range(length)``.
    length : int
        Number of leading components to include in the distance.

    Returns
    -------
    The square root of the summed squared component differences, as produced
    by ``np.sqrt``.
    """
    distance = 0
    for x in range(length):
        distance += np.square(data1[x] - data2[x])
    return np.sqrt(distance)

# Defining our KNN model
def knn(trainingSet, testInstance, k):
    """Classify one test instance by majority vote among its k nearest rows.

    Parameters
    ----------
    trainingSet : pandas.DataFrame
        Training rows. The first ``testInstance.shape[1]`` columns are used as
        numeric features and the last column is read as the class label.
    testInstance : pandas.DataFrame
        Frame holding the instance to classify; its first row is used.
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    tuple
        ``(predicted_label, neighbors)`` where ``neighbors`` is the list of
        positional row indices of the k nearest training rows, closest first.
    """
    # Number of features is taken from the test instance
    length = testInstance.shape[1]

    # Plain float arrays so components are addressed by position; integer keys
    # on a label-indexed Series rely on a deprecated positional fallback.
    query = testInstance.iloc[0].to_numpy(dtype=float)
    features = trainingSet.iloc[:, :length].to_numpy(dtype=float)

    #### Start of STEP 3
    # Calculating euclidean distance between each row of training data and test data
    distances = {}
    for x in range(len(trainingSet)):
        #### Start of STEP 3.1
        distances[x] = euclideanDistance(query, features[x], length)
        #### End of STEP 3.1

    #### Start of STEP 3.2
    # Sorting them on the basis of distance (stable, so ties keep row order)
    sorted_d = sorted(distances.items(), key=operator.itemgetter(1))
    #### End of STEP 3.2

    #### Start of STEP 3.3
    # Extracting top k neighbors; slicing avoids an IndexError when k exceeds
    # the number of training rows
    neighbors = [index for index, _ in sorted_d[:k]]
    #### End of STEP 3.3

    #### Start of STEP 3.4
    # Calculating the most freq class in the neighbors
    classVotes = {}
    for x in range(len(neighbors)):
        response = trainingSet.iloc[neighbors[x], -1]
        classVotes[response] = classVotes.get(response, 0) + 1
    #### End of STEP 3.4

    #### Start of STEP 3.5
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return (sortedVotes[0][0], neighbors)
    #### End of STEP 3.5

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 10/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [26]:

# Creating a dummy testset: one observation with four values


testSet = [[7.2, 3.6, 5.1, 2.5]]
# Wrap it in a one-row DataFrame (columns get default integer names)
test = pd.DataFrame(data=testSet)

In [28]:

#### Start of STEP 2


# Setting number of neighbors = 1
k = 1
#### End of STEP 2
# Running KNN model
result, neigh = knn(data, test, k)

# Predicted class
print(result)
# Nearest neighbor
print(neigh)

Virginica
[141]

In [29]:

# Setting number of neighbors = 3


k = 3
# Running KNN model
result, neigh = knn(data, test, k)
# Predicted class
print(result)
# 3 nearest neighbors
print(neigh)

Virginica
[141, 139, 120]

In [30]:

# Setting number of neighbors = 5


k = 5
# Running KNN model
result, neigh = knn(data, test, k)
# Predicted class
print(result)
# 5 nearest neighbors
print(neigh)

Virginica
[141, 139, 120, 145, 144]
file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 11/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [32]:

from sklearn.neighbors import KNeighborsClassifier


# Fit scikit-learn's 3-nearest-neighbour classifier on the first four columns
# against the variety labels (fit returns the estimator itself)
neigh = KNeighborsClassifier(n_neighbors=3).fit(data.iloc[:, :4], data['variety'])

# Predicted class
print(neigh.predict(test))
# 3 nearest neighbors (indices only)
print(neigh.kneighbors(test, return_distance=False))

['Virginica']
[[141 139 120]]

In [13]:

# Logistic Regression
# In this part the data used in the cross-validation step is loaded again

# run_line_magic is the supported replacement for the deprecated .magic() call
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Assign to `iris` (the original bound the typo name `irs`, so the lines below
# only worked when `iris` was still defined by an earlier cell)
iris = pd.read_csv('iris.csv')
iris.columns
iris.head()

Out[13]:

sepal.length sepal.width petal.length petal.width variety

0 5.1 3.5 1.4 0.2 Setosa

1 4.9 3.0 1.4 0.2 Setosa

2 4.7 3.2 1.3 0.2 Setosa

3 4.6 3.1 1.5 0.2 Setosa

4 5.0 3.6 1.4 0.2 Setosa

In [16]:

# Features for the logistic regression: sepal.length and petal.length
X = iris[['sepal.length', 'petal.length']].values
# Target as a 1-D array of variety labels; selecting with [['variety']] would
# give a column vector of shape (n, 1) instead
y = iris['variety'].values

file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 12/13
19/2/2019 Analisis Data Iris - Paulina Ade Cahyanti (662016006) & Jessica Ordelia (662016018)

In [17]:

# In this part the data is transformed so that each of the two selected
# variables held in X has mean 0 and standard deviation 1

from sklearn.preprocessing import StandardScaler


sc = StandardScaler()
X = sc.fit_transform(X)

mean = np.mean(X, axis=0)


# Round before formatting: %d truncates toward zero, so a value such as
# 0.9999999 would otherwise be printed as 0
print('Rata-rata: (%d, %d)' % (round(mean[0]), round(mean[1])))
standard_deviasi = np.std(X, axis=0)
print('Standard deviasi: (%d, %d)' % (round(standard_deviasi[0]), round(standard_deviasi[1])))

Rata-rata: (0, 0)
Standard deviasi: (1, 1)

In [18]:

# First 10 rows of the data after the transformation


print(X[0:10, :])

[[-0.90068117 -1.34022653]
[-1.14301691 -1.34022653]
[-1.38535265 -1.39706395]
[-1.50652052 -1.2833891 ]
[-1.02184904 -1.34022653]
[-0.53717756 -1.16971425]
[-1.50652052 -1.34022653]
[-1.02184904 -1.2833891 ]
[-1.74885626 -1.34022653]
[-1.14301691 -1.2833891 ]]

In [ ]:

# Because the construction of the logistic regression algorithm is not yet
# correct, the process could not be continued: a large number of errors
# occurred, and the early steps of the logistic regression section still need
# to be reviewed.
file:///C:/Users/Acer/Downloads/Analisis%20Data%20Iris%20-%20Paulina%20Ade%20Cahyanti%20(662016006)%20&%20Jessica%20Ordelia… 13/13

Anda mungkin juga menyukai