Created by SmirkyGraphs. Source: GitHub.


Predicting Iris Species

Imports

In [1]:
# For Data
import pandas as pd
import numpy as np

# For Graphing
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [11]:
# Styling
sns.set_style('darkgrid')
my_color = sns.color_palette()
In [3]:
# Loading the iris data
df = sns.load_dataset("iris")

Data Exploration

In [4]:
df.head()
Out[4]:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
In [5]:
df.describe()
Out[5]:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
In [7]:
df.shape
Out[7]:
(150, 5)
In [8]:
df['species'].value_counts()
Out[8]:
virginica     50
setosa        50
versicolor    50
Name: species, dtype: int64

We have 150 samples: 50 of each of the three iris species.

Graphing to Visualize the Differences

In [12]:
plot = df.pivot_table(index='species',
                      values=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
                      aggfunc='mean').plot(kind='bar', color=my_color, legend=True, rot=0)
In [10]:
sns.pairplot(df, hue="species")
Out[10]:
<seaborn.axisgrid.PairGrid at 0xa9190244a8>

Predicting setosa should be no problem, since it stands apart from the other two
on every measurement; versicolor and virginica, however, overlap somewhat, which may cause misclassifications.
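
One way to put a rough number on that overlap (a quick check, not part of the original walkthrough) is to compare the petal_length ranges of the two species directly:

In [ ]:
# Compare the petal_length ranges of the two overlapping species;
# where the ranges intersect is where misclassifications can happen.
for species in ['versicolor', 'virginica']:
    petals = df.loc[df['species'] == species, 'petal_length']
    print(species, petals.min(), petals.max())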

Predicting Iris Species

In [13]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
In [14]:
# Encoding the species

le = preprocessing.LabelEncoder()
df['species_id'] = le.fit_transform(df['species'])
In [15]:
df.head()
Out[15]:
   sepal_length  sepal_width  petal_length  petal_width species  species_id
0           5.1          3.5           1.4          0.2  setosa           0
1           4.9          3.0           1.4          0.2  setosa           0
2           4.7          3.2           1.3          0.2  setosa           0
3           4.6          3.1           1.5          0.2  setosa           0
4           5.0          3.6           1.4          0.2  setosa           0
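
If you want to double-check which id was assigned to which species, the fitted encoder keeps the original labels in its classes_ attribute; a quick sketch:

In [ ]:
# LabelEncoder assigns ids in sorted label order, so the mapping
# here is setosa=0, versicolor=1, virginica=2.
dict(zip(le.classes_, le.transform(le.classes_)))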
In [16]:
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_cols]

y = df['species_id']
In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=0.4)
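
As an aside, with only 150 rows a purely random split can leave the species slightly unbalanced between the two halves; train_test_split accepts a stratify argument that preserves the 50/50/50 proportions. A sketch of that variant (the _s names are illustrative; the results below use the unstratified split above):

In [ ]:
# Stratified variant of the same split; stratify=y keeps each
# species at exactly one third of both the train and test sets.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, random_state=4, test_size=0.4, stratify=y)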
In [18]:
# Support Vector Machine
svc = svm.SVC()
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)
Out[18]:
0.98333333333333328
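
Accuracy alone doesn't show which species get confused. A confusion matrix (not in the original notebook) would reveal whether the errors fall in the versicolor/virginica overlap flagged earlier:

In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predictions (0=setosa,
# 1=versicolor, 2=virginica); off-diagonal entries are mistakes.
confusion_matrix(y_test, y_pred)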
In [19]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)
Out[19]:
0.98333333333333328
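
The choice of n_neighbors=8 is somewhat arbitrary; a simple sweep (a sketch only, knn_k is an illustrative name) shows how sensitive the test accuracy is to k:

In [ ]:
# Try a range of k values and print the test accuracy for each.
for k in range(1, 26):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    print(k, accuracy_score(y_test, knn_k.predict(X_test)))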
In [20]:
# Logistic Regression
logreg = LogisticRegression(random_state=1)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
accuracy_score(y_test, y_pred)
Out[20]:
0.93333333333333335
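
Logistic regression can be sensitive to feature scale, so standardizing the inputs first may (or may not) close the gap to the other models; a sketch using a pipeline:

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the four measurements, then fit the same model.
logreg_scaled = make_pipeline(StandardScaler(),
                              LogisticRegression(random_state=1))
logreg_scaled.fit(X_train, y_train)
accuracy_score(y_test, logreg_scaled.predict(X_test))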
In [21]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)
Out[21]:
0.94999999999999996
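
A fitted forest also reports how much each feature contributed; for iris the petal measurements usually dominate. A quick look:

In [ ]:
# Impurity-based importance of each feature, largest first.
pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)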
In [22]:
# Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)
accuracy_score(y_test, y_pred)
Out[22]:
0.96666666666666667
In [23]:
# Decision Tree
dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)
accuracy_score(y_test, y_pred)
Out[23]:
0.96666666666666667
In [24]:
# Extra Tree
etc = ExtraTreeClassifier(random_state=1)
etc.fit(X_train, y_train)

y_pred = etc.predict(X_test)
accuracy_score(y_test, y_pred)
Out[24]:
0.96666666666666667
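
Worth noting: sklearn.tree.ExtraTreeClassifier is a single extremely randomized tree, while the ensemble version, ExtraTreesClassifier, lives in sklearn.ensemble. A sketch of the ensemble for comparison (etc_ensemble is an illustrative name):

In [ ]:
from sklearn.ensemble import ExtraTreesClassifier

# An ensemble of extra trees, analogous to the random forest above.
etc_ensemble = ExtraTreesClassifier(n_estimators=100, random_state=1)
etc_ensemble.fit(X_train, y_train)
accuracy_score(y_test, etc_ensemble.predict(X_test))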
In [25]:
# Cross Validation
knn_scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
svc_scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
logreg_scores = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')
rf_scores = cross_val_score(rf, X, y, cv=10, scoring='accuracy')
gnb_scores = cross_val_score(gnb, X, y, cv=10, scoring='accuracy')
dtc_scores = cross_val_score(dtc, X, y, cv=10, scoring='accuracy')
etc_scores = cross_val_score(etc, X, y, cv=10, scoring='accuracy')
In [26]:
print('---------------------------')
print('  Model Accuracy Scores')
print('---------------------------')

print('knn', '\t', knn_scores.mean())
print('svm', '\t', svc_scores.mean())
print('logreg', '\t', logreg_scores.mean())
print('rf', '\t', rf_scores.mean())
print('gnb', '\t', gnb_scores.mean())
print('dtc', '\t', dtc_scores.mean())
print('etc', '\t', etc_scores.mean())
---------------------------
  Model Accuracy Scores
---------------------------
knn 	 0.966666666667
svm 	 0.98
logreg 	 0.953333333333
rf 	 0.96
gnb 	 0.953333333333
dtc 	 0.953333333333
etc 	 0.94
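
For what it's worth, the seven cross_val_score calls and print statements above can be collapsed into a single loop over a dict of the fitted models; an equivalent sketch:

In [ ]:
# Cross-validate every model in one pass and print the mean scores.
models = {'knn': knn, 'svm': svc, 'logreg': logreg, 'rf': rf,
          'gnb': gnb, 'dtc': dtc, 'etc': etc}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(name, '\t', scores.mean())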