Created by SmirkyGraphs. Source: GitHub.


Predicting Iris Species

Imports

In [1]:
# For Data
import pandas as pd
import numpy as np

# For Graphing
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [11]:
# Styling
sns.set_style('darkgrid')
my_color = sns.color_palette()
In [3]:
# Loading the iris data
df = sns.load_dataset("iris")

Data Exploration

In [4]:
df.head()
Out[4]:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
In [5]:
df.describe()
Out[5]:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
In [7]:
df.shape
Out[7]:
(150, 5)
In [8]:
df['species'].value_counts()
Out[8]:
virginica     50
setosa        50
versicolor    50
Name: species, dtype: int64

We have 150 samples: 50 of each of the three iris species.

Graphing to Visualize the Differences

In [12]:
plot = df.pivot_table(index='species',
                      values=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
                      aggfunc='mean').plot(kind='bar', color=my_color, legend=True, rot=0)
In [10]:
sns.pairplot(df, hue="species")
Out[10]:
<seaborn.axisgrid.PairGrid at 0xa9190244a8>

Predicting setosa should be no problem, since it stands apart from the other two
on every measurement; versicolor and virginica, however, overlap somewhat, which may cause misclassifications.
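
One way to put a rough number on that overlap (a quick check, not part of the original walkthrough) is to compare the petal_length ranges of the two species directly:

In [ ]:
# Compare the petal_length ranges of the two overlapping species;
# where the ranges intersect is where misclassifications can happen.
for species in ['versicolor', 'virginica']:
    petals = df.loc[df['species'] == species, 'petal_length']
    print(species, petals.min(), petals.max())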

Predicting Iris Species

In [13]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
In [14]:
# Encoding the species

le = preprocessing.LabelEncoder()
df['species_id'] = le.fit_transform(df['species'])
In [15]:
df.head()
Out[15]:
   sepal_length  sepal_width  petal_length  petal_width species  species_id
0           5.1          3.5           1.4          0.2  setosa           0
1           4.9          3.0           1.4          0.2  setosa           0
2           4.7          3.2           1.3          0.2  setosa           0
3           4.6          3.1           1.5          0.2  setosa           0
4           5.0          3.6           1.4          0.2  setosa           0
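
If you want to double-check which id was assigned to which species, the fitted encoder keeps the original labels in its classes_ attribute; a quick sketch:

In [ ]:
# LabelEncoder assigns ids in sorted label order, so the mapping
# here is setosa=0, versicolor=1, virginica=2.
dict(zip(le.classes_, le.transform(le.classes_)))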
In [16]:
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_cols]

y = df['species_id']
In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=0.4)
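
As an aside, with only 150 rows a purely random split can leave the species slightly unbalanced between the two halves; train_test_split accepts a stratify argument that preserves the 50/50/50 proportions. A sketch of that variant (the _s names are illustrative; the results below use the unstratified split above):

In [ ]:
# Stratified variant of the same split; stratify=y keeps each
# species at exactly one third of both the train and test sets.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, random_state=4, test_size=0.4, stratify=y)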
In [18]:
# Support Vector Machine
svc = svm.SVC()
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)
Out[18]:
0.98333333333333328
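
Accuracy alone doesn't show which species get confused. A confusion matrix (not in the original notebook) would reveal whether the errors fall in the versicolor/virginica overlap flagged earlier:

In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predictions (0=setosa,
# 1=versicolor, 2=virginica); off-diagonal entries are mistakes.
confusion_matrix(y_test, y_pred)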
In [19]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)
Out[19]:
0.98333333333333328
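
The choice of n_neighbors=8 is somewhat arbitrary; a simple sweep (a sketch only, knn_k is an illustrative name) shows how sensitive the test accuracy is to k:

In [ ]:
# Try a range of k values and print the test accuracy for each.
for k in range(1, 26):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    print(k, accuracy_score(y_test, knn_k.predict(X_test)))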
In [20]:
# Logistic Regression
logreg = LogisticRegression(random_state=1)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
accuracy_score(y_test, y_pred)
Out[20]:
0.93333333333333335
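
Logistic regression can be sensitive to feature scale, so standardizing the inputs first may (or may not) close the gap to the other models; a sketch using a pipeline:

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the four measurements, then fit the same model.
logreg_scaled = make_pipeline(StandardScaler(),
                              LogisticRegression(random_state=1))
logreg_scaled.fit(X_train, y_train)
accuracy_score(y_test, logreg_scaled.predict(X_test))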
In [21]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)
Out[21]:
0.94999999999999996
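
A fitted forest also reports how much each feature contributed; for iris the petal measurements usually dominate. A quick look:

In [ ]:
# Impurity-based importance of each feature, largest first.
pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)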
In [22]:
# Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)
accuracy_score(y_test, y_pred)
Out[22]:
0.96666666666666667
In [23]:
# Decision Tree
dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)
accuracy_score(y_test, y_pred)
Out[23]:
0.96666666666666667
In [24]:
# Extra Tree
etc = ExtraTreeClassifier(random_state=1)
etc.fit(X_train, y_train)

y_pred = etc.predict(X_test)
accuracy_score(y_test, y_pred)
Out[24]:
0.96666666666666667
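
Worth noting: sklearn.tree.ExtraTreeClassifier is a single extremely randomized tree, while the ensemble version, ExtraTreesClassifier, lives in sklearn.ensemble. A sketch of the ensemble for comparison (etc_ensemble is an illustrative name):

In [ ]:
from sklearn.ensemble import ExtraTreesClassifier

# An ensemble of extra trees, analogous to the random forest above.
etc_ensemble = ExtraTreesClassifier(n_estimators=100, random_state=1)
etc_ensemble.fit(X_train, y_train)
accuracy_score(y_test, etc_ensemble.predict(X_test))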
In [25]:
# Cross Validation
knn_scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
svc_scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
logreg_scores = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')
rf_scores = cross_val_score(rf, X, y, cv=10, scoring='accuracy')
gnb_scores = cross_val_score(gnb, X, y, cv=10, scoring='accuracy')
dtc_scores = cross_val_score(dtc, X, y, cv=10, scoring='accuracy')
etc_scores = cross_val_score(etc, X, y, cv=10, scoring='accuracy')
In [26]:
print('---------------------------')
print('  Model Accuracy Scores')
print('---------------------------')

print('knn', '\t', knn_scores.mean())
print('svm', '\t', svc_scores.mean())
print('logreg', '\t', logreg_scores.mean())
print('rf', '\t', rf_scores.mean())
print('gnb', '\t', gnb_scores.mean())
print('dtc', '\t', dtc_scores.mean())
print('etc', '\t', etc_scores.mean())
---------------------------
  Model Accuracy Scores
---------------------------
knn 	 0.966666666667
svm 	 0.98
logreg 	 0.953333333333
rf 	 0.96
gnb 	 0.953333333333
dtc 	 0.953333333333
etc 	 0.94
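
For what it's worth, the seven cross_val_score calls and print statements above can be collapsed into a single loop over a dict of the fitted models; an equivalent sketch:

In [ ]:
# Cross-validate every model in one pass and print the mean scores.
models = {'knn': knn, 'svm': svc, 'logreg': logreg, 'rf': rf,
          'gnb': gnb, 'dtc': dtc, 'etc': etc}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(name, '\t', scores.mean())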