# Created by SmirkyGraphs. Source: GitHub.
# For Data
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# For Graphing
import seaborn as sns
import matplotlib.pyplot as plt
# `%matplotlib inline` is an IPython/Jupyter magic for rendering figures
# inline; it is a SyntaxError in plain Python, so it is preserved here only
# as a comment. Re-enable it when running these cells inside a notebook:
# %matplotlib inline

# Styling
sns.set_style('darkgrid')        # dark grid background for all figures
my_color = sns.color_palette()   # default palette, reused by the bar chart below
# NOTE(review): sns.set() restores seaborn defaults and may override the
# set_style() call above ('darkgrid' happens to be the default) — confirm intent.
sns.set()
# Loading the iris data
df = sns.load_dataset("iris")   # sepal/petal length & width plus a 'species' label column
# NOTE: the bare expressions below only render output in a notebook;
# in a plain script they are evaluated and discarded.
df.head()                       # first rows, for a quick schema check
df.describe()                   # summary statistics of the numeric columns
df.info()                       # dtypes and non-null counts
df.shape                        # (rows, columns)
df['species'].value_counts()    # class balance across the three species
# We have 150 samples: 50 of each iris species.
# Bar chart of each measurement's mean, grouped by species
plot = df.pivot_table(index=df['species'], values=['sepal_length','sepal_width','petal_length','petal_width'],
aggfunc='mean').plot(kind='bar', color=my_color, legend=True, rot=0)
# Pairwise scatter plots of every feature combination, colored by species,
# to eyeball how separable the three classes are
sns.pairplot(df, hue="species")
# Predicting setosa should be no problem, as it stands out from the others in every way;
# however, versicolor and virginica overlap somewhat, which may cause misclassifications.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
# Encode the string species labels as integer ids so the classifiers
# receive a numeric target vector.
encoder = preprocessing.LabelEncoder()
df['species_id'] = encoder.fit_transform(df['species'])
df.head()

# Features are every column except the two label columns; the target is
# the freshly encoded species id.
feature_cols = [col for col in df.columns if col not in ('species', 'species_id')]
X = df[feature_cols]
y = df['species_id']

# 60/40 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
# Support Vector Machine: fit on the training split, then score the
# held-out test set with plain accuracy.
svc = svm.SVC()
y_pred = svc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)
# K-Nearest Neighbors with k=8; fit and predict in one chained call
# (sklearn's fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=8)
y_pred = knn.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)
# Logistic Regression baseline; seeded for reproducibility.
logreg = LogisticRegression(random_state=1)
y_pred = logreg.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)
# Random Forest ensemble of 100 trees.
rf = RandomForestClassifier(n_estimators=100)
y_pred = rf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)
# Gaussian Naive Bayes with default priors.
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)
# Single Decision Tree; seeded so the split choices are reproducible.
dtc = DecisionTreeClassifier(random_state=1)
y_pred = dtc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)
# Extremely randomized tree; seeded for reproducible random splits.
etc = ExtraTreeClassifier(random_state=1)
y_pred = etc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)
# Cross Validation: 10-fold CV on the full dataset gives a fairer accuracy
# estimate than the single train/test split above.
# (fixed: `knn_score` renamed to `knn_scores` for consistency with the
# other models' `*_scores` variables)
knn_scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
svc_scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy')
logreg_scores = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')
rf_scores = cross_val_score(rf, X, y, cv=10, scoring='accuracy')
gnb_scores = cross_val_score(gnb, X, y, cv=10, scoring='accuracy')
dtc_scores = cross_val_score(dtc, X, y, cv=10, scoring='accuracy')
etc_scores = cross_val_score(etc, X, y, cv=10, scoring='accuracy')

# Report each model's mean CV accuracy (same output format as before,
# with the seven copy-pasted prints collapsed into one loop).
print('---------------------------')
print(' Model Accuracy Scores')
print('---------------------------')
for name, scores in [('knn', knn_scores), ('svm', svc_scores),
                     ('logreg', logreg_scores), ('rf', rf_scores),
                     ('gnb', gnb_scores), ('dtc', dtc_scores),
                     ('etc', etc_scores)]:
    print(name, '\t', scores.mean())