Created by SmirkyGraphs. Source: GitHub.
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the Titanic training set; the path is relative to the working directory.
titanic_df = pd.read_csv('train.csv')
# Previewing the Data
titanic_df.head()
# Overall general info
titanic_df.info()
# Viewing the data types of each column
titanic_df.dtypes
There are 891 rows, with ~100 missing Age values and ~700 missing Cabin values.
# Summary statistics for the columns of titanic_df
titanic_df.describe()
Let's try to answer some important questions, such as:
# Passenger count per gender
sns.factorplot(x='Sex', kind='count', data=titanic_df)

# Passenger count per class, split by gender
sns.factorplot(x='Pclass', hue='Sex', kind='count', data=titanic_df)
Let's find out how many children were on board.
def male_female_child(passenger, child_age=16):
    """Label a passenger as 'child' or by their sex.

    Args:
        passenger: A two-item sequence ``(age, sex)``, e.g. one row of
            ``df[['Age', 'Sex']]`` as passed by ``DataFrame.apply(axis=1)``.
        child_age: Exclusive upper age bound for the 'child' label.
            Defaults to 16, the cutoff this analysis uses.

    Returns:
        'child' when ``age < child_age``; otherwise the sex value unchanged.
        A missing (NaN) age compares False, so such passengers are labelled
        by their sex.
    """
    age, sex = passenger
    return 'child' if age < child_age else sex
# Label every passenger via male_female_child, using their Age and Sex.
age_and_sex = titanic_df[['Age', 'Sex']]
titanic_df['person'] = age_and_sex.apply(male_female_child, axis=1)

# Passenger count per class, split by the new person label.
sns.factorplot(x='Pclass', hue='person', kind='count', data=titanic_df)
Surprisingly, most of the children were in 3rd class.
# Graphing age
age_column = titanic_df['Age']
age_column.hist(bins=70)
age_column.mean()
age_column.median()

# Number of passengers carrying each person label
titanic_df['person'].value_counts()
There are 83 passengers under the age of 16.
# The x-axis of every density plot runs from 0 to the highest recorded age.
oldest = titanic_df['Age'].max()

# One age-density plot per grouping column: Sex, person label, passenger class.
# After the loop, `fig` refers to the last grid drawn (Pclass).
for hue_column in ('Sex', 'person', 'Pclass'):
    fig = sns.FacetGrid(titanic_df, hue=hue_column, aspect=4)
    fig.map(sns.kdeplot, 'Age', shade=True)
    fig.set(xlim=(0, oldest))
    fig.add_legend()
Let's check which deck people were on.
# Cabin values for the passengers that have one recorded
deck = titanic_df['Cabin'].dropna()
deck.head()

# The first character of each cabin string is taken as the deck level
levels = [cabin[0] for cabin in deck]
cabin_df = DataFrame(levels, columns=['Cabin'])

# Leave out the 'T' entries, then count passengers per deck A-G
cabin_df = cabin_df[cabin_df.Cabin != 'T']
sns.factorplot(x='Cabin', data=cabin_df, order='ABCDEFG', kind='count', palette='winter_d')
Where did they depart from?
# Passenger count per Embarked value (C, Q, S), split by class
embark_order = ['C', 'Q', 'S']
sns.factorplot(x='Embarked', hue='Pclass', kind='count', order=embark_order, data=titanic_df)
Let's find out who was alone and who was with family.
For this we will use SibSp (siblings/spouses aboard) and Parch (parents/children aboard).
titanic_df.head()

# Family members aboard = SibSp + Parch.
family_size = titanic_df.SibSp + titanic_df.Parch

# Build the label column directly as strings. Storing the integer sum first
# and then overwriting it with strings through .loc depends on pandas silently
# upcasting an int64 column to object, which newer pandas versions deprecate.
# NOTE: a missing SibSp/Parch would be labelled 'Alone' here; the sum is
# assumed to have no missing values.
titanic_df['Alone'] = np.where(family_size > 0, 'With Family', 'Alone')
titanic_df.head()

# Graphing who was alone
sns.factorplot(x='Alone', data=titanic_df, palette='Blues', kind='count')

# Readable survival label for the count plots: 0 -> 'no', 1 -> 'yes'
titanic_df['Survivor'] = titanic_df.Survived.map({0: 'no', 1: 'yes'})
sns.factorplot(x='Survivor', data=titanic_df, kind='count')
Women and children first, right?
Let's find out.
# Graphing by the sex/age
sns.factorplot(x='Pclass', y='Survived', hue='person', data=titanic_df)

# Survival against age: overall, then split by class
sns.lmplot(x='Age', y='Survived', data=titanic_df)
sns.lmplot(x='Age', y='Survived', hue='Pclass', palette='winter', data=titanic_df)

# Same relationship with ages grouped into the bins below
generations = [10, 20, 40, 60, 80]
sns.lmplot(x='Age', y='Survived', hue='Pclass', palette='winter',
           x_bins=generations, data=titanic_df)
sns.lmplot(x='Age', y='Survived', hue='Sex', palette='winter_d',
           x_bins=generations, data=titanic_df)
# Rows with no missing values in any column
levels_df = titanic_df.dropna()
levels_df.head()

# Deck letter = first character of the Cabin value. Cabin is cast to str
# first, so a missing cabin becomes the text 'nan' and yields the letter 'n'.
char_cabin = titanic_df["Cabin"].astype(str)
new_Cabin = pd.Categorical(np.array([cabin[0] for cabin in char_cabin]))
new_Cabin.describe()

titanic_df['Deck'] = new_Cabin
titanic_df.head()

# Just getting values with a deck level: drop the 'n' (no cabin) and 'T' rows
has_deck_level = ~titanic_df.Deck.isin(['n', 'T'])
deck_df = titanic_df[has_deck_level]
deck_df.head()
deck_df.describe()
# Deck letters in the order they should appear on the plots
deck_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Survival against binned age, one regression line per deck
deck_age_grid = sns.lmplot(x='Age', y='Survived', hue="Deck", data=deck_df,
                           palette='winter', hue_order=deck_order,
                           x_bins=generations)
deck_age_grid.set(ylim=[-0.4, 1.4])

# Survival per deck
sns.factorplot(x='Deck', y='Survived', data=deck_df, palette='winter',
               order=deck_order)

# Survivor counts, split by deck
sns.factorplot(x='Survivor', hue='Deck', data=deck_df, palette='winter',
               hue_order=deck_order, kind="count")

# Survival against binned age, split by travelling alone vs. with family
sns.lmplot(x='Age', y='Survived', hue='Alone', data=titanic_df,
           palette='winter', x_bins=generations)

# Alone / With Family counts, split by survival
sns.factorplot(x='Alone', hue='Survivor', data=titanic_df, palette='winter',
               kind="count", hue_order=['yes', 'no'])
You were more likely to survive if you boarded the Titanic with family.
from sklearn.linear_model import LogisticRegression

# sklearn.cross_validation was removed from scikit-learn; cross_val_score now
# lives in sklearn.model_selection. Import whichever exists under the original
# name so cross_validation.cross_val_score(...) keeps working on old and new
# releases alike.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # older scikit-learn without model_selection
    from sklearn import cross_validation

# Fresh copy of the training data for modelling, without the columns that the
# exploration above added to titanic_df.
train_df = pd.read_csv('train.csv')
train_df.describe()
First, we have to take care of the nulls and convert our strings to numbers.
# Integer codes for the string-valued predictors.
_SEX_CODES = {'male': 0, 'female': 1}
_PERSON_CODES = {'male': 0, 'female': 1, 'child': 2}
_EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}
# 'n' is the first letter of 'nan', i.e. no cabin recorded; it shares code 7
# with deck 'T'.
_DECK_CODES = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6,
               'T': 7, 'n': 7}


def _encode_features(df, age_fill, fare_fill):
    """Fill missing values in *df* and add/encode the model's predictor columns.

    The same steps are applied to the training and the test data so both end
    up with identically encoded columns.

    Args:
        df: DataFrame with the raw Age, Fare, Embarked, Sex, SibSp, Parch and
            Cabin columns. It is modified in place.
        age_fill: Value used for missing Age entries.
        fare_fill: Value used for missing Fare entries.

    Returns:
        The same DataFrame, for convenience.
    """
    # Filling missing values
    df['Age'] = df['Age'].fillna(age_fill)
    df['Fare'] = df['Fare'].fillna(fare_fill)
    # Filling missing with the most common
    df['Embarked'] = df['Embarked'].fillna('S')

    # Gender/Age label. Must be derived while 'Sex' still holds strings.
    df['person'] = df[['Age', 'Sex']].apply(male_female_child, axis=1)

    # Alone/With Family: 1 = no siblings/parents aboard, 0 = with family.
    # Computed straight as integers instead of writing strings into an
    # integer column and replacing them again afterwards.
    df['Alone'] = ((df['SibSp'] + df['Parch']) == 0).astype(int)

    # Deck: first character of this frame's own Cabin value (missing -> 'n').
    df['Deck'] = df['Cabin'].astype(str).str[0].map(_DECK_CODES)

    # Strings -> numbers. A value without a code becomes NaN, which makes the
    # model fail loudly instead of silently receiving a string.
    df['Sex'] = df['Sex'].map(_SEX_CODES)
    df['person'] = df['person'].map(_PERSON_CODES)
    df['Embarked'] = df['Embarked'].map(_EMBARKED_CODES)
    return df


# Fill values come from the training data only and are reused for the test
# data below.
age_median = train_df['Age'].median()
fare_median = train_df['Fare'].median()

train_df = _encode_features(train_df, age_median, fare_median)

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'person', 'Alone', 'Deck']
alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg, train_df[predictors], train_df['Survived'], cv=3)
print(scores.mean())

# Test Data
test_df = pd.read_csv('test.csv')
test_df = _encode_features(test_df, age_median, fare_median)

# Predict Test values
alg.fit(train_df[predictors], train_df['Survived'])
# Make predictions using the test set
predictions = alg.predict(test_df[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
# index=False: otherwise the row index is written as an extra unnamed column.
submission.to_csv('submission.csv', index=False)