Created by SmirkyGraphs. Code: GitHub. Source: Kaggle.
1) Introduction
2) Correlation
5) Predicting
# For Data
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
# For Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# IPython/Jupyter magic (not plain Python): show matplotlib figures inline in
# the notebook output.
%matplotlib inline
# Keep a handle on seaborn's current color palette.
my_color = sns.color_palette()
sns.set_style('darkgrid')
# NOTE(review): sns.set() runs after set_style() and applies seaborn's default
# theme - confirm the explicit 'darkgrid' call above is still needed.
sns.set()
# Load the training data from train.csv in the working directory.
df = pd.read_csv('train.csv')
# Preview the first rows and the (rows, columns) dimensions; these bare
# expressions only produce output when run as notebook cells.
df.head()
df.shape
The training set contains 1460 samples, each with 80 features and 1 target variable, SalePrice.
# Load the test data from test.csv in the working directory and preview it.
test = pd.read_csv('test.csv')
test.head()
test.shape
# Column dtypes and non-null counts of the training frame.
df.info()
df.head()
# Statistical Summary of Target
df['SalePrice'].describe()
# Statistical Summary
df.describe()
# Compute the correlation matrix once, on the numeric columns only.
# df still holds string-valued columns at this point, and DataFrame.corr()
# raises on those in pandas >= 2.0; select_dtypes() works on old and new
# pandas releases alike.
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Custom colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap
sns.heatmap(corr_matrix, cmap=cmap, vmax=1, square=True, linewidths=.5)
# Correlation of every numeric column with the target (reuses the matrix
# instead of recomputing it).
corr = corr_matrix["SalePrice"]
# Descending order; .iloc makes the positional lookup explicit because
# integer positions passed to [] on a label-indexed Series are deprecated.
corr.iloc[np.argsort(corr.values)[::-1]]
# Top 35 Features
corr.sort_values(ascending=False).head(35)
# Histograms of the target and of the construction year.
for hist_col in ('SalePrice', 'YearBuilt'):
    df[hist_col].hist(bins=70)
# Scatter plot with a fitted regression line of SalePrice against each
# of these columns.
for reg_col in ('OverallQual', 'OverallCond', 'YearBuilt'):
    sns.regplot(x=reg_col, y='SalePrice', data=df, color='g')
# Price per square foot: SalePrice divided by GrLivArea.
price_per_sf = df['SalePrice'] / df['GrLivArea']
df['PricePrSF'] = price_per_sf
df['PricePrSF'].hist(bins=70)
df['PricePrSF'].describe()
# 3x2 grid of scatter plots, each plotting one column against SalePrice.
x, axarr = plt.subplots(3, 2, figsize=(10, 9))
price = df.SalePrice.values
# (column in df, panel title) pairs, in row-major panel order.
panels = [
    ('GrLivArea', 'GrLiveArea'),
    ('GarageArea', 'GarageArea'),
    ('TotalBsmtSF', 'TotalBsmtSF'),
    ('1stFlrSF', '1stFlrSF'),
    ('TotRmsAbvGrd', 'TotRmsAbvGrd'),
    ('MasVnrArea', 'MasVnrArea'),
]
for panel_ax, (column, title) in zip(axarr.flat, panels):
    panel_ax.scatter(df[column].values, price)
    panel_ax.set_title(title)
# One shared y-axis label, placed just left of the figure.
x.text(-0.01, 0.5, 'Sale Price', va='center', rotation='vertical', fontsize = 12)
plt.tight_layout()
plt.show()
# Count plot for each categorical column.
# sns.factorplot was renamed to sns.catplot (seaborn 0.9) and later removed,
# and the column must now be passed by keyword, so use catplot(x=...).
count_columns = (
    'SaleCondition',
    'SaleType',
    'Street',
    'Alley',
    'BldgType',
    'HouseStyle',
    'OverallQual',
)
for count_col in count_columns:
    sns.catplot(x=count_col, data=df, kind="count")
# Neighborhood has many levels: use a wider figure and rotate the tick labels.
plt.figure(figsize = (12, 6))
sns.countplot(x = 'Neighborhood', data = df)
xt = plt.xticks(rotation=45)
# SalePrice distribution per level of the basement columns (2x2 grid).
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there.
fig, ax = plt.subplots(2, 2, figsize = (10, 8))
sns.boxplot(x = 'BsmtCond', y = 'SalePrice', data = df, ax = ax[0, 0])
sns.boxplot(x = 'BsmtQual', y = 'SalePrice', data = df, ax = ax[0, 1])
sns.boxplot(x = 'BsmtExposure', y = 'SalePrice', data = df, ax = ax[1, 0])
sns.boxplot(x = 'BsmtFinType1', y = 'SalePrice', data = df, ax = ax[1, 1])
# SalePrice distribution per house style and building type (side by side).
fig, ax = plt.subplots(1, 2, figsize = (12,6))
sns.boxplot(x = 'HouseStyle', y = 'SalePrice', data = df, ax = ax[0])
sns.boxplot(x = 'BldgType', y = 'SalePrice', data = df, ax = ax[1])
# sns.factorplot was renamed to sns.catplot (seaborn 0.9) and later removed;
# columns must be passed by keyword in current seaborn.
sns.catplot(x='HeatingQC', data=df, kind="count")
sns.catplot(x='Fence', data=df, kind="count")
# Mean SalePrice per HeatingQC level, split by CentralAir.
# kind='point' is spelled out because it was factorplot's default while
# catplot defaults to 'strip'; `size` was renamed to `height`.
sns.catplot(x='HeatingQC', y='SalePrice', hue = 'CentralAir', kind='point',
            estimator = np.mean, data = df, height = 4.5, aspect = 1.4)
sns.catplot(x='BedroomAbvGr', data=df, kind="count")
train = df
# Joining train and test; the keys become the outer index level so the two
# parts can be separated again with .loc['train'] / .loc['test'].
features = pd.concat([train, test], keys=['train', 'test'])
features.isnull().sum().sort_values(ascending=False).head(35)
# Removing Outliers: very large living area sold at a low price.
# Rows that came from `test` have no SalePrice (NaN), so the comparison is
# False for them and only training rows can be dropped.
outlier_mask = (features['GrLivArea'] > 4000) & (features['SalePrice'] < 300000)
# DataFrame.drop returns a new frame; the result has to be assigned back,
# otherwise the outliers silently stay in the data.
features = features.drop(features[outlier_mask].index)
# Columns where a missing value likely means the feature does not exist:
# mark those entries with the string 'None'.
none_fill_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType', 'MSSubClass',
]
for column in none_fill_columns:
    features[column] = features[column].fillna('None')

# Columns whose missing values are filled with zero (column -> fill value).
zero_fill_values = {
    'TotalBsmtSF': 0,
    'GarageYrBlt': 0,
    'GarageArea': 0.0,
    'GarageCars': 0.0,
    'MasVnrArea': 0.0,
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'BsmtUnfSF': 0,
    'BsmtFinSF1': 0,
    'BsmtFinSF2': 0,
}
for column, fill_value in zero_fill_values.items():
    features[column] = features[column].fillna(fill_value)

# Columns whose missing values are replaced by the most frequent value.
mode_fill_columns = [
    'MSZoning', 'Electrical', 'KitchenQual', 'SaleType',
    'Functional', 'Exterior1st', 'Exterior2nd', 'Utilities',
]
for column in mode_fill_columns:
    most_common = features[column].mode()[0]
    features[column] = features[column].fillna(most_common)

# LotFrontage: replace missing values with the column median.
lot_frontage_median = features['LotFrontage'].median()
features['LotFrontage'] = features['LotFrontage'].fillna(lot_frontage_median)
# Combine basement, first-floor and second-floor area into a single TotalSF
# column, then remove the three source columns.
area_columns = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features = features.drop(area_columns, axis=1)
# Quick checks: dtype distribution, remaining missing values, frame size.
features.dtypes.value_counts()
features.isnull().sum().sort_values(ascending=False).head(3)
features.shape
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error, make_scorer, mean_absolute_error
# Models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.tree import ExtraTreeRegressor, DecisionTreeRegressor
from sklearn.svm import SVR
from civismlext.stacking import StackedRegressor
# Stacking
from civismlext.nonnegative import NonNegativeLinearRegression
# Xgboost
import xgboost as xgb
# Replace each listed column by integer codes. The single LabelEncoder is
# re-fitted for every column, so after the loop it only holds the classes of
# the last column processed.
le = LabelEncoder()
label_encoded_columns = [
    'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical',
    'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'FireplaceQu',
    'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual',
    'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual',
    'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSZoning',
    'MasVnrType', 'Neighborhood', 'PavedDrive', 'RoofMatl', 'RoofStyle',
    'SaleCondition', 'SaleType', 'Street', 'Utilities', 'Fence',
    'Alley', 'MiscFeature', 'PoolQC', 'MSSubClass',
]
for column in label_encoded_columns:
    features[column] = le.fit_transform(features[column])
# Undo the earlier concatenation using the outer index keys.
train = features.loc['train']
test = features.loc['test']
# The test part must not carry the target or the column derived from it;
# the training part keeps SalePrice for now but loses PricePrSF.
test = test.drop(['SalePrice', 'PricePrSF'], axis=1)
train = train.drop(['PricePrSF'], axis=1)
print(train.shape)
print(test.shape)
# Splitting the data: 80% for fitting, 20% held out, fixed seed.
# NOTE(review): only SalePrice is removed here, so every other column of
# `train` (including Id) is used as a model input - confirm that is intended.
target_column = 'SalePrice'
X = train.drop([target_column], axis=1)
y = train[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)
# Gradient-boosted trees. Only n_jobs is passed: nthread is the deprecated
# spelling of the same setting, so giving both was redundant.
xgb_model = xgb.XGBRegressor(n_estimators=400, n_jobs=-1)
xgb_model.fit(X_train, y_train)
# Score on the fitting data and on the held-out split.
xgb_model.score(X_train,y_train)
xgb_model.score(X_test,y_test)
# Random forest. y_train is passed as-is, like the other fits in this file;
# Series.ravel() is deprecated in recent pandas and is not needed here.
rf = RandomForestRegressor(random_state=4, n_estimators=4000, n_jobs=-1)
rf.fit(X_train, y_train)
rf.score(X_train,y_train)
rf.score(X_test,y_test)
# AdaBoost. random_state is set so repeated runs give the same model, in line
# with the other seeded estimators in this file.
ada = AdaBoostRegressor(n_estimators=400, learning_rate=0.1, random_state=4)
ada.fit(X_train, y_train)
ada.score(X_train,y_train)
ada.score(X_test,y_test)
# Three more base learners: non-negative linear regression, extra trees and
# cross-validated elastic net.
nn = NonNegativeLinearRegression(normalize=False, fit_intercept=True)
ex = ExtraTreesRegressor(max_features=50, n_jobs=-1, random_state=4)
en = ElasticNetCV(alphas=[0.1, 1, 10, 100], cv=5, n_jobs=-1, random_state=4)
# Fit each one on the training split, then score it on the training split
# and on the held-out split.
for estimator in (nn, ex, en):
    estimator.fit(X_train, y_train)
    estimator.score(X_train, y_train)
    estimator.score(X_test, y_test)
# (name, estimator) pairs handed to the stacked regressor.
models = [
    ('ada', ada),
    ('en', en),
    ('nn', nn),
    ('ex', ex),
    ('rf', rf),
]
stack = StackedRegressor(models, n_jobs=-1)
stack.fit(X_train, y_train)
stack.score(X_train,y_train)
stack.score(X_test,y_test)
# xgb_model has already been fitted on (X_train, y_train) earlier in the file;
# calling fit() again with the same data only repeated the training, so the
# fitted model is reused under the name the blend below expects.
XGB_model = xgb_model
# Held-out error of the stacked model.
y_pred = stack.predict(X_test)
print('MAE:\t$%.2f' % mean_absolute_error(y_test, y_pred))
print('MSLE:\t%.5f' % mean_squared_log_error(y_test, y_pred))
# Final prediction: weighted blend of XGBoost (30%) and the stack (70%).
predictions = XGB_model.predict(test)*0.30 + stack.predict(test)*0.70
# Write the submission file with one row per test Id.
submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
submission.to_csv('submissions.csv', index=False)