Created by SmirkyGraphs. Code: GitHub. Source: Kaggle.


In [1]:
import pandas as pd
import numpy as np
import gc
In [2]:
# Application Train/Test

train = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')

features = pd.concat([train, test], keys=['train', 'test'], sort=False)

# 'XNA' / 'XAP' are placeholder codes in this dataset; treat them as missing
features = features.replace('XNA', np.nan)
features = features.replace('XAP', np.nan)

# Encoding Categories
name_contract_type = pd.get_dummies(features.NAME_CONTRACT_TYPE, prefix='name_contract_type')
code_gender = pd.get_dummies(features.CODE_GENDER, prefix='gender')
flag_own_car = pd.get_dummies(features.FLAG_OWN_CAR, prefix='own_car')
flag_own_realty = pd.get_dummies(features.FLAG_OWN_REALTY, prefix='own_realty')
name_type_suite = pd.get_dummies(features.NAME_TYPE_SUITE, prefix='type_suite')
name_income_type = pd.get_dummies(features.NAME_INCOME_TYPE, prefix='income_type')
name_education_type = pd.get_dummies(features.NAME_EDUCATION_TYPE, prefix='education_type')
name_family_status = pd.get_dummies(features.NAME_FAMILY_STATUS, prefix='family_status')
name_housing_type = pd.get_dummies(features.NAME_HOUSING_TYPE, prefix='housing_type')
occupation_type = pd.get_dummies(features.OCCUPATION_TYPE, prefix='occupation_type')
weekday_appr_process_start = pd.get_dummies(features.WEEKDAY_APPR_PROCESS_START, prefix='weekday_appr_process')
organization_type = pd.get_dummies(features.ORGANIZATION_TYPE, prefix='org_type')
fondkapremont_mode = pd.get_dummies(features.FONDKAPREMONT_MODE, prefix='fondkapremont_mode')
housetype_mode = pd.get_dummies(features.HOUSETYPE_MODE, prefix='housetype_mode')
wallsmaterial_mode = pd.get_dummies(features.WALLSMATERIAL_MODE, prefix='wallsmaterial_mode')
emergencystate_mode = pd.get_dummies(features.EMERGENCYSTATE_MODE, prefix='emergencystate_mode')


cols = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 
        'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
        'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

features = features.drop(columns=cols)

features_final = pd.concat([features, name_contract_type, code_gender, flag_own_car, flag_own_realty,
                           name_type_suite, name_income_type, name_education_type, name_family_status,
                           name_housing_type, occupation_type, weekday_appr_process_start,
                           organization_type, fondkapremont_mode, housetype_mode,
                           wallsmaterial_mode, emergencystate_mode], axis=1)

# recover the concat keys: level_0 holds the train/test flag, level_1 the old row index
features_final = features_final.reset_index()
features_final = features_final.drop(columns='level_1')
features_final = features_final.rename(columns={'level_0' : 'set'})

# 365243 days is this dataset's sentinel for DAYS_EMPLOYED (pensioners/unemployed);
# push it to an obviously out-of-range value instead
features_final['DAYS_EMPLOYED'] = features_final['DAYS_EMPLOYED'].replace(365243, -999999)
features_final = features_final.replace('XNA', np.nan)

del [name_contract_type, code_gender, flag_own_car, flag_own_realty, name_type_suite, name_income_type, name_education_type, 
name_family_status, name_housing_type, occupation_type, weekday_appr_process_start, organization_type, fondkapremont_mode,
housetype_mode, wallsmaterial_mode, emergencystate_mode]

gc.collect()
Out[2]:
14
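A side note, not part of the original run: the sixteen get_dummies calls above (and the similar block for previous_application further down) can be collapsed into one pass over the column list. A minimal sketch, applied before the drop and assuming the same features frame and cols list; the dummy column names would differ slightly from the hand-picked prefixes above:

dummies = pd.concat([pd.get_dummies(features[c], prefix=c.lower()) for c in cols], axis=1)
features_final = pd.concat([features.drop(columns=cols), dummies], axis=1)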
In [5]:
# Bureau Balance

bureau_balance = pd.read_csv('./input/bureau_balance.csv')
# one-hot the STATUS column (the only categorical field in this file)
bureau_balance = pd.get_dummies(bureau_balance, prefix='status')


# number of monthly records per bureau loan, mapped back onto every row
count = bureau_balance[['SK_ID_BUREAU', 'MONTHS_BALANCE']].groupby('SK_ID_BUREAU').count()
bureau_balance['months_count'] = bureau_balance['SK_ID_BUREAU'].map(count['MONTHS_BALANCE'])

# collapse to one row per bureau loan (mean of the monthly values and status dummies)
bureau_bal_final = bureau_balance.groupby('SK_ID_BUREAU').mean()

del bureau_balance
gc.collect()
Out[5]:
14
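The count-then-map idiom above recurs for every file in this notebook; groupby(...).transform('count') is a one-line equivalent that skips the intermediate frame. A sketch against the same bureau_balance columns:

bureau_balance['months_count'] = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].transform('count')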
In [6]:
# Bureau

bureau = pd.read_csv('./input/bureau.csv')

credit_active = pd.get_dummies(bureau.CREDIT_ACTIVE, prefix='cred_active')
credit_currency = pd.get_dummies(bureau.CREDIT_CURRENCY, prefix='cred_curr')
credit_type = pd.get_dummies(bureau.CREDIT_TYPE, prefix='cred_type')

cols = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
bureau = bureau.drop(columns=cols)

buro_final = pd.concat([bureau, credit_active, credit_currency, credit_type], axis=1)

# number of bureau credit records per current application
bureau_per_loan = buro_final[['SK_ID_CURR', 'SK_ID_BUREAU']].groupby('SK_ID_CURR').count()
buro_final['bureau_per_loan_count'] = buro_final['SK_ID_CURR'].map(bureau_per_loan['SK_ID_BUREAU'])

del credit_active, credit_currency, credit_type, bureau
gc.collect()

# attach the per-loan monthly aggregates, then collapse to one row per applicant
buro_final = buro_final.merge(right=bureau_bal_final, how='left', on='SK_ID_BUREAU')

buro_final = buro_final.groupby('SK_ID_CURR').mean()

del bureau_bal_final
gc.collect()
Out[6]:
28
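One detail worth flagging: after groupby(...).mean() the grouping key lives in the index, not in a column. The joins here (and in the merge cell below) rely on pandas 0.23+ matching merge(on=...) against an index level name; on older versions, reset the index first, e.g.:

buro_final = buro_final.reset_index()  # make SK_ID_CURR an ordinary column again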
In [7]:
# Credit Card Balance

credit_card_bal = pd.read_csv('./input/credit_card_balance.csv')
credit_card_bal = pd.get_dummies(credit_card_bal, prefix='cc_name_contract')

num_prev_loan = credit_card_bal[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
credit_card_bal['cc_prev_count'] = credit_card_bal['SK_ID_CURR'].map(num_prev_loan['SK_ID_PREV'])

credit_card_final = credit_card_bal.groupby('SK_ID_CURR').mean()

del credit_card_bal
gc.collect()
Out[7]:
35
In [8]:
# Installments Payments

installments_payments = pd.read_csv('./input/installments_payments.csv')

num_prev_loan = installments_payments[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
installments_payments['install_count'] = installments_payments['SK_ID_CURR'].map(num_prev_loan['SK_ID_PREV'])

installments_final = installments_payments.groupby('SK_ID_CURR').mean()

del installments_payments
gc.collect()
Out[8]:
35
In [9]:
# POS Cash Balance

pos_cash_bal = pd.read_csv('./input/POS_CASH_balance.csv')

pos_cash_bal = pos_cash_bal.replace('XNA', np.nan)
pos_cash_bal = pos_cash_bal.replace('XAP', np.nan)

pos_cash_bal = pd.get_dummies(pos_cash_bal, prefix='pos_name_contract')

num_prev_loan = pos_cash_bal[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
pos_cash_bal['pos_prev_count'] = pos_cash_bal['SK_ID_CURR'].map(num_prev_loan['SK_ID_PREV'])

pos_cash_final = pos_cash_bal.groupby('SK_ID_CURR').mean()

del pos_cash_bal
gc.collect()
Out[9]:
56
In [10]:
# Previous Application

prev_app = pd.read_csv('./input/previous_application.csv')

prev_app = prev_app.replace('XNA', np.nan)

prev_dumm = pd.DataFrame()

prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_CONTRACT_TYPE, prefix='name_contract_type')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.WEEKDAY_APPR_PROCESS_START, prefix='weekday_appr_start')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.FLAG_LAST_APPL_PER_CONTRACT, prefix='flag_last_appl')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_CASH_LOAN_PURPOSE, prefix='cash_purpose')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_CONTRACT_STATUS, prefix='contract_status')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_PAYMENT_TYPE, prefix='payment_type')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.CODE_REJECT_REASON, prefix='reject_reason')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_TYPE_SUITE, prefix='type_suite')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_CLIENT_TYPE, prefix='client_type')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_GOODS_CATEGORY, prefix='goods_cat')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_PORTFOLIO, prefix='portfolio')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_PRODUCT_TYPE, prefix='product_type')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.CHANNEL_TYPE, prefix='channel_type')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_SELLER_INDUSTRY, prefix='seller_indust')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.NAME_YIELD_GROUP, prefix='yield_group')], axis=1)
prev_dumm = pd.concat([prev_dumm, pd.get_dummies(prev_app.PRODUCT_COMBINATION, prefix='product_comb')], axis=1)


cols = ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE',
        'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 
        'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY',
        'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION']

prev_app = pd.concat([prev_app, prev_dumm], axis=1)

prev_app = prev_app.drop(columns=cols)

num_prev_loan = prev_app[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
prev_app['prev_count'] = prev_app['SK_ID_CURR'].map(num_prev_loan['SK_ID_PREV'])

prev_final = prev_app.groupby('SK_ID_CURR').mean()

del prev_dumm, prev_app
gc.collect()
Out[10]:
161
In [11]:
# Merge Files

final_data = features_final.merge(right=buro_final, how='left', on='SK_ID_CURR')
final_data = final_data.merge(right=prev_final, how='left', on='SK_ID_CURR')
final_data = final_data.merge(right=pos_cash_final, how='left', on='SK_ID_CURR')
final_data = final_data.merge(right=credit_card_final, how='left', on='SK_ID_CURR')
final_data = final_data.merge(right=installments_final, how='left', on='SK_ID_CURR')

# drop ID columns left over from the merges (the aggregated SK_ID_PREV means
# picked up _x/_y suffixes when they collided)
cols = ['SK_ID_PREV_x', 'SK_ID_PREV_y', 'SK_ID_BUREAU']

final_data = final_data.drop(columns=cols)

del features_final, buro_final, prev_final, pos_cash_final, credit_card_final, installments_final
gc.collect()
Out[11]:
140
In [12]:
# Split Train/Test

train = final_data.loc[final_data['set'] == 'train']
test = final_data.loc[final_data['set'] == 'test']

train = train.drop(['set'], axis=1)

test = test.drop(['set'], axis=1)
test = test.drop(['TARGET'], axis=1)

del final_data
gc.collect()
Out[12]:
21
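A quick sanity check, not in the original run: the concat/split round trip should preserve the source row counts (307,511 train and 48,744 test for the standard Home Credit files), and only the test rows should carry a missing TARGET.

print(train.shape, test.shape)          # expect 307511 and 48744 rows
assert train['TARGET'].notnull().all()  # test rows got NaN TARGET from the concat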
In [13]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import xgboost as xgb
In [14]:
X = train.drop(['TARGET'], axis=1)
y = train.TARGET

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=0.3)
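TARGET is heavily imbalanced (roughly 8% positives), so a stratified split is a common hedge to keep the holdout's class ratio stable. A variant, not what was run here:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4, test_size=0.3, stratify=y)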
In [15]:
clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.03,
            num_leaves=30,
            colsample_bytree=.8,
            subsample=.9,
            max_depth=7,
            reg_alpha=.1,
            reg_lambda=.1,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )
In [16]:
clf.fit(X_train, y_train, 
        eval_set = [(X_train, y_train), (X_test, y_test)], 
        eval_metric='auc', verbose=100, early_stopping_rounds=100
       )
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.783931	valid_1's auc: 0.760559
[200]	training's auc: 0.806841	valid_1's auc: 0.771149
[300]	training's auc: 0.821768	valid_1's auc: 0.774805
[400]	training's auc: 0.834434	valid_1's auc: 0.776295
[500]	training's auc: 0.845971	valid_1's auc: 0.776932
[600]	training's auc: 0.856261	valid_1's auc: 0.77719
[700]	training's auc: 0.865363	valid_1's auc: 0.777286
[800]	training's auc: 0.874226	valid_1's auc: 0.777523
[900]	training's auc: 0.882001	valid_1's auc: 0.777479
Early stopping, best iteration is:
[819]	training's auc: 0.875897	valid_1's auc: 0.777598
Out[16]:
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        importance_type='split', learning_rate=0.03, max_depth=7,
        min_child_samples=20, min_child_weight=2, min_split_gain=0.01,
        n_estimators=4000, n_jobs=-1, num_leaves=30, objective=None,
        random_state=None, reg_alpha=0.1, reg_lambda=0.1, silent=-1,
        subsample=0.9, subsample_for_bin=200000, subsample_freq=0,
        verbose=-1)
In [17]:
xgb_model = xgb.XGBClassifier(
                n_estimators=4000,
                learning_rate=0.03,
                n_jobs=-1, 
                nthread=-1 
                )
In [18]:
xgb_model.fit(X_train, y_train, 
        eval_set = [(X_train, y_train), (X_test, y_test)], 
        eval_metric='auc', verbose=100, early_stopping_rounds=100
       )
[0]	validation_0-auc:0.688411	validation_1-auc:0.683621
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[100]	validation_0-auc:0.737058	validation_1-auc:0.73027
[200]	validation_0-auc:0.764864	validation_1-auc:0.753471
[300]	validation_0-auc:0.777168	validation_1-auc:0.76255
[400]	validation_0-auc:0.784437	validation_1-auc:0.76709
[500]	validation_0-auc:0.789378	validation_1-auc:0.769664
[600]	validation_0-auc:0.793224	validation_1-auc:0.771252
[700]	validation_0-auc:0.796545	validation_1-auc:0.772734
[800]	validation_0-auc:0.799528	validation_1-auc:0.773678
[900]	validation_0-auc:0.802359	validation_1-auc:0.774525
[1000]	validation_0-auc:0.804939	validation_1-auc:0.775189
[1100]	validation_0-auc:0.807429	validation_1-auc:0.775831
[1200]	validation_0-auc:0.809751	validation_1-auc:0.776371
[1300]	validation_0-auc:0.812046	validation_1-auc:0.776712
[1400]	validation_0-auc:0.814172	validation_1-auc:0.777097
[1500]	validation_0-auc:0.816074	validation_1-auc:0.777355
[1600]	validation_0-auc:0.818133	validation_1-auc:0.77769
[1700]	validation_0-auc:0.820005	validation_1-auc:0.777868
[1800]	validation_0-auc:0.821764	validation_1-auc:0.778007
[1900]	validation_0-auc:0.823649	validation_1-auc:0.778062
[2000]	validation_0-auc:0.825222	validation_1-auc:0.778147
[2100]	validation_0-auc:0.826838	validation_1-auc:0.778276
[2200]	validation_0-auc:0.828373	validation_1-auc:0.778281
Stopping. Best iteration:
[2142]	validation_0-auc:0.827491	validation_1-auc:0.77829

Out[18]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=4000,
       n_jobs=-1, nthread=-1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
In [19]:
lgb_results = clf.predict_proba(test, num_iteration=clf.best_iteration_)[:,1]
xgb_results = xgb_model.predict_proba(test)[:,1]
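Note that, unlike the LightGBM line, the XGBoost call above does not pin the prediction to the early-stopped iteration; whether the legacy sklearn wrapper does so automatically varies by version. Passing ntree_limit explicitly (a keyword of the legacy API) makes it unambiguous; a sketch:

xgb_results = xgb_model.predict_proba(test, ntree_limit=xgb_model.best_ntree_limit)[:, 1]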
In [20]:
predictions = xgb_results*0.70 + lgb_results*0.30
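The 70/30 weighting presumably reflects the validation AUCs above (~0.7783 for XGBoost vs ~0.7776 for LightGBM). Scoring candidate weights on the 30% holdout would make the choice explicit (noting that the holdout also drove early stopping); a sketch reusing the earlier split:

from sklearn.metrics import roc_auc_score

lgb_val = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1]
xgb_val = xgb_model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, 0.70 * xgb_val + 0.30 * lgb_val))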
In [21]:
submission = pd.DataFrame({'SK_ID_CURR': test.SK_ID_CURR, 'TARGET': predictions}) 
In [22]:
submission.to_csv('submissions.csv', index=False)