# Created by SmirkyGraphs. Code: GitHub. Source: Kaggle.
import pandas as pd
import numpy as np
import gc
# Application Train/Test
train = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')
# Stack train and test so encoding sees every category in both sets; the
# 'train'/'test' keys become index level 0, used later to split back apart.
features = pd.concat([train, test], keys=['train', 'test'], sort=False)
# 'XNA' / 'XAP' appear to be this dataset's placeholder codes for missing
# values -- normalize both to NaN.
features = features.replace(['XNA', 'XAP'], np.nan)
# Encoding Categories: one-hot encode each categorical column under the
# output prefix it has always used.
prefix_by_col = {
    'NAME_CONTRACT_TYPE': 'name_contract_type',
    'CODE_GENDER': 'gender',
    'FLAG_OWN_CAR': 'own_car',
    'FLAG_OWN_REALTY': 'own_realty',
    'NAME_TYPE_SUITE': 'type_suite',
    'NAME_INCOME_TYPE': 'income_type',
    'NAME_EDUCATION_TYPE': 'education_type',
    'NAME_FAMILY_STATUS': 'family_status',
    'NAME_HOUSING_TYPE': 'housing_type',
    'OCCUPATION_TYPE': 'occupation_type',
    'WEEKDAY_APPR_PROCESS_START': 'weekday_appr_process',
    'ORGANIZATION_TYPE': 'org_type',
    'FONDKAPREMONT_MODE': 'fondkapremont_mode',
    'HOUSETYPE_MODE': 'housetype_mode',
    'WALLSMATERIAL_MODE': 'wallsmaterial_mode',
    'EMERGENCYSTATE_MODE': 'emergencystate_mode',
}
dummies = [pd.get_dummies(features[col], prefix=pref)
           for col, pref in prefix_by_col.items()]
features = features.drop(columns=list(prefix_by_col))
features_final = pd.concat([features] + dummies, axis=1)
# Flatten the (set, row) MultiIndex into a plain 'set' column.
features_final = features_final.reset_index()
features_final = features_final.drop(columns='level_1')
features_final = features_final.rename(columns={'level_0': 'set'})
# 365243 is a sentinel value in DAYS_EMPLOYED; swap it for a distinct
# negative sentinel so it no longer reads as a huge positive day count.
features_final['DAYS_EMPLOYED'] = features_final['DAYS_EMPLOYED'].replace(365243, -999999)
features_final = features_final.replace('XNA', np.nan)
del dummies
gc.collect()
# Bureau Balance: one-hot the STATUS column, attach a per-loan record count,
# then average everything down to one row per SK_ID_BUREAU.
bureau_balance = pd.read_csv('./input/bureau_balance.csv')
bureau_balance = pd.get_dummies(bureau_balance, prefix='status')
months_per_loan = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].count()
bureau_balance['months_count'] = bureau_balance['SK_ID_BUREAU'].map(months_per_loan)
bureau_bal_final = bureau_balance.groupby('SK_ID_BUREAU').mean()
del bureau_balance
gc.collect()
# Bureau: encode the categorical columns, count bureau records per client,
# merge in the monthly-balance aggregates, and reduce to one row per client.
bureau = pd.read_csv('./input/bureau.csv')
categorical = {'CREDIT_ACTIVE': 'cred_active',
               'CREDIT_CURRENCY': 'cred_curr',
               'CREDIT_TYPE': 'cred_type'}
encoded = [pd.get_dummies(bureau[col], prefix=pref)
           for col, pref in categorical.items()]
buro_final = pd.concat([bureau.drop(columns=list(categorical))] + encoded, axis=1)
# Number of bureau records per client, broadcast back onto every row.
records_per_client = buro_final.groupby('SK_ID_CURR')['SK_ID_BUREAU'].count()
buro_final['bureau_per_loan_count'] = buro_final['SK_ID_CURR'].map(records_per_client)
del encoded, bureau
gc.collect()
# Left-join the per-SK_ID_BUREAU balance features, then average everything
# down to one row per client (SK_ID_CURR becomes the index).
buro_final = buro_final.merge(right=bureau_bal_final, how='left', on='SK_ID_BUREAU')
buro_final = buro_final.groupby('SK_ID_CURR').mean()
del bureau_bal_final
gc.collect()
# Credit Card Balance: one-hot the categorical column(s), count previous
# loans per client, then average to one row per SK_ID_CURR.
credit_card_bal = pd.read_csv('./input/credit_card_balance.csv')
credit_card_bal = pd.get_dummies(credit_card_bal, prefix='cc_name_contract')
prev_per_client = credit_card_bal.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
credit_card_bal['cc_prev_count'] = credit_card_bal['SK_ID_CURR'].map(prev_per_client)
credit_card_final = credit_card_bal.groupby('SK_ID_CURR').mean()
del credit_card_bal
gc.collect()
# Installments Payments: count installment rows per client, then average
# all columns down to one row per SK_ID_CURR.
installments_payments = pd.read_csv('./input/installments_payments.csv')
installs_per_client = installments_payments.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
installments_payments['install_count'] = installments_payments['SK_ID_CURR'].map(installs_per_client)
installments_final = installments_payments.groupby('SK_ID_CURR').mean()
del installments_payments
gc.collect()
# POS Cash Balance: normalize placeholder codes to NaN, one-hot the
# categorical column(s), count previous loans per client, and average to
# one row per SK_ID_CURR.
pos_cash_bal = pd.read_csv('./input/POS_CASH_balance.csv')
pos_cash_bal = pos_cash_bal.replace(['XNA', 'XAP'], np.nan)
pos_cash_bal = pd.get_dummies(pos_cash_bal, prefix='pos_name_contract')
prev_per_client = pos_cash_bal.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
pos_cash_bal['pos_prev_count'] = pos_cash_bal['SK_ID_CURR'].map(prev_per_client)
pos_cash_final = pos_cash_bal.groupby('SK_ID_CURR').mean()
del pos_cash_bal
gc.collect()
# Previous Application: one-hot each categorical column under its existing
# prefix, count previous applications per client, and average to one row
# per SK_ID_CURR.
prev_app = pd.read_csv('./input/previous_application.csv')
prev_app = prev_app.replace('XNA', np.nan)
prefix_by_col = {
    'NAME_CONTRACT_TYPE': 'name_contract_type',
    'WEEKDAY_APPR_PROCESS_START': 'weekday_appr_start',
    'FLAG_LAST_APPL_PER_CONTRACT': 'flag_last_appl',
    'NAME_CASH_LOAN_PURPOSE': 'cash_purpose',
    'NAME_CONTRACT_STATUS': 'contract_status',
    'NAME_PAYMENT_TYPE': 'payment_type',
    'CODE_REJECT_REASON': 'reject_reason',
    'NAME_TYPE_SUITE': 'type_suite',
    'NAME_CLIENT_TYPE': 'client_type',
    'NAME_GOODS_CATEGORY': 'goods_cat',
    'NAME_PORTFOLIO': 'portfolio',
    'NAME_PRODUCT_TYPE': 'product_type',
    'CHANNEL_TYPE': 'channel_type',
    'NAME_SELLER_INDUSTRY': 'seller_indust',
    'NAME_YIELD_GROUP': 'yield_group',
    'PRODUCT_COMBINATION': 'product_comb',
}
prev_dumm = pd.concat([pd.get_dummies(prev_app[col], prefix=pref)
                       for col, pref in prefix_by_col.items()], axis=1)
prev_app = pd.concat([prev_app, prev_dumm], axis=1)
prev_app = prev_app.drop(columns=list(prefix_by_col))
prev_per_client = prev_app.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
prev_app['prev_count'] = prev_app['SK_ID_CURR'].map(prev_per_client)
prev_final = prev_app.groupby('SK_ID_CURR').mean()
del prev_dumm, prev_app
gc.collect()
# Merge Files: left-join every per-client aggregate onto the application
# features on SK_ID_CURR.
final_data = (features_final
              .merge(right=buro_final, how='left', on='SK_ID_CURR')
              .merge(right=prev_final, how='left', on='SK_ID_CURR')
              .merge(right=pos_cash_final, how='left', on='SK_ID_CURR')
              .merge(right=credit_card_final, how='left', on='SK_ID_CURR'))
final_data = final_data.merge(right=installments_final, how='left', on='SK_ID_CURR')
# SK_ID_PREV_x / SK_ID_PREV_y come from merge suffix collisions between the
# aggregate tables; they and SK_ID_BUREAU are ID helpers, not features.
final_data = final_data.drop(columns=['SK_ID_PREV_x', 'SK_ID_PREV_y', 'SK_ID_BUREAU'])
del features_final, buro_final, prev_final, pos_cash_final, credit_card_final, installments_final
gc.collect()
# Split Train/Test: the 'set' column (added when the frames were first
# concatenated) says which file each row came from.
train = final_data.loc[final_data['set'] == 'train'].drop(columns=['set'])
test = final_data.loc[final_data['set'] == 'test'].drop(columns=['set', 'TARGET'])
del final_data
gc.collect()
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import xgboost as xgb
# Modeling: fit a LightGBM and an XGBoost classifier on the engineered
# features and blend their predicted probabilities for the submission.
#
# Fix: SK_ID_CURR is a row identifier, not a predictive signal. Training on
# it leaks an arbitrary ID into the model, so it is excluded from the
# feature matrix (it is still kept on `test` for the submission file).
feature_cols = [c for c in train.columns if c not in ('TARGET', 'SK_ID_CURR')]
X = train[feature_cols]
y = train.TARGET
# Hold out 30% of the labeled rows for evaluation / early stopping.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=0.3)
clf = LGBMClassifier(
    n_estimators=4000,
    learning_rate=0.03,
    num_leaves=30,
    colsample_bytree=.8,
    subsample=.9,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    silent=-1,
    verbose=-1,
)
# NOTE(review): `silent`/`verbose`/`early_stopping_rounds` as used here were
# removed in lightgbm 4.x (callbacks replace them) -- confirm the pinned
# lightgbm version before upgrading.
clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric='auc', verbose=100, early_stopping_rounds=100
        )
xgb_model = xgb.XGBClassifier(
    n_estimators=4000,
    learning_rate=0.03,
    n_jobs=-1,
    nthread=-1
)
xgb_model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              eval_metric='auc', verbose=100, early_stopping_rounds=100
              )
# Predict on the competition test set with the same feature columns used
# for training; LightGBM stops at the early-stopped best iteration.
lgb_results = clf.predict_proba(test[feature_cols], num_iteration=clf.best_iteration_)[:, 1]
xgb_results = xgb_model.predict_proba(test[feature_cols])[:, 1]
# Fixed weighted blend: 70% XGBoost, 30% LightGBM.
predictions = xgb_results * 0.70 + lgb_results * 0.30
submission = pd.DataFrame({'SK_ID_CURR': test.SK_ID_CURR, 'TARGET': predictions})
submission.to_csv('submissions.csv', index=False)