Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl
Custom Variable Scoring Function
def variable_score(X_train, Y_train):
    #__________________Extra_Trees_____________________________________
    from sklearn.ensemble import ExtraTreesClassifier
    model = ExtraTreesClassifier()
    model.fit(X_train, Y_train)
    # Pair each feature with its importance score, keeping the two aligned
    tree_importance = pd.DataFrame(
        {'Tree_Importance_Score': model.feature_importances_},
        index=pd.Index(X_train.columns, name='Feature'))
    feature_importance = tree_importance
    #__________________Correlation_____________________________________
    import math
    cor_list = []
    for column in X_train.columns:
        correlation = X_train[column].corr(Y_train)
        if math.isnan(correlation):
            correlation = 0
        cor_list.append([column, correlation])
    cor_list = pd.DataFrame(cor_list, columns=['Feature', 'Correlation_with_Target'])
    cor_list.set_index('Feature', inplace=True)
    feature_importance = feature_importance.join(cor_list)
    return feature_importance
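A quick smoke test on synthetic data (the column names and shapes below are made up purely for illustration):
# Hypothetical usage sketch for variable_score on random data
rng = np.random.RandomState(0)
toy_X = pd.DataFrame(rng.rand(100, 3), columns=['a', 'b', 'c'])
toy_Y = pd.Series(rng.randint(0, 2, size=100))
print(variable_score(toy_X, toy_Y))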
Reading in Data
df = pd.read_csv('LendingClub2012to2013.csv', skiprows=1, low_memory=False)
print(df.shape)
df = df[pd.notnull(df['loan_status'])]
# Keeping a backup copy of the data
df_original = df.copy()
(188183, 115)
Initial Dropping
Variables are initially dropped based on the percentage of missing values and on text columns with too many distinct classes.
# Fraction of non-missing values per column; drop columns that are more than half empty
not_missing = df.apply(lambda x: x.count() / df.shape[0]).sort_values()
not_missing = not_missing[not_missing < .5]
print(not_missing)
not_missing = not_missing.index.values.tolist()
df.drop(not_missing, axis=1, inplace=True)
annual_inc_joint 0.000000
inq_fi 0.000000
all_util 0.000000
max_bal_bc 0.000000
open_rv_24m 0.000000
open_rv_12m 0.000000
il_util 0.000000
total_cu_tl 0.000000
total_bal_il 0.000000
open_il_24m 0.000000
open_il_12m 0.000000
open_il_6m 0.000000
open_acc_6m 0.000000
verification_status_joint 0.000000
dti_joint 0.000000
mths_since_rcnt_il 0.000000
inq_last_12m 0.000000
next_pymnt_d 0.088883
mths_since_last_record 0.092857
mths_since_last_major_derog 0.172791
mths_since_recent_bc_dlq 0.195296
mths_since_recent_revol_delinq 0.289333
mths_since_last_delinq 0.428354
desc 0.432977
dtype: float64
# Fraction of unique values per column; drop text columns where 20% or more of the values are distinct
df_obj = df.select_dtypes(include=['object'])
too_many = df_obj.apply(lambda x: (x.nunique() / df.shape[0])).sort_values(ascending=False)
print(too_many)
too_many = too_many[too_many >= .2].index.values.tolist()
df.drop(too_many, axis=1, inplace=True)
id 1.000000
url 1.000000
emp_title 0.612102
title 0.238839
revol_util 0.005675
zip_code 0.004458
earliest_cr_line 0.003263
int_rate 0.000771
last_credit_pull_d 0.000319
last_pymnt_d 0.000314
addr_state 0.000260
sub_grade 0.000186
issue_d 0.000128
purpose 0.000069
emp_length 0.000058
loan_status 0.000037
grade 0.000037
home_ownership 0.000027
verification_status 0.000016
term 0.000011
initial_list_status 0.000011
application_type 0.000005
pymnt_plan 0.000005
dtype: float64
Data Transformation
Label encoding, imputation, train/test split
# Label encoding
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
cols = df.select_dtypes(include=['object']).columns.tolist()
for column in cols:
    df[column] = label.fit_transform(df[column].astype('str'))
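Note that fit_transform refits the same LabelEncoder on every column, so the fitted mapping is overwritten each pass through the loop. If the original category strings are needed later, a variant (not part of the original notebook) keeps one fitted encoder per column:
# Sketch: keep one fitted encoder per column so codes can be decoded later
encoders = {}
for column in cols:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column].astype('str'))
# e.g. encoders['grade'].inverse_transform(df['grade']) recovers the original strings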
# Imputation: Median
df.fillna(df.median(), inplace = True)
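Imputing over the full frame before splitting lets statistics from future test rows leak into training. A leakage-free variant would split first and fit the medians on the training portion only; a sketch, with illustrative names train_df and test_df:
# Hypothetical leakage-free pattern: fit imputation statistics on training rows only
train_medians = train_df.median()
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)  # reuse the training medians on held-out rows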
# Train/Test Splits
Y = df['loan_status']
X = df.drop('loan_status', axis=1)
from sklearn.model_selection import train_test_split
# Generating the final test set:
X_train, X_final_test, Y_train, Y_final_test = train_test_split(X, Y, test_size=.2)
# Generating the experiment set:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=.1)
print("Final test size: " + str(Y_final_test.shape[0]))
print("Intermediate test size: " + str(Y_test.shape[0]))
print("Training size: " + str(Y_train.shape[0]))
Final test size: 37637
Intermediate test size: 15055
Training size: 135489
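By default train_test_split shuffles but neither stratifies nor fixes a seed; for an imbalanced target such as loan_status, a stratified, reproducible version of the same two-stage split looks like this (the random_state value is arbitrary):
# Stratified, seeded variant of the two-stage split above
X_train, X_final_test, Y_train, Y_final_test = train_test_split(
    X, Y, test_size=.2, stratify=Y, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train, Y_train, test_size=.1, stratify=Y_train, random_state=42)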
# Scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Preserve the index so X_train still aligns with Y_train after re-wrapping
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
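The scaler is fit on the training set only, which avoids leakage, but the held-out splits are never transformed here; before they are fed to a model, they should go through the already-fitted scaler:
# Apply the training-set scaling to the held-out splits (reuses fitted means/variances)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
X_final_test = pd.DataFrame(scaler.transform(X_final_test), columns=X_final_test.columns, index=X_final_test.index)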
Removing Features with Little Variance
from sklearn.feature_selection import VarianceThreshold
print('Features:', X_train.shape[1])
# Remove all features that are either one or zero (on or off) in more than 90% of the samples
# http://scikit-learn.org/stable/modules/feature_selection.html
selector = VarianceThreshold(threshold=(.9 * (1 - .9)))
selector = selector.fit(X_train)
# get_support() flags the features that survive the threshold
below_threshold = X_train.columns[~selector.get_support()].values
print('Feature(s) to be dropped:', below_threshold)
# List comprehension to remove labels: https://stackoverflow.com/questions/4211209/remove-all-the-elements-that-occur-in-one-list-from-another
labels = [x for x in X_train.columns if x not in below_threshold]
X_train = pd.DataFrame(selector.transform(X_train), columns=labels, index=X_train.index)
print('Features:', X_train.shape[1])
Features: 86
Feature(s) to be dropped: ['pymnt_plan' 'policy_code' 'application_type']
Features: 83
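The threshold value comes from the variance of a Bernoulli (boolean) variable: a feature that takes one value in a fraction p of the samples has variance p(1 - p), so .9 * (1 - .9) = .09 targets features that are near-constant in more than 90% of rows. One caveat: since X_train was standardized to unit variance above, every non-constant column has variance 1 after scaling, so in practice only columns that are constant in the training split (here pymnt_plan, policy_code, and application_type) fall below the threshold.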
Finding Variable Importance with Correlation and Extra Trees
feature_scores = variable_score(X_train, Y_train)
feature_scores.sort_values('Tree_Importance_Score', ascending=False).head(10)
| Feature | Tree_Importance_Score | Correlation_with_Target |
|---|---|---|
| zip_code | 8.206404938300277e-05 | 9.05799017886097e-05 |
| verification_status | 5.368457724594576e-05 | 0.006985712324612762 |
| total_rev_hi_lim | 3.8450493069382144e-05 | 0.006453755025627213 |
| total_rec_prncp | 2.929757586780374e-05 | 0.005947541594535856 |
| total_rec_late_fee | 2.3402103757702527e-05 | 0.005683856517796645 |
| total_rec_int | 0.10177860331002972 | 0.004814525409741282 |
| total_pymnt_inv | 0.09515163532375359 | 0.0036057409102408016 |
| total_pymnt | 0.09503450247976335 | 0.003560534801558149 |
| total_acc | 0.07831017202339975 | 0.0027663034269301066 |
| tot_hi_cred_lim | 0.070266255189276 | 0.0025077576084473175 |
key_columns = feature_scores.sort_values('Tree_Importance_Score', ascending=False).head(10).index.tolist()
df_original[key_columns].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 188181 entries, 0 to 188180
Data columns (total 10 columns):
zip_code 188181 non-null object
verification_status 188181 non-null object
total_rev_hi_lim 160440 non-null float64
total_rec_prncp 188181 non-null float64
total_rec_late_fee 188181 non-null float64
total_rec_int 188181 non-null float64
total_pymnt_inv 188181 non-null float64
total_pymnt 188181 non-null float64
total_acc 188181 non-null float64
tot_hi_cred_lim 160440 non-null float64
dtypes: float64(8), object(2)
memory usage: 15.8+ MB
df_original[key_columns].head()
| | zip_code | verification_status | total_rev_hi_lim | total_rec_prncp | total_rec_late_fee | total_rec_int | total_pymnt_inv | total_pymnt | total_acc | tot_hi_cred_lim |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100xx | Verified | 43100.0 | 20800.00 | 0.0 | 3126.64 | 23926.64 | 23926.64001 | 41.0 | 43100.0 |
| 1 | 060xx | Not Verified | 61100.0 | 11633.82 | 0.0 | 1261.93 | 12895.75 | 12895.75000 | 22.0 | 333044.0 |
| 2 | 281xx | Not Verified | 29700.0 | 12000.00 | 0.0 | 1988.61 | 13988.61 | 13988.61000 | 18.0 | 29700.0 |
| 3 | 871xx | Source Verified | 8100.0 | 12000.00 | 0.0 | 1359.78 | 13359.78 | 13359.77686 | 32.0 | 18130.0 |
| 4 | 493xx | Verified | 28200.0 | 24000.00 | 0.0 | 4652.21 | 28652.21 | 28652.21000 | 39.0 | 229072.0 |
Plotting the Data
The target variable is loan_status.
# Target Variable
mpl.rcParams['figure.figsize'] = (5, 5)  # set the size before the figure is created
plt.hist(x='loan_status', data=df_original)
plt.xticks(rotation=45)
# plt.savefig('foo.png', bbox_inches="tight")
plt.show()

Below are histograms of the top ten feature variables.
# Top feature charts
fig, axs = plt.subplots(2, 5, figsize=(30, 15))
axs = axs.ravel()
titles = ['Zip Code', 'Verification Status', 'Total revolving high credit/credit limit',
          'Principal received to date', 'Late fees received to date', 'Interest received to date',
          'Pymnts received for portion of total amount funded by investors',
          'Pymnts received for total amount funded', 'Total number of credit lines',
          'Total high credit/credit limit']
for i, column in enumerate(key_columns):
    axs[i].hist(x=column, data=df_original[pd.notnull(df_original[column])])
    axs[i].set_title(titles[i])
# plt.savefig('foo2.png', bbox_inches="tight")
plt.show()
