Pipelining and Applying Algorithms

Importing Libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Imputer, FunctionTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.feature_selection import SelectFromModel

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

Custom Classes

class FilterNAs(BaseEstimator, TransformerMixin):
    """Drop DataFrame columns that have too few non-missing values.

    Parameters
    ----------
    drop_thresh : float, default .5
        Minimum fraction of rows that must be non-NA for a column to be
        kept.
    """

    def __init__(self, drop_thresh = .5):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.drop_thresh = drop_thresh

    @property
    def d_per(self):
        """Backward-compatible alias of ``drop_thresh``."""
        return self.drop_thresh

    @d_per.setter
    def d_per(self, value):
        self.drop_thresh = value

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the sparsely populated columns.

        The input frame is left untouched.
        """
        # Non-NA counts are integers, so rounding the threshold up keeps
        # exactly the same columns as comparing against the raw float.
        min_non_na = int(np.ceil(self.drop_thresh * X.shape[0]))
        return X.dropna(thresh=min_non_na, axis=1)
    
class DropTextCol(BaseEstimator, TransformerMixin):
    """Drop object-dtype columns that have too many distinct values.

    Parameters
    ----------
    drop_thresh : float, default .2
        An object column is dropped when its number of distinct values
        divided by the number of rows of ``X`` is at least this value.
    """

    def __init__(self, drop_thresh = .2):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.drop_thresh = drop_thresh

    @property
    def d_per(self):
        """Backward-compatible alias of ``drop_thresh``."""
        return self.drop_thresh

    @d_per.setter
    def d_per(self, value):
        self.drop_thresh = value

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame without the high-cardinality text columns.

        The input frame is left untouched.
        """
        text_columns = X.select_dtypes(include=['object'])
        # The ratio is taken against the frame being transformed, not
        # against a module-level DataFrame that may have another length.
        distinct_ratio = text_columns.nunique() / X.shape[0]
        too_many = distinct_ratio[distinct_ratio >= self.drop_thresh].index.tolist()
        return X.drop(too_many, axis=1)

# Copied from Machine Learning Project Checklist from the book Hands-On Machine Learning with Scikit-Learn
# & TensorFlow by Aurélien Géron. Page 67.

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick a fixed set of DataFrame columns and hand them on as an array.

    Parameters
    ----------
    attribute_names : list
        Column labels to select from the incoming DataFrame.
    strings : bool, default False
        When truthy, the selected values are cast to ``'str'``.
    """

    def __init__(self, attribute_names, strings = False):
        self.strings = strings
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names].values``, optionally cast to str."""
        selected = X[self.attribute_names].values
        return selected.astype('str') if self.strings else selected
    
# class ToNumpyArray(BaseEstimator, TransformerMixin):      
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X, y=None):
#         return np.c_[X]
#     def names(self, X, y=None):
#         return X.columns.tolist()   

Scikit Learn CategoricalEncoder

This scikit-learn class is currently only available in the GitHub development version; it will be included in a standard release at a later date.

Link to code: https://github.com/scikit-learn/scikit-learn/blob/47ce5e1/sklearn/preprocessing/data.py#L2871. Idea taken from 02_end_to_end_machine_learning_project. Part of the Machine Learning and Deep Learning in python using Scikit-Learn and TensorFlow jupyter notebooks.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical columns as one-hot vectors or ordinal integers.

    Every input column is encoded independently with its own
    ``LabelEncoder``.

    Parameters
    ----------
    encoding : {'onehot', 'onehot-dense', 'ordinal'}, default 'onehot'
        'onehot' returns a SciPy CSR matrix with one column per category,
        'onehot-dense' returns the same encoding as a dense array and
        'ordinal' returns one integer-coded column per input column.
    categories : 'auto' or list of sorted array-likes, default 'auto'
        With 'auto' the categories of each column are learned in ``fit``;
        otherwise ``categories[i]`` holds the categories of column ``i``.
    dtype : number type, default np.float64
        dtype of the transformed output.
    handle_unknown : {'error', 'ignore'}, default 'error'
        With 'error' an unknown category raises ``ValueError``; with
        'ignore' (one-hot encodings only) it is encoded as an all-zero
        group of columns.

    Attributes
    ----------
    categories_ : list of arrays
        Categories of each column, set by ``fit``.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def _validate_X(self, X):
        """Validate ``X`` as a 2-D array, shared by ``fit`` and ``transform``.

        Input without a ``dtype`` attribute (e.g. a list of lists) that
        NumPy converts to a string array is re-validated as ``object``.
        """
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            # Builtin ``object`` replaces the removed ``np.object`` alias.
            return check_array(X, dtype=object)
        return X_temp

    def fit(self, X, y=None):
        """Learn (or validate) the categories of every column of ``X``.

        Raises
        ------
        ValueError
            For an invalid ``encoding`` / ``handle_unknown`` value, for the
            unsupported combination ordinal + 'ignore', for unsorted
            user-supplied categories, or when ``handle_unknown='error'``
            and ``X`` holds values outside the supplied categories.
        """
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending ``encoding`` value, not ``handle_unknown``.
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        if self.categories != 'auto':
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")

        X = self._validate_X(X)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                if self.handle_unknown == 'error':
                    valid_mask = np.isin(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # User-supplied categories are installed directly.
                le.classes_ = np.array(self.categories[i])

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Encode ``X`` with the categories learned in ``fit``.

        Returns a CSR matrix ('onehot'), a dense array ('onehot-dense') or
        an integer-coded array cast to ``dtype`` ('ordinal').

        Raises
        ------
        ValueError
            When ``handle_unknown='error'`` and ``X`` contains a category
            that was not seen during ``fit``.
        """
        X = self._validate_X(X)

        n_samples, n_features = X.shape
        # Builtin ``int`` / ``bool`` replace the removed NumPy aliases.
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.isin(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        # Build the CSR matrix by hand: each feature owns a contiguous
        # range of output columns starting at feature_indices[i].
        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        feature_indices = np.cumsum(n_values)

        indices = (X_int + feature_indices[:-1]).ravel()[mask]
        indptr = X_mask.sum(axis=1).cumsum()
        indptr = np.insert(indptr, 0, 0)
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csr_matrix((data, indices, indptr),
                                shape=(n_samples, feature_indices[-1]),
                                dtype=self.dtype)
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

    def inverse_transform(self, X):
        """Map encoded data back to the original categories.

        Unknown categories that were ignored during ``transform`` (all-zero
        one-hot groups) come back as ``None``.

        Raises
        ------
        ValueError
            When the number of columns of ``X`` does not match the fitted
            encoding.
        """
        # Imported here because the file's import cells do not provide it.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'categories_')
        X = check_array(X, accept_sparse='csr')

        n_samples, _ = X.shape
        n_features = len(self.categories_)
        n_transformed_features = sum([len(cats) for cats in self.categories_])

        # validate shape of passed X
        msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
        if self.encoding == 'ordinal' and X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))
        elif (self.encoding.startswith('onehot')
                and X.shape[1] != n_transformed_features):
            raise ValueError(msg.format(n_transformed_features, X.shape[1]))

        # create resulting array of appropriate dtype
        # (np.result_type replaces np.find_common_type, removed in NumPy 2.0)
        dt = np.result_type(*[cat.dtype for cat in self.categories_])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        if self.encoding == 'ordinal':
            for i in range(n_features):
                labels = X[:, i].astype('int64')
                X_tr[:, i] = self.categories_[i][labels]

        else:  # encoding == 'onehot' / 'onehot-dense'
            j = 0
            found_unknown = {}

            for i in range(n_features):
                n_categories = len(self.categories_[i])
                sub = X[:, j:j + n_categories]

                # for sparse X argmax returns 2D matrix, ensure 1D array
                labels = np.asarray(sub.argmax(axis=1)).flatten()
                X_tr[:, i] = self.categories_[i][labels]

                if self.handle_unknown == 'ignore':
                    # ignored unknown categories: we have a row of all zero's
                    unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                    if unknown.any():
                        found_unknown[i] = unknown

                j += n_categories

            # if ignored are found: potentially need to upcast result to
            # insert None values
            if found_unknown:
                if X_tr.dtype != object:
                    X_tr = X_tr.astype(object)

                for idx, mask in found_unknown.items():
                    X_tr[mask, idx] = None

        return X_tr

Reading in Data

# Load the Lending Club export (the first line of the file is skipped),
# then discard rows without a loan status and rows that are entirely empty.
df = (
    pd.read_csv('LendingClub2012to2013.csv', skiprows=1, low_memory=False)
    .dropna(subset=['loan_status'])
    .dropna(how='all', axis=0)
)
# Untouched copy of the cleaned frame.
df_org = df.copy()


# Split into feature frame and target column.
X = df.drop('loan_status', axis=1)
Y = df['loan_status']

Pipelines

# Custom DataFrame pipeline: first drop sparsely populated columns, then
# drop text columns with too many distinct values.

df_pipline = Pipeline(steps=[
    ('FilterNAs', FilterNAs()),
    ('DropingText', DropTextCol()),
])

X = df_pipline.fit_transform(X)

# Remaining column names, grouped by dtype, for the two feature pipelines.
num_attribs = list(X.select_dtypes(include=['float64']).columns)
text_attribs = list(X.select_dtypes(include=['object']).columns)

# Feature Union transformation pipeline

# Numeric branch: select the float columns, impute the median, then scale.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    # Due to the presence of numerous outliers, the robust scaler is more
    # adequate than the standard scaler.
    ('robust_scaler', RobustScaler()),
    # A ('var_threshold', VarianceThreshold(threshold=(.9 * (1 - .9))))
    # step was removed here because it actually lowered overall precision
    # and recall.
])

# Text branch: select the object columns as strings and encode them.
text_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(text_attribs, strings=True)),
    ('categorical_encoder', CategoricalEncoder()),
    ('imputer', Imputer(strategy="median")),
])

# Numeric features come first in the combined output, text features second.
double_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("text_pipeline", text_pipeline),
])

# Fit both branches and convert the combined result to a dense array.
X = double_pipeline.fit_transform(X).toarray()

Testing/ Train Split

Hold-out is 20% of the data, Validation is 10% of the remaining data, Training is 90% of the remaining data.

# Generating the final test set (20% hold-out):
X_train, X_test_final_test, Y_train, Y_final_test = train_test_split(
    X, Y, test_size=.2)

# Generating the intermediate set (10% of what is left):
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train, Y_train, test_size=.1)

# Report how many samples ended up in each split.
print("Final test size: " + str(len(Y_final_test)))
print("Intermediate test size: " + str(len(Y_test)))
print("Training size: " + str(len(Y_train)))

Final test size: 37637
Intermediate test size: 15055
Training size: 135489

Code used to generate models and results

Computationally intensive because there are hundreds of variables.

At the bottom are machine learning models that are much less computationally intensive; they performed the final model testing.

Fine tuning the machine learning models

One of the models that performs well is the Random Forest. This model already achieves very high precision and recall for many classes of the target variable. By tuning the model, and perhaps modifying the target variable, we may be able to use it for feature selection.

# Baseline comparison: fit each classifier on the training split and score
# it on the intermediate (validation) split.
ml_algo = [GaussianNB(), DecisionTreeClassifier(),
            RandomForestClassifier(), LogisticRegression()]

for algo in ml_algo:
    Model = algo
    Model = Model.fit(X_train,Y_train)
    predict = Model.predict(X_test)

    print("============================",str(algo),"============================")
    print("Cross Validation: ", cross_val_score(Model, X_train,Y_train,cv=5), '\n')
    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round swaps precision with recall and transposes the matrix.
    print("Classification Report: ",'\n', classification_report(Y_test, predict), '\n')
    print("Accuary Score: ",accuracy_score(Y_test, predict), '\n')
    print("Confusion Matrix:",'\n',  confusion_matrix(Y_test, predict), '\n')
    print("=========================================================================================", '\n')

============================ GaussianNB(priors=None) ============================
Cross Validation:  [0.71821268 0.70867159 0.70432151 0.72991327 0.71378483] 

Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.89      0.90      0.90      2273
           Current       0.13      0.93      0.23       174
           Default       1.00      0.00      0.00      1864
        Fully Paid       0.75      1.00      0.86      8606
   In Grace Period       0.57      0.02      0.04       620
 Late (16-30 days)       0.19      0.00      0.01      1039
Late (31-120 days)       0.70      0.10      0.18       479

       avg / total       0.75      0.72      0.64     15055
 

Accuary Score:  0.7198937230156094 

Confusion Matrix: 
 [[2045    0    0  228    0    0    0]
 [   0  161    0   12    0    1    0]
 [  82    1    2 1773    0    1    5]
 [  40    1    0 8565    0    0    0]
 [  38  369    0  180   13   10   10]
 [  55  571    0  397    7    3    6]
 [  25  133    0  268    3    1   49]] 

========================================================================================= 

============================ DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') ============================
Cross Validation:  [0.99081249 0.99143911 0.99095841 0.99077321 0.99128991] 

Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.99      0.99      0.99      2293
           Current       0.97      0.98      0.98      1226
           Default       0.50      0.50      0.50         2
        Fully Paid       1.00      1.00      1.00     11415
   In Grace Period       0.17      0.12      0.14        34
 Late (16-30 days)       0.06      0.12      0.08         8
Late (31-120 days)       0.86      0.78      0.82        77

       avg / total       0.99      0.99      0.99     15055
 

Accuary Score:  0.9914314181335104 

Confusion Matrix: 
 [[ 2264     0     0    29     0     0     0]
 [    0  1202     0     0    16     4     4]
 [    0     0     1     0     0     0     1]
 [   21     0     0 11394     0     0     0]
 [    0    20     0     0     4     7     3]
 [    0     5     0     0     0     1     2]
 [    0     9     1     0     3     4    60]] 

========================================================================================= 

============================ RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) ============================
Cross Validation:  [0.98723341 0.98896679 0.98656678 0.98571692 0.98759919] 

Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.97      1.00      0.98      2212
           Current       0.99      0.94      0.97      1297
           Default       0.00      0.00      0.00         0
        Fully Paid       1.00      0.99      1.00     11520
   In Grace Period       0.00      0.00      0.00         2
 Late (16-30 days)       0.00      0.00      0.00         2
Late (31-120 days)       0.31      1.00      0.48        22

       avg / total       0.99      0.99      0.99     15055
 

Accuary Score:  0.987910993025573 

Confusion Matrix: 
 [[ 2207     0     0     4     0     0     1]
 [    0  1225     1     0    22    10    39]
 [    0     0     0     0     0     0     0]
 [   78    10     1 11419     1     6     5]
 [    0     0     0     0     0     0     2]
 [    0     1     0     0     0     0     1]
 [    0     0     0     0     0     0    22]] 

========================================================================================= 



C:\Users\jakes\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)


============================ LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) ============================
Cross Validation:  [0.99357981 0.99287823 0.99143817 0.99151135 0.96969921] 

Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.98      1.00      0.99      2249
           Current       0.99      0.93      0.96      1311
           Default       0.00      0.00      0.00         0
        Fully Paid       1.00      1.00      1.00     11464
   In Grace Period       0.00      0.00      0.00         6
 Late (16-30 days)       0.00      0.00      0.00         2
Late (31-120 days)       0.27      0.83      0.41        23

       avg / total       0.99      0.99      0.99     15055
 

Accuary Score:  0.9899701095981401 

Confusion Matrix: 
 [[ 2243     0     0     6     0     0     0]
 [    0  1225     2     0    23    11    50]
 [    0     0     0     0     0     0     0]
 [   42     5     0 11417     0     0     0]
 [    0     3     0     0     0     3     0]
 [    0     1     0     0     0     0     1]
 [    0     2     0     0     0     2    19]] 

========================================================================================= 



C:\Users\jakes\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)

Grid Search

# Hyper-parameter grids for the random forest. Only ``final_grid`` is
# handed to the search below; the other two are kept for reference.
param_grid = {
    'n_estimators': [15, 23, 30, 45, 60, 75],
    'max_features': ['auto', None, .5, .7],
    'min_samples_leaf': [2, 5, 10, 50, 120],
    'max_depth': [None, 10, 20, 50, 100],
    'n_jobs': [-1],
}

final_grid = {
    'n_estimators': [25],
    'max_features': [.5, None, 'sqrt'],
    'min_samples_leaf': [2, 10, 20],
    'max_depth': [None, 30],
    'n_jobs': [-1],
}


best_grid = {
    'n_estimators': [25],
    'max_features': [None],
    'min_samples_leaf': [10],
    'max_depth': [None],
    'n_jobs': [-1],
}

forest = RandomForestClassifier()
grid_search = GridSearchCV(estimator=forest, param_grid=final_grid, cv=5)
# NOTE(review): the search is fitted on the intermediate split
# (X_test / Y_test), not on X_train - confirm this is intentional.
grid_search.fit(X_test, Y_test)

Saving Model

# Persist the best estimator found by the grid search.
joblib.dump(value=grid_search.best_estimator_, filename="final_model.pkl")

Loading the Best Model

# Reload the persisted estimator. joblib files are pickles, so only load
# files that were written by this notebook.
final_model = joblib.load(filename="final_model.pkl")
final_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Getting the variable importance

Through this code we can get the variable importances of our tuned random forest.

Basic idea taken from Hands-On Machine Learning page 74 and https://stackoverflow.com/questions/36633460/how-to-get-feature-names-selected-by-feature-elimination-in-sklearn-pipeline.

# Getting the names of the columns
# (numeric attributes first, then the encoded categories, matching the
# order of the two branches in the feature union)

text_col_names = list(np.hstack((text_pipeline.named_steps['categorical_encoder'].categories_)))
num_col_names = list(np.asarray(num_attribs))
attributes = np.concatenate([num_col_names,text_col_names])

# Making a dataframe of importance

model = final_model

# Build the frame from (name, score) pairs and sort whole rows by score.
# Sorting a string array column by column would break the pairing between
# names and scores and would order the scores lexicographically.
feature_importance = pd.DataFrame(
    list(zip(attributes.tolist(), model.feature_importances_.tolist())),
    columns = ['Feature','Tree_Importance_Score'])
feature_importance = feature_importance.sort_values(
    'Tree_Importance_Score', ascending=False).reset_index(drop=True)
feature_importance.head(20)

	Feature	Tree_Importance_Score
0	wedding	9.907714074093709e-06
1	w	9.723394957926462e-07
2	vacation	9.619829870131448e-07
3	total_rev_hi_lim	9.564526420689163e-06
4	total_rec_prncp	9.520262367635957e-07
5	total_rec_late_fee	9.491119386912804e-07
6	total_rec_int	9.43946480146061e-06
7	total_pymnt_inv	9.386789949662516e-06
8	total_pymnt	9.266861532305998e-06
9	total_il_high_credit_limit	9.12268594982573e-07
10	total_bc_limit	8.860984618665859e-07
11	total_bal_ex_mort	8.665862926011948e-06
12	total_acc	8.576324971072865e-07
13	tot_hi_cred_lim	8.56190836110275e-07
14	tot_cur_bal	8.55569842128399e-05
15	tot_coll_amt	8.426843274140549e-07
16	tax_liens	8.411840850625117e-07
17	small_business	8.401010104036136e-07
18	revol_bal	8.241690403417078e-06
19	renewable_energy	8.218242241209607e-06

Saving to excel

# One row per (text attribute, category) pair, indexed by attribute name.
# The flat list of categories is longer than the list of text attributes,
# so the categories cannot simply be indexed by ``text_attribs``.
# NOTE: this rebinds ``df``, as the original cell did.
df = pd.DataFrame(
    [(attrib, category)
     for attrib, categories in zip(
         text_attribs,
         text_pipeline.named_steps['categorical_encoder'].categories_)
     for category in categories],
    columns=['Attribute', 'Category'],
).set_index('Attribute')
filepath = 'my_excel_file.xlsx'
df.to_excel(filepath)

Applying the Final Modeling

Feature Selection

The tuned Random Forest is in charge of picking the variables.

# Reduce the training and hold-out matrices to the features selected by
# the already fitted forest; the shape is printed before and after.
print(X_train.shape)
model = SelectFromModel(estimator=final_model, prefit=True)
X_train, X_test_final_test = (
    model.transform(X_train),
    model.transform(X_test_final_test),
)
print(X_train.shape)

(135489, 3010)
(135489, 20)

# Final comparison: fit each classifier on the reduced training split and
# score it on the hold-out split.
ml_algo = [GaussianNB(), DecisionTreeClassifier(),
            RandomForestClassifier(), LogisticRegression()]

for algo in ml_algo:
    Model = algo
    Model = Model.fit(X_train,Y_train)
    predict = Model.predict(X_test_final_test)

    print("============================",str(algo),"============================")
#     print("Cross Validation: ", cross_val_score(Model, X_train,Y_train,cv=10), '\n')
    # scikit-learn metrics expect (y_true, y_pred). Passing them the other
    # way round swaps precision with recall and transposes the matrix.
    print("Classification Report: ",'\n', classification_report(Y_final_test, predict), '\n')
#     print("Accuary Score: ",accuracy_score(Y_final_test, predict), '\n')
    print("Confusion Matrix:",'\n',  confusion_matrix(Y_final_test, predict), '\n')
    print("=========================================================================================", '\n')

============================ GaussianNB(priors=None) ============================
Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.91      0.96      0.93      5247
           Current       0.97      0.98      0.97      3062
           Default       1.00      0.18      0.31        22
        Fully Paid       0.99      0.98      0.99     29025
   In Grace Period       0.28      0.19      0.23       108
 Late (16-30 days)       0.32      0.18      0.23        51
Late (31-120 days)       0.59      0.65      0.61       122

       avg / total       0.97      0.97      0.97     37637
 

Confusion Matrix: 
 [[ 5037     0     0   210     0     0     0]
 [    0  3000     0     4    39     7    12]
 [    0     0     4     4     0     0    14]
 [  493     2     0 28530     0     0     0]
 [    1    43     0     6    21    10    27]
 [    0    18     0     7    14     9     3]
 [    5    30     0     5     1     2    79]] 

========================================================================================= 

============================ DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') ============================
Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.99      0.99      0.99      5514
           Current       0.97      0.98      0.98      3080
           Default       1.00      0.44      0.62         9
        Fully Paid       1.00      1.00      1.00     28789
   In Grace Period       0.23      0.19      0.21        88
 Late (16-30 days)       0.21      0.24      0.23        25
Late (31-120 days)       0.76      0.78      0.77       132

       avg / total       0.99      0.99      0.99     37637
 

Confusion Matrix: 
 [[ 5484     0     0    30     0     0     0]
 [    0  3011     0     0    41     8    20]
 [    0     0     4     0     0     0     5]
 [   52     1     0 28736     0     0     0]
 [    0    56     0     0    17     8     7]
 [    0     8     0     0    11     6     0]
 [    0    17     0     0     6     6   103]] 

========================================================================================= 

============================ RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) ============================
Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.99      1.00      1.00      5502
           Current       1.00      0.98      0.99      3160
           Default       0.75      0.60      0.67         5
        Fully Paid       1.00      1.00      1.00     28801
   In Grace Period       0.17      0.42      0.25        31
 Late (16-30 days)       0.25      0.28      0.26        25
Late (31-120 days)       0.78      0.93      0.85       113

       avg / total       1.00      1.00      1.00     37637
 

Confusion Matrix: 
 [[ 5496     0     0     6     0     0     0]
 [    0  3083     0     0    47     9    21]
 [    0     1     3     0     0     0     1]
 [   40     1     0 28760     0     0     0]
 [    0     4     0     0    13    10     4]
 [    0     3     0     0    11     7     4]
 [    0     1     1     0     4     2   105]] 

========================================================================================= 

============================ LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) ============================
Classification Report:  
                     precision    recall  f1-score   support

       Charged Off       0.96      1.00      0.98      5312
           Current       0.99      0.98      0.98      3145
           Default       0.00      0.00      0.00         0
        Fully Paid       1.00      0.99      1.00     29005
   In Grace Period       0.24      0.40      0.30        45
 Late (16-30 days)       0.07      0.12      0.09        17
Late (31-120 days)       0.76      0.91      0.83       113

       avg / total       0.99      0.99      0.99     37637
 

Confusion Matrix: 
 [[ 5297     0     0    15     0     0     0]
 [    0  3072     0     3    41     8    21]
 [    0     0     0     0     0     0     0]
 [  239    14     2 28748     0     1     1]
 [    0     5     0     0    18    14     8]
 [    0     2     0     0    11     2     2]
 [    0     0     2     0     5     3   103]] 

========================================================================================= 



C:\Users\jakes\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)