Credit risk is associated with the possibility of a client failing to meet contractual obligations, such as mortgages, credit card debts, and other types of loans.
Minimizing the risk of default is a major concern for financial institutions. For this reason, commercial and investment banks, venture capital funds, asset management companies and insurance firms, to name a few, are increasingly relying on technology to predict which clients are more prone to not honoring their debts.
Machine Learning models have been helping these companies improve the accuracy of their credit risk analysis, providing a scientific method to identify potential defaulters in advance.
In this notebook, we will build a model to predict the risk of client default for Nubank, a prominent Brazilian Fintech.
Nubank is a Brazilian digital bank and one of the largest Fintechs in Latin America. It is known to be a data-driven company, taking advantage of technology to make decisions and improve services.
The data set can be downloaded here. Some private information was hashed to keep the data anonymous.
Let's import the libraries we'll need for the analysis and take a first look at our data frame.
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# set the aesthetic style of the plots
sns.set_style()
# filter warning messages
import warnings
warnings.filterwarnings('ignore')
# set default matplotlib parameters
COLOR = '#ababab'
mpl.rcParams['figure.titlesize'] = 16
mpl.rcParams['text.color'] = 'black'
mpl.rcParams['axes.labelcolor'] = COLOR
mpl.rcParams['xtick.color'] = COLOR
mpl.rcParams['ytick.color'] = COLOR
mpl.rcParams['grid.color'] = COLOR
mpl.rcParams['grid.alpha'] = 0.1
# import data set and create a data frame
df_credit = pd.read_csv('http://dl.dropboxusercontent.com/s/xn2a4kzf0zer0xu/acquisition_train.csv?dl=0')
# show first 5 rows
df_credit.head()
 | ids | target_default | score_1 | score_2 | score_3 | score_4 | score_5 | score_6 | risk_rate | last_amount_borrowed | ... | external_data_provider_fraud_score | lat_lon | marketing_channel | profile_phone_number | reported_income | shipping_state | shipping_zip_code | profile_tags | user_agent | target_fraud |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 343b7e7b-2cf8-e508-b8fd-0a0285af30aa | False | 1Rk8w4Ucd5yR3KcqZzLdow== | IOVu8au3ISbo6+zmfnYwMg== | 350.0 | 101.800832 | 0.259555 | 108.427273 | 0.40 | 25033.92 | ... | 645 | (-29.151545708122246, -51.1386461804385) | Invite-email | 514-9840782 | 57849.0 | BR-MT | 17528 | {'tags': ['n19', 'n8']} | Mozilla/5.0 (Linux; Android 6.0.1; SGP771 Buil... | NaN |
1 | bc2c7502-bbad-0f8c-39c3-94e881967124 | False | DGCQep2AE5QRkNCshIAlFQ== | SaamrHMo23l/3TwXOWgVzw== | 370.0 | 97.062615 | 0.942655 | 92.002546 | 0.24 | NaN | ... | 243 | (-19.687710705798963, -47.94151536525154) | Radio-commercial | 251-3659293 | 4902.0 | BR-RS | 40933 | {'tags': ['n6', 'n7', 'nim']} | Mozilla/5.0 (Linux; Android 5.0.2; SAMSUNG SM-... | NaN |
2 | 669630dd-2e6a-0396-84bf-455e5009c922 | True | DGCQep2AE5QRkNCshIAlFQ== | Fv28Bz0YRTVAT5kl1bAV6g== | 360.0 | 100.027073 | 0.351918 | 112.892453 | 0.29 | 7207.92 | ... | 65 | (-28.748023890412284, -51.867279334353995) | Waiting-list | 230-6097993 | 163679.0 | BR-RR | 50985 | {'tags': ['n0', 'n17', 'nim', 'da']} | Mozilla/5.0 (Linux; Android 6.0.1; SGP771 Buil... | NaN |
3 | d235609e-b6cb-0ccc-a329-d4f12e7ebdc1 | False | 1Rk8w4Ucd5yR3KcqZzLdow== | dCm9hFKfdRm7ej3jW+gyxw== | 510.0 | 101.599485 | 0.987673 | 94.902491 | 0.32 | NaN | ... | 815 | (-17.520650158450454, -39.75801139933186) | Waiting-list | 261-3543751 | 1086.0 | BR-RN | 37825 | {'tags': ['n4']} | Mozilla/5.0 (Linux; Android 6.0; HTC One X10 B... | NaN |
4 | 9e0eb880-e8f4-3faa-67d8-f5cdd2b3932b | False | 8k8UDR4Yx0qasAjkGrUZLw== | +CxEO4w7jv3QPI/BQbyqAA== | 500.0 | 98.474289 | 0.532539 | 118.126207 | 0.18 | NaN | ... | 320 | (-16.574259446978008, -39.90990074785962) | Invite-email | 102-3660162 | 198618.0 | BR-MT | 52827 | {'tags': ['pro+aty', 'n19', 'da', 'b19']} | Mozilla/5.0 (Linux; Android 7.0; Pixel C Build... | NaN |
5 rows × 43 columns
# data frame shape
print('Number of rows: ', df_credit.shape[0])
print('Number of columns: ', df_credit.shape[1])
Number of rows:  45000
Number of columns:  43
We are working with a data set containing 43 columns for 45,000 clients. target_default
is a True/False feature and is the target variable we are trying to predict. We'll explore the features looking for outliers, treat possible missing values, and make other adjustments needed to improve the overall quality of the model.
Let's examine the structure of the data set.
# data frame summary
df_credit.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 43 columns):
 #   Column                                             Non-Null Count  Dtype
---  ------                                             --------------  -----
 0   ids                                                45000 non-null  object
 1   target_default                                     41741 non-null  object
 2   score_1                                            44438 non-null  object
 3   score_2                                            44438 non-null  object
 4   score_3                                            44438 non-null  float64
 5   score_4                                            45000 non-null  float64
 6   score_5                                            45000 non-null  float64
 7   score_6                                            45000 non-null  float64
 8   risk_rate                                          44438 non-null  float64
 9   last_amount_borrowed                               15044 non-null  float64
 10  last_borrowed_in_months                            15044 non-null  float64
 11  credit_limit                                       31200 non-null  float64
 12  reason                                             44434 non-null  object
 13  income                                             44438 non-null  float64
 14  facebook_profile                                   40542 non-null  object
 15  state                                              44438 non-null  object
 16  zip                                                44438 non-null  object
 17  channel                                            44438 non-null  object
 18  job_name                                           41664 non-null  object
 19  real_state                                         44438 non-null  object
 20  ok_since                                           18455 non-null  float64
 21  n_bankruptcies                                     44303 non-null  float64
 22  n_defaulted_loans                                  44426 non-null  float64
 23  n_accounts                                         44438 non-null  float64
 24  n_issues                                           33456 non-null  float64
 25  application_time_applied                           45000 non-null  object
 26  application_time_in_funnel                         45000 non-null  int64
 27  email                                              45000 non-null  object
 28  external_data_provider_credit_checks_last_2_year   22372 non-null  float64
 29  external_data_provider_credit_checks_last_month    45000 non-null  int64
 30  external_data_provider_credit_checks_last_year     29876 non-null  float64
 31  external_data_provider_email_seen_before           42767 non-null  float64
 32  external_data_provider_first_name                  45000 non-null  object
 33  external_data_provider_fraud_score                 45000 non-null  int64
 34  lat_lon                                            43637 non-null  object
 35  marketing_channel                                  41422 non-null  object
 36  profile_phone_number                               45000 non-null  object
 37  reported_income                                    45000 non-null  float64
 38  shipping_state                                     45000 non-null  object
 39  shipping_zip_code                                  45000 non-null  int64
 40  profile_tags                                       45000 non-null  object
 41  user_agent                                         44278 non-null  object
 42  target_fraud                                       1522 non-null   object
dtypes: float64(18), int64(4), object(21)
memory usage: 14.8+ MB
We can see that some features have missing values. Let's take a closer look at them.
# percentage of missing values per feature
print((df_credit.isnull().sum() * 100 / df_credit.shape[0]).sort_values(ascending=False))
target_fraud                                         96.617778
last_amount_borrowed                                 66.568889
last_borrowed_in_months                              66.568889
ok_since                                             58.988889
external_data_provider_credit_checks_last_2_year     50.284444
external_data_provider_credit_checks_last_year       33.608889
credit_limit                                         30.666667
n_issues                                             25.653333
facebook_profile                                      9.906667
marketing_channel                                     7.951111
job_name                                              7.413333
target_default                                        7.242222
external_data_provider_email_seen_before              4.962222
lat_lon                                               3.028889
user_agent                                            1.604444
n_bankruptcies                                        1.548889
n_defaulted_loans                                     1.275556
reason                                                1.257778
zip                                                   1.248889
n_accounts                                            1.248889
channel                                               1.248889
score_1                                               1.248889
score_3                                               1.248889
risk_rate                                             1.248889
income                                                1.248889
real_state                                            1.248889
state                                                 1.248889
score_2                                               1.248889
profile_tags                                          0.000000
shipping_zip_code                                     0.000000
shipping_state                                        0.000000
reported_income                                       0.000000
profile_phone_number                                  0.000000
external_data_provider_credit_checks_last_month       0.000000
external_data_provider_fraud_score                    0.000000
external_data_provider_first_name                     0.000000
score_4                                               0.000000
score_5                                               0.000000
score_6                                               0.000000
email                                                 0.000000
application_time_in_funnel                            0.000000
application_time_applied                              0.000000
ids                                                   0.000000
dtype: float64
First of all, note that target_default
has missing values. As this is our target variable, we don't have a lot of options here. So, we'll eliminate all entries where target_default
is null.
df_credit.dropna(subset=['target_default'], inplace=True)
Observe that target_fraud
has almost all its entries missing. As this feature is not crucial for the project, we are dropping it.
Other variables also have a high rate of missing values, but we won't deal with them just yet.
# drop the column "target_fraud"
df_credit.drop('target_fraud', axis=1, inplace=True)
Now, let's examine the number of unique values for each feature.
# number of unique observations per column
df_credit.nunique().sort_values()
channel                                                  1
external_data_provider_credit_checks_last_2_year        1
last_borrowed_in_months                                  2
target_default                                           2
facebook_profile                                         2
external_data_provider_credit_checks_last_year           2
external_data_provider_credit_checks_last_month          4
real_state                                               5
n_defaulted_loans                                        5
email                                                    6
n_bankruptcies                                           6
score_1                                                  7
marketing_channel                                        9
shipping_state                                          25
score_2                                                 35
n_issues                                                44
n_accounts                                              44
state                                                   50
external_data_provider_email_seen_before                62
risk_rate                                               81
score_3                                                 87
ok_since                                               100
user_agent                                             297
application_time_in_funnel                             501
zip                                                    823
external_data_provider_fraud_score                    1001
last_amount_borrowed                                 13480
reason                                               14260
credit_limit                                         19336
lat_lon                                              21596
profile_tags                                         24458
shipping_zip_code                                    26996
job_name                                             30543
external_data_provider_first_name                    31183
application_time_applied                             33560
reported_income                                      37368
income                                               38849
score_5                                              41741
profile_phone_number                                 41741
score_4                                              41741
score_6                                              41741
ids                                                  41741
dtype: int64
The features channel
and external_data_provider_credit_checks_last_2_year
have only one value. As that won't be useful for the model, we can drop these two columns.
# drop the columns "channel" and "external_data_provider_credit_checks_last_2_year"
df_credit.drop(labels=['channel', 'external_data_provider_credit_checks_last_2_year'], axis=1, inplace=True)
Moving on with the cleaning process, to keep the data set as lean as possible we'll remove some other columns that don't add value to the model. Some features, such as score_1
and score_2
, are filled with hashed values; even so, we are keeping them, as they might still be useful to the model.
df_credit.drop(labels=['email', 'reason', 'zip', 'job_name', 'external_data_provider_first_name', 'lat_lon',
'shipping_zip_code', 'user_agent', 'profile_tags', 'marketing_channel',
'profile_phone_number', 'application_time_applied', 'ids'], axis=1, inplace=True)
Ok, now we are working with a leaner data set. Before dealing with the missing values, let's examine if there are outliers in the data set. We'll start by taking a look at some statistical details of the numerical features.
# show descriptive statistics
df_credit.describe()
 | score_3 | score_4 | score_5 | score_6 | risk_rate | last_amount_borrowed | last_borrowed_in_months | credit_limit | income | ok_since | n_bankruptcies | n_defaulted_loans | n_accounts | n_issues | application_time_in_funnel | external_data_provider_credit_checks_last_month | external_data_provider_credit_checks_last_year | external_data_provider_email_seen_before | external_data_provider_fraud_score | reported_income |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 41741.000000 | 41741.000000 | 41741.000000 | 41741.000000 | 41741.000000 | 14133.000000 | 14133.000000 | 28632.000000 | 4.174100e+04 | 17276.000000 | 41606.000000 | 41729.000000 | 41741.000000 | 30818.000000 | 41741.000000 | 41741.000000 | 27720.000000 | 39656.000000 | 41741.000000 | 41741.0 |
mean | 346.459836 | 100.006820 | 0.499416 | 99.919399 | 0.294451 | 13328.104095 | 40.588410 | 33877.220453 | 7.108012e+04 | 35.192174 | 0.076696 | 0.004625 | 10.639108 | 11.023882 | 247.748545 | 1.504396 | 0.504185 | 12.731188 | 500.491771 | inf |
std | 110.102271 | 3.183821 | 0.288085 | 10.022703 | 0.101561 | 7918.698433 | 9.437936 | 36141.985884 | 5.225978e+04 | 21.629577 | 0.274820 | 0.080157 | 4.588175 | 4.596036 | 146.326172 | 1.114207 | 0.499992 | 125.711218 | 287.993121 | NaN |
min | 0.000000 | 86.191572 | 0.000035 | 60.663039 | 0.000000 | 1005.180000 | 36.000000 | 0.000000 | 4.821180e+03 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -999.000000 | 0.000000 | 403.0 |
25% | 270.000000 | 97.862546 | 0.251595 | 93.182517 | 0.220000 | 7210.280000 | 36.000000 | 9975.000000 | 4.401958e+04 | 17.000000 | 0.000000 | 0.000000 | 7.000000 | 8.000000 | 120.000000 | 1.000000 | 0.000000 | 11.000000 | 252.000000 | 50910.0 |
50% | 340.000000 | 100.017950 | 0.500174 | 99.977774 | 0.290000 | 12011.050000 | 36.000000 | 25213.000000 | 6.004409e+04 | 32.000000 | 0.000000 | 0.000000 | 10.000000 | 10.000000 | 248.000000 | 2.000000 | 1.000000 | 27.000000 | 502.000000 | 101623.0 |
75% | 420.000000 | 102.143100 | 0.747630 | 106.630991 | 0.360000 | 18030.160000 | 36.000000 | 46492.500000 | 8.503289e+04 | 50.000000 | 0.000000 | 0.000000 | 13.000000 | 14.000000 | 375.000000 | 2.000000 | 1.000000 | 43.000000 | 747.000000 | 151248.0 |
max | 990.000000 | 113.978234 | 0.999973 | 142.192400 | 0.900000 | 35059.600000 | 60.000000 | 448269.000000 | 5.000028e+06 | 141.000000 | 5.000000 | 5.000000 | 49.000000 | 49.000000 | 500.000000 | 3.000000 | 1.000000 | 59.000000 | 1000.000000 | inf |
# count of "inf" values in "reported_income"
np.isinf(df_credit['reported_income']).sum()
66
# count of values = -999 in "external_data_provider_email_seen_before"
df_credit.loc[df_credit['external_data_provider_email_seen_before'] == -999, 'external_data_provider_email_seen_before'].value_counts()
-999.0    591
Name: external_data_provider_email_seen_before, dtype: int64
First of all, when examining the statistics above, we noticed that reported_income
has 66 values displayed as "inf". Additionally, we have 591 values of external_data_provider_email_seen_before
displayed as -999. We'll replace these values with NaN, so we can plot histograms to visualize the values' distributions.
# replace "inf" values with "nan"
df_credit['reported_income'] = df_credit['reported_income'].replace(np.inf, np.nan)
# replace "-999" values with "nan"
df_credit.loc[df_credit['external_data_provider_email_seen_before'] == -999, 'external_data_provider_email_seen_before'] = np.nan
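As a quick sanity check (not part of the original notebook), we can confirm that no "inf" or -999 values remain after the replacement:
# confirm the replacements worked (illustrative check)
print(np.isinf(df_credit['reported_income']).sum())
print((df_credit['external_data_provider_email_seen_before'] == -999).sum())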
A new data frame containing the numerical features of interest will be created. Plotting a histogram for each of these features will help us examine their distributions.
# data frame containing numerical features
df_credit_numerical = df_credit[['score_3', 'risk_rate', 'last_amount_borrowed',
'last_borrowed_in_months', 'credit_limit', 'income', 'ok_since',
'n_bankruptcies', 'n_defaulted_loans', 'n_accounts', 'n_issues',
'external_data_provider_email_seen_before']]
# plot a histogram for each of the features above
nrows = 3
ncols = 4
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(25, 16))
r = 0
c = 0
for i in df_credit_numerical:
    sns.distplot(df_credit_numerical[i], bins=15, kde=False, ax=ax[r][c])
    if c == ncols - 1:
        r += 1
        c = 0
    else:
        c += 1
plt.show()
All the features above have missing values that need to be treated. As we can see, their distributions are skewed, which indicates that we should fill the missing values with the median of each feature, since the median is far less sensitive to outliers than the mean.
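To illustrate the point (a quick check that is not part of the original notebook), comparing the mean and the median of a skewed feature such as income shows how much the mean is pulled up by extreme values, which is why the median is the safer fill value:
# compare mean and median of a right-skewed feature (illustrative check)
print('mean:  ', df_credit['income'].mean())
print('median:', df_credit['income'].median())
print('skew:  ', df_credit['income'].skew())  # a positive value confirms right skew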
It's time to deal with the missing values in the remaining 27 columns. We are filling these values according to the particularities of each feature, as below:
- For last_amount_borrowed, last_borrowed_in_months and n_issues, we'll fill the missing values with zero, as it is reasonable to believe that not every client would have values assigned to these variables.
- For the remaining numerical variables, we'll fill the missing values with the median of each feature.
- For the categorical variables, we'll fill the missing values with the most frequent value.
df_credit_num = df_credit.select_dtypes(exclude='object').columns
df_credit_cat = df_credit.select_dtypes(include='object').columns
# fill missing values for "last_amount_borrowed", "last_borrowed_in_months" and "n_issues"
df_credit['last_amount_borrowed'].fillna(value=0, inplace=True)
df_credit['last_borrowed_in_months'].fillna(value=0, inplace=True)
df_credit['n_issues'].fillna(value=0, inplace=True)
# fill missing values for numerical variables
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit(df_credit.loc[:, df_credit_num])
df_credit.loc[:, df_credit_num] = imputer.transform(df_credit.loc[:, df_credit_num])
# fill missing values for categorical variables
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer = imputer.fit(df_credit.loc[:, df_credit_cat])
df_credit.loc[:, df_credit_cat] = imputer.transform(df_credit.loc[:, df_credit_cat])
df_credit.isnull().sum()
target_default                                      0
score_1                                             0
score_2                                             0
score_3                                             0
score_4                                             0
score_5                                             0
score_6                                             0
risk_rate                                           0
last_amount_borrowed                                0
last_borrowed_in_months                             0
credit_limit                                        0
income                                              0
facebook_profile                                    0
state                                               0
real_state                                          0
ok_since                                            0
n_bankruptcies                                      0
n_defaulted_loans                                   0
n_accounts                                          0
n_issues                                            0
application_time_in_funnel                          0
external_data_provider_credit_checks_last_month     0
external_data_provider_credit_checks_last_year      0
external_data_provider_email_seen_before            0
external_data_provider_fraud_score                  0
reported_income                                     0
shipping_state                                      0
dtype: int64
After handling the missing values, case by case, we now have a data set free of null values.
We'll now preprocess the data, converting the categorical features into numerical values. LabelEncoder
will be used for the binary variables while get_dummies
will be used for the other categorical variables.
bin_var = df_credit.nunique()[df_credit.nunique() == 2].keys().tolist()
num_var = [col for col in df_credit.select_dtypes(['int', 'float']).columns.tolist() if col not in bin_var]
cat_var = [col for col in df_credit.select_dtypes(['object']).columns.tolist() if col not in bin_var]
df_credit_encoded = df_credit.copy()
# label encoding for the binary variables
le = LabelEncoder()
for col in bin_var:
    df_credit_encoded[col] = le.fit_transform(df_credit_encoded[col])
# encoding with get_dummies for the categorical variables
df_credit_encoded = pd.get_dummies(df_credit_encoded, columns=cat_var)
df_credit_encoded.head()
 | target_default | score_3 | score_4 | score_5 | score_6 | risk_rate | last_amount_borrowed | last_borrowed_in_months | credit_limit | income | ... | shipping_state_BR-PE | shipping_state_BR-PR | shipping_state_BR-RN | shipping_state_BR-RO | shipping_state_BR-RR | shipping_state_BR-RS | shipping_state_BR-SC | shipping_state_BR-SE | shipping_state_BR-SP | shipping_state_BR-TO |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 350.0 | 101.800832 | 0.259555 | 108.427273 | 0.40 | 25033.92 | 36.0 | 0.0 | 65014.12 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 370.0 | 97.062615 | 0.942655 | 92.002546 | 0.24 | 0.00 | 0.0 | 39726.0 | 100018.91 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 1 | 360.0 | 100.027073 | 0.351918 | 112.892453 | 0.29 | 7207.92 | 36.0 | 25213.0 | 65023.65 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 510.0 | 101.599485 | 0.987673 | 94.902491 | 0.32 | 0.00 | 0.0 | 54591.0 | 68830.01 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 500.0 | 98.474289 | 0.532539 | 118.126207 | 0.18 | 0.00 | 0.0 | 25213.0 | 60011.29 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 144 columns
After encoding the categorical variables, let's start working on the machine learning models.
We are experimenting with the following 3 boosting algorithms to determine which one yields the best results:
- XGBoost
- LightGBM
- CatBoost
Before starting with the models, let's split the data into training and test sets.
# feature matrix
X = df_credit_encoded.drop('target_default', axis=1)
# target vector
y = df_credit_encoded['target_default']
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y)
Now, since we are dealing with an imbalanced data set, we'll standardize and resample the training set with StandardScaler
and RandomUnderSampler
, respectively.
# standardize numerical variables
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
# resample
rus = RandomUnderSampler()
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
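As a sanity check (not part of the original notebook), we can compare the class balance before and after undersampling; the resampled target should be split roughly 50/50:
# class proportions before and after undersampling (illustrative check)
print(y_train.value_counts(normalize=True))
print(pd.Series(y_train_rus).value_counts(normalize=True))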
We're all set up to start evaluating the models. It's worth mentioning that we should consider Precision
, Recall
and F1 Score
as evaluation metrics, for the following reasons:
- Precision measures the proportion of clients predicted as defaulters that actually defaulted, and it can be defined as:
${Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}$
- Recall measures the proportion of actual defaulters that are correctly identified, and it can be defined as:
${Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}$
- F1 Score is the harmonic mean of Precision and Recall, combining both metrics into a single number:
${F_1} = 2 \times \frac{Precision \times Recall}{Precision + Recall}$
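As a quick numerical illustration of these formulas (a toy example with made-up labels, not part of the original analysis), consider ten clients of which four actually defaulted:
# toy example: 1 = default, 0 = no default (hypothetical labels)
from sklearn.metrics import precision_score, recall_score, f1_score
y_true_toy = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
y_pred_toy = [1, 1, 1, 0, 1, 1, 0, 0, 0, 0]      # 3 TP, 1 FN, 2 FP
print(precision_score(y_true_toy, y_pred_toy))   # 3 / (3 + 2) = 0.60
print(recall_score(y_true_toy, y_pred_toy))      # 3 / (3 + 1) = 0.75
print(f1_score(y_true_toy, y_pred_toy))          # 2 * (0.60 * 0.75) / (0.60 + 0.75) ≈ 0.67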
Since our objective is to minimize company losses by predicting the risk of client default, a good recall rate is desirable: we want to identify as many as possible of the clients who are indeed prone to stop paying their debts, which means pursuing a small number of False Negatives.
Additionally, we also want to minimize the number of False Positives, because we don't want clients to be mistakenly flagged as defaulters. Therefore, a good precision rate is also desirable.
However, there is always a tradeoff between precision and recall. For this notebook, we chose to give more emphasis to recall, using it as our main evaluation metric.
We'll use cross-validation to get more reliable estimates. Instead of relying on a single train/validation split, cross_val_score splits the training data into k folds, making better use of the data. In our case, we'll perform 5-fold cross-validation, since we keep the default value of k.
# define the function val_model
def val_model(X, y, clf, show=True):
    """
    Apply cross-validation on the training set.

    # Arguments
        X: DataFrame containing the independent variables.
        y: Series containing the target vector.
        clf: Scikit-learn estimator instance.

    # Returns
        float, mean value of the cross-validation scores.
    """
    X = np.array(X)
    y = np.array(y)

    pipeline = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(pipeline, X, y, scoring='recall')

    if show:
        print(f'Recall: {scores.mean()}, {scores.std()}')

    return scores.mean()
# evaluate the models
xgb = XGBClassifier()
lgb = LGBMClassifier()
cb = CatBoostClassifier()
model = []
recall = []
for clf in (xgb, lgb, cb):
    model.append(clf.__class__.__name__)
    recall.append(val_model(X_train_rus, y_train_rus, clf, show=False))
pd.DataFrame(data=recall, index=model, columns=['Recall'])
Notice that all three models yielded similar results. We'll now tune some hyperparameters on the models to see if we can achieve higher score values. The method utilized here is GridSearchCV
, which will search over specified parameter values for each estimator.
Let's start by making some adjustments to the XGBoost estimator. XGBoost is known as one of the most effective Machine Learning algorithms, due to its good performance on structured, tabular data in classification and regression problems. It is highly customizable and offers a wide range of hyperparameters to tune.
For the XGBoost model, we'll tune the following hyperparameters, according to the official documentation:
- n_estimators - The number of trees in the model
- max_depth - Maximum depth of a tree
- min_child_weight - Minimum sum of instance weight needed in a child
- gamma - Minimum loss reduction required to make a further partition on a leaf node of the tree
- learning_rate - Step size shrinkage used in the update to prevent overfitting
# XGBoost
xgb = XGBClassifier()
# parameter to be searched
param_grid = {'n_estimators': range(0,1000,50)}
# find the best parameter
kfold = StratifiedKFold(n_splits=3, shuffle=True)
grid_search = GridSearchCV(xgb, param_grid, scoring="recall", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train_rus, y_train_rus)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Best result: 0.6421130614407926 for {'n_estimators': 50}
# XGBoost
xgb = XGBClassifier(n_estimators=50)
# parameter to be searched
param_grid = {'max_depth': [1, 3, 5],
'min_child_weight': [1, 3, 6]}
# find the best parameter
kfold = StratifiedKFold(n_splits=3, shuffle=True)
grid_search = GridSearchCV(xgb, param_grid, scoring="recall", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train_rus, y_train_rus)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Best result: 0.6573262337968221 for {'max_depth': 3, 'min_child_weight': 6}
# XGBoost
xgb = XGBClassifier(n_estimators=50, max_depth=3, min_child_weight=6)
# parameter to be searched
param_grid = {'gamma': [0, 1, 5]}
# find the best parameter
kfold = StratifiedKFold(n_splits=3, shuffle=True)
grid_search = GridSearchCV(xgb, param_grid, scoring="recall", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train_rus, y_train_rus)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Best result: 0.6631309580889413 for {'gamma': 5}
# XGBoost
xgb = XGBClassifier(n_estimators=50, max_depth=3, min_child_weight=6, gamma=1)
# parameter to be searched
param_grid = {'learning_rate': [0.0001, 0.001, 0.01, 0.1]}
# find the best parameter
kfold = StratifiedKFold(n_splits=3, shuffle=True)
grid_search = GridSearchCV(xgb, param_grid, scoring='recall', n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train_rus, y_train_rus)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Best result: 0.8134521075697547 for {'learning_rate': 0.0001}
Now, turning to the LightGBM model, another tree-based learning algorithm, we are going to tune the following hyperparameters, referring to the documentation:
- max_depth - Maximum depth of a tree
- learning_rate - Shrinkage rate
- num_leaves - Max number of leaves in one tree
- min_data_in_leaf - Minimal number of data in one leaf
# LightGBM
lbg = LGBMClassifier(silent=False)
# parameter to be searched
param_grid = {"max_depth": np.arange(5, 75, 10),
"learning_rate" : [0.001, 0.01, 0.1],
"num_leaves": np.arange(20, 220, 50),
}
# find the best parameter
kfold = StratifiedKFold(n_splits=3, shuffle=True)
grid_search = GridSearchCV(lbg, param_grid, scoring="recall", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train_rus, y_train_rus)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
lbg = LGBMClassifier(learning_rate=0.01, max_depth=5, num_leaves=50, silent=False)
# parameter to be searched
param_grid = {'min_data_in_leaf': np.arange(100, 1000, 100)}
# find the best parameter
kfold = StratifiedKFold(n_splits=3, shuffle=True)
grid_search = GridSearchCV(lbg, param_grid, scoring="recall", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train_rus, y_train_rus)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
Lastly, we're going to search over hyperparameter values for CatBoost, our third gradient boosting algorithm. The following hyperparameters will be tuned, according to the documentation:
- depth - Depth of the tree
- learning_rate - As we already know, the learning rate
- l2_leaf_reg - Coefficient at the L2 regularization term of the cost function
# CatBoost
cb = CatBoostClassifier()
# parameter to be searched
param_grid = {'depth': [6, 8, 10],
'learning_rate': [0.03, 0.1],
'l2_leaf_reg': [1, 5, 10],
}
# find the best parameter
kfold = StratifiedKFold(n_splits=3, shuffle=True)
grid_search = GridSearchCV(cb, param_grid, scoring="recall", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train_rus, y_train_rus)
print(f'Best result: {grid_result.best_score_} for {grid_result.best_params_}')
After tuning some hyperparameters, all three models displayed better results. It is worth mentioning that XGBoost showed a large score increase, while LightGBM and CatBoost saw only a meager improvement.
Now, we can check how these models perform on the test set. To help us visualize the results, we are plotting a confusion matrix.
# final XGBoost model
xgb = XGBClassifier(max_depth=3, learning_rate=0.0001, n_estimators=50, gamma=1, min_child_weight=6)
xgb.fit(X_train_rus, y_train_rus)
# prediction
X_test_xgb = scaler.transform(X_test)
y_pred_xgb = xgb.predict(X_test_xgb)
# classification report
print(classification_report(y_test, y_pred_xgb))
# confusion matrix
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred_xgb, normalize='true'), annot=True, ax=ax)
ax.set_title('Confusion Matrix - XGBoost')
ax.set_xlabel('Predicted Value')
ax.set_ylabel('Real Value')
plt.show()
              precision    recall  f1-score   support

           0       0.92      0.44      0.60      8771
           1       0.22      0.81      0.34      1665

    accuracy                           0.50     10436
   macro avg       0.57      0.63      0.47     10436
weighted avg       0.81      0.50      0.56     10436
# final LightGBM model
lgb = LGBMClassifier(num_leaves=70, max_depth=5, learning_rate=0.01, min_data_in_leaf=400)
lgb.fit(X_train_rus, y_train_rus)
# prediction
X_test_lgb = scaler.transform(X_test)
y_pred_lgb = lgb.predict(X_test_lgb)
# classification report
print(classification_report(y_test, y_pred_lgb))
# confusion matrix
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred_lgb, normalize='true'), annot=True, ax=ax)
ax.set_title('Confusion Matrix - LightGBM')
ax.set_xlabel('Predicted Value')
ax.set_ylabel('Real Value')
plt.show()
[LightGBM] [Warning] min_data_in_leaf is set=400, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=400

              precision    recall  f1-score   support

           0       0.84      1.00      0.91      8771
           1       0.00      0.00      0.00      1665

    accuracy                           0.84     10436
   macro avg       0.42      0.50      0.46     10436
weighted avg       0.71      0.84      0.77     10436
# final CatBoost model
cb = CatBoostClassifier(learning_rate=0.03, depth=6, l2_leaf_reg=5, logging_level='Silent')
cb.fit(X_train_rus, y_train_rus)
# prediction
X_test_cb = scaler.transform(X_test)
y_pred_cb = cb.predict(X_test_cb)
# classification report
print(classification_report(y_test, y_pred_cb))
# confusion matrix
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred_cb, normalize='true'), annot=True, ax=ax)
ax.set_title('Confusion Matrix - CatBoost')
ax.set_xlabel('Predicted Value')
ax.set_ylabel('Real Value')
plt.show()
The main objective of this notebook was to build a machine learning model able to identify potential defaulters and therefore reduce company losses. The best possible model would be one that minimizes false negatives, identifying all defaulters among the client base, while also minimizing false positives, preventing clients from being wrongly classified as defaulters.
Meeting these requirements can be quite tricky, as there is a tradeoff between precision and recall: increasing one of these metrics usually decreases the other. Considering the importance of minimizing company losses, we decided to place more emphasis on reducing false negatives, searching for the hyperparameters that would increase the recall rate.
Among the three gradient boosting algorithms tested, XGBoost yielded the best results, with a recall rate of 81%, although it also delivered an undesirably high false positive rate of 56%. On the other hand, LightGBM and CatBoost delivered lower false positive rates, at 38% and 33% respectively, but their false negative counts were substantially higher than XGBoost's, resulting in weaker recall.
This notebook presents a classic evaluation metrics dilemma. In this case, it would be up to the company's decision-makers to analyze the big picture, with the aid of the machine learning algorithms, and decide the best plan to follow. Of course, in a future notebook, we can test a different approach to achieve a more desirable result, such as taking advantage of deep learning models.