Trodecco is a company operating in LA. Over time, the company has found that after recruiting and training employees, many of its brightest and most experienced employees leave for other opportunities. Trodecco wants to understand what causes employees to leave and how to retain them. The company compiled data for the past 3 years in the file Retention.csv. These data include (for each employee): Satisfaction, Last_evaluation, Project, Monthly_hours, Time, Left (1 if the employee has left, 0 otherwise), Promotion, Department, and Salary.
The second file, Training_Cost.csv, gives the average cost of training a new employee in each department at each salary level.
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
from statsmodels.stats.weightstats import ztest
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
retention = pd.read_csv('Retention.csv', encoding='ISO-8859-1')
retention.head()
|  | Satisfaction | Last_evaluation | Project | Monthly_hours | Time | Left | Promotion | Department | Salary |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.71 | 0.36 | 4 | 278 | 4 | 0 | 0 | product_mng | high |
| 1 | 0.44 | 0.36 | 2 | 136 | 3 | 0 | 0 | technical | high |
| 2 | 0.79 | 0.36 | 3 | 114 | 3 | 0 | 0 | technical | high |
| 3 | 0.42 | 0.37 | 2 | 284 | 3 | 0 | 0 | technical | high |
| 4 | 0.96 | 0.37 | 3 | 111 | 2 | 0 | 0 | sales | high |
trainingCost = pd.read_csv('Training_Cost.csv', encoding='ISO-8859-1')
trainingCost
|  | Department | high | medium | low |
|---|---|---|---|---|
| 0 | accounting | 2000 | 1000 | 1000 |
| 1 | hr | 1300 | 900 | 500 |
| 2 | IT | 4000 | 2000 | 1000 |
| 3 | management | 2000 | 1000 | 300 |
| 4 | marketing | 4000 | 3000 | 600 |
| 5 | product_mng | 3000 | 2000 | 1000 |
| 6 | RandD | 2000 | 2000 | 1000 |
| 7 | sales | 5000 | 2000 | 500 |
| 8 | support | 3000 | 1000 | 300 |
| 9 | technical | 4000 | 2000 | 1000 |
How many employees are in the data set?
retention.shape
(14999, 9)
Are there any missing values?
retention.notna().shape   # notna() keeps the frame's shape, so this only confirms the dimensions
(14999, 9)
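The shape only tells us every cell was checked, not whether any are missing. A more direct check is a minimal sketch counting the missing cells themselves:
retention.isna().sum()        # missing values per column
retention.isna().sum().sum()  # total number of missing cells in the frame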
Min, average and max satisfaction score in the data set
retention['Satisfaction'].describe()
count    14999.000000
mean         0.612834
std          0.248631
min          0.090000
25%          0.440000
50%          0.640000
75%          0.820000
max          1.000000
Name: Satisfaction, dtype: float64
Number of departments
retention['Department'].nunique()
10
How many employees are in each department?
retention.groupby('Department')['Satisfaction'].count()
Department
IT             1227
RandD           787
accounting      767
hr              739
management      630
marketing       858
product_mng     902
sales          4140
support        2229
technical      2720
Name: Satisfaction, dtype: int64
Average satisfaction score in each department
retention.groupby('Department')['Satisfaction'].mean()
Department
IT             0.618142
RandD          0.619822
accounting     0.582151
hr             0.598809
management     0.621349
marketing      0.618601
product_mng    0.619634
sales          0.614447
support        0.618300
technical      0.607897
Name: Satisfaction, dtype: float64
Calculate retention rates, i.e., the % of employees that have NOT left, for all departments.
retention_rates = 1 - retention.groupby('Department')['Left'].sum()/retention.groupby('Department')['Left'].count()
retention_rates
Department
IT             0.777506
RandD          0.846252
accounting     0.734029
hr             0.709066
management     0.855556
marketing      0.763403
product_mng    0.780488
sales          0.755072
support        0.751009
technical      0.743750
Name: Left, dtype: float64
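Since Left is coded 0/1, the per-department mean of Left is the attrition rate, so the same numbers can be computed more compactly (an equivalent one-line sketch):
retention_rates = 1 - retention.groupby('Department')['Left'].mean()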
retention['Left'].value_counts()
0    11428
1     3571
Name: Left, dtype: int64
retention.groupby('Left').mean()
| Left | Satisfaction | Last_evaluation | Project | Monthly_hours | Time | Promotion |
|---|---|---|---|---|---|---|
| 0 | 0.666810 | 0.714951 | 3.786664 | 199.060203 | 3.380032 | 0.026251 |
| 1 | 0.440098 | 0.720877 | 3.855503 | 207.419210 | 3.876505 | 0.005321 |
retention.groupby('Department').mean()
| Department | Satisfaction | Last_evaluation | Project | Monthly_hours | Time | Left | Promotion |
|---|---|---|---|---|---|---|---|
| IT | 0.618142 | 0.716634 | 3.816626 | 202.215974 | 3.468623 | 0.222494 | 0.002445 |
| RandD | 0.619822 | 0.711881 | 3.853875 | 200.800508 | 3.367217 | 0.153748 | 0.034307 |
| accounting | 0.582151 | 0.718318 | 3.825293 | 201.162973 | 3.522816 | 0.265971 | 0.018253 |
| hr | 0.598809 | 0.710108 | 3.654939 | 198.684709 | 3.355886 | 0.290934 | 0.020298 |
| management | 0.621349 | 0.723794 | 3.860317 | 201.249206 | 4.303175 | 0.144444 | 0.109524 |
| marketing | 0.618601 | 0.716340 | 3.687646 | 199.385781 | 3.569930 | 0.236597 | 0.050117 |
| product_mng | 0.619634 | 0.714823 | 3.807095 | 199.965632 | 3.475610 | 0.219512 | 0.000000 |
| sales | 0.614447 | 0.710121 | 3.776329 | 200.911353 | 3.534058 | 0.244928 | 0.024155 |
| support | 0.618300 | 0.723194 | 3.803948 | 200.758188 | 3.393001 | 0.248991 | 0.008973 |
| technical | 0.607897 | 0.721379 | 3.877941 | 202.497426 | 3.411397 | 0.256250 | 0.010294 |
retention.groupby('Salary').mean()
| Salary | Satisfaction | Last_evaluation | Project | Monthly_hours | Time | Left | Promotion |
|---|---|---|---|---|---|---|---|
| high | 0.637470 | 0.704325 | 3.767179 | 199.867421 | 3.692805 | 0.066289 | 0.058205 |
| low | 0.600753 | 0.717017 | 3.799891 | 200.996583 | 3.438218 | 0.296884 | 0.009021 |
| medium | 0.621817 | 0.717927 | 3.813528 | 201.338349 | 3.529010 | 0.204313 | 0.028079 |
%matplotlib inline
import matplotlib.pyplot as plt
pd.crosstab(retention.Department,retention.Left).plot(kind='bar')
plt.title('Turnover Frequency for Department')
plt.xlabel('Department')
plt.ylabel('Frequency of Turnover')
plt.savefig('department_bar_chart')
table=pd.crosstab(retention.Salary, retention.Left)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Salary Level vs Turnover')
plt.xlabel('Salary Level')
plt.ylabel('Proportion of Employees')
plt.savefig('salary_bar_chart')
pd.crosstab(retention.Department, retention.Left)
| Department | Left = 0 | Left = 1 |
|---|---|---|
| IT | 954 | 273 |
| RandD | 666 | 121 |
| accounting | 563 | 204 |
| hr | 524 | 215 |
| management | 539 | 91 |
| marketing | 655 | 203 |
| product_mng | 704 | 198 |
| sales | 3126 | 1014 |
| support | 1674 | 555 |
| technical | 2023 | 697 |
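Passing normalize='index' to the same crosstab converts the counts to per-department proportions, another way to read off the retention rates computed above (a small sketch):
pd.crosstab(retention.Department, retention.Left, normalize='index')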
num_bins = 10
retention.hist(bins=num_bins, figsize=(20,15))
plt.show()
Which department has the lowest retention rate?
retention_rates.loc['hr']
0.7090663058186739
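Rather than reading the department off the bar chart and then indexing it by name, idxmin finds the minimum programmatically (a small sketch):
retention_rates.idxmin()   # department with the lowest retention rate ('hr' here)
retention_rates.min()      # the rate itself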
Add each employee's training cost by looking it up in the Training_Cost data, matching on department and salary level.
retention['trainingCost'] = trainingCost.set_index('Department').lookup(retention['Department'],retention['Salary'])   # DataFrame.lookup works on pandas < 2.0; see the merge-based alternative below
retention.head()
|  | Satisfaction | Last_evaluation | Project | Monthly_hours | Time | Left | Promotion | Department | Salary | trainingCost |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.71 | 0.36 | 4 | 278 | 4 | 0 | 0 | product_mng | high | 3000 |
| 1 | 0.44 | 0.36 | 2 | 136 | 3 | 0 | 0 | technical | high | 4000 |
| 2 | 0.79 | 0.36 | 3 | 114 | 3 | 0 | 0 | technical | high | 4000 |
| 3 | 0.42 | 0.37 | 2 | 284 | 3 | 0 | 0 | technical | high | 4000 |
| 4 | 0.96 | 0.37 | 3 | 111 | 2 | 0 | 0 | sales | high | 5000 |
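Note that DataFrame.lookup was removed in pandas 2.0. On newer versions the same column can be built by reshaping the cost table to long form and merging, a sketch to run in place of the lookup cell above:
cost_long = trainingCost.melt(id_vars='Department', var_name='Salary', value_name='trainingCost')
retention = retention.merge(cost_long, on=['Department', 'Salary'], how='left')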
What is the total training cost spent for all the employees?
retention['trainingCost'].sum()
20567600
What is the percentage spent on those who have left the company?
retention.groupby('Left')['trainingCost'].sum()[1]/retention['trainingCost'].sum()
0.19397498978976643
Which department spends the largest fraction of its training costs on employees that ultimately leave the company?
retention[retention['Left']==1].groupby('Department')['trainingCost'].sum()/retention.groupby('Department')['trainingCost'].sum()
Department
IT             0.189955
RandD          0.154545
accounting     0.248514
hr             0.289747
management     0.069547
marketing      0.186834
product_mng    0.209008
sales          0.172518
support        0.174801
technical      0.238479
Name: trainingCost, dtype: float64
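To pull out that department programmatically, the same ratio can be assigned to a name and passed to idxmax (leaver_cost_share is a name introduced here just for illustration):
leaver_cost_share = (retention[retention['Left'] == 1].groupby('Department')['trainingCost'].sum()
                     / retention.groupby('Department')['trainingCost'].sum())
leaver_cost_share.idxmax()   # department with the largest share ('hr' in the output above)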
There is a belief that the best employees, i.e., the employees with the highest evaluation scores, are the ones leaving the company. Let's take a closer look with a two-sample (Welch) t-test.
R = retention.loc[retention['Left']==0,'Last_evaluation']   # evaluation scores of employees who stayed
L = retention.loc[retention['Left']==1,'Last_evaluation']   # evaluation scores of employees who left
ss.ttest_ind(a=R, b=L, equal_var=False, alternative='greater')
Ttest_indResult(statistic=-1.6508075726997398, pvalue=0.9505808315672778)
In aggregate, the mean evaluation scores of employees who left (0.721) and those who stayed (0.715) are very close, and this one-sided test gives no evidence that the employees who stayed are the better-evaluated group. On its own, the pooled comparison is not very informative; the picture changes once salary level is taken into account.
Perhaps the best employees are leaving because they feel underpaid. If that were true, we might expect a statistically significant difference in evaluation scores between the employees who left and those who stayed once we compare within salary levels. Start with the "high" salary group.
RH = retention.loc[(retention['Salary']=='high') & (retention['Left']==0), 'Last_evaluation']
LH = retention.loc[(retention['Salary']=='high') & (retention['Left']==1), 'Last_evaluation']
ss.ttest_ind(a=RH, b=LH, equal_var=False, alternative='greater')
Ttest_indResult(statistic=3.082181370261907, pvalue=0.0013663080445550345)
Here the data do support rejecting the null: among high-salary employees, those who stayed have significantly higher evaluation scores than those who left (the positive test statistic indicates the stayers' mean is higher).
Consider the other salary levels.
RM = retention.loc[(retention['Salary']=='medium') & (retention['Left']==0), 'Last_evaluation']
LM = retention.loc[(retention['Salary']=='medium') & (retention['Left']==1), 'Last_evaluation']
ss.ttest_ind(a=RM, b=LM, equal_var=False, alternative='greater')
Ttest_indResult(statistic=-3.1063203806486315, pvalue=0.9990382952951038)
RL = retention.loc[(retention['Salary']=='low') & (retention['Left']==0), 'Last_evaluation']
LL = retention.loc[(retention['Salary']=='low') & (retention['Left']==1), 'Last_evaluation']
ss.ttest_ind(a=RL, b=LL, equal_var=False, alternative='greater')
Ttest_indResult(statistic=0.007113352772731251, pvalue=0.4971624104987519)
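The three per-level comparisons can also be run in a single loop with the same Welch test (a compact sketch of the cells above):
for level in ['high', 'medium', 'low']:
    stayed = retention.loc[(retention['Salary'] == level) & (retention['Left'] == 0), 'Last_evaluation']
    left = retention.loc[(retention['Salary'] == level) & (retention['Left'] == 1), 'Last_evaluation']
    print(level, ss.ttest_ind(stayed, left, equal_var=False, alternative='greater'))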
The evaluation gap between leavers and stayers therefore depends on salary level: at the high level the stayers score higher, at the medium level the sign flips (the strongly negative statistic means the leavers score higher), and at the low level there is essentially no difference.
Salary was a confounding variable in the initial pooled comparison of evaluation scores: it is related both to evaluation-score differences and to the decision to leave, so pooling across salary levels masked these within-level patterns.
Combined with the attrition rates by salary level (about 6.6% for high, 20.4% for medium and 29.7% for low), the employees most at risk of leaving are the lower-paid ones, and among medium-salary employees it is the better-evaluated ones who appear most likely to go.
There are two categorical variables in the dataset and they need to be converted to dummy variables before they can be used for modelling.
retention.head()
|  | Satisfaction | Last_evaluation | Project | Monthly_hours | Time | Left | Promotion | Department | Salary | trainingCost |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.71 | 0.36 | 4 | 278 | 4 | 0 | 0 | product_mng | high | 3000 |
| 1 | 0.44 | 0.36 | 2 | 136 | 3 | 0 | 0 | technical | high | 4000 |
| 2 | 0.79 | 0.36 | 3 | 114 | 3 | 0 | 0 | technical | high | 4000 |
| 3 | 0.42 | 0.37 | 2 | 284 | 3 | 0 | 0 | technical | high | 4000 |
| 4 | 0.96 | 0.37 | 3 | 111 | 2 | 0 | 0 | sales | high | 5000 |
cat_vars = ['Department', 'Salary']
for var in cat_vars:
    dummies = pd.get_dummies(retention[var], prefix=var)   # one indicator column per category
    retention = retention.join(dummies)
The original categorical columns need to be removed once the dummy variables have been created.
retention.drop(['Department', 'Salary'], axis=1, inplace=True)
retention.columns.values
array(['Satisfaction', 'Last_evaluation', 'Project', 'Monthly_hours', 'Time', 'Left', 'Promotion', 'trainingCost', 'Department_IT', 'Department_RandD', 'Department_accounting', 'Department_hr', 'Department_management', 'Department_marketing', 'Department_product_mng', 'Department_sales', 'Department_support', 'Department_technical', 'Salary_high', 'Salary_low', 'Salary_medium'], dtype=object)
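pd.get_dummies can also do the encoding and the drop in one step: the loop and the drop above could be replaced (up to column order) by the single line below. Shown only as a sketch; it should not be run again on the already-encoded frame.
retention = pd.get_dummies(retention, columns=cat_vars)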
plt.figure(figsize=(15, 15))
sns.heatmap(retention.corr(), annot=True)
retention_vars=retention.columns.values.tolist()
y=['Left']
X=[i for i in retention_vars if i not in y]
X
['Satisfaction', 'Last_evaluation', 'Project', 'Monthly_hours', 'Time', 'Promotion', 'trainingCost', 'Department_IT', 'Department_RandD', 'Department_accounting', 'Department_hr', 'Department_management', 'Department_marketing', 'Department_product_mng', 'Department_sales', 'Department_support', 'Department_technical', 'Salary_high', 'Salary_low', 'Salary_medium']
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
rfe = RFE(model)   # with no n_features_to_select, RFE keeps half of the features (10 of 20 here)
rfe = rfe.fit(retention[X], retention[y])
print(rfe.support_)
print(rfe.ranking_)
[ True  True False False  True  True False False  True False  True  True False False False False False  True  True  True]
[ 1  1  2  9  1  1 11  3  1  7  1  1  8  4 10  6  5  1  1  1]
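The selected feature names can be read straight from rfe.support_ rather than typed by hand (a small sketch). Note that the hand-typed cols list in the next cell differs slightly from this selection: it repeats Salary_low, omits Promotion, Department_RandD and Salary_medium, and adds Department_IT and Department_accounting.
selected = [name for name, keep in zip(X, rfe.support_) if keep]
selected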
cols=['Satisfaction', 'Last_evaluation', 'Time',
'Department_IT', 'Department_accounting', 'Department_hr', 'Department_management', 'Salary_high', 'Salary_low', 'Salary_low']
X=retention[cols]
y=retention['Left']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
LogisticRegression()
from sklearn.metrics import accuracy_score
print('Logistic regression accuracy: {:.3f}'.format(accuracy_score(y_test, logreg.predict(X_test))))
Logistic regression accuracy: 0.762
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
RandomForestClassifier()
print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(y_test, rf.predict(X_test))))
Random Forest Accuracy: 0.981
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
SVC()
print('Support vector machine accuracy: {:.3f}'.format(accuracy_score(y_test, svc.predict(X_test))))
Support vector machine accuracy: 0.898
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10)
modelCV = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))
10-fold cross validation average accuracy: 0.980
from sklearn.metrics import classification_report
print(classification_report(y_test, rf.predict(X_test)))
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3421
           1       0.96      0.96      0.96      1079

    accuracy                           0.98      4500
   macro avg       0.97      0.97      0.97      4500
weighted avg       0.98      0.98      0.98      4500
y_pred = rf.predict(X_test)
from sklearn.metrics import confusion_matrix
import seaborn as sns
forest_cm = metrics.confusion_matrix(y_test, y_pred)   # rows are the true class, columns the predicted class
sns.heatmap(forest_cm, annot=True, fmt='d', xticklabels=["Retained", "Left"], yticklabels=["Retained", "Left"])
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.title('Random Forest')
plt.savefig('random_forest')
print(classification_report(y_test, logreg.predict(X_test)))
              precision    recall  f1-score   support

           0       0.80      0.92      0.85      3421
           1       0.51      0.26      0.34      1079

    accuracy                           0.76      4500
   macro avg       0.65      0.59      0.60      4500
weighted avg       0.73      0.76      0.73      4500
logreg_y_pred = logreg.predict(X_test)
logreg_cm = metrics.confusion_matrix(y_test, logreg_y_pred)
sns.heatmap(logreg_cm, annot=True, fmt='d', xticklabels=["Retained", "Left"], yticklabels=["Retained", "Left"])
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.title('Logistic Regression')
plt.savefig('logistic_regression')
print(classification_report(y_test, svc.predict(X_test)))
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3421
           1       0.76      0.85      0.80      1079

    accuracy                           0.90      4500
   macro avg       0.85      0.88      0.87      4500
weighted avg       0.90      0.90      0.90      4500
svc_y_pred = svc.predict(X_test)
svc_cm = metrics.confusion_matrix(y_test, svc_y_pred)
sns.heatmap(svc_cm, annot=True, fmt='d', xticklabels=["Retained", "Left"], yticklabels=["Retained", "Left"])
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.title('Support Vector Machine')
plt.savefig('support_vector_machine')
When an employee left, how often did the classifier predict that correctly? This measure is called "recall", and a quick look at these diagrams shows that random forest is clearly the best on this criterion. Out of all the turnover cases, random forest correctly identified 1035 of 1079, a turnover recall of about 96%, far better than logistic regression (26%) or the support vector machine (85%).
When a classifier predicts an employee will leave, how often does that employee actually leave? This measure is called "precision". Random forest again outperforms the other two at about 96% precision (1035 of 1077), with logistic regression at about 51% (281 of 556) and the support vector machine at about 76% (917 of 1213).
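The same precision and recall figures can be computed directly rather than read off the heatmaps (a small sketch using sklearn's metric functions):
from sklearn.metrics import precision_score, recall_score
for name, model in [('Logistic regression', logreg), ('Random forest', rf), ('SVM', svc)]:
    pred = model.predict(X_test)
    print('{}: precision {:.2f}, recall {:.2f}'.format(name, precision_score(y_test, pred), recall_score(y_test, pred)))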
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1])   # score the predicted probabilities so the AUC matches the plotted curve
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
rf_roc_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot(rf_fpr, rf_tpr, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC')
plt.show()
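The SVM could be added to the same plot using its decision_function scores, since SVC was fit without probability=True and so has no predict_proba (a sketch; the extra plot call would go before plt.show()):
svc_scores = svc.decision_function(X_test)
svc_roc_auc = roc_auc_score(y_test, svc_scores)
svc_fpr, svc_tpr, _ = roc_curve(y_test, svc_scores)
plt.plot(svc_fpr, svc_tpr, label='SVM (area = %0.2f)' % svc_roc_auc)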
feature_labels = np.array(cols)   # the same feature list (and order) the models were trained on; note it contains 'Salary_low' twice
importance = rf.feature_importances_
feature_indexes_by_importance = importance.argsort()   # indices sorted from least to most important
for index in feature_indexes_by_importance:
print('{}-{:.2f}%'.format(feature_labels[index], (importance[index] *100.0)))
Department_management-0.25%
Department_IT-0.25%
Department_hr-0.26%
Department_accounting-0.34%
Salary_low-0.52%
Salary_low-0.59%
Salary_high-0.83%
Last_evaluation-19.60%
Time-25.82%
Satisfaction-51.55%
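Wrapping the importances in a pandas Series gives the same ranking with less code (a small sketch):
pd.Series(importance, index=feature_labels).sort_values(ascending=False)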