# Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle


# Load dataset into a dataframe
df = pd.read_csv("HR_capstone_dataset.csv")

# Display first few rows of the dataframe
df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


# Gather descriptive statistics about the data
df.describe()


# Display all column names
df.columns = [item.lower().replace(' ','_') for item in list( df.columns ) ]
df.head()


# Check for missing values
df.isna().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64


# Check for missing values
df_orig = df.copy() # original data to refer to later
df = pd.get_dummies(df, drop_first=True) # Create dummy variables, drop the first category


# Check for duplicates
dups = df[ (df.duplicated(keep='first')) ] # Get duplicate rows. Keep first occurrence
dups.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3008 entries, 396 to 14998
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   satisfaction_level      3008 non-null   float64
 1   last_evaluation         3008 non-null   float64
 2   number_project          3008 non-null   int64  
 3   average_montly_hours    3008 non-null   int64  
 4   time_spend_company      3008 non-null   int64  
 5   work_accident           3008 non-null   int64  
 6   left                    3008 non-null   int64  
 7   promotion_last_5years   3008 non-null   int64  
 8   department_RandD        3008 non-null   uint8  
 9   department_accounting   3008 non-null   uint8  
 10  department_hr           3008 non-null   uint8  
 11  department_management   3008 non-null   uint8  
 12  department_marketing    3008 non-null   uint8  
 13  department_product_mng  3008 non-null   uint8  
 14  department_sales        3008 non-null   uint8  
 15  department_support      3008 non-null   uint8  
 16  department_technical    3008 non-null   uint8  
 17  salary_low              3008 non-null   uint8  
 18  salary_medium           3008 non-null   uint8  
dtypes: float64(2), int64(6), uint8(11)
memory usage: 243.8 KB


# Inspect some rows containing duplicates
dups


# Drop duplicates and save resulting dataframe in a new variable as needed
df = df[ ~(df.duplicated(keep='first')) ] 
df.shape

# Display first few rows of new dataframe as needed
df.describe()


# Create a boxplot to visualize distribution of employee `tenure` and detect any outliers
sns.boxplot( x=df.time_spend_company )
#sns.histplot( x=df.time_spend_company)

<AxesSubplot:xlabel='time_spend_company'>


# Determine the number of rows containing outliers
o = df[df.time_spend_company>5]
o.shape

# 19 employees have been at the company for more than 5 years.

(824, 19)


# Get numbers of people who left vs. stayed
print( df.left.value_counts() )
print("Out of 10,876 employees, 1115 people left.")

# Get percentages of people who left vs. stayed
print( df.left.value_counts(normalize=True) )
print("i.e. That's about 9% turnover rate!")


# Now focus on good workers who left.
df['left_good_worker'] = ( df.last_evaluation >= .6 ) * df.left 

print('----')
print("Focus on good workers leaving.")
# Get numbers of people who left vs. stayed
print( df.left_good_worker.value_counts() )
print("Wow -- all of the workers who left had an evaluation of 6 or above.")

# Get percentages of people who left vs. stayed
print( df.left_good_worker.value_counts(normalize=True) )

# Write over left field so that I can see the results with this more targetted DV
df.left = df.left_good_worker
df = df.drop(columns='left_good_worker')

0    10876
1     1115
Name: left, dtype: int64
Out of 10,876 employees, 1115 people left.
0    0.907014
1    0.092986
Name: left, dtype: float64
i.e. That's about 9% turnover rate!
----
Focus on good workers leaving.
0    10876
1     1115
Name: left_good_worker, dtype: int64
Wow -- all of the workers who left had an evaluation of 6 or above.
0    0.907014
1    0.092986
Name: left_good_worker, dtype: float64


# Create a plot as needed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11991 entries, 0 to 11999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   satisfaction_level      11991 non-null  float64
 1   last_evaluation         11991 non-null  float64
 2   number_project          11991 non-null  int64  
 3   average_montly_hours    11991 non-null  int64  
 4   time_spend_company      11991 non-null  int64  
 5   work_accident           11991 non-null  int64  
 6   left                    11991 non-null  int64  
 7   promotion_last_5years   11991 non-null  int64  
 8   department_RandD        11991 non-null  uint8  
 9   department_accounting   11991 non-null  uint8  
 10  department_hr           11991 non-null  uint8  
 11  department_management   11991 non-null  uint8  
 12  department_marketing    11991 non-null  uint8  
 13  department_product_mng  11991 non-null  uint8  
 14  department_sales        11991 non-null  uint8  
 15  department_support      11991 non-null  uint8  
 16  department_technical    11991 non-null  uint8  
 17  salary_low              11991 non-null  uint8  
 18  salary_medium           11991 non-null  uint8  
dtypes: float64(2), int64(6), uint8(11)
memory usage: 971.9 KB


# Create a histogram based on those who left vs stayed
df.target = df.left==1
sns.histplot(x=df.satisfaction_level, hue=df.left)
plt.title("Histogram of satisfaction level -- workers who left vs stayed")

Text(0.5, 1.0, 'Histogram of satisfaction level -- workers who left vs stayed')


# Create a plot as needed
sns.histplot(x=df.last_evaluation, hue=df.left)
plt.title("Histogram of last evaluation score -- workers who left vs stayed")

Text(0.5, 1.0, 'Histogram of last evaluation score -- workers who left vs stayed')


# Create a plot as needed
df.info()
compare = [ 'left', 'satisfaction_level', 'last_evaluation', 'average_montly_hours']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11991 entries, 0 to 11999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   satisfaction_level      11991 non-null  float64
 1   last_evaluation         11991 non-null  float64
 2   number_project          11991 non-null  int64  
 3   average_montly_hours    11991 non-null  int64  
 4   time_spend_company      11991 non-null  int64  
 5   work_accident           11991 non-null  int64  
 6   left                    11991 non-null  int64  
 7   promotion_last_5years   11991 non-null  int64  
 8   department_RandD        11991 non-null  uint8  
 9   department_accounting   11991 non-null  uint8  
 10  department_hr           11991 non-null  uint8  
 11  department_management   11991 non-null  uint8  
 12  department_marketing    11991 non-null  uint8  
 13  department_product_mng  11991 non-null  uint8  
 14  department_sales        11991 non-null  uint8  
 15  department_support      11991 non-null  uint8  
 16  department_technical    11991 non-null  uint8  
 17  salary_low              11991 non-null  uint8  
 18  salary_medium           11991 non-null  uint8  
dtypes: float64(2), int64(6), uint8(11)
memory usage: 971.9 KB


# Look at relationships between our variables of interest
sns.pairplot(df[compare], hue="left")
#plt.title("Pairwise plot relating hours, job satisfaction and evaluation")

<seaborn.axisgrid.PairGrid at 0x12483185220>


# Create a plot as needed
sns.histplot(x=df.average_montly_hours, hue=df.left)
plt.title("Average monthly hours worked of those who left vs those who remained")

Text(0.5, 1.0, 'Average monthly hours worked of those who left vs those who remained')


# Appears there is a mis-match in working hours. Some worked too much while others have too few hours.
sns.scatterplot(x=df.average_montly_hours,y=df.satisfaction_level,hue=df.left,alpha=0.05)
plt.title("Employee Satisfaction vs hours worked")

Text(0.5, 1.0, 'Employee Satisfaction vs hours worked')


# Create a plot as needed
sns.scatterplot(x=df.last_evaluation,y=df.satisfaction_level,hue=df.left,alpha=0.05)
plt.title("Employee satisfaction vs their last evaluation")

Text(0.5, 1.0, 'Employee satisfaction vs their last evaluation')


# color pallette
# Get Unique depts
color_labels = df_orig['department'].unique()

# List of colors in the color palettes
rgb_values = sns.color_palette("Set2", len(color_labels))

# Map continents to the colors
color_map = dict(zip(color_labels, rgb_values))
print( color_map)

{'sales': (0.4, 0.7607843137254902, 0.6470588235294118), 'accounting': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961), 'hr': (0.5529411764705883, 0.6274509803921569, 0.796078431372549), 'technical': (0.9058823529411765, 0.5411764705882353, 0.7647058823529411), 'support': (0.6509803921568628, 0.8470588235294118, 0.32941176470588235), 'management': (1.0, 0.8509803921568627, 0.1843137254901961), 'IT': (0.8980392156862745, 0.7686274509803922, 0.5803921568627451), 'product_mng': (0.7019607843137254, 0.7019607843137254, 0.7019607843137254), 'marketing': (0.4, 0.7607843137254902, 0.6470588235294118), 'RandD': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961)}


%matplotlib widget

from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
fig = plt.figure(figsize=(8,8))
ax = Axes3D(fig)
#ax.scatter(xs=df_orig.last_evaluation,ys=df_orig.satisfaction_level,zs=df_orig.average_montly_hours,   c=df_orig.department.map(color_map), alpha=0.15)
ax.scatter(xs=df.last_evaluation,ys=df.satisfaction_level,zs=df.average_montly_hours,   c=df.left, alpha=0.15)

ax.set_xlabel('Last evaluation (quality)')
ax.set_ylabel('Satisfaction level')
ax.set_zlabel('Monthly hours')

plt.show()

C:\Users\jon\AppData\Local\Temp\ipykernel_18508\844919536.py:6: MatplotlibDeprecationWarning: Axes3D(fig) adding itself to the figure is deprecated since 3.4. Pass the keyword argument auto_add_to_figure=False and use fig.add_axes(ax) to suppress this warning. The default value of auto_add_to_figure will change to False in mpl3.5 and True values will no longer work in 3.6.  This is consistent with other Axes classes.
  ax = Axes3D(fig)


# add a ave_monthly_hours2 to capture non-linear effects
df['long_hours'] = df.average_montly_hours >= 220 # 173.33 is average


%matplotlib inline

import sklearn.model_selection as ms
import sklearn.preprocessing as pp
import sklearn.ensemble as en

rfc = en.RandomForestClassifier(random_state=42)
y = df.left
X = df.drop(columns=['left','average_montly_hours'])
X['long_hours'] = X.long_hours.apply(lambda x: 0 + 1*(x))

X_train, X_test, y_train, y_test = ms.train_test_split(X,y,train_size=0.8, random_state=42)
X_train.shape
X_trn, X_val, y_trn, y_val = ms.train_test_split(X_train,y_train,train_size=0.75, random_state=42)
print(X_trn.shape)
print(X_test.shape)
print(X_val.shape)

f0 = rfc.fit(X_trn,y_trn)

(7194, 18)
(2399, 18)
(2398, 18)


# CV
cv_params = {'max_depth': [None], 
             'min_samples_leaf': [10,20,50],
             'min_samples_split': [.02,.05,.10,.20],
             'max_features': [2,4,6,8],
             'n_estimators': [75, 100, 125, 150]
             }  
scoring = {'accuracy', 'precision', 'recall', 'f1'}

#custom_split = PredefinedSplit(split_index)
rf_val = ms.GridSearchCV(rfc, cv_params, scoring=scoring, cv=5, refit='f1')


%%time
rf_val.fit(X_train, y_train)

CPU times: total: 6min 14s
Wall time: 6min 14s

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None], 'max_features': [2, 4, 6, 8],
                         'min_samples_leaf': [10, 20, 50],
                         'min_samples_split': [0.02, 0.05, 0.1, 0.2],
                         'n_estimators': [75, 100, 125, 150]},
             refit='f1', scoring={'f1', 'recall', 'accuracy', 'precision'})

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None], 'max_features': [2, 4, 6, 8],
                         'min_samples_leaf': [10, 20, 50],
                         'min_samples_split': [0.02, 0.05, 0.1, 0.2],
                         'n_estimators': [75, 100, 125, 150]},
             refit='f1', scoring={'f1', 'recall', 'accuracy', 'precision'})

RandomForestClassifier(random_state=42)

RandomForestClassifier(random_state=42)


# Obtain optimal parameters.
rf_val.best_params_

{'max_depth': None,
 'max_features': 8,
 'min_samples_leaf': 10,
 'min_samples_split': 0.02,
 'n_estimators': 75}


pickle.dump(rf_val, open("rf_model.p",'wb') )


import sklearn.metrics as m
import sklearn.tree as tree
import sklearn.model_selection as ms
import sklearn.ensemble as e


# Fit the optimal model.

my_rfc = e.RandomForestClassifier(max_features=8,min_samples_leaf=10, min_samples_split=0.02, n_estimators=75, random_state=42)
f0 = my_rfc.fit(X_train,y_train)


estimator = my_rfc.estimators_[3]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = X_trn.columns,
                #class_names = df.target_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True, max_depth=4)

# Convert to png using system command (requires Graphviz)
#from subprocess import call
#call(['dot', '-Tpng', '/home/jovyan/work/tree.dot', '-o', '/home/jovyan/work/tree.png', '-Gdpi=600'])

# Display in jupyter notebook
#from IPython.display import Image
#Image(filename = 'tree.png')


import matplotlib.pyplot as plt
plt.figure(figsize=(18,18))

tree.plot_tree(my_rfc.estimators_[4],max_depth=3, fontsize=10,filled=True, feature_names=X.columns, class_names={0:'Stayed', 1:'Left'})
plt.show()


%matplotlib inline
o = my_rfc.feature_importances_
o = [round(i,3) for i in o]
c = X_trn.columns
pair = list( zip( *sorted( zip(o, c), key=lambda pair: pair[0] ) ) )
print(pair[1])
sns.barplot( x=list(pair[0]), y=list(pair[1]))
plt.title(f'Variable importance reported by Random Forest model.')
#print(pair[1])

('promotion_last_5years', 'department_RandD', 'department_accounting', 'department_hr', 'department_management', 'department_marketing', 'department_product_mng', 'department_sales', 'department_support', 'department_technical', 'salary_medium', 'work_accident', 'salary_low', 'long_hours', 'number_project', 'last_evaluation', 'time_spend_company', 'satisfaction_level')

Text(0.5, 1.0, 'Variable importance reported by Random Forest model.')


#smaller = ['long_hours','satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','work_accident']
#sdf = df[smaller]
sns.histplot(x=df.average_montly_hours, alpha=.53)
plt.title("Average monthly hours - Note the bimodal distribution")

C:\Users\jon\anaconda3\envs\BERTopic\lib\site-packages\mpl_toolkits\mplot3d\art3d.py:900: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if zdir == 'x':
C:\Users\jon\anaconda3\envs\BERTopic\lib\site-packages\mpl_toolkits\mplot3d\art3d.py:902: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  elif zdir == 'y':

Text(0.5, 0.92, 'Average monthly hours - Note the bimodal distribution')


y_pred = f0.predict(X_val)

acc = m.accuracy_score(y_val,y_pred)
rec = m.recall_score(y_val,y_pred)
prec = m.precision_score(y_val,y_pred)
f1 = m.f1_score(y_val,y_pred)
print(f'Acc: {acc}  Recall:{rec}  Precision: {prec}   F1:{f1}')

Acc: 0.98790658882402  Recall:0.8772727272727273  Precision: 0.9897435897435898   F1:0.9301204819277108


cm = m.confusion_matrix(y_val,y_pred)
print(cm)
p = m.ConfusionMatrixDisplay(cm,display_labels=[])
p.plot(values_format='')

[[2176    2]
 [  27  193]]

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f0426b40d0>


# Nothing excessively correlated
co = df.corr()
sns.heatmap(co,cmap='Blues')

<AxesSubplot:>


import sklearn.linear_model as lm
lmc = lm.LogisticRegressionCV( cv=5, random_state=42,solver='liblinear')
f1 = lmc.fit( X_trn, y_trn )


import statsmodels.api as sm
myX = X_trn.copy()
print(y_trn)
myX = pd.get_dummies(myX,drop_first=True,columns=['long_hours'])

log_reg = sm.Logit( y_trn, myX ).fit()
#log_reg.get_margeff(at='mean').summary()
log_reg.summary()
#df.info()

9569     0
9097     0
978      0
11670    0
2077     0
        ..
4753     0
6290     0
5264     0
11860    0
7907     0
Name: left, Length: 7194, dtype: int64
Optimization terminated successfully.
         Current function value: 0.228906
         Iterations 8


print( np.exp( log_reg.params) )
np.exp( log_reg.conf_int() )

satisfaction_level        0.018572
last_evaluation           3.020675
number_project            1.059011
time_spend_company        1.206150
work_accident             0.244168
promotion_last_5years     0.154198
department_RandD          0.115243
department_accounting     0.150885
department_hr             0.183482
department_management     0.101046
department_marketing      0.141697
department_product_mng    0.165992
department_sales          0.166464
department_support        0.197339
department_technical      0.190080
salary_low                0.345785
salary_medium             0.227109
long_hours_1              7.455418
dtype: float64


y_pred = f0.predict(X_test)

acc = m.accuracy_score(y_test,y_pred)
rec = m.recall_score(y_test,y_pred)
prec = m.precision_score(y_test,y_pred)
f1score = m.f1_score(y_test,y_pred)
print(f'Acc: {acc}  Recall:{rec}  Precision: {prec}   F1:{f1score}')

Acc: 0.9820758649437266  Recall:0.8317757009345794  Precision: 0.9621621621621622   F1:0.8922305764411027


y_pred = f0.predict(X_test)
cm = m.confusion_matrix(y_test,y_pred)
print(cm)
p = m.ConfusionMatrixDisplay(cm,display_labels=["Stayed","Left"])
p.plot(values_format='')
plt.title(f"Random forest confusion matrix - Test data")

[[2178    7]
 [  36  178]]

Text(0.5, 1.0, 'Random forest confusion matrix - Test data')


y_pred = f1.predict(X_val)
cm = m.confusion_matrix(y_val,y_pred)
print(cm)
p = m.ConfusionMatrixDisplay(cm,display_labels=["Stayed","Left"])
p.plot(values_format='')
plt.title(f"Logit model confusion matrix - Test data")

[[2145   33]
 [  63  157]]

Text(0.5, 1.0, 'Logit model confusion matrix - Test data')


y_pred = f1.predict(X_test)

acc = m.accuracy_score(y_test,y_pred)
rec = m.recall_score(y_test,y_pred)
prec = m.precision_score(y_test,y_pred)
f1score = m.f1_score(y_test,y_pred)
print(f'Acc: {acc}  Recall:{rec}  Precision: {prec}   F1:{f1score}')

Acc: 0.9578991246352647  Recall:0.7009345794392523  Precision: 0.8021390374331551   F1:0.7481296758104737

Variable	Description
satisfaction_level	Employee-reported job satisfaction level [0–1]
last_evaluation	Score of employee's last performance review [0–1]
number_project	Number of projects employee contributes to
average_monthly_hours	Average number of hours employee worked per month
time_spend_company	How long the employee has been with the company (years)
Work_accident	Whether or not the employee experienced an accident while at work
left	Whether or not the employee left the company
promotion_last_5years	Whether or not the employee was promoted in the last 5 years
Department	The employee's department
salary	The employee's salary (U.S. dollars)

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years
count	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000
mean	0.612834	0.716102	3.803054	201.050337	3.498233	0.144610	0.238083	0.021268
std	0.248631	0.171169	1.232592	49.943099	1.460136	0.351719	0.425924	0.144281
min	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000
25%	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000
50%	0.640000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000
75%	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	work_accident	left	promotion_last_5years	department_RandD	department_accounting	department_hr	department_management	department_marketing	department_product_mng	department_sales	department_support	department_technical	salary_low	salary_medium
396	0.46	0.57	2	139	3	0	1	0	0	0	0	0	0	0	1	0	0	1	0
866	0.41	0.46	2	128	3	0	1	0	0	1	0	0	0	0	0	0	0	1	0
1317	0.37	0.51	2	127	3	0	1	0	0	0	0	0	0	0	1	0	0	0	1
1368	0.41	0.52	2	132	3	0	1	0	1	0	0	0	0	0	0	0	0	1	0
1461	0.42	0.53	2	142	3	0	1	0	0	0	0	0	0	0	1	0	0	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
14994	0.40	0.57	2	151	3	0	1	0	0	0	0	0	0	0	0	1	0	1	0
14995	0.37	0.48	2	160	3	0	1	0	0	0	0	0	0	0	0	1	0	1	0
14996	0.37	0.53	2	143	3	0	1	0	0	0	0	0	0	0	0	1	0	1	0
14997	0.11	0.96	6	280	4	0	1	0	0	0	0	0	0	0	0	1	0	1	0
14998	0.37	0.52	2	158	3	0	1	0	0	0	0	0	0	0	0	1	0	1	0

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	work_accident	left	promotion_last_5years	department_RandD	department_accounting	department_hr	department_management	department_marketing	department_product_mng	department_sales	department_support	department_technical	salary_low	salary_medium
count	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.000000	11991.00000	11991.000000	11991.000000
mean	0.629658	0.716683	3.802852	200.473522	3.364857	0.154282	0.166041	0.016929	0.057877	0.051789	0.050121	0.036361	0.056125	0.057210	0.270119	0.151864	0.18714	0.478692	0.438746
std	0.241070	0.168343	1.163238	48.727813	1.330240	0.361234	0.372133	0.129012	0.233520	0.221610	0.218204	0.187194	0.230173	0.232252	0.444040	0.358904	0.39004	0.499567	0.496254
min	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000
25%	0.480000	0.570000	3.000000	157.000000	3.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000
50%	0.660000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000
75%	0.820000	0.860000	5.000000	243.000000	4.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.00000	1.000000	1.000000
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.00000	1.000000	1.000000

Dep. Variable:	left	No. Observations:	7194
Model:	Logit	Df Residuals:	7176
Method:	MLE	Df Model:	17
Date:	Fri, 22 Sep 2023	Pseudo R-squ.:	0.2691
Time:	18:10:22	Log-Likelihood:	-1646.8
converged:	True	LL-Null:	-2253.1
Covariance Type:	nonrobust	LLR p-value:	2.439e-247

Capstone project: Data-driven suggestions for HR¶

Description and deliverables¶

Business scenario and problem¶

Familiarize yourself with the HR dataset¶

Import packages¶

Load dataset¶

Step 2. Data Exploration (Initial EDA and data cleaning)¶

Gather basic information about the data¶

Gather descriptive statistics about the data¶

Rename columns¶

Check duplicates¶

Check outliers¶

Exploratory data analysis¶

Data visualizations¶

Insights¶

Construct a model¶

Recall model assumptions¶

Modeling¶

Logit testing¶

Logit model¶

Recall evaluation metrics¶

Brief conclusion¶

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	Department	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	coef	std err	z	P>\|z\|	[0.025	0.975]
satisfaction_level	-3.9861	0.169	-23.651	0.000	-4.316	-3.656
last_evaluation	1.1055	0.270	4.091	0.000	0.576	1.635
number_project	0.0573	0.039	1.483	0.138	-0.018	0.133
time_spend_company	0.1874	0.031	6.022	0.000	0.126	0.248
work_accident	-1.4099	0.188	-7.480	0.000	-1.779	-1.040
promotion_last_5years	-1.8695	0.750	-2.494	0.013	-3.339	-0.400
department_RandD	-2.1607	0.245	-8.819	0.000	-2.641	-1.680
department_accounting	-1.8912	0.233	-8.125	0.000	-2.347	-1.435
department_hr	-1.6956	0.238	-7.137	0.000	-2.161	-1.230
department_management	-2.2922	0.290	-7.894	0.000	-2.861	-1.723
department_marketing	-1.9541	0.247	-7.911	0.000	-2.438	-1.470
department_product_mng	-1.7958	0.230	-7.795	0.000	-2.247	-1.344
department_sales	-1.7930	0.144	-12.460	0.000	-2.075	-1.511
department_support	-1.6228	0.161	-10.095	0.000	-1.938	-1.308
department_technical	-1.6603	0.153	-10.857	0.000	-1.960	-1.361
salary_low	-1.0619	0.137	-7.776	0.000	-1.330	-0.794
salary_medium	-1.4823	0.143	-10.365	0.000	-1.763	-1.202
long_hours_1	2.0089	0.109	18.439	0.000	1.795	2.222

	0	1
satisfaction_level	0.013347	0.025841
last_evaluation	1.778721	5.129798
number_project	0.981748	1.142354
time_spend_company	1.134774	1.282015
work_accident	0.168754	0.353284
promotion_last_5years	0.035488	0.669997
department_RandD	0.071294	0.186282
department_accounting	0.095612	0.238111
department_hr	0.115175	0.292299
department_management	0.057193	0.178521
department_marketing	0.087320	0.229937
department_product_mng	0.105677	0.260733
department_sales	0.125554	0.220705
department_support	0.144003	0.270430
department_technical	0.140854	0.256510
salary_low	0.264586	0.451903
salary_medium	0.171595	0.300583
long_hours_1	6.021901	9.230185