# Import packages for data manipulation
import pandas as pd

# Import packages for data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# This lets us see all of the columns, preventing Juptyer from redacting them.


# Import packages for data modeling
import sklearn.preprocessing as pp

# This is the function that helps plot feature importance
import sklearn.metrics as m


# This module lets us save our models once we fit them.
import pickle


# Import dataset
df0 = pd.read_csv('waze_dataset.csv')


# Inspect the first five rows
df0.head()


# Copy the df0 dataframe
df0.isna().sum()
df = df0.dropna().drop(columns='ID').copy() # drop missing labels
df = pd.get_dummies(df, drop_first=True) # create dummy variables


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14299 entries, 0 to 14998
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sessions                 14299 non-null  int64  
 1   drives                   14299 non-null  int64  
 2   total_sessions           14299 non-null  float64
 3   n_days_after_onboarding  14299 non-null  int64  
 4   total_navigations_fav1   14299 non-null  int64  
 5   total_navigations_fav2   14299 non-null  int64  
 6   driven_km_drives         14299 non-null  float64
 7   duration_minutes_drives  14299 non-null  float64
 8   activity_days            14299 non-null  int64  
 9   driving_days             14299 non-null  int64  
 10  label_retained           14299 non-null  uint8  
 11  device_iPhone            14299 non-null  uint8  
dtypes: float64(3), int64(7), uint8(2)
memory usage: 1.2 MB


# 1. Create `km_per_driving_day` feature
df['km_per_driving_day'] = df.driven_km_drives / df.driving_days
df.loc[df.driving_days == 0,'km_per_driving_day'] = 0 # fix cases of missing values
# 2. Get descriptive stats
df.km_per_driving_day.describe()

count    14299.000000
mean       581.942399
std       1038.254509
min          0.000000
25%        136.168003
50%        273.301012
75%        558.018761
max      15420.234110
Name: km_per_driving_day, dtype: float64


# 1. Create `percent_sessions_in_last_month` feature
df['percent_sessions_in_last_month'] = df.sessions / round( df.total_sessions )

# 1. Get descriptive stats
df.percent_sessions_in_last_month.describe()
df.loc[round(df.total_sessions)==0,'percent_sessions_in_last_month'] = 0
df.percent_sessions_in_last_month.isna().sum() # missing values!

#sns.histplot(df.percent_sessions_in_last_month)
#df[df.percent_sessions_in_last_month>1]
#df.loc[df.sessions == round(df.total_sessions)]

0


# Create `professional_driver` feature
df['professional_driver'] = ((df.drives >= 60) & (df.driving_days >= 15))


# Create `total_sessions_per_day` feature
df['total_sessions_per_day'] = df.total_sessions / df.n_days_after_onboarding  
df.total_sessions_per_day.describe()

count    14299.000000
mean         0.338207
std          1.319814
min          0.000298
25%          0.050818
50%          0.100457
75%          0.215210
max         39.763874
Name: total_sessions_per_day, dtype: float64


# Get descriptive stats
df.total_sessions_per_day.describe()

count    14299.000000
mean         0.338207
std          1.319814
min          0.000298
25%          0.050818
50%          0.100457
75%          0.215210
max         39.763874
Name: total_sessions_per_day, dtype: float64


# Create `km_per_hour` feature
import numpy as np
df['duration_hours_drives'] = round( df.duration_minutes_drives / 60.0, 1 )
df['km_per_hour'] = df.driven_km_drives / df.duration_hours_drives
df.loc[df.km_per_hour == np.Inf,'km_per_hour'] = 0
df.km_per_hour.describe()
df[ df.km_per_hour > 120 ]
sns.histplot(df.km_per_hour[df.km_per_hour< 200])


# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# Major change to the data -- drop unlikely obs.
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# Something is wrong -- it makes no sense for there to be people who travel > 200km per hour!
# Perhaps they have the app open on a plane?! Or in meters rather than km??
look = df.km_per_hour >= 200
df = df[df.km_per_hour <= 150] # Drop these rows!!!
df.shape

(9000, 20)


# Create `km_per_drive` feature
df['km_per_drive'] = df.driven_km_drives / df.drives
df.loc[df.drives==0,'km_per_drive']=0 
df.km_per_drive.describe()

count     9000.000000
mean       224.622611
std        618.644479
min          0.000000
25%         30.208270
50%         68.953665
75%        168.655354
max      15777.426560
Name: km_per_drive, dtype: float64


# Create `percent_of_sessions_to_favorite` feature
df['percent_of_sessions_to_favorite'] = ( df.total_navigations_fav1 + df.total_navigations_fav2 ) / df.sessions
df.loc[df.sessions==0,'percent_of_sessions_to_favorite'] = 0
# Get descriptive stats
df.percent_of_sessions_to_favorite.describe()

count    9000.000000
mean        8.103972
std        28.185086
min         0.000000
25%         0.489608
50%         1.790895
75%         5.425987
max       799.000000
Name: percent_of_sessions_to_favorite, dtype: float64


# Drop rows with missing values
# Already done.


# Create new `device2` variable
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9000 entries, 0 to 14998
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   sessions                         9000 non-null   int64  
 1   drives                           9000 non-null   int64  
 2   total_sessions                   9000 non-null   float64
 3   n_days_after_onboarding          9000 non-null   int64  
 4   total_navigations_fav1           9000 non-null   int64  
 5   total_navigations_fav2           9000 non-null   int64  
 6   driven_km_drives                 9000 non-null   float64
 7   duration_minutes_drives          9000 non-null   float64
 8   activity_days                    9000 non-null   int64  
 9   driving_days                     9000 non-null   int64  
 10  label_retained                   9000 non-null   uint8  
 11  device_iPhone                    9000 non-null   uint8  
 12  km_per_driving_day               9000 non-null   float64
 13  percent_sessions_in_last_month   9000 non-null   float64
 14  total_sessions_per_day           9000 non-null   float64
 15  duration_hours_drives            9000 non-null   float64
 16  km_per_hour                      9000 non-null   float64
 17  km_per_drive                     9000 non-null   float64
 18  percent_of_sessions_to_favorite  9000 non-null   float64
 19  churned                          9000 non-null   int32  
dtypes: float64(10), int32(1), int64(7), uint8(2)
memory usage: 1.3 MB


# Create binary `label2` column
df['churned'] = 0 + 1*(df.label_retained == 0)


# Drop `ID` column
# done


# Get class balance of 'label' col
df.churned.value_counts(normalize=True)

0    0.816
1    0.184
Name: churned, dtype: float64


# 1. Isolate X variables
import sklearn.model_selection as ms

df1 = df.drop(columns='label_retained')
df1.head()
X = df1.drop(columns='churned')

# 2. Isolate y variable
y = df1.churned

# 3. Split into train and test sets
X_train, X_test, y_train, y_test = ms.train_test_split(X,y, random_state=42, train_size=.80, stratify=y)

# 4. Split into train and validate sets
X_trn, X_val, y_trn, y_val = ms.train_test_split(X_train,y_train, random_state=42, train_size=.75, stratify=y_train)


print(X_train.shape)
print(X_trn.shape)
print(X_val.shape)
print(X_test.shape)

(7200, 18)
(5400, 18)
(1800, 18)
(1800, 18)


# 1. Instantiate the random forest classifier
import sklearn.tree as tree
import sklearn.ensemble as en

rf = en.RandomForestClassifier()

# 2. Create a dictionary of hyperparameters to tune
cv_params = {#'max_depth':[2,4],
             'max_features':[2,4,6],
             'min_samples_leaf':[5,10],
             'n_estimators':[50,100,500]
            }

# 3. Define a dictionary of scoring metrics to capture
scoring = ['f1','recall','precision','accuracy']

# 4. Instantiate the GridSearchCV object
gs = ms.GridSearchCV(estimator=rf,param_grid=cv_params,scoring=scoring,refit='recall',cv=5)


%%time
f0 = gs.fit(X_trn,y_trn)

CPU times: total: 5min 1s
Wall time: 5min 1s


# Examine best score
f0.best_estimator_

RandomForestClassifier(max_features=6, min_samples_leaf=5, n_estimators=50)

RandomForestClassifier(max_features=6, min_samples_leaf=5, n_estimators=50)


# Examine best hyperparameter combo
# RandomForestClassifier(max_features=6, min_samples_leaf=5, n_estimators=50)


def make_results(model_name:str, model_object, metric:str,X_val=X_val):
    '''
    Arguments:
        model_name (string): what you want the model to be called in the output table
        model_object: a fit GridSearchCV object
        metric (string): precision, recall, f1, or accuracy

    Returns a pandas df with the F1, recall, precision, and accuracy scores
    for the model with the best mean 'metric' score across all validation folds.
    '''

  # Create dictionary that maps input metric to actual metric name in GridSearchCV
    model = model_object.best_estimator_
    y_pred = model.predict(X_val)
    bs = model_object.best_score_
  # Get all the results from the CV and put them in a df
    acc = m.accuracy_score(y_val,y_pred)
    prec = m.precision_score(y_val,y_pred)
    rec = m.recall_score(y_val,y_pred)
    f1 = m.f1_score(y_val,y_pred)
    d = {'model':model_name,'Recall':rec,'Prec':prec,'F1':f1,'Acc':acc}
    sdf = pd.DataFrame(d.items()).transpose()
    sdf.columns = list( sdf.iloc[0] ) 
    sdf = sdf.iloc[1:,:]
    
  # Isolate the row of the df with the max(metric) score
    return sdf

  # Extract Accuracy, precision, recall, and f1 score from that row

  # Create table of results

results_rf = make_results("Random Forest",f0,'recall')
results_rf


!python -m pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting xgboost
  Downloading xgboost-2.0.0-py3-none-win_amd64.whl (99.7 MB)
     ---------------------------------------- 99.7/99.7 MB 3.7 MB/s eta 0:00:00
Requirement already satisfied: scipy in c:\users\jon\anaconda3\envs\bertopic\lib\site-packages (from xgboost) (1.10.0)
Requirement already satisfied: numpy in c:\users\jon\anaconda3\envs\bertopic\lib\site-packages (from xgboost) (1.23.5)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.0

WARNING: Ignoring invalid distribution -atplotlib (c:\users\jon\anaconda3\envs\bertopic\lib\site-packages)
WARNING: Ignoring invalid distribution -atplotlib (c:\users\jon\anaconda3\envs\bertopic\lib\site-packages)
WARNING: Ignoring invalid distribution -atplotlib (c:\users\jon\anaconda3\envs\bertopic\lib\site-packages)
WARNING: Ignoring invalid distribution -atplotlib (c:\users\jon\anaconda3\envs\bertopic\lib\site-packages)
WARNING: Ignoring invalid distribution -atplotlib (c:\users\jon\anaconda3\envs\bertopic\lib\site-packages)
WARNING: Ignoring invalid distribution -atplotlib (c:\users\jon\anaconda3\envs\bertopic\lib\site-packages)
WARNING: Ignoring invalid distribution -atplotlib (c:\users\jon\anaconda3\envs\bertopic\lib\site-packages)


# 1. Instantiate the XGBoost classifier
from xgboost import XGBClassifier
xgb = XGBClassifier()

# 2. Create a dictionary of hyperparameters to tune
param_grid = {
 'max_depth':[2,6,8,10,12,14],
 'min_child_weight':[.5,1,2,10],
 'learning_rate':[.1,.2,.3],
 'n_estimators':[100,200,400]   
}
# 3. Define a dictionary of scoring metrics to capture
# same as before!
print( scoring)

# 4. Instantiate the GridSearchCV object
gscv = ms.GridSearchCV(estimator=xgb, param_grid=param_grid, scoring=scoring, refit='recall', cv=5 )

['f1', 'recall', 'precision', 'accuracy']


%%time
f1 = gscv.fit(X_trn,y_trn)

CPU times: total: 3h 11min 2s
Wall time: 19min 32s


# Examine best score
f1.best_score_

0.24046495101771487


# Examine best parameters
f1.best_params_

{'learning_rate': 0.3,
 'max_depth': 8,
 'min_child_weight': 10,
 'n_estimators': 400}


# Call 'make_results()' on the GridSearch object
results_xgb = make_results("XGBoost",f1,'recall')
results_xgb


# Use random forest model to predict on validation data
y_pred = f1.predict(X_val)


def get_test_scores(model_name:str, preds, y_test_data):
    '''
    Generate a table of test scores.

    In:
        model_name (string): Your choice: how the model will be named in the output table
        preds: numpy array of test predictions
        y_test_data: numpy array of y_test data

    Out:
        table: a pandas df of precision, recall, f1, and accuracy scores for your model
    '''
    accuracy = m.accuracy_score(y_test_data, preds)
    precision = m.precision_score(y_test_data, preds)
    recall = m.recall_score(y_test_data, preds)
    f1 = m.f1_score(y_test_data, preds)

    table = pd.DataFrame({'model': [model_name],
                          'precision': [precision],
                          'recall': [recall],
                          'F1': [f1],
                          'accuracy': [accuracy]
                          })

    return table


# Get validation scores for RF model
rs0 = get_test_scores('XGBoost', y_pred, y_val)

# Append to the results table
results_all = pd.concat([results_rf, results_xgb])
results_all


# Use XGBoost model to predict on test data
y_pred_test = f1.predict(X_test)
results_champ = make_results("XGBoost champion",f1,'recall',X_val=X_test)
#results_champ = make_results("RF champion",f0,'recall',X_val=X_test)
results_champ


# Generate array of values for confusion matrix
cm = m.confusion_matrix(y_true=y_test,y_pred=f1.predict(X_test))
cm
# Plot confusion matrix
p = m.ConfusionMatrixDisplay(cm, display_labels=f1.classes_)
p.plot(values_format='')

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1562176c1f0>


from xgboost import plot_importance
plot_importance(f1.best_estimator_)

<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>


# Plot precision-recall curve
# classifier, X_test, y_test, name="LinearSVC", plot_chance_level=True
m.PrecisionRecallDisplay.from_estimator(f1.best_estimator_, X_test, y_test)

<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x1561f8edee0>


# Get predicted probabilities on the test data


# Create a list of just the second column values (probability of target)
prob_churned = f1.predict_proba(X_trn)[:,1] >= 0.5

# Create an array of new predictions that assigns a 1 to any value >= 0.4
pd.DataFrame(prob_churned).value_counts()

False    4406
True      994
dtype: int64


# Get evaluation metrics for when the threshold is 0.4


def threshold_finder(y_test_data, probabilities, desired_recall):
    '''
    Find the threshold that most closely yields a desired recall score.

    Inputs:
        y_test_data: Array of true y values
        probabilities: The results of the `predict_proba()` model method
        desired_recall: The recall that you want the model to have

    Outputs:
        threshold: The threshold that most closely yields the desired recall
        recall: The exact recall score associated with `threshold`
    '''
    probs = [x[1] for x in probabilities]  # Isolate second column of `probabilities`
    thresholds = np.arange(0, 1, 0.001)    # Set a grid of 1,000 thresholds to test

    scores = []
    for threshold in thresholds:
        # Create a new array of {0, 1} predictions based on new threshold
        preds = np.array([1 if x >= threshold else 0 for x in probs])
        # Calculate recall score for that threshold
        recall = m.recall_score(y_test_data, preds)
        # Append the threshold and its corresponding recall score as a tuple to `scores`
        scores.append((threshold, recall))

    distances = []
    for idx, score in enumerate(scores):
        # Calculate how close each actual score is to the desired score
        distance = abs(score[1] - desired_recall)
        # Append the (index#, distance) tuple to `distances`
        distances.append((idx, distance))

    # Sort `distances` by the second value in each of its tuples (least to greatest)
    sorted_distances = sorted(distances, key=lambda x: x[1], reverse=False)
    # Identify the tuple with the actual recall closest to desired recall
    best = sorted_distances[0]
    # Isolate the index of the threshold with the closest recall score
    best_idx = best[0]
    # Retrieve the threshold and actual recall score closest to desired recall
    threshold, recall = scores[best_idx]

    return threshold, recall


# Get the predicted probabilities from the champion model
pp = f1.best_estimator_.predict_proba(X_test)
# Call the function
threshold_finder(y_test, pp, desired_recall=.50)

(0.08, 0.4984894259818731)


# Create an array of new predictions that assigns a 1 to any value >= 0.124
probs = [x[1] for x in pp]
new_preds = np.array([1 if x >= 0.08 else 0 for x in probs])

# Get evaluation metrics for when the threshold is 0.124
get_test_scores('XGB, threshold = 0.124', new_preds, y_test)

	model	Recall	Prec	F1	Acc
1	Random Forest	0.10574	0.4375	0.170316	0.810556
1	XGBoost	0.214502	0.420118	0.284	0.801111

Waze Project¶

Course 6 End-of-Course Project: Build a machine learning model¶

Build a machine learning model¶

Imports and data loading¶

Basic Feature engineering¶

`km_per_driving_day`¶

`percent_sessions_in_last_month`¶

`professional_driver`¶

`total_sessions_per_day`¶

`km_per_hour`¶

`km_per_drive`¶

`percent_of_sessions_to_favorite`¶

Task 3. Drop missing values¶

Task 4. Outliers¶

Task 5. Variable encoding¶

Dummying features¶

Target encoding¶

Task 6. Feature selection¶

Task 7. Evaluation metric¶

Task 8. Modeling workflow and model selection process¶

Task 9. Split the data¶

Task 10. Modeling¶

Random forest¶

HINT

XGBoost¶

Task 11. Model selection¶

Random forest¶

Task 12. Use champion model to predict on test data¶

Task 13. Confusion matrix¶

Task 14. Feature importance¶

Conclusion¶

BONUS¶

Identify an optimal decision threshold¶

	ID	label	sessions	drives	total_sessions	n_days_after_onboarding	total_navigations_fav1	total_navigations_fav2	driven_km_drives	duration_minutes_drives	activity_days	driving_days	device
0	0	retained	283	226	296.748273	2276	208	0	2628.845068	1985.775061	28	19	Android
1	1	retained	133	107	326.896596	1225	19	64	13715.920550	3160.472914	13	11	iPhone
2	2	retained	114	95	135.522926	2651	0	0	3059.148818	1610.735904	14	8	Android
3	3	retained	49	40	67.589221	15	322	7	913.591123	587.196542	7	3	iPhone
4	4	retained	84	68	168.247020	1562	166	5	3950.202008	1219.555924	27	18	Android

Waze Project¶

Course 6 End-of-Course Project: Build a machine learning model¶

Build a machine learning model¶

Imports and data loading¶

Basic Feature engineering¶

km_per_driving_day¶

percent_sessions_in_last_month¶

professional_driver¶

total_sessions_per_day¶

km_per_hour¶

km_per_drive¶

percent_of_sessions_to_favorite¶

Task 3. Drop missing values¶

Task 4. Outliers¶

Task 5. Variable encoding¶

Dummying features¶

Target encoding¶

Task 6. Feature selection¶

Task 7. Evaluation metric¶

Task 8. Modeling workflow and model selection process¶

Task 9. Split the data¶

Task 10. Modeling¶

Random forest¶

HINT

XGBoost¶

Task 11. Model selection¶

Random forest¶

Task 12. Use champion model to predict on test data¶

Task 13. Confusion matrix¶

Task 14. Feature importance¶

Conclusion¶

BONUS¶

Identify an optimal decision threshold¶

`km_per_driving_day`¶

`percent_sessions_in_last_month`¶

`professional_driver`¶

`total_sessions_per_day`¶

`km_per_hour`¶

`km_per_drive`¶

`percent_of_sessions_to_favorite`¶