This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import sys
import os
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Models & Evaluation
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# statsmodels
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
import statsmodels.stats as stats
# Boost models
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
import catboost as cat
from catboost import CatBoostRegressor
from sklearn import metrics
# Hyperparameter optimization
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Custom objects
sys.path.insert(0, os.path.abspath('../src/'))
import experiment_tracker as et
import time
import warnings
warnings.simplefilter('ignore', FutureWarning)
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)
df_train = pd.read_csv('../data/processed/df_train.csv')
df_test = pd.read_csv('../data/processed/df_test.csv')
df_train.info()
# creates a new object to keep track of the experiments
experiment_tracker = et.ExperimentTracker()
df = df_train.copy()
X = df.drop(['count'], axis=1)
y = df['count']
all_columns = list(X.columns)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_val.shape
test_df = df_test.copy()
X_test = test_df.drop(['count'], axis=1)
y_test = test_df['count']
X_test.shape
idea_dummy = et.Idea(idea='Dummy Regressor', potential_outcome='To use as a baseline model, expected to perform badly.')
experiment_tracker.new_idea(idea_dummy)
def get_train_val_score(model, predictors, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
X_train = X_train[[c for c in X_train.columns if c in predictors]]
X_val = X_val[[c for c in X_val.columns if c in predictors]]
model.fit(X_train, y_train)
# Predict on training and validation set
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
    # Calculate the RMSE and MAE
    # squared=True returns the MSE; squared=False returns the RMSE
    train_rmse = metrics.mean_squared_error(y_train, y_pred_train, squared=False)
    val_rmse = metrics.mean_squared_error(y_val, y_pred_val, squared=False)
    train_mae = metrics.mean_absolute_error(y_train, y_pred_train)
    val_mae = metrics.mean_absolute_error(y_val, y_pred_val)
    return train_rmse, val_rmse, train_mae, val_mae
def get_metrics_to_Experiment() -> list:
    # builds Score objects from the module-level metrics set by the last call to get_train_val_score
    rmse = et.Score('RMSE', '{:.4f}'.format(train_rmse), '{:.4f}'.format(val_rmse))
    mae = et.Score('MAE', '{:.4f}'.format(train_mae), '{:.4f}'.format(val_mae))
    return [rmse, mae]
from sklearn.dummy import DummyRegressor
predictors = ['temp','rhum','wdsp','rain']
dummy_regr = DummyRegressor(strategy="mean")
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(dummy_regr, predictors)
exp_dummy_regr = et.Experiment('Dummy Regressor', predictors=predictors, hyperparameters=dummy_regr.get_params(),
score=get_metrics_to_Experiment(), notes='Baseline Model for comparison')
experiment_tracker.add_experiment(exp_dummy_regr)
idea_linear = et.Idea(idea='Linear Regression', potential_outcome='Expected to perform poorly as we have many outliers and a count target variable')
experiment_tracker.new_idea(idea_linear)
from sklearn.linear_model import LinearRegression
predictors = ['temp','rhum','wdsp','rain']
lin_reg = LinearRegression()
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(lin_reg, predictors)
exp_lin_regr = et.Experiment('Linear Regression', predictors=predictors, hyperparameters='',
score=get_metrics_to_Experiment(), notes='Linear Regression')
experiment_tracker.add_experiment(exp_lin_regr)
experiment_tracker.update_idea(idea_linear, learnings='As expected, performance is poor given the many outliers and the count target variable')
idea_random_forest = et.Idea(idea='Random Forest (only weather features)', potential_outcome='Expected to perform better than the Linear Regression')
experiment_tracker.new_idea(idea_random_forest)
predictors = ['temp','rhum','wdsp','rain']
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(rf, predictors)
exp_rf_regr = et.Experiment('Random Forest (only weather features)', predictors=predictors, hyperparameters=rf.get_params(),
score=get_metrics_to_Experiment(), notes='')
experiment_tracker.add_experiment(exp_rf_regr)
# update ideas with learnings
learnings = \
"""Random Forest with just a few hyperparameters performed just a little better than linear regression (validations score).
It seems to be overfitting as we see validation scores much higher than training scores. It's a sign that tuning hyperparameters is needed."""
experiment_tracker.update_idea(idea_random_forest, learnings=str.strip(learnings))
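The overfitting noted above can be probed more robustly with cross-validation. KFold was imported earlier but never used; below is a minimal sketch (not one of the tracked experiments) of a cross-validated RMSE for the same model, assuming the rf and predictors defined above.
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated RMSE for the Random Forest above (sketch only)
cv_scores = cross_val_score(rf, X_train[predictors], y_train,
                            scoring='neg_root_mean_squared_error',
                            cv=KFold(n_splits=5, shuffle=True, random_state=42))
print('CV RMSE: {:.4f} (+/- {:.4f})'.format(-cv_scores.mean(), cv_scores.std()))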
import category_encoders as ce
def preprocessor(predictors: list) -> ColumnTransformer:
    # remainder='drop' (set below) drops every column not covered by the
    # transformers list; switch to remainder='passthrough' to keep them untouched
##################### Categorical variables #####################
all_cat_vars = ['timesofday','dayofweek','holiday','peak','hour','working_day','season','month']
cat_vars = [categorical_var for categorical_var in all_cat_vars if categorical_var in predictors]
# categorical variables
cat_pipe = Pipeline([
('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])
cat_encoder = 'cat', cat_pipe, cat_vars
##################### Numerical variables #####################
all_num_vars = ['rain', 'temp', 'rhum','wdsp','temp_r']
num_vars = [numerical_var for numerical_var in all_num_vars if numerical_var in predictors]
num_pipe = Pipeline([
('scaler', StandardScaler())
# ('scaler', MinMaxScaler())
])
    num_encoder = 'num', num_pipe, num_vars
##################### Ordinal variables #####################
all_ord_vars = ['wind_speed_group','rainfall_intensity']
ord_vars = [ordinal_var for ordinal_var in all_ord_vars if ordinal_var in predictors]
ordinal_cols_mapping = []
if 'wind_speed_group' in predictors:
ordinal_cols_mapping.append(
{"col":"wind_speed_group",
"mapping": {
'Calm / Light Breeze': 0,
'Breeze': 1,
'Moderate Breeze': 2,
'Strong Breeze / Near Gale': 3,
'Gale / Storm': 4
}}
)
if 'rainfall_intensity' in predictors:
ordinal_cols_mapping.append(
{"col":"rainfall_intensity",
"mapping": {
'no rain': 0,
'drizzle': 1,
'light rain': 2,
'moderate rain': 3,
'heavy rain': 4
}}
)
# ordinal variables
ord_pipe = Pipeline([
('ordinal', ce.OrdinalEncoder(mapping=ordinal_cols_mapping))
])
    ord_encoder = 'ordinal', ord_pipe, ord_vars
#################################################################################
    orig_vars = [var for var in predictors if var not in cat_vars and var not in num_vars and var not in ord_vars]
    # e.g. ['temp_bin','rhum_bin']; NOTE: with the passthrough append below left
    # commented out and remainder='drop', any such predictors are silently dropped
    orig_encoder = 'pass_vars', 'passthrough', orig_vars
    transformers_list = []
    if cat_vars:
        transformers_list.append(cat_encoder)
    if ord_vars:
        transformers_list.append(ord_encoder)
    if num_vars:
        transformers_list.append(num_encoder)
    # if orig_vars:
    #     transformers_list.append(orig_encoder)
    return ColumnTransformer(transformers=transformers_list, remainder='drop')
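A quick sanity check of the transformer (a sketch, not part of the original experiments): build it for a hypothetical predictor set covering all three branches and inspect the transformed shape.
# hypothetical predictor set hitting the categorical, ordinal and numerical branches
sample_predictors = ['timesofday', 'rainfall_intensity', 'temp', 'rhum']
ct = preprocessor(sample_predictors)
Xt = ct.fit_transform(X_train[sample_predictors])
print(Xt.shape)  # one-hot(timesofday) + ordinal(rainfall_intensity) + scaled temp/rhum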
def plot_feature_importances(model, columns, X=X_val, y=y_val, plot_title='Feature Importances using permutation'):
    # Plot permutation feature importances (model-agnostic; works on the whole pipeline)
from sklearn.inspection import permutation_importance
result = permutation_importance(
model, X[columns], y, n_repeats=10, random_state=42, n_jobs=2
)
feat_importances = pd.Series(result.importances_mean, index=columns)
feat_importances.sort_values(ascending=False, inplace=True)
fig = plt.figure(figsize=(15, 12))
sns.barplot(x=feat_importances.values, y=feat_importances.index, orient='h')
plt.title(plot_title)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.show()
idea_rf = et.Idea(idea='Random Forest with all expected features', potential_outcome='To use as a baseline model with all features.')
experiment_tracker.new_idea(idea_rf)
predictors = ['temp_r','rhum','holiday','dayofweek','timesofday','wind_speed_group','rainfall_intensity','peak','working_day']
# random forest model
params_rf = {'n_estimators': 100,
'max_depth': 10,
'random_state': 42}
# Fit a pipeline with transformers and an estimator to the training data
pipe_rf = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', RandomForestRegressor(**params_rf, criterion='squared_error'))
])
# pipe_rf.fit(X_train[predictors], y_train)
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf, predictors)
exp_rf_regr = et.Experiment('Random Forest with all expected features', predictors=predictors, hyperparameters=pipe_rf['model'].get_params(),
score=get_metrics_to_Experiment(), notes='Added all predictors and using preprocessing')
experiment_tracker.add_experiment(exp_rf_regr)
plot_feature_importances(model=pipe_rf, columns=predictors,plot_title='Random Forest Feature Importances')
learnings = \
"""Random Forest model with all features has decrease RSME and particularly in validation metrics."""
experiment_tracker.update_idea(idea_rf, learnings=str.strip(learnings))
idea_rf_cat = et.Idea(idea='Random Forest with all features as categorical', potential_outcome='Changing temp and rhum to categorical variables will improve the model, \
specifically predictions with boosting trees.')
experiment_tracker.new_idea(idea_rf_cat)
predictors = ['temp_bin','rhum_bin','holiday','dayofweek','timesofday','wind_speed_group','rainfall_intensity','peak','working_day']
# random forest model
params_rf = {'n_estimators': 100,
'max_depth': 20,
'random_state': 0,
'min_samples_split' : 5,
'n_jobs': -1}
# Fit a pipeline with transformers and an estimator to the training data
pipe_rf = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', RandomForestRegressor(**params_rf, criterion='squared_error'))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf, predictors)
exp_rf_regr = et.Experiment('Random Forest (all categoricals)', predictors=predictors, hyperparameters=pipe_rf['model'].get_params(),
score=get_metrics_to_Experiment(), notes='Added all predictors and using preprocessing')
experiment_tracker.add_experiment(exp_rf_regr)
plot_feature_importances(model=pipe_rf, columns=predictors,plot_title='Random Forest Feature Importances')
learnings = \
"""Changing temp and hum into categorical variables did not improve the model. The expected improvement is for boosting models."""
experiment_tracker.update_idea(idea_rf_cat, learnings=str.strip(learnings))
# from IPython.display import display
# from sklearn import set_config
# set_config(display='diagram')
# display(pipe_rf)
idea_catboost = et.Idea(idea='Catboost', potential_outcome='Using all features as categorical variables should perform better with boosting trees.')
experiment_tracker.new_idea(idea_catboost)
predictors = ['temp_bin','rhum_bin','holiday','dayofweek','timesofday','wind_speed_group','rainfall_intensity','peak','working_day']
# CatBoost model
params_catboost = {'n_estimators': 100,
'random_state': 42,
'loss_function': 'RMSE',
'verbose': 25}
# Fit a pipeline with transformers and an estimator to the training data
pipe_catboost = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', CatBoostRegressor(**params_catboost))
])
# fitparams_catboost = {'model__eval_set': (X_val[predictors], y_val)}
# pipe_catboost.named_steps.model.set_params(eval_set=(X_val, y_val))
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_catboost, predictors)
exp_catboost_regr = et.Experiment('Catboost model', predictors=predictors, hyperparameters=pipe_catboost['model'].get_params(),
                                  score=get_metrics_to_Experiment(), notes='Added all categorical features to use the CatBoost model.')
experiment_tracker.add_experiment(exp_catboost_regr)
plot_feature_importances(model=pipe_catboost, columns=predictors,plot_title='Catboost Feature Importances')
learnings = \
"""Catboost model improved RSME compared to Random Forest (all cat vars) but not as good as Random Forest (temp/hum as numerical feat)."""
experiment_tracker.update_idea(idea_catboost, learnings=str.strip(learnings))
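One caveat worth noting: the pipeline one-hot encodes everything before CatBoost, so CatBoost's native categorical handling (ordered target statistics) is never exercised. Below is a minimal sketch (not one of the tracked experiments) of feeding the raw columns directly via cat_features, assuming they can be treated as strings.
cat_features = ['temp_bin','rhum_bin','holiday','dayofweek','timesofday',
                'wind_speed_group','rainfall_intensity','peak','working_day']
cb_native = CatBoostRegressor(n_estimators=100, random_state=42, loss_function='RMSE',
                              verbose=0, cat_features=cat_features)
# CatBoost requires categorical features to be int or str, hence the cast
cb_native.fit(X_train[cat_features].astype(str), y_train,
              eval_set=(X_val[cat_features].astype(str), y_val))
print(cb_native.get_best_score())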
idea_svr = et.Idea(idea='SVM Regressor', potential_outcome='SVM Regressor may be a good model for this dataset since, unlike tree-based models, it can extrapolate beyond the training range.')
experiment_tracker.new_idea(idea_svr)
from sklearn.svm import SVR
predictors = ['temp_bin','rhum_bin','holiday','dayofweek','timesofday','wind_speed_group','rainfall_intensity','peak','working_day']
params_svr = {'kernel': 'poly',
'degree': 5,
'gamma': 'scale',
'C': 100
}
# Fit a pipeline with transformers and an estimator to the training data
pipe_svr = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', SVR(**params_svr))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_svr, predictors)
exp_svr = et.Experiment('Support Vector Regression', predictors=predictors, hyperparameters=pipe_svr['model'].get_params(),
score=get_metrics_to_Experiment(), notes='')
experiment_tracker.add_experiment(exp_svr)
plot_feature_importances(model=pipe_svr, columns=predictors,plot_title='SVM Regressor Feature Importances')
learnings = \
"""SVM Regressor did not improve the model. Also a few features that it does not seem important on all models, now are, for example Working Day.
This model will be discarded as it is not a good model for this dataset."""
experiment_tracker.update_idea(idea_svr, learnings=str.strip(learnings))
idea_gbm = et.Idea(idea='LightGBM', potential_outcome='Another gradient boosting model, lighter and often reported to outperform CatBoost.')
experiment_tracker.new_idea(idea_gbm)
predictors = ['temp_bin','rhum_bin','holiday','dayofweek','timesofday','wind_speed_group','rainfall_intensity','peak','working_day']
# LightGBM model
params_lightgbm = {'n_estimators': 100,
'random_state': 42,
'metric': 'rmse',
'verbose': 25
}
# Fit a pipeline with transformers and an estimator to the training data
pipe_lightgbm = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', LGBMRegressor(**params_lightgbm))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_lightgbm, predictors)
exp_lightgbm = et.Experiment('LightGBM', predictors=predictors, hyperparameters=pipe_lightgbm['model'].get_params(),
score=get_metrics_to_Experiment(), notes='')
experiment_tracker.add_experiment(exp_lightgbm)
plot_feature_importances(model=pipe_lightgbm, columns=predictors,plot_title='LightGBM Feature Importances')
learnings = \
"""LightGBM model did improve the model on Validation set. The gap between the validation and training set is not large as other models."""
experiment_tracker.update_idea(idea_gbm, learnings=str.strip(learnings))
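LightGBM has an analogous shortcut to CatBoost's: columns with pandas' 'category' dtype are split natively instead of being one-hot encoded first. A minimal sketch under that assumption (not a tracked experiment):
cat_cols = ['temp_bin','rhum_bin','holiday','dayofweek','timesofday',
            'wind_speed_group','rainfall_intensity','peak','working_day']
lgbm_native = LGBMRegressor(n_estimators=100, random_state=42)
# the 'category' dtype lets LightGBM treat these columns natively
lgbm_native.fit(X_train[cat_cols].astype('category'), y_train,
                eval_set=[(X_val[cat_cols].astype('category'), y_val)], eval_metric='rmse')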
experiment_tracker.print_partial_results()
# experiment_tracker.print_partial_results(filter_metric='rmse')
idea_rf_1 = et.Idea(idea='Random Forest (wind speed numerical + estimators increased)', potential_outcome="Because temperature and humidity work better for Random Forest as numerical variables, changing wind speed to numerical should increase the model's performance.")
experiment_tracker.new_idea(idea_rf_1)
predictors = ['temp_r','rhum','holiday','dayofweek','timesofday','wdsp','rainfall_intensity','peak','working_day']
params_rf_1 = {'n_estimators': 500,
'max_depth': 10,
'random_state': 42}
pipe_rf_1 = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', RandomForestRegressor(**params_rf_1, criterion='squared_error'))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf_1, predictors)
exp_rf_regr_1 = et.Experiment('Random Forest (wdsp + n_estimator = 500)', predictors=predictors, hyperparameters=pipe_rf_1['model'].get_params(),
score=get_metrics_to_Experiment(), notes='Changed wind speed group to numerical feature and increased n_estimators to 500.')
experiment_tracker.add_experiment(exp_rf_regr_1)
idea_rf_2 = et.Idea(idea='Random Forest (+ hour)', potential_outcome="Adding hour to the model will increase the model's performance because timesofday (time feature) is a good predictor.")
experiment_tracker.new_idea(idea_rf_2)
predictors = ['temp_r','rhum','holiday','dayofweek','timesofday','wind_speed_group','rainfall_intensity','peak','working_day', 'hour']
params_rf_2 = {'n_estimators': 500,
'max_depth': 10,
'random_state': 42}
pipe_rf_2 = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', RandomForestRegressor(**params_rf_2, criterion='squared_error'))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf_2, predictors)
exp_rf_regr_2 = et.Experiment('Random Forest (+ hour)', predictors=predictors, hyperparameters=pipe_rf_2['model'].get_params(),
score=get_metrics_to_Experiment(), notes='Added hour to the model')
experiment_tracker.add_experiment(exp_rf_regr_2)
learnings = \
"""Adding hour to the model did improve the model. Not substantially better than the previous model though."""
experiment_tracker.update_idea(idea_rf_2, learnings=str.strip(learnings))
idea_rf = et.Idea(idea='Random Forest (+ season and wdsp)', potential_outcome="Adding season and changing wind speed to numerical should increase the model's performance.")
experiment_tracker.new_idea(idea_rf)
predictors = ['temp_r','rhum','holiday','dayofweek','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_rf = {'n_estimators': 500,
'max_depth': 10,
'random_state': 42}
pipe_rf = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', RandomForestRegressor(**params_rf, criterion='squared_error'))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf, predictors)
exp_rf_regr = et.Experiment('Random Forest (+ season and wdsp)', predictors=predictors, hyperparameters=pipe_rf['model'].get_params(),
score=get_metrics_to_Experiment(), notes='Added season and wdsp to the model')
experiment_tracker.add_experiment(exp_rf_regr)
learnings = \
"""Adding season and changing wind speed to numerical did improve the model."""
experiment_tracker.update_idea(idea_rf, learnings=str.strip(learnings))
idea_rf = et.Idea(idea='Random Forest (max_features: sqrt)', potential_outcome="Changing the number of features to consider when looking for the best split to sqrt(n_features) can increase the performance.")
experiment_tracker.new_idea(idea_rf)
predictors = ['temp_r','rhum','holiday','dayofweek','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_rf = {'n_estimators': 500,
'max_depth': 10,
'max_features': 'sqrt',
'random_state': 42}
pipe_rf = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', RandomForestRegressor(**params_rf, criterion='squared_error'))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf, predictors)
exp_rf_regr = et.Experiment('Random Forest (max_features: sqrt)', predictors=predictors, hyperparameters=pipe_rf['model'].get_params(),
score=get_metrics_to_Experiment(), notes='Changed number of features to sqrt')
experiment_tracker.add_experiment(exp_rf_regr)
learnings = \
"""Changing hyperparameter max_features to sqrt did not improve the model."""
experiment_tracker.update_idea(idea_rf, learnings=str.strip(learnings))
experiment_tracker.print_partial_results()
idea_rf_grid = et.Idea(idea='Random Forest (GridSearchCV)', potential_outcome="Hyperparameter tuning with GridSearchCV can increase the model's performance and reduce overfitting.")
experiment_tracker.new_idea(idea_rf_grid)
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_rf_grid = {'bootstrap': [True],
'max_depth': [5, 10, 20, 30],
'max_features': ['log2',0.3,0.5,0.7],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [500, 1000, 1500, 2000]
}
pipe_rf_grid = Pipeline([
('preprocessor', preprocessor(predictors)),
('rf', GridSearchCV(RandomForestRegressor(random_state=42), param_grid=params_rf_grid, cv=3, scoring='neg_root_mean_squared_error', refit=True, n_jobs=-1, verbose=0))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf_grid, predictors)
exp_rf_regr_grid = et.Experiment('Random Forest (GridSearchCV)', predictors=predictors, hyperparameters=pipe_rf_grid['rf'].best_estimator_.get_params(),
score=get_metrics_to_Experiment(), notes='Hyperparameter tuning with GridSearchCV')
experiment_tracker.add_experiment(exp_rf_regr_grid)
experiment_tracker.print_partial_results(filter_model='Random Forest', filter_metric='rmse')
learnings = \
"""Random Forest (GridSearchCV) had a little impact on test set performance and it proved not worth as it is computationally intensive."""
experiment_tracker.update_idea(idea_rf_grid, learnings=str.strip(learnings))
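Since the exhaustive grid proved expensive, RandomizedSearchCV (imported above but never used) is the usual cheaper alternative: it samples a fixed number of configurations from the same space. A minimal sketch with assumed settings:
# sample 20 configurations instead of exhaustively searching the full grid (sketch only)
pipe_rf_rand = Pipeline([
    ('preprocessor', preprocessor(predictors)),
    ('rf', RandomizedSearchCV(RandomForestRegressor(random_state=42),
                              param_distributions=params_rf_grid, n_iter=20, cv=3,
                              scoring='neg_root_mean_squared_error',
                              random_state=42, n_jobs=-1))
])
# train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_rf_rand, predictors)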
idea_xgboost = et.Idea(idea='XGBoost', potential_outcome="XGBoost can improve the model's performance, as seen in many Kaggle competitions.")
experiment_tracker.new_idea(idea_xgboost)
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':10,
'seed': 42,
'eval_metric': 'rmse',
'verbosity': 0
}
pipe_xgboost = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', XGBRegressor(**params_xgboost))
])
train_rmse, val_rmse, train_mae, val_mae = get_train_val_score(pipe_xgboost, predictors)
exp_xgboost = et.Experiment('XGBoost', predictors=predictors, hyperparameters=pipe_xgboost['model'].get_params(),
score=get_metrics_to_Experiment(), notes='')
experiment_tracker.add_experiment(exp_xgboost)
learnings = \
"""XGBoost looks promising as it has the best score on the train set by far."""
experiment_tracker.update_idea(idea_xgboost, learnings=str.strip(learnings))
plot_feature_importances(model=pipe_xgboost, columns=predictors,plot_title='XGBoost Feature Importances')
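The large train/validation gap suggests XGBoost is overfitting; early stopping on the validation set is the usual guard. Because the eval_set must be preprocessed with the same transformer, this sketch fits outside the Pipeline (assumed settings; the early_stopping_rounds constructor argument requires a recent xgboost version):
# fit XGBoost with early stopping on the transformed validation set (sketch only)
prep = preprocessor(predictors)
X_tr, X_vl = prep.fit_transform(X_train[predictors]), prep.transform(X_val[predictors])
xgb_es = XGBRegressor(max_depth=10, seed=42, eval_metric='rmse', verbosity=0,
                      n_estimators=1000, early_stopping_rounds=50)
xgb_es.fit(X_tr, y_train, eval_set=[(X_vl, y_val)], verbose=False)
print('best iteration:', xgb_es.best_iteration)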
experiment_tracker.to_excel('../documentation/experiment_tracker.xlsx')
import joblib
# save the fitted grid-search estimator (note: the preprocessor step is not
# included in this pickle, so inputs must be transformed before predicting)
joblib.dump(pipe_rf_grid['rf'], '../models/RandomForest.pkl')
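X_test/y_test were prepared at the top of the notebook but never scored in this section; below is a minimal sketch of a final hold-out check using the fitted grid-search pipeline (which, unlike the pickled estimator, still includes the preprocessor):
# score the held-out test set with the full pipeline (sketch, not in the original notebook)
y_pred_test = pipe_rf_grid.predict(X_test[predictors])
print('Test RMSE: {:.4f} | Test MAE: {:.4f}'.format(
    metrics.mean_squared_error(y_test, y_pred_test, squared=False),
    metrics.mean_absolute_error(y_test, y_pred_test)))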
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w