This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini
import pandas as pd
from pandas import MultiIndex, Int16Dtype
import numpy as np
import datetime
import sys
import os
import joblib
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Models & Evaluation
from sklearn.model_selection import KFold
# Boost models
import xgboost as xgb
from sklearn import metrics
# Custom objects
sys.path.insert(0, os.path.abspath('../src/'))
import experiment_tracker as et
import time
import warnings
warnings.simplefilter('ignore', FutureWarning)
# ideas_df = pd.read_excel('../documentation/experiment_tracker.xlsx', sheet_name='Ideas')
# experiments_df = pd.read_excel('../documentation/experiment_tracker.xlsx', sheet_name='Experiments')
# creates a new object to keep track of the experiments
experiment_tracker = et.ExperimentTracker()
df_train = pd.read_csv('../data/processed/df_train.csv')
df_test = pd.read_csv('../data/processed/df_test.csv')
df = df_train.copy()
X = df.drop(['count'], axis=1)
y = df.pop('count')
all_columns = list(X.columns)
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X.shape
test_df = df_test.copy()
X_test = test_df.drop(['count'], axis=1)
y_test = test_df.pop('count')
X_test.shape
def get_metrics_to_Experiment(dict_scores = None) -> list:
if dict_scores is None:
dict_scores = {}
rsme = et.Score('RSME', '{:.4f}'.format(dict_scores['train_rsme']), '{:.4f}'.format(dict_scores['val_rsme']))
mae = et.Score('MAE', '{:.4f}'.format(dict_scores['train_mae']), '{:.4f}'.format(dict_scores['val_mae']))
return [rsme, mae]
import category_encoders as ce
def preprocessor(predictors: list) -> ColumnTransformer:
# Setting remainder='passthrough' will mean that all columns not specified in the list of “transformers”
# will be passed through without transformation, instead of being dropped
##################### Categorical variables #####################
all_cat_vars = ['timesofday','dayofweek','holiday','peak','hour','working_day','season','month']
cat_vars = [categorical_var for categorical_var in all_cat_vars if categorical_var in predictors]
# categorical variables
cat_pipe = Pipeline([
('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])
cat_encoder = 'cat', cat_pipe, cat_vars
##################### Numerical variables #####################
all_num_vars = ['rain', 'temp', 'rhum','wdsp','temp_r']
num_vars = [numerical_var for numerical_var in all_num_vars if numerical_var in predictors]
num_pipe = Pipeline([
('scaler', StandardScaler())
# ('scaler', MinMaxScaler())
])
num_enconder = 'num', num_pipe, num_vars
##################### Ordinal variables #####################
all_ord_vars = ['wind_speed_group','rainfall_intensity']
ord_vars = [ordinal_var for ordinal_var in all_ord_vars if ordinal_var in predictors]
ordinal_cols_mapping = []
if 'wind_speed_group' in predictors:
ordinal_cols_mapping.append(
{"col":"wind_speed_group",
"mapping": {
'Calm / Light Breeze': 0,
'Breeze': 1,
'Moderate Breeze': 2,
'Strong Breeze / Near Gale': 3,
'Gale / Storm': 4
}}
)
if 'rainfall_intensity' in predictors:
ordinal_cols_mapping.append(
{"col":"rainfall_intensity",
"mapping": {
'no rain': 0,
'drizzle': 1,
'light rain': 2,
'moderate rain': 3,
'heavy rain': 4
}}
)
# ordinal variables
ord_pipe = Pipeline([
('ordinal', ce.OrdinalEncoder(mapping=ordinal_cols_mapping))
])
ord_enconder = 'ordinal', ord_pipe, ord_vars
#################################################################################
orig_vars = [var for var in predictors if var not in cat_vars and var not in num_vars and var not in ord_vars]
orig_enconder = 'pass_vars', 'passthrough', orig_vars
# ['temp_bin','rhum_bin']
# ord_pipe = 'passthrough'
transformers_list = []
transformers_list.append(cat_encoder) if cat_vars else None
transformers_list.append(ord_enconder) if ord_vars else None
transformers_list.append(num_enconder) if num_vars else None
# transformers_list.append(orig_enconder) if orig_vars else None
return ColumnTransformer(transformers=transformers_list,
remainder='drop')
def summarize_dict(dictionary, function):
return {k: function(v) for k,v in dictionary.items()}
def kfold_score(params, predictors, X=X, y=y, n_folds=5, verbose=50, early_stopping_rounds=10):
pipe_xgboost = Pipeline([
('preprocessor', preprocessor(predictors)),
('model', xgb.XGBRegressor(**params))
])
X = X[[c for c in X.columns if c in predictors]]
cv = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
scores = {"train_rsme":[],"val_rsme":[],"train_mae":[],"val_mae":[]}
for n_fold, (train_index, test_index) in enumerate(cv.split(X, y)):
print('#'*40, f'Fold {n_fold+1} out of {cv.n_splits}', '#'*40)
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# Xy = xgb.DMatrix(X_train, y_train, enable_categorical=True)
X_test_transformed = pipe_xgboost['preprocessor'].fit_transform(X_test)
pipe_xgboost.fit(X_train, y_train,
model__eval_set=[(X_test_transformed, y_test)],
model__early_stopping_rounds=early_stopping_rounds,
model__verbose=verbose)
# pipe_xgboost.fit(X_train, y_train)
# print(pipe_xgboost['model'].evals_result())
# Predict on training and validation set
y_pred_train = pipe_xgboost.predict(X_train)
y_pred_val = pipe_xgboost.predict(X_test)
# Calculate the RSME and MAE
# If squared = True returns MSE value, if False returns RMSE value.
scores['train_rsme'].append(metrics.mean_squared_error(y_train, y_pred_train, squared=False))
scores['val_rsme'].append(metrics.mean_squared_error(y_test, y_pred_val, squared=False))
scores['train_mae'].append(metrics.mean_absolute_error(y_train, y_pred_train))
scores['val_mae'].append(metrics.mean_absolute_error(y_test, y_pred_val))
print(f"Fold {n_fold+1} - best iteration: {pipe_xgboost['model'].get_booster().best_iteration}\n")
return summarize_dict(scores, np.mean), pipe_xgboost
#predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
# Baseline model
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':3,
'n_estimators': 500,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=3)
exp_xgboost = et.Experiment('XGBoost (Baseline)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores), notes='Baseline XGBoost')
experiment_tracker.add_experiment(exp_xgboost)
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':3,
'eta': 0.2,
'n_estimators': 500,
'subsample': 1,
'colsample_bytree': 0.5,
'gamma': 1,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=3)
exp_xgboost = et.Experiment('XGBoost 1', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores), notes='')
experiment_tracker.add_experiment(exp_xgboost)
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':9,
'eta': 0.01,
'n_estimators': 1000,
'subsample': 0.7,
'colsample_bytree': 0.5,
'gamma': 1,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (max_depth, eta, subsampel and estimators)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
experiment_tracker.print_partial_results(filter_metric='rsme')
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':9,
'eta': 0.01,
'n_estimators': 1000,
'subsample': 0.7,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (gamma: 1.5)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
experiment_tracker.print_partial_results(filter_metric='rsme')
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':7,
'eta': 0.01,
'n_estimators': 1000,
'subsample': 0.7,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (max_depth: 7)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
predictors = ['temp','rhum','dayofweek','timesofday','wdsp','rainfall_intensity', 'working_day', 'hour', 'season']
params_xgboost = {'max_depth':7,
'eta': 0.01,
'n_estimators': 1000,
'subsample': 0.7,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (- holiday and - peak features)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
import pickle
pickle.dump(xgb_model, open('../models/xgb_pipeline.pkl', 'wb'))
pickle.dump(xgb_model['model'], open('../models/xgboost.pkl', 'wb'))
xgb_model['model'].save_model('../models/XGBoost.json')
xgb_model['model'].save_model('../models/XGBoost.model')
predictors = ['temp','rhum','dayofweek','timesofday','wdsp','rainfall_intensity', 'hour', 'season']
params_xgboost = {'max_depth':7,
'eta': 0.01,
'n_estimators': 1000,
'subsample': 0.7,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (- working_day feat)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
predictors = ['temp','rhum','dayofweek','timesofday','wdsp','rainfall_intensity', 'hour', 'working_day']
params_xgboost = {'max_depth':7,
'eta': 0.01,
'n_estimators': 1000,
'subsample': 0.7,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (- season feat)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
experiment_tracker.print_partial_results(filter_metric='rsme')
predictors = ['temp','rhum','dayofweek', 'timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':11,
'eta': 0.001,
'n_estimators': 5000,
'subsample': 0.7,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=1000, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (eta: 0.001)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
predictors = ['temp','rhum','dayofweek', 'timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':9,
'eta': 0.001,
'n_estimators': 5000,
'subsample': 0.8,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, predictors, n_folds=5, verbose=1000, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (max_depth: 9 and subsample: 0.8)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores))
experiment_tracker.add_experiment(exp_xgboost)
experiment_tracker.print_partial_results()
pickle.dump(xgb_model, open('../models/xgboost_complex_pipe.pkl', 'wb'))
pickle.dump(xgb_model['model'], open('../models/xgboost_complex.pkl', 'wb'))
xgb_model['model'].save_model('../models/xgboost_complex.json')
xgb_model['model'].save_model('../models/xgboost_complex.model')
def normalized_rsme(value, dataset) -> float:
return value / max(dataset['count']) - min(dataset['count'])
# Normalized RSME - Root Mean Square Error
normtrain_rsme = normalized_rsme(dict_scores['train_rsme'], df_train)
normtrain_rsme
df_positives = df_train[df_train['count'] > 0].copy()
X_pos = df_positives.drop(['count'], axis=1)
y_pos = df_positives.pop('count')
X.shape, X_pos.shape
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']
params_xgboost = {'max_depth':9,
'eta': 0.001,
'n_estimators': 5000,
'subsample': 0.8,
'colsample_bytree': 0.5,
'gamma': 1.5,
'seed': 42,
'eval_metric': 'rmse'
}
dict_scores, xgb_model = kfold_score(params_xgboost, X=X_pos, y=y_pos, predictors=predictors, n_folds=5, verbose=1000, early_stopping_rounds=30)
exp_xgboost = et.Experiment('XGBoost (only + count values)', predictors=predictors, hyperparameters=xgb_model['model'].get_params(),
score=get_metrics_to_Experiment(dict_scores), notes='Checking possibility of using hurdle model')
experiment_tracker.add_experiment(exp_xgboost)
experiment_tracker.to_excel('../documentation/experiment_tracker_xgboost.xlsx')
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w
GitHub repository
Author: Leandro Pessini