This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import scipy.stats as stats
from patsy import dmatrices
# statsmodels
from statsmodels.stats import diagnostic as diag
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
%matplotlib inline
import warnings
warnings.simplefilter('ignore', FutureWarning)
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w
df_train = pd.read_csv('../data/processed/df_train.csv', parse_dates=['date'])
df_test = pd.read_csv('../data/processed/df_test.csv', parse_dates=['date'])
fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(26, 12))
sns.lineplot(data=df_train, x='hour', y='count', hue='season', hue_order=['Winter','Spring','Summer','Autumn'], ax=ax[0][0], ci=None)
ax[0][0].set(xlabel='Hour', ylabel='', title='Seasons')
sns.lineplot(data=df_train, x='hour', y='count', hue='dayofweek', hue_order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], ax=ax[1][0], ci=None)
ax[1][0].set(xlabel='Hour', ylabel='', title='Days of week')
sns.pointplot(data=df_train, x="hour", y="count", hue="working_day", ax=ax[0][1], ci=None)
ax[0][1].set(xlabel='Hour', ylabel='', title='Working Days')
sns.pointplot(data=df_train, x='hour', y='count', hue='timesofday', hue_order=['Morning','Afternoon','Evening','Night'], ax=ax[1][1], ci=None)
ax[1][1].set(xlabel='Hour', ylabel='', title='Times of the Day')
plt.show()
When we plot the number of rentals for each hour of the day, we notice that no discrepancy is found across the different seasons, and weekends/holidays and even weekdays follow a similar pattern throughout the day.
monthly_data = df_train.groupby('month')['count'].agg('sum').reset_index(name='total_rentals')
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
monthly_data['monthly_data_str'] = monthly_data['month'].map(lambda d : month_map[d])
monthly_stats = f"Mean = {round(monthly_data['total_rentals'].mean(), 2)} \nStd. Dev = {round(monthly_data['total_rentals'].std(), 2)}"
monthly_data = monthly_data[['monthly_data_str', 'total_rentals']]
fig, ax = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 3]}, figsize=(10,6), tight_layout=True)
ax[0].axis('off')
ax[0].axis('tight')
table = ax[0].table(cellText=monthly_data.values, colLabels=['Month','Total Rentals'], cellLoc='center', loc='center', colColours =["#eb826b"] * 2)
table.auto_set_font_size(False)
table.set_fontsize(12)
table.auto_set_column_width(col=list(range(len(monthly_data.columns))))
table.scale(3,2)
monthly_plot = sns.pointplot(x=monthly_data["monthly_data_str"], y=monthly_data["total_rentals"], ax=ax[1])
ax[1].set(ylim=(0,max(monthly_data["total_rentals"]+100)), xlabel='Month', ylabel='Total Rentals')
monthly_plot.annotate(monthly_stats, xy=(3, 1500), fontsize=16, fontfamily='monospace')
fig.tight_layout()
plt.show()
df_train.info()
def daily_data(df):
    """Aggregate hourly rentals into daily totals and keep the daily-level attributes."""
    slice_daily_data = df[['date', 'holiday', 'dayofweek', 'working_day', 'season']]
    daily_totals = df.groupby('date')['count'].agg('sum').reset_index(name='daily_rentals')
    time_series_df = pd.merge(daily_totals, slice_daily_data, on='date', how='left')
    time_series_df.drop_duplicates(subset='date', keep='first', inplace=True)
    time_series_df.set_index('date', inplace=True)
    return time_series_df
daily_rentals_train = df_train.copy()
daily_rentals_test = df_test.copy()
time_series_df_train = daily_data(daily_rentals_train)
time_series_df_test = daily_data(daily_rentals_test)
# label each set so they can be separated again after concatenation
time_series_df_train['type'] = 'train'
time_series_df_test['type'] = 'test'
final_df = pd.concat([time_series_df_train, time_series_df_test], axis=0)
train_df, test_df = final_df[final_df['type'] == 'train'], final_df[final_df['type'] == 'test']
fig, ax = plt.subplots(figsize=(22, 12))
sns.lineplot(x=train_df.index,
y=train_df["daily_rentals"], ax=ax)
sns.lineplot(x=test_df.index,
y=test_df["daily_rentals"], ax=ax)
ax.axhline(45, ls='--', color='grey')
ax.set(xlabel='Time', ylabel='Rentals Count')
ax.set(ylim=(10,max(final_df["daily_rentals"])+10))
ax.set_title("Time Series on Daily Rentals", fontsize=20)
plt.legend(['train','test'])
plt.show()
train_df[train_df['daily_rentals'] < 45]
If a time series of count data is generated by a Poisson point process, then event occurrences in successive time intervals are independent. Independence is a reasonable assumption when the underlying stochastic process for events, conditional on covariates, has no memory; then there is no need for special time series models. For example, the number of deaths (or births) in a region may be uncorrelated over time. At the same time, the population, which cumulates births and deaths, will be very highly correlated over time.
The first step for time series count data is therefore to test for serial correlation.
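As a quick first check, here is a minimal sketch using the Ljung-Box test on the daily rental counts, via the diagnostic module already imported as diag; the 14-day lag window is an illustrative assumption, not a choice made elsewhere in this notebook.
# Ljung-Box test for serial correlation in daily rentals.
# H0: the series is independently distributed (no serial correlation).
lb_test = diag.acorr_ljungbox(train_df['daily_rentals'], lags=[14])
print(lb_test)  # a small lb_pvalue (< 0.05) suggests serial correlation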
expr = """ count ~ temp_r + wdsp + rhum + rain + holiday"""
y_train, X_train = dmatrices(expr, df_train, return_type = 'dataframe')
nb_training_results = sm.GLM(y_train, X_train, family = sm.families.NegativeBinomial()).fit(maxiter=5000, method='nm', cov_type='HC3')
print(nb_training_results.summary())
# print(f'AIC: {round(nb_training_results.aic,2)} \nBIC: {round(nb_training_results.bic,2)}')
The Durbin-Watson statistic always takes a value between 0 and 4. A value of 2.0 indicates that no autocorrelation is detected in the sample. Values from 0 to less than 2 point to positive autocorrelation, and values from 2 to 4 indicate negative autocorrelation.
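For intuition, a minimal sketch computing the statistic by hand on the model residuals; this mirrors the formula that statsmodels' durbin_watson applies below.
# Durbin-Watson by hand: DW = sum((e_t - e_{t-1})^2) / sum(e_t^2),
# which is approximately 2*(1 - r1), where r1 is the lag-1 autocorrelation
# of the residuals; hence DW close to 2 means no first-order autocorrelation.
e = np.asarray(nb_training_results.resid_response)
dw_manual = np.sum(np.diff(e)**2) / np.sum(e**2)
print(round(dw_manual, 2))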
# The response residuals are defined as endog - fittedvalues
# Hypotheses for the Durbin-Watson statistic:
# H0: first-order autocorrelation does not exist.
# H1: first-order autocorrelation exists.
print(f'Durbin-Watson test for serial correlation: {round(durbin_watson(nb_training_results.resid_response),2)}')
The statistic falls below 2, which points to possible positive autocorrelation.
A rule of thumb is that DW test statistic values in the range of 1.5 to 2.5 are relatively normal. Values outside this range could, however, be a cause for concern. Field (2009) suggests that values under 1 or more than 3 are a definite cause for concern.
Field, A. (2009). Discovering Statistics Using SPSS, 3rd Edition (Introducing Statistical Methods) (3rd ed.). SAGE Publications Ltd.
hourlyTM = df_train[['date','hour','count']].copy()
# build a datetime index by combining the date and hour columns
hourlyTM['datetime'] = pd.to_datetime(hourlyTM.date) + pd.to_timedelta(hourlyTM.hour, unit='h')
hourlyTM.set_index('datetime', inplace=True)
hourlyTM.drop(hourlyTM.tail(25).index, inplace=True)  # drop the last 25 hourly rows
Autocorrelation occurs when the residuals are not independent of each other.
fig = plt.figure(figsize=(18,10))
gs = fig.add_gridspec(2, 2)
ax0 = fig.add_subplot(gs[0, :])
ax1 = fig.add_subplot(gs[1, :])
plot_acf(nb_training_results.resid_response, ax=ax0)
plot_pacf(nb_training_results.resid_response, ax=ax1)
plt.show()
How to Handle Autocorrelation
If you reject the null hypothesis and conclude that autocorrelation is present in the residuals, then you have a few different options to correct this problem if you deem it to be serious enough:
For positive serial correlation, consider adding lags of the dependent and/or independent variable to the model (a sketch follows this list).
For negative serial correlation, check to make sure that none of your variables are overdifferenced.
For seasonal correlation, consider adding seasonal dummy variables to the model.
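To illustrate the first option, here is a hedged sketch that adds lags of the dependent variable and refits the negative binomial GLM; the lag depth of 2 and the reuse of the weather covariates from the formula above are assumptions for illustration.
# Add lagged rental counts as regressors and refit the NB GLM.
df_lagged = df_train.copy()
df_lagged['count_lag1'] = df_lagged['count'].shift(1)
df_lagged['count_lag2'] = df_lagged['count'].shift(2)
df_lagged = df_lagged.dropna(subset=['count_lag1', 'count_lag2'])
expr_lagged = """count ~ temp_r + wdsp + rhum + rain + holiday + count_lag1 + count_lag2"""
y_lag, X_lag = dmatrices(expr_lagged, df_lagged, return_type='dataframe')
nb_lagged_results = sm.GLM(y_lag, X_lag, family=sm.families.NegativeBinomial()).fit(maxiter=5000)
# the statistic should move closer to 2 if the lags soak up the serial correlation
print(f'Durbin-Watson with lagged terms: {round(durbin_watson(nb_lagged_results.resid_response), 2)}')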
# Stacking Random Forest and GBM
# NOTE: predict_on_test_set, rf_model, gbm_model, rf_cols and gbm_cols
# are assumed to be defined elsewhere in this project.
rf_pred = predict_on_test_set(rf_model, rf_cols)
gbm_pred = predict_on_test_set(gbm_model, gbm_cols)
# taking a weighted average of the output from the two models
y_pred = np.round(.20*rf_pred + .80*gbm_pred)
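For context, a hypothetical reconstruction of what a helper like predict_on_test_set could look like; this is a sketch assuming scikit-learn-style fitted models, not the project's actual implementation.
def predict_on_test_set(model, feature_cols):
    # Hypothetical helper: score the held-out test set with a fitted model,
    # using the feature columns it was trained on.
    return model.predict(df_test[feature_cols])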