This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import scipy.stats as stats
from patsy import dmatrices
# statsmodels
from statsmodels.stats import diagnostic as diag
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
%matplotlib inline
import warnings
warnings.simplefilter('ignore', FutureWarning)
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w
df_train = pd.read_csv('../data/processed/df_train.csv')
df_test = pd.read_csv('../data/processed/df_test.csv')
all_data = pd.read_csv('../data/processed/rentals_data.csv')
Influential Points (not all unusual observations have much of an impact on the model) - https://www.kirenz.com/post/2021-11-14-linear-regression-diagnostics-in-python/linear-regression-diagnostics-in-python/
expr = \
""" count ~ temp_bin + wind_speed_group + rhum_bin + rainfall_intensity + holiday + timesofday"""
y_train, X_train = dmatrices(expr, df_train, return_type = 'dataframe')
nb_training_results = sm.GLM(y_train, X_train, family = sm.families.NegativeBinomial(alpha = 1)).fit(maxiter=5000, method='nm', cov_type='HC3')
We can use influence plots to identify observations whose independent-variable values are unusual compared to the rest of the data.
Influence plots show the (externally) studentized residuals vs. the leverage of each observation:
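A minimal sketch of that leverage vs. studentized residuals view, assuming the GLMInfluence object exposes hat_matrix_diag and resid_studentized (as in recent statsmodels releases); infl_sketch is a name introduced here for illustration:
# Sketch: scatter leverage (hat values) against studentized residuals
infl_sketch = nb_training_results.get_influence(observed=False)
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(infl_sketch.hat_matrix_diag, infl_sketch.resid_studentized, alpha=0.5)
ax.set_xlabel('Leverage (hat values)')
ax.set_ylabel('Studentized residuals')
ax.set_title("Influence plot - leverage vs. studentized residuals")
plt.show()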
infl = nb_training_results.get_influence(observed=False)
fig, ax = plt.subplots(figsize=(10, 8))
infl.plot_index(y_var="cooks", threshold=3 * infl.cooks_distance[0].mean(),
ax=ax, title="Influence plots - Cook's Distance")
fig.tight_layout(pad=1.0)
plt.axhline(4 / df_train.shape[0], color='r', linestyle='--')
plt.show()
A data point with a large Cook’s Distance strongly influences the fitted values. A common rule of thumb is that any point with a Cook’s Distance over 4/n (where n is the total number of data points) is a potential outlier worth investigating.
# obtain Cook's distance
model_cooksd = infl.cooks_distance[0]
threshold_c = 4 / df_train.shape[0]
out_d = model_cooksd > threshold_c
subset_data = df_train[~out_d]
print(f'Number of possible outliers: {df_train.shape[0] - subset_data.shape[0]}')
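To see which rows drive those exceedances, the influence measures can be collected into a single table; this is a sketch assuming GLMInfluence.summary_frame() with a 'cooks_d' column, as in recent statsmodels releases:
# Sketch: tabulate influence measures and inspect the most influential rows
infl_summary = infl.summary_frame()
print(infl_summary.sort_values('cooks_d', ascending=False).head(10))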
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df_train,
y="count",
orient="v",ax=ax)
plt.show()
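The hourly cutoff used below (14 rentals) sits around the boxplot's upper whisker; as a sketch (not the cutoff's documented origin), the Tukey upper fence can be computed explicitly:
# Sketch: Tukey upper fence (Q3 + 1.5*IQR) for hourly rental counts
q1, q3 = df_train['count'].quantile([0.25, 0.75])
upper_fence = q3 + 1.5 * (q3 - q1)
print(f'Tukey upper fence: {upper_fence}')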
hourlyDataOutliers = df_train[df_train['count'] > 14]
hourlyDataWithoutOutliers = df_train[df_train['count'] <= 14]
print(f"Outliers (#): {round(hourlyDataOutliers.shape[0], 2)}")
print(f"Outliers (%): {round((hourlyDataOutliers.shape[0] / df_train.shape[0])*100, 2)}%")
hourlyDataOutliers['holiday'].value_counts()
hourlyDataOutliers['working_day'].value_counts(normalize=True)
hourlyDataOutliers['season'].value_counts()
The unusual observations do not seem to be related to holidays or weekends (working_day).
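One quick check is to compare the working_day split among the flagged hours with the split in the full training set:
# Compare working_day proportions: flagged hours vs. all training hours
print(hourlyDataOutliers['working_day'].value_counts(normalize=True))
print(df_train['working_day'].value_counts(normalize=True))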
daily_count = df_train.groupby('date')['count'].sum().reset_index()
daily_count_no_outliers = hourlyDataWithoutOutliers.groupby('date')['count'].sum().reset_index()
sns.boxplot(data=daily_count,y="count",orient="v")
daily_count[daily_count['count'] < 20]
The "outlier" showed on the boxplot is on Christmas Day.
daily_count.loc[295:302]
mean_daily = daily_count['count'].mean()
std_dev_daily = daily_count['count'].std()
slice_daily = mean_daily + (2*std_dev_daily)
daily_count[daily_count['count'] > slice_daily]
march_07 = df_train[df_train['date'] == '2021-03-07']
march_07[['hour', 'rain', 'temp', 'rhum', 'wdsp', 'season', 'timesofday','working_day', 'count']]
april_25 = df_train[df_train['date'] == '2021-04-25']
april_25[['hour', 'rain', 'temp', 'rhum', 'wdsp', 'season', 'timesofday', 'count']]
daily_count.describe()
print(f"Mean of daily rentals = {round(daily_count['count'].mean(), 2)}")
print(f"Mean of daily rentals without Outliers = {round(daily_count_no_outliers['count'].mean(), 2)}")
hourlyDataOutliers = hourlyDataOutliers.join(daily_count.set_index('date'), on='date', lsuffix='_hour', rsuffix='_day')
outliers_df = hourlyDataOutliers[hourlyDataOutliers['count_day'] > 110].sort_values(by='count_day', ascending=False)
outliers_df.head()
hourlyDataOutliers.to_csv('../data/interim/outliers.csv')