This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import scipy.stats as stats
from patsy import dmatrices
# statsmodels
from statsmodels.stats import diagnostic as diag
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
%matplotlib inline
import warnings
warnings.simplefilter('ignore', FutureWarning)
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w
df_train = pd.read_csv('../data/processed/df_train.csv')
df_test = pd.read_csv('../data/processed/df_test.csv')
all_data = pd.read_csv('../data/processed/rentals_data.csv')
df_train.head()
# EDA libraries
# from pandas_profiling import ProfileReport
# from dataprep.eda import create_report
# import sweetviz as sv
# Dataprep Report
# hourly_data_report = create_report(hourly_data, title='Hourly Data Report', progress=False)
# hourly_data_report.save('../reports/dataprep/hourly_data_report')
# hourly_rentals_report = create_report(hourly_rentals, title='Hourly Data Report (Only Rentals)', progress=False)
# hourly_rentals_report.save('../reports/dataprep/hourly_rentals_report')
# Pandas Profiling
# profile = ProfileReport(hourly_data, title='Hourly Data', html={'style':{'full_width':True}})
# profile.to_file(output_file='../reports/pandas-profiling/hourly_data_report.html')
# profile = ProfileReport(hourly_rentals, title='Hourly Data Report (Only Rentals)', html={'style':{'full_width':True}})
# profile.to_file(output_file='../reports/pandas-profiling/hourly_rentals_report.html')
# to_notebook_iframe() will print in the jupyter notebook's cell
# profile.to_notebook_iframe()
# SweetViz
# hourlydata_sweetviz_report = sv.analyze(hourly_data, target_feat='count')
# hourlydata_sweetviz_report.show_html('../reports/sweetviz/hourly_data_report.html')
# hourlyrentals_sweetviz_report = sv.analyze(hourly_rentals, target_feat='count')
# hourlyrentals_sweetviz_report.show_html('../reports/sweetviz/hourly_rentals_report.html')
battery_dist_df = all_data[all_data['duration'] > 0].copy()
def group_battery_status(df):
bins= [0,30,50,80,100]
labels = ['< 30%','30% - 50%','50% - 80%','> 80%']
df['battery_status'] = pd.cut(df['start_battery'], bins=bins, labels=labels, right=False)
s = df.battery_status
counts = s.value_counts()
percent = s.value_counts(normalize=True)
df_summary = pd.DataFrame({'counts': counts, 'per': percent}, labels)
df_summary['per100'] = round((df_summary['per']*100),2).astype(str) + '%'
return df_summary
battery_dist = group_battery_status(battery_dist_df)
battery_dist
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(data=battery_dist_df, x='start_battery', kde=True)
plt.show()
avg_rental_duration = f"Average rentals duration: {round(all_data['duration'].mean(), 2)} minutes"
fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(22, 12))
# fig.text(0.125,1.1, avg_rental_duration, fontfamily='serif',fontsize=14, fontweight='bold', color='#444444')
fig.text(0.35,0.95,avg_rental_duration,fontfamily='serif',fontsize=22,fontweight='bold',color='#444444')
season_rentduration = all_data.groupby(['season'])['duration'].mean()
sns.barplot(x=season_rentduration.values, y=season_rentduration.index, ax=ax[0][0], orient='h', palette='husl')
ax[0][0].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals by season')
day_of_week_rentduration = all_data.groupby(['dayofweek'])['duration'].mean()
sns.barplot(x=day_of_week_rentduration.values, y=day_of_week_rentduration.index, ci=None, ax=ax[1][0],
order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], orient='h', palette='husl')
ax[1][0].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals across all days of week')
holiday_rentduration = all_data.groupby(['holiday'])['duration'].mean()
sns.barplot(x=holiday_rentduration.values, y=holiday_rentduration.index, ax=ax[0][1], orient='h', palette='husl')
ax[0][1].set(xlabel='Duration (minutes)', ylabel='Bank Holiday', title='Average duration (in minutes) of rentals by Bank Holiday')
# ax[0][1].bar_label(ax[0][1].containers[0])
timesofday_rentduration = all_data.groupby(['timesofday'])['duration'].mean()
sns.barplot(x=timesofday_rentduration.values, y=timesofday_rentduration.index, ax=ax[1][1],
order=['Morning','Afternoon','Evening','Night'], orient='h', palette='husl')
ax[1][1].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals across Times of the Day')
plt.show()
We can spot a few patterns when in colder days (Autumn and Winter | Night), the duration of rentals on average is higher. It seems that, on average, customers stay longer with bikes on early days of the week (Monday, Tuesday and Wednesday). On the other hand, it does not seem to be a difference on average rentals duration on Bank Holidays.
fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(22, 12))
sns.barplot(data=df_train,
x='count', y='season', order=['Winter','Spring','Summer','Autumn'], ci=None, ax=ax[0][0])
ax[0][0].set(xlabel='Number of Rentals', ylabel='Season', title='Rentals across all seasons')
day_of_week = df_train.groupby('dayofweek')['count'].agg('sum').reset_index(name='count')
sns.barplot(data=day_of_week, x='count', y='dayofweek', ci=None, ax=ax[1][0],
order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
ax[1][0].set(xlabel='Number of Rentals', ylabel='Day of Week', title='Rentals across all days of week')
sns.barplot(data=df_train, x='count', y='working_day', ci=None, ax=ax[0][1], orient='h')
# sns.lineplot(data=df_train , x="hour", y="count", hue="working_day", ax=ax[0][1], ci=None)
ax[0][1].set(xlabel='Number of Rentals', ylabel='Working Day', title='Rentals across Working Days')
sns.barplot(data=df_train, x='count', y='timesofday', ci=None, ax=ax[1][1],
order=['Morning','Afternoon','Evening','Night'])
ax[1][1].set(xlabel='Number of Rentals', ylabel='Period of the Day', title='Rentals across Times of the Day')
plt.show()
As expected, in the Spring and Summer had more rentals than on Autumn and Winter as the weather impacts directly the number of rentals and those seasons usually have better weather. Also, the number of rentals is significant higher in the Afternoon followed by Mornining, Evening and then Night. Again, the number of rentals do not seem to have a significant difference on Weekends/Holidays.
As consequence of the duration of rentals being higher during early weekdays, we see that Thursday, Friday and Saturday have a higher flux of new rentals.
fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(26, 12))
sns.lineplot(data=df_train, x='hour', y='count', hue='season', hue_order=['Winter','Spring','Summer','Autumn'], ax=ax[0][0], ci=None)
ax[0][0].set(xlabel='Hour', ylabel='', title='Seasons')
sns.lineplot(data=df_train, x='hour', y='count', hue='dayofweek', hue_order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], ax=ax[1][0], ci=None)
ax[1][0].set(xlabel='Hour', ylabel='', title='Days of week')
sns.pointplot(data=df_train, x="hour", y="count", hue="working_day", ax=ax[0][1], ci=None)
ax[0][1].set(xlabel='Hour', ylabel='', title='Working Days')
sns.pointplot(data=df_train, x='hour', y='count', hue='timesofday', hue_order=['Morning','Afternoon','Evening','Night'], ax=ax[1][1], ci=None)
ax[1][1].set(xlabel='Hour', ylabel='', title='Times of the Day')
plt.show()
When we plot the number o rentals on each hour of day, we notice that no discrepancy's found on different Seasons, Weekends/Holidays and even week days follow a similar pattern throughout the day.
fig, axes = plt.subplots(nrows=2,ncols=4, figsize=(30, 16))
sns.boxplot(data=df_train,y="count",orient="v",ax=axes[0][0])
sns.boxplot(data=df_train,y="count",x="season",orient="v",ax=axes[0][1])
sns.boxplot(data=df_train,y="count",x="dayofweek",orient="v",ax=axes[1][0],
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
sns.boxplot(data=df_train,y="count",x="working_day",orient="v",ax=axes[1][1])
sns.boxplot(data=df_train,y="count",x="timesofday",orient="v",ax=axes[0][2], order=['Morning', 'Afternoon', 'Evening', 'Night'])
sns.boxplot(data=df_train,y="count",x="rainfall_intensity",orient="v",ax=axes[1][2],
order=['no rain', 'drizzle', 'light rain', 'moderate rain', 'heavy rain'])
monthly_data = df_train[['month','count']].copy()
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
monthly_data['month_str'] = df_train['month'].map(lambda d : month_map[d])
sns.boxplot(data=monthly_data, x="month_str", y="count", orient="v",ax=axes[0][3], order=month_map.values())
sns.boxplot(data=df_train, x="holiday", y="count", orient="v",ax=axes[1][3])
axes[0][0].set(ylabel='Count',title="Box Plot On Count")
axes[0][1].set(xlabel='', ylabel='',title="Box Plot On Count Across Seasons")
axes[1][0].set(xlabel='', ylabel='',title="Box Plot On Count by Day of Week")
axes[1][1].set(xlabel='Working Day', ylabel='',title="Box Plot On Count by Working Day")
axes[0][2].set(xlabel='', ylabel='',title="Box Plot On Count by Times of Day")
axes[1][2].set(xlabel='Rainfall Intensity Levels', ylabel='',title="Box Plot On Count by Rainfall Intensity")
axes[0][3].set(xlabel='', ylabel='', title="Box Plot On Montly Rentals Count")
axes[1][3].set(xlabel='Bank Holiday', ylabel='', title="Box Plot On Count by Working Day")
plt.show()
fig, ax = plt.subplots(figsize=(18, 6))
sns.boxplot(data=df_train, x="hour", y="count", orient="v",ax=ax)
ax.set(xlabel='Hour', ylabel='Count')
ax.set_title(fontsize=20, label="Box Plot On Hourly Rentals Count")
plt.show()
As demonstrated above by line plots and from the boxplot, in the afternoon bikes are more rented on average and in total. Hourly means have demonstrated being very scattered.
fig = plt.figure(figsize=(18,10))
gs = fig.add_gridspec(2, 2)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[1, :])
sns.boxplot(data=df_train, x="temp_bin", y="count", orient="v",ax=ax0)
sns.boxplot(data=df_train, x="rhum_bin", y="count", orient="v",ax=ax1)
sns.boxplot(data=df_train, x="wind_speed_group", y="count", orient="v",ax=ax2,
order=['Calm / Light Breeze', 'Breeze', 'Moderate Breeze', 'Strong Breeze / Near Gale','Gale / Storm'])
ax0.set(xlabel='Temperature (bins - kmeans)',title="BoxPlot - Temperature")
ax1.set(xlabel='Relative Humidity (bins - kmeans)',title="BoxPlot - Relative Humidity")
ax2.set(xlabel='',title="BoxPlot - Wind Speed Levels")
plt.show()
Weather features, as temperature and humidity, have visually mean differences and it makes sense when deciding to use a bike. Wind Speed seems to decrease usage on average only when it is strong and storms.
The ANOVA test has important assumptions that must be satisfied in order for the associated p-value to be valid.
count feature does not have normal distribution, using non-parametric Kruskal-Wallis H-test instead
# Defining a function to find the statistical relationship with all the categorical variables
def kruskal_test_categorical_features(df, target, cat_vars):
from scipy.stats import kruskal
# Creating an empty list of final selected predictors
selected_predictors=[]
print('##### Kruskal-Wallis H-test Results ##### \n')
for predictor in cat_vars:
cat_grouplist=df.groupby(predictor)[target].apply(list)
kruskal_res = kruskal(*cat_grouplist) # Kruskal-Wallis H-test
# If the ANOVA P-Value is <0.05, that means we reject H0
if (kruskal_res[1] < 0.05):
print(predictor, 'is correlated with', target, '| p-value:', kruskal_res[1])
selected_predictors.append(predictor)
else:
print(predictor, 'is NOT correlated with', target, '| p-value:', kruskal_res[1])
return(selected_predictors)
# Calling the function to check which categorical variables are correlated with target
cat_vars=\
['season', 'hour', 'holiday', 'working_day', 'peak', 'timesofday', 'rainfall_intensity', 'dayofweek', 'temp_bin', 'rhum_bin', 'wind_speed_group', 'wind_bft']
kruskal_test_categorical_features(df=df_train, target='count', cat_vars=cat_vars)
corrMatt = df_train[['temp','wdsp','rhum','count']].corr()
mask = np.array(corrMatt)
mask[np.tril_indices_from(mask)] = False
cmap = sns.diverging_palette(220, 20, as_cmap=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrMatt, mask=mask,vmax=.3, annot=True, ax=ax, cmap=cmap)
plt.show()
df1Corr=pd.DataFrame(corrMatt.drop('count').unstack().sort_values(ascending=False)['count'],columns=['Correlation to the target'])
df1Corr.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))
fig = plt.figure(figsize=(22,6))
gs = fig.add_gridspec(2, 3)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])
sns.histplot(df_train['temp'],ax=ax0, stat='density', kde=True)
sns.histplot(df_train['rhum'],ax=ax1, stat='density', kde=True)
sns.histplot(df_train['wdsp'],ax=ax2, stat='density', kde=True)
ax0.set(xlabel='Temperature',title="Distribution - Temperature")
ax1.set(xlabel='Relative Humidity',title="Distribution - Relative Humidity")
ax2.set(xlabel='Wind Speed',title="Distribution - Wind Speed")
plt.show()
sns.pairplot(df_train,
x_vars=['temp','wdsp','rhum'],
dropna=True,
y_vars='count',
height=6,
kind="reg",
palette='Set1')
plt.show()
Temperature and Relative Humidity show a strong correlation with Rentals Count (positive and negative, respectively) but Wind Speed does not seem to be correlated with Rentals Count.
pp = sns.pairplot(df_train,
x_vars='wdsp',
dropna=True,
y_vars='count', height=8, aspect=0.8, kind="reg", hue='season')
pp.fig.suptitle("Relationship between Wind Speed vs Rentals Count by Seasion")
plt.show()
If we plot the relationship by Season we can see that Wind Speed is correlated but depending on the Season is positive or negative. Which makes sense since on Autumn and Winter, as it is colder and temperature is normally low, Rentals decrease as the Wind Speed increases. In the Spring and Summer, because the temperature are on average higher the wind speed we see a different effect (positive correlation).
monthly_data = df_train.groupby('month')['count'].agg('sum').reset_index(name='total_rentals')
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
monthly_data['monthly_data_str'] = monthly_data['month'].map(lambda d : month_map[d])
monthly_stats = f"Mean = {round(monthly_data['total_rentals'].mean(), 2)} \nStd. Dev = {round(monthly_data['total_rentals'].std(), 2)}"
monthly_data = monthly_data[['monthly_data_str', 'total_rentals']]
fig, ax = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 3]}, figsize=(10,6), tight_layout=True)
ax[0].axis('off')
ax[0].axis('tight')
table = ax[0].table(cellText=monthly_data.values, colLabels=['Month','Total Rentals'], cellLoc='center', loc='center', colColours =["#eb826b"] * 2)
table.auto_set_font_size(False)
table.set_fontsize(12)
table.auto_set_column_width(col=list(range(len(monthly_data.columns))))
table.scale(3,2)
monthly_plot = sns.pointplot(x=monthly_data["monthly_data_str"], y=monthly_data["total_rentals"], ax=ax[1])
ax[1].set(ylim=(0,max(monthly_data["total_rentals"]+100)), xlabel='Month', ylabel='Total Rentals')
monthly_plot.annotate(monthly_stats, xy=(3, 1500), fontsize=16, fontfamily='monospace')
fig.tight_layout()
plt.show()
monthly_testdata = df_test.groupby('month')['count'].agg('sum').reset_index(name='total_rentals')
monthly_testdata['month'] = monthly_testdata['month'].map(lambda d : month_map[d])
monthly_testdata
def labels_zeros(df):
zero_values = len(df[df['count'] == 0])
stats_df = df['count'].agg(['mean','var','std'])
return f"Zeros: {round((zero_values/df.shape[0])*100, 2)}% \n" + \
f"Mean = {round(stats_df['mean'], 2)} \nVariance = {round(stats_df['var'], 2)} \nStd. Dev = {round(stats_df['std'], 2)} \n" +\
f"n = {df.shape[0]}"
df_night = df_train[df_train['timesofday'] == 'Night']
df_morn = df_train[df_train['timesofday'] == 'Morning']
df_aft = df_train[df_train['timesofday'] == 'Afternoon']
df_eve = df_train[df_train['timesofday'] == 'Evening']
fig = plt.figure(figsize=(22,8))
gs = fig.add_gridspec(2, 2)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1])
night_plot = sns.histplot(data=df_night, x='count', ax=ax0, stat='count', discrete=True, color='#e34a33')
morning_plot = sns.histplot(data=df_morn, x='count', ax=ax1, stat='count', discrete=True, color='#2ca25f')
afternoon_plot = sns.histplot(data=df_aft, x='count', ax=ax2, stat='count', discrete=True, color='#feb24c')
evening_plot = sns.histplot(data=df_eve, x='count', ax=ax3, stat='count', discrete=True, color='#43a2ca')
night_plot.annotate(labels_zeros(df_night), xy=(8, 600), fontsize=16, fontfamily='monospace')
label_stats = labels_zeros(df_morn)
morning_plot.annotate(labels_zeros(df_morn), xy=(15, 100), fontsize=16, fontfamily='monospace')
label_stats = labels_zeros(df_aft)
afternoon_plot.annotate(labels_zeros(df_aft), xy=(18, 100), fontsize=16, fontfamily='monospace')
label_stats = labels_zeros(df_eve)
evening_plot.annotate(labels_zeros(df_eve), xy=(13, 100), fontsize=16, fontfamily='monospace')
ax0.set(xlabel='',title="NIGHT Rentals Count")
ax1.set(xlabel='',title="MORNING Rentals Count")
ax2.set(xlabel='',title="AFTERNOON Rentals Count")
ax3.set(xlabel='',title="EVENING Rentals Count")
plt.show()
GitHub repository
Author: Leandro Pessini