Octocat This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini

Exploratory Data Analysis (EDA)¶

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import scipy.stats as stats
from patsy import dmatrices

# statsmodels
from statsmodels.stats import diagnostic as diag
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

%matplotlib inline
import warnings
warnings.simplefilter('ignore', FutureWarning)

%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w

Author: Leandro Pessini

Last updated: Sat May 14 2022

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 8.3.0

pandas     : 1.3.0
sys        : 3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15) 
[Clang 11.1.0 ]
matplotlib : 3.4.2
numpy      : 1.21.1
seaborn    : 0.11.1
scipy      : 1.8.0
statsmodels: 0.13.2

Watermark: 2.3.0

df_train = pd.read_csv('../data/processed/df_train.csv')
df_test = pd.read_csv('../data/processed/df_test.csv')
all_data = pd.read_csv('../data/processed/rentals_data.csv')
df_train.head()

# EDA libraries
# from pandas_profiling import ProfileReport
# from dataprep.eda import create_report
# import sweetviz as sv

# Dataprep Report

# hourly_data_report = create_report(hourly_data, title='Hourly Data Report', progress=False)
# hourly_data_report.save('../reports/dataprep/hourly_data_report')
# hourly_rentals_report = create_report(hourly_rentals, title='Hourly Data Report (Only Rentals)', progress=False)
# hourly_rentals_report.save('../reports/dataprep/hourly_rentals_report')

# Pandas Profiling

# profile = ProfileReport(hourly_data, title='Hourly Data', html={'style':{'full_width':True}})
# profile.to_file(output_file='../reports/pandas-profiling/hourly_data_report.html')

# profile = ProfileReport(hourly_rentals, title='Hourly Data Report (Only Rentals)', html={'style':{'full_width':True}})
# profile.to_file(output_file='../reports/pandas-profiling/hourly_rentals_report.html')

# to_notebook_iframe() will print in the jupyter notebook's cell
# profile.to_notebook_iframe()

# SweetViz

# hourlydata_sweetviz_report = sv.analyze(hourly_data, target_feat='count')
# hourlydata_sweetviz_report.show_html('../reports/sweetviz/hourly_data_report.html')

# hourlyrentals_sweetviz_report = sv.analyze(hourly_rentals, target_feat='count')
# hourlyrentals_sweetviz_report.show_html('../reports/sweetviz/hourly_rentals_report.html')

Battery distribution¶

battery_dist_df = all_data[all_data['duration'] > 0].copy()

def group_battery_status(df):

    bins= [0,30,50,80,100]
    labels = ['< 30%','30% - 50%','50% - 80%','> 80%']
    df['battery_status'] = pd.cut(df['start_battery'], bins=bins, labels=labels, right=False)

    s = df.battery_status
    counts = s.value_counts()
    percent = s.value_counts(normalize=True)
    df_summary = pd.DataFrame({'counts': counts, 'per': percent}, labels)
    df_summary['per100'] = round((df_summary['per']*100),2).astype(str) + '%'
    
    return df_summary

battery_dist = group_battery_status(battery_dist_df)
battery_dist

Initial battery density¶

fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(data=battery_dist_df, x='start_battery', kde=True)
plt.show()

Duration of rentals¶

avg_rental_duration = f"Average rentals duration: {round(all_data['duration'].mean(), 2)} minutes"

fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(22, 12))

# fig.text(0.125,1.1, avg_rental_duration, fontfamily='serif',fontsize=14, fontweight='bold', color='#444444')
fig.text(0.35,0.95,avg_rental_duration,fontfamily='serif',fontsize=22,fontweight='bold',color='#444444')

season_rentduration = all_data.groupby(['season'])['duration'].mean()
sns.barplot(x=season_rentduration.values, y=season_rentduration.index, ax=ax[0][0], orient='h', palette='husl')
ax[0][0].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals by season')

day_of_week_rentduration = all_data.groupby(['dayofweek'])['duration'].mean()
sns.barplot(x=day_of_week_rentduration.values, y=day_of_week_rentduration.index, ci=None, ax=ax[1][0],
            order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], orient='h', palette='husl')
ax[1][0].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals across all days of week')

holiday_rentduration = all_data.groupby(['holiday'])['duration'].mean()
sns.barplot(x=holiday_rentduration.values, y=holiday_rentduration.index, ax=ax[0][1], orient='h', palette='husl')
ax[0][1].set(xlabel='Duration (minutes)', ylabel='Bank Holiday', title='Average duration (in minutes) of rentals by Bank Holiday')
# ax[0][1].bar_label(ax[0][1].containers[0])

timesofday_rentduration = all_data.groupby(['timesofday'])['duration'].mean()
sns.barplot(x=timesofday_rentduration.values, y=timesofday_rentduration.index, ax=ax[1][1],
            order=['Morning','Afternoon','Evening','Night'], orient='h', palette='husl')
ax[1][1].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals across Times of the Day')

plt.show()

We can spot a few patterns when in colder days (Autumn and Winter | Night), the duration of rentals on average is higher. It seems that, on average, customers stay longer with bikes on early days of the week (Monday, Tuesday and Wednesday). On the other hand, it does not seem to be a difference on average rentals duration on Bank Holidays.

Total number of rentals¶

fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(22, 12))

sns.barplot(data=df_train, 
            x='count', y='season', order=['Winter','Spring','Summer','Autumn'], ci=None, ax=ax[0][0])
ax[0][0].set(xlabel='Number of Rentals', ylabel='Season', title='Rentals across all seasons')

day_of_week = df_train.groupby('dayofweek')['count'].agg('sum').reset_index(name='count')
sns.barplot(data=day_of_week, x='count', y='dayofweek', ci=None, ax=ax[1][0],
            order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
ax[1][0].set(xlabel='Number of Rentals', ylabel='Day of Week', title='Rentals across all days of week')

sns.barplot(data=df_train, x='count', y='working_day', ci=None, ax=ax[0][1], orient='h')
# sns.lineplot(data=df_train , x="hour", y="count", hue="working_day", ax=ax[0][1], ci=None)
ax[0][1].set(xlabel='Number of Rentals', ylabel='Working Day', title='Rentals across Working Days')

sns.barplot(data=df_train, x='count', y='timesofday', ci=None, ax=ax[1][1],
            order=['Morning','Afternoon','Evening','Night'])
ax[1][1].set(xlabel='Number of Rentals', ylabel='Period of the Day', title='Rentals across Times of the Day')

plt.show()

As expected, in the Spring and Summer had more rentals than on Autumn and Winter as the weather impacts directly the number of rentals and those seasons usually have better weather. Also, the number of rentals is significant higher in the Afternoon followed by Mornining, Evening and then Night. Again, the number of rentals do not seem to have a significant difference on Weekends/Holidays.

As consequence of the duration of rentals being higher during early weekdays, we see that Thursday, Friday and Saturday have a higher flux of new rentals.

Hourly number of rentals¶

fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(26, 12))

sns.lineplot(data=df_train, x='hour', y='count', hue='season', hue_order=['Winter','Spring','Summer','Autumn'], ax=ax[0][0], ci=None)
ax[0][0].set(xlabel='Hour', ylabel='', title='Seasons')

sns.lineplot(data=df_train, x='hour', y='count', hue='dayofweek', hue_order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], ax=ax[1][0], ci=None)
ax[1][0].set(xlabel='Hour', ylabel='', title='Days of week')

sns.pointplot(data=df_train, x="hour", y="count", hue="working_day", ax=ax[0][1], ci=None)
ax[0][1].set(xlabel='Hour', ylabel='', title='Working Days')

sns.pointplot(data=df_train, x='hour', y='count', hue='timesofday', hue_order=['Morning','Afternoon','Evening','Night'], ax=ax[1][1], ci=None)
ax[1][1].set(xlabel='Hour', ylabel='', title='Times of the Day')

plt.show()

When we plot the number o rentals on each hour of day, we notice that no discrepancy's found on different Seasons, Weekends/Holidays and even week days follow a similar pattern throughout the day.

BoxPlot analyses¶

fig, axes = plt.subplots(nrows=2,ncols=4, figsize=(30, 16))

sns.boxplot(data=df_train,y="count",orient="v",ax=axes[0][0])
sns.boxplot(data=df_train,y="count",x="season",orient="v",ax=axes[0][1])
sns.boxplot(data=df_train,y="count",x="dayofweek",orient="v",ax=axes[1][0], 
            order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
sns.boxplot(data=df_train,y="count",x="working_day",orient="v",ax=axes[1][1])
sns.boxplot(data=df_train,y="count",x="timesofday",orient="v",ax=axes[0][2], order=['Morning', 'Afternoon', 'Evening', 'Night'])
sns.boxplot(data=df_train,y="count",x="rainfall_intensity",orient="v",ax=axes[1][2], 
            order=['no rain', 'drizzle', 'light rain', 'moderate rain', 'heavy rain'])

monthly_data = df_train[['month','count']].copy()
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
monthly_data['month_str'] = df_train['month'].map(lambda d : month_map[d])
sns.boxplot(data=monthly_data, x="month_str", y="count", orient="v",ax=axes[0][3], order=month_map.values())
sns.boxplot(data=df_train, x="holiday", y="count", orient="v",ax=axes[1][3])

axes[0][0].set(ylabel='Count',title="Box Plot On Count")
axes[0][1].set(xlabel='', ylabel='',title="Box Plot On Count Across Seasons")
axes[1][0].set(xlabel='', ylabel='',title="Box Plot On Count by Day of Week")
axes[1][1].set(xlabel='Working Day', ylabel='',title="Box Plot On Count by Working Day")
axes[0][2].set(xlabel='', ylabel='',title="Box Plot On Count by Times of Day")
axes[1][2].set(xlabel='Rainfall Intensity Levels', ylabel='',title="Box Plot On Count by Rainfall Intensity")
axes[0][3].set(xlabel='', ylabel='', title="Box Plot On Montly Rentals Count")
axes[1][3].set(xlabel='Bank Holiday', ylabel='', title="Box Plot On Count by Working Day")

plt.show()

Main findings:¶

Bikes are, on average, less rented on Bank Holidays. This could be because Moby Bikes have a subscription plan and this could impact if people subscribe to commute, for example.
Apart from Moderate Rain, we do not see a big mean difference on Rainfall Intensity Levels.
We already saw on above plots that in the Afternoon where most of the bikes are rented and the same pattern is reflected on the means values.
Monthly average is also presenting some patterns that could indicate time series seasonality.

fig, ax = plt.subplots(figsize=(18, 6))
sns.boxplot(data=df_train, x="hour", y="count", orient="v",ax=ax)
ax.set(xlabel='Hour', ylabel='Count')
ax.set_title(fontsize=20, label="Box Plot On Hourly Rentals Count")
plt.show()

As demonstrated above by line plots and from the boxplot, in the afternoon bikes are more rented on average and in total. Hourly means have demonstrated being very scattered.

fig = plt.figure(figsize=(18,10))
gs = fig.add_gridspec(2, 2)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[1, :])

sns.boxplot(data=df_train, x="temp_bin", y="count", orient="v",ax=ax0)
sns.boxplot(data=df_train, x="rhum_bin", y="count", orient="v",ax=ax1)
sns.boxplot(data=df_train, x="wind_speed_group", y="count", orient="v",ax=ax2,
            order=['Calm / Light Breeze', 'Breeze', 'Moderate Breeze', 'Strong Breeze / Near Gale','Gale / Storm'])

ax0.set(xlabel='Temperature (bins - kmeans)',title="BoxPlot - Temperature")
ax1.set(xlabel='Relative Humidity (bins - kmeans)',title="BoxPlot - Relative Humidity")
ax2.set(xlabel='',title="BoxPlot - Wind Speed Levels")

plt.show()

Weather features, as temperature and humidity, have visually mean differences and it makes sense when deciding to use a bike. Wind Speed seems to decrease usage on average only when it is strong and storms.

Kruskal-Wallis H-test on categorical features¶

The ANOVA test has important assumptions that must be satisfied in order for the associated p-value to be valid.

The samples are independent.
Each sample is from a normally distributed population.
The population standard deviations of the groups are all equal. This property is known as homoscedasticity.

count feature does not have normal distribution, using non-parametric Kruskal-Wallis H-test instead

# Defining a function to find the statistical relationship with all the categorical variables
def kruskal_test_categorical_features(df, target, cat_vars):
    from scipy.stats import kruskal

    # Creating an empty list of final selected predictors
    selected_predictors=[]
    
    print('##### Kruskal-Wallis H-test Results ##### \n')
    for predictor in cat_vars:
        cat_grouplist=df.groupby(predictor)[target].apply(list)
        kruskal_res = kruskal(*cat_grouplist) #  Kruskal-Wallis H-test
        
        # If the ANOVA P-Value is <0.05, that means we reject H0
        if (kruskal_res[1] < 0.05):
            print(predictor, 'is correlated with', target, '| p-value:', kruskal_res[1])
            selected_predictors.append(predictor)
        else:
            print(predictor, 'is NOT correlated with', target, '| p-value:', kruskal_res[1])
    
    return(selected_predictors)

# Calling the function to check which categorical variables are correlated with target
cat_vars=\
    ['season', 'hour', 'holiday', 'working_day', 'peak', 'timesofday', 'rainfall_intensity', 'dayofweek', 'temp_bin', 'rhum_bin', 'wind_speed_group', 'wind_bft']
kruskal_test_categorical_features(df=df_train, target='count', cat_vars=cat_vars)

##### Kruskal-Wallis H-test Results ##### 

season is correlated with count | p-value: 3.235207575592282e-18
hour is correlated with count | p-value: 0.0
holiday is correlated with count | p-value: 0.0022325129212506403
working_day is NOT correlated with count | p-value: 0.8372798268983568
peak is correlated with count | p-value: 1.8482408173644874e-119
timesofday is correlated with count | p-value: 0.0
rainfall_intensity is correlated with count | p-value: 3.2862191459573774e-28
dayofweek is correlated with count | p-value: 7.741882355779844e-07
temp_bin is correlated with count | p-value: 4.5486744761349474e-247
rhum_bin is correlated with count | p-value: 0.0
wind_speed_group is correlated with count | p-value: 1.0308324657554141e-28
wind_bft is correlated with count | p-value: 1.810350081440984e-36

['season',
 'hour',
 'holiday',
 'peak',
 'timesofday',
 'rainfall_intensity',
 'dayofweek',
 'temp_bin',
 'rhum_bin',
 'wind_speed_group',
 'wind_bft']

Correlation Matrix¶

corrMatt = df_train[['temp','wdsp','rhum','count']].corr()
mask = np.array(corrMatt)
mask[np.tril_indices_from(mask)] = False
cmap = sns.diverging_palette(220, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrMatt, mask=mask,vmax=.3, annot=True, ax=ax, cmap=cmap)
plt.show()

df1Corr=pd.DataFrame(corrMatt.drop('count').unstack().sort_values(ascending=False)['count'],columns=['Correlation to the target'])
df1Corr.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))

Distribution of numerical features¶

fig = plt.figure(figsize=(22,6))
gs = fig.add_gridspec(2, 3)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])

sns.histplot(df_train['temp'],ax=ax0, stat='density', kde=True)
sns.histplot(df_train['rhum'],ax=ax1, stat='density', kde=True)
sns.histplot(df_train['wdsp'],ax=ax2, stat='density', kde=True)

ax0.set(xlabel='Temperature',title="Distribution - Temperature")
ax1.set(xlabel='Relative Humidity',title="Distribution - Relative Humidity")
ax2.set(xlabel='Wind Speed',title="Distribution - Wind Speed")

plt.show()

Relations between Numerical Features and Target variable¶

sns.pairplot(df_train, 
             x_vars=['temp','wdsp','rhum'],
             dropna=True, 
             y_vars='count', 
             height=6, 
             kind="reg", 
             palette='Set1')
plt.show()

Temperature and Relative Humidity show a strong correlation with Rentals Count (positive and negative, respectively) but Wind Speed does not seem to be correlated with Rentals Count.

pp = sns.pairplot(df_train, 
             x_vars='wdsp',
             dropna=True, 
             y_vars='count', height=8, aspect=0.8, kind="reg", hue='season')
pp.fig.suptitle("Relationship between Wind Speed vs Rentals Count by Seasion")
plt.show()

If we plot the relationship by Season we can see that Wind Speed is correlated but depending on the Season is positive or negative. Which makes sense since on Autumn and Winter, as it is colder and temperature is normally low, Rentals decrease as the Wind Speed increases. In the Spring and Summer, because the temperature are on average higher the wind speed we see a different effect (positive correlation).

Monthly Data¶

monthly_data = df_train.groupby('month')['count'].agg('sum').reset_index(name='total_rentals')
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
monthly_data['monthly_data_str'] = monthly_data['month'].map(lambda d : month_map[d])
monthly_stats = f"Mean = {round(monthly_data['total_rentals'].mean(), 2)} \nStd. Dev = {round(monthly_data['total_rentals'].std(), 2)}"
monthly_data = monthly_data[['monthly_data_str', 'total_rentals']]
fig, ax = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 3]}, figsize=(10,6), tight_layout=True)
ax[0].axis('off')
ax[0].axis('tight')
table = ax[0].table(cellText=monthly_data.values, colLabels=['Month','Total Rentals'], cellLoc='center', loc='center', colColours =["#eb826b"] * 2)
table.auto_set_font_size(False)
table.set_fontsize(12)
table.auto_set_column_width(col=list(range(len(monthly_data.columns)))) 
table.scale(3,2)
monthly_plot = sns.pointplot(x=monthly_data["monthly_data_str"], y=monthly_data["total_rentals"], ax=ax[1])
ax[1].set(ylim=(0,max(monthly_data["total_rentals"]+100)), xlabel='Month', ylabel='Total Rentals')
monthly_plot.annotate(monthly_stats, xy=(3, 1500), fontsize=16, fontfamily='monospace')
fig.tight_layout()
plt.show()

monthly_testdata = df_test.groupby('month')['count'].agg('sum').reset_index(name='total_rentals')
monthly_testdata['month'] = monthly_testdata['month'].map(lambda d : month_map[d])
monthly_testdata

Distribution across Times of the Day¶

def labels_zeros(df):
    zero_values = len(df[df['count'] == 0])
    stats_df = df['count'].agg(['mean','var','std'])
    return f"Zeros: {round((zero_values/df.shape[0])*100, 2)}% \n" + \
        f"Mean = {round(stats_df['mean'], 2)} \nVariance = {round(stats_df['var'], 2)} \nStd. Dev = {round(stats_df['std'], 2)} \n" +\
        f"n = {df.shape[0]}"

df_night = df_train[df_train['timesofday'] == 'Night']
df_morn = df_train[df_train['timesofday'] == 'Morning']
df_aft = df_train[df_train['timesofday'] == 'Afternoon']
df_eve = df_train[df_train['timesofday'] == 'Evening']

fig = plt.figure(figsize=(22,8))
gs = fig.add_gridspec(2, 2)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1])

night_plot = sns.histplot(data=df_night, x='count', ax=ax0, stat='count', discrete=True, color='#e34a33')
morning_plot = sns.histplot(data=df_morn, x='count', ax=ax1, stat='count', discrete=True, color='#2ca25f')
afternoon_plot = sns.histplot(data=df_aft, x='count', ax=ax2, stat='count', discrete=True, color='#feb24c')
evening_plot = sns.histplot(data=df_eve, x='count', ax=ax3, stat='count', discrete=True, color='#43a2ca')

night_plot.annotate(labels_zeros(df_night), xy=(8, 600), fontsize=16, fontfamily='monospace')
label_stats = labels_zeros(df_morn)
morning_plot.annotate(labels_zeros(df_morn), xy=(15, 100), fontsize=16, fontfamily='monospace')
label_stats = labels_zeros(df_aft)
afternoon_plot.annotate(labels_zeros(df_aft), xy=(18, 100), fontsize=16, fontfamily='monospace')
label_stats = labels_zeros(df_eve)
evening_plot.annotate(labels_zeros(df_eve), xy=(13, 100), fontsize=16, fontfamily='monospace')

ax0.set(xlabel='',title="NIGHT Rentals Count")
ax1.set(xlabel='',title="MORNING Rentals Count")
ax2.set(xlabel='',title="AFTERNOON Rentals Count")
ax3.set(xlabel='',title="EVENING Rentals Count")

plt.show()

GitHub Mark GitHub repository
Author: Leandro Pessini

	counts	per	per100
< 30%	4482	0.086272	8.63%
30% - 50%	9552	0.183862	18.39%
50% - 80%	22236	0.428010	42.8%
> 80%	15682	0.301856	30.19%

	temp	rhum	wdsp	date	hour	day	month	year	count	...	working_day	season	peak	timesofday	rainfall_intensity	wind_bft	wind_speed_group	temp_r	rhum_bin
0	0.1	98	4	2021-03-01	0	1	3	2021	0	...	True	Winter	False	Night	no rain	2	Calm / Light Breeze	0	4.0
1	-1.1	98	3	2021-03-01	1	1	3	2021	0	...	True	Winter	False	Night	no rain	2	Calm / Light Breeze	-1	4.0
2	-1.2	98	4	2021-03-01	2	1	3	2021	1	...	True	Winter	False	Night	no rain	2	Calm / Light Breeze	-1	4.0
3	-0.9	100	5	2021-03-01	3	1	3	2021	0	...	True	Winter	False	Night	no rain	2	Calm / Light Breeze	-1	4.0
4	0.0	100	6	2021-03-01	4	1	3	2021	0	...	True	Winter	False	Night	no rain	2	Calm / Light Breeze	0	4.0

	month	total_rentals
0	Mar	2361
1	Apr	2737