Octocat This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini

Exploratory Data Analysis (EDA)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import scipy.stats as stats
from patsy import dmatrices

# statsmodels
from statsmodels.stats import diagnostic as diag
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

%matplotlib inline
import warnings
warnings.simplefilter('ignore', FutureWarning)
In [3]:
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w
Author: Leandro Pessini

Last updated: Sat May 14 2022

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 8.3.0

pandas     : 1.3.0
sys        : 3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15) 
[Clang 11.1.0 ]
matplotlib : 3.4.2
numpy      : 1.21.1
seaborn    : 0.11.1
scipy      : 1.8.0
statsmodels: 0.13.2

Watermark: 2.3.0

In [4]:
df_train = pd.read_csv('../data/processed/df_train.csv')
df_test = pd.read_csv('../data/processed/df_test.csv')
all_data = pd.read_csv('../data/processed/rentals_data.csv')
rain temp rhum wdsp date hour day month year count ... working_day season peak timesofday rainfall_intensity wind_bft wind_speed_group temp_r temp_bin rhum_bin
0 0.0 0.1 98 4 2021-03-01 0 1 3 2021 0 ... True Winter False Night no rain 2 Calm / Light Breeze 0 0.0 4.0
1 0.0 -1.1 98 3 2021-03-01 1 1 3 2021 0 ... True Winter False Night no rain 2 Calm / Light Breeze -1 0.0 4.0
2 0.0 -1.2 98 4 2021-03-01 2 1 3 2021 1 ... True Winter False Night no rain 2 Calm / Light Breeze -1 0.0 4.0
3 0.0 -0.9 100 5 2021-03-01 3 1 3 2021 0 ... True Winter False Night no rain 2 Calm / Light Breeze -1 0.0 4.0
4 0.0 0.0 100 6 2021-03-01 4 1 3 2021 0 ... True Winter False Night no rain 2 Calm / Light Breeze 0 0.0 4.0

5 rows × 23 columns

In [5]:
# EDA libraries
# from pandas_profiling import ProfileReport
# from dataprep.eda import create_report
# import sweetviz as sv

# Dataprep Report

# hourly_data_report = create_report(hourly_data, title='Hourly Data Report', progress=False)
# hourly_data_report.save('../reports/dataprep/hourly_data_report')
# hourly_rentals_report = create_report(hourly_rentals, title='Hourly Data Report (Only Rentals)', progress=False)
# hourly_rentals_report.save('../reports/dataprep/hourly_rentals_report')

# Pandas Profiling

# profile = ProfileReport(hourly_data, title='Hourly Data', html={'style':{'full_width':True}})
# profile.to_file(output_file='../reports/pandas-profiling/hourly_data_report.html')

# profile = ProfileReport(hourly_rentals, title='Hourly Data Report (Only Rentals)', html={'style':{'full_width':True}})
# profile.to_file(output_file='../reports/pandas-profiling/hourly_rentals_report.html')

# to_notebook_iframe() will print in the jupyter notebook's cell
# profile.to_notebook_iframe()

# SweetViz

# hourlydata_sweetviz_report = sv.analyze(hourly_data, target_feat='count')
# hourlydata_sweetviz_report.show_html('../reports/sweetviz/hourly_data_report.html')

# hourlyrentals_sweetviz_report = sv.analyze(hourly_rentals, target_feat='count')
# hourlyrentals_sweetviz_report.show_html('../reports/sweetviz/hourly_rentals_report.html')

Battery distribution

In [6]:
battery_dist_df = all_data[all_data['duration'] > 0].copy()

def group_battery_status(df):

    bins= [0,30,50,80,100]
    labels = ['< 30%','30% - 50%','50% - 80%','> 80%']
    df['battery_status'] = pd.cut(df['start_battery'], bins=bins, labels=labels, right=False)

    s = df.battery_status
    counts = s.value_counts()
    percent = s.value_counts(normalize=True)
    df_summary = pd.DataFrame({'counts': counts, 'per': percent}, labels)
    df_summary['per100'] = round((df_summary['per']*100),2).astype(str) + '%'
    return df_summary

battery_dist = group_battery_status(battery_dist_df)
counts per per100
< 30% 4482 0.086272 8.63%
30% - 50% 9552 0.183862 18.39%
50% - 80% 22236 0.428010 42.8%
> 80% 15682 0.301856 30.19%

Initial battery density

In [7]:
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(data=battery_dist_df, x='start_battery', kde=True)

Duration of rentals

In [8]:
avg_rental_duration = f"Average rentals duration: {round(all_data['duration'].mean(), 2)} minutes"
In [9]:
fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(22, 12))

# fig.text(0.125,1.1, avg_rental_duration, fontfamily='serif',fontsize=14, fontweight='bold', color='#444444')

season_rentduration = all_data.groupby(['season'])['duration'].mean()
sns.barplot(x=season_rentduration.values, y=season_rentduration.index, ax=ax[0][0], orient='h', palette='husl')
ax[0][0].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals by season')

day_of_week_rentduration = all_data.groupby(['dayofweek'])['duration'].mean()
sns.barplot(x=day_of_week_rentduration.values, y=day_of_week_rentduration.index, ci=None, ax=ax[1][0],
            order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], orient='h', palette='husl')
ax[1][0].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals across all days of week')

holiday_rentduration = all_data.groupby(['holiday'])['duration'].mean()
sns.barplot(x=holiday_rentduration.values, y=holiday_rentduration.index, ax=ax[0][1], orient='h', palette='husl')
ax[0][1].set(xlabel='Duration (minutes)', ylabel='Bank Holiday', title='Average duration (in minutes) of rentals by Bank Holiday')
# ax[0][1].bar_label(ax[0][1].containers[0])

timesofday_rentduration = all_data.groupby(['timesofday'])['duration'].mean()
sns.barplot(x=timesofday_rentduration.values, y=timesofday_rentduration.index, ax=ax[1][1],
            order=['Morning','Afternoon','Evening','Night'], orient='h', palette='husl')
ax[1][1].set(xlabel='Duration (minutes)', ylabel='', title='Average duration (in minutes) of rentals across Times of the Day')


We can spot a few patterns when in colder days (Autumn and Winter | Night), the duration of rentals on average is higher. It seems that, on average, customers stay longer with bikes on early days of the week (Monday, Tuesday and Wednesday). On the other hand, it does not seem to be a difference on average rentals duration on Bank Holidays.

Total number of rentals

In [10]:
fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(22, 12))

            x='count', y='season', order=['Winter','Spring','Summer','Autumn'], ci=None, ax=ax[0][0])
ax[0][0].set(xlabel='Number of Rentals', ylabel='Season', title='Rentals across all seasons')

day_of_week = df_train.groupby('dayofweek')['count'].agg('sum').reset_index(name='count')
sns.barplot(data=day_of_week, x='count', y='dayofweek', ci=None, ax=ax[1][0],
ax[1][0].set(xlabel='Number of Rentals', ylabel='Day of Week', title='Rentals across all days of week')

sns.barplot(data=df_train, x='count', y='working_day', ci=None, ax=ax[0][1], orient='h')
# sns.lineplot(data=df_train , x="hour", y="count", hue="working_day", ax=ax[0][1], ci=None)
ax[0][1].set(xlabel='Number of Rentals', ylabel='Working Day', title='Rentals across Working Days')

sns.barplot(data=df_train, x='count', y='timesofday', ci=None, ax=ax[1][1],
ax[1][1].set(xlabel='Number of Rentals', ylabel='Period of the Day', title='Rentals across Times of the Day')


As expected, in the Spring and Summer had more rentals than on Autumn and Winter as the weather impacts directly the number of rentals and those seasons usually have better weather. Also, the number of rentals is significant higher in the Afternoon followed by Mornining, Evening and then Night. Again, the number of rentals do not seem to have a significant difference on Weekends/Holidays.

As consequence of the duration of rentals being higher during early weekdays, we see that Thursday, Friday and Saturday have a higher flux of new rentals.

Hourly number of rentals

In [11]:
fig, ax = plt.subplots(nrows=2,ncols=2,figsize=(26, 12))

sns.lineplot(data=df_train, x='hour', y='count', hue='season', hue_order=['Winter','Spring','Summer','Autumn'], ax=ax[0][0], ci=None)
ax[0][0].set(xlabel='Hour', ylabel='', title='Seasons')

sns.lineplot(data=df_train, x='hour', y='count', hue='dayofweek', hue_order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], ax=ax[1][0], ci=None)
ax[1][0].set(xlabel='Hour', ylabel='', title='Days of week')

sns.pointplot(data=df_train, x="hour", y="count", hue="working_day", ax=ax[0][1], ci=None)
ax[0][1].set(xlabel='Hour', ylabel='', title='Working Days')

sns.pointplot(data=df_train, x='hour', y='count', hue='timesofday', hue_order=['Morning','Afternoon','Evening','Night'], ax=ax[1][1], ci=None)
ax[1][1].set(xlabel='Hour', ylabel='', title='Times of the Day')


When we plot the number o rentals on each hour of day, we notice that no discrepancy's found on different Seasons, Weekends/Holidays and even week days follow a similar pattern throughout the day.

BoxPlot analyses

In [12]:
fig, axes = plt.subplots(nrows=2,ncols=4, figsize=(30, 16))

            order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
sns.boxplot(data=df_train,y="count",x="timesofday",orient="v",ax=axes[0][2], order=['Morning', 'Afternoon', 'Evening', 'Night'])
            order=['no rain', 'drizzle', 'light rain', 'moderate rain', 'heavy rain'])

monthly_data = df_train[['month','count']].copy()
month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
monthly_data['month_str'] = df_train['month'].map(lambda d : month_map[d])
sns.boxplot(data=monthly_data, x="month_str", y="count", orient="v",ax=axes[0][3], order=month_map.values())
sns.boxplot(data=df_train, x="holiday", y="count", orient="v",ax=axes[1][3])

axes[0][0].set(ylabel='Count',title="Box Plot On Count")
axes[0][1].set(xlabel='', ylabel='',title="Box Plot On Count Across Seasons")
axes[1][0].set(xlabel='', ylabel='',title="Box Plot On Count by Day of Week")
axes[1][1].set(xlabel='Working Day', ylabel='',title="Box Plot On Count by Working Day")
axes[0][2].set(xlabel='', ylabel='',title="Box Plot On Count by Times of Day")
axes[1][2].set(xlabel='Rainfall Intensity Levels', ylabel='',title="Box Plot On Count by Rainfall Intensity")
axes[0][3].set(xlabel='', ylabel='', title="Box Plot On Montly Rentals Count")
axes[1][3].set(xlabel='Bank Holiday', ylabel='', title="Box Plot On Count by Working Day")


Main findings:

  • Bikes are, on average, less rented on Bank Holidays. This could be because Moby Bikes have a subscription plan and this could impact if people subscribe to commute, for example.
  • Apart from Moderate Rain, we do not see a big mean difference on Rainfall Intensity Levels.
  • We already saw on above plots that in the Afternoon where most of the bikes are rented and the same pattern is reflected on the means values.
  • Monthly average is also presenting some patterns that could indicate time series seasonality.
In [13]:
fig, ax = plt.subplots(figsize=(18, 6))
sns.boxplot(data=df_train, x="hour", y="count", orient="v",ax=ax)
ax.set(xlabel='Hour', ylabel='Count')
ax.set_title(fontsize=20, label="Box Plot On Hourly Rentals Count")

As demonstrated above by line plots and from the boxplot, in the afternoon bikes are more rented on average and in total. Hourly means have demonstrated being very scattered.

In [14]:
fig = plt.figure(figsize=(18,10))
gs = fig.add_gridspec(2, 2)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[1, :])

sns.boxplot(data=df_train, x="temp_bin", y="count", orient="v",ax=ax0)
sns.boxplot(data=df_train, x="rhum_bin", y="count", orient="v",ax=ax1)
sns.boxplot(data=df_train, x="wind_speed_group", y="count", orient="v",ax=ax2,
            order=['Calm / Light Breeze', 'Breeze', 'Moderate Breeze', 'Strong Breeze / Near Gale','Gale / Storm'])

ax0.set(xlabel='Temperature (bins - kmeans)',title="BoxPlot - Temperature")
ax1.set(xlabel='Relative Humidity (bins - kmeans)',title="BoxPlot - Relative Humidity")
ax2.set(xlabel='',title="BoxPlot - Wind Speed Levels")


Weather features, as temperature and humidity, have visually mean differences and it makes sense when deciding to use a bike. Wind Speed seems to decrease usage on average only when it is strong and storms.

Kruskal-Wallis H-test on categorical features

The ANOVA test has important assumptions that must be satisfied in order for the associated p-value to be valid.

  • The samples are independent.
  • Each sample is from a normally distributed population.
  • The population standard deviations of the groups are all equal. This property is known as homoscedasticity.

count feature does not have normal distribution, using non-parametric Kruskal-Wallis H-test instead

In [15]:
# Defining a function to find the statistical relationship with all the categorical variables
def kruskal_test_categorical_features(df, target, cat_vars):
    from scipy.stats import kruskal

    # Creating an empty list of final selected predictors
    print('##### Kruskal-Wallis H-test Results ##### \n')
    for predictor in cat_vars:
        kruskal_res = kruskal(*cat_grouplist) #  Kruskal-Wallis H-test
        # If the ANOVA P-Value is <0.05, that means we reject H0
        if (kruskal_res[1] < 0.05):
            print(predictor, 'is correlated with', target, '| p-value:', kruskal_res[1])
            print(predictor, 'is NOT correlated with', target, '| p-value:', kruskal_res[1])
In [16]:
# Calling the function to check which categorical variables are correlated with target
    ['season', 'hour', 'holiday', 'working_day', 'peak', 'timesofday', 'rainfall_intensity', 'dayofweek', 'temp_bin', 'rhum_bin', 'wind_speed_group', 'wind_bft']
kruskal_test_categorical_features(df=df_train, target='count', cat_vars=cat_vars)
##### Kruskal-Wallis H-test Results ##### 

season is correlated with count | p-value: 3.235207575592282e-18
hour is correlated with count | p-value: 0.0
holiday is correlated with count | p-value: 0.0022325129212506403
working_day is NOT correlated with count | p-value: 0.8372798268983568
peak is correlated with count | p-value: 1.8482408173644874e-119
timesofday is correlated with count | p-value: 0.0
rainfall_intensity is correlated with count | p-value: 3.2862191459573774e-28
dayofweek is correlated with count | p-value: 7.741882355779844e-07
temp_bin is correlated with count | p-value: 4.5486744761349474e-247
rhum_bin is correlated with count | p-value: 0.0
wind_speed_group is correlated with count | p-value: 1.0308324657554141e-28
wind_bft is correlated with count | p-value: 1.810350081440984e-36

Correlation Matrix

In [17]:
corrMatt = df_train[['temp','wdsp','rhum','count']].corr()
mask = np.array(corrMatt)
mask[np.tril_indices_from(mask)] = False
cmap = sns.diverging_palette(220, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrMatt, mask=mask,vmax=.3, annot=True, ax=ax, cmap=cmap)

df1Corr=pd.DataFrame(corrMatt.drop('count').unstack().sort_values(ascending=False)['count'],columns=['Correlation to the target'])
df1Corr.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))
  Correlation to the target
temp 0.337711
wdsp 0.058572
rhum -0.478805

Distribution of numerical features

In [18]:
fig = plt.figure(figsize=(22,6))
gs = fig.add_gridspec(2, 3)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])

sns.histplot(df_train['temp'],ax=ax0, stat='density', kde=True)
sns.histplot(df_train['rhum'],ax=ax1, stat='density', kde=True)
sns.histplot(df_train['wdsp'],ax=ax2, stat='density', kde=True)

ax0.set(xlabel='Temperature',title="Distribution - Temperature")
ax1.set(xlabel='Relative Humidity',title="Distribution - Relative Humidity")
ax2.set(xlabel='Wind Speed',title="Distribution - Wind Speed")


Relations between Numerical Features and Target variable

In [19]:

Temperature and Relative Humidity show a strong correlation with Rentals Count (positive and negative, respectively) but Wind Speed does not seem to be correlated with Rentals Count.

In [20]:
pp = sns.pairplot(df_train, 
             y_vars='count', height=8, aspect=0.8, kind="reg", hue='season')
pp.fig.suptitle("Relationship between Wind Speed vs Rentals Count by Seasion")