This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes
MIT Licensed
Author: Leandro Pessini
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
from datetime import datetime
import scipy.stats as stats
from patsy import dmatrices
# statsmodels
from statsmodels.stats import diagnostic as diag
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
import warnings
warnings.simplefilter('ignore', FutureWarning)
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w
df_train = pd.read_csv('../data/interim/dfsample_train.csv', parse_dates=['date'])
df_test = pd.read_csv('../data/interim/dfsample_test.csv', parse_dates=['date'])
all_data = pd.read_csv('../data/interim/all_data.csv',
parse_dates=['lastrentalstart','lastgpstime','date'])
df_train.head()
Hourly trend: Demand is likely driven by people commuting to work. Early morning and late evening may show a different pattern (leisure cyclists), with low demand between 10:00 pm and 4:00 am.
Daily trend: Users demand more bikes on weekdays than on weekends or holidays.
Rain: Demand will be lower on a rainy day than on a sunny day. Similarly, higher humidity lowers demand, and vice versa.
Temperature: In Ireland, temperature is positively correlated with bike demand.
Traffic: Traffic can be positively correlated with bike demand. Heavier traffic may push people towards bikes over other road transport such as cars and taxis.
New features:
- holiday
- working_day
- peak
- season: (Spring / Summer / Autumn / Winter)
- duration: duration of the rental

from functools import lru_cache
@lru_cache(maxsize=None)
def get_irish_bank_holidays() -> np.ndarray:
    '''
    Returns a NumPy array of Irish bank holiday dates
    '''
    bank_holidays = pd.read_json('../data/external/irishcalendar.json')
    bank_holidays['date'] = pd.to_datetime(arg=bank_holidays['date'], utc=True, infer_datetime_format=True)
    bank_holidays['dt'] = pd.to_datetime(bank_holidays['date'].dt.date)
    bank_holidays = bank_holidays[bank_holidays['type'] == 'National holiday']
    return bank_holidays['dt'].values  # np.ndarray
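Since get_irish_bank_holidays is called once per row below, lru_cache keeps the JSON calendar from being re-read from disk on every call. A quick, illustrative check of the memoization:

# first call reads ../data/external/irishcalendar.json; the second is served from the cache
holidays = get_irish_bank_holidays()
assert holidays is get_irish_bank_holidays()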
def isBankHoliday(date: pd.Timestamp) -> bool:
    '''
    Receives a date and returns whether it is an Irish bank holiday
    '''
    bank_holidays = get_irish_bank_holidays()
    return pd.Timestamp(date) in bank_holidays
# holiday
# Example: isBankHoliday(datetime(2020, 3, 17))
df_train['holiday'] = df_train['date'].map(isBankHoliday)
df_test['holiday'] = df_test['date'].map(isBankHoliday)
all_data['holiday'] = all_data['date'].map(isBankHoliday)
def get_day_of_week_number(date: pd.Timestamp) -> int:
    return date.dayofweek

def get_day_of_week_str(date: pd.Timestamp) -> str:
    return date.day_name()

def isWorkingDay(date: pd.Timestamp) -> bool:
    return get_day_of_week_number(date) < 5
# day of the week
df_train['dayofweek_n'] = df_train['date'].map(get_day_of_week_number)
df_train['dayofweek'] = df_train['date'].map(get_day_of_week_str)
df_test['dayofweek_n'] = df_test['date'].map(get_day_of_week_number)
df_test['dayofweek'] = df_test['date'].map(get_day_of_week_str)
all_data['dayofweek'] = all_data['date'].map(get_day_of_week_str)
# working day: Monday=0 ... Sunday=6
# True for Monday-Friday (0-4) when it is not a bank holiday
df_train['working_day'] = df_train['date'].map(isWorkingDay)
df_test['working_day'] = df_test['date'].map(isWorkingDay)
all_data['working_day'] = all_data['date'].map(isWorkingDay)
# set working_day to False on National Bank Holidays
df_train.loc[ df_train['holiday'] , 'working_day'] = False
df_test.loc[ df_test['holiday'] , 'working_day'] = False
all_data.loc[ all_data['holiday'] , 'working_day'] = False
def get_season(date: pd.Timestamp) -> str:
    '''
    Receives a date and returns the corresponding season:
    Spring | Summer | Autumn | Winter
    Vernal equinox (about March 21): day and night of equal length, marking the start of spring
    Summer solstice (June 20 or 21): longest day of the year, marking the start of summer
    Autumnal equinox (about September 23): day and night of equal length, marking the start of autumn
    Winter solstice (December 21 or 22): shortest day of the year, marking the start of winter
    '''
    Y = 2000  # dummy leap year to allow the input X-02-29 (leap day)
    seasons = [('Winter', (datetime(Y, 1, 1), datetime(Y, 3, 20))),
               ('Spring', (datetime(Y, 3, 21), datetime(Y, 6, 20))),
               ('Summer', (datetime(Y, 6, 21), datetime(Y, 9, 22))),
               ('Autumn', (datetime(Y, 9, 23), datetime(Y, 12, 20))),
               ('Winter', (datetime(Y, 12, 21), datetime(Y, 12, 31)))]
    date = date.replace(year=Y)
    return next(season for season, (start, end) in seasons if start <= date <= end)
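A couple of spot checks on the season boundaries (dates are illustrative):

# 20 March is still winter; spring starts on 21 March
assert get_season(pd.Timestamp('2021-03-20')) == 'Winter'
assert get_season(pd.Timestamp('2021-03-21')) == 'Spring'
assert get_season(pd.Timestamp('2021-12-25')) == 'Winter'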
df_train['season'] = df_train['date'].map(get_season)
df_test['season'] = df_test['date'].map(get_season)
all_data['season'] = all_data['date'].map(get_season)
def isPeakHour(hour: int, date: pd.Timestamp) -> bool:
    '''
    Receives an hour and a date and returns whether it is a peak hour
    Source: https://www.independent.ie/irish-news/the-new-commuter-hour-peak-times-increase-with-record-traffic-volumes-36903431.html
    '''
    return isWorkingDay(date) and (6 <= hour <= 10 or 15 <= hour <= 19)
df_train['peak'] = list(map(isPeakHour, df_train['hour'], df_train['date']))
df_test['peak'] = list(map(isPeakHour, df_test['hour'], df_test['date']))
all_data['peak'] = list(map(isPeakHour, all_data['hour'], all_data['date']))
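A few illustrative checks (dates chosen arbitrarily):

# a weekday at 08:00 is peak; the same hour on a Saturday is not, nor is midday
assert isPeakHour(8, pd.Timestamp('2021-07-05'))       # Monday
assert not isPeakHour(8, pd.Timestamp('2021-07-10'))   # Saturday
assert not isPeakHour(12, pd.Timestamp('2021-07-05'))  # off-peak midday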
def times_of_day(hour: int) -> str:
    '''
    Receives an hour and returns the time of day
    Morning: 07:00 - 11:59
    Afternoon: 12:00 - 17:59
    Evening: 18:00 - 22:59
    Night: 23:00 - 06:59
    '''
    if 7 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 18:
        return 'Afternoon'
    if 18 <= hour < 23:
        return 'Evening'
    return 'Night'  # 23:00 - 06:59
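Spot checks on the hour boundaries:

assert times_of_day(3) == 'Night'
assert times_of_day(9) == 'Morning'
assert times_of_day(12) == 'Afternoon'
assert times_of_day(18) == 'Evening'
assert times_of_day(23) == 'Night'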
df_train['timesofday'] = df_train['hour'].map(times_of_day)
df_test['timesofday'] = df_test['hour'].map(times_of_day)
all_data['timesofday'] = all_data['hour'].map(times_of_day)
Level | Rainfall Intensity (mm) |
---|---|
no rain | 0 |
drizzle | 0.1~0.3 |
light rain | 0.3~0.5 |
moderate rain | 0.5~4 |
heavy rain | >4 |
Source: https://www.metoffice.gov.uk/research/library-and-archive/publications/factsheets
PDF direct link: Water in the atmosphere
Precipitation unit: Rain will be output in millimetres (mm).
The minvalue, value and maxvalue fields are derived from statistical analysis of the forecast and refer to the lower (20th percentile), middle (60th percentile) and upper (80th percentile) expected amounts. If minvalue and maxvalue are not output, value is the basic forecast amount.
<precipitation unit="mm" value="0.0" minvalue="0.0" maxvalue="0.1"/>
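If you need to pull those attributes out of the raw forecast XML, here is a minimal sketch with the standard library (the element is the example above; attribute names follow the met.no format):

import xml.etree.ElementTree as ET
elem = ET.fromstring('<precipitation unit="mm" value="0.0" minvalue="0.0" maxvalue="0.1"/>')
rain_value = float(elem.get('value'))               # basic forecast amount (60th percentile)
rain_min = float(elem.get('minvalue', rain_value))  # 20th percentile, if present
rain_max = float(elem.get('maxvalue', rain_value))  # 80th percentile, if present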
def rain_intensity_level(rain: float) -> str:
    '''
    Receives a rain intensity (in mm) and returns the rain intensity level
    '''
    if rain == 0.0:
        return 'no rain'
    if rain <= 0.3:
        return 'drizzle'
    if rain <= 0.5:
        return 'light rain'
    if rain <= 4:
        return 'moderate rain'
    return 'heavy rain'
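Spot checks against the intensity table above:

assert rain_intensity_level(0.0) == 'no rain'
assert rain_intensity_level(0.2) == 'drizzle'
assert rain_intensity_level(0.4) == 'light rain'
assert rain_intensity_level(2.0) == 'moderate rain'
assert rain_intensity_level(5.5) == 'heavy rain'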
df_train['rainfall_intensity'] = df_train['rain'].map(rain_intensity_level)
df_test['rainfall_intensity'] = df_test['rain'].map(rain_intensity_level)
all_data['rainfall_intensity'] = all_data['rain'].map(rain_intensity_level)
df_train['rainfall_intensity'].value_counts()
def scale(value, factor):
    """
    Multiply value by factor, allowing for None values.
    """
    return None if value is None else value * factor

def wind_ms(kn):
    """
    Convert wind speed from knots to metres per second
    """
    return scale(kn, 0.514)

def wind_kn(ms):
    """
    Convert wind speed from metres per second to knots
    """
    return scale(ms, 3.6 / 1.852)

def wind_bft(ms):
    """
    Convert wind speed from metres per second to the Beaufort scale
    """
    # upper bounds (m/s) for Beaufort forces 0-11; anything above is force 12
    _bft_threshold = (0.3, 1.5, 3.4, 5.4, 7.9, 10.7, 13.8, 17.1, 20.7, 24.4, 28.4, 32.6)
    if ms is None:
        return None
    return next((bft for bft in range(len(_bft_threshold)) if ms < _bft_threshold[bft]),
                len(_bft_threshold))
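A quick sanity check of the conversion chain (wdsp is assumed to be in knots, as the conversion below implies):

assert abs(wind_ms(10) - 5.14) < 1e-9   # 10 kn ≈ 5.14 m/s
assert wind_bft(wind_ms(10)) == 3       # 5.14 m/s falls within Beaufort force 3 (3.4 - 5.4 m/s)
assert wind_bft(0.0) == 0               # calm
assert wind_bft(None) is None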
df_train['wind_bft'] = df_train.apply(lambda row: wind_bft(wind_ms(row.wdsp)), axis=1)
df_test['wind_bft'] = df_test.apply(lambda row: wind_bft(wind_ms(row.wdsp)), axis=1)
all_data['wind_bft'] = all_data.apply(lambda row: wind_bft(wind_ms(row.wdsp)), axis=1)
df_train['wind_bft'].value_counts().sort_values(ascending=False)
Level | Beaufort scale |
---|---|
Calm / Light Breeze | 0-2 |
Breeze | 3 |
Moderate Breeze | 4-5 |
Strong Breeze / Near Gale | 6-7 |
Gale / Storm | 8-12 |
def group_wind_speed(beaufort_scale: float) -> str:
    '''
    Receives a Beaufort scale value and returns the wind speed group
    '''
    if beaufort_scale < 3:
        return 'Calm / Light Breeze'
    if beaufort_scale == 3:
        return 'Breeze'
    if beaufort_scale < 6:
        return 'Moderate Breeze'
    if beaufort_scale < 8:
        return 'Strong Breeze / Near Gale'
    return 'Gale / Storm'
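And the corresponding spot checks against the Beaufort grouping table:

assert group_wind_speed(2) == 'Calm / Light Breeze'
assert group_wind_speed(3) == 'Breeze'
assert group_wind_speed(5) == 'Moderate Breeze'
assert group_wind_speed(7) == 'Strong Breeze / Near Gale'
assert group_wind_speed(9) == 'Gale / Storm'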
df_train['wind_speed_group'] = df_train['wind_bft'].map(group_wind_speed)
df_test['wind_speed_group'] = df_test['wind_bft'].map(group_wind_speed)
all_data['wind_speed_group'] = all_data['wind_bft'].map(group_wind_speed)
Capturing the relationship with temperature as a continuous variable can be hard for machine learning algorithms because the range is too wide. Temperatures of 13.4°C and 13.9°C, or even 13°C and 15°C, are practically the same when deciding whether or not to go cycling. The same rationale applies to humidity and wind speed.
def round_up(x):
    '''
    Helper function to round halves away from zero
    '''
    from math import copysign
    return int(x + copysign(0.5, x))
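Unlike Python's built-in round, which rounds halves to the nearest even number (banker's rounding), this helper always rounds halves away from zero:

assert round_up(13.5) == 14 and round(13.5) == 14
assert round_up(12.5) == 13 and round(12.5) == 12   # round() goes to the even neighbour
assert round_up(-13.5) == -14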
df_train['temp_r'] = df_train['temp'].apply(round_up)
df_test['temp_r'] = df_test['temp'].apply(round_up)
all_data['temp_r'] = all_data['temp'].apply(round_up)
KBinsDiscretizer - Bin continuous data into intervals.
from sklearn.preprocessing import KBinsDiscretizer
# transform the temperature with KBinsDiscretizer
enc_temp = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy='kmeans')
df_train['temp_bin'] = enc_temp.fit_transform(df_train['temp'].array.reshape(-1,1))
df_test['temp_bin'] = enc_temp.transform(df_test['temp'].array.reshape(-1,1))
all_data['temp_bin'] = enc_temp.transform(all_data['temp'].array.reshape(-1,1))
# transform the humidity with KBinsDiscretizer
enc_hum = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy='kmeans')
df_train['rhum_bin'] = enc_hum.fit_transform(df_train['rhum'].array.reshape(-1,1))
df_test['rhum_bin'] = enc_hum.transform(df_test['rhum'].array.reshape(-1,1))
all_data['rhum_bin'] = enc_hum.transform(all_data['rhum'].array.reshape(-1,1))
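The k-means bin edges learned from the training data can be inspected on the fitted discretizers; fitting only on df_train and reusing transform on the other sets avoids data leakage:

print(enc_temp.bin_edges_[0])   # 10 temperature bins -> 11 edges
print(enc_hum.bin_edges_[0])    # 5 humidity bins -> 6 edges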
Period of use
"5.1 Bikes should not be used for more than 19 consecutive hours, this is the maximum period of use." General Terms and Conditions (“GTC”)
* Assumption: due to the lack of more detailed data, I assume that when a new bike rental starts, the duration in minutes is given by: $$ RentalDuration = LastGPSTime - LastRentalStart $$
# time of rental in minutes (lastgpstime - rental-start)
all_data['duration'] = (all_data['lastgpstime'] - all_data['lastrentalstart']) / pd.Timedelta(minutes=1)
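For example, a rental whose last GPS ping arrives 45 minutes after it starts gets a duration of 45.0; dividing a Timedelta by pd.Timedelta(minutes=1) yields minutes as a float:

assert (pd.Timestamp('2021-07-05 10:45') - pd.Timestamp('2021-07-05 10:00')) / pd.Timedelta(minutes=1) == 45.0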
A few GPS trackers froze and stopped sending accurate data back, which would bias the rental durations.
To prevent any inaccurate information, these records (negative durations) are set to 0.
all_data[all_data['duration'] < 0].shape[0]
all_data['duration'] = np.where(all_data['duration'] < 0, 0, all_data['duration'])
all_data[all_data['duration'] < 0].shape[0]
df_train.to_csv('../data/processed/df_train.csv', index=False)
df_test.to_csv('../data/processed/df_test.csv', index=False)
all_data.to_csv('../data/processed/rentals_data.csv', index=False)
GitHub repository
Author: Leandro Pessini