diff --git a/cogo/simulation.py b/cogo/simulation.py index e3df5bc..547248f 100644 --- a/cogo/simulation.py +++ b/cogo/simulation.py @@ -1,283 +1,294 @@ import math import sys import pandas as pd import numpy as np import logging logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout, level=logging.INFO) logger.setLevel(logging.INFO) # Initialize stations with N of M bikes # # For each tick: # For each station: # - determine if a departure should occur (sample from geometric # distribution given that station's inter-departure interval # for that time of day) # If departure: # If n > 0 bikes available: # - determine destination station # - push bike to global 'in transit' list w/departure time # - pop bike from station (free a dock) # - reset station's time since last depart counter # Else: # - panic # Else: # - increment station's time since last depart counter # # For each undocked bike: # - Decrement remaining travel time by 1 tick # If remaining travel time == 0: # If n > 0 available docks at destination station: # - pop bike from in transit list # - push bike to station (lease a dock) # Else: # - panic class Station: def __init__(self, bikeshare_id: int, docks: int): self.bikeshare_id = bikeshare_id self.docks = docks self.docked_bikes = [] self.time_since_last_departure = 0 self.time_until_next_departure = 0 self.interlinks = { 'probability': {}, 'travel_time': {} } self.inter_departure_times = dict() def _check_available_docks(self): if self.docks > len(self.docked_bikes): return True return False def lease_dock(self, bike): if self._check_available_docks(): bike.is_docked = True self.docked_bikes.append(bike) return True else: return False def release_dock(self, destination, transit_time): if self.docked_bikes: # Let's be nice and use FIFO so everything cycles out a little # more regularly _undocked_bike = self.docked_bikes[0] _undocked_bike.is_docked = False _undocked_bike.last_departure_from = self.bikeshare_id _undocked_bike.next_arrival_to = destination _undocked_bike.remaining_transit_time = int( transit_time.total_seconds()) self.docked_bikes.pop(0) return _undocked_bike else: return None def should_bike_depart(self, hour: str): if self.time_until_next_departure == 0: # This will return the number of trials needed until # a success event occurs, sampled from a geometric # distribution, with probability, p, of an individual # success given by 1 / inter_departure_time _inter_departure_interval = self.inter_departure_times.get(hour) if not _inter_departure_interval: _inter_departure_interval = np.mean([ val for val in self.inter_departure_times.values() ]) self.time_until_next_departure = np.random.geometric( p=1/_inter_departure_interval) self.time_since_last_departure = 0 return True self.time_until_next_departure -= 1 self.time_since_last_departure += 1 return False def get_destination_station(self): station = np.random.choice( self.interlinks['station'], size=1, replace=False, p=self.interlinks['probability'] )[0] _idx = self.interlinks['station'].index(station) return (station, self.interlinks['travel_time'][_idx]) class Bike: def __init__(self, id: int): self.bike_id = id self.is_docked = True self.last_departure_from = 0 self.next_arrival_to = 0 self.remaining_transit_time = 0 class Orchestrator: def __init__(self, cogo_stations: pd.DataFrame, station_crosslinks: pd.DataFrame, hourly_trips: pd.DataFrame, bike_count: int): station_crosslinks = station_crosslinks.loc[ station_crosslinks['stop_station_id'].isin( cogo_stations['BIKESHARE_ID'] ) ] self.stations = self._instantiate_stations( cogo_stations, station_crosslinks, hourly_trips) self.bikes_in_transit = [] self.simulation_statistics = pd.DataFrame( columns=[ 'tick_number', 'timestamp', 'station_id', 'available_docks', 'used_docks'] ) + self.delays = [] self._distribute_bikes(bike_count) def _instantiate_stations(self, _cogo_stations, _station_crosslinks, _hourly_trips): stations = dict() for idx, row in _cogo_stations.iterrows(): _id = row['BIKESHARE_ID'] _docks = row['DOCKS'] station = Station(_id, _docks) station.interlinks = { 'station': _station_crosslinks.loc[ _station_crosslinks['start_station_id'] == _id, 'stop_station_id' ].tolist(), 'probability': _station_crosslinks.loc[ _station_crosslinks['start_station_id'] == _id, 'arrival_probability' ].tolist(), 'travel_time': _station_crosslinks.loc[ _station_crosslinks['start_station_id'] == _id, 'average_trip_duration' ].tolist() } if not station.interlinks['station']: # Our station data include some more recent entries than # our trip data, so we should probably make some minimal # effort to keep everything from catching fire when we # encounter them... continue station.interlinks['probability'] = [ float(prob) / sum(station.interlinks['probability']) for prob in station.interlinks['probability'] ] station.inter_departure_times = dict( (row['hour'], row['inter_departure_time']) for _, row in _hourly_trips.loc[ _hourly_trips['station_id'] == _id].iterrows() ) stations[_id] = station return stations def _distribute_bikes(self, _bike_count): _stationlist = list(self.stations.keys()) _counter = 0 for idx in range(_bike_count): _bike = Bike(idx) _lastval = _counter while ( len(self.stations[_stationlist[_counter]].docked_bikes) / self.stations[_stationlist[_counter]].docks ) > 0.9: if _counter == (len(_stationlist) - 1): _counter = 0 else: _counter += 1 if _counter == _lastval: raise Exception( 'You have tried to provision too many bikes for the ' + 'number of docks available. Please reduce the ' + 'number of bikes, or increase the available docks.') self.stations[_stationlist[_counter]].lease_dock(_bike) if _counter == (len(_stationlist) - 1): _counter = 0 else: _counter += 1 return def run_simulation(self, start_hour: int, num_ticks: int): tick = 0 while tick < num_ticks: _current_hour = start_hour + (tick // 60) if _current_hour < 10: _current_hour = '0' + str(_current_hour) else: _current_hour = str(_current_hour) _timestamp = _current_hour + ':' + ( str(tick % 60) if tick % 60 > 9 else '0' + str(tick % 60)) for station in self.stations.values(): if station.should_bike_depart(_current_hour): _dest, _travel_time = station.get_destination_station() _lease = station.release_dock(_dest, _travel_time) if _lease: - logger.info( + logger.debug( f'Leased bike {_lease.bike_id} at {_timestamp} ' f'from station {_lease.last_departure_from}, ' f'heading to station {_lease.next_arrival_to}' ) self.bikes_in_transit.append(_lease) else: logger.debug( 'No bike was available from station ' f'{station.bikeshare_id} at ' f'{_timestamp}.' ' The customer was sad.') + self.delays.append({ + 'timestamp': _timestamp, + 'station': station.bikeshare_id, + 'description': 'no bike was available' + }) _stats = pd.DataFrame( data={ 'tick_number': tick, 'timestamp': f'{_timestamp}', 'station_id': station.bikeshare_id, 'available_docks': ( station.docks - len(station.docked_bikes)), 'used_docks': len(station.docked_bikes)}, index=[1] ) self.simulation_statistics = self.simulation_statistics.append( _stats, ignore_index=True ) for bike in self.bikes_in_transit: logger.debug( f'Bike {bike.bike_id} has {bike.remaining_transit_time} ' f'left until it reaches station {bike.next_arrival_to}') bike.remaining_transit_time -= 60 # transit time in seconds if bike.remaining_transit_time <= 0: logger.debug( f'Bike {bike.bike_id} has arrived at' f'station {bike.next_arrival_to}' ) _lease = ( self.stations[bike.next_arrival_to].lease_dock(bike)) if _lease is True: self.bikes_in_transit.pop( self.bikes_in_transit.index(bike)) else: logger.debug( 'No docks available at station ' f'{bike.next_arrival_to}... Waiting for one' 'to become available.') + self.delays.append({ + 'timestamp': _timestamp, + 'station': bike.next_arrival_to, + 'description': 'no dock was available' + }) tick += 1 diff --git a/notebooks/cogo_trip_analysis.ipynb b/notebooks/cogo_trip_analysis.ipynb index 7642fa3..51282c8 100644 --- a/notebooks/cogo_trip_analysis.ipynb +++ b/notebooks/cogo_trip_analysis.ipynb @@ -1,174 +1,348 @@ { "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "import statistics\n", "import folium\n", "import numpy as np\n", "import pandas as pd\n", "import os\n", + "import sys\n", + "import logging\n", "import warnings\n", "from pathlib import Path\n", "from IPython.display import Image, display\n", "\n", "from cogo import plotting, data_prep, simulation\n", "\n", "\n", + "logger = logging.getLogger(__name__)\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Analysis overview\n", "\n", "At a high level, we're looking at rider volume of COGO bikeshare bikes. The plan of analysis I'm taking is to view this almost as a queueing problem, and to simulate inbound and outbound volume at each station over some arbitrary time period, and with respect to the simultaneous activity at every other station. Ultimately, I'm interested in identifying stations that can become bottlenecks, either by virtue of having no remaining bikes to lend, or having no available docks to which to return a bike. I chose this approach for several reasons:\n", " - Predicting overall ridership volume, and forecasting out to some future point, is a relatively straightforward problem. I assume it has already been solved.\n", " - Predicting ridership volume on a per-station basis is only marginally useful: this type of analysis is likely to ignore the fact that we are dealing with a closed system in which there are a limited and pre-determined number of bikes available, of which only a subset can be rented out from a given station at any given time. Any model that is agnostic to the capacity of a given station, or to the volume of inbound traffic it receives, is not liable to provide useful or actionable predictions. (E.g., if we predict that a 15-dock station will have 30 departures over the course of an hour, how can we respond? Do we double the capacity of the station? Do we send a van to fill any empty docks? Will the inbound volume over that time period be enough to meet the demand without any intervention?)\n", " - Treating this as a queueing problem allows us to integrate both seasonal changes in departure volume (hourly, daily, monthly — although this implementation only looks at ridership on an hourly basis) and a transition matrix of probabilities that a prototypical rider will transit from any station `N` to any other station `M`. This specifically solves for the fact that we are operating in a closed system with a limited number of bikes, each station having access only to an even more limited subset thereof at any given point in time. It also gives us the ability to easily take action based on simulations we run: we can easily flag cases where a station is about to run out of either available bikes or free docks, allowing us to move bikes from overfull stations to underfull ones.\n", " \n", "# Exploring the data\n", "\n", "Initially I'm just going to overlay the stations with their cumulative lifetime arrival and departure counts onto a map of downtown Columbus. If we wanted to tweak this to help intuit what our later simulations might tell us, we could look at these data on the basis of average daily arrivals, departures, and net change: this might help visually identify which stations were net consumers of or producers of bikes, and would need to be manually re-balanced more frequently. But for my purposes here, just getting a sense of where everything is geographically is enough for right now." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "APP_ROOT = Path(os.path.realpath(os.path.expanduser(os.getcwd()))).parents[0]\n", "\n", "cogo_data, cogo_stations = data_prep.load_datasets(APP_ROOT)\n", "hourly_trips = data_prep.prepare_hourly_trips(cogo_data)\n", "station_crosslinks = data_prep.build_station_interlinks(cogo_data)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Really, this should be normalized to control for\n", "# number of days a station was in service, but for my\n", "# purposes, I'm just interested in the geographic\n", "# distribution of stations right now\n", "df_agg = plotting.counts_by_hexagon(df=cogo_data, resolution=9)\n", "\n", "m_hex = plotting.choropleth_map(\n", " df_agg=df_agg,\n", " name='Departure Count',\n", " value_col='departure_count',\n", " with_legend=True)\n", "m_hex = plotting.choropleth_map(\n", " df_agg=df_agg,\n", " name='Arrival Count',\n", " value_col='arrival_count',\n", " initial_map=m_hex,\n", " with_legend=True,\n", " kind='outlier'\n", ")\n", "folium.map.LayerControl('bottomright', collapsed=False).add_to(m_hex)\n", "m_hex.save(str(APP_ROOT / 'output' / 'choropleth_counts.html'))\n", "m_hex" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Simulate ridership volumes\n", "\n", "For the actual analysis, I'm building out a simulation that will, starting at any arbitrary hour of the day and running for any arbitrary number of minutes, use historic ridership data to simulate arrivals to and departures from each station. To achieve this, I need to first identify, for each station:\n", " 1. How many docks are available?\n", " 1. How many departures are there, on average, within a given hour?\n", " 1. What is the average time between departures within a given hour?\n", " 1. What is the probability that a rider will take a rented bike to any other individual station `M`?\n", "\n", "Given this information, we can then\n", " - For each tick:\n", " - For each station:\n", " - determine if a departure should occur (sample from geometric distribution given that station's inter-departure interval for that time of day)\n", " - If a departure should occur:\n", " - If `n > 0` bikes are available:\n", " - determine the destination station, based on the probabilities taken from our Station `N`:Station `M` transition matrix\n", " - undock a bike and assign it to a to global 'in transit' list w/estimated transit time\n", " - reset station's time since last depart counter (This is just used for internal tracking to verify that the geometric distribution we're sampling from is behaving reasonably)\n", " - If no bike is available\n", " - The customer is filled with a deep and lingering sadness\n", " - Otherwise:\n", " - increment station's time since last departure counter\n", "\n", " - For each undocked bike:\n", " - Decrement remaining travel time by 1 tick (1 minute)\n", " - If remaining travel time <= 0:\n", " - If `n > 0` available docks at destination station:\n", " - dock the bike\n", " - Otherwise:\n", " - The customer is filled with a deep and lingering sadness\n", " - The bike is thrown into the Olentangy, never to be seen again" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "orchestrator = simulation.Orchestrator(\n", " cogo_stations,\n", " station_crosslinks,\n", " hourly_trips,\n", - " bike_count=350\n", + " bike_count=408 # max bike id in COGO data\n", ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "orchestrator.run_simulation(\n", " start_hour=12,\n", " num_ticks=120)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then inspect all of the delays caused by over- or underfull bike stations:" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "orchestrator.delays" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model evaluation\n", + "\n", + "To evaluate this model's performance, we can just do a straight comparison of the actual arrival and departure counts[1] predicted by our simulation on an hour-by-station basis against the average volumes of the same from our historical data.[2] I'll primarily be judging model accuracy on the basis of Median Absolute Difference (as small of a scale as we're operating on, looking at percentage differences is not super meaningful: if I simulate 5 departures for a station in an hour, but the historic average is 2, that's an 80-ish percent difference for a small delta in absolute terms).\n", + "\n", + "\n", + "[1] — You'll see below that I am restricting the dataset of historical observations to hours 12 and 13. This is purely a function of the constraints I specified above in terms of the starting hour of the simulation and the number of ticks to advance. You are welcome to adjust as you see fit to model a wider range of time.\n", + "[2] — Note that we could also hold an out-of-band sample to generate actual predictions against; however, given that this model is just intended to prove the concept and does not have important seasonalities integrated (day of week, month of year), that would only be of limited use and so I will decline to do so here." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "sim_res = pd.DataFrame()\n", + "for _, group in orchestrator.simulation_statistics.groupby(['station_id']):\n", + " group['delta'] = group['available_docks'] - group['available_docks'].shift(-1)\n", + " sim_res = pd.concat([sim_res, group])\n", + "sim_res['departures'] = np.abs(sim_res['delta'].apply(lambda x: min(0, x)))\n", + "sim_res['arrivals'] = sim_res['delta'].apply(lambda x: max(0, x))\n", + "\n", + "sim_res['hour'] = sim_res['timestamp'].apply(lambda x: x[:2])\n", + "sim_res = sim_res.groupby(['hour', 'station_id']).agg('sum').reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "cogo_data['arrival_hour'] = cogo_data['stop_timestamp'].dt.strftime('%H')\n", + "cogo_data['arrival_date'] = cogo_data['stop_timestamp'].dt.strftime('%Y-%m-%d')\n", + "\n", + "cogo_data['departure_hour'] = cogo_data['start_timestamp'].dt.strftime('%H')\n", + "cogo_data['departure_date'] = cogo_data['start_timestamp'].dt.strftime('%%Y-%m-%d')\n", + "\n", + "out = (\n", + " cogo_data\n", + " .groupby(['arrival_hour', 'stop_station_id'])\n", + " .agg({\n", + " 'bike_id': 'count',\n", + " 'arrival_date': pd.Series.nunique\n", + " })\n", + " .rename_axis(['hour', 'station_id'])\n", + " .rename({\n", + " 'bike_id': 'arrival_count',\n", + " 'arrival_date': 'n_arrival_days'\n", + " }, axis=1)\n", + ")\n", + "out = out.join(\n", + " cogo_data\n", + " .groupby(['departure_hour', 'start_station_id'])\n", + " .agg({\n", + " 'bike_id': 'count',\n", + " 'departure_date': pd.Series.nunique\n", + " })\n", + " .rename_axis(['hour', 'station_id'])\n", + " .rename({\n", + " 'bike_id': 'departure_count',\n", + " 'departure_date': 'n_departure_days'\n", + " }, axis=1)\n", + ")\n", + "out['average_arrivals'] = round(\n", + " out['arrival_count'] / out['n_arrival_days'], 0)\n", + "out['average_departures'] = round(\n", + " out['departure_count'] / out['n_departure_days'], 0)\n", + "out = out.reset_index()\n", + "out = out.loc[out['hour'].isin(['12', '13'])]" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
departure_deltaarrival_delta
mean1.3142860.871429
median1.0000001.000000
\n", + "
" + ], + "text/plain": [ + " departure_delta arrival_delta\n", + "mean 1.314286 0.871429\n", + "median 1.000000 1.000000" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final = sim_res.merge(out, on=['hour', 'station_id'], how='outer')\n", + "# final['arrival_deviance'] = (\n", + "# np.abs(final['arrivals'] - final['average_arrivals']) / (\n", + "# (final['arrivals'] + final['average_arrivals']) / 2\n", + "# ) * 100\n", + "# )\n", + "# final['departure_deviance'] = (\n", + "# np.abs(final['departures'] - final['average_departures']) / (\n", + "# (final['departures'] + final['average_departures']) / 2\n", + "# ) * 100\n", + "# )\n", + "\n", + "final['arrival_delta'] = np.abs(final['arrivals'] - final['average_arrivals'])\n", + "final['departure_delta'] = np.abs(final['departures'] - final['average_departures'])\n", + "\n", + "final.agg({\n", + " 'departure_delta': ['mean', 'median'],\n", + " 'arrival_delta': ['mean', 'median']\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For both arrivals and departures, by both mean and median absolute difference, my simulation is on average within about a single departure or arrival of actuals." + ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }