In this notebook, we will investigate an empirical method to quantify urban public transit accessibility.
Many existing metrics for this purpose are based on a location's proximity to transit stops and the prevalence of sidewalks in the surrounding area. While these are useful heuristics, they do not capture how well the available transit actually serves real-world journeys.
By contrast, the method presented here samples the best-route calculations offered by Google Maps across thousands of hypothetical journeys within a city. These journeys are used to define a transit factor for any given area of the city, based on the projected durations of transit journeys starting within that area, normalized by the corresponding drive times.
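To make the metric concrete before diving into code: for a single journey, the transit factor is simply the transit duration divided by the driving duration. Below is a minimal sketch with hypothetical numbers; the real computation appears later in the notebook.
# Hypothetical single-journey example of the transit factor (durations in seconds).
duration_driving = 15 * 60  # a 15-minute drive
duration_transit = 45 * 60  # a 45-minute transit trip, including the initial wait
transit_factor = duration_transit / duration_driving
print(transit_factor)  # 3.0 -> transit takes three times as long as driving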
!python3 -m pip install branca folium geopy googlemaps h3 numpy pandas pytz requests
!python3 -m pip install shapely
import csv
import json
import random
import re
from collections import defaultdict
from datetime import datetime
import branca
import folium
import googlemaps
import h3
import numpy as np
import pandas as pd
import requests
from geopy.distance import great_circle
from pytz import timezone
from shapely.geometry import Point, Polygon
# Create a boundary polygon for SF.
# SF.geojson is expected to contain a bare Polygon geometry; grab its outer ring.
boundary_geometry_SF = json.load(open("SF.geojson"))
boundary_polygon_SF = boundary_geometry_SF["coordinates"][0]
# GeoJSON stores (longitude, latitude); flip to (latitude, longitude), the order used
# by folium and by the point-in-polygon check below.
boundary_polygon_SF = [[point[1], point[0]] for point in boundary_polygon_SF]
# Close the ring so the plotted boundary connects back to its starting point.
boundary_polygon_SF += [boundary_polygon_SF[0]]
latitudes = [point[0] for point in boundary_polygon_SF]
longitudes = [point[1] for point in boundary_polygon_SF]
polygon_structure_SF = Polygon(boundary_polygon_SF)
# Compute the bounding box for the polygon (and store its ranges for rejection
# sampling).
min_latitude_SF = min(latitudes)
max_latitude_SF = max(latitudes)
min_longitude_SF = min(longitudes)
max_longitude_SF = max(longitudes)
latitude_range_SF = (min_latitude_SF, max_latitude_SF)
longitude_range_SF = (min_longitude_SF, max_longitude_SF)
bounding_box = [
[min_latitude_SF, min_longitude_SF],
[min_latitude_SF, max_longitude_SF],
[max_latitude_SF, max_longitude_SF],
[max_latitude_SF, min_longitude_SF],
[min_latitude_SF, min_longitude_SF],
]
# Plot the boundary and its bounding box.
center_latitude = sum(latitudes) / len(latitudes)
center_longitude = sum(longitudes) / len(longitudes)
viz_map = folium.Map(
location=[center_latitude, center_longitude], zoom_start=12, tiles="cartodbpositron"
)
polyline = folium.PolyLine(locations=boundary_polygon_SF, weight=4, color="green")
viz_map.add_child(polyline)
polyline = folium.PolyLine(locations=bounding_box, weight=4, color="black")
viz_map.add_child(polyline);
def sample_point(polygon_structure, latitude_range, longitude_range):
"""Rejection sampling (generate a point within the bounding box and reject it if
it's outside the polygon).
"""
while True:
latitude = random.uniform(*latitude_range)
longitude = random.uniform(*longitude_range)
if polygon_structure.contains(Point(latitude, longitude)):
return (latitude, longitude)
# Plot a bunch of random points within SF (as a sanity check of our sampling method).
for _ in range(1000):
point = sample_point(polygon_structure_SF, latitude_range_SF, longitude_range_SF)
marker = folium.CircleMarker(point, radius=1)
viz_map.add_child(marker)
viz_map
# Set up access to the Google Maps API.
GOOGLE_API_KEY = "YOUR_API_KEY_HERE"
maps = googlemaps.Client(key=GOOGLE_API_KEY)
def run_trial(tzinfo, polygon_structure, latitude_range, longitude_range):
"""Samples the driving and transit durations for a random origin and destination on
Google Transit.
Triggers up to 2 requests to the Directions API (the free tier will permit up to 20K
calls to this function).
"""
point_1 = sample_point(polygon_structure_SF, latitude_range_SF, longitude_range_SF)
point_2 = sample_point(polygon_structure_SF, latitude_range_SF, longitude_range_SF)
sampled_at = int(datetime.now().timestamp())
# Find driving duration between the points as a baseline.
driving_result = maps.directions(point_1, point_2, mode="driving")
if len(driving_result) == 0:
print(f"Driving result failed for {point_1} to {point_2}.")
return None
duration_driving = driving_result[0]["legs"][0]["duration"]["value"]
# IMPORTANT: I sampled this data on a Tuesday so the data covers a Saturday, a
# Sunday, and a Monday. Change the random departure time sampling accordingly for
# your own purposes.
today_midnight = int(
tzinfo.localize(
datetime.combine(datetime.now(), datetime.min.time())
).timestamp()
)
three_days_ago_midnight = today_midnight - 3 * 24 * 60 * 60
random_departure_time = int(random.uniform(three_days_ago_midnight, today_midnight))
directions_results = maps.directions(
point_1, point_2, departure_time=random_departure_time, mode="transit"
)
if len(directions_results) == 0:
return (
sampled_at,
*point_1,
*point_2,
random_departure_time,
duration_driving,
None,
None,
None,
)
else:
directions_result = directions_results[0]
# Some additional interesting data is the cost of various rides.
if "fare" in directions_result:
fare_value = directions_result["fare"]["value"]
fare_currency = directions_result["fare"]["currency"]
else:
fare_value = None
fare_currency = None
directions_leg = directions_result["legs"][0]
if "arrival_time" in directions_leg:
# This allows us to include initial waiting time for transit directions.
duration_transit = (
directions_leg["arrival_time"]["value"] - random_departure_time
)
else:
# Typically falls into this case with walking-only transit directions.
duration_transit = directions_leg["duration"]["value"]
# Used to write a CSV row.
return (
sampled_at,
*point_1,
*point_2,
random_departure_time,
duration_driving,
duration_transit,
fare_value,
fare_currency,
)
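For reference, a successful call to run_trial yields a 10-element tuple in the column order shown below; the values here are purely illustrative, not real samples.
# Illustrative shape of a run_trial result (hypothetical values):
# (sampled_at, origin_lat, origin_lon, dest_lat, dest_lon,
#  departure_time, duration_driving, duration_transit, fare_value, fare_currency)
example_row = (
    1700000000, 37.78, -122.43, 37.75, -122.41,
    1699800000, 900, 2700, 2.5, "USD",
)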
# UNCOMMENT ME: Test out the data collection before we spam it...
# tzinfo = timezone("America/Los_Angeles")
# result_test = run_trial(
# tzinfo, polygon_structure_SF, latitude_range_SF, longitude_range_SF
# )
# result_test
# UNCOMMENT ME: Sample a bunch of journeys!
# num_trials = 5000
# with open("data.csv", "a") as csv_file:
# writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)
# for i in range(num_trials):
# result = run_trial(
# tzinfo, polygon_structure_SF, latitude_range_SF, longitude_range_SF
# )
# if result is None:
# continue
# writer.writerow(result)
# if i % 10 == 9:
# csv_file.flush()
# print(f"Finished trial {i + 1}.")
# The collection loop above writes rows with no header, so name the columns here
# (order matches the tuple returned by run_trial).
data_raw = pd.read_csv("data.csv", names=[
    "sampled_at", "origin_latitude", "origin_longitude", "destination_latitude",
    "destination_longitude", "departure_time", "duration_driving", "duration_transit",
    "fare_value", "fare_currency",
])
data_raw = data_raw.replace({np.nan: None})
data_raw[:10]
Let's start by just plotting a bunch of routes originating from an area (with a few constraints).
See code comments.
# Let's look at all routes within a 2 hour duration.
# We use this constraint throughout the notebook; higher durations are often invalid
# routings that are active only on a different day.
data = data_raw[data_raw["duration_transit"] <= 2 * 60 * 60]
routes = [((row[1], row[2]), (row[3], row[4]), row[6], row[7]) for row in data.values]
# And in a 0.5 mile radius around a particular Safeway.
safeway = (37.782490, -122.430809)
margin = 0.5 # In miles.
nearby_routes = [
route for route in routes if great_circle(safeway, route[0]).miles <= margin
]
# Plot Safeway.
route_viz_map = folium.Map(
location=[center_latitude, center_longitude], zoom_start=12, tiles="cartodbpositron"
)
marker = folium.CircleMarker(safeway, radius=1, color="darkorange")
route_viz_map.add_child(marker)
# Plot route start and end points.
for nearby_route in nearby_routes:
marker = folium.CircleMarker(nearby_route[0], radius=1)
route_viz_map.add_child(marker)
marker = folium.CircleMarker(nearby_route[1], radius=1, color="red")
route_viz_map.add_child(marker)
route_viz_map
The proposed transit factor expresses the multiplier over (or under) drive time for a given origin/destination. For the rest of the notebook, we will look at interesting ways to group and then average the transit factor over many journeys.
First, let's calculate the average transit factor for routes started within 0.5 miles of Safeway (shown in the previous map).
transit_factor_total = [route[3] / route[2] for route in nearby_routes]
round(sum(transit_factor_total) / len(transit_factor_total), 2)
Next, let's sample the average transit factor for a bunch of origin areas in the city.
def compute_average_transit_factor_for_origin_area(
    max_trip_duration, computation_radius_miles, samples, location
):
    """Computes the average transit factor over all sampled routes originating within
    computation_radius_miles of the given location.
    """
    samples_limited = samples[samples["duration_transit"] <= max_trip_duration]
    routes = [
        ((row[1], row[2]), (row[3], row[4]), row[6], row[7])
        for row in samples_limited.values
    ]
    nearby_routes = [
        route
        for route in routes
        if great_circle(location, route[0]).miles <= computation_radius_miles
    ]
    transit_factor_total = [route[3] / route[2] for route in nearby_routes]
    return sum(transit_factor_total) / len(transit_factor_total)
# This takes a minute or two.
transit_factors = []
for i in range(1000):
point = sample_point(polygon_structure_SF, latitude_range_SF, longitude_range_SF)
transit_factors.append(
(
compute_average_transit_factor_for_origin_area(
2 * 60 * 60, 0.5, data_raw, point
),
point,
)
)
if i % 100 == 99:
print(f"Finished {i + 1}.")
See code comments (preparing colormaps for plotting).
# Let's look at the distribution of average factor values (which will help us create a
# colormap).
factor_values = [factor[0] for factor in transit_factors]
factor_min = min(factor_values)
factor_max = max(factor_values)
factor_mean = sum(factor_values) / len(factor_values)
factor_std_dev = np.std(factor_values)
# Observe that most of the mass of the distribution is < (mean + one standard deviation).
print((factor_min, factor_max, factor_mean, factor_std_dev))
np.histogram(factor_values)
# In general, factor values seem to fall in the range [2.5, 4.5] with a long upper tail.
# Let's use the classic Viridis colormap, but with an extra red tint for the upper tail.
viridis_base = list(reversed(branca.colormap._schemes["viridis"]))
viridis_augmented = viridis_base + ["red"]
series_min = 2.5
series_high = 4.5
series_max = 10
index = list(np.linspace(series_min, series_high, len(viridis_augmented) - 1)) + [
series_max
]
factor_colormap = branca.colormap.LinearColormap(
colors=viridis_augmented, index=index, vmin=series_min, vmax=series_max
)
factor_colormap_for_display = branca.colormap.LinearColormap(
colors=viridis_base, vmin=series_min, vmax=series_high
)
Here, we plot average transit factor values (for each sampled origin area) on a map.
transit_factor_map = folium.Map(
location=[center_latitude, center_longitude], zoom_start=12, tiles="cartodbpositron"
)
for (factor, point) in transit_factors:
marker = folium.CircleMarker(
point, radius=1, color=factor_colormap.rgb_hex_str(factor)
)
transit_factor_map.add_child(marker)
transit_factor_map.add_child(factor_colormap_for_display)
transit_factor_map
The previous technique is useful for scoring a particular location with a transit factor (an average over journeys originating within some radius of it). But perhaps we're more interested in scoring whole areas than individual locations.
Let's use the H3 geographical indexing scheme (not covered in detail here) to group the raw origins into hexagonal regions. Within each hex, we can then average the transit factors of the journeys originating there.
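As a quick orientation (this cell is illustrative and not part of the pipeline), here is roughly how the H3 calls used below behave, assuming the v3 h3-py API imported earlier; the sample coordinates are the Safeway location from before.
# Illustrative only: index a point into a resolution-8 hexagon and inspect it.
sample_origin = (37.782490, -122.430809)  # the Safeway used earlier
hex_code = h3.geo_to_h3(sample_origin[0], sample_origin[1], 8)
print(hex_code)                         # a 15-character H3 index string
print(h3.h3_to_geo(hex_code))           # (lat, lon) center of the hexagon
print(h3.h3_to_geo_boundary(hex_code))  # six (lat, lon) vertices of the hexagon
print(h3.hex_area(8, unit="km^2"))      # average hex area at resolution 8 (~0.74 km^2)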
# This resolution basically tiles a handful of neighborhood blocks.
h3_resolution = 8
# We reuse this helper in subsequent steps.
# The hex_factors argument is a list of (factor, hex_code) pairs, one per journey.
def plot_hex_factors(hex_factors):
# Get averages for every hex.
values_for_hex = defaultdict(list)
for (factor, hex_code) in hex_factors:
values_for_hex[hex_code].append(factor)
hex_averages = {}
for hex_key in values_for_hex:
hex_averages[hex_key] = sum(values_for_hex[hex_key]) / len(
values_for_hex[hex_key]
)
transit_hex_map = folium.Map(
location=[center_latitude, center_longitude],
zoom_start=12,
tiles="cartodbpositron",
)
    # Draw all the hexes, colored by their average factors. We reuse the baseline
    # colormap so the hex maps stay comparable across steps; it saturates (clamps)
    # for hexes below the colormap's minimum, so keep this in mind if you see a lot
    # of minima on your plot.
for (hex_key, factor) in hex_averages.items():
hex_boundary = h3.h3_to_geo_boundary(hex_key)
hex_boundary = hex_boundary + (hex_boundary[0],)
polyline = folium.PolyLine(
locations=hex_boundary,
weight=1,
color="#333",
fill_color=factor_colormap.rgb_hex_str(factor),
fill_opacity=0.6,
)
transit_hex_map.add_child(polyline)
transit_hex_map.add_child(factor_colormap_for_display)
return transit_hex_map, hex_averages
# Continue to apply the 2 hour constraint from before.
samples_limited = data_raw[data_raw["duration_transit"] <= 2 * 60 * 60]
routes = [
((row[1], row[2]), (row[3], row[4]), row[6], row[7])
for row in samples_limited.values
]
hex_factors = [
(route[3] / route[2], h3.geo_to_h3(*route[0], h3_resolution)) for route in routes
]
hex_map, hex_averages = plot_hex_factors(hex_factors)
hex_map
Sweet. Let's do that again, but this time we're interested in giving destination areas a transit factor. (That is, for a given destination area, what is the average multiplier over drive time to get there by transit?)
hex_factors_dest = [
(route[3] / route[2], h3.geo_to_h3(*route[1], h3_resolution)) for route in routes
]
hex_map_dest, hex_averages_dest = plot_hex_factors(hex_factors_dest)
hex_map_dest
Are there any hexes with large discrepancies between origin and destination factor values?
Let's compute (and plot) the ratio of destination factor values to origin factor values. In this map, areas in blue are "easier" in some sense to originate from than terminate at.
No huge discrepancies here.
factor_ratios = {}
for hex_key in hex_averages:
if hex_key in hex_averages_dest:
factor_ratios[hex_key] = hex_averages_dest[hex_key] / hex_averages[hex_key]
transit_hex_ratio_map = folium.Map(
location=[center_latitude, center_longitude], zoom_start=12, tiles="cartodbpositron"
)
ratio_colormap = branca.colormap.linear.RdBu_04.scale(
min(factor_ratios.values()), max(factor_ratios.values())
)
for (hex_key, ratio) in factor_ratios.items():
hex_boundary = h3.h3_to_geo_boundary(hex_key)
hex_boundary = hex_boundary + (hex_boundary[0],)
polyline = folium.PolyLine(
locations=hex_boundary,
weight=1,
color="#333",
fill_color=ratio_colormap.rgb_hex_str(ratio),
fill_opacity=0.6,
)
transit_hex_ratio_map.add_child(polyline)
transit_hex_ratio_map.add_child(ratio_colormap)
transit_hex_ratio_map
Two more questions.
First, how does the Monday-only origin map compare to the other maps? Try changing `== 0` to `== 6` for Sunday or `== 5` for Saturday (Python's `weekday()` returns 0 for Monday through 6 for Sunday). You should see clear differences.
tzinfo = timezone("America/Los_Angeles")
# Interpret epoch departure times in SF local time (independent of the machine's
# timezone) by passing tzinfo directly to fromtimestamp.
samples_monday = samples_limited[
    samples_limited["departure_time"].apply(
        lambda x: datetime.fromtimestamp(x, tzinfo).weekday()
    )
    == 0  # For Monday.
]
routes_monday = [
((row[1], row[2]), (row[3], row[4]), row[6], row[7])
for row in samples_monday.values
]
hex_factors_monday = [
(route[3] / route[2], h3.geo_to_h3(*route[0], h3_resolution))
for route in routes_monday
]
hex_map_monday, hex_averages_monday = plot_hex_factors(hex_factors_monday)
hex_map_monday
Finally, how does rush hour (6 AM to 9 AM) look?
Interestingly, major commute thoroughfares (the 38 and 14 buses for example) look much brighter.
samples_hours = samples_limited["departure_time"].apply(
    lambda x: datetime.fromtimestamp(x, tzinfo).hour
)
samples_rush_hour = samples_limited[(6 <= samples_hours) & (samples_hours <= 9)]
routes_rush_hour = [
((row[1], row[2]), (row[3], row[4]), row[6], row[7])
for row in samples_rush_hour.values
]
hex_factors_rush_hour = [
(route[3] / route[2], h3.geo_to_h3(*route[0], h3_resolution))
for route in routes_rush_hour
]
hex_map_rush_hour, hex_averages_rush_hour = plot_hex_factors(hex_factors_rush_hour)
hex_map_rush_hour