Commit a44c2d8f authored by kaghog

clean up and calibration

parent 8bf46d66
# General pipeline settings
working_directory: /nas/kaghog/nfp/cache_output_m11_sun_25p
flowchart_path: /nas/kaghog/nfp/cache_output_m11_sun_25p/flowchart.json
dryrun: false

# Requested stages
run:
  # - data.statpop.persons
  # - data.statpop.projections.households
  # - data.statpop.scaled
  # - synthesis.population.matched
  # - data.microcensus.households
  # - data.microcensus.csv
  # - population.destinations
  # - synthesis.population.destinations
  # - synthesis.population.spatial.primary.weekend.pry_distance_distributions
  # - synthesis.population.spatial.primary.weekend.work_locations
  # - synthesis.population.spatial.primary.weekend.education_locations
  # - synthesis.population.spatial.secondary.distance_distributions
  # - synthesis.population.spatial.secondary.locations
  # - synthesis.population.spatial.primary.weekend.locations
  # - matsim.facilities
  # - matsim.population
  # - matsim.households
  # - matsim.run
  - population.output
  # - analysis.analysis

# These are configuration options that we use in the pipeline
config:
  threads: 4
  random_seed: 0
  hot_deck_matching_runners: 2
  disable_progress_bar: false
  java_memory: 80G
  input_downsampling: 0.25
  enable_scaling: true
  scaling_year: 2020
  use_freight: false
  use_detailed_activities: false
  weekend_scenario: true
  specific_weekend_scenario: sunday
  hafas_date: 01.10.2018
  data_path: /nas/ivtmatsim/scenarios/switzerland/data
  output_path: /nas/kaghog/nfp/cache_output_m11_sun_25p
  analysis_path: /nas/kaghog/nfp/cache_output_m11_sun_25p/analysis
\ No newline at end of file
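# --- Illustrative sketch (not part of the commit): reading the config --------
# The file above is plain YAML; a minimal way to inspect it, assuming PyYAML
# is installed (the pipeline itself presumably uses its own loader):
import yaml

with open("config.yml") as f:          # path is illustrative
    settings = yaml.safe_load(f)

stages = settings["run"]               # ["population.output"]
options = settings["config"]           # options["input_downsampling"] == 0.25
# --- end of sketch ------------------------------------------------------------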
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def configure(context):
    context.stage("data.microcensus.persons")
    context.stage("data.microcensus.trips")
def compute_cdf(context, df, bin_size=100):
    # A default bin size of 100 was chosen after inspecting the weekend distance
    # distributions for work and education: it is the largest bin size that leaves
    # a reasonably small number of empty bins for both.
    #df = df[df["distance"] <= 25000]
    #df["distance_km"] = 0.001*df["distance"]
    #plt.hist(df["distance"], weights=df["weight"], bins=200)
    #plt.savefig("%s/my_dist_hist_raw.png" % context.cache_path)
    print("stats: ", df["distance"].describe())
    print("INFO: distance distribution cut at the 95th percentile: ", df["distance"].quantile(0.95))
    #print("quantile99: ", np.percentile(dist, 99))

    # For work, the 90th percentile of the distance is 18627 m, the 95th is 27045.02 m
    # and the 97th is 36726.34 m, but there are outliers of up to 235 km that throw off
    # the distance sampling, because with 200 bins most distances then end up in a
    # single bin. The distribution is stable from the 95th percentile downwards.

    # Decide on a percentile and extract the distance distribution up to it;
    # 0.95 works for both education and work.
    quant = df["distance"].quantile(0.95)
    df_quant = df[df["distance"] <= quant]

    plt.hist(df_quant["distance"], weights=df_quant["weight"], bins=bin_size)
    plt.savefig("%s/my_dist_distribution_raw.png" % context.cache_path)

    hist_vals, bins_vals = np.histogram(df_quant["distance"], weights=df_quant["weight"], bins=bin_size)
    histbin_midpoints = bins_vals[:-1] + np.diff(bins_vals) / 2
    cdf = np.cumsum(hist_vals)
    cdf = cdf / cdf[-1]

    # The threshold buffer (half a bin width) is used to create a maximum radius
    # boundary when sampling the distance.
    threshold_buffer = np.diff(bins_vals)[0] / 2

    # print("mean of hist_midpoint: ", np.mean(histbin_midpoints))
    # print("this is max of hist_midpoint: ", np.max(histbin_midpoints))
    # print("this is min of hist_midpoint: ", np.min(histbin_midpoints))
    # print("this is std of hist_midpoint: ", np.std(histbin_midpoints))
    #
    # midpoint_km = np.true_divide(histbin_midpoints, 1000)
    # midpoint_dig = np.digitize(histbin_midpoints, bins_vals)
    #
    # plt.hist(midpoint_dig, bins = 200)
    # plt.savefig("%s/my_dist_hist_midpoints2.png" % context.cache_path)

    return cdf, histbin_midpoints, threshold_buffer
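# --- Illustrative sketch (not part of the commit) -----------------------------
# What threshold_buffer means, with round numbers: 100 bins over [0, 25000] m
# give 250 m wide bins, so the buffer is 125 m, and a bin midpoint plus the
# buffer is exactly the upper edge of that bin (the maximum radius it covers).
import numpy as np

bins_vals = np.linspace(0.0, 25000.0, 101)     # 100 bins, each 250 m wide
buffer = np.diff(bins_vals)[0] / 2             # 125.0 m
first_midpoint = bins_vals[0] + buffer         # 125.0 m
assert first_midpoint + buffer == bins_vals[1]
# --- end of sketch ------------------------------------------------------------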
def execute(context):
@@ -25,7 +70,7 @@ def execute(context):
    # The assumption here is that Saturday and Sunday have the same distance
    # distribution for work and education trips, which differs from the weekday one.
    df_persons = df_persons[df_persons["weekend"]].copy()
    df_trips = context.stage("data.microcensus.trips")[["person_id", "trip_id", "mode", "crowfly_distance",
@@ -42,37 +87,35 @@ def execute(context):
    df_trips.loc[df_trips["trip_id"] == 1, "preceding_purpose"] = "home"

    # Filtering for only work and education
    primary_activities = ["home", "work", "education"]
    # df_trips = df_trips[(df_trips["preceding_purpose"].isin(primary_activities)
    #                      & df_trips["following_purpose"].isin(primary_activities))]

    # Calculate distributions
    bin_size_work = 200
    bin_size_edu = 80
    distributions = {}

    # Extract work distances
    # ToDo: take distances from the commute distances already provided in mz.commute
    # ToDo: potentially remove work-work trips so their distances are not part of the distribution
    # We use the person weights and the crowfly distance of all trips that end at work.
    # The issue is that home-work integrity is not guaranteed, as some trips are
    # work-work; a solution would be to drop all trips that do not preserve it.
    df_trips_work = df_trips[df_trips["purpose"] == "work"].copy()
    df = df_trips_work[["crowfly_distance", "weight"]].rename(columns={"crowfly_distance": "distance"})
    work_cdf, work_midpoint_bin_distances, work_threshold_buffer = compute_cdf(context, df, bin_size=bin_size_work)

    # Write distribution for work
    distributions["work"] = dict(cdf=work_cdf, midpoint_bins=work_midpoint_bin_distances, threshold_buffer=work_threshold_buffer)

    # Extract education distances
    df_trips_edu = df_trips[df_trips["purpose"] == "education"].copy()
    df_edu = df_trips_edu[["crowfly_distance", "weight"]].rename(columns={"crowfly_distance": "distance"})
    edu_cdf, edu_midpoint_bin_distances, edu_threshold_buffer = compute_cdf(context, df_edu, bin_size=bin_size_edu)

    # Write distribution for education
    distributions["education"] = dict(cdf=edu_cdf, midpoint_bins=edu_midpoint_bin_distances, threshold_buffer=edu_threshold_buffer)
import math
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry as geo
from sklearn.neighbors import KDTree
import data.spatial.utils as spatial_utils
import matplotlib.pyplot as plt
def configure(context):
@@ -57,9 +60,9 @@ def prepare_radius_from_cdf(cdf, midpoint_bins, random_values):
    # random_values are uniform draws, one per person in the synthetic population;
    # they are used to randomly sample a radius which is then used to sample
    # distances for each person. Locations are sampled per person, not per trip.
    value_bins = np.searchsorted(cdf, random_values)
    radius_from_cdf = midpoint_bins[value_bins]
    return radius_from_cdf
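# --- Illustrative sketch (not part of the commit) -----------------------------
# How searchsorted implements inverse-transform sampling: a uniform draw u is
# mapped to the first bin whose cumulative weight reaches u, so bins carrying
# more weight are hit proportionally more often. Synthetic numbers only.
import numpy as np

hist_vals = np.array([10.0, 30.0, 60.0])       # weighted counts per bin
midpoints = np.array([500.0, 1500.0, 2500.0])  # bin midpoints in meters
cdf = np.cumsum(hist_vals)
cdf = cdf / cdf[-1]                            # [0.1, 0.4, 1.0]

u = np.random.rand(100_000)
sampled = midpoints[np.searchsorted(cdf, u)]
# ~10% of samples land on 500 m, ~30% on 1500 m, ~60% on 2500 m
# --- end of sketch ------------------------------------------------------------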
@@ -81,49 +84,208 @@ def find_locations(home_coordinates, destination_coordinates, radius):
    return indices, distances
def impute_nearest_work_locations(context):
    # Prepare work persons
    df_persons = context.stage("synthesis.population.enriched")
    df_trips = context.stage("synthesis.population.trips")  # .sort_values(by=["person_id", "trip_index"])
    df_trips_work = df_trips[df_trips["following_purpose"] == "work"].copy()
    #df_syn_persons_work = df_persons.copy()
    df_syn_persons_work = df_persons[df_persons["person_id"].isin(df_trips_work["person_id"].unique())].copy()

    # Prepare home coordinates
    home_coordinates = np.vstack([df_syn_persons_work["home_x"], df_syn_persons_work["home_y"]]).T

    # Prepare work destinations
    df_destinations = context.stage("synthesis.population.destinations")
    df_candidates = df_destinations[df_destinations["offers_work"]].copy()
    work_coordinates = np.vstack(df_candidates["geometry"].apply(lambda x: np.array([x.x, x.y])).values)

    # Query the ten nearest work locations per home, then randomly select one of them
    query_size = 10
    tree = KDTree(work_coordinates)
    distances, indices = tree.query(home_coordinates, query_size, return_distance=True)

    selector = np.random.randint(query_size, size=(indices.shape[0],))
    indices = np.choose(selector, indices.T)
    distances = np.choose(selector, distances.T)

    df_syn_persons_work["work_x"] = df_candidates.iloc[indices]["destination_x"].values
    df_syn_persons_work["work_y"] = df_candidates.iloc[indices]["destination_y"].values
    df_syn_persons_work["destination_id"] = df_candidates.iloc[indices]["destination_id"].values
    df_syn_persons_work["distance"] = np.sqrt(
        (df_syn_persons_work["home_x"] - df_syn_persons_work["work_x"]) ** 2 +
        (df_syn_persons_work["home_y"] - df_syn_persons_work["work_y"]) ** 2
    )

    # Plots to check the sampled distance distribution
    # plt.hist(distances, bins = 200)
    # plt.savefig("%s/my_raw_sampled_distances.png" % context.cache_path)
    #
    # plt.hist(df_syn_persons_work["distance"], bins = 200)
    # plt.savefig("%s/my_persons_imputed_distances_m.png" % context.cache_path)
    #
    # df_syn_persons_work.loc[:, "distance_km"] = df_syn_persons_work["distance"]/1000
    # df_test = df_syn_persons_work[df_syn_persons_work["distance_km"] < 25].copy()
    # plt.hist(df_test["distance_km"], bins = 200)
    # plt.savefig("%s/my_persons_imputed_distances_25km.png" % context.cache_path)

    df_syn_persons_work = df_syn_persons_work[["person_id",
                                               "work_x", "work_y",
                                               "destination_id"]].rename({"work_x": "x",
                                                                          "work_y": "y"},
                                                                         axis=1)
    df_syn_persons_work = spatial_utils.to_gpd(context, df_syn_persons_work, coord_type="work")

    return df_syn_persons_work[["person_id", "destination_id", "geometry"]]
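# --- Illustrative sketch (not part of the commit) -----------------------------
# The np.choose trick above picks one of the k nearest candidates per row.
# np.choose supports at most 32 choice arrays, so k must stay small; fancy
# indexing (shown for comparison) has no such limit. Synthetic data only.
import numpy as np

rng = np.random.default_rng(0)
indices = rng.integers(0, 1000, size=(5, 10))  # 5 homes x 10 candidate facilities
selector = rng.integers(0, 10, size=5)         # one random column per home

picked = np.choose(selector, indices.T)        # shape (5,)
picked_alt = indices[np.arange(5), selector]   # equivalent fancy indexing
assert np.array_equal(picked, picked_alt)
# --- end of sketch ------------------------------------------------------------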
def impute_work_locations_method1(context, distributions, destinations, df_syn_persons_work, df_destinations):
    # Method 1 takes, for each person, the last (farthest) location found within
    # the radius sampled from the distance distribution.

    # Prepare the distances used for sampling based on the cdf (the radius variable)
    cdf = distributions["work"]["cdf"]
    midpoint_bins = distributions["work"]["midpoint_bins"]
    random_values = np.random.rand(len(df_syn_persons_work))
    value_bins = np.searchsorted(cdf, random_values)
    radius = midpoint_bins[value_bins]
    #plt.hist(radius, bins=200)
    #plt.savefig("%s/my_dist_hist_selected.png" % context.cache_path)

    # Prepare the home and destination coordinates
    destination_coordinates = destinations["work"]["locations"]
    home_coordinates = np.vstack([df_syn_persons_work["home_x"], df_syn_persons_work["home_y"]]).T

    tree = KDTree(destination_coordinates)
    indices, distances = tree.query_radius(home_coordinates, r=radius, return_distance=True, sort_results=True)

    # When no facility is found within the radius, fall back to a nearest-neighbour
    # query and take the second-nearest facility.
    for i in range(len(indices)):
        l = indices[i]
        if len(l) == 0:
            dist, ind = tree.query(np.array(home_coordinates[i]).reshape(1, -1), 2, return_distance=True,
                                   sort_results=True)
            fac = ind[0][1]
            indices[i] = [fac]
            distances[i] = [dist[0][1]]

    #indices, distances = find_locations(home_coordinates, destination_coordinates, radius)

    # Method 1: select the last index and distance for each person.
    # ToDo note: make the threshold distance zero
    discrete_indices = [l[-1] for l in indices]
    discrete_d = np.array([d[-1] for d in distances])

    # Plot the resulting distance distribution for smaller distances
    plt.hist(discrete_d[discrete_d < 25000], bins=200)
    plt.savefig("%s/my_dist_hist_outputQuery_Method1.png" % context.cache_path)

    print("INFO: imputing work locations...")
    df_candidates = df_destinations[df_destinations["offers_work"]].copy()

    df_work_persons = df_syn_persons_work.copy()
    df_work_persons["work_x"] = df_candidates.iloc[discrete_indices]["destination_x"].values
    df_work_persons["work_y"] = df_candidates.iloc[discrete_indices]["destination_y"].values
    df_work_persons["destination_id"] = df_candidates.iloc[discrete_indices]["destination_id"].values
    df_work_persons["distance"] = np.sqrt(
        (df_work_persons["home_x"] - df_work_persons["work_x"]) ** 2 +
        (df_work_persons["home_y"] - df_work_persons["work_y"]) ** 2
    )

    #plt.hist(df_work_persons["distance"], bins = 200)
    #plt.savefig("%s/my_dist_hist_persons_imputedQuery.png" % context.cache_path)
    #df_work_persons.loc[:,"distance_km"] = 0.001 * df_work_persons["distance"]
    #df_test = df_work_persons[df_work_persons["distance_km"] < 17].copy()
    #plt.hist(df_test["distance_km"], bins = 200)
    #plt.savefig("%s/my_dist_hist_persons_imputed_25km.png" % context.cache_path)

    df_work_persons = df_work_persons[["person_id",
                                       "work_x", "work_y",
                                       "destination_id"]].rename({"work_x": "x",
                                                                  "work_y": "y"},
                                                                 axis=1)
    df_work_persons = spatial_utils.to_gpd(context, df_work_persons, coord_type="work")

    return df_work_persons[["person_id", "destination_id", "geometry"]]
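# --- Illustrative sketch (not part of the commit) -----------------------------
# Why Method 1 takes l[-1]: with sort_results=True, query_radius returns the
# hits ordered by distance, so the last entry is the candidate closest to the
# sampled target radius from below. Synthetic coordinates, illustrative only.
import numpy as np
from sklearn.neighbors import KDTree

facilities = np.array([[0.0, 100.0], [0.0, 400.0], [0.0, 900.0]])
home = np.array([[0.0, 0.0]])

tree = KDTree(facilities)
ind, dist = tree.query_radius(home, r=[500.0], return_distance=True, sort_results=True)
# ind[0] == [0, 1], dist[0] == [100., 400.]: l[-1] picks the facility at 400 m,
# the best available approximation of the 500 m target radius from below.
# --- end of sketch ------------------------------------------------------------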
def impute_work_locations(context, distributions, destinations, df_syn_persons_work, df_destinations):
    # This method selects locations within a threshold buffer around the sampled
    # radius, weighted by the number of employees at each location.

    # Prepare the distances used for sampling based on the cdf (the radius variable)
    cdf = distributions["work"]["cdf"]
    midpoint_bins = distributions["work"]["midpoint_bins"]
    random_values = np.random.rand(len(df_syn_persons_work))
    value_bins = np.searchsorted(cdf, random_values)
    radius = midpoint_bins[value_bins]
    #plt.hist(radius, bins=200)
    #plt.savefig("%s/my_dist_hist_selected.png" % context.cache_path)
    #radius = prepare_radius_from_cdf(cdf, midpoint_bins, random_values)

    # Define a threshold distance that is added to the target distance and serves
    # as the maximum search distance for a person. The target distance is the
    # midpoint of a histogram bin, so adding the threshold (half a bin width)
    # extends the search to the bin's upper edge; we want to select distances
    # within a boundary of this threshold.
    threshold = distributions["work"]["threshold_buffer"]  # in meters
    radius = radius + threshold

    # Prepare the home and destination coordinates
    destination_coordinates = destinations["work"]["locations"]
    home_coordinates = np.vstack([df_syn_persons_work["home_x"], df_syn_persons_work["home_y"]]).T

    tree = KDTree(destination_coordinates)
    indices, distances = tree.query_radius(home_coordinates, r=radius, return_distance=True, sort_results=True)

    # When no facility is found within the radius, fall back to a nearest-neighbour
    # query and take the second-nearest facility.
    for i in range(len(indices)):
        l = indices[i]
        if len(l) == 0:
            dist, ind = tree.query(np.array(home_coordinates[i]).reshape(1, -1), 2, return_distance=True,
                                   sort_results=True)
            fac = ind[0][1]
            indices[i] = [fac]
            distances[i] = [dist[0][1]]

    #indices, distances = find_locations(home_coordinates, destination_coordinates, radius)

    # Select the distances
    discrete_indices = []
    discrete_distances = []

    # Method 1: select the last index and distance for each person.
    #discrete_indices = [l[-1] for l in indices]
    #discrete_distances = [d[-1] for d in distances]

    # Method 2: select locations weighted by the number of employees found there.
    # To avoid sampling distances that fall well below the target distance, use a
    # minimum threshold below the target and only consider locations between the
    # threshold and the target (a donut shape).
    for ind, dist in zip(indices, distances):
        # Limit the candidates for the random selection (see the sketch after this
        # function); note that np.choose would only support up to 32 items.
        if len(ind) > 2:
            farthest_dist = dist[-1]
            min_threshold_band = farthest_dist - threshold
            # print("min thresh: ", min_threshold_band)
            minimum_selection_bound = max(min_threshold_band, dist[0])
            # print("min band: ", minimum_selection_bound)
            maximum_selection_bound = farthest_dist
            # print("max band: ", maximum_selection_bound)
            # print("filter1: ", (dist >= minimum_selection_bound))
            # print("filter2: ", (dist <= maximum_selection_bound))
            # print("filter3: ", (dist >= minimum_selection_bound) & (dist <= maximum_selection_bound))
            ind = ind[(dist >= minimum_selection_bound) & (dist <= maximum_selection_bound)]

        # Select a location using the number of employees as weights
        weights = destinations["work"]["no_employees"][ind]
@@ -134,7 +296,6 @@ def impute_work_locations(context, distributions, destinations, df_syn_persons_w
        #index = np.choose(selector, ind.T)
        index = np.random.choice(ind, p=weights)
        discrete_indices.append(destinations["work"]["identifiers"][index])
        discrete_distances.append(destinations["work"]["locations"][index])

    print("INFO: imputing work locations...")
    df_candidates = df_destinations[df_destinations["offers_work"]].copy()
@@ -144,12 +305,31 @@ def impute_work_locations(context, distributions, destinations, df_syn_persons_w
df_work_persons["work_y"] = df_candidates.iloc[discrete_indices]["destination_y"].values
df_work_persons["destination_id"] = df_candidates.iloc[discrete_indices]["destination_id"].values
df_work_persons["distance"] = np.sqrt(
(df_work_persons["home_x"] - df_work_persons["work_x"]) ** 2 +
(df_work_persons["home_y"] - df_work_persons["work_y"] ) ** 2
)
plt.hist(df_work_persons["distance"], bins = 200)
plt.savefig("%s/my_dist_hist_persons_imputedQuery.png" % context.cache_path)
df_work_persons.loc[:,"distance_km"] = 0.001 * df_work_persons["distance"]
#df_test = df_work_persons[df_work_persons["distance_km"] < 17].copy()
#plt.hist(df_work_persons["distance"], bins=200)
#plt.hist(df_test["distance_km"], bins = 200)
#plt.savefig("%s/my_dist_hist_persons_imputed_25km.png" % context.cache_path)
df_work_persons = df_work_persons[["person_id",
"work_x", "work_y",
"destination_id"]].rename({"work_x": "x",
"work_y": "y"},
axis=1)
df_work_persons = spatial_utils.to_gpd(context, df_work_persons, coord_type="work")
return df_work_persons[["person_id", "destination_id", "geometry"]]
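# --- Illustrative sketch (not part of the commit) -----------------------------
# The "donut" selection above, on synthetic data: keep only candidates whose
# distance lies within `threshold` of the farthest hit, then draw one of them
# weighted by employee counts. All names and numbers here are illustrative.
import numpy as np

rng = np.random.default_rng(0)
dist = np.sort(rng.uniform(0.0, 5000.0, size=20))  # sorted, as from query_radius
ind = np.arange(20)                                # candidate facility indices
no_employees = rng.integers(1, 100, size=20).astype(float)

threshold = 250.0  # half a histogram bin width, cf. compute_cdf's threshold_buffer
if len(ind) > 2:
    lower = max(dist[-1] - threshold, dist[0])       # inner edge of the donut
    ind = ind[(dist >= lower) & (dist <= dist[-1])]  # keep the outer ring only

weights = no_employees[ind]
weights = weights / weights.sum()  # rng.choice expects p to sum to 1
chosen = rng.choice(ind, p=weights)
# --- end of sketch ------------------------------------------------------------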
@@ -157,6 +337,8 @@ def impute_work_locations(context, distributions, destinations, df_syn_persons_w
def execute(context):
    # Prepare data
    distance_distributions = context.stage("synthesis.population.spatial.primary.weekend.pry_distance_distributions")
    df_home = prepare_home_locations(context)
@@ -173,15 +355,14 @@ def execute(context):
    # Load trips and get the persons that make work trips
    # ToDo: maybe this could be sorted out using mz.commute info?
    df_trips = context.stage("synthesis.population.trips").sort_values(by=["person_id", "trip_index"])
    # df_trips_work = df_trips[(df_trips["preceding_purpose"].isin(["home", "work"])
    #                           & df_trips["following_purpose"].isin(["home", "work"]))].copy()
    df_trips_work = df_trips[df_trips["following_purpose"] == "work"].copy()
    df_syn_persons_work = df_persons[df_persons["person_id"].isin(df_trips_work["person_id"].unique())].copy()

    # Create work locations for the work persons
    df_locations = impute_work_locations(context, distance_distributions, destinations, df_syn_persons_work, df_destinations)
    #df_locations = impute_nearest_work_locations(context)

    return df_locations