Commit 5b0bf129 authored by kaghog's avatar kaghog

Merge branch 'nfp_weekend' into nfpCovid19

parents e875713b b9ff119a
# General pipeline settings
-working_directory: /nas/asallard/Switzerland/cache_schools
-flowchart_path: /nas/asallard/Switzerland/output_schools/flowchart.json
+working_directory: /nas/kaghog/nfp/cache_output_m11_sat
+flowchart_path: /nas/kaghog/nfp/cache_output_m11_sat/flowchart.json
dryrun: false
# Requested stages
@@ -8,17 +8,19 @@ run:
# - data.statpop.persons
# - data.statpop.projections.households
# - data.statpop.scaled
-# - population.matched
+# - synthesis.population.matched
# - data.microcensus.households
# - data.microcensus.csv
-# - population.destinations
+# - synthesis.population.destinations
# - synthesis.population.spatial.secondary.locations
+# - synthesis.population.spatial.primary.weekend.locations
# - matsim.facilities
# - matsim.population
# - matsim.households
- - matsim.run
- - analysis.analysis
-# - population.output
+# - matsim.run
+ - population.output
+# - analysis.analysis
# These are configuration options that we use in the pipeline
config:
@@ -27,12 +29,14 @@ config:
hot_deck_matching_runners: 2
disable_progress_bar: false
java_memory: 80G
-input_downsampling: 0.25
+input_downsampling: 0.01
enable_scaling: true
scaling_year: 2020
-use_freight: true
+use_freight: false
use_detailed_activities: false
+weekend_scenario: true
+specific_weekend_scenario: saturday
hafas_date: 01.10.2018
data_path: /nas/ivtmatsim/scenarios/switzerland/data
-output_path: /nas/asallard/Switzerland/output_schools
-analysis_path: /nas/asallard/Switzerland/analysis_schools
+output_path: /nas/kaghog/nfp/cache_output_m11_sat
+analysis_path: /nas/kaghog/nfp/cache_output_m11_sat/analysis
\ No newline at end of file
+# General pipeline settings
+working_directory: /nas/kaghog/nfp/cache_output_m11_sun_25p
+flowchart_path: /nas/kaghog/nfp/cache_output_m11_sun_25p/flowchart.json
+dryrun: false
+# Requested stages
+run:
+# - data.statpop.persons
+# - data.statpop.projections.households
+# - data.statpop.scaled
+# - synthesis.population.matched
+# - data.microcensus.households
+# - data.microcensus.csv
+# - population.destinations
+# - synthesis.population.destinations
+# - synthesis.population.spatial.primary.weekend.pry_distance_distributions
+# - synthesis.population.spatial.primary.weekend.work_locations
+# - synthesis.population.spatial.primary.weekend.education_locations
+# - synthesis.population.spatial.secondary.distance_distributions
+# - synthesis.population.spatial.secondary.locations
+# - synthesis.population.spatial.primary.weekend.locations
+# - matsim.facilities
+# - matsim.population
+# - matsim.households
+# - matsim.run
+ - population.output
+# - analysis.analysis
+# These are configuration options that we use in the pipeline
+config:
+threads: 4
+random_seed: 0
+hot_deck_matching_runners: 2
+disable_progress_bar: false
+java_memory: 80G
+input_downsampling: 0.25
+enable_scaling: true
+scaling_year: 2020
+use_freight: false
+use_detailed_activities: false
+weekend_scenario: true
+specific_weekend_scenario: sunday
+hafas_date: 01.10.2018
+data_path: /nas/ivtmatsim/scenarios/switzerland/data
+output_path: /nas/kaghog/nfp/cache_output_m11_sun_25p
+analysis_path: /nas/kaghog/nfp/cache_output_m11_sun_25p/analysis
\ No newline at end of file
@@ -8,6 +8,6 @@ def execute(context):
    df_trips = context.stage("data.microcensus.trips")
    df_transit = context.stage("data.microcensus.transit")
-    df_persons.to_csv("%s/persons.csv" % context.cache_path, sep = ";", index = None)
-    df_trips.to_csv("%s/trips.csv" % context.cache_path, sep = ";", index = None)
-    df_transit.to_csv("%s/transit.csv" % context.cache_path, sep = ";", index = None)
+    df_persons.to_csv("%s/mz_persons.csv" % context.cache_path, sep = ";", index = None)
+    df_trips.to_csv("%s/mz_trips.csv" % context.cache_path, sep = ";", index = None)
+    df_transit.to_csv("%s/mz_transit.csv" % context.cache_path, sep = ";", index = None)
@@ -21,6 +21,7 @@ def validate(context):
def execute(context):
    output_path = context.config("output_path")
+    output_suffix = ""
    # Prepare households
    df_households = context.stage("synthesis.population.enriched").rename(
@@ -67,7 +68,7 @@ def execute(context):
"mz_person_id"
]]
df_persons.to_csv("%s/persons.csv" % output_path, sep = ";", index = None)
df_persons.to_csv(f"%s/persons{output_suffix}.csv" % output_path, sep = ";", index = None)
    # Prepare activities
    df_activities = context.stage("synthesis.population.activities").rename(
@@ -87,7 +88,7 @@ def execute(context):
    #df_activities = df_activities.astype({"is_last": int})
-    df_activities.to_csv("%s/activities.csv" % output_path, sep = ";", index = None)
+    df_activities.to_csv(f"%s/activities{output_suffix}.csv" % output_path, sep = ";", index = None)
    # Prepare trips
    df_trips = context.stage("synthesis.population.trips").rename(
@@ -112,7 +113,7 @@ def execute(context):
#"is_first", "is_last"
]]
df_trips.to_csv("%s/trips.csv" % output_path, sep = ";", index = None)
df_trips.to_csv(f"%s/trips{output_suffix}.csv" % output_path, sep = ";", index = None)
# Prepare spatial data sets
df_locations = context.stage("synthesis.population.spatial.locations")#[[
@@ -168,7 +169,7 @@ def execute(context):
    geom = df_spatial["geometry"].values
    df_spatial["crowfly_distance"] = [geom[i].length for i in range(len(geom))]
    df_spatial = df_spatial.drop(columns = ["geometry"])
-    df_spatial.to_csv("%s/trips_with_distance.csv" % output_path, sep = ";", index = None)
+    df_spatial.to_csv(f"%s/trips_with_distance{output_suffix}.csv" % output_path, sep = ";", index = None)
    # Write meta information
    information = dict(
......
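Note on the suffix pattern introduced above: the f-string interpolates output_suffix when the literal is built and leaves the %s placeholder for the subsequent %-formatting with output_path. A minimal sketch with hypothetical values:

# Illustration only; "/tmp/out" and "_sat" are hypothetical values.
output_path = "/tmp/out"
output_suffix = "_sat"
path = f"%s/trips_with_distance{output_suffix}.csv" % output_path
assert path == "/tmp/out/trips_with_distance_sat.csv"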
def configure(context):
    context.stage("synthesis.population.spatial.primary.work.locations")
    context.stage("synthesis.population.spatial.primary.education.locations")
+    context.stage("synthesis.population.spatial.primary.weekend.locations")
+    context.config("weekend_scenario", False)
def execute(context):
-    df_work = context.stage("synthesis.population.spatial.primary.work.locations")
-    df_education = context.stage("synthesis.population.spatial.primary.education.locations")
+    # For the weekend scenario, the weekend stage provides work and education
+    # locations as a single tuple; otherwise the weekday stages are used.
+    is_weekend_scenario = context.config("weekend_scenario")
+    if is_weekend_scenario:
+        df_work, df_education = context.stage("synthesis.population.spatial.primary.weekend.locations")
+    else:
+        df_work = context.stage("synthesis.population.spatial.primary.work.locations")
+        df_education = context.stage("synthesis.population.spatial.primary.education.locations")
    return df_work, df_education
def configure(context):
    context.stage("synthesis.population.spatial.primary.education.locations")

def execute(context):
    df_education = context.stage("synthesis.population.spatial.primary.education.locations")
    return df_education
import math
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry as geo
from sklearn.neighbors import KDTree
import data.spatial.utils as spatial_utils
import matplotlib.pyplot as plt
def configure(context):
    context.stage("synthesis.population.trips")
    context.stage("synthesis.population.sampled")
    context.stage("synthesis.population.spatial.home.locations")
    context.stage("synthesis.population.spatial.primary.weekend.pry_distance_distributions")
    context.stage("synthesis.population.destinations")
    context.stage("synthesis.population.enriched")
    context.config("random_seed")
    context.config("threads")
    context.config("output_path")
def prepare_destinations(context):
    # ToDo: may need to filter out the right education facilities for the weekend
    return
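The ToDo above is still open. A minimal sketch of what such a filter could look like, assuming a hypothetical facility-type column on the destinations frame (only offers_education is confirmed by the code below):

def prepare_destinations(context):
    df_destinations = context.stage("synthesis.population.destinations")
    df_candidates = df_destinations[df_destinations["offers_education"]].copy()
    # Hypothetical refinement: drop facility types unlikely to operate on
    # weekends; "education_type" is an assumed column, not confirmed upstream.
    # df_candidates = df_candidates[df_candidates["education_type"] != "kindergarten"]
    return df_candidates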
def impute_locations(context):
    # Prepare persons
    df_persons = context.stage("synthesis.population.enriched")
    df_trips = context.stage("synthesis.population.trips")  # .sort_values(by=["person_id", "trip_index"])
    df_trips_education = df_trips[df_trips["following_purpose"] == "education"].copy()
    df_syn_persons_education = df_persons[df_persons["person_id"].isin(df_trips_education["person_id"].unique())]

    # Prepare home coordinates
    home_coordinates = np.vstack([df_syn_persons_education["home_x"], df_syn_persons_education["home_y"]]).T

    # Prepare destinations
    df_destinations = context.stage("synthesis.population.destinations")
    df_candidates = df_destinations[df_destinations["offers_education"]].copy()
    education_coordinates = np.vstack(df_candidates["geometry"].apply(lambda x: np.array([x.x, x.y])).values)

    # Prepare the search radii by sampling from the CDF of the observed distances
    distributions = context.stage("synthesis.population.spatial.primary.weekend.pry_distance_distributions")
    cdf = distributions["education"]["cdf"]
    midpoint_bins = distributions["education"]["midpoint_bins"]
    random_values = np.random.rand(len(df_syn_persons_education))
    value_bins = np.searchsorted(cdf, random_values)
    radius = midpoint_bins[value_bins]
    #plt.hist(radius, bins=200)
    #plt.savefig("%s/my_dist_hist_selected.png" % context.cache_path)

    # Query facilities within each person's sampled radius
    tree = KDTree(education_coordinates)
    indices, distances = tree.query_radius(home_coordinates, r=radius, return_distance=True, sort_results=True)
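    # query_radius returns, for every person, the indices of all facilities
    # within their sampled radius plus the matching distances, sorted by
    # distance (sort_results=True); the result is ragged, one array per person.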
    # When no facility is found within the radius, fall back to a nearest-neighbour query
    count = 0
    for i in range(len(indices)):
        l = indices[i]
        if len(l) == 0:
            count = count + 1
            dist, ind = tree.query(np.array(home_coordinates[i]).reshape(1, -1), 2, return_distance=True,
                                   sort_results=True)
            fac = ind[0][1]
            indices[i] = [fac]
            distances[i] = [dist[0][1]]

    # Report the share of persons for whom no facility was found within the radius
    print("INFO: share of persons with no education facility found within the radius: ", (100 * count / len(indices)))
    print("INFO: assigning the nearest education facilities to these persons")

    # For each person, select the last facility, i.e. the one whose distance is
    # closest to the sampled radius (results are sorted by distance)
    discrete_indices = [l[-1] for l in indices]
    discrete_d = [d[-1] for d in distances]
    discrete_d = np.array(discrete_d)

    # Plot the resulting distance distribution for smaller distances
    plt.hist(discrete_d[discrete_d < 25000], bins=200)
    plt.savefig("%s/my_dist_hist_outputQuery.png" % context.cache_path)
print("INFO: imputing education locations...")
df_education_persons = df_syn_persons_education.copy()
df_education_persons["education_x"] = df_candidates.iloc[discrete_indices]["destination_x"].values
df_education_persons["education_y"] = df_candidates.iloc[discrete_indices]["destination_y"].values
df_education_persons["destination_id"] = df_candidates.iloc[discrete_indices]["destination_id"].values
df_education_persons["distance"] = np.sqrt(
(df_education_persons["home_x"] - df_education_persons["education_x"]) ** 2 +
(df_education_persons["home_y"] - df_education_persons["education_y"]) ** 2
)
#plt.hist(df_education_persons["distance"], bins = 200)
#plt.savefig("%s/my_dist_hist_persons_imputedQuery.png" % context.cache_path)
#df_education_persons.loc[:,"distance_km"] = 0.001 * df_education_persons["distance"]
#df_test = df_education_persons[df_education_persons["distance_km"] < 17].copy()
#plt.hist(df_test["distance_km"], bins = 200)
#plt.savefig("%s/my_dist_hist_persons_imputed_25km.png" % context.cache_path)
df_education_persons = df_education_persons[["person_id",
"education_x", "education_y",
"destination_id"]].rename({"education_x": "x",
"education_y": "y"},
axis=1)
df_education_persons = spatial_utils.to_gpd(context, df_education_persons, coord_type="education")
return df_education_persons[["person_id", "destination_id", "geometry"]]
def execute(context):
    df_locations = impute_locations(context)
    return df_locations
# ToDo: assumptions made and not yet addressed:
# - the type of facility is not considered for weekend education, so someone might be assigned to a kindergarten
# - age is not considered for weekend education either
# - some locations may exceed their capacity, since capacity is not taken into account
\ No newline at end of file
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry as geo
def configure(context):
    context.stage("synthesis.population.spatial.primary.weekend.work_locations")
    context.stage("synthesis.population.spatial.primary.weekend.education_locations")

def execute(context):
    df_work = context.stage("synthesis.population.spatial.primary.weekend.work_locations")
    df_education = context.stage("synthesis.population.spatial.primary.weekend.education_locations")
    return df_work, df_education
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def configure(context):
    context.stage("data.microcensus.persons")
    context.stage("data.microcensus.trips")
def compute_cdf(context, df, bin_size=100):
    # A default bin size of 100 was chosen after inspecting the weekend distance
    # distributions for work and education: it is the largest bin count that leaves
    # a reasonably small number of empty bins for both.
    #df = df[df["distance"] <= 25000]
    #df["distance_km"] = 0.001*df["distance"]
    #plt.hist(df["distance"], weights=df["weight"], bins=200)
    #plt.savefig("%s/my_dist_hist_raw.png" % context.cache_path)
    print("stats: ", df["distance"].describe())
    print("INFO: distance distribution truncated at the 95th percentile: ", df["distance"].quantile(0.95))

    # For work, the 90th percentile of the distance is 18627 m, the 95th is 27045.02 m
    # and the 97th is 36726.34 m, but there are outliers of up to 235 km that throw off
    # the distance sampling: with a 200-bin histogram most distances end up in a single
    # bin. The distribution is stable from the 95th percentile downwards, and 0.95
    # works for both education and work, so truncate there.
    quant = df["distance"].quantile(0.95)
    df_quant = df[df["distance"] <= quant]
    plt.hist(df_quant["distance"], weights=df_quant["weight"], bins=bin_size)
    plt.savefig("%s/my_dist_distribution_raw.png" % context.cache_path)

    hist_vals, bins_vals = np.histogram(df_quant["distance"], weights=df_quant["weight"], bins=bin_size)
    histbin_midpoints = bins_vals[:-1] + np.diff(bins_vals) / 2
    cdf = np.cumsum(hist_vals)
    cdf = cdf / cdf[-1]

    # The threshold buffer (half a bin width) is used to create a maximum radius
    # boundary when sampling distances.
    threshold_buffer = np.diff(bins_vals) / 2
    threshold_buffer = threshold_buffer[0]
    # print("mean of hist_midpoint: ", np.mean(histbin_midpoints))
    # print("this is max of hist_midpoint: ", np.max(histbin_midpoints))
    # print("this is min of hist_midpoint: ", np.min(histbin_midpoints))
    # print("this is std of hist_midpoint: ", np.std(histbin_midpoints))
    #
    # midpoint_km = np.true_divide(histbin_midpoints, 1000)
    # midpoint_dig = np.digitize(histbin_midpoints, bins_vals)
    #
    # plt.hist(midpoint_dig, bins = 200)
    # plt.savefig("%s/my_dist_hist_midpoints2.png" % context.cache_path)
    return cdf, histbin_midpoints, threshold_buffer
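A toy check of the construction above (illustrative numbers only, not pipeline data):

import numpy as np

hist_vals = np.array([6.0, 3.0, 1.0])                # weighted counts per bin
bins_vals = np.array([0.0, 100.0, 200.0, 300.0])     # bin edges in metres
midpoints = bins_vals[:-1] + np.diff(bins_vals) / 2  # [ 50., 150., 250.]
cdf = np.cumsum(hist_vals) / np.sum(hist_vals)       # [0.6, 0.9, 1. ]
# threshold_buffer would be 50.0 here (half a bin width): adding it to a
# sampled midpoint caps the search radius at the bin's upper edge.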
def execute(context):
    # Prepare data
    df_persons = context.stage("data.microcensus.persons")[["person_id", "person_weight", "weekend"]].rename(
        columns={"person_weight": "weight"})

    # Keep only persons travelling on the weekend.
    # ToDo: confirm the assumption that Saturday and Sunday share the same distance
    # distribution for work and education trips, which differs from the weekday one.
    df_persons = df_persons[df_persons["weekend"]].copy()
    df_trips = context.stage("data.microcensus.trips")[["person_id", "trip_id", "mode", "crowfly_distance",
                                                        "departure_time", "arrival_time", "purpose"]]
    df_trips = pd.merge(df_trips, df_persons[["person_id", "weight"]], on="person_id")

    # Keep only trips with distances greater than zero
    df_trips = df_trips[df_trips["crowfly_distance"] > 0.0]

    # Separate trip purpose into origin and destination
    df_trips["following_purpose"] = df_trips["purpose"]
    df_trips["preceding_purpose"] = df_trips["purpose"].shift(1)
    df_trips.loc[df_trips["trip_id"] == 1, "preceding_purpose"] = "home"

    # Filter for work and education only and calculate the distributions
    bin_size_work = 200
    bin_size_edu = 80
    distributions = {}

    # Extract work distances, using the person weights and the crowfly distance of
    # all trips that end at work.
    # ToDo: take distances from the commute distances already provided in mz.commute.
    # ToDo: potentially remove work-work trips so their distances are not part of the
    # distribution; there is no integrity for home-work trips as some trips are
    # work-work, and a solution would be to remove all trips that break it.
    df_trips_work = df_trips[df_trips["purpose"] == "work"].copy()
    df = df_trips_work[["crowfly_distance", "weight"]].rename(columns={"crowfly_distance": "distance"})
    work_cdf, work_midpoint_bin_distances, work_threshold_buffer = compute_cdf(context, df, bin_size=bin_size_work)

    # Write distribution for work
    distributions["work"] = dict(cdf=work_cdf, midpoint_bins=work_midpoint_bin_distances, threshold_buffer=work_threshold_buffer)

    # Extract education distances
    df_trips_edu = df_trips[df_trips["purpose"] == "education"].copy()
    df_edu = df_trips_edu[["crowfly_distance", "weight"]].rename(columns={"crowfly_distance": "distance"})
    edu_cdf, edu_midpoint_bin_distances, edu_threshold_buffer = compute_cdf(context, df_edu, bin_size=bin_size_edu)

    # Write distribution for education
    distributions["education"] = dict(cdf=edu_cdf, midpoint_bins=edu_midpoint_bin_distances, threshold_buffer=edu_threshold_buffer)
    return distributions
import math
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry as geo
from sklearn.neighbors import KDTree
import data.spatial.utils as spatial_utils
import matplotlib.pyplot as plt
def configure(context):
    context.stage("synthesis.population.trips")
    context.stage("synthesis.population.sampled")
    context.stage("synthesis.population.spatial.home.locations")
    context.stage("synthesis.population.spatial.primary.weekend.pry_distance_distributions")
    context.stage("synthesis.population.destinations")
    context.stage("synthesis.population.enriched")
    context.config("random_seed")
    context.config("threads")
    context.config("output_path")
def prepare_home_locations(context):
    # Load persons and their primary locations
    df_home = context.stage("synthesis.population.spatial.home.locations")
    df_home = df_home.rename(columns={"geometry": "home"})
    df_locations = context.stage("synthesis.population.sampled")[["person_id", "household_id"]]
    df_locations = pd.merge(df_locations, df_home[["household_id", "home"]], how="left", on="household_id")
    return df_locations[["person_id", "home"]].sort_values(by="person_id")
def prepare_destinations(context):
    df_destinations = context.stage("synthesis.population.destinations")
    M = np.max(df_destinations["destination_id"].values.tolist()) + 1

    data = {}
    identifiers = df_destinations["destination_id"].values
    locations = np.vstack(df_destinations["geometry"].apply(lambda x: np.array([x.x, x.y])).values)
    no_employees = df_destinations["number_employees"].values
    for purpose in ("work", "education"):
        f = df_destinations["offers_%s" % purpose].values
        data[purpose] = dict(
            identifiers=identifiers[f],
            locations=locations[f],
            no_employees=no_employees[f]
        )
        print("Number of statent facilities for %s: %d" % (purpose, len(identifiers[f])))
    return data, df_destinations
def prepare_radius_from_cdf(cdf, midpoint_bins, random_values):
    # random_values are uniform draws, one per synthetic person, used to sample
    # a search radius for each person via the inverse CDF; locations are sampled
    # per person, not per trip.
    value_bins = np.searchsorted(cdf, random_values)
    radius_from_cdf = midpoint_bins[value_bins]
    return radius_from_cdf
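A minimal usage sketch with a toy three-bin distribution (illustrative values, not pipeline output):

cdf = np.array([0.2, 0.7, 1.0])                    # cumulative probabilities
midpoint_bins = np.array([500.0, 1500.0, 2500.0])  # bin midpoints in metres
radii = prepare_radius_from_cdf(cdf, midpoint_bins, np.random.rand(5))
# Each radius is a bin midpoint, drawn with probability equal to that
# bin's share of the CDF (0.2, 0.5 and 0.3 here).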
def find_locations(home_coordinates, destination_coordinates, radius):
    tree = KDTree(destination_coordinates)
    indices, distances = tree.query_radius(home_coordinates, r=radius, return_distance=True, sort_results=True)

    # When no facility is found within the radius, fall back to a nearest-neighbour query
    for i in range(len(indices)):
        l = indices[i]
        if len(l) == 0:
            dist, ind = tree.query(np.array(home_coordinates[i]).reshape(1, -1), 2, return_distance=True, sort_results=True)
            fac = ind[0][1]
            indices[i] = [fac]
            distances[i] = [dist[0][1]]
    return indices, distances
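A minimal usage sketch of find_locations, assuming projected metric coordinates (toy values):

homes = np.array([[0.0, 0.0], [5000.0, 5000.0]])
facilities = np.array([[100.0, 0.0], [200.0, 50.0], [9000.0, 9000.0]])
radii = np.array([150.0, 10.0])  # the second radius contains no facility
indices, distances = find_locations(homes, facilities, radii)
# indices[0] holds facility 0 (100 m away, within 150 m); the empty result for
# the second home is filled through the k=2 nearest-neighbour fallback above.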
def impute_nearest_work_locations(context):
    # Prepare work persons
    df_persons = context.stage("synthesis.population.enriched")
    df_trips = context.stage("synthesis.population.trips")  # .sort_values(by=["person_id", "trip_index"])
    df_trips_work = df_trips[df_trips["following_purpose"] == "work"].copy()
    #df_syn_persons_work = df_persons.copy()
    df_syn_persons_work = df_persons[df_persons["person_id"].isin(df_trips_work["person_id"].unique())]

    # Prepare home coordinates
    home_coordinates = np.vstack([df_syn_persons_work["home_x"], df_syn_persons_work["home_y"]]).T

    # Prepare work destinations
    df_destinations = context.stage("synthesis.population.destinations")
    df_candidates = df_destinations[df_destinations["offers_work"]].copy()
    work_coordinates = np.vstack(df_candidates["geometry"].apply(lambda x: np.array([x.x, x.y])).values)

    # Query the 10 nearest work facilities for each home and select one at random
    query_size = 10
    tree = KDTree(work_coordinates)
    distances, indices = tree.query(home_coordinates, query_size, return_distance=True)
    selector = np.random.randint(query_size, size=(indices.shape[0],))
    indices = np.choose(selector, indices.T)
    distances = np.choose(selector, distances.T)
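    # np.choose(selector, indices.T) picks indices[j, selector[j]] for each
    # person j, i.e. one uniformly random facility among their 10 nearest.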
df_syn_persons_work["work_x"] = df_candidates.iloc[indices]["destination_x"].values
df_syn_persons_work["work_y"] = df_candidates.iloc[indices]["destination_y"].values
df_syn_persons_work["destination_id"] = df_candidates.iloc[indices]["destination_id"].values
df_syn_persons_work["distance"] = np.sqrt(
(df_syn_persons_work["home_x"] - df_syn_persons_work["work_x"]) ** 2 +
(df_syn_persons_work["home_y"] - df_syn_persons_work["work_y"]) ** 2
)
    # Plots to check the sampled distance distribution
    # plt.hist(distances, bins = 200)
    # plt.savefig("%s/my_raw_sampled_distances.png" % context.cache_path)
    #
    # plt.hist(df_syn_persons_work["distance"], bins = 200)
    # plt.savefig("%s/my_persons_imputed_distances_m.png" % context.cache_path)
    #
    # df_syn_persons_work.loc[:, "distance_km"] = df_syn_persons_work["distance"]/1000