Commit 8bf46d66 authored by kaghog's avatar kaghog
Browse files

setup location assignment for education

fix bug in distance distribution for secondary activities
parent 1cd865bf
......@@ -6,4 +6,138 @@ def execute(context):
df_education = context.stage("synthesis.population.spatial.primary.education.locations")
return df_education
# TODO: check whether this needs to be modified for weekends, since the number of education trips is low anyway
\ No newline at end of file
import math
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry as geo
from sklearn.neighbors import KDTree
import data.spatial.utils as spatial_utils
import matplotlib.pyplot as plt
def configure(context):
    """Declare the upstream stages and configuration options of this stage.

    Every stage and option is requested in a fixed order; nothing is returned.
    """
    required_stages = (
        "synthesis.population.trips",
        "synthesis.population.sampled",
        "synthesis.population.spatial.home.locations",
        "synthesis.population.spatial.primary.weekend.pry_distance_distributions",
        "synthesis.population.destinations",
        "synthesis.population.enriched",
    )
    required_options = ("random_seed", "threads", "output_path")

    for stage_name in required_stages:
        context.stage(stage_name)

    for option_name in required_options:
        context.config(option_name)
def prepare_destinations(context):
    """Placeholder for destination preparation; currently does nothing.

    The ``context`` argument is accepted but not used, and ``None`` is returned.
    """
    # TODO: may need to filter out the right education facilities for the weekend
    return None
def impute_locations(context):
    """Assign one education facility to every person who has an education trip.

    For each such person a search radius is drawn from the education distance
    distribution (inverse-CDF sampling over the distribution's bins). Among
    all destinations that offer education, the facility farthest from home
    but still within that radius is selected. Persons with no facility inside
    their radius receive the facility nearest to their home instead.

    A histogram of the selected distances below 25000 (in coordinate units)
    is written to the stage cache path as a diagnostic.

    Returns the frame produced by ``spatial_utils.to_gpd`` restricted to the
    columns ``person_id``, ``destination_id`` and ``geometry``.
    """
    # Prepare persons: keep only those with at least one trip towards education
    df_persons = context.stage("synthesis.population.enriched")
    df_trips = context.stage("synthesis.population.trips")
    education_person_ids = df_trips.loc[
        df_trips["following_purpose"] == "education", "person_id"
    ].unique()
    df_syn_persons_education = df_persons[df_persons["person_id"].isin(education_person_ids)]

    # Prepare home coordinates as an (n_persons, 2) array
    home_coordinates = np.vstack([
        df_syn_persons_education["home_x"],
        df_syn_persons_education["home_y"],
    ]).T

    # Prepare destinations; row order matches df_candidates, which is relied
    # upon below when mapping tree indices back via ``iloc``
    df_destinations = context.stage("synthesis.population.destinations")
    df_candidates = df_destinations[df_destinations["offers_education"]].copy()
    education_coordinates = np.vstack(
        df_candidates["geometry"].apply(lambda point: np.array([point.x, point.y])).values
    )

    # Sample one radius per person from the distance distribution.
    # NOTE(review): assumes "cdf" and "midpoint_bins" are aligned 1-D arrays
    # of equal length - confirm against the distribution stage.
    distributions = context.stage(
        "synthesis.population.spatial.primary.weekend.pry_distance_distributions")
    cdf = distributions["education"]["cdf"]
    midpoint_bins = np.asarray(distributions["education"]["midpoint_bins"])

    # Seed from the pipeline configuration so that repeated runs give the
    # same assignment (an unseeded global RNG made the stage irreproducible)
    random = np.random.RandomState(context.config("random_seed"))
    random_values = random.random_sample(len(df_syn_persons_education))

    # searchsorted returns len(cdf) for a draw above the last CDF value
    # (possible when the CDF ends marginally below 1); clamp to the last bin
    value_bins = np.minimum(np.searchsorted(cdf, random_values), len(midpoint_bins) - 1)
    radius = midpoint_bins[value_bins]

    # Find all facilities within each person's radius, sorted by distance
    tree = KDTree(education_coordinates)
    indices, distances = tree.query_radius(
        home_coordinates, r=radius, return_distance=True, sort_results=True)

    # Select the last (i.e. farthest within radius) facility for each person
    # and remember the persons for whom no facility was found
    selected_indices = np.empty(len(indices), dtype=int)
    selected_distances = np.empty(len(indices), dtype=float)
    missing = []

    for person, (found, found_distances) in enumerate(zip(indices, distances)):
        if len(found) == 0:
            missing.append(person)
        else:
            selected_indices[person] = found[-1]
            selected_distances[person] = found_distances[-1]

    # Report % of observations where facilities were not found
    share_missing = 100 * len(missing) / len(indices) if len(indices) > 0 else 0.0
    print("INFO: % of persons with education facilities not found are: ", share_missing)
    print("INFO: getting nearest education facilities for these persons")

    if missing:
        # Homes are not part of the tree, so the first neighbour is already
        # the nearest facility; one batched query covers all such persons
        nearest_distances, nearest_indices = tree.query(
            home_coordinates[missing], k=1, return_distance=True)
        selected_indices[missing] = nearest_indices[:, 0]
        selected_distances[missing] = nearest_distances[:, 0]

    # Diagnostic plot of the resulting distance distribution for smaller
    # distances; use a dedicated figure and close it so nothing leaks into
    # (or from) plots made elsewhere in the process
    figure = plt.figure()
    plt.hist(selected_distances[selected_distances < 25000], bins=200)
    plt.savefig("%s/my_dist_hist_outputQuery.png" % context.cache_path)
    plt.close(figure)

    print("INFO: imputing education locations...")
    df_selected = df_candidates.iloc[selected_indices]

    # Keep the persons' original index; column order is person_id, x, y,
    # destination_id as expected by the conversion below
    df_education_persons = df_syn_persons_education[["person_id"]].copy()
    df_education_persons["x"] = df_selected["destination_x"].values
    df_education_persons["y"] = df_selected["destination_y"].values
    df_education_persons["destination_id"] = df_selected["destination_id"].values

    df_education_persons = spatial_utils.to_gpd(
        context, df_education_persons, coord_type="education")

    return df_education_persons[["person_id", "destination_id", "geometry"]]
def execute(context):
    """Run the stage and return the imputed education locations."""
    return impute_locations(context)
# TODO: known simplifications and assumptions:
# - The type of facility is not considered for weekend education, so someone might be sent to a kindergarten.
# - Age is not considered either for these weekend education trips.
# - Facility capacity is not taken into consideration, so some locations may end up over capacity.
\ No newline at end of file
......@@ -36,7 +36,7 @@ def calculate_bounds(values, bin_size):
def execute(context):
# Prepare data
df_persons = context.stage("data.microcensus.persons")[["person_id", "person_weight"]].rename(
df_persons = context.stage("data.microcensus.persons")[["person_id", "person_weight", "weekend"]].rename(
columns={"person_weight": "weight"})
#distance distribution for weekend scenario
......@@ -61,6 +61,7 @@ def execute(context):
df_trips = df_trips[~(df_trips["preceding_purpose"].isin(primary_activities)
& df_trips["following_purpose"].isin(primary_activities))]
# Rename columns
distance_column = "crowfly_distance" if "crowfly_distance" in df_trips else "network_distance"
df = df_trips[["mode", "travel_time", distance_column, "weight"]].rename(columns={distance_column: "distance"})
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment