Commit c970730c authored by tchervec

Resolve "Update python package versions"

parent 2a3d5016
**3.0.0**
- Update python requirements
- Update statistical matching to the most recent version
- Duplicate agents after IPU using the Truncate-Replicate-Sample method (sketched below)
- Use WMAPE and WMAE as IPU convergence criteria (sketched below)
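A hedged sketch of the two techniques named above, as they are commonly defined in population synthesis; the actual implementation lives in the statistical matching / IPU stages of this repository and may differ in detail (function names and signatures here are illustrative only):

```python
import numpy as np

def truncate_replicate_sample(weights, random_state=None):
    """Integerise fractional IPU weights: replicate each agent by the integer
    part of its weight, then add one extra copy with probability equal to the
    fractional remainder (Truncate-Replicate-Sample)."""
    weights = np.asarray(weights, dtype=float)
    rng = np.random.RandomState(random_state)
    integer_part = np.floor(weights).astype(int)
    remainder = weights - integer_part
    extra = rng.random_sample(len(weights)) < remainder
    return integer_part + extra.astype(int)

def wmape(observed, fitted):
    """Weighted mean absolute percentage error between control totals and
    fitted totals."""
    observed = np.asarray(observed, dtype=float)
    fitted = np.asarray(fitted, dtype=float)
    return np.sum(np.abs(observed - fitted)) / np.sum(np.abs(observed))

def wmae(observed, fitted, weights):
    """Weighted mean absolute error, weighting each control by `weights`."""
    observed = np.asarray(observed, dtype=float)
    fitted = np.asarray(fitted, dtype=float)
    weights = np.asarray(weights, dtype=float)
    return np.sum(weights * np.abs(observed - fitted)) / np.sum(weights)
```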
......
......@@ -3,11 +3,11 @@ import pyproj
# TODO: Pandas is quite good at working with categorical data. Refactor everything to make use of that.
# It will not only be more readable but will also bring a speedup!
CH1903 = pyproj.Proj("epsg:21781")
CH1903 = "epsg:21781"
LV05 = CH1903
CH1903_PLUS = pyproj.Proj("epsg:2056")
CH1903_PLUS = "epsg:2056"
LV95 = CH1903_PLUS
WGS84 = pyproj.Proj("epsg:4326")
WGS84 = "epsg:4326"
MAXIMUM_HOUSEHOLD_SIZE = 12
MINIMUM_AGE_PER_HOUSEHOLD = 16
......
......@@ -42,7 +42,8 @@ def execute(context):
# Convert coordinates to LV95
coords = df_mz_households[["W_X_CH1903", "W_Y_CH1903"]].values
x, y = pyproj.transform(c.CH1903, c.CH1903_PLUS, coords[:, 0], coords[:, 1])
transformer = pyproj.Transformer.from_crs(c.CH1903, c.CH1903_PLUS)
x, y = transformer.transform(coords[:, 0], coords[:, 1])
df_mz_households.loc[:, "home_x"] = x
df_mz_households.loc[:, "home_y"] = y
......
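Context for the coordinate changes above and below: pyproj 3 (added to the environment in this commit) deprecates `pyproj.transform`, so the CRS constants become plain EPSG strings and a reusable `Transformer` is built instead. A minimal, self-contained sketch of the new call pattern (sample coordinates are illustrative, not from the data):

```python
import numpy as np
import pyproj

CH1903 = "epsg:21781"       # LV03
CH1903_PLUS = "epsg:2056"   # LV95

# Build the transformer once and reuse it; pyproj.transform() is deprecated.
# Both CRSs here are projected (easting/northing); when a geographic CRS such
# as WGS84 is involved, always_xy=True keeps the (x, y) axis order.
transformer = pyproj.Transformer.from_crs(CH1903, CH1903_PLUS)

x_lv03 = np.array([600000.0, 683258.0])
y_lv03 = np.array([200000.0, 248155.0])
x_lv95, y_lv95 = transformer.transform(x_lv03, y_lv03)
```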
......@@ -88,7 +88,8 @@ def execute(context):
# Adjust coordinates
for mz_attribute, df_attribute in [("Z", "destination"), ("S", "origin"), ("W", "home")]:
coords = df_mz_trips[["%s_X_CH1903" % mz_attribute, "%s_Y_CH1903" % mz_attribute]].values
x, y = pyproj.transform(c.CH1903, c.CH1903_PLUS, coords[:,0], coords[:,1])
transformer = pyproj.Transformer.from_crs(c.CH1903, c.CH1903_PLUS)
x, y = transformer.transform(coords[:, 0], coords[:, 1])
df_mz_trips.loc[:, "%s_x" % df_attribute] = x
df_mz_trips.loc[:, "%s_y" % df_attribute] = y
......
......@@ -2,7 +2,6 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
import data.spatial.utils
def configure(context):
......@@ -37,7 +36,7 @@ def execute(context):
"%s/%s" % (data_path, shapefile),
encoding="latin1"
).to_crs("epsg:2056")
df.crs = "epsg:2056"
df.loc[:, "municipality_id"] = df[id_field]
......
......@@ -8,16 +8,17 @@ def configure(context):
context.config("data_path")
context.stage("data.spatial.municipalities")
def execute(context):
# Load data
data_path = context.config("data_path")
df_types = pd.read_excel("%s/spatial_structure_2018.xlsx" % data_path,
names=["municipality_id", "TYP"],
usecols=[0, 21],
skiprows=6,
nrows=2229,
)
names=["municipality_id", "TYP"],
usecols=[0, 21],
skiprows=6,
nrows=2229,
)
df_municipalities = context.stage("data.spatial.municipalities")[0]
# Rewrite classification
......@@ -35,14 +36,14 @@ def execute(context):
df_types = df_types[["municipality_id", "municipality_type"]]
# Match by municipality_id
df_existing = pd.merge(df_municipalities, df_types, on = "municipality_id")
df_existing = pd.merge(df_municipalities, df_types, on="municipality_id")
df_existing["imputed_municipality_type"] = False
df_existing = df_existing[["municipality_id", "municipality_type", "imputed_municipality_type", "geometry"]]
# Some ids are missing (because they are special zones)
df_missing = gpd.GeoDataFrame(df_municipalities[
~df_municipalities["municipality_id"].isin(df_existing["municipality_id"])
])
~df_municipalities["municipality_id"].isin(df_existing["municipality_id"])
])
df_missing.crs = df_municipalities.crs
df_missing = df_missing[["municipality_id", "geometry"]]
......@@ -51,7 +52,7 @@ def execute(context):
kd_tree = KDTree(coordinates)
coordinates = np.vstack([df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y]).T
indices = kd_tree.query(coordinates, return_distance = False).flatten()
indices = kd_tree.query(coordinates, return_distance=False).flatten()
df_missing.loc[:, "municipality_type"] = df_existing.iloc[indices]["municipality_type"].values
df_missing.loc[:, "imputed_municipality_type"] = True
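The hunk above imputes missing municipality types from the nearest municipality with a known type. A self-contained sketch of that pattern with made-up data (column names mirror the diff):

```python
import pandas as pd
from sklearn.neighbors import KDTree

df_known = pd.DataFrame({
    "x": [0.0, 10.0, 20.0],
    "y": [0.0, 0.0, 0.0],
    "municipality_type": ["urban", "suburban", "rural"],
})
df_unknown = pd.DataFrame({"x": [2.0, 18.0], "y": [1.0, -1.0]})

# Index the known centroids, then look up the nearest one for each unknown.
kd_tree = KDTree(df_known[["x", "y"]].values)
indices = kd_tree.query(df_unknown[["x", "y"]].values, return_distance=False).flatten()

df_unknown["municipality_type"] = df_known.iloc[indices]["municipality_type"].values
df_unknown["imputed_municipality_type"] = True
```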
......@@ -59,17 +60,18 @@ def execute(context):
df_mapping = pd.concat((df_existing, df_missing))
assert(len(df_mapping) == len(df_municipalities))
assert(set(np.unique(df_mapping["municipality_id"])) == set(np.unique(df_municipalities["municipality_id"])))
assert (len(df_mapping) == len(df_municipalities))
assert (set(np.unique(df_mapping["municipality_id"])) == set(np.unique(df_municipalities["municipality_id"])))
df_mapping = pd.DataFrame(df_mapping[["municipality_id", "municipality_type", "imputed_municipality_type"]])
df_mapping["municipality_type"] = df_mapping["municipality_type"].astype("category")
return df_mapping
def impute(df, df_municipality_types, remove_unknown = False):
assert("municipality_id" in df.columns)
df = pd.merge(df, df_municipality_types, on = "municipality_id")
def impute(df, df_municipality_types, remove_unknown=False):
assert ("municipality_id" in df.columns)
df = pd.merge(df, df_municipality_types, on="municipality_id")
if remove_unknown:
return df[~np.isnan(df["municipality_type"])]
......
......@@ -4,6 +4,7 @@ import pandas as pd
import shapely.geometry as geo
from sklearn.neighbors import KDTree
def sample_coordinates(row, count):
samples = []
bounds = row["geometry"].bounds
......
......@@ -7,6 +7,7 @@ import data.constants as c
def configure(context):
context.stage("data.statpop.persons")
def execute(context):
df_statpop = context.stage("data.statpop.persons")
density_coordinates = np.vstack([df_statpop["home_x"], df_statpop["home_y"]]).T
......@@ -14,8 +15,9 @@ def execute(context):
return kd_tree
def impute(kd_tree, df, x = "x", y = "y", radius = c.POPULATION_DENSITY_RADIUS):
def impute(kd_tree, df, x="x", y="y", radius=c.POPULATION_DENSITY_RADIUS):
print("Imputing population density ...")
coordinates = np.vstack([df[x], df[y]]).T
counts = kd_tree.query_radius(coordinates, radius, count_only = True)
df["population_density"] = counts # / (np.pi * c.POPULATION_DENSITY_RADIUS**2)
counts = kd_tree.query_radius(coordinates, radius, count_only=True)
df["population_density"] = counts # / (np.pi * c.POPULATION_DENSITY_RADIUS**2)
......@@ -102,12 +102,9 @@ def execute(context):
df_households = df_households.replace(CANTON_TO_ID_MULTILANGUAGE)
# TODO: why do we only use five categories?
# Make zero-based with only 5 categories
df_households["household_size"] = np.minimum(5, df_households["household_size"]) - 1
df_households = df_households.groupby(["canton_id", "household_size"]).sum().reset_index().sort_values(
["canton_id", "household_size"])
df_households = df_households.rename({"household_size": "household_size_class_projection"}, axis=1)
df_households = df_households.sort_values(["canton_id", "household_size_class_projection"])
# Limit to 5 categories
df_households["household_size"] = np.minimum(5, df_households["household_size"])
df_households = df_households.groupby(["canton_id", "household_size"]).sum().reset_index()
else:
......@@ -117,17 +114,28 @@ def execute(context):
header=[0, 1], skiprows=2, nrows=27, index_col=0).reset_index().rename({
"index": "canton_id",
"Total": "total",
"1 Person": 1,
"2 Personen": 2,
"3 und mehr Personen": 3
"1 Person": "1",
"2 Personen": "2",
"3 und mehr Personen": "3",
2017: "2017",
2045: "2045"
}, axis=1)
# Flatten multi-index columns
df_households.columns = ['_'.join(col).strip("_") for col in df_households.columns.values]
# Convert to long format
df_households = df_households.melt(
id_vars="canton_id", value_vars=[1, 2, 3],
value_name="weight", var_name=["household_size", "year"]
id_vars="canton_id", value_vars=["1_2017", "1_2045", "2_2017", "2_2045", "3_2017", "3_2045"],
value_name="weight", var_name=["household_size_year"]
)
# split and rename columns
temp = df_households["household_size_year"].str.split("_", expand=True)
df_households["household_size"] = temp[0].astype(int)
df_households["year"] = temp[1].astype(int)
df_households = df_households[["canton_id", "household_size", "year", "weight"]]
# Remove Switzerland total
df_households = df_households[df_households["canton_id"] != "Schweiz"]
......@@ -152,10 +160,6 @@ def execute(context):
df_households = df_households[("weight", scaling_year)].reset_index()
df_households.columns = ["canton_id", "household_size", "weight"]
# Make zero-based
df_households["household_size"] -= 1
df_households = df_households.rename({"household_size": "household_size_class_projection"}, axis=1)
# Replace cantons
df_households = df_households.replace(CANTON_TO_ID)
......@@ -163,6 +167,13 @@ def execute(context):
df_households["weight"] = np.round(df_households["weight"])
df_households["weight"] = df_households["weight"].astype(int)
# make size class zero-based
df_households = df_households.rename({"household_size": "household_size_class"}, axis=1)
df_households["household_size_class"] = df_households["household_size_class"] - 1
# sort values
df_households = df_households.sort_values(["canton_id", "household_size_class"])
print(df_households.head())
return df_households, scaling_year
......@@ -36,10 +36,12 @@ def execute(context):
print("Number of households before scaling :", len(df_statpop["household_id"].unique()))
print("Number of persons before scaling :", len(df_statpop["person_id"].unique()))
# rename household_size_class column
df_household_controls = df_household_controls.rename({"household_size_class": "household_size_class_projection"}, axis=1)
# we need to add a new household class column with only as many categories as the controls
number_household_classes = len(df_household_controls["household_size_class_projection"].unique())
df_statpop["household_size_class_projection"] = np.minimum(number_household_classes,
df_statpop["household_size"]) - 1
df_statpop["household_size_class_projection"] = np.minimum(number_household_classes, df_statpop["household_size"]) - 1
# create IPU fitting problem by canton
problems = []
......
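A short sketch of how the observed household sizes are mapped onto the (fewer) control categories above; the class count of 3 is illustrative here, while the actual stage derives it from the household controls:

```python
import numpy as np
import pandas as pd

df_statpop = pd.DataFrame({"household_size": [1, 2, 3, 4, 7]})
number_household_classes = 3  # e.g. 1, 2 and 3+ person households in the projections

# Truncate larger households into the top class and make the class zero-based.
df_statpop["household_size_class_projection"] = (
    np.minimum(number_household_classes, df_statpop["household_size"]) - 1
)
# -> [0, 1, 2, 2, 2]
```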
......@@ -23,29 +23,30 @@ channels:
- defaults
dependencies:
- matplotlib=3.1.3
- pandas=1.0.3
- scipy=1.4.1
- numpy=1.18.1
- matplotlib=3.3.2
- pandas=1.1.3
- scipy=1.5.2
- numpy=1.19.2
- geopandas=0.6.1
- numba=0.49.0
- numba=0.51.2
- palettable=3.3.0
- scikit-learn=0.22.1
- shapely=1.7.0
- tqdm=4.46.0
- scikit-learn=0.23.2
- shapely=1.6.4
- tqdm=4.50.2
- pytables=3.6.1
- xlrd=1.2.0
- pip=20.0.2
- pip=20.2.4
- pip:
- pyproj==3.0.0
- simpledbf==0.2.6
- synpp==1.3.1
- python-Levenshtein==0.12.0
# For testing
- pytest==5.4.2
- pytest==6.1.1
- xlwt==1.3.0
- pysal==1.14.4.post1
- pysal==2.3.0
EOF
)
......
......@@ -4,20 +4,21 @@ import subprocess as sp
def configure(context):
context.stage("utils.java")
def execute(context):
java = context.stage("utils.java")
sp.check_call([
"git", "clone", "https://github.com/eqasim-org/eqasim-java.git"
], cwd = context.cache_path)
], cwd=context.cache_path)
sp.check_call([
"git", "checkout", "v1.0.5"
], cwd = "%s/eqasim-java" % context.cache_path)
], cwd="%s/eqasim-java" % context.cache_path)
sp.check_call([
"mvn", "-Pstandalone", "package"
], cwd = "%s/eqasim-java" % context.cache_path)
], cwd="%s/eqasim-java" % context.cache_path)
jar = "%s/eqasim-java/switzerland/target/switzerland-1.0.5.jar" % context.cache_path
return jar
import os
import subprocess as sp
def configure(context):
context.stage("utils.java")
def execute(context):
java = context.stage("utils.java")
......@@ -11,20 +13,20 @@ def execute(context):
sp.check_call([
"git", "clone", "https://github.com/matsim-org/pt2matsim.git"
], cwd = context.cache_path)
], cwd=context.cache_path)
sp.check_call([
"git", "checkout", "v19.10"
], cwd = "%s/pt2matsim" % context.cache_path)
], cwd="%s/pt2matsim" % context.cache_path)
sp.check_call([
"mvn", "-Djava.io.tmpdir=%s/java_tmp" % context.cache_path, "package"
], cwd = "%s/pt2matsim" % context.cache_path)
], cwd="%s/pt2matsim" % context.cache_path)
jar = "%s/pt2matsim/target/pt2matsim-19.10-shaded.jar" % context.cache_path
java(jar, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", ["test_config.xml"], cwd = context.cache_path)
java(jar, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", ["test_config.xml"], cwd=context.cache_path)
assert(os.path.exists("%s/test_config.xml" % context.cache_path))
assert(os.path.exists("%s/java_tmp/GeoTools" % context.cache_path))
assert (os.path.exists("%s/test_config.xml" % context.cache_path))
assert (os.path.exists("%s/java_tmp/GeoTools" % context.cache_path))
return jar, "%s/java_tmp" % context.cache_path
import numpy as np
import pandas as pd
......@@ -5,16 +6,17 @@ def configure(context):
context.stage("data.microcensus.persons")
context.stage("data.microcensus.trips")
def execute(context):
df_trips = pd.DataFrame(context.stage("data.microcensus.trips"), copy = True)
df_trips = pd.DataFrame(context.stage("data.microcensus.trips"), copy=True)
df_persons = context.stage("data.microcensus.persons")
df_trips = pd.merge(df_trips, df_persons[["person_id", "home_x", "home_y"]], how = "left")
df_trips = pd.merge(df_trips, df_persons[["person_id", "home_x", "home_y"]], how="left")
df_trips.loc[:, "previous_trip_id"] = df_trips.loc[:, "trip_id"] - 1
df_activities = pd.merge(
df_trips, df_trips, left_on = ["person_id", "previous_trip_id"], right_on = ["person_id", "trip_id"],
suffixes = ["_following_trip", "_previous_trip"], how = "left"
df_trips, df_trips, left_on=["person_id", "previous_trip_id"], right_on=["person_id", "trip_id"],
suffixes=["_following_trip", "_previous_trip"], how="left"
)
df_activities.loc[:, "start_time"] = df_activities.loc[:, "arrival_time_previous_trip"]
......@@ -33,7 +35,7 @@ def execute(context):
df_activities.loc[f, "location_y"] = df_activities.loc[f, "home_y_following_trip"]
# We're still missing the last activity in the chain.
df_last = df_activities.sort_values(by = ["person_id", "activity_id"]).groupby("person_id").last().reset_index()
df_last = df_activities.sort_values(by=["person_id", "activity_id"]).groupby("person_id").last().reset_index()
df_last.loc[:, "purpose"] = df_last.loc[:, "purpose_following_trip"]
df_last.loc[:, "start_time"] = df_last.loc[:, "arrival_time_following_trip"]
df_last.loc[:, "end_time"] = np.nan
......@@ -45,7 +47,7 @@ def execute(context):
df_activities = pd.concat([df_activities, df_last])
# Some cleanup
df_activities = df_activities.sort_values(by = ["person_id", "activity_id"])
df_activities = df_activities.sort_values(by=["person_id", "activity_id"])
df_activities.loc[:, "duration"] = df_activities.loc[:, "end_time"] - df_activities.loc[:, "start_time"]
df_activities = df_activities[[
......
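The hunks above rebuild activities by joining the trip table to itself on the previous trip id: each activity starts when the previous trip arrives and ends when the following trip departs. A self-contained sketch of that pattern with made-up data (column names mirror the diff):

```python
import pandas as pd

df_trips = pd.DataFrame({
    "person_id": [1, 1, 1],
    "trip_id": [1, 2, 3],
    "departure_time": [8 * 3600, 12 * 3600, 17 * 3600],
    "arrival_time": [9 * 3600, 13 * 3600, 18 * 3600],
})

# Each trip points to the trip before it in the same person's chain.
df_trips["previous_trip_id"] = df_trips["trip_id"] - 1

df_activities = pd.merge(
    df_trips, df_trips,
    left_on=["person_id", "previous_trip_id"], right_on=["person_id", "trip_id"],
    suffixes=["_following_trip", "_previous_trip"], how="left",
)

# The activity lies between the previous trip's arrival and the following
# trip's departure; the first activity has no previous trip (NaN start).
df_activities["start_time"] = df_activities["arrival_time_previous_trip"]
df_activities["end_time"] = df_activities["departure_time_following_trip"]
df_activities["duration"] = df_activities["end_time"] - df_activities["start_time"]
```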
import os.path
import os
def configure(context):
context.stage("matsim.java.pt2matsim")
context.stage("utils.java")
context.config("data_path")
context.config("hafas_date")
def execute(context):
jar, tmp_path = context.stage("matsim.java.pt2matsim")
......@@ -16,12 +19,12 @@ def execute(context):
"%s/transit_schedule.xml.gz" % context.cache_path,
"%s/transit_vehicles.xml.gz" % context.cache_path,
context.config("hafas_date")
], cwd = context.cache_path, vm_arguments = ["-Djava.io.tmpdir=%s" % tmp_path])
], cwd=context.cache_path, vm_arguments=["-Djava.io.tmpdir=%s" % tmp_path])
assert(os.path.exists("%s/transit_schedule.xml.gz" % context.cache_path))
assert(os.path.exists("%s/transit_vehicles.xml.gz" % context.cache_path))
assert (os.path.exists("%s/transit_schedule.xml.gz" % context.cache_path))
assert (os.path.exists("%s/transit_vehicles.xml.gz" % context.cache_path))
return {
"schedule" : "%s/transit_schedule.xml.gz" % context.cache_path,
"vehicles" : "%s/transit_vehicles.xml.gz" % context.cache_path
"schedule": "%s/transit_schedule.xml.gz" % context.cache_path,
"vehicles": "%s/transit_vehicles.xml.gz" % context.cache_path
}
import os.path
import os
def configure(context):
context.stage("matsim.java.pt2matsim")
context.stage("utils.java")
context.config("data_path")
def execute(context):
jar, tmp_path = context.stage("matsim.java.pt2matsim")
java = context.stage("utils.java")
......@@ -13,7 +15,7 @@ def execute(context):
java(jar, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", [
"convert_network_template.xml"
], cwd = context.cache_path, vm_arguments = ["-Djava.io.tmpdir=%s" % tmp_path])
], cwd=context.cache_path, vm_arguments=["-Djava.io.tmpdir=%s" % tmp_path])
content = open("%s/convert_network_template.xml" % context.cache_path).read()
......@@ -57,7 +59,7 @@ def execute(context):
java(jar, "org.matsim.pt2matsim.run.Osm2MultimodalNetwork", [
"convert_network.xml"
], cwd = context.cache_path, vm_arguments = ["-Djava.io.tmpdir=%s" % tmp_path])
], cwd=context.cache_path, vm_arguments=["-Djava.io.tmpdir=%s" % tmp_path])
assert(os.path.exists("%s/converted_network.xml.gz" % context.cache_path))
assert (os.path.exists("%s/converted_network.xml.gz" % context.cache_path))
return "%s/converted_network.xml.gz" % context.cache_path
import os.path
import os
def configure(context):
context.config("threads")
context.stage("matsim.java.pt2matsim")
context.stage("utils.java")
context.stage("matsim.network.convert_osm")
context.stage("matsim.network.convert_hafas")
def execute(context):
jar, tmp_path = context.stage("matsim.java.pt2matsim")
java = context.stage("utils.java")
......@@ -17,7 +20,7 @@ def execute(context):
java(jar, "org.matsim.pt2matsim.run.CreateDefaultPTMapperConfig", [
"map_network_template.xml"
], cwd = context.cache_path, vm_arguments = ["-Djava.io.tmpdir=%s" % tmp_path])
], cwd=context.cache_path, vm_arguments=["-Djava.io.tmpdir=%s" % tmp_path])
content = open("%s/map_network_template.xml" % context.cache_path).read()
......@@ -56,16 +59,16 @@ def execute(context):
java(jar, "org.matsim.pt2matsim.run.PublicTransitMapper", [
"map_network.xml"
], cwd = context.cache_path, vm_arguments = ["-Djava.io.tmpdir=%s" % tmp_path])
], cwd=context.cache_path, vm_arguments=["-Djava.io.tmpdir=%s" % tmp_path])
assert(os.path.exists("%s/mapped_network.xml.gz" % context.cache_path))
assert(os.path.exists("%s/mapped_schedule.xml.gz" % context.cache_path))
assert(os.path.exists("%s/road_network.xml.gz" % context.cache_path))
assert(os.path.exists(context.stage("matsim.network.convert_hafas")["vehicles"]))
assert (os.path.exists("%s/mapped_network.xml.gz" % context.cache_path))
assert (os.path.exists("%s/mapped_schedule.xml.gz" % context.cache_path))
assert (os.path.exists("%s/road_network.xml.gz" % context.cache_path))
assert (os.path.exists(context.stage("matsim.network.convert_hafas")["vehicles"]))
return {
"network" : "%s/mapped_network.xml.gz" % context.cache_path,
"schedule" : "%s/mapped_schedule.xml.gz" % context.cache_path,
"road_network" : "%s/road_network.xml.gz" % context.cache_path,
"vehicles" : context.stage("matsim.network.convert_hafas")["vehicles"]
"network": "%s/mapped_network.xml.gz" % context.cache_path,
"schedule": "%s/mapped_schedule.xml.gz" % context.cache_path,
"road_network": "%s/road_network.xml.gz" % context.cache_path,
"vehicles": context.stage("matsim.network.convert_hafas")["vehicles"]
}
import os.path
import os
def configure(context):
context.stage("matsim.java.pt2matsim")
context.stage("matsim.network.mapped")
context.stage("utils.java")
def execute(context):
java = context.stage("utils.java")
jar, tmp_path = context.stage("matsim.java.pt2matsim")
......@@ -15,7 +17,7 @@ def execute(context):
java(jar, "org.matsim.pt2matsim.run.CheckMappedSchedulePlausibility", [
"-Djava.io.tmpdir=%s/java_tmp" % tmp_path,
paths["schedule"], paths["network"], "epsg:2056", context.cache_path
], cwd = context.cache_path)
], cwd=context.cache_path)
assert(os.path.exists("%s/allPlausibilityWarnings.csv" % context.cache_path))
assert (os.path.exists("%s/allPlausibilityWarnings.csv" % context.cache_path))
return context.cache_path
import numpy as np
import pandas as pd
......@@ -5,13 +6,14 @@ def configure(context):
context.stage("population.sociodemographics")
context.stage("population.trips")
def execute(context):
df_trips = pd.DataFrame(context.stage("population.trips"), copy = True)
df_trips = pd.DataFrame(context.stage("population.trips"), copy=True)
df_trips.loc[:, "previous_trip_id"] = df_trips.loc[:, "trip_id"] - 1
df_activities = pd.merge(
df_trips, df_trips, left_on = ["person_id", "previous_trip_id"], right_on = ["person_id", "trip_id"],
suffixes = ["_following_trip", "_previous_trip"], how = "left"
df_trips, df_trips, left_on=["person_id", "previous_trip_id"], right_on=["person_id", "trip_id"],
suffixes=["_following_trip", "_previous_trip"], how="left"
)
df_activities.loc[:, "start_time"] = df_activities.loc[:, "arrival_time_previous_trip"]
......@@ -25,7 +27,7 @@ def execute(context):
df_activities.loc[:, "purpose"] = df_activities.loc[:, "purpose"].fillna("home")
# We're still missing the last activity in the chain.
df_last = df_activities.sort_values(by = ["person_id", "activity_id"]).groupby("person_id").last().reset_index()
df_last = df_activities.sort_values(by=["person_id", "activity_id"]).groupby("person_id").last().reset_index()
df_last.loc[:, "purpose"] = df_last.loc[:, "following_purpose_following_trip"]
df_last.loc[:, "start_time"] = df_last.loc[:, "arrival_time_following_trip"]
df_last.loc[:, "end_time"] = np.nan
......@@ -43,13 +45,13 @@ def execute(context):
df_missing = pd.DataFrame.from_records([
(person_id, 1, "home", True, False) for person_id in missing_ids
], columns = ["person_id", "activity_id", "purpose", "is_last", "is_commute"])
], columns=["person_id", "activity_id", "purpose", "is_last", "is_commute"])
df_activities = pd.concat([df_activities, df_missing], sort = True)
assert(len(np.unique(df_persons["person_id"])) == len(np.unique(df_activities["person_id"])))
df_activities = pd.concat([df_activities, df_missing], sort=True)
assert (len(np.unique(df_persons["person_id"])) == len(np.unique(df_activities["person_id"])))
# Some cleanup
df_activities = df_activities.sort_values(by = ["person_id", "activity_id"])
df_activities = df_activities.sort_values(by=["person_id", "activity_id"])
df_activities.loc[:, "duration"] = df_activities.loc[:, "end_time"] - df_activities.loc[:, "start_time"]
df_activities = df_activities[[
......