Commit 6e2bc89d authored by tchervec

Merge branch '65-update-python-package-versions' into 'develop'

Resolve "Update python package versions"

See merge request ivt-vpl/populations/ch-zh-synpop!99
parents 2a3d5016 c970730c
 **3.0.0**
+- Update python requirements
 - Updated statistical matching to most recent version
 - Duplicate agents after IPU using Truncate-Replicate-Sample method
 - Use WMAPE and WMAE as IPU convergence criteria
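The two techniques named in the entries above can be sketched as follows. This is a rough, non-authoritative illustration; the function and variable names are invented here and are not the repository's API. WMAPE compares fitted marginals against control totals, and Truncate-Replicate-Sample (TRS) integerises the fractional weights that IPU produces:

```python
import numpy as np

def wmape(target, fitted):
    # Weighted mean absolute percentage error between control totals and
    # fitted marginals; IPU can stop once this drops below a tolerance.
    # (WMAE would weight the absolute errors analogously.)
    target, fitted = np.asarray(target, float), np.asarray(fitted, float)
    return np.sum(np.abs(fitted - target)) / np.sum(target)

def truncate_replicate_sample(weights, rng):
    # TRS integerisation: replicate each agent floor(w) times, then draw
    # the remaining agents with probability proportional to the
    # fractional remainders of the weights.
    weights = np.asarray(weights, float)
    integer = np.floor(weights).astype(int)
    remainder = weights - integer
    deficit = int(round(weights.sum())) - integer.sum()
    chosen = rng.choice(len(weights), size=deficit, replace=False,
                        p=remainder / remainder.sum())
    counts = integer.copy()
    counts[chosen] += 1
    return counts  # replication count per agent

# e.g. weights [0.4, 1.8, 2.8] sum to 5, so two extra replications are
# sampled on top of the truncated counts [0, 1, 2]:
print(truncate_replicate_sample([0.4, 1.8, 2.8], np.random.default_rng(0)))
```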
...
@@ -3,11 +3,11 @@ import pyproj
 # TODO: Pandas is quite good at working with categorical data. Refactor everything to make use of that.
 # It will not only be more readable but will also bring a speedup!
-CH1903 = pyproj.Proj("epsg:21781")
+CH1903 = "epsg:21781"
 LV05 = CH1903
-CH1903_PLUS = pyproj.Proj("epsg:2056")
+CH1903_PLUS = "epsg:2056"
 LV95 = CH1903_PLUS
-WGS84 = pyproj.Proj("epsg:4326")
+WGS84 = "epsg:4326"
 MAXIMUM_HOUSEHOLD_SIZE = 12
 MINIMUM_AGE_PER_HOUSEHOLD = 16
...
@@ -42,7 +42,8 @@ def execute(context):
     # Convert coordinates to LV95
     coords = df_mz_households[["W_X_CH1903", "W_Y_CH1903"]].values
-    x, y = pyproj.transform(c.CH1903, c.CH1903_PLUS, coords[:, 0], coords[:, 1])
+    transformer = pyproj.Transformer.from_crs(c.CH1903, c.CH1903_PLUS)
+    x, y = transformer.transform(coords[:, 0], coords[:, 1])
     df_mz_households.loc[:, "home_x"] = x
     df_mz_households.loc[:, "home_y"] = y
...
@@ -88,7 +88,8 @@ def execute(context):
     # Adjust coordinates
     for mz_attribute, df_attribute in [("Z", "destination"), ("S", "origin"), ("W", "home")]:
         coords = df_mz_trips[["%s_X_CH1903" % mz_attribute, "%s_Y_CH1903" % mz_attribute]].values
-        x, y = pyproj.transform(c.CH1903, c.CH1903_PLUS, coords[:,0], coords[:,1])
+        transformer = pyproj.Transformer.from_crs(c.CH1903, c.CH1903_PLUS)
+        x, y = transformer.transform(coords[:, 0], coords[:, 1])
         df_mz_trips.loc[:, "%s_x" % df_attribute] = x
         df_mz_trips.loc[:, "%s_y" % df_attribute] = y
...
@@ -2,7 +2,6 @@ import geopandas as gpd
 import numpy as np
 import pandas as pd
 from sklearn.neighbors import KDTree
-import data.spatial.utils

 def configure(context):
@@ -37,7 +36,7 @@ def execute(context):
         "%s/%s" % (data_path, shapefile),
         encoding="latin1"
     ).to_crs("epsg:2056")
     df.crs = "epsg:2056"

     df.loc[:, "municipality_id"] = df[id_field]
...
@@ -8,16 +8,17 @@ def configure(context):
     context.config("data_path")
     context.stage("data.spatial.municipalities")

 def execute(context):
     # Load data
     data_path = context.config("data_path")
     df_types = pd.read_excel("%s/spatial_structure_2018.xlsx" % data_path,
         names=["municipality_id", "TYP"],
         usecols=[0, 21],
         skiprows=6,
         nrows=2229,
     )

     df_municipalities = context.stage("data.spatial.municipalities")[0]

     # Rewrite classification
@@ -35,14 +36,14 @@ def execute(context):
     df_types = df_types[["municipality_id", "municipality_type"]]

     # Match by municipality_id
-    df_existing = pd.merge(df_municipalities, df_types, on = "municipality_id")
+    df_existing = pd.merge(df_municipalities, df_types, on="municipality_id")
     df_existing["imputed_municipality_type"] = False
     df_existing = df_existing[["municipality_id", "municipality_type", "imputed_municipality_type", "geometry"]]

     # Some ids are missing (because they are special zones)
     df_missing = gpd.GeoDataFrame(df_municipalities[
         ~df_municipalities["municipality_id"].isin(df_existing["municipality_id"])
     ])
     df_missing.crs = df_municipalities.crs
     df_missing = df_missing[["municipality_id", "geometry"]]
@@ -51,7 +52,7 @@ def execute(context):
     kd_tree = KDTree(coordinates)

     coordinates = np.vstack([df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y]).T
-    indices = kd_tree.query(coordinates, return_distance = False).flatten()
+    indices = kd_tree.query(coordinates, return_distance=False).flatten()

     df_missing.loc[:, "municipality_type"] = df_existing.iloc[indices]["municipality_type"].values
     df_missing.loc[:, "imputed_municipality_type"] = True
@@ -59,17 +60,18 @@ def execute(context):
     df_mapping = pd.concat((df_existing, df_missing))

-    assert(len(df_mapping) == len(df_municipalities))
-    assert(set(np.unique(df_mapping["municipality_id"])) == set(np.unique(df_municipalities["municipality_id"])))
+    assert (len(df_mapping) == len(df_municipalities))
+    assert (set(np.unique(df_mapping["municipality_id"])) == set(np.unique(df_municipalities["municipality_id"])))

     df_mapping = pd.DataFrame(df_mapping[["municipality_id", "municipality_type", "imputed_municipality_type"]])
     df_mapping["municipality_type"] = df_mapping["municipality_type"].astype("category")

     return df_mapping

-def impute(df, df_municipality_types, remove_unknown = False):
-    assert("municipality_id" in df.columns)
-    df = pd.merge(df, df_municipality_types, on = "municipality_id")
+def impute(df, df_municipality_types, remove_unknown=False):
+    assert ("municipality_id" in df.columns)
+    df = pd.merge(df, df_municipality_types, on="municipality_id")

     if remove_unknown:
         return df[~np.isnan(df["municipality_type"])]
...
@@ -4,6 +4,7 @@ import pandas as pd
 import shapely.geometry as geo
 from sklearn.neighbors import KDTree

 def sample_coordinates(row, count):
     samples = []
     bounds = row["geometry"].bounds
...
@@ -7,6 +7,7 @@ import data.constants as c

 def configure(context):
     context.stage("data.statpop.persons")

 def execute(context):
     df_statpop = context.stage("data.statpop.persons")
     density_coordinates = np.vstack([df_statpop["home_x"], df_statpop["home_y"]]).T
@@ -14,8 +15,9 @@ def execute(context):
     return kd_tree

-def impute(kd_tree, df, x = "x", y = "y", radius = c.POPULATION_DENSITY_RADIUS):
+def impute(kd_tree, df, x="x", y="y", radius=c.POPULATION_DENSITY_RADIUS):
     print("Imputing population density ...")
     coordinates = np.vstack([df[x], df[y]]).T
-    counts = kd_tree.query_radius(coordinates, radius, count_only = True)
+    counts = kd_tree.query_radius(coordinates, radius, count_only=True)
     df["population_density"] = counts # / (np.pi * c.POPULATION_DENSITY_RADIUS**2)
...
@@ -102,12 +102,9 @@ def execute(context):
         df_households = df_households.replace(CANTON_TO_ID_MULTILANGUAGE)

         # TODO: why do we only use five categories?
-        # Make zero-based with only 5 categories
-        df_households["household_size"] = np.minimum(5, df_households["household_size"]) - 1
-        df_households = df_households.groupby(["canton_id", "household_size"]).sum().reset_index().sort_values(
-            ["canton_id", "household_size"])
-        df_households = df_households.rename({"household_size": "household_size_class_projection"}, axis=1)
-        df_households = df_households.sort_values(["canton_id", "household_size_class_projection"])
+        # Limit to 5 categories
+        df_households["household_size"] = np.minimum(5, df_households["household_size"])
+        df_households = df_households.groupby(["canton_id", "household_size"]).sum().reset_index()
     else:
@@ -117,17 +114,28 @@ def execute(context):
             header=[0, 1], skiprows=2, nrows=27, index_col=0).reset_index().rename({
             "index": "canton_id",
             "Total": "total",
-            "1 Person": 1,
-            "2 Personen": 2,
-            "3 und mehr Personen": 3
+            "1 Person": "1",
+            "2 Personen": "2",
+            "3 und mehr Personen": "3",
+            2017: "2017",
+            2045: "2045"
         }, axis=1)

+        # Flatten multi-index columns
+        df_households.columns = ['_'.join(col).strip("_") for col in df_households.columns.values]

         # Convert to long format
         df_households = df_households.melt(
-            id_vars="canton_id", value_vars=[1, 2, 3],
-            value_name="weight", var_name=["household_size", "year"]
+            id_vars="canton_id", value_vars=["1_2017", "1_2045", "2_2017", "2_2045", "3_2017", "3_2045"],
+            value_name="weight", var_name=["household_size_year"]
         )

+        # split and rename columns
+        temp = df_households["household_size_year"].str.split("_", expand=True)
+        df_households["household_size"] = temp[0].astype(int)
+        df_households["year"] = temp[1].astype(int)
+        df_households = df_households[["canton_id", "household_size", "year", "weight"]]

         # Remove Switzerland total
         df_households = df_households[df_households["canton_id"] != "Schweiz"]
@@ -152,10 +160,6 @@ def execute(context):
     df_households = df_households[("weight", scaling_year)].reset_index()
     df_households.columns = ["canton_id", "household_size", "weight"]

-    # Make zero-based
-    df_households["household_size"] -= 1
-    df_households = df_households.rename({"household_size": "household_size_class_projection"}, axis=1)

     # Replace cantons
     df_households = df_households.replace(CANTON_TO_ID)
@@ -163,6 +167,13 @@ def execute(context):
     df_households["weight"] = np.round(df_households["weight"])
     df_households["weight"] = df_households["weight"].astype(int)

+    # make size class zero-based
+    df_households = df_households.rename({"household_size": "household_size_class"}, axis=1)
+    df_households["household_size_class"] = df_households["household_size_class"] - 1

+    # sort values
+    df_households = df_households.sort_values(["canton_id", "household_size_class"])

     print(df_households.head())

     return df_households, scaling_year
...
@@ -36,10 +36,12 @@ def execute(context):
     print("Number of households before scaling :", len(df_statpop["household_id"].unique()))
     print("Number of persons before scaling :", len(df_statpop["person_id"].unique()))

+    # rename household_size_class column
+    df_household_controls = df_household_controls.rename({"household_size_class": "household_size_class_projection"}, axis=1)

     # we need to add a new household class column with only as many categories as the controls
     number_household_classes = len(df_household_controls["household_size_class_projection"].unique())
-    df_statpop["household_size_class_projection"] = np.minimum(number_household_classes,
-        df_statpop["household_size"]) - 1
+    df_statpop["household_size_class_projection"] = np.minimum(number_household_classes, df_statpop["household_size"]) - 1

     # create IPU fitting problem by canton
     problems = []
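To make the capping above concrete: if the controls define five size classes, raw household sizes are clipped to 5 and shifted to zero-based class indices, so every household of size 5 or more lands in the top class.

```python
import numpy as np

sizes = np.array([1, 2, 3, 4, 5, 6, 12])  # raw household sizes
print(np.minimum(5, sizes) - 1)           # -> [0 1 2 3 4 4 4]
```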
...
@@ -23,29 +23,30 @@ channels:
 - defaults

 dependencies:
-- matplotlib=3.1.3
+- matplotlib=3.3.2
-- pandas=1.0.3
+- pandas=1.1.3
-- scipy=1.4.1
+- scipy=1.5.2
-- numpy=1.18.1
+- numpy=1.19.2
 - geopandas=0.6.1
-- numba=0.49.0
+- numba=0.51.2
 - palettable=3.3.0
-- scikit-learn=0.22.1
+- scikit-learn=0.23.2
-- shapely=1.7.0
+- shapely=1.6.4
-- tqdm=4.46.0
+- tqdm=4.50.2
 - pytables=3.6.1
 - xlrd=1.2.0
-- pip=20.0.2
+- pip=20.2.4
 - pip:
+  - pyproj==3.0.0
   - simpledbf==0.2.6
   - synpp==1.3.1
   - python-Levenshtein==0.12.0
   # For testing
-  - pytest==5.4.2
+  - pytest==6.1.1
   - xlwt==1.3.0
-  - pysal==1.14.4.post1
+  - pysal==2.3.0
 EOF
 )
...
@@ -4,20 +4,21 @@ import subprocess as sp

 def configure(context):
     context.stage("utils.java")

 def execute(context):
     java = context.stage("utils.java")

     sp.check_call([
         "git", "clone", "https://github.com/eqasim-org/eqasim-java.git"
-    ], cwd = context.cache_path)
+    ], cwd=context.cache_path)

     sp.check_call([
         "git", "checkout", "v1.0.5"
-    ], cwd = "%s/eqasim-java" % context.cache_path)
+    ], cwd="%s/eqasim-java" % context.cache_path)

     sp.check_call([
         "mvn", "-Pstandalone", "package"
-    ], cwd = "%s/eqasim-java" % context.cache_path)
+    ], cwd="%s/eqasim-java" % context.cache_path)

     jar = "%s/eqasim-java/switzerland/target/switzerland-1.0.5.jar" % context.cache_path
     return jar
...
+import os
 import subprocess as sp

 def configure(context):
     context.stage("utils.java")

 def execute(context):
     java = context.stage("utils.java")
@@ -11,20 +13,20 @@ def execute(context):
     sp.check_call([
         "git", "clone", "https://github.com/matsim-org/pt2matsim.git"
-    ], cwd = context.cache_path)
+    ], cwd=context.cache_path)

     sp.check_call([
         "git", "checkout", "v19.10"
-    ], cwd = "%s/pt2matsim" % context.cache_path)
+    ], cwd="%s/pt2matsim" % context.cache_path)

     sp.check_call([
         "mvn", "-Djava.io.tmpdir=%s/java_tmp" % context.cache_path, "package"
-    ], cwd = "%s/pt2matsim" % context.cache_path)
+    ], cwd="%s/pt2matsim" % context.cache_path)

     jar = "%s/pt2matsim/target/pt2matsim-19.10-shaded.jar" % context.cache_path
-    java(jar, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", ["test_config.xml"], cwd = context.cache_path)
+    java(jar, "org.matsim.pt2matsim.run.CreateDefaultOsmConfig", ["test_config.xml"], cwd=context.cache_path)

-    assert(os.path.exists("%s/test_config.xml" % context.cache_path))
-    assert(os.path.exists("%s/java_tmp/GeoTools" % context.cache_path))
+    assert (os.path.exists("%s/test_config.xml" % context.cache_path))
+    assert (os.path.exists("%s/java_tmp/GeoTools" % context.cache_path))

     return jar, "%s/java_tmp" % context.cache_path
...
+import numpy as np
 import pandas as pd
@@ -5,16 +6,17 @@ def configure(context):
     context.stage("data.microcensus.persons")
     context.stage("data.microcensus.trips")

 def execute(context):
-    df_trips = pd.DataFrame(context.stage("data.microcensus.trips"), copy = True)
+    df_trips = pd.DataFrame(context.stage("data.microcensus.trips"), copy=True)
     df_persons = context.stage("data.microcensus.persons")

-    df_trips = pd.merge(df_trips, df_persons[["person_id", "home_x", "home_y"]], how = "left")
+    df_trips = pd.merge(df_trips, df_persons[["person_id", "home_x", "home_y"]], how="left")
     df_trips.loc[:, "previous_trip_id"] = df_trips.loc[:, "trip_id"] - 1

     df_activities = pd.merge(
-        df_trips, df_trips, left_on = ["person_id", "previous_trip_id"], right_on = ["person_id", "trip_id"],
-        suffixes = ["_following_trip", "_previous_trip"], how = "left"
+        df_trips, df_trips, left_on=["person_id", "previous_trip_id"], right_on=["person_id", "trip_id"],
+        suffixes=["_following_trip", "_previous_trip"], how="left"
     )

     df_activities.loc[:, "start_time"] = df_activities.loc[:, "arrival_time_previous_trip"]
@@ -33,7 +35,7 @@ def execute(context):
     df_activities.loc[f, "location_y"] = df_activities.loc[f, "home_y_following_trip"]

     # We're still missing the last activity in the chain.
-    df_last = df_activities.sort_values(by = ["person_id", "activity_id"]).groupby("person_id").last().reset_index()
+    df_last = df_activities.sort_values(by=["person_id", "activity_id"]).groupby("person_id").last().reset_index()
     df_last.loc[:, "purpose"] = df_last.loc[:, "purpose_following_trip"]
     df_last.loc[:, "start_time"] = df_last.loc[:, "arrival_time_following_trip"]
     df_last.loc[:, "end_time"] = np.nan
@@ -45,7 +47,7 @@ def execute(context):
     df_activities = pd.concat([df_activities, df_last])

     # Some cleanup
-    df_activities = df_activities.sort_values(by = ["person_id", "activity_id"])
+    df_activities = df_activities.sort_values(by=["person_id", "activity_id"])
     df_activities.loc[:, "duration"] = df_activities.loc[:, "end_time"] - df_activities.loc[:, "start_time"]

     df_activities = df_activities[[
...
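The merge in the activities stage above pairs every trip with its predecessor, so that each activity inherits its start from the previous trip's arrival and its end from the following trip's departure. A toy illustration of the same self-merge (columns invented, one person with two trips):

```python
import pandas as pd

df_trips = pd.DataFrame({
    "person_id": [1, 1], "trip_id": [1, 2],
    "departure_time": [8 * 3600, 17 * 3600],
    "arrival_time": [9 * 3600, 18 * 3600],
})
df_trips["previous_trip_id"] = df_trips["trip_id"] - 1

# Self-merge: each row joins the trip whose trip_id equals its previous_trip_id.
df_activities = pd.merge(
    df_trips, df_trips, left_on=["person_id", "previous_trip_id"],
    right_on=["person_id", "trip_id"],
    suffixes=["_following_trip", "_previous_trip"], how="left"
)

# The activity between the two trips runs from 9:00 to 17:00 (the first row
# has no previous trip, hence NaN):
print(df_activities[["arrival_time_previous_trip", "departure_time_following_trip"]])
```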
-import os.path
+import os

 def configure(context):
     context.stage("matsim.java.pt2matsim")
     context.stage("utils.java")
     context.config("data_path")
+    context.config("hafas_date")

 def execute(context):
     jar, tmp_path = context.stage("matsim.java.pt2matsim")
@@ -16,12 +19,12 @@ def execute(context):
         "%s/transit_schedule.xml.gz" % context.cache_path,
         "%s/transit_vehicles.xml.gz" % context.cache_path,
+        context.config("hafas_date")