Commit ed8355bc authored by tchervec's avatar tchervec

Merge branch '32-trip-legs-in-gte-freight-survey' into 'freight'

separate the transport.csv into meaningful freight trips

See merge request !49
parent 3ecb6f95
**Version v2 (master)**
- Add freight population based on GTE and GQGV survey
- Write canton id -1 if value is NaN
- Set up CI execution environment using the setup/ scripts
- Change from Oracle JDK 8 to Open JDK 12
- Make output directory configurable
- Set default scaling year values
- Put in 2015 totals from BfS
......
......@@ -137,6 +137,27 @@ All predictions are according to the BfS reference scenario.
- State: 1 Apr 2019
- Contract: [Open Data][9] [Open Data][10] [Open Data][11] [Open Data][12]
**NUTS**
- Content: `nuts_borders` contains the borders of the Nomenclature of Territorial Units for Statistics (NUTS) country
subdivisions.
- State: 2013 & 2016
- Contract: [Open Data][13]
**Postal codes**
- Content: `postal_codes` contains shapefiles for postcodes in Switzerland.
- State: 1 Apr 2019
- Contract: [Open Data][14]
**Freight (GTE)**
- Content: `freight/gte` contains data from the GTE survey, which covers freight trips made by freight vehicles registered in Switzerland.
- State: 2017
- Contract: BFS contract until ?
**Freight (GQGV)**
- Content: `freight/gqgv` contains data from the GQGV survey, which covers freight trips made by freight vehicles registered abroad.
- State: 2014
- Contract: BFS contract until ?
[1]: https://www.bfs.admin.ch/bfs/de/home/dienstleistungen/geostat/geodaten-bundesstatistik/administrative-grenzen/generalisierte-gemeindegrenzen.assetdetail.5247306.html
......@@ -153,3 +174,5 @@ All predictions are according to the BfS reference scenario.
[10]: https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/tabellen.assetdetail.3882982.html
[11]: https://www.bfs.admin.ch/bfs/de/home/statistiken/bevoelkerung/stand-entwicklung/bevoelkerung.assetdetail.5887433.html
[12]: https://www.bfs.admin.ch/bfs/de/home/statistiken/bevoelkerung/zukuenftige-entwicklung/kantonale-szenarien.assetdetail.255402.html
[13]: https://ec.europa.eu/eurostat/web/gisco/geodata/reference-data/administrative-units-statistical-units/nuts
[14]: https://www.cadastre.ch/en/services/service/plz.html
......@@ -4,10 +4,11 @@ output_path: /run/media/sebastian/shoerl_data/scenarios/switzerland/temp
threads: 4
hot_deck_matching_runners: 2
disable_progress_bar: false
input_downsampling: 0.01
java_memory: 10G
scaling_year: 2020
input_downsampling: 0.01
enable_scaling: true
scaling_year: 2020
use_freight: true
stages:
- data.statpop.projections.households
- data.statpop.scaled
......
......@@ -8,6 +8,7 @@ java_memory: 100G
input_downsampling: 0.01
enable_scaling: true
scaling_year: 2045
use_freight: true
stages:
- matsim.run
- matsim.mz.population
......
import pandas as pd

RENAMES = {"ORIGIN": "origin_nuts_id",
           "DESTINATION": "destination_nuts_id",
           "CH_MUNICIPALITY_ORIGIN": "origin_municipality",
           "CH_MUNICIPALITY_DESTINATION": "destination_municipality",
           "COUNTRY_OF_LOADING": "origin_country",
           "COUNTRY_OF_UNLOADING": "destination_country",
           "VEHICLE_TYPE": "vehicle_type",
           "TYPE_OF_GOOD": "good_type",
           "WEIGHTING_FACTOR": "weight",
           "DIVISOR": "divisor"
           }

FIELDS = ["origin_nuts_id", "destination_nuts_id",
          "origin_municipality", "destination_municipality",
          "origin_country", "destination_country",
          "vehicle_type", "good_type", "weight"
          ]

# VEHICLE_TYPES = {
#     1: "truck",
#     2: "road train",
#     3: "semi-trailer truck"
# }

VEHICLE_TYPES = {
    1: "truck",
    2: "truck",
    3: "truck"
}

def configure(context, require):
    require.stage("data.freight.gqgv.raw")
    require.stage("data.spatial.nuts")

def execute(context):
    df = context.stage("data.freight.gqgv.raw")

    # rename
    df = df.rename(RENAMES, axis=1)

    # apply divisor to weight
    df["weight"] /= df["divisor"]
    # rename vehicle types
    df["vehicle_type"] = df["vehicle_type"].replace(VEHICLE_TYPES)

    # Some NUTS ids do not exist in our NUTS data (possibly outdated ids).
    # For now, drop all trips whose NUTS id is not contained in the NUTS data.
    print("Dropping all trips where NUTS id not contained in NUTS data ...")
    number_trips = len(df)

    df_nuts = context.stage("data.spatial.nuts")
    nuts_ids = list(df_nuts["nuts_id"].unique())
    df = df[(df["origin_nuts_id"].isin(nuts_ids)) & (df["destination_nuts_id"].isin(nuts_ids))]

    number_trips_dropped = number_trips - len(df)
    print("Dropped %s of %s trips" % (number_trips_dropped, number_trips))

    # package
    df = df[FIELDS]

    return df
import pandas as pd
import numpy as np

def configure(context, require):
    require.stage("data.freight.gqgv.cleaned")

def execute(context):
    df = context.stage("data.freight.gqgv.cleaned")
    number_of_days = 365

    # create OD matrix per vehicle type
    demands = {}
    origin_pdf_matrices = {}
    od_pdf_matrices = {}

    for vehicle_type in list(df["vehicle_type"].unique()):
        df_vehicle_od = df[df["vehicle_type"] == vehicle_type]

        # create matrix
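        # pd.crosstab with the survey weights and aggfunc=sum yields a weighted
        # origin-destination matrix with one row per origin NUTS zone and one
        # column per destination NUTS zone; missing combinations become zero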
        matrix = pd.crosstab(
            df_vehicle_od["origin_nuts_id"], df_vehicle_od["destination_nuts_id"],
            df_vehicle_od["weight"], aggfunc=sum, dropna=False).fillna(0)
        matrix_values = matrix.values

        # compute average daily demand
        demands[vehicle_type] = int(np.round(np.sum(matrix_values) / number_of_days))

        # make sure each origin row sums up to one: origins without any observed
        # trips get a self-loop so the row normalization below does not divide by zero
        f_zero = np.sum(matrix_values, axis=1) == 0.0
        for index in np.where(f_zero)[0]:
            matrix_values[index, :] = 0.0
            matrix_values[index, index] = 1.0

        # compute pdfs
        origin_pdf_matrix = np.sum(matrix_values, axis=1) / np.sum(matrix_values)
        od_pdf_matrix = matrix_values / np.sum(matrix_values, axis=1)[:, np.newaxis]
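        # origin_pdf_matrix is the marginal probability of each origin zone, while
        # each row of od_pdf_matrix is the destination distribution conditional on
        # that origin (rows sum to one thanks to the self-loop fix above)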
        origin_pdf_matrices[vehicle_type] = pd.DataFrame(index=list(matrix.index), columns=["probability"], data=origin_pdf_matrix)
        od_pdf_matrices[vehicle_type] = pd.DataFrame(index=matrix.index,
                                                     columns=matrix.columns,
                                                     data=od_pdf_matrix)

    return demands, origin_pdf_matrices, od_pdf_matrices
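# A minimal, hypothetical sketch of how a downstream stage might draw freight trips
# from this output; the sampling logic is illustrative only and not part of the
# pipeline, and the stage name is inferred from the repository's naming pattern:
#
#   demands, origin_pdfs, od_pdfs = context.stage("data.freight.gqgv.od_matrix")
#   for vehicle_type, demand in demands.items():
#       origin_pdf = origin_pdfs[vehicle_type]
#       od_pdf = od_pdfs[vehicle_type]
#       origins = np.random.choice(origin_pdf.index, size=demand,
#                                  p=origin_pdf["probability"].values)
#       destinations = [np.random.choice(od_pdf.columns, p=od_pdf.loc[o].values)
#                       for o in origins]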
import pandas as pd

def configure(context, require):
    require.config("raw_data_path")

def execute(context):
    raw_data_path = context.config["raw_data_path"]
    df = pd.read_csv("%s/freight/gqgv/GQGV_2014/GQGV_2014_Mikrodaten.csv" % raw_data_path, sep=";")
    return df
import pandas as pd

RENAMES = {"ernr": "agent_id",
           "journeyId": "journey_id",
           "fromNuts": "origin_nuts_id",
           "toNuts": "destination_nuts_id",
           "fromPlz": "origin_postal_code",
           "toPlz": "destination_postal_code",
           "fromLand": "origin_country",
           "toLand": "destination_country",
           "vehicleKind": "vehicle_type",
           "grossingFactor": "weight"
           }

FIELDS = ["week", "weekday",
          "origin_nuts_id", "destination_nuts_id",
          "origin_postal_code", "destination_postal_code",
          "origin_country", "destination_country",
          "vehicle_type", "weight"
          ]

# VEHICLE_TYPES = {
#     35: "truck",
#     37: "semi-trailer truck",
#     38: "tractor unit"
# }

VEHICLE_TYPES = {
    35: "truck",
    37: "truck",
    38: "truck"
}

def configure(context, require):
    require.stage("data.freight.gte.raw")
    require.stage("data.spatial.nuts")

def execute(context):
    df_transport, df_journey, df_week = context.stage("data.freight.gte.raw")

    # select transport columns of interest
    df_transport = df_transport[["ernr", "journeyId", "weekday",
                                 "fromNuts", "toNuts",
                                 "fromLand", "toLand",
                                 "fromPlz", "toPlz",
                                 "transportKm", "transportKmCH"]]

    # get the start location of each agent
    df_start_locations = df_transport[~df_transport["ernr"].duplicated()][["fromPlz", "fromNuts", "fromLand"]]

    # get all unique stops along the trips
    df_transport = df_transport.drop_duplicates()

    # separate into different trip legs
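    # Each transport record holds its journey destination in the "to*" columns.
    # Shifting those columns down by one row makes every record start where the
    # previous one ended, which splits the recorded stop sequence into consecutive
    # trip legs; the first record of each agent is restored to its original start
    # location right below.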
df_transport.loc[1:, "fromPlz"] = df_transport.loc[:, "toPlz"].shift(periods=1)
df_transport.loc[1:, "fromNuts"] = df_transport.loc[:, "toNuts"].shift(periods=1)
df_transport.loc[1:, "fromLand"] = df_transport.loc[:, "toLand"].shift(periods=1)
# reset initial start locations of each agent
df_transport.loc[df_start_locations.index, ["fromPlz", "fromNuts", "fromLand"]] = df_start_locations
# # get distances of first journeys
# df_distances = df_transport.drop_duplicates(["ernr", "journeyId"], keep="first")[["transportKm", "transportKmCH"]]
#
# # calculate distance of each trip leg
# df_transport.loc[1:, ["transportKm", "transportKmCH"]] = df_transport[["transportKm", "transportKmCH"]].diff().loc[1:,:]
#
# # reset first leg distance in each journey for each agent
# df_transport.loc[df_distances.index, ["transportKm", "transportKmCH"]] = df_distances
# select week columns of interest
df_week = df_week[["ernr", "week", "vehicleKind", "grossingFactor"]]
# merge
df_merge = pd.merge(df_transport, df_week, on="ernr")
# remove all trips not at least partially in CH
df_merge = df_merge[df_merge["transportKmCH"] > 0]
# rename columns
df_merge = df_merge.rename(RENAMES, axis=1)
# rename vehicle types
df_merge["vehicle_type"] = df_merge["vehicle_type"].replace(VEHICLE_TYPES)
# There are some NUTS ids that do not exist in our NUTS data (maybe old ids)
# for now, drop all trips where NUTS not in NUTS data
print("Dropping all trips where NUTS id not contained in NUTS data ...")
number_trips = len(df_merge)
df_nuts = context.stage("data.spatial.nuts")
nuts_ids = list(df_nuts["nuts_id"].unique())
df_merge = df_merge[(df_merge["origin_nuts_id"].isin(nuts_ids)) & (df_merge["destination_nuts_id"].isin(nuts_ids))]
number_trips_dropped = number_trips - len(df_merge)
print("Dropped %s of %s trips" % (number_trips_dropped, number_trips))
# package
df_merge = df_merge[FIELDS]
return df_merge
import pandas as pd
import numpy as np

def configure(context, require):
    require.stage("data.freight.gte.cleaned")

def execute(context):
    df_trips = context.stage("data.freight.gte.cleaned")

    # keep only weekday trips (trips without any distance in CH were already removed in the cleaned stage)
    weekdays_to_consider = [1, 2, 3, 4, 5]
    df_od = df_trips[df_trips["weekday"].isin(weekdays_to_consider)]

    number_of_weeks = len(df_od["week"].unique())
    number_of_weekdays = len(weekdays_to_consider)

    # create OD matrix per vehicle type
    demands = {}
    origin_pdf_matrices = {}
    od_pdf_matrices = {}

    for vehicle_type in list(df_od["vehicle_type"].unique()):
        df_vehicle_od = df_od[df_od["vehicle_type"] == vehicle_type]

        # create matrix
        matrix = pd.crosstab(
            df_vehicle_od["origin_nuts_id"], df_vehicle_od["destination_nuts_id"],
            df_vehicle_od["weight"], aggfunc=sum, dropna=False).fillna(0)
        matrix_values = matrix.values

        # compute demand
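        # average weekday demand: the weighted trip total is spread over the number
        # of survey weeks and the number of weekdays considered above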
        demands[vehicle_type] = int(np.round(np.sum(matrix_values) / number_of_weeks / number_of_weekdays))

        # make sure each origin row sums up to one: origins without any observed
        # trips get a self-loop so the row normalization below does not divide by zero
        f_zero = np.sum(matrix_values, axis=1) == 0.0
        for index in np.where(f_zero)[0]:
            matrix_values[index, :] = 0.0
            matrix_values[index, index] = 1.0

        # compute pdfs: origin_pdf_matrix is the marginal probability of each origin
        # zone, each row of od_pdf_matrix is the destination distribution given the origin
        origin_pdf_matrix = np.sum(matrix_values, axis=1) / np.sum(matrix_values)
        od_pdf_matrix = matrix_values / np.sum(matrix_values, axis=1)[:, np.newaxis]

        origin_pdf_matrices[vehicle_type] = pd.DataFrame(index=list(matrix.index), columns=["probability"], data=origin_pdf_matrix)
        od_pdf_matrices[vehicle_type] = pd.DataFrame(index=matrix.index,
                                                     columns=matrix.columns,
                                                     data=od_pdf_matrix)

    return demands, origin_pdf_matrices, od_pdf_matrices
\ No newline at end of file
import pandas as pd

def configure(context, require):
    require.config("raw_data_path")

def execute(context):
    raw_data_path = context.config["raw_data_path"]
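    # the GTE microdata ships as three separate tables (transports, journeys and
    # weekly vehicle records); transport and week records share the "ernr"
    # identifier and are merged on it in the downstream cleaned stage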
    df_transport = pd.read_csv("%s/freight/gte/GTE_2017/Donnees/transport.csv" % raw_data_path, sep=";", low_memory=False)
    df_journey = pd.read_csv("%s/freight/gte/GTE_2017/Donnees/journeych.csv" % raw_data_path, sep=";", low_memory=False)
    df_week = pd.read_csv("%s/freight/gte/GTE_2017/Donnees/week.csv" % raw_data_path, sep=";", low_memory=False)
    return df_transport, df_journey, df_week
import pandas as pd
import numpy as np
import geopandas as gpd

def configure(context, require):
    require.config("raw_data_path")

def execute(context):
    raw_data_path = context.config["raw_data_path"]

    df_nuts = gpd.read_file(
        "%s/nuts_borders/ref-nuts-2013-01m.shp/NUTS_RG_01M_2013_4326.shp/NUTS_RG_01M_2013_4326.shp" % raw_data_path,
        encoding="utf-8"
    )
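    # the source geometries are in WGS84 (EPSG:4326, as the file name suggests);
    # assign the CRS explicitly and reproject to the Swiss LV95 frame (EPSG:2056)
    # used by the other spatial layers of the pipeline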
    df_nuts.crs = {'init': 'EPSG:4326'}
    df_nuts = df_nuts.to_crs({'init': 'EPSG:2056'})

    df_nuts["nuts_id"] = df_nuts["NUTS_ID"]
    df_nuts["nuts_name"] = df_nuts["NUTS_NAME"]
    df_nuts["nuts_level"] = df_nuts["LEVL_CODE"]

    df_nuts = df_nuts.sort_values(by=["nuts_id", "nuts_level"]).reset_index()
    df_nuts = df_nuts[["nuts_id", "nuts_name", "nuts_level", "geometry"]]

    return df_nuts
import pandas as pd
import numpy as np
import geopandas as gpd

def configure(context, require):
    require.config("raw_data_path")

def execute(context):
    raw_data_path = context.config["raw_data_path"]
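    # the PLZO shapefile is delivered in the LV95 frame (see the folder name);
    # to_crs makes the target CRS (EPSG:2056) explicit and consistent with the
    # other spatial layers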
    df = gpd.read_file(
        "%s/postal_codes/PLZO_SHP_LV95/PLZO_PLZ.shp" % raw_data_path,
        encoding="latin1"
    ).to_crs({'init': 'EPSG:2056'})

    df["postal_code"] = df["PLZ"]
    df = df.sort_values(by="postal_code").reset_index()
    df = df[["postal_code", "geometry"]]

    return df
\ No newline at end of file
import geopandas as gpd

def configure(context, require):
    require.config("raw_data_path")

def execute(context):
    raw_data_path = context.config["raw_data_path"]

    df = gpd.read_file(
        "%s/municipality_borders/gd-b-00.03-875-gg18/ggg_2018-LV95/shp/g1l18.shp" % raw_data_path,
        encoding="latin1"
    ).to_crs({'init': 'EPSG:2056'})

    return df["geometry"]
\ No newline at end of file
......@@ -56,7 +56,7 @@ def impute(df_points, df_zones, point_id_field, zone_id_field, fix_by_distance =
if "left_index" in df_points: del df_points["left_index"]
if "right_index" in df_points: del df_points["right_index"]
invalid_mask = np.isnan(df_points[zone_id_field])
invalid_mask = pd.isnull(df_points[zone_id_field])
if fix_by_distance and np.any(invalid_mask):
print(" Fixing %d points by centroid distance join..." % np.count_nonzero(invalid_mask))
......
......@@ -9,28 +9,42 @@ def configure(context, require):
require.stage("data.spatial.countries")
require.stage("data.spatial.municipalities")
require.stage("data.spatial.quarters")
require.stage("data.spatial.nuts")
require.stage("data.spatial.postal_codes")
def execute(context):
df_countries = pd.DataFrame(context.stage("data.spatial.countries"), copy = True)
df_municipalities = pd.DataFrame(context.stage("data.spatial.municipalities")[0], copy = True)
df_quarters = pd.DataFrame(context.stage("data.spatial.quarters"), copy = True)
df_nuts = pd.DataFrame(context.stage("data.spatial.nuts"), copy=True)
df_postal_code = pd.DataFrame(context.stage("data.spatial.postal_codes"), copy=True)
df_countries["zone_level_id"] = df_countries["country_id"]
df_municipalities["zone_level_id"] = df_municipalities["municipality_id"]
df_quarters["zone_level_id"] = df_quarters["quarter_id"]
df_nuts["zone_level_id"] = df_nuts["nuts_id"]
df_postal_code["zone_level_id"] = df_postal_code["postal_code"]
df_countries["zone_name"] = df_countries["country_name"]
df_municipalities["zone_name"] = df_municipalities["municipality_name"]
df_quarters["zone_name"] = df_quarters["quarter_name"]
df_nuts["zone_name"] = df_nuts["nuts_name"]
df_postal_code["zone_name"] = df_postal_code["postal_code"]
df_countries["zone_level"] = "country"
df_municipalities["zone_level"] = "municipality"
df_quarters["zone_level"] = "quarter"
df_nuts["zone_level"] = "nuts"
for level in df_nuts["nuts_level"].unique():
df_nuts.loc[df_nuts["nuts_level"] == level, "zone_level"] = ("nuts_" + str(level))
df_postal_code["zone_level"] = "postal_code"
df_zones = pd.concat([
df_countries[["zone_level_id", "zone_name", "zone_level"]],
df_municipalities[["zone_level_id", "zone_name", "zone_level"]],
df_quarters[["zone_level_id", "zone_name", "zone_level"]]
df_quarters[["zone_level_id", "zone_name", "zone_level"]],
df_nuts[["zone_level_id", "zone_name", "zone_level"]],
df_postal_code[["zone_level_id", "zone_name", "zone_level"]],
])
df_zones.loc[:, "zone_id"] = np.arange(len(df_zones))
......@@ -38,13 +52,15 @@ def execute(context):
    return df_zones[["zone_id", "zone_name", "zone_level", "zone_level_id"]]

def impute(df, df_zones, zone_id_prefix = "", quarter_id_field = "quarter_id", municipality_id_field = "municipality_id", country_id_field = "country_id"):
def impute(df, df_zones, zone_id_prefix = "",
           quarter_id_field = "quarter_id", municipality_id_field = "municipality_id", country_id_field = "country_id",
           nuts_id_field = "nuts_id", postal_code_field = "postal_code"):
    print("Imputing %d zones" % len(df))

    remaining_mask = np.ones((len(df),), dtype = np.bool)
    df.loc[:, "zone_id"] = np.nan

    if quarter_id_field in df:
        f = ~np.isnan(df[quarter_id_field]) & remaining_mask
        f = ~pd.isnull(df[quarter_id_field]) & remaining_mask

        df_join = pd.merge(
            df[f][[quarter_id_field]],
......@@ -53,12 +69,12 @@ def impute(df, df_zones, zone_id_prefix = "", quarter_id_field = "quarter_id", m
        df.loc[f, zone_id_prefix + "zone_id"] = df_join.loc[:, "zone_id"].values
        df.loc[f, zone_id_prefix + "zone_level"] = df_join.loc[:, "zone_level"].values

        remaining_mask &= np.isnan(df[zone_id_prefix + "zone_id"])
        remaining_mask &= pd.isnull(df[zone_id_prefix + "zone_id"])

        print(" Found %d quarters" % np.count_nonzero(df[zone_id_prefix + "zone_level"] == "quarter"))

    if municipality_id_field in df:
        f = ~np.isnan(df[municipality_id_field]) & remaining_mask
        f = ~pd.isnull(df[municipality_id_field]) & remaining_mask

        df_join = pd.merge(
            df[f][[municipality_id_field]],
......@@ -67,12 +83,12 @@ def impute(df, df_zones, zone_id_prefix = "", quarter_id_field = "quarter_id", m
        df.loc[f, zone_id_prefix + "zone_id"] = df_join.loc[:, "zone_id"].values
        df.loc[f, zone_id_prefix + "zone_level"] = df_join.loc[:, "zone_level"].values

        remaining_mask &= np.isnan(df[zone_id_prefix + "zone_id"])
        remaining_mask &= pd.isnull(df[zone_id_prefix + "zone_id"])

        print(" Found %d municipalities" % np.count_nonzero(df[zone_id_prefix + "zone_level"] == "municipality"))

    if country_id_field in df:
        f = ~np.isnan(df[country_id_field]) & remaining_mask
        f = ~pd.isnull(df[country_id_field]) & remaining_mask

        df_join = pd.merge(
            df[f][[country_id_field]],
......@@ -81,11 +97,39 @@ def impute(df, df_zones, zone_id_prefix = "", quarter_id_field = "quarter_id", m
        df.loc[f, zone_id_prefix + "zone_id"] = df_join.loc[:, "zone_id"].values
        df.loc[f, zone_id_prefix + "zone_level"] = df_join.loc[:, "zone_level"].values

        remaining_mask &= np.isnan(df[zone_id_prefix + "zone_id"])
        remaining_mask &= pd.isnull(df[zone_id_prefix + "zone_id"])

        print(" Found %d countries" % np.count_nonzero(df[zone_id_prefix + "zone_level"] == "country"))

    unknown_count = np.count_nonzero(np.isnan(df[zone_id_prefix + "zone_id"]))

    if nuts_id_field in df:
        f = ~pd.isnull(df[nuts_id_field]) & remaining_mask

        # zone levels are stored as "nuts_<level>" (see execute above), so match by prefix
        df_join = pd.merge(
            df[f][[nuts_id_field]],
            df_zones[df_zones["zone_level"].str.startswith("nuts")][["zone_level_id", "zone_id", "zone_level"]],
            how = "left", left_on = nuts_id_field, right_on = "zone_level_id")

        df.loc[f, zone_id_prefix + "zone_id"] = df_join.loc[:, "zone_id"].values
        df.loc[f, zone_id_prefix + "zone_level"] = df_join.loc[:, "zone_level"].values

        remaining_mask &= pd.isnull(df[zone_id_prefix + "zone_id"])

        print(" Found %d NUTS zones" % np.count_nonzero(df[zone_id_prefix + "zone_level"].astype(str).str.startswith("nuts")))

    if postal_code_field in df:
        f = ~pd.isnull(df[postal_code_field]) & remaining_mask

        df_join = pd.merge(
            df[f][[postal_code_field]],
            df_zones[df_zones["zone_level"] == "postal_code"][["zone_level_id", "zone_id", "zone_level"]],
            how = "left", left_on = postal_code_field, right_on = "zone_level_id")

        df.loc[f, zone_id_prefix + "zone_id"] = df_join.loc[:, "zone_id"].values
        df.loc[f, zone_id_prefix + "zone_level"] = df_join.loc[:, "zone_level"].values

        remaining_mask &= pd.isnull(df[zone_id_prefix + "zone_id"])

        print(" Found %d postal codes" % np.count_nonzero(df[zone_id_prefix + "zone_level"] == "postal_code"))

    unknown_count = np.count_nonzero(pd.isnull(df[zone_id_prefix + "zone_id"]))

    if unknown_count > 0:
        print(" No information for %d observations" % unknown_count)
......
......@@ -13,6 +13,8 @@ def configure(context, require):
require.stage("data.spatial.zones")
require.stage("data.spatial.municipalities")
require.stage("data.spatial.quarters")
require.stage("data.spatial.nuts")
require.stage("data.spatial.postal_codes")
def execute(context):
raw_data_path = context.config["raw_data_path"]
......@@ -39,27 +41,49 @@ def execute(context):
    df_zones = context.stage("data.spatial.zones")
    df_quarters = context.stage("data.spatial.quarters")
    df_municipalities = context.stage("data.spatial.municipalities")[0]
    df_nuts = context.stage("data.spatial.nuts")
    df_postal_codes = context.stage("data.spatial.postal_codes")

    df_spatial = pd.DataFrame(df[["enterprise_id", "x", "y"]])
    df_spatial = data.spatial.utils.to_gpd(df_spatial, "x", "y")
    df_spatial = df_spatial.drop(["x", "y"], axis=1)

    columns = ["enterprise_id"]

    # impute municipalities