Commit f69fb0b5 authored by tchervec

Merge branch '54-port-pipeline-to-synpp' into develop

parents 7295912e 6c746bbc
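The recurring change across every file in this merge is the port to the synpp stage interface: configure(context, require) becomes configure(context) with dependencies declared via context.config(...) and context.stage(...), config values are read with a call (context.config("data_path")) instead of dict-style subscripting (context.config["raw_data_path"]), the "raw_data_path" key is renamed to "data_path", and tqdm loops become context.progress(...). A minimal sketch of the new convention, condensed from the hunks below (the stage and config names are simply the ones that appear there; this runs inside a synpp pipeline, not standalone):

def configure(context):
    # Dependencies are declared on the context itself; no separate `require`.
    context.config("data_path")
    context.stage("data.microcensus.trips")

def execute(context):
    # Config values are read with a call, not dict-style subscripting.
    data_path = context.config("data_path")
    # Upstream stage results are fetched by stage name.
    df_trips = context.stage("data.microcensus.trips")
    # Progress reporting goes through the context instead of tqdm.
    for _ in context.progress(range(len(df_trips)), label="Processing trips"):
        pass
    return df_trips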
@@ -15,9 +15,7 @@ def impute(df_mz):
# TODO: Maybe adjusted later!
classifier = sklearn.tree.DecisionTreeClassifier(min_samples_leaf = 30, max_depth = 5)
classifier.fit(
training_data, training_labels, sample_weight = training_weights
)
classifier.fit(training_data, training_labels)
# Predict the incomes
prediction_data = df_mz[no_income_selector][[
......
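As an aside on the hunk above: the income imputation is a plain scikit-learn decision tree, and the hyperparameters survive the port unchanged. A self-contained sketch of the same pattern, with synthetic stand-ins for the microcensus training arrays:

import numpy as np
import sklearn.tree

# Synthetic stand-ins for the training arrays used in the hunk above.
training_data = np.random.random((200, 3))       # person attributes
training_labels = np.random.randint(0, 5, 200)   # income class per person

# Same hyperparameters as above: shallow tree, generous leaf size.
classifier = sklearn.tree.DecisionTreeClassifier(min_samples_leaf=30, max_depth=5)
classifier.fit(training_data, training_labels)

# Predict income classes for persons with missing income.
prediction_data = np.random.random((10, 3))
predicted_labels = classifier.predict(prediction_data)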
import pandas as pd
import numpy as np
import data.utils
import pandas as pd
import data.constants as c
import data.microcensus.income
import data.utils
def configure(context, require):
require.config("raw_data_path")
require.stage("data.microcensus.households")
require.stage("data.microcensus.trips")
def configure(context):
context.config("data_path")
context.stage("data.microcensus.households")
context.stage("data.microcensus.trips")
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df_mz_persons = pd.read_csv(
"%s/microcensus/zielpersonen.csv" % raw_data_path,
"%s/microcensus/zielpersonen.csv" % data_path,
sep = ",", encoding = "latin1", parse_dates = ["USTag"]
)
......
import pandas as pd
import numpy as np
import data.utils
import data.spatial.utils
import data.constants as c
import pyproj
import geopandas as gpd
import pandas as pd
def configure(context, require):
require.config("raw_data_path")
require.stage("data.microcensus.trips")
def configure(context):
context.config("data_path")
context.stage("data.microcensus.trips")
def execute(context):
# Load data
raw_data_path = context.config["raw_data_path"]
df_stages = pd.read_csv("%s/microcensus/etappen.csv" % raw_data_path, encoding = "latin1")
data_path = context.config("data_path")
df_stages = pd.read_csv("%s/microcensus/etappen.csv" % data_path, encoding = "latin1")
# Filter stages in pt trips
df_trips = context.stage("data.microcensus.trips")
......
import pandas as pd
import numpy as np
import data.utils
import data.spatial.utils
import data.constants as c
import pandas as pd
import pyproj
import geopandas as gpd
def configure(context, require):
require.config("raw_data_path")
import data.constants as c
def configure(context):
context.config("data_path")
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df_mz_trips = pd.read_csv("%s/microcensus/wege.csv" % raw_data_path, encoding = "latin1")
df_mz_stages = pd.read_csv("%s/microcensus/etappen.csv" % raw_data_path, encoding = "latin1")
df_mz_trips = pd.read_csv("%s/microcensus/wege.csv" % data_path, encoding = "latin1")
df_mz_stages = pd.read_csv("%s/microcensus/etappen.csv" % data_path, encoding = "latin1")
df_mz_trips = df_mz_trips[[
"HHNR", "WEGNR", "f51100", "f51400", "wzweck1", "wzweck2", "wmittel",
@@ -142,7 +141,7 @@ def execute(context):
print(" Removed %d persons with trips not starting at home location" % (before_length - after_length,))
# Parking cost
df_mz_stages = pd.read_csv("%s/microcensus/etappen.csv" % raw_data_path, encoding = "latin1")
df_mz_stages = pd.read_csv("%s/microcensus/etappen.csv" % data_path, encoding = "latin1")
df_cost = pd.DataFrame(df_mz_stages[["HHNR", "WEGNR", "f51330"]], copy = True)
df_cost.columns = ["person_id", "trip_id", "parking_cost"]
......
import pandas as pd
import numpy as np
import data.constants as c
from tqdm import tqdm
import pandas as pd
def configure(context, require):
require.stage("data.spatial.zones")
require.stage("data.spatial.quarters")
require.stage("data.spatial.municipalities")
def configure(context):
context.stage("data.spatial.zones")
context.stage("data.spatial.quarters")
context.stage("data.spatial.municipalities")
COUNTRY_DISTANCE = 800 * 1e3
......
import pandas as pd
import numpy as np
import data.constants as c
from tqdm import tqdm
import data.spatial.zones
import data.spatial.countries
import data.spatial.municipalities
import data.spatial.quarters
def configure(context, require):
require.stage("data.structural_survey.structural_survey")
require.stage("data.spatial.zones")
import pandas as pd
def configure(context):
context.stage("data.structural_survey.structural_survey")
context.stage("data.spatial.zones")
# TODO: Right now we only produce OD matrices for WORK. We have the information
# from statpop on where the schools are, so we can use this in the future. Also,
......
import pandas as pd
import numpy as np
import data.constants as c
import geopandas as gpd
from tqdm import tqdm
from sklearn.neighbors import KDTree
def configure(context, require):
require.config("raw_data_path")
def configure(context):
context.config("data_path")
def execute(context):
# Load data
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df_cantons = pd.read_excel("%s/spatial_structure_2018.xlsx" % raw_data_path,
df_cantons = pd.read_excel("%s/spatial_structure_2018.xlsx" % data_path,
names=["municipality_id", "canton_id"],
usecols=[0, 2],
skiprows=6,
......
import pandas as pd
import numpy as np
import pandas as pd
def configure(context, require):
require.config("raw_data_path")
def configure(context):
context.config("data_path")
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df = pd.read_excel(
"%s/country_codes_2018.xlsx" % raw_data_path
"%s/country_codes_2018.xlsx" % data_path
)
df["country_id"] = df["Ländercode BFS\nCode des pays OFS\nCodice del paese UST"]
......
import pandas as pd
import numpy as np
import data.constants as c
import geopandas as gpd
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
def configure(context, require):
require.config("raw_data_path")
def configure(context):
context.config("data_path")
REFERENCE_YEAR = 2018
@@ -23,17 +23,18 @@ SHAPEFILES = [
(2009, "municipality_borders/gd-b-00.03-881-gg09g1/g1g09_shp_090626/G1G09.shp", "GMDE", "NAME")
]
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df_all = []
all_ids = set()
# Load all the shape files, only add the municipalities that haven't been found before
for year, shapefile, id_field, name_field in tqdm(SHAPEFILES, desc = "Reading municipality shape files"):
for year, shapefile, id_field, name_field in context.progress(SHAPEFILES, label="Reading municipality shape files"):
df = gpd.read_file(
"%s/%s" % (raw_data_path, shapefile),
encoding = "latin1"
"%s/%s" % (data_path, shapefile),
encoding="latin1"
).to_crs({'init': 'EPSG:2056'})
df.loc[:, "municipality_id"] = df[id_field]
df.loc[:, "municipality_name"] = df[name_field]
@@ -42,7 +43,8 @@ def execute(context):
df_ids = set(np.unique(df["municipality_id"]))
df_new_ids = df_ids - all_ids
df_all.append(df[df["municipality_id"].isin(df_new_ids)][["municipality_id", "municipality_name", "year", "geometry"]])
df_all.append(
df[df["municipality_id"].isin(df_new_ids)][["municipality_id", "municipality_name", "year", "geometry"]])
all_ids |= df_new_ids
df_all = pd.concat(df_all)
@@ -58,7 +60,7 @@ def execute(context):
# For each deprecated municipality find the covering reference municipality
df_mapping = gpd.sjoin(
df_reference, df_deprecated, op = "contains"
df_reference, df_deprecated, op="contains"
).reset_index()[["municipality_id", "deprecated_municipality_id"]]
# Now we are left over with some old municipalities whose centroids
@@ -78,7 +80,7 @@ def execute(context):
kd_tree = KDTree(coordinates)
coordinates = np.vstack([df_missing["geometry"].x, df_missing["geometry"].y]).T
indices = kd_tree.query(coordinates, return_distance = False).flatten()
indices = kd_tree.query(coordinates, return_distance=False).flatten()
df_missing.loc[:, "municipality_id"] = df_reference.iloc[indices]["municipality_id"].values
df_missing = df_missing[["municipality_id", "deprecated_municipality_id"]]
@@ -91,15 +93,16 @@ def execute(context):
return df_reference, df_mapping
def update_municipality_ids(df, df_mapping, remove_unknown = False):
assert("municipality_id" in df.columns)
def update_municipality_ids(df, df_mapping, remove_unknown=False):
assert ("municipality_id" in df.columns)
df["deprecated_municipality_id"] = df["municipality_id"]
del df["municipality_id"]
df_join = pd.merge(
df[["deprecated_municipality_id"]], df_mapping,
on = "deprecated_municipality_id", how = "left"
on="deprecated_municipality_id", how="left"
)
df.loc[:, "municipality_id"] = df_join.loc[:, "municipality_id"].values
......
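update_municipality_ids above works in place: it stashes the old IDs as deprecated_municipality_id, joins the mapping, and writes back a current municipality_id. A hedged usage sketch with toy IDs (the mapping frame mimics the df_mapping built earlier in this file):

import pandas as pd

# One observation per (possibly outdated) municipality ID.
df = pd.DataFrame({"municipality_id": [1, 2, 3], "value": [10, 20, 30]})

# Mapping from deprecated IDs to the covering reference municipality.
df_mapping = pd.DataFrame({
    "deprecated_municipality_id": [1, 2, 3],
    "municipality_id": [1, 2, 99],  # toy case: municipality 3 merged into 99
})

update_municipality_ids(df, df_mapping)
print(df[["municipality_id", "value"]])  # the row for old ID 3 now carries 99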
import pandas as pd
import numpy as np
import data.constants as c
import geopandas as gpd
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
def configure(context, require):
require.config("raw_data_path")
require.stage("data.spatial.municipalities")
def configure(context):
context.config("data_path")
context.stage("data.spatial.municipalities")
def execute(context):
# Load data
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df_types = pd.read_excel("%s/spatial_structure_2018.xlsx" % raw_data_path,
df_types = pd.read_excel("%s/spatial_structure_2018.xlsx" % data_path,
names=["municipality_id", "TYP"],
usecols=[0, 21],
skiprows=6,
......
import pandas as pd
import numpy as np
import geopandas as gpd
from tqdm import tqdm
import numpy as np
import pandas as pd
def configure(context):
context.config("data_path")
def configure(context, require):
require.config("raw_data_path")
SHAPEFILES = [
(2016, "nuts_borders/ref-nuts-2016-01m.shp/NUTS_RG_01M_2016_4326.shp/NUTS_RG_01M_2016_4326.shp", "NUTS_ID", "NUTS_NAME", "LEVL_CODE"),
(2013, "nuts_borders/ref-nuts-2013-01m.shp/NUTS_RG_01M_2013_4326.shp/NUTS_RG_01M_2013_4326.shp", "NUTS_ID", "NUTS_NAME", "LEVL_CODE"),
(2010, "nuts_borders/ref-nuts-2010-01m.shp/NUTS_RG_01M_2010_4326.shp/NUTS_RG_01M_2010_4326.shp", "NUTS_ID", "NUTS_NAME", "LEVL_CODE"),
(2006, "nuts_borders/ref-nuts-2006-01m.shp/NUTS_RG_01M_2006_4326.shp/NUTS_RG_01M_2006_4326.shp", "NUTS_ID", "NUTS_NAME", "LEVL_CODE"),
(2003, "nuts_borders/ref-nuts-2003-01m.shp/NUTS_RG_01M_2003_4326.shp/NUTS_RG_01M_2003_4326.shp", "NUTS_ID", "NUTS_NAME", "LEVL_CODE")
(2016, "nuts_borders/ref-nuts-2016-01m.shp/NUTS_RG_01M_2016_4326.shp/NUTS_RG_01M_2016_4326.shp", "NUTS_ID",
"NUTS_NAME", "LEVL_CODE"),
(2013, "nuts_borders/ref-nuts-2013-01m.shp/NUTS_RG_01M_2013_4326.shp/NUTS_RG_01M_2013_4326.shp", "NUTS_ID",
"NUTS_NAME", "LEVL_CODE"),
(2010, "nuts_borders/ref-nuts-2010-01m.shp/NUTS_RG_01M_2010_4326.shp/NUTS_RG_01M_2010_4326.shp", "NUTS_ID",
"NUTS_NAME", "LEVL_CODE"),
(2006, "nuts_borders/ref-nuts-2006-01m.shp/NUTS_RG_01M_2006_4326.shp/NUTS_RG_01M_2006_4326.shp", "NUTS_ID",
"NUTS_NAME", "LEVL_CODE"),
(2003, "nuts_borders/ref-nuts-2003-01m.shp/NUTS_RG_01M_2003_4326.shp/NUTS_RG_01M_2003_4326.shp", "NUTS_ID",
"NUTS_NAME", "LEVL_CODE")
]
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df_all = []
all_ids = set()
# Load all the shape files, only add the NUTS zones that haven't been found before
for year, shapefile, id_field, name_field, level_field in tqdm(SHAPEFILES, desc="Reading NUTS shape files"):
for year, shapefile, id_field, name_field, level_field in context.progress(SHAPEFILES,
label="Reading NUTS shape files"):
df = gpd.read_file(
"%s/%s" % (raw_data_path, shapefile),
"%s/%s" % (data_path, shapefile),
encoding="utf-8"
)#.to_crs({'init': 'EPSG:2056'})
) # .to_crs({'init': 'EPSG:2056'})
df.crs = {'init': 'EPSG:4326'}
df = df.to_crs({'init': 'EPSG:2056'})
......
import pandas as pd
import numpy as np
import geopandas as gpd
from tqdm import tqdm
import numpy as np
import pandas as pd
def configure(context):
context.config("data_path")
context.config("threads")
def configure(context, require):
require.config("raw_data_path")
require.config("threads")
def execute(context):
input_path = "%s/ov_guteklasse/LV95/Oev_Gueteklassen_ARE.shp" % context.config["raw_data_path"]
input_path = "%s/ov_guteklasse/LV95/Oev_Gueteklassen_ARE.shp" % context.config("data_path")
df = gpd.read_file(input_path)
df.crs = {"init" : "EPSG:2056"}
df = df[["KLASSE", "geometry"]].rename({"KLASSE" : "ovgk"}, axis = 1)
df.crs = {"init": "EPSG:2056"}
df = df[["KLASSE", "geometry"]].rename({"KLASSE": "ovgk"}, axis=1)
return df
def impute(df_ovgk, df, on):
def impute(context, df_ovgk, df, on):
indices = np.array_split(np.arange(len(df)), 100)
df_join = []
for chunk in tqdm(indices, desc = "Imputing ÖV Güteklasse"):
df_join.append(gpd.sjoin(df.iloc[chunk], df_ovgk, op = "within")[on + ["ovgk"]])
for chunk in context.progress(indices, label="Imputing ÖV Güteklasse"):
df_join.append(gpd.sjoin(df.iloc[chunk], df_ovgk, op="within")[on + ["ovgk"]])
df_join = pd.concat(df_join)
df_join = pd.merge(df, df_join, on = on, how = "left")
df_join = pd.merge(df, df_join, on=on, how="left")
df_join.loc[df_join["ovgk"].isna(), "ovgk"] = "None"
df_join["ovgk"] = df_join["ovgk"].astype("category")
......
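The pattern in the hunk above deserves a note: the spatial join runs in chunks so the per-join memory footprint stays bounded, and points matching no zone get an explicit "None" category afterwards. A standalone sketch of the same chunked sjoin, with toy frames in place of the pipeline data:

import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry as geo

# Toy inputs: three points and one unit-square zone carrying an "ovgk" class.
df = gpd.GeoDataFrame({
    "point_id": [1, 2, 3],
    "geometry": [geo.Point(0.5, 0.5), geo.Point(0.2, 0.8), geo.Point(5.0, 5.0)],
})
df_ovgk = gpd.GeoDataFrame({
    "ovgk": ["A"],
    "geometry": [geo.box(0, 0, 1, 1)],
})

# Join in chunks, as the hunk does, then merge the result back onto the points.
df_join = []
for chunk in np.array_split(np.arange(len(df)), 2):
    df_join.append(gpd.sjoin(df.iloc[chunk], df_ovgk, op="within")[["point_id", "ovgk"]])
df_join = pd.concat(df_join)
df_join = pd.merge(df, df_join, on="point_id", how="left")

# Points outside every zone get an explicit "None" category.
df_join.loc[df_join["ovgk"].isna(), "ovgk"] = "None"
df_join["ovgk"] = df_join["ovgk"].astype("category")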
import pandas as pd
import numpy as np
import geopandas as gpd
def configure(context, require):
require.config("raw_data_path")
def configure(context):
context.config("data_path")
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df = gpd.read_file(
"%s/postal_codes/PLZO_SHP_LV95/PLZO_PLZ.shp" % raw_data_path,
"%s/postal_codes/PLZO_SHP_LV95/PLZO_PLZ.shp" % data_path,
encoding = "latin1"
).to_crs({'init': 'EPSG:2056'})
......
import pandas as pd
import numpy as np
import geopandas as gpd
from tqdm import tqdm
from sklearn.neighbors import KDTree
import numpy as np
import pandas as pd
def configure(context, require):
require.config("raw_data_path")
def configure(context):
context.config("data_path")
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df = gpd.read_file(
"%s/statistical_quarter_borders/shp/quart17.shp" % raw_data_path,
"%s/statistical_quarter_borders/shp/quart17.shp" % data_path,
encoding = "latin1"
).to_crs({'init': 'EPSG:2056'})
......
import geopandas as gpd
def configure(context, require):
require.config("raw_data_path")
def configure(context):
context.config("data_path")
def execute(context):
raw_data_path = context.config["raw_data_path"]
data_path = context.config("data_path")
df = gpd.read_file(
"%s/municipality_borders/gd-b-00.03-875-gg18/ggg_2018-LV95/shp/g1l18.shp" % raw_data_path,
"%s/municipality_borders/gd-b-00.03-875-gg18/ggg_2018-LV95/shp/g1l18.shp" % data_path,
encoding = "latin1"
).to_crs({'init': 'EPSG:2056'})
......
import shapely.geometry as geo
import numpy as np
from tqdm import tqdm
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.geometry as geo
from sklearn.neighbors import KDTree
import multiprocessing as mp
def sample_coordinates(row, count):
samples = []
bounds = row["geometry"].bounds
while len(samples) < count:
x = bounds[0] + np.random.random(size = (1000,)) * (bounds[2] - bounds[0])
y = bounds[1] + np.random.random(size = (1000,)) * (bounds[3] - bounds[1])
x = bounds[0] + np.random.random(size=(1000,)) * (bounds[2] - bounds[0])
y = bounds[1] + np.random.random(size=(1000,)) * (bounds[3] - bounds[1])
points = map(geo.Point, zip(x, y))
points = [point for point in points if row["geometry"].contains(point)]
samples += points
return np.array(list(map(lambda p: (p.x, p.y), samples[:count])))
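sample_coordinates above is straightforward rejection sampling: draw batches of 1000 uniform points in the geometry's bounding box, keep those that fall inside the geometry, and repeat until count samples are collected. A quick usage sketch with a toy zone:

import shapely.geometry as geo

# A toy "row", mimicking a GeoDataFrame row with a polygon geometry.
row = {"geometry": geo.Polygon([(0, 0), (1, 0), (0, 1)])}

coordinates = sample_coordinates(row, 5)
print(coordinates.shape)  # (5, 2) array of x/y pairs inside the triangle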
def to_gpd(df, x = "x", y = "y", crs = {"init" : "EPSG:2056"}):
def to_gpd(context, df, x="x", y="y", crs={"init": "EPSG:2056"}):
df["geometry"] = [
geo.Point(*coord) for coord in tqdm(
zip(df[x], df[y]), total = len(df),
desc = "Converting coordinates"
geo.Point(*coord) for coord in context.progress(
zip(df[x], df[y]), total=len(df),
label="Converting coordinates"
)]
df = gpd.GeoDataFrame(df)
df.crs = crs
if not crs == {"init" : "EPSG:2056"}:
df = df.to_crs({"init" : "EPSG:2056"})
if not crs == {"init": "EPSG:2056"}:
df = df.to_crs({"init": "EPSG:2056"})
return df
def impute(df_points, df_zones, point_id_field, zone_id_field, fix_by_distance = True, chunk_size = 10000):
assert(type(df_points) == gpd.GeoDataFrame)
assert(type(df_zones) == gpd.GeoDataFrame)
assert(point_id_field in df_points.columns)
assert(zone_id_field in df_zones.columns)
assert(not zone_id_field in df_points.columns)
def impute(context, df_points, df_zones, point_id_field, zone_id_field, fix_by_distance=True, chunk_size=10000):
assert (type(df_points) == gpd.GeoDataFrame)
assert (type(df_zones) == gpd.GeoDataFrame)
assert (point_id_field in df_points.columns)
assert (zone_id_field in df_zones.columns)
assert (not zone_id_field in df_points.columns)
df_original = df_points
df_points = df_points[[point_id_field, "geometry"]]
@@ -49,8 +50,8 @@ def impute(df_points, df_zones, point_id_field, zone_id_field, fix_by_distance =
result = []
chunk_count = max(1, int(len(df_points) / chunk_size))
for chunk in tqdm(np.array_split(df_points, chunk_count), total = chunk_count):
result.append(gpd.sjoin(df_zones, chunk, op = "contains", how = "right"))
for chunk in context.progress(np.array_split(df_points, chunk_count), total=chunk_count):
result.append(gpd.sjoin(df_zones, chunk, op="contains", how="right"))
df_points = pd.concat(result).reset_index()
if "left_index" in df_points: del df_points["left_index"]
@@ -65,8 +66,8 @@ def impute(df_points, df_zones, point_id_field, zone_id_field, fix_by_distance =
df_missing = df_points[invalid_mask]
coordinates = np.vstack([df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y]).T
indices = kd_tree.query(coordinates, return_distance = False).flatten()
indices = kd_tree.query(coordinates, return_distance=False).flatten()
df_points.loc[invalid_mask, zone_id_field] = df_zones.iloc[indices][zone_id_field].values
return pd.merge(df_original, df_points[[point_id_field, zone_id_field]], on = point_id_field, how = "left")
return pd.merge(df_original, df_points[[point_id_field, zone_id_field]], on=point_id_field, how="left")
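One detail of impute above worth isolating: points that match no zone in the spatial join are assigned the zone with the nearest centroid via a KDTree, mirroring the municipality fallback earlier in this diff. A standalone sketch of just that lookup, with toy coordinates:

import numpy as np
from sklearn.neighbors import KDTree

# Toy zone centroids, and points that fell outside every zone polygon.
zone_coordinates = np.array([[0.0, 0.0], [10.0, 10.0]])
point_coordinates = np.array([[1.0, -2.0], [9.0, 9.5], [4.0, 7.0]])

# Build the tree on the zones, then take the single nearest zone per point.
kd_tree = KDTree(zone_coordinates)
indices = kd_tree.query(point_coordinates, return_distance=False).flatten()
print(indices)  # [0 1 1]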
import pandas as pd
import numpy as np
import data.constants as c
import geopandas as gpd
from tqdm import tqdm
from sklearn.neighbors import KDTree
import shapely.geometry as geo
def configure(context, require):
require.stage("data.spatial.zones")
require.stage("data.spatial.municipalities")
require.stage("data.spatial.quarters")
def configure(context):
context.stage("data.spatial.zones")
context.stage("data.spatial.municipalities")
context.stage("data.spatial.quarters")
def execute(context):
df_zones = context.stage("data.spatial.zones")
......
import pandas as pd
import numpy as np
import data.constants as c
import geopandas as gpd
from tqdm import tqdm
from sklearn.neighbors import KDTree
def configure(context, require):
require.stage("data.spatial.countries")