municipalities.py 4.57 KB
Newer Older
1
import geopandas as gpd
2
3
import numpy as np
import pandas as pd
4
5
from sklearn.neighbors import KDTree

6

7
def configure(context):
8
    context.config("data_path")
9

tchervec's avatar
tchervec committed
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
REFERENCE_YEAR = 2018

SHAPEFILES = [
    (2018, "municipality_borders/gd-b-00.03-875-gg18/ggg_2018-LV95/shp/g1g18.shp", "GMDNR", "GMDNAME"),
    (2017, "municipality_borders/gd-b-00.03-875-gg17/ggg_2017/shp/LV95/g1g17.shp", "GMDNR", "GMDNAME"),
    (2016, "municipality_borders/gd-b-00.03-875-gg16/ggg_2016/shp/g1g16.shp", "GMDNR", "GMDNAME"),
    (2015, "municipality_borders/gd-b-00.03-876-gg15/GGG_15_V161025/shp/g1g15.shp", "GMDNR", "GMDNAME"),
    (2014, "municipality_borders/gd-b-00.03-877-gg14/ggg_2014/shp/g1g14.shp", "GMDNR", "GMDNAME"),
    (2013, "municipality_borders/gd-b-00.03-877-gg13_r1/ggg_2013/shp/g1g13.shp", "GMDNR", "GMDNAME"),
    (2012, "municipality_borders/gd-b-00.03-878-gg12/g1g12_shp_121130/G1G12.shp", "GMDE", "NAME"),
    (2011, "municipality_borders/gd-b-00.03-879-gg11/g1g11_shp_121130/G1G11.shp", "GMDE", "NAME"),
    (2010, "municipality_borders/gd-b-00.03-880-gg10/g1g10_shp_121130/G1G10.shp", "GMDE", "NAME"),
    (2009, "municipality_borders/gd-b-00.03-881-gg09g1/g1g09_shp_090626/G1G09.shp", "GMDE", "NAME")
]

tchervec's avatar
tchervec committed
26

27
def execute(context):
28
    data_path = context.config("data_path")
29
30
31
32
33

    df_all = []
    all_ids = set()

    # Load all the shape files, only add the municipalities that haven't been found before
tchervec's avatar
tchervec committed
34
    for year, shapefile, id_field, name_field in context.progress(SHAPEFILES, label="Reading municipality shape files"):
35
        df = gpd.read_file(
36
            "%s/%s" % (data_path, shapefile),
tchervec's avatar
tchervec committed
37
            encoding="latin1"
38
39
40
41
42
43
44
45
        ).to_crs({'init': 'EPSG:2056'})
        df.loc[:, "municipality_id"] = df[id_field]
        df.loc[:, "municipality_name"] = df[name_field]
        df.loc[:, "year"] = year

        df_ids = set(np.unique(df["municipality_id"]))
        df_new_ids = df_ids - all_ids

tchervec's avatar
tchervec committed
46
47
        df_all.append(
            df[df["municipality_id"].isin(df_new_ids)][["municipality_id", "municipality_name", "year", "geometry"]])
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
        all_ids |= df_new_ids

    df_all = pd.concat(df_all)

    df_reference = gpd.GeoDataFrame(df_all[df_all["year"] == REFERENCE_YEAR])
    df_reference.crs = df_all.crs

    df_deprecated = gpd.GeoDataFrame(df_all[df_all["year"] != REFERENCE_YEAR])
    df_deprecated["deprecated_municipality_id"] = df_deprecated["municipality_id"]
    del df_deprecated["municipality_id"]
    df_deprecated["geometry"] = df_deprecated.centroid
    df_deprecated.crs = df_all.crs

    # For each deprecated municipality find the covering reference municipality
    df_mapping = gpd.sjoin(
tchervec's avatar
tchervec committed
63
        df_reference, df_deprecated, op="contains"
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
    ).reset_index()[["municipality_id", "deprecated_municipality_id"]]

    # Now we are left over with some old municipalities whose centroids
    # are not covered by any new municipality (mainly at the border and
    # close to lakes). Therefore, we do another run and find the current
    # municipality with the closes distance (more expensive operation).

    missing_ids = set(
        np.unique(df_deprecated["deprecated_municipality_id"])
    ) - set(np.unique(df_mapping["deprecated_municipality_id"]))

    df_missing = df_deprecated[
        df_deprecated["deprecated_municipality_id"].isin(missing_ids)
    ][["deprecated_municipality_id", "geometry"]]

    coordinates = np.vstack([df_reference["geometry"].centroid.x, df_reference["geometry"].centroid.y]).T
    kd_tree = KDTree(coordinates)

    coordinates = np.vstack([df_missing["geometry"].x, df_missing["geometry"].y]).T
tchervec's avatar
tchervec committed
83
    indices = kd_tree.query(coordinates, return_distance=False).flatten()
84
85
86
87
88
89
90
91
92
93
94

    df_missing.loc[:, "municipality_id"] = df_reference.iloc[indices]["municipality_id"].values
    df_missing = df_missing[["municipality_id", "deprecated_municipality_id"]]

    df_existing = pd.DataFrame(df_reference[["municipality_id"]])
    df_existing["deprecated_municipality_id"] = df_existing["municipality_id"]

    df_mapping = pd.concat([df_existing, df_mapping, df_missing])
    df_reference = df_reference[["municipality_id", "municipality_name", "geometry"]]

    return df_reference, df_mapping
95

tchervec's avatar
tchervec committed
96
97
98

def update_municipality_ids(df, df_mapping, remove_unknown=False):
    assert ("municipality_id" in df.columns)
99
100
101
102
103
104

    df["deprecated_municipality_id"] = df["municipality_id"]
    del df["municipality_id"]

    df_join = pd.merge(
        df[["deprecated_municipality_id"]], df_mapping,
tchervec's avatar
tchervec committed
105
        on="deprecated_municipality_id", how="left"
106
107
108
109
110
111
112
113
    )

    df.loc[:, "municipality_id"] = df_join.loc[:, "municipality_id"].values

    if remove_unknown:
        return df[~np.isnan(df["municipality_id"])]
    else:
        return df