utils.py 2.77 KB
Newer Older
1
import geopandas as gpd
tchervec's avatar
tchervec committed
2
import numpy as np
Sebastian Hörl's avatar
Sebastian Hörl committed
3
import pandas as pd
tchervec's avatar
tchervec committed
4
import shapely.geometry as geo
Sebastian Hörl's avatar
Sebastian Hörl committed
5
from sklearn.neighbors import KDTree
tchervec's avatar
tchervec committed
6

7
8
9
10
11
12

def sample_coordinates(row, count):
    samples = []
    bounds = row["geometry"].bounds

    while len(samples) < count:
tchervec's avatar
tchervec committed
13
14
        x = bounds[0] + np.random.random(size=(1000,)) * (bounds[2] - bounds[0])
        y = bounds[1] + np.random.random(size=(1000,)) * (bounds[3] - bounds[1])
15
16
17
18
19
20
        points = map(geo.Point, zip(x, y))
        points = [point for point in points if row["geometry"].contains(point)]
        samples += points

    return np.array(list(map(lambda p: (p.x, p.y), samples[:count])))

tchervec's avatar
tchervec committed
21
22

def to_gpd(context, df, x="x", y="y", crs={"init": "EPSG:2056"}):
23
    df["geometry"] = [
tchervec's avatar
tchervec committed
24
25
        geo.Point(*coord) for coord in context.progress(
            zip(df[x], df[y]), total=len(df),
26
            label="Converting coordinates"
27
28
29
30
        )]
    df = gpd.GeoDataFrame(df)
    df.crs = crs

tchervec's avatar
tchervec committed
31
32
    if not crs == {"init": "EPSG:2056"}:
        df = df.to_crs({"init": "EPSG:2056"})
33
34

    return df
Sebastian Hörl's avatar
Sebastian Hörl committed
35
36


tchervec's avatar
tchervec committed
37
38
39
40
41
42
43
def impute(context, df_points, df_zones, point_id_field, zone_id_field, fix_by_distance=True, chunk_size=10000):
    assert (type(df_points) == gpd.GeoDataFrame)
    assert (type(df_zones) == gpd.GeoDataFrame)

    assert (point_id_field in df_points.columns)
    assert (zone_id_field in df_zones.columns)
    assert (not zone_id_field in df_points.columns)
Sebastian Hörl's avatar
Sebastian Hörl committed
44
45
46
47
48
49
50
51
52

    df_original = df_points
    df_points = df_points[[point_id_field, "geometry"]]
    df_zones = df_zones[[zone_id_field, "geometry"]]

    print("Imputing %d zones into %d points by spatial join..." % (len(df_zones), len(df_points)))

    result = []
    chunk_count = max(1, int(len(df_points) / chunk_size))
tchervec's avatar
tchervec committed
53
54
    for chunk in context.progress(np.array_split(df_points, chunk_count), total=chunk_count):
        result.append(gpd.sjoin(df_zones, chunk, op="contains", how="right"))
Sebastian Hörl's avatar
Sebastian Hörl committed
55
56
57
58
59
    df_points = pd.concat(result).reset_index()

    if "left_index" in df_points: del df_points["left_index"]
    if "right_index" in df_points: del df_points["right_index"]

60
    invalid_mask = pd.isnull(df_points[zone_id_field])
Sebastian Hörl's avatar
Sebastian Hörl committed
61
62
63
64
65
66
67
68

    if fix_by_distance and np.any(invalid_mask):
        print("  Fixing %d points by centroid distance join..." % np.count_nonzero(invalid_mask))
        coordinates = np.vstack([df_zones["geometry"].centroid.x, df_zones["geometry"].centroid.y]).T
        kd_tree = KDTree(coordinates)

        df_missing = df_points[invalid_mask]
        coordinates = np.vstack([df_missing["geometry"].centroid.x, df_missing["geometry"].centroid.y]).T
tchervec's avatar
tchervec committed
69
        indices = kd_tree.query(coordinates, return_distance=False).flatten()
Sebastian Hörl's avatar
Sebastian Hörl committed
70
71
72

        df_points.loc[invalid_mask, zone_id_field] = df_zones.iloc[indices][zone_id_field].values

tchervec's avatar
tchervec committed
73
    return pd.merge(df_original, df_points[[point_id_field, zone_id_field]], on=point_id_field, how="left")