Commit 5b987d05 authored by Aurore Sallard's avatar Aurore Sallard
Browse files

Adding analysis scripts

parent 980deb08
Pipeline #76081 failed with stage
This diff is collapsed.
This diff is collapsed.
import pandas as pd
import numpy as np
import geopandas as gpd
import analysis.myutils as myutils
import analysis.myplottools as myplottools
import matplotlib.pyplot as plt
import data.constants as c
import pyproj
import data.utils
import data.spatial.utils
from tqdm import tqdm
def configure(context, require):
require.config("output_path")
require.config("raw_data_path")
require.config("analysis_path")
require.stage("analysis.import_syn_trips")
def import_data_synthetic(context):
filepath = "%s/syn_trips.csv" % context.config["output_path"]
df_trips = pd.read_csv(filepath, encoding = "latin1")
filepath = "%s/persons.csv" % context.config["output_path"]
df_persons = pd.read_csv(filepath, encoding = "latin1", sep = ";")
df_syn = df_trips.merge(df_persons, left_on="person_id", right_on="person_id")
t_id = df_syn["person_id"].values.tolist()
df_persons_no_trip = df_persons[np.logical_not(df_persons["person_id"].isin(t_id))]
df_persons_no_trip = df_persons_no_trip.set_index(["person_id"])
print("Synthetic: ", len(df_syn), ", ", len(df_persons_no_trip))
return df_syn, df_persons_no_trip
def import_data_actual(context):
df_act_persons = pd.read_csv(
"%s/microcensus/zielpersonen.csv" % context.config["raw_data_path"],
sep = ",", encoding = "latin1", parse_dates = ["USTag"]
)
filepath = "%s/microcensus_trips.csv" % context.config["output_path"]
df_act_trips = pd.read_csv(filepath, encoding = "latin1")
# Merging with person information, correcting trips with erroneous purpose
df_act_persons["age"] = df_act_persons["alter"]
df_act_persons["sex"] = df_act_persons["gesl"] - 1 # Make zero-based
df_act_persons["person_id"] = df_act_persons["HHNR"]
df_act_persons["weight_person"] = df_act_persons["WP"]
df_act_persons["date"] = df_act_persons["USTag"]
# Driving license
df_act_persons["driving_license"] = df_act_persons["f20400a"] == 1
# Car availability
df_act_persons["car_availability"] = c.CAR_AVAILABILITY_NEVER
df_act_persons.loc[df_act_persons["f42100e"] == 1, "car_availability"] = c.CAR_AVAILABILITY_ALWAYS
df_act_persons.loc[df_act_persons["f42100e"] == 2, "car_availability"] = c.CAR_AVAILABILITY_SOMETIMES
df_act_persons.loc[df_act_persons["f42100e"] == 3, "car_availability"] = c.CAR_AVAILABILITY_NEVER
# Employment
df_act_persons["employed"] = df_act_persons["f40800_01"] != -99
# Infer age class
df_act_persons["age_class"] = np.digitize(df_act_persons["age"], c.AGE_CLASS_UPPER_BOUNDS)
df_act_persons.rename(columns = {"binary_car_availability":"car_availability"}, inplace = True)
df_act = df_act_trips.merge(df_act_persons[["person_id", "weight_person", "employed",
"age", "sex", "car_availability"]],
on=["person_id"], how='left')
df_act.loc[(df_act["purpose"]=='work') & (df_act["age"] < 16), "purpose"]="other"
# Only keep the persons that could have been used in activity chain matching
df_act = df_act[~df_act["weight_person"].isna()]
df_act = df_act.set_index(["person_id"])
df_act.sort_index(inplace=True)
t_id = df_act_trips["person_id"].values.tolist()
df_persons_no_trip = df_act_persons[np.logical_not(df_act_persons["person_id"].isin(t_id))]
df_persons_no_trip = df_persons_no_trip.set_index(["person_id"])
print(df_act.columns)
return df_act, df_persons_no_trip
def aux_data_frame(df_act, df_syn):
pers_ids = list(set(df_syn["person_id"].values.tolist()))
ids = []
chains = []
for pid in tqdm(pers_ids):
df = df_syn[df_syn["person_id"] == pid]
purposes = df["following_purpose"].values.tolist()
chain = "h-" + "-".join([purpose[0] for purpose in purposes])
#print(chain)
ids.append(pid)
chains.append(chain)
df_aux_syn = pd.DataFrame.from_dict({"person_id": ids, "weights": [1 for i in range(len(ids))], "chain": chains})
df_act["person_id"] = df_act.index
pers_ids = list(set(df_act["person_id"].values.tolist()))
ids = []
weights = []
chains = []
for pid in pers_ids:
df = df_act[df_act["person_id"] == pid]
weight = np.mean(df["weight_person"].values.tolist())
purposes = df["purpose"].values.tolist()
chain = "h-" + "-".join([purpose[0] for purpose in purposes])
ids.append(pid)
weights.append(weight)
chains.append(chain)
df_aux_act = pd.DataFrame.from_dict({"person_id": ids, "weight_person":weights, "chain": chains})
return df_aux_act, df_aux_syn
def activity_chains_comparison(context, all_CC, suffix = None):
# Get percentages, prepare for plotting
all_CC["synthetic Count"] = all_CC ["synthetic Count"] / all_CC["synthetic Count"].sum() *100
all_CC["actual Count"] = all_CC["actual Count"] / all_CC["actual Count"].sum() *100
all_CC = all_CC.sort_values(by=['actual Count'], ascending=False)
# First step done: plot activity chain counts
title_plot = "Synthetic and HTS activity chain comparison"
title_figure = "activitychains"
if suffix:
title_plot += " - " + suffix
title_figure += "_" + suffix
title_figure += ".png"
myplottools.plot_comparison_bar(context, imtitle = title_figure, plottitle = title_plot, ylabel = "Percentage", xlabel = "Activity chain", lab = all_CC["Chain"], actual = all_CC["actual Count"], synthetic = all_CC["synthetic Count"])
def activity_counts_comparison(context, all_CC, suffix = None):
all_CC_dic = all_CC.to_dict('records')
counts_dic = {}
for actchain in all_CC_dic:
chain = actchain["Chain"]
s = actchain["synthetic Count"]
a = actchain["actual Count"]
if np.isnan(s):
s = 0
if np.isnan(a):
a = 0
if chain == "-" or chain == "h":
x = 0
else:
act = chain.split("-")
x = len(act) - 2
x = min(x, 7)
if x not in counts_dic.keys():
counts_dic[x] = [s, a]
else:
counts_dic[x][0] += s
counts_dic[x][1] += a
counts = pd.DataFrame(columns = ["number", "synthetic Count", "actual Count"])
for k in range(8):
v = counts_dic[k]
if k == 7:
l = "7+"
else:
l = str(int(k))
counts.loc[k] = pd.Series({"number": l,
"synthetic Count": v[0],
"actual Count": v[1]
})
# Get percentages, prepare for plotting
counts["synthetic Count"] = counts["synthetic Count"] / counts["synthetic Count"].sum() *100
counts["actual Count"] = counts["actual Count"] / counts["actual Count"].sum() *100
#counts = counts.sort_values(by=['actual Count'], ascending=False)
# First step done: plot activity chain counts
title_plot = "Synthetic and HTS activity counts comparison"
title_figure = "activitycounts"
if suffix:
title_plot += " - " + suffix
title_figure += "_" + suffix
title_figure += ".png"
myplottools.plot_comparison_bar(context, imtitle = title_figure, plottitle = title_plot,
ylabel = "Percentage", xlabel = "Number of activities in the activity chain",
lab = counts["number"], actual = counts["actual Count"],
synthetic = counts["synthetic Count"])
def activity_counts_per_purpose(context, all_CC, suffix = None):
all_CC_dic = all_CC.to_dict('records')
purposes = ['h', 'w', 'e', 's', 'l', 'o']
counts_dic = {}
cpt = 0
for actchain in all_CC_dic:
chain = actchain["Chain"]
s = actchain["synthetic Count"]
a = actchain["actual Count"]
if np.isnan(s):
s = 0
if np.isnan(a):
a = 0
if chain == "-" or chain == "h":
pass
else:
act = chain.split("-")
act = act[1:-1]
for p in purposes:
cpt_purpose = act.count(p)
if cpt_purpose > 0 :
identifier = p + " - " + str(cpt_purpose)
if cpt_purpose > 1:
identifier += " times"
else:
identifier += " time"
if cpt_purpose >= 3 or (cpt_purpose == 2 and p not in ['h', 'w', 'e']):
identifier = "Other"
if identifier not in counts_dic.keys():
counts_dic[identifier] = [s, a]
else:
counts_dic[identifier][0] += s
counts_dic[identifier][1] += a
counts = pd.DataFrame(columns = ["number", "synthetic Count", "actual Count"])
for k, v in counts_dic.items():
counts.loc[k] = pd.Series({"number": k,
"synthetic Count": v[0],
"actual Count": v[1]
})
# Get percentages, prepare for plotting
counts["synthetic Count"] = counts["synthetic Count"] / counts["synthetic Count"].sum() *100
counts["actual Count"] = counts["actual Count"] / counts["actual Count"].sum() *100
counts = counts.sort_values(by=['actual Count'], ascending=False)
val = "Other"
idx = counts.index.drop(val).tolist() + [val]
counts = counts.reindex(idx)
# First step done: plot activity chain counts
title_plot = "Activity counts per purpose comparison"
title_figure = "activitycountspurpose"
if suffix:
title_plot += " - " + suffix
title_figure += "_" + suffix
title_figure += ".png"
myplottools.plot_comparison_bar(context, imtitle = title_figure, plottitle = title_plot,
ylabel = "Percentage", xlabel = "Activities with the same purpose in the activity chain",
lab = counts["number"], actual = counts["actual Count"],
synthetic = counts["synthetic Count"], t = 20)
def execute(context):
# Import data, merging
df_syn, df_syn_no_trip = import_data_synthetic(context)
df_act, df_act_no_trip = import_data_actual(context)
df_aux_act, df_aux_syn = aux_data_frame(df_act, df_syn)
syn_CC = df_aux_syn.groupby("chain").size().reset_index(name='count')
act_CC = df_aux_act.groupby("chain").size().reset_index(name='count')
act_CC.columns = ["Chain", "actual Count"]
syn_CC.columns = ["Chain", "synthetic Count"]
# 1. ACTIVITY CHAINS
# Creating the new dataframes with activity chain counts
#syn_CC = myutils.process_synthetic_activity_chain_counts(df_syn)
syn_CC.loc[len(syn_CC) + 1] = pd.Series({"Chain": "h", "synthetic Count": df_syn_no_trip.shape[0] })
#act_CC = myutils.process_actual_activity_chain_counts(df_act, df_aux)
act_CC.loc[len(act_CC) + 1] = pd.Series({"Chain": "h", "actual Count": np.sum(df_act_no_trip["weight_person"].values.tolist())})
# Merging together, comparing
all_CC = pd.merge(syn_CC, act_CC, on = "Chain", how = "left")
activity_chains_comparison(context, all_CC)
# Number of activities
activity_counts_comparison(context, all_CC)
# Number of activities per purposes
activity_counts_per_purpose(context, all_CC)
This diff is collapsed.
This diff is collapsed.
import pandas as pd
import numpy as np
import geopandas as gpd
import analysis.myutils as myutils
import analysis.myplottools as myplottools
import matplotlib.pyplot as plt
import data.constants as c
import pyproj
import data.utils
import data.spatial.utils
from tqdm import tqdm
def configure(context, require):
require.config("output_path")
require.config("raw_data_path")
require.config("analysis_path")
require.stage("analysis.import_syn_trips")
def import_data_synthetic(context):
filepath = "%s/syn_trips.csv" % context.config["output_path"]
df_trips = pd.read_csv(filepath, encoding = "latin1")
filepath = "%s/persons.csv" % context.config["output_path"]
df_persons = pd.read_csv(filepath, encoding = "latin1", sep = ";")
df_syn = df_trips.merge(df_persons, left_on="person_id", right_on="person_id")
t_id = df_syn["person_id"].values.tolist()
df_persons_no_trip = df_persons[np.logical_not(df_persons["person_id"].isin(t_id))]
df_persons_no_trip = df_persons_no_trip.set_index(["person_id"])
print("Synthetic: ", len(df_syn), ", ", len(df_persons_no_trip))
return df_syn, df_persons_no_trip
def import_data_actual(context):
df_act_persons = pd.read_csv(
"%s/microcensus/zielpersonen.csv" % context.config["raw_data_path"],
sep = ",", encoding = "latin1", parse_dates = ["USTag"]
)
filepath = "%s/microcensus_trips.csv" % context.config["output_path"]
df_act_trips = pd.read_csv(filepath, encoding = "latin1")
# Merging with person information, correcting trips with erroneous purpose
df_act_persons["age"] = df_act_persons["alter"]
df_act_persons["sex"] = df_act_persons["gesl"] - 1 # Make zero-based
df_act_persons["person_id"] = df_act_persons["HHNR"]
df_act_persons["weight_person"] = df_act_persons["WP"]
df_act_persons["date"] = df_act_persons["USTag"]
# Driving license
df_act_persons["driving_license"] = df_act_persons["f20400a"] == 1
# Car availability
df_act_persons["car_availability"] = c.CAR_AVAILABILITY_NEVER
df_act_persons.loc[df_act_persons["f42100e"] == 1, "car_availability"] = c.CAR_AVAILABILITY_ALWAYS
df_act_persons.loc[df_act_persons["f42100e"] == 2, "car_availability"] = c.CAR_AVAILABILITY_SOMETIMES
df_act_persons.loc[df_act_persons["f42100e"] == 3, "car_availability"] = c.CAR_AVAILABILITY_NEVER
# Employment (TODO: I know that LIMA uses a more fine-grained category here)
df_act_persons["employed"] = df_act_persons["f40800_01"] != -99
# Infer age class
df_act_persons["age_class"] = np.digitize(df_act_persons["age"], c.AGE_CLASS_UPPER_BOUNDS)
df_act_persons.rename(columns = {"binary_car_availability":"car_availability"}, inplace = True)
df_act = df_act_trips.merge(df_act_persons[["person_id", "weight_person", "employed",
"age", "sex", "car_availability"]],
on=["person_id"], how='left')
df_act.loc[(df_act["purpose"]=='work') & (df_act["age"] < 16), "purpose"]="other"
# Only keep the persons that could have been used in activity chain matching
df_act = df_act[~df_act["weight_person"].isna()]
df_act = df_act.set_index(["person_id"])
df_act.sort_index(inplace=True)
t_id = df_act_trips["person_id"].values.tolist()
df_persons_no_trip = df_act_persons[np.logical_not(df_act_persons["person_id"].isin(t_id))]
df_persons_no_trip = df_persons_no_trip.set_index(["person_id"])
print(df_act.columns)
return df_act, df_persons_no_trip
def compute_distances_synthetic(df_syn, threshold = 25):
df_syn["crowfly_distance"] = 0.001 * np.array(df_syn["crowfly_distance"])
# Only consider crowfly distances shorter than <threshold> km
df_syn_dist = df_syn[df_syn["crowfly_distance"] < threshold]
return df_syn_dist
def compute_distances_actual(df_act, threshold = 25):
# Compute the distances
df_act["crowfly_distance"] = 0.001 * np.sqrt(
(df_act["origin_x"] - df_act["destination_x"])**2 +
(df_act["origin_y"] - df_act["destination_y"])**2
)
df_act_dist = df_act[df_act["crowfly_distance"] < threshold]
return df_act_dist
def compare_dist_educ(context, df_syn, df_act, suffix = None):
pers_educ_syn = list(set(df_syn[df_syn["following_purpose"] == "education"]["person_id"].values))
pers_educ_act = list(set(df_act[df_act["purpose"] == "education"].index.values))
df_syn_educ = df_syn[np.isin(df_syn["person_id"], pers_educ_syn)]
df_act_educ = df_act[np.isin(df_act.index, pers_educ_act)]
df_syn_h_e = df_syn_educ[np.logical_or( np.logical_and( df_syn_educ["preceeding_purpose"] == "home", df_syn_educ["following_purpose"] == "education" ), np.logical_and(df_syn_educ["following_purpose"] == "home", df_syn_educ["preceeding_purpose"] == "education") )]
pers_he_syn = list(set(df_syn_h_e["person_id"].values))
df_act_h_e = df_act_educ[np.logical_or( np.logical_and( df_act_educ["origin_purpose"] == "home", df_act_educ["purpose"] == "education" ), np.logical_and(df_act_educ["purpose"] == "home", df_act_educ["origin_purpose"] == "education") )]
pers_he_act = list(set(df_syn_h_e.index.values))
dic_syn = {"person_id": pers_educ_syn, "dist_home_educ": [0 for i in range(len(pers_educ_syn))]}
dic_act = {"person_id": pers_educ_act, "weight_person": [0 for i in range(len(pers_educ_act))], "dist_home_educ": [0 for i in range(len(pers_educ_act))]}
for i in range(len(pers_educ_syn)):
pid = pers_educ_syn[i]
df_pers = df_syn_educ[df_syn_educ["person_id"] == pid]
home_coord = None
educ_coord = None
for index, row in df_pers.iterrows():
if row["preceeding_purpose"] == "home":
home_coord = [int(float(row["origin_x"])), int(float(row["origin_y"]))]
if row["following_purpose"] == "home":
home_coord = [int(float(row["destination_x"])), int(float(row["destination_y"]))]
if row["preceeding_purpose"] == "education":
educ_coord = [int(float(row["origin_x"])), int(float(row["origin_y"]))]
if row["following_purpose"] == "education":
educ_coord = [int(float(row["destination_x"])), int(float(row["destination_y"]))]
if home_coord is not None and educ_coord is not None:
break
dic_syn["dist_home_educ"][i] = 0.001 * np.sqrt(((home_coord[0] - educ_coord[0]) ** 2 + (home_coord[1] - educ_coord[1]) ** 2))
for i in range(len(pers_educ_act)):
pid = pers_educ_act[i]
df_pers = df_act_educ[df_act_educ.index == pid]
home_x = None
educ_y = None
for index, row in df_pers.iterrows():
if row["origin_purpose"] == "home":
home_x = row["origin_x"]
home_y = row["origin_y"]
elif row["purpose"] == "home":
home_x = row["destination_x"]
home_y = row["destination_y"]
if row["origin_purpose"] == "education":
educ_x = row["origin_x"]
educ_y = row["origin_y"]
elif row["purpose"] == "education":
educ_x = row["destination_x"]
educ_y = row["destination_y"]
if educ_y is not None and home_y is not None:
break
dic_act["dist_home_educ"][i] = 0.001 * np.sqrt(((home_x - educ_x) ** 2 + (home_y - educ_y) ** 2))
dic_act["weight_person"][i] = row["weight_person"]
dist_df_syn = pd.DataFrame.from_dict(dic_syn)
dist_df_act = pd.DataFrame.from_dict(dic_act)
syn = dist_df_syn["dist_home_educ"].values
act = dist_df_act["dist_home_educ"].values
act_w = dist_df_act["weight_person"].values
fig, ax = plt.subplots(1,1)
x_data = np.array(syn, dtype=np.float64)
x_sorted = np.argsort(x_data)
x_weights = np.array([1.0 for i in range(len(syn))], dtype=np.float64)
x_cdf = np.cumsum(x_weights[x_sorted])
x_cdf /= x_cdf[-1]
y_data = np.array(act, dtype=np.float64)
y_sorted = np.argsort(y_data)
y_weights = np.array(act_w, dtype=np.float64)
y_cdf = np.cumsum(y_weights[y_sorted])
y_cdf /= y_cdf[-1]
ax.plot(y_data[y_sorted], y_cdf, label="Actual", color = "#A3A3A3")
ax.plot(x_data[x_sorted], x_cdf, label="Synthetic", color="#00205B")
imtitle = "dist_home_educ"
plottitle = "Distance from home to education"
if suffix:
imtitle += "_" + suffix
plottitle += " - " + suffix
imtitle += ".png"
ax.set_ylabel("Probability")
ax.set_xlabel("Crowfly Distance [km]")
ax.legend(loc="best")
ax.set_title(plottitle)
plt.savefig("%s/" % context.config["analysis_path"] + imtitle)
return syn, act, act_w
def all_the_plot_distances(context, df_act_dist, df_syn_dist, suffix = None):
dph_title = "distance_purpose_hist"
dmh_title = "distance_mode_hist"
dpc_title = "distance_purpose_cdf"
dmc_title = "distance_mode_cdf"
if suffix:
dph_title += "_" + suffix
dmh_title += "_" + suffix
dpc_title += "_" + suffix
dmc_title += "_" + suffix
dph_title += ".png"
dph_title += ".png"
dpc_title += ".png"
dmc_title += ".png"
myplottools.plot_comparison_hist_purpose(context, dph_title, df_act_dist, df_syn_dist, bins = np.linspace(0,25,120), dpi = 300, cols = 3, rows = 2)
myplottools.plot_comparison_hist_mode(context, dmh_title, df_act_dist, df_syn_dist, bins = np.linspace(0,25,120), dpi = 300, cols = 3, rows = 2)
myplottools.plot_comparison_cdf_purpose(context, dpc_title, df_act_dist, df_syn_dist, dpi = 300, cols = 3, rows = 2)
myplottools.plot_comparison_cdf_mode(context, dmc_title, df_act_dist, df_syn_dist, dpi = 300, cols = 3, rows = 2)
def execute(context):
# Import data, merging
df_syn, df_syn_no_trip = import_data_synthetic(context)
df_act, df_act_no_trip = import_data_actual(context)
# 3. CROWFLY DISTANCES
# 3.1. Compute the distances
df_syn_dist = compute_distances_synthetic(df_syn)
df_act_dist = compute_distances_actual(df_act)
print(list(set(df_syn["following_purpose"].values.tolist())))
# 3.2 Prepare for plotting
df_act_dist["x"] = df_act_dist["weight_person"] * df_act_dist["crowfly_distance"]
act = df_act_dist.groupby(["purpose"]).sum()["x"] / df_act_dist.groupby(["purpose"]).sum()["weight_person"]
syn = df_syn_dist.groupby(["following_purpose"]).mean()["crowfly_distance"]
print(syn)
# 3.3 Ready to plot!
myplottools.plot_comparison_bar(context, imtitle = "distancepurpose.png", plottitle = "Crowfly distance", ylabel = "Mean crowfly distance [km]", xlabel = "", lab = syn.index, actual = act, synthetic = syn, t = None, xticksrot = True )
all_the_plot_distances(context, df_act_dist, df_syn_dist)
# 3.4 Distance from home to education
syn_0, act_0, act_w0 = compare_dist_educ(context, df_syn, df_act)
# 5 Distance from home to education according to age
ages = [[0, 6], [6, 12], [12,15], [15,19], [19, 24], [25, 1000]]
syn_means = [np.mean(syn_0)]
act_means = [np.average(act_0, weights = act_w0)]
labels = ["All"]
for age in ages:
df_syn_age = df_syn[np.logical_and(df_syn["age"] >= age[0],
df_syn["age"] <= age[1] )]
df_act_age = df_act[np.logical_and(df_act["age"] >= age[0],
df_act["age"] <= age[1] )]
suf = "aged " + str(age[0]) + " to " + str(age[1])
lab = str(age[0]) + " to " + str(age[1]) + " y. o."
if age[1] == 1000:
lab = "25 +"
syn, act, act_w = compare_dist_educ(context, df_syn_age, df_act_age, suffix = suf)
syn_means.append(np.average(syn))
act_means.append(np.average(act, weights = act_w))
labels.append(lab)
myplottools.plot_comparison_bar(context,"avdisthomeeduc - age.png", "Average distances from home to education", "Average distance [km]", "Population group", labels, act_means, syn_means)