To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

Commit 49f918b9 authored by scmalte
Browse files

aggr.py: joining cluster data with exports from CX and eDoz

parent d6652bb1
def _read_edoz_csv(edoz_csv_file):
    """Load one eDoz export as a department table indexed by legi.

    The export is read as tab-separated text. Only the "Nummer" and
    "Departement" columns are kept; "Nummer" is renamed to "Legi" and
    becomes the index.
    """
    edoz = pd.read_csv(edoz_csv_file, sep="\t")
    return (
        edoz[["Nummer", "Departement"]]
        .rename(columns={"Nummer": "Legi"})
        .set_index("Legi")
    )


def main(
        clusters_summary_csv_file=DEFAULT_CLUSTERS_SUMMARY_CSV_FILE,
        cx_course_students_csv_file=DEFAULT_CX_COURSE_STUDENTS_CSV_FILE):
    """Join cluster data with the CX and eDoz exports and print the result.

    Args:
        clusters_summary_csv_file: CSV with (at least) the columns
            "cluster_id", "id1" and "id2"; the ids are used as legis.
        cx_course_students_csv_file: CX course export with (at least) the
            columns "Legi", "Lastname", "Firstname" and "Email".

    Raises:
        AssertionError: if a legi appears with more than one department
            across the two eDoz exports.
        KeyError: if a legi of the inspected cluster is missing from the
            CX course export (raised by the ``.loc`` lookup).
    """
    clusters_csv: pd.DataFrame = pd.read_csv(clusters_summary_csv_file)

    # Read CX course data, reduce to relevant columns, set index column.
    # Chained (not in-place) so that we never mutate a slice of the raw frame.
    course_csv: pd.DataFrame = pd.read_csv(cx_course_students_csv_file)
    course_csv = course_csv[["Legi", "Lastname", "Firstname", "Email"]].set_index("Legi")

    # Vertically concat eDoz data. Since students may be enrolled into multiple
    # courses, duplicated rows are afterwards dropped.
    edoz_csv: pd.DataFrame = pd.concat([
        _read_edoz_csv("edoz-252083200L.csv"),
        _read_edoz_csv("edoz-252084800L.csv"),
    ])
    # drop_duplicates() compares column values only and ignores the index.
    # With "Legi" as the index it would keep a single student per department,
    # so the legi is moved back into a column while de-duplicating.
    edoz_csv = edoz_csv.reset_index().drop_duplicates().set_index("Legi")
    # Still fails when one legi is listed with two different departments; see
    # https://stackoverflow.com/questions/20199129 for listing such rows.
    assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv"

    # Iterating the groupby yields (cluster_id, DataFrame) pairs.
    for _, cluster in clusters_csv.groupby("cluster_id"):
        print("-" * 60)

        # Get all ids (= legis) participating in a cluster (a numpy array).
        ids_values = pd.concat([cluster["id1"], cluster["id2"]]).unique()

        # Label-based lookup of the cluster's legis in the course data.
        # Unlike an inner join (pd.merge on the indices), this raises
        # KeyError when a legi is not part of the course export.
        join = course_csv.loc[ids_values]

        print("========== ids_values")
        print(ids_values.shape)
        print(ids_values)
        print("========== course_csv")
        print(course_csv)
        print("========== join")
        print(join.shape)
        print(join)

        # TODO: merge `join` with `edoz_csv` on the legi index to attach the
        # department to every cluster member.

        # Work in progress: only the first cluster is inspected.
        break


if __name__ == "__main__":
    main()
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment