To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

aggr.py 2.84 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
import logging
import csv
import pandas as pd
from .utils import logging as logutils

DEFAULT_CLUSTERS_SUMMARY_CSV_FILE = "clusters.csv"
DEFAULT_CX_COURSE_STUDENTS_CSV_FILE = "cx_students.csv"
# Tab-separated eDoz exports that are read and combined by default.
DEFAULT_EDOZ_CSV_FILES = ("edoz-252083200L.csv", "edoz-252084800L.csv")


def _read_edoz_csv(edoz_csv_file):
  """Read one tab-separated eDoz export, reduced to legi and department.

  The "Nummer" column is renamed to "Legi" so that it matches the key column
  of the CX course data. "Legi" is returned as a regular column (not as the
  index) so that callers can de-duplicate on it.
  """
  edoz: pd.DataFrame = pd.read_csv(edoz_csv_file, sep="\t")
  return edoz[["Nummer", "Departement"]].rename(columns={"Nummer": "Legi"})


def _load_edoz_data(edoz_csv_files):
  """Combine several eDoz exports into one frame indexed by legi.

  Raises:
    AssertionError: if a legi is still present more than once after identical
      rows have been dropped (i.e. it is listed with differing departments).
  """
  # Vertically concat eDoz data. Since students may be enrolled into multiple
  # courses, duplicated rows are afterwards dropped.
  edoz_csv: pd.DataFrame = pd.concat(
    [_read_edoz_csv(edoz_csv_file) for edoz_csv_file in edoz_csv_files],
    ignore_index=True)

  # drop_duplicates() only compares columns and ignores the index. It must
  # therefore run while "Legi" is still a column; with "Legi" as the index,
  # rows would be compared by department alone and the result would collapse
  # to a single row per department.
  edoz_csv = edoz_csv.drop_duplicates().set_index("Legi")
  assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv"
  # # Show rows with non-unique indices (https://stackoverflow.com/questions/20199129)
  # print(edoz_csv[edoz_csv.index.duplicated(keep=False)])

  return edoz_csv


def main(
    clusters_summary_csv_file=DEFAULT_CLUSTERS_SUMMARY_CSV_FILE,
    cx_course_students_csv_file=DEFAULT_CX_COURSE_STUDENTS_CSV_FILE,
    edoz_csv_files=DEFAULT_EDOZ_CSV_FILES):
  """Print the course data of the students participating in the first cluster.

  Args:
    clusters_summary_csv_file: CSV file with the columns "cluster_id", "id1"
      and "id2"; the id columns are used as legis to look up students.
    cx_course_students_csv_file: CSV file with (at least) the columns "Legi",
      "Lastname", "Firstname" and "Email".
    edoz_csv_files: tab-separated eDoz exports with (at least) the columns
      "Nummer" and "Departement". Defaults to the two files that were
      previously hard-coded.

  Raises:
    AssertionError: if the combined eDoz data lists a legi more than once.
    KeyError: raised by the label lookup if a legi from the cluster is
      missing in the CX course data (behaviour of current pandas versions).
  """
  clusters_csv: pd.DataFrame = pd.read_csv(clusters_summary_csv_file)

  # Read CX course data, reduce to relevant columns, set index column
  course_csv: pd.DataFrame = pd.read_csv(cx_course_students_csv_file)
  course_csv = course_csv[["Legi", "Lastname", "Firstname", "Email"]].set_index("Legi")

  # Analogous for eDoz course data. The result is not consumed yet (see the
  # disabled merge below), but loading it already validates the eDoz files.
  edoz_csv: pd.DataFrame = _load_edoz_data(edoz_csv_files)

  cluster_groups = clusters_csv.groupby("cluster_id")
  for _, cluster in cluster_groups: # cluster: pd.DataFrame
    print("-"*60)
    # Get all ids (= legis) participating in a cluster
    ids_values = pd.concat([cluster["id1"], cluster["id2"]]).unique()

    # Label-based lookup of the participating legis in the course data.
    # Unlike an inner join, this does not silently skip unknown legis.
    join = course_csv.loc[ids_values]

    print("========== ids_values")
    print(ids_values.shape)
    print(ids_values)
    print("========== course_csv")
    print(course_csv)
    print("========== join")
    print(join.shape)
    print(join)
    # print(edoz_csv)

    # Not enabled yet: add the department via an inner join on the legi index.
    # full_cluster_data = pd.merge(
    #   join,
    #   edoz_csv,
    #   left_index=True, right_index=True)

    # print(full_cluster_data)

    # Work in progress: only the first cluster is inspected.
    break
74
75
76

# Script entry point: run main() with its default arguments when this module
# is executed directly (nothing happens on import).
if __name__ == "__main__":
  main()