To receive notifications about scheduled maintenance, please subscribe to the mailing list gitlab-operations@sympa.ethz.ch. You can subscribe at https://sympa.ethz.ch

Commit 90debcf7 authored by scmalte
Browse files

aggr.py: write cluster.html report via Jinja2

parent ca0d87fd
import logging import logging
import csv import csv
import jinja2
import pandas as pd import pandas as pd
from .utils import logging as logutils from .utils import logging as logutils
...@@ -13,64 +14,157 @@ def main( ...@@ -13,64 +14,157 @@ def main(
clusters_csv: pd.DataFrame = pd.read_csv(clusters_summary_csv_file) clusters_csv: pd.DataFrame = pd.read_csv(clusters_summary_csv_file)
# Read CX course data, reduce to relevant columns, set index column # Read CX course data, reduce to relevant columns, set index column
relevant_course_columns = ["Legi", "Lastname", "Firstname", "Email", "Gender", "TotalScore"]
course_csv: pd.DataFrame = pd.read_csv(cx_course_students_csv_file) course_csv: pd.DataFrame = pd.read_csv(cx_course_students_csv_file)
course_csv = course_csv[["Legi", "Lastname", "Firstname", "Email"]] course_csv = course_csv[relevant_course_columns]
course_csv.set_index("Legi", inplace=True) course_csv.set_index("Legi", inplace=True)
## TODO: Remove staff from course_csv
# Analogous for eDoz course data # Analogous for eDoz course data
relevant_edoz_columns = ["Nummer", "Departement"]
edoz1_csv: pd.DataFrame = pd.read_csv("edoz-252083200L.csv", sep="\t") edoz1_csv: pd.DataFrame = pd.read_csv("edoz-252083200L.csv", sep="\t")
edoz1_csv = edoz1_csv[["Nummer", "Departement"]] edoz1_csv = edoz1_csv[relevant_edoz_columns]
edoz1_csv.rename(columns={"Nummer": "Legi"}, inplace=True) edoz1_csv.rename(columns={"Nummer": "Legi"}, inplace=True)
edoz1_csv.set_index("Legi", inplace=True) edoz1_csv.set_index("Legi", inplace=True)
# print(edoz1_csv) # print(edoz1_csv)
# print("edoz1_csv.index.is_unique = {}".format(edoz1_csv.index.is_unique)) # print("edoz1_csv.index.is_unique = {}".format(edoz1_csv.index.is_unique))
edoz2_csv: pd.DataFrame = pd.read_csv("edoz-252084800L.csv", sep="\t") edoz2_csv: pd.DataFrame = pd.read_csv("edoz-252084800L.csv", sep="\t")
edoz2_csv = edoz2_csv[["Nummer", "Departement"]] edoz2_csv = edoz2_csv[relevant_edoz_columns]
edoz2_csv.rename(columns={"Nummer": "Legi"}, inplace=True) edoz2_csv.rename(columns={"Nummer": "Legi"}, inplace=True)
edoz2_csv.set_index("Legi", inplace=True) edoz2_csv.set_index("Legi", inplace=True)
# print(edoz2_csv.index) # print(edoz2_csv.index)
# print("edoz2_csv.index.is_unique = {}".format(edoz2_csv.index.is_unique)) # print("edoz2_csv.index.is_unique = {}".format(edoz2_csv.index.is_unique))
## TODO: Could integrate eDoz data "Leistungskontrollen" to get information whether
## or not a student is a repeater
# Vertically concat eDoz data. Since students may be enrolled into multiple # Vertically concat eDoz data. Since students may be enrolled into multiple
# courses, duplicated rows are afterwards dropped. # courses, duplicated rows are afterwards dropped.
edoz_csv: pd.DataFrame = pd.concat([edoz1_csv, edoz2_csv]) edoz_csv: pd.DataFrame = pd.concat([edoz1_csv, edoz2_csv])
edoz_csv.drop_duplicates(inplace=True) # print("========== edoz_csv [initial]")
# print(edoz_csv.shape)
# print(edoz_csv)
# edoz_csv.drop_duplicates(inplace=True) # Not applicable here since indices are ignored
edoz_csv = edoz_csv.loc[~edoz_csv.index.duplicated(keep='first')] # Get rows not in the set of duplicated indices
# print("========== edoz_csv [unique]")
# print(edoz_csv.shape)
# print(edoz_csv)
## TODO: Add "Departement" column to course_csv, by joining with edoz_csv
### Aggregate course overview statistics
edoz_departements: pd.DataFrame = edoz_csv["Departement"].value_counts()
course_genders: pd.DataFrame = course_csv["Gender"].value_counts()
assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv" assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv"
# # Show rows with non-unique indices (https://stackoverflow.com/questions/20199129) # # Show rows with non-unique indices (https://stackoverflow.com/questions/20199129)
# print(edoz_csv[edoz_csv.index.duplicated(keep=False)]) # print(edoz_csv[edoz_csv.index.duplicated(keep=False)])
jinja2_file_loader = jinja2.FileSystemLoader(".")
jinja2_env = jinja2.Environment(loader=jinja2_file_loader)
template = jinja2_env.get_template("clusters.html.jinja")
# output = template.render(colors=colors)
# print(output)
jinja2_rows = []
cluster_groups: pd.DataFrameGroupBy = clusters_csv.groupby("cluster_id") cluster_groups: pd.DataFrameGroupBy = clusters_csv.groupby("cluster_id")
for _, cluster in cluster_groups: # cluster: pd.DataFrame for _, cluster in cluster_groups: # cluster: pd.DataFrame
print("-"*60) # print("-"*60)
# Get all ids (= legis) participating in a cluster # Get all ids (= legis) participating in a cluster
ids_values: numpy.ndarray = pd.concat([cluster["id1"], cluster["id2"]]).unique() ids_values: numpy.ndarray = pd.concat([cluster["id1"], cluster["id2"]]).unique()
# ids = pd.Series(ids_values, name="Legi", index=ids_values) # ids = pd.Series(ids_values, name="Legi", index=ids_values)
# # Performs an inner join on the keys; here: legis
# Performs an inner join on the keys; here: legis # # https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html#compare-with-sql-join
# https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html#compare-with-sql-join
# join = pd.merge(ids, course_csv, left_index=True, right_index=True) # join = pd.merge(ids, course_csv, left_index=True, right_index=True)
join = course_csv.loc[ids_values] cluster_course_rows: pd.DataFrame = course_csv.loc[ids_values]
print("========== ids_values") # print("========== cluster ")
print(ids_values.shape) # print(cluster.shape)
print(ids_values) # print(cluster)
print("========== course_csv") # print("========== ids_values ")
print(course_csv) # print(ids_values.shape)
print("========== join") # print(ids_values)
print(join.shape) # print("========== course_csv")
print(join) # print(course_csv)
# print("========== cluster_course_rows")
# print(cluster_course_rows.shape)
# print(cluster_course_rows)
# print("========== edoz_csv")
# print(edoz_csv.shape)
# print(edoz_csv) # print(edoz_csv)
# full_cluster_data = pd.merge( cluster_rows: pd.DataFrame = cluster_course_rows.join(edoz_csv)
# join,
# edoz_csv, # print("========== cluster_rows")
# left_index=True, right_index=True) # print(cluster_rows.shape)
# print(cluster_rows)
# print(cluster)
# print(cluster["svg_file"].iat[0])
jinja2_rows.append((cluster, cluster_rows))
# break
department_counts = {}
for (cluster, cluster_rows) in jinja2_rows:
for index, value in cluster_rows["Departement"].value_counts().iteritems():
if index in department_counts:
department_counts[index] += value
else:
department_counts[index] = value
# print(department_counts)
department_percentage = {}
for dep in department_counts:
department_percentage[dep] = department_counts[dep] / edoz_departements[dep] * 100
# print(department_percentage)
gender_counts = {}
for (cluster, cluster_rows) in jinja2_rows:
for index, value in cluster_rows["Gender"].value_counts().iteritems():
if index in gender_counts:
gender_counts[index] += value
else:
gender_counts[index] = value
# print(gender_counts)
gender_percentage = {}
for dep in gender_counts:
gender_percentage[dep] = gender_counts[dep] / course_genders[dep] * 100
# print(gender_percentage)
percentages = {**department_percentage, **gender_percentage}
for key, value in percentages.items():
percentages[key] = round(value, 1)
# print(percentages)
# print(full_cluster_data) template.stream(
title="Clusters",
clusters=jinja2_rows,
edoz_count=edoz_csv.shape[0],
course_count=course_csv.shape[0],
percentages=percentages
).dump("clusters.html")
break
if __name__ == "__main__": if __name__ == "__main__":
main() main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment