Commit 90debcf7 authored by scmalte

aggr.py: write cluster.html report via Jinja2

parent ca0d87fd
import logging
import csv
import jinja2
import numpy  # needed for the numpy.ndarray annotation in the cluster loop below
import pandas as pd
from .utils import logging as logutils
@@ -13,64 +14,157 @@ def main(
clusters_csv: pd.DataFrame = pd.read_csv(clusters_summary_csv_file)
# Read CX course data, reduce to relevant columns, set index column
relevant_course_columns = ["Legi", "Lastname", "Firstname", "Email", "Gender", "TotalScore"]
course_csv: pd.DataFrame = pd.read_csv(cx_course_students_csv_file)
course_csv = course_csv[["Legi", "Lastname", "Firstname", "Email"]]
course_csv = course_csv[relevant_course_columns]
course_csv.set_index("Legi", inplace=True)
## TODO: Remove staff from course_csv
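# Hedged sketch, not part of this commit: one way the staff TODO above could be
# handled, assuming a hypothetical set STAFF_LEGIS of staff legi numbers:
#
# STAFF_LEGIS = {"00-000-000"}  # hypothetical placeholder values
# course_csv = course_csv[~course_csv.index.isin(STAFF_LEGIS)]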
# Analogous for eDoz course data
relevant_edoz_columns = ["Nummer", "Departement"]
edoz1_csv: pd.DataFrame = pd.read_csv("edoz-252083200L.csv", sep="\t")
edoz1_csv = edoz1_csv[["Nummer", "Departement"]]
edoz1_csv = edoz1_csv[relevant_edoz_columns]
edoz1_csv.rename(columns={"Nummer": "Legi"}, inplace=True)
edoz1_csv.set_index("Legi", inplace=True)
# print(edoz1_csv)
# print("edoz1_csv.index.is_unique = {}".format(edoz1_csv.index.is_unique))
edoz2_csv: pd.DataFrame = pd.read_csv("edoz-252084800L.csv", sep="\t")
edoz2_csv = edoz2_csv[["Nummer", "Departement"]]
edoz2_csv = edoz2_csv[relevant_edoz_columns]
edoz2_csv.rename(columns={"Nummer": "Legi"}, inplace=True)
edoz2_csv.set_index("Legi", inplace=True)
# print(edoz2_csv.index)
# print("edoz2_csv.index.is_unique = {}".format(edoz2_csv.index.is_unique))
## TODO: Could integrate eDoz data "Leistungskontrollen" to get information whether
## or not a student is a repeater
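# Hedged sketch, not part of this commit: assuming a hypothetical eDoz export
# "edoz-leistungskontrollen.csv" (tab-separated, one row per exam attempt, with a
# "Nummer" column holding the legi), repeaters could be flagged roughly as follows:
#
# lk_csv = pd.read_csv("edoz-leistungskontrollen.csv", sep="\t")  # hypothetical file
# attempts = lk_csv["Nummer"].value_counts()
# course_csv["Repeater"] = course_csv.index.to_series().map(attempts).fillna(0).gt(1)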
# Vertically concat eDoz data. Since students may be enrolled in multiple
# courses, rows with duplicated indices (legis) are dropped afterwards.
edoz_csv: pd.DataFrame = pd.concat([edoz1_csv, edoz2_csv])
# print("========== edoz_csv [initial]")
# print(edoz_csv.shape)
# print(edoz_csv)
# edoz_csv.drop_duplicates(inplace=True) # Not applicable here since indices are ignored
edoz_csv = edoz_csv.loc[~edoz_csv.index.duplicated(keep='first')] # Get rows not in the set of duplicated indices
# print("========== edoz_csv [unique]")
# print(edoz_csv.shape)
# print(edoz_csv)
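# Illustration, not part of this commit: why drop_duplicates() is not applicable here.
# It compares column values only and ignores the index, so two distinct legis with the
# same "Departement" would be collapsed, while a legi occurring twice with different
# departments would survive. Deduplicating on the index keeps one row per legi:
#
# toy = pd.DataFrame({"Departement": ["D-INFK", "D-INFK", "D-MATH"]},
#                    index=["11-111-111", "22-222-222", "11-111-111"])
# toy.drop_duplicates()                         # 2 rows, both with legi 11-111-111
# toy.loc[~toy.index.duplicated(keep="first")]  # 2 rows, one per legi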
## TODO: Add "Departement" column to course_csv, by joining with edoz_csv
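# Hedged sketch, not part of this commit: the TODO above could reuse the shared
# "Legi" index, analogous to the per-cluster join further below, e.g.:
#
# course_csv = course_csv.join(edoz_csv, how="left")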
### Aggregate course overview statistics
edoz_departements: pd.DataFrame = edoz_csv["Departement"].value_counts()
course_genders: pd.DataFrame = course_csv["Gender"].value_counts()
assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv"
# # Show rows with non-unique indices (https://stackoverflow.com/questions/20199129)
# print(edoz_csv[edoz_csv.index.duplicated(keep=False)])
jinja2_file_loader = jinja2.FileSystemLoader(".")
jinja2_env = jinja2.Environment(loader=jinja2_file_loader)
template = jinja2_env.get_template("clusters.html.jinja")
# output = template.render(colors=colors)
# print(output)
jinja2_rows = []
cluster_groups = clusters_csv.groupby("cluster_id")  # pandas DataFrameGroupBy
for _, cluster in cluster_groups: # cluster: pd.DataFrame
print("-"*60)
# print("-"*60)
# Get all ids (= legis) participating in a cluster
ids_values: numpy.ndarray = pd.concat([cluster["id1"], cluster["id2"]]).unique()
# ids = pd.Series(ids_values, name="Legi", index=ids_values)
# # Performs an inner join on the keys; here: legis
# # https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html#compare-with-sql-join
# join = pd.merge(ids, course_csv, left_index=True, right_index=True)
cluster_course_rows: pd.DataFrame = course_csv.loc[ids_values]
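# Note, not from the original commit: unlike the merge-based variant commented out
# above, .loc with a list of labels raises a KeyError in recent pandas versions if a
# legi from the clusters CSV is missing from course_csv. A lookup that silently skips
# unknown legis would be:
#
# cluster_course_rows = course_csv.loc[course_csv.index.intersection(ids_values)]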
print("========== ids_values")
print(ids_values.shape)
print(ids_values)
print("========== course_csv")
print(course_csv)
print("========== join")
print(join.shape)
print(join)
# print("========== cluster ")
# print(cluster.shape)
# print(cluster)
# print("========== ids_values ")
# print(ids_values.shape)
# print(ids_values)
# print("========== course_csv")
# print(course_csv)
# print("========== cluster_course_rows")
# print(cluster_course_rows.shape)
# print(cluster_course_rows)
# print("========== edoz_csv")
# print(edoz_csv.shape)
# print(edoz_csv)
# full_cluster_data = pd.merge(
# join,
# edoz_csv,
# left_index=True, right_index=True)
cluster_rows: pd.DataFrame = cluster_course_rows.join(edoz_csv)
# print("========== cluster_rows")
# print(cluster_rows.shape)
# print(cluster_rows)
# print(cluster)
# print(cluster["svg_file"].iat[0])
jinja2_rows.append((cluster, cluster_rows))
# break
department_counts = {}
for (cluster, cluster_rows) in jinja2_rows:
for index, value in cluster_rows["Departement"].value_counts().iteritems():
if index in department_counts:
department_counts[index] += value
else:
department_counts[index] = value
# print(department_counts)
department_percentage = {}
for dep in department_counts:
department_percentage[dep] = department_counts[dep] / edoz_departements[dep] * 100
# print(department_percentage)
gender_counts = {}
for (cluster, cluster_rows) in jinja2_rows:
for index, value in cluster_rows["Gender"].value_counts().iteritems():
if index in gender_counts:
gender_counts[index] += value
else:
gender_counts[index] = value
# print(gender_counts)
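# Equivalent, more compact aggregation (sketch, not part of this commit): both counting
# loops above could use collections.Counter instead of the manual if/else bookkeeping:
#
# from collections import Counter
# department_counts = Counter()
# gender_counts = Counter()
# for _, cluster_rows in jinja2_rows:
#     department_counts.update(cluster_rows["Departement"].value_counts().to_dict())
#     gender_counts.update(cluster_rows["Gender"].value_counts().to_dict())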
gender_percentage = {}
for gender in gender_counts:
gender_percentage[gender] = gender_counts[gender] / course_genders[gender] * 100
# print(gender_percentage)
percentages = {**department_percentage, **gender_percentage}
for key, value in percentages.items():
percentages[key] = round(value, 1)
# print(percentages)
# print(full_cluster_data)
template.stream(
title="Clusters",
clusters=jinja2_rows,
edoz_count=edoz_csv.shape[0],
course_count=course_csv.shape[0],
percentages=percentages
).dump("clusters.html")
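# Hedged sketch, not part of this commit: clusters.html.jinja itself is not included in
# this diff. A minimal inline template that consumes the same variables passed to
# template.stream(...) above could look roughly like this (markup is guessed):
#
# sketch = jinja2.Template(
#     "<h1>{{ title }}</h1>"
#     "<p>{{ course_count }} course students, {{ edoz_count }} eDoz records</p>"
#     "{% for key, value in percentages.items() %}<p>{{ key }}: {{ value }}%</p>{% endfor %}"
#     "{% for cluster, cluster_rows in clusters %}{{ cluster_rows.to_html() }}{% endfor %}"
# )
# sketch.stream(
#     title="Clusters",
#     clusters=jinja2_rows,
#     edoz_count=edoz_csv.shape[0],
#     course_count=course_csv.shape[0],
#     percentages=percentages
# ).dump("clusters-sketch.html")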
if __name__ == "__main__":
main()