# aggr.py
import argparse
import csv
import logging
import os

import jinja2
import pandas as pd

from .utils import logging as logutils

# Default locations used by aggregate() and main(). All paths are relative to
# the current working directory.

# Directory into which one student CSV file per cluster is written.
DEFAULT_CLUSTER_FILES_DIR = "_clusters"
# Input CSV with the matches per cluster; aggregate() requires it to exist.
DEFAULT_CLUSTERS_MATCHES_CSV_FILE = "clusters-matches.csv"
# Output CSV that collects the students of all clusters.
DEFAULT_CLUSTERS_STUDENTS_CSV_FILE = "clusters-students.csv"
# File name pattern of the per-cluster CSV files; "{}" receives the cluster id.
DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN = "cluster-students-{}.csv"
# Jinja2 template from which the HTML cluster overview is rendered.
DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE = "./_static/clusters.html.jinja"


## TODO: Also split the per-cluster processing and the HTML rendering out of aggregate()
def _load_course_data(cx_course_students_csv_file):
  """Read the CX course export and return the relevant columns indexed by Legi.

  TotalScore is rounded to zero decimals (the values stay floats).
  """
  relevant_course_columns = ["Legi", "Lastname", "Firstname", "Email", "Gender", "TotalScore"]
  course_csv: pd.DataFrame = pd.read_csv(cx_course_students_csv_file)
  # Explicit copy: the column assignment below must not operate on a slice of
  # the frame that was read (avoids pandas' SettingWithCopy warning).
  course_csv = course_csv[relevant_course_columns].copy()
  course_csv["TotalScore"] = course_csv["TotalScore"].round(0)
  course_csv.set_index("Legi", inplace=True)
  ## TODO: Remove staff from course_csv
  return course_csv


def _load_edoz_data(edoz_exports):
  """Read all eDoz exports (tab-separated) into one frame indexed by Legi.

  Only the columns "Nummer" (renamed to "Legi") and "Departement" are kept.
  Since students may be enrolled into multiple courses, only the first row
  per Legi is retained.

  Raises:
    ValueError: If edoz_exports is empty.
  """
  ## TODO: Could integrate eDoz data "Leistungskontrollen" to get information whether
  ##       or not a student is a repeater
  relevant_edoz_columns = ["Nummer", "Departement"]
  individual_edoz_csv_frames = []
  for csvfile in edoz_exports:
    edoz_csv: pd.DataFrame = pd.read_csv(csvfile, sep="\t")
    edoz_csv = edoz_csv[relevant_edoz_columns].rename(columns={"Nummer": "Legi"})
    individual_edoz_csv_frames.append(edoz_csv.set_index("Legi"))

  if not individual_edoz_csv_frames:
    raise ValueError("At least one eDoz export is required")

  # Vertically concat the eDoz data. DataFrame.drop_duplicates is not applicable
  # here since it ignores the index, hence the selection via duplicated indices.
  edoz_csv = pd.concat(individual_edoz_csv_frames)
  edoz_csv = edoz_csv.loc[~edoz_csv.index.duplicated(keep='first')]

  assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv"
  return edoz_csv


def _involvement_percentages(cluster_frames, column, totals):
  """Return, per value of column, the percentage of students found in clusters.

  Occurrences of each value are counted over all cluster_frames (missing
  values are not counted) and divided by the corresponding entry of totals.
  """
  involved = {}
  for cluster_rows in cluster_frames:
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed
    for category, count in cluster_rows[column].value_counts().items():
      involved[category] = involved.get(category, 0) + count

  return {category: count / totals[category] * 100 for category, count in involved.items()}


def aggregate(
    edoz_exports, # List of argparse.FileType objects
    cx_course_students_csv_file, # Single argparse.FileType object
    cluster_files_dir=DEFAULT_CLUSTER_FILES_DIR,
    clusters_matches_csv_file=DEFAULT_CLUSTERS_MATCHES_CSV_FILE,
    clusters_students_csv_file=DEFAULT_CLUSTERS_STUDENTS_CSV_FILE,
    cluster_students_csv_file_pattern=DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN,
    jinja_cluster_template_file=DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE):
  """Join cluster matches with student data and write CSV/HTML overviews.

  For every cluster in clusters_matches_csv_file, the participating students
  are looked up in the CX course export and joined with their eDoz department.
  The result is written to one CSV file per cluster (in cluster_files_dir),
  to one combined CSV file (clusters_students_csv_file) and, rendered through
  the Jinja2 template, to "clusters.html" in the current working directory.

  Args:
    edoz_exports: Non-empty list of tab-separated eDoz exports; each item is
      passed to pandas.read_csv.
    cx_course_students_csv_file: CX course export; passed to pandas.read_csv.
    cluster_files_dir: Directory for the per-cluster CSV files; it is created
      if it does not exist yet.
    clusters_matches_csv_file: Existing CSV file with the columns cluster_id,
      id1 and id2.
    clusters_students_csv_file: Path of the combined output CSV file.
    cluster_students_csv_file_pattern: File name pattern of the per-cluster
      CSV files; "{}" is replaced by the cluster id.
    jinja_cluster_template_file: Template path, resolved relative to the
      current working directory.

  Raises:
    RuntimeError: If clusters_matches_csv_file or the Jinja2 template is missing.
    ValueError: If edoz_exports is empty.
  """
  # NOTE(review): main() configures logging with the CLI log level before
  # calling this function; whether this second, argument-less call resets
  # that level depends on logutils — confirm.
  logutils.configure_level_and_format()

  if not os.path.isfile(clusters_matches_csv_file):
    raise RuntimeError("CSV file {} with matches per clusters doesn't exist. Should have been created by mu-cluster.".format(clusters_matches_csv_file))

  clusters_csv: pd.DataFrame = pd.read_csv(clusters_matches_csv_file)
  course_csv: pd.DataFrame = _load_course_data(cx_course_students_csv_file)
  edoz_csv: pd.DataFrame = _load_edoz_data(edoz_exports)

  ## TODO: Add "Departement" column to course_csv, by joining with edoz_csv

  ### Aggregate course overview statistics
  edoz_departements: pd.Series = edoz_csv["Departement"].value_counts()
  course_genders: pd.Series = course_csv["Gender"].value_counts()

  jinja2_file_loader = jinja2.FileSystemLoader(".")
  jinja2_env = jinja2.Environment(loader=jinja2_file_loader)

  try:
    template = jinja2_env.get_template(jinja_cluster_template_file)
  except jinja2.exceptions.TemplateNotFound as exception:
    raise RuntimeError("Couldn't load Jinja2 template {}. Should have been created by mu-init.".format(jinja_cluster_template_file)) from exception

  # Make sure the per-cluster files can be written. An empty directory name
  # means "current working directory", for which nothing must be created.
  if cluster_files_dir:
    os.makedirs(cluster_files_dir, exist_ok=True)

  # One tuple (cluster id, number of students, matches, students) per cluster
  jinja2_rows = []

  for cluster_id, cluster in clusters_csv.groupby("cluster_id"): # cluster: pd.DataFrame
    # Get all ids (= legis) participating in a cluster
    ids_values = pd.concat([cluster["id1"], cluster["id2"]]).unique()

    # Select rows for list of indices ids_values.
    # If there is no row for a given index — e.g. when a master solution was sent
    # to MOSS as an additional submission — a row with all NaNs is returned.
    # See also https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike.
    cluster_course_rows: pd.DataFrame = course_csv.reindex(ids_values)

    # Left join on the index (= legis) adds the "Departement" column
    cluster_rows: pd.DataFrame = cluster_course_rows.join(edoz_csv)

    students_per_clusters_file = os.path.join(
        cluster_files_dir,
        cluster_students_csv_file_pattern.format(cluster_id))

    logging.info("Writing students from cluster %s to file %s", cluster_id, students_per_clusters_file)
    cluster_rows.to_csv(students_per_clusters_file)

    jinja2_rows.append((cluster_id, cluster_rows.shape[0], cluster, cluster_rows))

  logging.info("Writing all clusters to file %s", clusters_students_csv_file)
  for position, (cluster_id, _, _, cluster_rows) in enumerate(jinja2_rows):
    # Appends the column at the end. This deliberately modifies the frames
    # stored in jinja2_rows, i.e. the template sees the column as well.
    cluster_rows["Cluster-ID"] = cluster_id
    # The first cluster creates the file and writes the header, all others append
    is_first = position == 0
    cluster_rows.to_csv(
        clusters_students_csv_file,
        mode="w" if is_first else "a",
        header=is_first)

  ## TODO: Support sorting clusters by max (or average) involved percentage

  plagiarist_count = sum(size for _, size, _, _ in jinja2_rows)

  cluster_frames = [cluster_rows for _, _, _, cluster_rows in jinja2_rows]
  department_percentage = _involvement_percentages(cluster_frames, "Departement", edoz_departements)
  gender_percentage = _involvement_percentages(cluster_frames, "Gender", course_genders)

  percentages = {
    key: round(value, 1)
    for key, value in {**department_percentage, **gender_percentage}.items()
  }

  template.stream(
    title="Clusters",
    clusters=jinja2_rows,
    edoz_count=edoz_csv.shape[0],
    course_count=course_csv.shape[0],
    plagiarist_count=plagiarist_count,
    percentages=percentages
  ).dump("clusters.html")


def configure_cli_parser(parser):
  """Register the command-line arguments of this tool on parser.

  Adds the two required input-file options (-ee/--edoz-exports, which accepts
  one or more files, and -ce/--code-expert-export), both opened for reading by
  argparse, and then the log level argument provided by logutils.
  """
  # (option flags, option-specific keyword arguments), in registration order
  input_file_options = (
    (("-ee", "--edoz-exports"),
     {"nargs": "+", "help": "eDoz student list exports (CSV)"}),
    (("-ce", "--code-expert-export"),
     {"help": "Code Expert student data export (CSV)"}),
  )

  for flags, specifics in input_file_options:
    parser.add_argument(
      *flags,
      type=argparse.FileType('r'),
      required=True,
      **specifics)

  logutils.add_loglevel_argument(parser)


def main(
    cluster_files_dir=DEFAULT_CLUSTER_FILES_DIR,
    clusters_matches_csv_file=DEFAULT_CLUSTERS_MATCHES_CSV_FILE,
    clusters_students_csv_file=DEFAULT_CLUSTERS_STUDENTS_CSV_FILE,
    cluster_students_csv_file_pattern=DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN,
    jinja_cluster_template_file=DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE):
  """Command-line entry point: parse the arguments and run aggregate().

  The input files (eDoz exports, Code Expert export) and the log level are
  taken from the command line; the remaining paths come from the parameters
  of this function and are handed to aggregate() unchanged.
  """
  cli = argparse.ArgumentParser()
  configure_cli_parser(cli)
  options = cli.parse_args()

  logutils.configure_level_and_format(options.log_level)

  aggregate(
    edoz_exports=options.edoz_exports,
    cx_course_students_csv_file=options.code_expert_export,
    cluster_files_dir=cluster_files_dir,
    clusters_matches_csv_file=clusters_matches_csv_file,
    clusters_students_csv_file=clusters_students_csv_file,
    cluster_students_csv_file_pattern=cluster_students_csv_file_pattern,
    jinja_cluster_template_file=jinja_cluster_template_file)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
  main()