# aggr.py
import argparse
import csv
import datetime
import logging
import os
from collections import Counter
from dataclasses import dataclass

import jinja2
import pandas as pd

from .utils import logging as logutils

# Default locations and name patterns of input/output files. The matches CSV
# is produced by mu-cluster; the Jinja2 template is produced by mu-init.
DEFAULT_CLUSTER_FILES_DIR = "_clusters"
DEFAULT_CLUSTERS_MATCHES_CSV_FILE = "clusters-matches.csv"
DEFAULT_CLUSTERS_STUDENTS_CSV_FILE = "clusters-students.csv"
DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN = "cluster-students-{}.csv"
DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE = "./_static/clusters.html.jinja"

@dataclass
class RelativeDataPoint:
  """A named part-of-a-whole count, e.g. flagged students of one department
  (part) relative to all students of that department (base)."""
  name: str   # Data point's name
  part: float # Relevant elements (i.e. numerator)
  base: float # All elements (i.e. denominator)

  @property
  def percentage(self):
    """Return part/base as a percentage, rounded to one decimal digit.

    Fix: the former round_to_digits parameter was removed — a property
    getter can never receive extra arguments, and the body ignored the
    parameter anyway (it hardcoded 1 digit).
    """
    return round(self.part * 100 / self.base, 1)

## TODO: Remove staff from course data
## TODO: Could integrate eDoz data "Leistungskontrollen" to get information
##       whether or not a student is a repeater
## TODO: Support sorting clusters by max (or average) involved percentage
def aggregate(
    edoz_exports, # List of argparse.FileType objects (eDoz TSV exports)
    cx_course_students_csv_file, # Single argparse.FileType object (CX CSV export)
    cluster_files_dir=DEFAULT_CLUSTER_FILES_DIR,
    clusters_matches_csv_file=DEFAULT_CLUSTERS_MATCHES_CSV_FILE,
    clusters_students_csv_file=DEFAULT_CLUSTERS_STUDENTS_CSV_FILE,
    cluster_students_csv_file_pattern=DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN,
    jinja_cluster_template_file=DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE):
  """Aggregate cluster matches, Code Expert course data and eDoz data.

  Writes one CSV file per cluster (into cluster_files_dir), one combined CSV
  file covering all clusters, and renders an HTML report to clusters.html.

  Raises:
    RuntimeError: if the matches CSV file or the Jinja2 template is missing.
  """
  logutils.configure_level_and_format()

  if not os.path.isfile(clusters_matches_csv_file):
    raise RuntimeError("CSV file {} with matches per clusters doesn't exist. Should have been created by mu-cluster.".format(clusters_matches_csv_file))

  clusters_csv: pd.DataFrame = pd.read_csv(clusters_matches_csv_file)
  course_csv = _read_course_data(cx_course_students_csv_file)
  edoz_csv = _read_edoz_data(edoz_exports)

  ## TODO: Add "Departement" column to course_csv, by joining with edoz_csv

  # Per-category totals over the whole population, used as denominators
  # (bases) for the report's relative statistics
  edoz_departements = edoz_csv["Departement"].value_counts()
  course_genders = course_csv["Gender"].value_counts()

  # Load the template before writing any output, so a missing template
  # fails early (as in the original code)
  template = _load_cluster_template(jinja_cluster_template_file)

  jinja2_rows = _collect_cluster_rows(
      clusters_csv, course_csv, edoz_csv,
      cluster_files_dir, cluster_students_csv_file_pattern)

  _write_combined_clusters_csv(jinja2_rows, clusters_students_csv_file)

  plagiarist_count = sum(size for _, size, _, _ in jinja2_rows)

  # Relative statistics: flagged students per department and per gender
  department_data = _relative_data_points(
      _tally_column(jinja2_rows, "Departement"), edoz_departements)
  gender_data = _relative_data_points(
      _tally_column(jinja2_rows, "Gender"), course_genders)
  datapoints = department_data + gender_data

  _prepare_rows_for_report(jinja2_rows)

  template.stream(
    title="Clusters",
    timestamp=datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
    clusters=jinja2_rows,
    edoz_count=edoz_csv.shape[0],
    course_count=course_csv.shape[0],
    plagiarist_count=plagiarist_count,
    datapoints=datapoints
  ).dump("clusters.html")


def _read_course_data(cx_course_students_csv_file):
  """Read the CX course export, reduce it to the relevant columns, truncate
  TotalScore (which are floats), and index rows by legi."""
  relevant_course_columns = ["Legi", "Lastname", "Firstname", "Email", "Groupname", "Gender", "TotalScore"]
  course_csv: pd.DataFrame = pd.read_csv(cx_course_students_csv_file)
  course_csv = course_csv[relevant_course_columns]
  course_csv["TotalScore"] = course_csv["TotalScore"].round(0)
  course_csv.set_index("Legi", inplace=True)
  return course_csv


def _read_edoz_data(edoz_exports):
  """Read all eDoz exports (tab-separated), keep legi and department, and
  vertically concatenate them into a single frame indexed by legi."""
  relevant_edoz_columns = ["Nummer", "Departement"]
  individual_edoz_csv_frames = []
  for csvfile in edoz_exports:
    edoz_csv: pd.DataFrame = pd.read_csv(csvfile, sep="\t")
    edoz_csv = edoz_csv[relevant_edoz_columns]
    edoz_csv.rename(columns={"Nummer": "Legi"}, inplace=True)
    edoz_csv.set_index("Legi", inplace=True)
    individual_edoz_csv_frames.append(edoz_csv)

  # Students may be enrolled into multiple courses, so the concatenated frame
  # may contain duplicated rows. drop_duplicates() is not applicable here
  # since it ignores indices; instead, keep the first row of every
  # duplicated index (= legi).
  edoz_csv = pd.concat(individual_edoz_csv_frames)
  edoz_csv = edoz_csv.loc[~edoz_csv.index.duplicated(keep="first")]

  assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv"
  return edoz_csv


def _load_cluster_template(jinja_cluster_template_file):
  """Load the Jinja2 cluster report template, or raise a RuntimeError."""
  jinja2_env = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
  try:
    return jinja2_env.get_template(jinja_cluster_template_file)
  except jinja2.exceptions.TemplateNotFound as exception:
    raise RuntimeError("Couldn't load Jinja2 template {}. Should have been created by mu-init.".format(jinja_cluster_template_file)) from exception


def _collect_cluster_rows(clusters_csv, course_csv, edoz_csv, cluster_files_dir, cluster_students_csv_file_pattern):
  """For each cluster, join course and eDoz data of all involved students and
  write the result to a per-cluster CSV file.

  Returns a list of tuples
  (cluster_id, student count, cluster matches frame, student rows frame).
  """
  jinja2_rows = []

  for cluster_id, cluster in clusters_csv.groupby("cluster_id"):
    # All ids (= legis) participating in this cluster
    ids_values = pd.concat([cluster["id1"], cluster["id2"]]).unique()

    # Select the rows for the indices in ids_values. If there is no row for a
    # given index -- e.g. when a master solution was sent to MOSS as an
    # additional submission -- a row with all NaNs is returned. See also
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
    cluster_course_rows: pd.DataFrame = course_csv.reindex(ids_values)
    cluster_rows: pd.DataFrame = cluster_course_rows.join(edoz_csv)

    students_per_clusters_file = os.path.join(
        cluster_files_dir,
        cluster_students_csv_file_pattern.format(cluster_id))

    logging.info("Writing students from cluster {} to file {}".format(cluster_id, students_per_clusters_file))
    cluster_rows.to_csv(students_per_clusters_file)

    jinja2_rows.append((cluster_id, cluster_rows.shape[0], cluster, cluster_rows))

  return jinja2_rows


def _write_combined_clusters_csv(jinja2_rows, clusters_students_csv_file):
  """Append all per-cluster student frames into one combined CSV file,
  tagging each row with its cluster id. Mutates the frames in jinja2_rows
  (the added Cluster-ID column is later renamed for the HTML report)."""
  logging.info("Writing all clusters to file {}".format(clusters_students_csv_file))
  write_header = True
  write_mode = "w"
  for cluster_id, _, _, cluster_rows in jinja2_rows:
    cluster_rows["Cluster-ID"] = cluster_id # Inserts column at end
    # cluster_rows.insert(0, "Cluster-ID", cluster_id) ## Inserts column after index (Legi)
    cluster_rows.to_csv(clusters_students_csv_file, mode=write_mode, header=write_header)
    write_header = False
    write_mode = "a"


def _tally_column(jinja2_rows, column):
  """Sum the value counts of the given column over all cluster frames."""
  counts = Counter()
  for _, _, _, cluster_rows in jinja2_rows:
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for value, count in cluster_rows[column].value_counts().items():
      counts[value] += count
  return counts


def _relative_data_points(counts, base_counts):
  """Turn per-category tallies into RelativeDataPoints, using base_counts (a
  value-counts Series over the whole population) as denominators."""
  return [RelativeDataPoint(key, count, base_counts[key]) for key, count in counts.items()]


def _prepare_rows_for_report(jinja2_rows):
  """Final in-place preparations of the student frames before rendering."""
  for _, _, _, cluster_rows in jinja2_rows:
    # Convert total scores from floats to integers, to reduce output width.
    # Total scores are effectively integers, anyway. fillna(0) replaces each
    # NA/NaN with zero, since NA/NaN cannot be converted to int.
    cluster_rows["TotalScore"] = \
      cluster_rows["TotalScore"].fillna(0).astype(int)

    # Rename columns to reduce output width.
    cluster_rows.rename(columns={
      "Lastname": "Last",
      "Firstname": "First",
      "Groupname": "Group",
      "Gender": "Gnd.",
      "TotalScore": "Tot. score",
      "Departement": "Dept.",
      "Cluster-ID": "CID"
    }, inplace=True)
def configure_cli_parser(parser):
  """Register this tool's command-line arguments on the given argparse parser."""
  # Both exports are opened as UTF-8 text files by argparse itself
  utf8_file = argparse.FileType("r", encoding="utf-8")

  parser.add_argument(
    "-ee", "--edoz-exports",
    type=utf8_file,
    nargs="+",
    help="eDoz student list exports (CSV)",
    required=True)

  parser.add_argument(
    "-ce", "--code-expert-export",
    type=utf8_file,
    help="Code Expert student data export (CSV)",
    required=True)

  logutils.add_loglevel_argument(parser)


def main(
    cluster_files_dir=DEFAULT_CLUSTER_FILES_DIR,
    clusters_matches_csv_file=DEFAULT_CLUSTERS_MATCHES_CSV_FILE,
    clusters_students_csv_file=DEFAULT_CLUSTERS_STUDENTS_CSV_FILE,
    cluster_students_csv_file_pattern=DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN,
    jinja_cluster_template_file=DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE):
  """CLI entry point: parse arguments, configure logging, run aggregate()."""
  parser = argparse.ArgumentParser()
  configure_cli_parser(parser)
  args = parser.parse_args()

  logutils.configure_level_and_format(args.log_level)

  # Forward the file-location settings by keyword for clarity
  aggregate(
    args.edoz_exports,
    args.code_expert_export,
    cluster_files_dir=cluster_files_dir,
    clusters_matches_csv_file=clusters_matches_csv_file,
    clusters_students_csv_file=clusters_students_csv_file,
    cluster_students_csv_file_pattern=cluster_students_csv_file_pattern,
    jinja_cluster_template_file=jinja_cluster_template_file)


# Script entry point: delegate to the CLI main() when executed directly.
if __name__ == "__main__":
  main()