import os
import logging
import csv
import jinja2
import argparse
import pandas as pd
from dataclasses import dataclass
from .utils import logging as logutils

# Default locations used by aggregate() and main(). All paths are relative to
# the current working directory.

# Directory into which one students-CSV per cluster is written
DEFAULT_CLUSTER_FILES_DIR = "_clusters"
# Input CSV with the matches per cluster (expected to be created by mu-cluster)
DEFAULT_CLUSTERS_MATCHES_CSV_FILE = "clusters-matches.csv"
# Output CSV containing the students of all clusters
DEFAULT_CLUSTERS_STUDENTS_CSV_FILE = "clusters-students.csv"
# File name pattern of the per-cluster students CSV; {} is the cluster id
DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN = "cluster-students-{}.csv"
# Jinja2 template of the HTML report (expected to be created by mu-init)
DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE = "./_static/clusters.html.jinja"


@dataclass
class RelativeDataPoint:
  """A named count (`part`) put in relation to a total count (`base`)."""

  name: str # Data point's name
  part: float # Relevant elements (i.e. numerator)
  base: float # All elements (i.e. denominator)

  @property
  def percentage(self, round_to_digits=1):
    """Return `part` as a percentage of `base`, rounded.

    Attribute access (`point.percentage`) always uses the default of one
    decimal digit. A different precision is only reachable by calling the
    getter explicitly, e.g. `RelativeDataPoint.percentage.fget(point, 2)`.

    Raises:
      ZeroDivisionError: If `base` is 0.
    """
    # Honour round_to_digits instead of a hard-coded 1, which silently
    # ignored the parameter.
    return round(self.part * 100 / self.base, round_to_digits)

## TODO: Refactor aggregate() into separate, smaller functions
def aggregate(
    edoz_exports, # List of argparse.FileType objects
    cx_course_students_csv_file, # Single argparse.FileType object
    cluster_files_dir=DEFAULT_CLUSTER_FILES_DIR,
    clusters_matches_csv_file=DEFAULT_CLUSTERS_MATCHES_CSV_FILE,
    clusters_students_csv_file=DEFAULT_CLUSTERS_STUDENTS_CSV_FILE,
    cluster_students_csv_file_pattern=DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN,
    jinja_cluster_template_file=DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE):
  """Join cluster matches with student data and write CSV and HTML reports.

  Reads the matches-per-cluster CSV, the Code Expert course export and the
  eDoz exports, then
    * writes one students-CSV per cluster into `cluster_files_dir`,
    * writes all clusters' students into `clusters_students_csv_file`,
    * renders the Jinja2 template into "clusters.html" in the current
      working directory, including per-department and per-gender statistics.

  Args:
    edoz_exports: Tab-separated eDoz student list exports with (at least)
      the columns "Nummer" and "Departement".
    cx_course_students_csv_file: Code Expert student export with (at least)
      the columns "Legi", "Lastname", "Firstname", "Email", "Gender" and
      "TotalScore".
    cluster_files_dir: Directory for the per-cluster CSV files; created if
      it doesn't exist yet.
    clusters_matches_csv_file: CSV with the columns "cluster_id", "id1" and
      "id2" (further columns are passed on to the template untouched).
    clusters_students_csv_file: Output CSV with the students of all clusters.
    cluster_students_csv_file_pattern: File name pattern of the per-cluster
      CSV files; formatted with the cluster id.
    jinja_cluster_template_file: Template path, resolved relative to the
      current working directory.

  Raises:
    RuntimeError: If `clusters_matches_csv_file` doesn't exist or the Jinja2
      template cannot be loaded.
  """
  # NOTE(review): called without arguments; when invoked via main() this
  # possibly resets the log level configured there -- confirm in logutils.
  logutils.configure_level_and_format()

  if not os.path.isfile(clusters_matches_csv_file):
    raise RuntimeError("CSV file {} with matches per clusters doesn't exist. Should have been created by mu-cluster.".format(clusters_matches_csv_file))

  clusters_csv = pd.read_csv(clusters_matches_csv_file)
  course_csv = _read_course_students(cx_course_students_csv_file)
  edoz_csv = _read_edoz_students(edoz_exports)

  ## TODO: Add "Departement" column to course_csv, by joining with edoz_csv

  ### Aggregate course overview statistics
  edoz_departements = edoz_csv["Departement"].value_counts()
  course_genders = course_csv["Gender"].value_counts()

  assert edoz_csv.index.is_unique, "Expected unique indices (= legis) in edoz_csv"

  jinja2_file_loader = jinja2.FileSystemLoader(".")
  jinja2_env = jinja2.Environment(loader=jinja2_file_loader)

  try:
    template = jinja2_env.get_template(jinja_cluster_template_file)
  except jinja2.exceptions.TemplateNotFound as exception:
    # Chain the original exception so that the searched path isn't lost
    raise RuntimeError("Couldn't load Jinja2 template {}. Should have been created by mu-init.".format(jinja_cluster_template_file)) from exception

  # Make sure the per-cluster files can be written. An empty directory name
  # denotes the current working directory, which always exists.
  if cluster_files_dir:
    os.makedirs(cluster_files_dir, exist_ok=True)

  # One tuple (cluster id, number of students, matches, students) per cluster
  jinja2_rows = []

  for cluster_id, cluster in clusters_csv.groupby("cluster_id"):
    # Get all ids (= legis) participating in a cluster
    ids_values = pd.concat([cluster["id1"], cluster["id2"]]).unique()

    # Select rows for list of indices ids_values.
    # If there is no row for a given index -- e.g. when a master solution was
    # sent to MOSS as an additional submission -- a row with all NaNs is
    # returned.
    # See also https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike.
    cluster_course_rows = course_csv.reindex(ids_values)

    # Left join on the index (= legi): adds the students' departments
    cluster_rows = cluster_course_rows.join(edoz_csv)

    students_per_clusters_file = os.path.join(
        cluster_files_dir,
        cluster_students_csv_file_pattern.format(cluster_id))

    logging.info("Writing students from cluster {} to file {}".format(cluster_id, students_per_clusters_file))
    cluster_rows.to_csv(students_per_clusters_file)

    jinja2_rows.append((cluster_id, cluster_rows.shape[0], cluster, cluster_rows))

  logging.info("Writing all clusters to file {}".format(clusters_students_csv_file))
  write_header = True
  write_mode = "w"
  for cluster_id, _, _, cluster_rows in jinja2_rows:
    # Inserts the column at the end. This modifies the very frames that are
    # handed to the template below, i.e. the template sees the column as well.
    cluster_rows["Cluster-ID"] = cluster_id
    cluster_rows.to_csv(clusters_students_csv_file, mode=write_mode, header=write_header)
    # The first cluster creates the file, all others are appended to it
    write_header = False
    write_mode = "a"

  ## TODO: Support sorting clusters by max (or average) involved percentage

  plagiarist_count = sum(size for _, size, _, _ in jinja2_rows)

  cluster_frames = [cluster_rows for _, _, _, cluster_rows in jinja2_rows]

  department_data = _relative_data_points(
      _count_column_values(cluster_frames, "Departement"),
      edoz_departements)

  gender_data = _relative_data_points(
      _count_column_values(cluster_frames, "Gender"),
      course_genders)

  datapoints = department_data + gender_data

  template.stream(
    title="Clusters",
    clusters=jinja2_rows,
    edoz_count=edoz_csv.shape[0],
    course_count=course_csv.shape[0],
    plagiarist_count=plagiarist_count,
    datapoints=datapoints
  ).dump("clusters.html")


def _read_course_students(cx_course_students_csv_file):
  """Read the Code Expert export, reduced to the relevant columns, indexed by legi."""
  # Read CX course data, reduce to relevant columns, round TotalScore (which
  # are floats) to whole numbers, set index column
  relevant_course_columns = ["Legi", "Lastname", "Firstname", "Email", "Gender", "TotalScore"]
  course_csv = pd.read_csv(cx_course_students_csv_file)
  # Copy, so that the assignment below doesn't operate on a slice of the
  # frame that was read
  course_csv = course_csv[relevant_course_columns].copy()
  course_csv["TotalScore"] = course_csv["TotalScore"].round(0)
  course_csv.set_index("Legi", inplace=True)
  ## TODO: Remove staff from course_csv

  return course_csv


def _read_edoz_students(edoz_exports):
  """Read all eDoz exports into a single frame with one row per legi."""
  ## TODO: Could integrate eDoz data "Leistungskontrollen" to get information whether
  ##       or not a student is a repeater
  relevant_edoz_columns = ["Nummer", "Departement"]

  individual_edoz_csv_frames = []
  for csvfile in edoz_exports:
    edoz_csv = pd.read_csv(csvfile, sep="\t")
    edoz_csv = edoz_csv[relevant_edoz_columns].rename(columns={"Nummer": "Legi"})
    individual_edoz_csv_frames.append(edoz_csv.set_index("Legi"))

  # Vertically concat eDoz data. Since students may be enrolled into multiple
  # courses, rows with an already seen index (= legi) are afterwards dropped.
  # DataFrame.drop_duplicates is not applicable here since it ignores indices.
  edoz_csv = pd.concat(individual_edoz_csv_frames)

  return edoz_csv.loc[~edoz_csv.index.duplicated(keep='first')]


def _count_column_values(cluster_frames, column):
  """Sum up, over all cluster frames, how often each value of `column` occurs."""
  counts = {}
  for cluster_rows in cluster_frames:
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed
    for value, count in cluster_rows[column].value_counts().items():
      counts[value] = counts.get(value, 0) + count

  return counts


def _relative_data_points(counts, totals):
  """Relate each count to the total of the same name (`totals[name]`)."""
  return [
    RelativeDataPoint(name, count, totals[name])
    for name, count in counts.items()]


def configure_cli_parser(parser):
  """Add this command's arguments to the given argparse parser.

  Registers the required options -ee/--edoz-exports (one or more files) and
  -ce/--code-expert-export (a single file), followed by the log level
  argument provided by logutils. Both file options use argparse.FileType,
  i.e. argparse opens the files for reading as UTF-8 while parsing.
  """
  parser.add_argument(
    "-ee", "--edoz-exports",
    type=argparse.FileType("r", encoding="utf-8"),
    nargs="+",
    help="eDoz student list exports (CSV)",
    required=True)

  parser.add_argument(
    "-ce", "--code-expert-export",
    type=argparse.FileType("r", encoding="utf-8"),
    help="Code Expert student data export (CSV)",
    required=True)

  logutils.add_loglevel_argument(parser)


def main(
    cluster_files_dir=DEFAULT_CLUSTER_FILES_DIR,
    clusters_matches_csv_file=DEFAULT_CLUSTERS_MATCHES_CSV_FILE,
    clusters_students_csv_file=DEFAULT_CLUSTERS_STUDENTS_CSV_FILE,
    cluster_students_csv_file_pattern=DEFAULT_CLUSTER_STUDENTS_CSV_FILE_PATTERN,
    jinja_cluster_template_file=DEFAULT_JINJA_CLUSTER_TEMPLATE_FILE):
  """Command line entry point: parse the arguments and run aggregate().

  The eDoz exports and the Code Expert export are taken from the command
  line (see configure_cli_parser); all other locations are taken from this
  function's parameters and passed on to aggregate() unchanged.
  """
  parser = argparse.ArgumentParser()
  configure_cli_parser(parser)
  args = parser.parse_args()

  logutils.configure_level_and_format(args.log_level)

  aggregate(
    args.edoz_exports,
    args.code_expert_export,
    cluster_files_dir,
    clusters_matches_csv_file,
    clusters_students_csv_file,
    cluster_students_csv_file_pattern,
    jinja_cluster_template_file)


# Run the command line interface when this module is executed rather than
# imported. Note that the module uses a relative import (from .utils), so it
# has to be executed as part of its package.
if __name__ == "__main__":
  main()