cluster.py 6.58 KB
Newer Older
1 2 3 4
import re
# import argparse
import logging
import csv
scmalte's avatar
scmalte committed
5 6
import subprocess
import pydot
7 8
import dataclasses
import itertools
scmalte's avatar
scmalte committed
9
import networkx as nx
10
from dataclass_csv import DataclassReader
scmalte's avatar
scmalte committed
11
from .utils import logging as logutils
12

13 14 15 16 17 18
## TODO: cluster.py could create a first, less detailed version of the
##       clusters.html report by extracting the strictly necessary information
##       (student name and e-mail address) from the details.json file located
##       in the CX export. This information would already be enough to generate
##       e-mails afterwards.
##       aggr.py would then be optional, needed only if a more detailed cluster
##       report is desired.
##
## TODO: Generate DOT, SVG and CSV files in a subdirectory, e.g. "_clusters"
21

22
# Defaults for the keyword arguments of main().

# Input: CSV file holding the Moss results that are read into MossResult rows.
DEFAULT_RESULTS_CSV_FILE = "moss-report.csv"

# Output: DOT file for the graph of all results that pass the thresholds.
DEFAULT_TOTAL_GRAPH_DOT_FILE = "moss-report.dot"
# NOTE(review): not referenced anywhere in the visible part of this module.
DEFAULT_CLUSTERS_DOT_FILE = "clusters.dot"

# Output: per-cluster file name; formatted with (cluster index, file extension).
DEFAULT_CLUSTER_FILE_PATTERN = "cluster-{}.{}"

# A result is only included if max(percentage1, percentage2) and the number of
# matched lines both reach these lower bounds.
DEFAULT_THRESHOLD_PERCENTAGE = 90
DEFAULT_THRESHOLD_LINES = 50

# Whether an SVG file is rendered (via "dot") next to each cluster DOT file.
DEFAULT_CREATE_SVG_FILES = True

# Output: CSV file with one row per edge of every cluster.
DEFAULT_SUMMARY_CSV_FILE = "clusters.csv"
30

31
@dataclasses.dataclass
class MossResult:
  """A single pairwise comparison result, as read from the results CSV file.

  The declaration order of the fields is significant: fields_flattened() and
  values_flattened() both follow it, so headers and values line up.
  """
  # Identifier of the first submission; used as a graph node.
  id1: str
  # Match percentage reported for the first submission.
  percentage1: int
  # Identifier of the second submission; used as a graph node.
  id2: str
  # Match percentage reported for the second submission.
  percentage2: int
  # Presumably the average of both percentages -- it is read from the CSV,
  # not computed here.
  percentage_avg: float
  # Number of matched lines; compared against the lines threshold.
  lines: int
  # Location of the match report; used as link target in the generated graphs.
  match_file: str

  def fields_flattened(self):
    """Return the field names as a list, in declaration order."""
    return [f.name for f in dataclasses.fields(self)]

  def values_flattened(self):
    """Return the field values as a tuple, in declaration order."""
    return dataclasses.astuple(self)

@dataclasses.dataclass
class ClusterEntry:
  """One row of the cluster summary: a result plus the cluster it belongs to.

  The nested result is expanded in place when the entry is flattened, i.e. the
  flattened form is: cluster_id, <all result fields>, dot_file, svg_file.
  """
  # Index of the cluster this entry belongs to.
  cluster_id: int
  # The comparison result represented by this entry.
  result: MossResult
  # DOT file the cluster was written to.
  dot_file: str
  # SVG file of the cluster, or None if no SVG file was requested.
  svg_file: str

  def _flatten(self, expand_result, get_own):
    # Walk the fields by name rather than by position, so that headers and
    # values are produced by the same traversal and cannot get out of sync.
    flattened = []

    for field in dataclasses.fields(self):
      if field.name == "result":
        flattened.extend(expand_result(self.result))
      else:
        flattened.append(get_own(field))

    return flattened

  def fields_flattened(self):
    """Return the column names as a list, with the result's names inlined."""
    return self._flatten(
      lambda result: result.fields_flattened(),
      lambda field: field.name)

  def values_flattened(self):
    """Return the column values as a tuple, with the result's values inlined."""
    return tuple(self._flatten(
      lambda result: result.values_flattened(),
      lambda field: getattr(self, field.name)))
65

scmalte's avatar
scmalte committed
66 67
def read_results_from_csv_file(csv_file):
  """Read all results from a CSV file.

  Args:
    csv_file: Path of the comma-separated results file to read.

  Returns:
    A list with one MossResult per row yielded by the reader.

  Raises:
    OSError: If the file cannot be opened.
  """
  logging.info("Reading results from {}".format(csv_file))

  # newline="" leaves newline handling to the CSV machinery, as recommended
  # by the csv module documentation.
  with open(csv_file, newline="") as csv_fh:
    # DataclassReader yields MossResult instances; presumably it uses the
    # header row and the field annotations to map and convert the columns.
    csv_reader = DataclassReader(csv_fh, MossResult, delimiter=",", quotechar='"')

    # Consume the reader while the file is still open.
    results = list(csv_reader)

  logging.debug("Read {} results".format(len(results)))

  return results

def get_weight(result):
  """Return the larger of the two match percentages of a result.

  Args:
    result: Object with percentage1 and percentage2 attributes.
  """
  return max(result.percentage1, result.percentage2)
scmalte's avatar
scmalte committed
86 87 88 89 90 91 92

def get_color(percentage):
  """Map a percentage to the hex color used for it in the graphs.

  Args:
    percentage: The percentage to classify.

  Returns:
    The color of the first band whose lower bound the percentage reaches,
    or black if it reaches none of them.
  """
  # Bands ordered from highest to lowest lower bound; the first hit wins.
  bands = (
    (90, "#D83018"),  # Red
    (80, "#F07241"),  # Orange
    (70, "#601848"),  # Purple
  )

  for lower_bound, hex_color in bands:
    if percentage >= lower_bound:
      return hex_color

  return "#000000"  # Black

93 94 95 96 97
def include(result, percentage_threshold, lines_threshold):
  """Decide whether a result passes both thresholds.

  Args:
    result: Result whose weight (see get_weight) and lines are checked.
    percentage_threshold: Lower bound (inclusive) for the result's weight.
    lines_threshold: Lower bound (inclusive) for the result's lines.

  Returns:
    True if both lower bounds are reached, False otherwise.
  """
  # Guard: a weight below the threshold rules the result out immediately.
  if not (get_weight(result) >= percentage_threshold):
    return False

  return result.lines >= lines_threshold

98
def get_results_graph(results, percentage_threshold, lines_threshold):
  """Build an undirected graph from all results that pass the thresholds.

  Each included result becomes one edge between its two ids. The edge weight
  is the result's weight (see get_weight); the edge additionally carries
  attributes for rendering and a reference to the result itself.

  Args:
    results: Iterable of results; must support len() for the debug output.
    percentage_threshold: Lower bound (inclusive) for a result's weight.
    lines_threshold: Lower bound (inclusive) for a result's lines.

  Returns:
    The resulting networkx Graph.
  """
  graph = nx.Graph()

  logging.debug("Creating graph from {} initial results".format(len(results)))
  # The original format string had no placeholders, so the threshold values
  # were silently dropped from the log output.
  logging.debug(
    "Thresholds percentages/lines: {}/{}".format(
      percentage_threshold, lines_threshold))

  for result in results:
    if not include(result, percentage_threshold, lines_threshold):
      continue

    weight = get_weight(result)
    edge = (result.id1, result.id2, weight)
    color = get_color(weight)

    attributes = {
      # Attributes for GraphViz
      "color": color,
      "penwidth": 2,
      "label": "{0}% ({1})".format(weight, result.lines),
      "labelURL": result.match_file,
      "URL": result.match_file,
      "target": "match",
      "fontcolor": color,
      # Attributes for internal bookkeeping
      "_result": result
    }

    graph.add_weighted_edges_from([edge], **attributes)

  logging.debug(
    "Graph contains {} nodes and {} edges".format(
      graph.number_of_nodes(),
      graph.number_of_edges()))

  return graph

134
def create_cluster_dot_and_svg_files(subgraph, index, cluster_dot_file, cluster_svg_file=None):
  """Write a cluster to a DOT file and optionally render it to SVG.

  Args:
    subgraph: The networkx graph of the cluster.
    index: Index of the cluster; only used for log output.
    cluster_dot_file: Path of the DOT file to write.
    cluster_svg_file: Path of the SVG file to create from the DOT file by
      running the external "dot" command. No SVG file is created if this is
      None (or otherwise falsy).

  Raises:
    OSError: If the "dot" executable cannot be started.
  """
  logging.debug(
    "Writing cluster {} with {}/{} nodes/edges to file {}".format(
      index,
      subgraph.number_of_nodes(),
      subgraph.number_of_edges(),
      cluster_dot_file))

  nx.drawing.nx_pydot.write_dot(subgraph, cluster_dot_file)

  if cluster_svg_file:
    # Argument list (no shell involved), so file names need no quoting.
    dot_command = ["dot", "-Tsvg", "-o{}".format(cluster_svg_file), cluster_dot_file]

    logging.debug("Calling dot to create SVG {} file from {}".format(cluster_svg_file, cluster_dot_file))
    logging.debug("Command: {}".format(" ".join(dot_command)))

    completed = subprocess.run(dot_command)

    # A failed rendering stays non-fatal (as before), but is no longer silent.
    if completed.returncode != 0:
      logging.warning(
        "dot exited with status {} while creating {}".format(
          completed.returncode,
          cluster_svg_file))
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189

def create_clusters(graph, cluster_file_pattern, create_svg_files):
  """Split the graph into connected components and write one file set each.

  Components are processed from largest to smallest (by number of nodes);
  the position in that order is the cluster's index.

  Args:
    graph: The networkx graph to split.
    cluster_file_pattern: Format string taking (cluster index, extension).
    create_svg_files: If true, an SVG file is created for each cluster too.

  Returns:
    A list with one ClusterEntry per edge of every cluster.
  """
  logging.info("Computing connected component (CC) clusters")
  components = list(nx.connected_components(graph))
  components.sort(key=len, reverse=True)

  file_name_preview = cluster_file_pattern.format("#", "dot")
  logging.info(
    "Found {} CC clusters, will write them to files {}".format(
      len(components),
      file_name_preview))

  entries = []

  for cluster_id, members in enumerate(components):
    # Copy, so that the cluster graph is independent of the total graph.
    cluster_graph = graph.subgraph(members).copy()

    dot_path = cluster_file_pattern.format(cluster_id, "dot")
    svg_path = cluster_file_pattern.format(cluster_id, "svg") if create_svg_files else None

    create_cluster_dot_and_svg_files(cluster_graph, cluster_id, dot_path, svg_path)

    # Each edge carries the result it was created from under "_result".
    entries.extend(
      ClusterEntry(cluster_id, edge_data["_result"], dot_path, svg_path)
      for (_, _, edge_data) in cluster_graph.edges(data=True))

  return entries

def create_summary_csv_file(cluster_entries, summary_csv_file):
  """Write all cluster entries to a CSV summary file.

  The header row is taken from the first entry; every entry contributes one
  data row. Nothing is written (and no file is created) if there are no
  entries.

  Args:
    cluster_entries: Sequence of objects offering fields_flattened() and
      values_flattened().
    summary_csv_file: Path of the CSV file to write.
  """
  if not cluster_entries:
    # Previously "Writing summary file" was logged although nothing was written.
    logging.info("No cluster entries, not writing summary file {}".format(summary_csv_file))
    return

  logging.info("Writing summary file {}".format(summary_csv_file))

  # newline="" leaves line termination to the csv writer.
  with open(summary_csv_file, "w", newline="") as csv_fh:
    csv_writer = csv.writer(csv_fh)

    csv_writer.writerow(cluster_entries[0].fields_flattened())
    csv_writer.writerows(entry.values_flattened() for entry in cluster_entries)
194

scmalte's avatar
scmalte committed
195 196 197
def main(
    results_csv_file=DEFAULT_RESULTS_CSV_FILE,
    total_graph_dot_file=DEFAULT_TOTAL_GRAPH_DOT_FILE,
    cluster_file_pattern=DEFAULT_CLUSTER_FILE_PATTERN,
    percentage_threshold=DEFAULT_THRESHOLD_PERCENTAGE,
    lines_threshold=DEFAULT_THRESHOLD_LINES,
    create_svg_files=DEFAULT_CREATE_SVG_FILES,
    summary_csv_file=DEFAULT_SUMMARY_CSV_FILE):
  """Run the clustering pipeline from results CSV to cluster files and summary.

  Steps: read the results, build the thresholded graph, write the total graph
  as a DOT file, write one DOT (and optionally SVG) file per connected
  component, and finally write the CSV summary.

  Args:
    results_csv_file: Path of the results CSV file to read.
    total_graph_dot_file: Path of the DOT file for the total graph.
    cluster_file_pattern: Format string taking (cluster index, extension).
    percentage_threshold: Lower bound (inclusive) for a result's weight.
    lines_threshold: Lower bound (inclusive) for a result's lines.
    create_svg_files: If true, an SVG file is created for each cluster.
    summary_csv_file: Path of the CSV summary file to write.
  """
  logutils.configure_level_and_format()

  results = read_results_from_csv_file(results_csv_file)
  graph = get_results_graph(results, percentage_threshold, lines_threshold)

  logging.info("Writing total graph to {}".format(total_graph_dot_file))
  nx.drawing.nx_pydot.write_dot(graph, total_graph_dot_file)

  cluster_entries = create_clusters(graph, cluster_file_pattern, create_svg_files)

  create_summary_csv_file(cluster_entries, summary_csv_file)


# Allow running the module as a script, using all defaults.
if __name__ == "__main__":
  main()