To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

cluster.py 7.55 KB
Newer Older
1
import re
2
import os
3
import argparse
4 5
import logging
import csv
scmalte's avatar
scmalte committed
6 7
import subprocess
import pydot
8 9
import dataclasses
import itertools
scmalte's avatar
scmalte committed
10
import networkx as nx
11
from dataclass_csv import DataclassReader
scmalte's avatar
scmalte committed
12
from .utils import logging as logutils
13

## TODO: cluster.py could create a first, less detailed version of the
##       clusters.html report by extracting the strictly necessary information
##       (student name and e-mail address) from the details.json file located
##       in the CX export. This information would already be enough to generate
##       e-mails afterwards.
##       aggr.py would then be optional, needed only if a more detailed
##       cluster report is desired.
##
## TODO: Generate DOT, SVG and CSV files in a subdirectory, e.g. "_clusters"

23
# Default input of main(): CSV file holding the MOSS results.
DEFAULT_RESULTS_CSV_FILE = "moss-report.csv"
# Default output of main(): DOT file for the graph over all included matches.
DEFAULT_TOTAL_GRAPH_DOT_FILE = "moss-report.dot"
# Default directory that receives the per-cluster files.
DEFAULT_CLUSTER_FILES_DIR = "_clusters"
# Per-cluster file name pattern; formatted with (cluster index, extension).
DEFAULT_CLUSTER_FILE_PATTERN = "cluster-{}.{}"
# Threshold values quoted as defaults in the CLI help text.
DEFAULT_THRESHOLD_PERCENTAGE = 90
DEFAULT_THRESHOLD_LINES = 50
# Whether main() asks for an SVG file per cluster by default.
DEFAULT_CREATE_SVG_FILES = True
# Default output of main(): CSV file listing the matches of every cluster.
DEFAULT_CLUSTERS_MATCHES_CSV_FILE = "clusters-matches.csv"
31

32
@dataclasses.dataclass
class MossResult:
  """One pairwise match, as loaded from a row of the MOSS results CSV file.

  fields_flattened() and values_flattened() return names and values in the
  declaration order of the attributes below, so the two always line up.
  """
  id1: str
  percentage1: int
  id2: str
  percentage2: int
  percentage_avg: float
  lines: int
  match_file: str

  def fields_flattened(self):
    """Return the attribute names as a list, in declaration order."""
    return [field.name for field in dataclasses.fields(self)]

  def values_flattened(self):
    """Return the attribute values as a tuple, in declaration order."""
    return dataclasses.astuple(self)

@dataclasses.dataclass
class ClusterEntry:
  """A single match together with the cluster it was assigned to.

  fields_flattened() and values_flattened() present the entry as one flat
  row: the nested result is expanded in place of the `result` attribute,
  all other attributes keep their declaration order.
  """
  cluster_id: int
  result: MossResult
  dot_file: str
  svg_file: str  # None when no SVG file was requested (see create_clusters)

  def fields_flattened(self):
    """Return the flat list of column names for this entry."""
    names = []

    for field in dataclasses.fields(self):
      # Expand the nested result by name rather than by position, so adding
      # or reordering attributes cannot silently misalign the columns.
      if field.name == "result":
        names.extend(self.result.fields_flattened())
      else:
        names.append(field.name)

    return names

  def values_flattened(self):
    """Return the flat tuple of values, aligned with fields_flattened()."""
    values = []

    for field in dataclasses.fields(self):
      value = getattr(self, field.name)

      if field.name == "result":
        values.extend(value.values_flattened())
      else:
        values.append(value)

    return tuple(values)
66

scmalte's avatar
scmalte committed
67 68
def read_results_from_csv_file(csv_file):
  """Read all MOSS results from a CSV file.

  Args:
    csv_file: Path of the CSV file to read.

  Returns:
    A list of MossResult objects, as produced by DataclassReader for the
    rows of the file.
  """
  logging.info("Reading results from {}".format(csv_file))

  # newline="" is what the csv module expects for files handed to a reader.
  with open(csv_file, newline="") as csv_fh:
    csv_reader = DataclassReader(csv_fh, MossResult, delimiter=",", quotechar='"')

    # Materialise the rows while the file is still open.
    results = list(csv_reader)

  logging.debug("Read {} results".format(len(results)))

  return results

def get_weight(result):
  """Return the larger of the two similarity percentages of a result.

  Args:
    result: Object exposing `percentage1` and `percentage2`.
  """
  return max(result.percentage1, result.percentage2)
scmalte's avatar
scmalte committed
87 88 89 90 91 92 93

def get_color(percentage):
  """Map a similarity percentage to a hex color string.

  The first lower bound that the percentage reaches decides the color;
  anything below all bounds is black.
  """
  palette = (
    (90, "#D83018"),  # Red
    (80, "#F07241"),  # Orange
    (70, "#601848"),  # Purple
  )

  for lower_bound, hex_color in palette:
    if percentage >= lower_bound:
      return hex_color

  return "#000000"  # Black

94 95 96 97 98
def include(result, percentage_threshold, lines_threshold):
  """Tell whether a result reaches both thresholds.

  A result is kept only if its weight (see get_weight) is at least
  percentage_threshold and its `lines` value is at least lines_threshold.
  """
  # Check the percentage first; the lines check is skipped when it fails.
  if not (percentage_threshold <= get_weight(result)):
    return False

  return lines_threshold <= result.lines

99
def get_results_graph(results, percentage_threshold, lines_threshold):
  """Build an undirected graph from the results that pass both thresholds.

  Each included result becomes an edge between `id1` and `id2`, weighted
  with get_weight(result) and decorated with GraphViz attributes. The
  result itself is stored on the edge under the "_result" key.

  Args:
    results: Sized collection of results to consider.
    percentage_threshold: Minimum weight for a result to be included.
    lines_threshold: Minimum `lines` value for a result to be included.

  Returns:
    The populated networkx Graph.
  """
  graph = nx.Graph()

  logging.debug("Creating graph from {} initial results".format(len(results)))
  logging.debug(
    "Thresholds percentages/lines: {}/{}".format(
      percentage_threshold,
      lines_threshold))

  for result in results:
    if not include(result, percentage_threshold, lines_threshold):
      continue

    weight = get_weight(result)
    edge = (result.id1, result.id2, weight)
    color = get_color(weight)

    # TODO: Don't hardcode ../ path prefix
    match_url = "../{}".format(result.match_file)

    attributes = {
      # Attributes for GraphViz
      "color": color,
      "penwidth": 2,
      "label": "{0}% ({1})".format(weight, result.lines),
      "labelURL": match_url,
      "URL": match_url,
      "target": "match",
      "fontcolor": color,
      # Attributes for internal bookkeeping
      "_result": result
    }

    # NOTE: nx.Graph keeps a single edge per node pair, so a later result
    # for the same pair updates the attributes of the existing edge.
    graph.add_weighted_edges_from([edge], **attributes)

  logging.debug(
    "Graph contains {} nodes and {} edges".format(
      graph.number_of_nodes(),
      graph.number_of_edges()))

  return graph

138
def create_cluster_dot_and_svg_files(subgraph, index, cluster_dot_file, cluster_svg_file=None):
  """Write one cluster as a DOT file and optionally render it to SVG.

  Args:
    subgraph: networkx graph of the cluster.
    index: Cluster index, used for logging only.
    cluster_dot_file: Path of the DOT file to write.
    cluster_svg_file: Path of the SVG file to create by calling the `dot`
      executable; no SVG is created when this is None or empty.
  """
  logging.debug(
    "Writing cluster {} with {}/{} nodes/edge to file {}".format(
      index,
      subgraph.number_of_nodes(),
      subgraph.number_of_edges(),
      cluster_dot_file))

  nx.drawing.nx_pydot.write_dot(subgraph, cluster_dot_file)

  if not cluster_svg_file:
    return

  dot_command = ["dot", "-Tsvg", "-o{}".format(cluster_svg_file), cluster_dot_file]

  logging.debug("Calling dot to create SVG {} file from {}".format(cluster_svg_file, cluster_dot_file))
  logging.debug("Command: {}".format(" ".join(dot_command)))

  completed = subprocess.run(dot_command)

  # Rendering stays best-effort (no exception), but a failure must not go
  # unnoticed: report a non-zero exit status of dot.
  if completed.returncode != 0:
    logging.warning(
      "dot exited with status {} while creating {}".format(
        completed.returncode,
        cluster_svg_file))
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186

def create_clusters(graph, cluster_file_pattern, create_svg_files):
  """Split the graph into connected components and write one file set each.

  Components are processed from largest to smallest; the position in that
  order is the cluster index used in file names and in the entries.

  Args:
    graph: networkx graph whose edges carry a "_result" attribute.
    cluster_file_pattern: Format string taking (cluster index, extension).
    create_svg_files: Whether an SVG file is requested for each cluster.

  Returns:
    A list with one ClusterEntry per edge of every cluster.
  """
  logging.info("Computing connected component (CC) clusters")
  components = list(nx.connected_components(graph))
  components.sort(key=len, reverse=True)

  example_dot_file = cluster_file_pattern.format("#", "dot")
  logging.info(
    "Found {} CC clusters, will write them to files {}".format(
      len(components),
      example_dot_file))

  entries = []

  for cluster_id, members in enumerate(components):
    cluster_graph = graph.subgraph(members).copy()
    dot_path = cluster_file_pattern.format(cluster_id, "dot")
    svg_path = cluster_file_pattern.format(cluster_id, "svg") if create_svg_files else None

    create_cluster_dot_and_svg_files(cluster_graph, cluster_id, dot_path, svg_path)

    # One entry per edge, carrying the result stored on that edge.
    entries.extend(
      ClusterEntry(cluster_id, attrs["_result"], dot_path, svg_path)
      for _, _, attrs in cluster_graph.edges(data=True))

  return entries

187 188
def create_clusters_matches_csv_file(cluster_entries, clusters_matches_csv_file):
  """Write all cluster entries as rows of a CSV file.

  The header row comes from fields_flattened() of the first entry, each
  following row from values_flattened() of one entry.

  Args:
    cluster_entries: Sequence of entries to write.
    clusters_matches_csv_file: Path of the CSV file to create.
  """
  logging.info("Writing file with matches per clusters {}".format(clusters_matches_csv_file))

  # Without entries there is no header to derive, so no file is written.
  if not cluster_entries:
    return

  with open(clusters_matches_csv_file, "w", newline="") as csv_fh:
    csv_writer = csv.writer(csv_fh)

    csv_writer.writerow(cluster_entries[0].fields_flattened())
    csv_writer.writerows(entry.values_flattened() for entry in cluster_entries)
198

199 200 201 202 203 204 205 206 207 208 209 210 211
def configure_cli_parser(parser):
  """Register the command-line options of this module on the given parser.

  Adds the two threshold options and the log-level option (via logutils).
  Both thresholds default to the values quoted in their help text, so an
  omitted option yields an int rather than None.

  Args:
    parser: The argparse parser (or compatible object) to extend.
  """
  parser.add_argument(
    "-tp", "--threshold-percentage",
    type=int,
    default=DEFAULT_THRESHOLD_PERCENTAGE,
    help="Threshold for similarity in percentage; matches below will be excluded (default: {})".format(DEFAULT_THRESHOLD_PERCENTAGE))

  parser.add_argument(
    "-tl", "--threshold-lines",
    type=int,
    default=DEFAULT_THRESHOLD_LINES,
    help="Threshold for similarity in lines of code; matches below will be excluded (default: {})".format(DEFAULT_THRESHOLD_LINES))

  logutils.add_loglevel_argument(parser)

scmalte's avatar
scmalte committed
212 213 214
def main(
    results_csv_file=DEFAULT_RESULTS_CSV_FILE,
    total_graph_dot_file=DEFAULT_TOTAL_GRAPH_DOT_FILE,
    cluster_files_dir=DEFAULT_CLUSTER_FILES_DIR,
    cluster_file_pattern=DEFAULT_CLUSTER_FILE_PATTERN,
    create_svg_files=DEFAULT_CREATE_SVG_FILES,
    clusters_matches_csv_file=DEFAULT_CLUSTERS_MATCHES_CSV_FILE):
  """Run the clustering pipeline: read results, build graph, write clusters.

  Thresholds and log level are taken from the command line; file locations
  come from the arguments below.

  Args:
    results_csv_file: CSV file with the results to read.
    total_graph_dot_file: DOT file receiving the graph of all included results.
    cluster_files_dir: Directory for the per-cluster files (created if missing).
    cluster_file_pattern: File name pattern taking (cluster index, extension).
    create_svg_files: Whether to request an SVG file per cluster.
    clusters_matches_csv_file: CSV file receiving the matches of all clusters.
  """
  parser = argparse.ArgumentParser()
  configure_cli_parser(parser)
  args = parser.parse_args()

  logutils.configure_level_and_format(args.log_level)

  # The parser may yield None for a threshold that was not given; fall back
  # to the defaults quoted in the help text so that the comparisons in
  # include() never see None.
  percentage_threshold = args.threshold_percentage
  if percentage_threshold is None:
    percentage_threshold = DEFAULT_THRESHOLD_PERCENTAGE

  lines_threshold = args.threshold_lines
  if lines_threshold is None:
    lines_threshold = DEFAULT_THRESHOLD_LINES

  results = read_results_from_csv_file(results_csv_file)
  graph = get_results_graph(results, percentage_threshold, lines_threshold)

  logging.info("Writing total graph to {}".format(total_graph_dot_file))
  nx.drawing.nx_pydot.write_dot(graph, total_graph_dot_file)

  logging.info("Creating directory {}".format(cluster_files_dir))
  os.makedirs(cluster_files_dir, exist_ok=True)

  cluster_entries = create_clusters(
      graph,
      os.path.join(cluster_files_dir, cluster_file_pattern),
      create_svg_files)

  create_clusters_matches_csv_file(cluster_entries, clusters_matches_csv_file)
244 245 246 247


# Script entry point: run the clustering pipeline with the default file
# locations; thresholds and log level are read from the command line in main().
if __name__ == "__main__":
  main()