To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

cluster.py 6.5 KB
Newer Older
1
2
3
4
import re
# import argparse
import logging
import csv
scmalte's avatar
scmalte committed
5
6
import subprocess
import pydot
7
8
import dataclasses
import itertools
scmalte's avatar
scmalte committed
9
import networkx as nx
10
from dataclass_csv import DataclassReader
scmalte's avatar
scmalte committed
11
from .utils import logging as logutils
12

13
14
15
16
17
18
19
## TODO: cluster.py could create a first, less detailed version of the 
##       clusters.html report, by extracting the strictly necessary information
##       (student name and e-mail address) from the details.json file located
##       in the CX export. This information would already be enough to generate
##       e-mails afterwards.
##       aggr.py would then be optional, if a more detailed cluster report is desired.

20
# Default file names used by main()
DEFAULT_RESULTS_CSV_FILE = "moss-report.csv"
DEFAULT_TOTAL_GRAPH_DOT_FILE = "moss-report.dot"
DEFAULT_CLUSTERS_DOT_FILE = "clusters.dot"

# Placeholders, in order: cluster index, file extension ("dot" or "svg")
DEFAULT_CLUSTER_FILE_PATTERN = "cluster-{}.{}"

# Minimum values (inclusive) a result must reach to become an edge of the graph
DEFAULT_THRESHOLD_PERCENTAGE = 90
DEFAULT_THRESHOLD_LINES = 50

DEFAULT_CREATE_SVG_FILES = True
DEFAULT_SUMMARY_CSV_FILE = "clusters.csv"
28

29
@dataclasses.dataclass
class MossResult:
  """A single row of the MOSS results CSV file, describing one compared pair."""
  id1: str
  percentage1: int
  id2: str
  percentage2: int
  percentage_avg: float  # presumably the mean of percentage1 and percentage2 -- not computed here
  lines: int
  match_file: str

  def fields_flattened(self):
    """Return the field names as a list, in declaration order."""
    return [f.name for f in dataclasses.fields(self)]

  def values_flattened(self):
    """Return the field values as a tuple, in the same order as fields_flattened()."""
    return dataclasses.astuple(self)

@dataclasses.dataclass
class ClusterEntry:
  """A MossResult together with the cluster it belongs to and that cluster's files."""
  cluster_id: int
  result: MossResult
  dot_file: str
  svg_file: str

  # Both methods below replace the nested `result` field (position 1) by the
  # flattened fields/values of the result itself. The slices [:1] and [2:]
  # therefore rely on `result` being declared as the second field.

  def fields_flattened(self):
    """Return the column names as a list, with `result` expanded into its own field names."""
    field_names = [f.name for f in dataclasses.fields(self)]
    result_headers = self.result.fields_flattened()

    return field_names[:1] + result_headers + field_names[2:]

  def values_flattened(self):
    """Return the values as a tuple, in the same order as fields_flattened()."""
    field_data = dataclasses.astuple(self)
    result_data = self.result.values_flattened()

    return field_data[:1] + result_data + field_data[2:]
63

scmalte's avatar
scmalte committed
64
65
def read_results_from_csv_file(csv_file):
  """Read all results from a CSV file.

  Args:
    csv_file: Path of the CSV file to read. The file is parsed with
      DataclassReader, using "," as delimiter and '"' as quote character.

  Returns:
    A list with one MossResult per row yielded by the reader.
  """
  logging.info("Reading results from {}".format(csv_file))

  # newline="" leaves newline handling to the CSV reader
  with open(csv_file, newline="") as csv_fh:
    csv_reader = DataclassReader(csv_fh, MossResult, delimiter=",", quotechar='"')
    # Materialise the rows while the file is still open
    results = list(csv_reader)

  logging.debug("Read {} results".format(len(results)))

  return results

def get_weight(result):
  """Return the larger of the two percentages of a result.

  Args:
    result: Object with `percentage1` and `percentage2` attributes.
  """
  return max(result.percentage1, result.percentage2)
scmalte's avatar
scmalte committed
84
85
86
87
88
89
90

def get_color(percentage):
  """Return the hex color code associated with a percentage.

  Percentages of at least 90 map to red, at least 80 to orange, at least 70
  to purple, and everything below to black.
  """
  # (inclusive lower bound, color), checked from the highest bound downwards
  color_bands = (
    (90, "#D83018"),  # Red
    (80, "#F07241"),  # Orange
    (70, "#601848"),  # Purple
  )

  for lower_bound, color in color_bands:
    if percentage >= lower_bound:
      return color

  return "#000000"  # Black

91
92
93
94
95
def include(result, percentage_threshold, lines_threshold):
  """Tell whether a result reaches both thresholds.

  A result qualifies if its weight (see get_weight) is at least
  `percentage_threshold` and its `lines` value is at least `lines_threshold`.
  """
  # Guard clause: a too low weight rules the result out, regardless of its lines
  if not (percentage_threshold <= get_weight(result)):
    return False

  return lines_threshold <= result.lines

96
def get_results_graph(results, percentage_threshold, lines_threshold):
  """Build an undirected graph from all results that reach both thresholds.

  Each included result becomes an edge between `result.id1` and `result.id2`,
  weighted by get_weight(result). The edge carries GraphViz attributes and,
  under the key "_result", the result object itself.

  Args:
    results: Sized iterable of result objects (as read from the results CSV).
    percentage_threshold: Minimum weight for a result to be included.
    lines_threshold: Minimum `lines` value for a result to be included.

  Returns:
    The resulting nx.Graph.
  """
  graph = nx.Graph()

  logging.debug("Creating graph from {} initial results".format(len(results)))
  # Fix: the original message had no placeholders, so the thresholds were never logged
  logging.debug(
    "Thresholds percentages/lines: {}/{}".format(
      percentage_threshold, lines_threshold))

  for result in results:
    if not include(result, percentage_threshold, lines_threshold):
      continue

    weight = get_weight(result)
    edge = (result.id1, result.id2, weight)
    color = get_color(weight)

    attributes = {
      # Attributes for GraphViz
      "color": color,
      "penwidth": 2,
      "label": "{0}% ({1})".format(weight, result.lines),
      "labelURL": result.match_file,
      "URL": result.match_file,
      "target": "match",
      "fontcolor": color,
      # Attributes for internal bookkeeping
      "_result": result
    }

    graph.add_weighted_edges_from([edge], **attributes)

  logging.debug(
    "Graph contains {} nodes and {} edges".format(
      graph.number_of_nodes(),
      graph.number_of_edges()))

  return graph

132
def create_cluster_dot_and_svg_files(subgraph, index, cluster_dot_file, cluster_svg_file=None):
  """Write a cluster graph to a DOT file and optionally render it as SVG.

  Args:
    subgraph: The graph of the cluster to write.
    index: Index of the cluster; only used for log messages.
    cluster_dot_file: Path of the DOT file to write.
    cluster_svg_file: Path of the SVG file to create from the DOT file by
      calling the external `dot` program. If None (or empty), no SVG file
      is created.
  """
  logging.debug(
    "Writing cluster {} with {}/{} nodes/edges to file {}".format(
      index,
      subgraph.number_of_nodes(),
      subgraph.number_of_edges(),
      cluster_dot_file))

  nx.drawing.nx_pydot.write_dot(subgraph, cluster_dot_file)

  if not cluster_svg_file:
    return

  dot_command = ["dot", "-Tsvg", "-o{}".format(cluster_svg_file), cluster_dot_file]

  logging.debug("Calling dot to create SVG {} file from {}".format(cluster_svg_file, cluster_dot_file))
  logging.debug("Command: {}".format(" ".join(dot_command)))

  completed = subprocess.run(dot_command)

  # SVG creation stays best-effort (no exception on failure), but a failing
  # dot call is no longer silently ignored.
  if completed.returncode != 0:
    logging.warning(
      "dot exited with return code {} while creating {}".format(
        completed.returncode, cluster_svg_file))
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187

def create_clusters(graph, cluster_file_pattern, create_svg_files):
  """Split the graph into its connected components and write one file set per component.

  Args:
    graph: The graph to split; its edges must carry a "_result" attribute.
    cluster_file_pattern: Format string with two placeholders, filled with
      the cluster index and the file extension ("dot" or "svg").
    create_svg_files: Whether an SVG file is created in addition to each DOT file.

  Returns:
    A list of ClusterEntry objects, one per edge of each cluster.
  """
  logging.info("Computing connected component (CC) clusters")

  # Order the components by size, largest first, so that cluster 0 is the biggest
  components = list(nx.connected_components(graph))
  components.sort(key=len, reverse=True)

  dot_file_example = cluster_file_pattern.format("#", "dot")
  logging.info(
    "Found {} CC clusters, will write them to files {}".format(
      len(components),
      dot_file_example))

  entries = []

  for cluster_id, nodes in enumerate(components):
    component_graph = graph.subgraph(nodes).copy()
    dot_file = cluster_file_pattern.format(cluster_id, "dot")
    svg_file = cluster_file_pattern.format(cluster_id, "svg") if create_svg_files else None

    create_cluster_dot_and_svg_files(component_graph, cluster_id, dot_file, svg_file)

    # One entry per edge, i.e. per result that is part of this cluster
    entries.extend(
      ClusterEntry(cluster_id, attributes["_result"], dot_file, svg_file)
      for _, _, attributes in component_graph.edges(data=True))

  return entries

def create_summary_csv_file(cluster_entries, summary_csv_file):
  """Write all cluster entries to a summary CSV file.

  The first row contains the column names, taken from the first entry's
  fields_flattened(); each following row contains one entry's
  values_flattened().

  Args:
    cluster_entries: Sequence of entries to write. If it is empty, no file
      is written at all.
    summary_csv_file: Path of the CSV file to (over)write.
  """
  logging.info("Writing summary file {}".format(summary_csv_file))

  # The header row is derived from the first entry, hence nothing can be
  # written without entries.
  if not cluster_entries:
    return

  # newline="" leaves newline handling to the CSV writer
  with open(summary_csv_file, "w", newline="") as csv_fh:
    csv_writer = csv.writer(csv_fh)

    csv_writer.writerow(cluster_entries[0].fields_flattened())
    csv_writer.writerows(entry.values_flattened() for entry in cluster_entries)
192

scmalte's avatar
scmalte committed
193
194
195
def main(
    results_csv_file=DEFAULT_RESULTS_CSV_FILE,
    total_graph_dot_file=DEFAULT_TOTAL_GRAPH_DOT_FILE,
    cluster_file_pattern=DEFAULT_CLUSTER_FILE_PATTERN,
    percentage_threshold=DEFAULT_THRESHOLD_PERCENTAGE,
    lines_threshold=DEFAULT_THRESHOLD_LINES,
    create_svg_files=DEFAULT_CREATE_SVG_FILES,
    summary_csv_file=DEFAULT_SUMMARY_CSV_FILE):
  """Read the results, build the graph, and write the graph, cluster and summary files.

  Args:
    results_csv_file: Path of the results CSV file to read.
    total_graph_dot_file: Path of the DOT file to write the complete graph to.
    cluster_file_pattern: Format string for the per-cluster files, with
      placeholders for the cluster index and the file extension.
    percentage_threshold: Minimum weight for a result to be included.
    lines_threshold: Minimum `lines` value for a result to be included.
    create_svg_files: Whether to create an SVG file for each cluster.
    summary_csv_file: Path of the summary CSV file to write.
  """
  logutils.configure_level_and_format()

  results = read_results_from_csv_file(results_csv_file)
  graph = get_results_graph(results, percentage_threshold, lines_threshold)

  logging.info("Writing total graph to {}".format(total_graph_dot_file))
  nx.drawing.nx_pydot.write_dot(graph, total_graph_dot_file)

  cluster_entries = create_clusters(graph, cluster_file_pattern, create_svg_files)

  create_summary_csv_file(cluster_entries, summary_csv_file)
213
214
215
216


# Run main() with its default arguments when the module is executed as a script
if __name__ == "__main__":
  main()