Commit 76420566 authored by scmalte

cluster.py: added several features, including generation of a summary CSV

parent 0091a1ae
...@@ -4,35 +4,55 @@ import logging ...@@ -4,35 +4,55 @@ import logging
import csv import csv
import subprocess import subprocess
import pydot import pydot
import dataclasses
import itertools
import networkx as nx import networkx as nx
from dataclasses import dataclass from dataclass_csv import DataclassReader
from .utils import logging as logutils from .utils import logging as logutils
DEFAULT_RESULTS_CSV_FILE="moss-report.csv" DEFAULT_RESULTS_CSV_FILE="moss-report.csv"
DEFAULT_TOTAL_GRAPH_DOT_FILE="moss-report.dot" DEFAULT_TOTAL_GRAPH_DOT_FILE="moss-report.dot"
DEFAULT_CLUSTERS_DOT_FILE="clusters.dot" DEFAULT_CLUSTERS_DOT_FILE="clusters.dot"
DEFAULT_CLUSTER_DOT_FILE_PATTERN="cluster-{}-{}.dot" DEFAULT_CLUSTER_FILE_PATTERN="cluster-{}.{}"
DEFAULT_THRESHOLD_PERCENTAGE=90 DEFAULT_THRESHOLD_PERCENTAGE=90
DEFAULT_THRESHOLD_LINES=50 DEFAULT_THRESHOLD_LINES=50
DEFAULT_CREATE_SVG_FILES=True DEFAULT_CREATE_SVG_FILES=True
DEFAULT_SUMMARY_CSV_FILE="clusters.csv"
@dataclass # USE AS IF frozen=True @dataclasses.dataclass
class MossResult: class MossResult:
id1: str id1: str
percentage1: int percentage1: int
id2: str id2: str
percentage2: int percentage2: int
avg_percentage: int percentage_avg: float
lines: int lines: int
match_file: str match_file: str
def __post_init__(self): def flat_headers(self):
# Despite the (mandatory) type annotations above, there is no guarantee that return [f.name for f in dataclasses.fields(self)]
# the field values have the expected type, hence the explicit conversions.
self.percentage1 = int(self.percentage1) def flat_data(self):
self.percentage2 = int(self.percentage2) return dataclasses.astuple(self)
self.avg_percentage = float(self.avg_percentage)
self.lines = int(self.lines) @dataclasses.dataclass
class ClusterEntry:
cluster_id: int
result: MossResult
dot_file: str
svg_file: str
def flat_headers(self):
field_names = [f.name for f in dataclasses.fields(self)]
result_headers = self.result.flat_headers()
return field_names[:1] + result_headers + field_names[2:]
def flat_data(self):
field_data = dataclasses.astuple(self)
result_data = self.result.flat_data()
return field_data[:1] + result_data + field_data[2:]
def read_results_from_csv_file(csv_file): def read_results_from_csv_file(csv_file):
results = [] results = []
...@@ -40,11 +60,13 @@ def read_results_from_csv_file(csv_file): ...@@ -40,11 +60,13 @@ def read_results_from_csv_file(csv_file):
logging.info("Reading results from {}".format(csv_file)) logging.info("Reading results from {}".format(csv_file))
with open(csv_file, newline="") as csv_fh: with open(csv_file, newline="") as csv_fh:
csv_reader = csv.reader(csv_fh, delimiter=",", quotechar='"') csv_reader = DataclassReader(csv_fh, MossResult, delimiter=",", quotechar='"')
next(csv_reader, None) # Skip CSV header line # csv_reader = csv.reader(csv_fh, delimiter=",", quotechar='"')
# next(csv_reader, None) # Skip CSV header line
# results = [MossResult(*row) for row in csv_reader]
results = [MossResult(*row) for row in csv_reader] results = list(csv_reader)
logging.debug("Read {} results".format(len(results))) logging.debug("Read {} results".format(len(results)))
...@@ -64,7 +86,7 @@ def include(result, percentage_threshold, lines_threshold): ...@@ -64,7 +86,7 @@ def include(result, percentage_threshold, lines_threshold):
percentage_threshold <= get_weight(result) and percentage_threshold <= get_weight(result) and
lines_threshold <= result.lines) lines_threshold <= result.lines)
def create_results_graph(results, percentage_threshold, lines_threshold): def get_results_graph(results, percentage_threshold, lines_threshold):
graph = nx.Graph() graph = nx.Graph()
logging.debug("Creating graph from {} initial results".format(len(results))) logging.debug("Creating graph from {} initial results".format(len(results)))
...@@ -79,13 +101,16 @@ def create_results_graph(results, percentage_threshold, lines_threshold): ...@@ -79,13 +101,16 @@ def create_results_graph(results, percentage_threshold, lines_threshold):
color = get_color(weight) color = get_color(weight)
attributes = { attributes = {
# Attributes for GraphViz
"color": color, "color": color,
"penwidth": 2, "penwidth": 2,
"label": "{0}% ({1})".format(weight, result.lines), "label": "{0}% ({1})".format(weight, result.lines),
"labelURL": result.match_file, "labelURL": result.match_file,
"URL": result.match_file, "URL": result.match_file,
"target": "match", "target": "match",
"fontcolor": color "fontcolor": color,
# Attributes for internal bookkeeping
"_result": result
} }
graph.add_weighted_edges_from([edge], **attributes) graph.add_weighted_edges_from([edge], **attributes)
...@@ -97,7 +122,7 @@ def create_results_graph(results, percentage_threshold, lines_threshold): ...@@ -97,7 +122,7 @@ def create_results_graph(results, percentage_threshold, lines_threshold):
return graph return graph
def write_cluster_files(subgraph, index, cluster_dot_file, create_svg_files): def create_cluster_dot_and_svg_files(subgraph, index, cluster_dot_file, cluster_svg_file=None):
logging.debug( logging.debug(
"Writing cluster {} with {}/{} nodes/edge to file {}".format( "Writing cluster {} with {}/{} nodes/edge to file {}".format(
index, index,
...@@ -107,40 +132,73 @@ def write_cluster_files(subgraph, index, cluster_dot_file, create_svg_files): ...@@ -107,40 +132,73 @@ def write_cluster_files(subgraph, index, cluster_dot_file, create_svg_files):
nx.drawing.nx_pydot.write_dot(subgraph, cluster_dot_file) nx.drawing.nx_pydot.write_dot(subgraph, cluster_dot_file)
if cluster_svg_file:
logging.debug("Calling dot to create SVG {} file from {}".format(cluster_svg_file, cluster_dot_file))
subprocess.run(["dot", "-Tsvg", "-o{}".format(cluster_svg_file), cluster_dot_file])
def create_clusters(graph, cluster_file_pattern, create_svg_files):
logging.info("Computing connected component (CC) clusters")
clusters = sorted(nx.connected_components(graph), key=len, reverse=True)
logging.info(
"Found {} CC clusters, will write them to files {}".format(
len(clusters),
cluster_file_pattern.format("#", "dot")))
cluster_entries = []
for index, cluster in enumerate(clusters):
subgraph = graph.subgraph(cluster).copy()
dot_file = cluster_file_pattern.format(index, "dot")
svg_file = None
if create_svg_files: if create_svg_files:
logging.debug("Calling dot to create SVG file from {}".format(cluster_dot_file)) svg_file = cluster_file_pattern.format(index, "svg")
subprocess.run(["dot", "-Tsvg", "-O", cluster_dot_file])
create_cluster_dot_and_svg_files(subgraph, index, dot_file, svg_file)
for (_, _, data) in subgraph.edges(data=True):
cluster_entries.append(
ClusterEntry(
index,
data["_result"],
dot_file,
svg_file))
return cluster_entries
def create_summary_csv_file(cluster_entries, summary_csv_file):
logging.info("Writing summary file {}".format(summary_csv_file))
if cluster_entries:
with open(summary_csv_file, "w", newline="") as csv_fh:
csv_writer = csv.writer(csv_fh)
csv_writer.writerow(cluster_entries[0].flat_headers())
for entry in cluster_entries:
csv_writer.writerow(entry.flat_data())
def main( def main(
results_csv_file=DEFAULT_RESULTS_CSV_FILE, results_csv_file=DEFAULT_RESULTS_CSV_FILE,
total_graph_dot_file=DEFAULT_TOTAL_GRAPH_DOT_FILE, total_graph_dot_file=DEFAULT_TOTAL_GRAPH_DOT_FILE,
cluster_dot_file_pattern=DEFAULT_CLUSTER_DOT_FILE_PATTERN, cluster_file_pattern=DEFAULT_CLUSTER_FILE_PATTERN,
percentage_threshold=DEFAULT_THRESHOLD_PERCENTAGE, percentage_threshold=DEFAULT_THRESHOLD_PERCENTAGE,
lines_threshold=DEFAULT_THRESHOLD_LINES, lines_threshold=DEFAULT_THRESHOLD_LINES,
create_svg_files=DEFAULT_CREATE_SVG_FILES): create_svg_files=DEFAULT_CREATE_SVG_FILES,
summary_csv_file=DEFAULT_SUMMARY_CSV_FILE):
logutils.configure_level_and_format() logutils.configure_level_and_format()
results = read_results_from_csv_file(results_csv_file) results = read_results_from_csv_file(results_csv_file)
graph = create_results_graph(results, percentage_threshold, lines_threshold) graph = get_results_graph(results, percentage_threshold, lines_threshold)
logging.info("Writing total graph to {}".format(total_graph_dot_file)) logging.info("Writing total graph to {}".format(total_graph_dot_file))
nx.drawing.nx_pydot.write_dot(graph, total_graph_dot_file) nx.drawing.nx_pydot.write_dot(graph, total_graph_dot_file)
logging.info("Computing connected component (CC) clusters") cluster_entries = create_clusters(graph, cluster_file_pattern, create_svg_files)
clusters = sorted(nx.connected_components(graph), key=len, reverse=True)
cluster_dot_file_pattern = cluster_dot_file_pattern.format("cc", "{}")
logging.info(
"Found {} CC clusters, will write them to files {}".format(
len(clusters),
cluster_dot_file_pattern.format("#")))
for index, cluster in enumerate(clusters): create_summary_csv_file(cluster_entries, summary_csv_file)
subgraph = graph.subgraph(cluster).copy()
dot_file = cluster_dot_file_pattern.format(index)
write_cluster_files(subgraph, index, dot_file, create_svg_files)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -19,8 +19,10 @@ setup( ...@@ -19,8 +19,10 @@ setup(
'mosspy', 'mosspy',
'lxml', 'lxml',
'bs4', 'bs4',
'dataclass-csv',
'pydot', 'pydot',
'networkx' 'networkx',
'Jinja2'
], ],
# scripts=['bin/mossutils-moss'], # scripts=['bin/mossutils-moss'],
entry_points = { entry_points = {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment