Commit 76420566 authored by scmalte

cluster.py: added several features, including generation of a summary CSV

parent 0091a1ae
......@@ -4,35 +4,55 @@ import logging
import csv
import subprocess
import pydot
import dataclasses
import itertools
import networkx as nx
from dataclasses import dataclass
from dataclass_csv import DataclassReader
from .utils import logging as logutils
# Default input/output file names.
DEFAULT_RESULTS_CSV_FILE = "moss-report.csv"
DEFAULT_TOTAL_GRAPH_DOT_FILE = "moss-report.dot"
DEFAULT_CLUSTERS_DOT_FILE = "clusters.dot"
# NOTE(review): the two patterns below overlap in purpose; the "-dot" variant
# looks like the older one -- confirm before removing either.
DEFAULT_CLUSTER_DOT_FILE_PATTERN = "cluster-{}-{}.dot"
# Two placeholders: the cluster index and the file extension.
DEFAULT_CLUSTER_FILE_PATTERN = "cluster-{}.{}"

# Minimum edge weight and minimum number of matched lines for a result to be
# included in the graph.
DEFAULT_THRESHOLD_PERCENTAGE = 90
DEFAULT_THRESHOLD_LINES = 50

# Whether an SVG file is created for each cluster in addition to the DOT file.
DEFAULT_CREATE_SVG_FILES = True

DEFAULT_SUMMARY_CSV_FILE = "clusters.csv"
# USE AS IF frozen=True: instances are meant to be read-only. The dataclass
# cannot actually be frozen because __post_init__ assigns to its own fields.
@dataclasses.dataclass
class MossResult:
    """One row of the results CSV: a pairwise match between two ids.

    Fields are declared in the order in which they are passed positionally
    (e.g. ``MossResult(*row)`` for a CSV row).

    NOTE(review): presumably ``percentage1``/``percentage2`` are the match
    percentages of ``id1``/``id2`` and ``percentage_avg`` their average --
    confirm against the code that writes the results CSV.
    """

    id1: str
    percentage1: int
    id2: str
    percentage2: int
    percentage_avg: float
    lines: int
    match_file: str

    def __post_init__(self):
        # Despite the (mandatory) type annotations above, there is no guarantee
        # that the field values have the expected type (e.g. rows read from a
        # CSV file are all strings), hence the explicit conversions.
        self.percentage1 = int(self.percentage1)
        self.percentage2 = int(self.percentage2)
        self.percentage_avg = float(self.percentage_avg)
        self.lines = int(self.lines)

    @property
    def avg_percentage(self):
        """Read-only alias for ``percentage_avg`` (the field's former name)."""
        return self.percentage_avg

    def flat_headers(self):
        """Return the field names, in declaration order, as a list."""
        return [f.name for f in dataclasses.fields(self)]

    def flat_data(self):
        """Return the field values, in declaration order, as a tuple."""
        return dataclasses.astuple(self)
@dataclasses.dataclass
class ClusterEntry:
    """A single result together with the cluster it belongs to.

    ``dot_file`` and ``svg_file`` name the files written for that cluster.
    The ``flat_*`` methods present the entry as one flat record in which the
    nested ``result`` is replaced, in place, by the result's own columns.
    """

    cluster_id: int
    result: MossResult
    dot_file: str
    svg_file: str

    def flat_headers(self):
        """Return the column names of the flattened record as a list."""
        first, _nested, *rest = [f.name for f in dataclasses.fields(self)]
        # The "result" column is swapped for the headers of the result itself.
        return [first] + self.result.flat_headers() + rest

    def flat_data(self):
        """Return the values of the flattened record as a tuple."""
        first, _nested, *rest = dataclasses.astuple(self)
        # The nested result value is swapped for the result's own flat data.
        return (first,) + self.result.flat_data() + tuple(rest)
def read_results_from_csv_file(csv_file):
results = []
......@@ -40,11 +60,13 @@ def read_results_from_csv_file(csv_file):
logging.info("Reading results from {}".format(csv_file))
with open(csv_file, newline="") as csv_fh:
csv_reader = csv.reader(csv_fh, delimiter=",", quotechar='"')
csv_reader = DataclassReader(csv_fh, MossResult, delimiter=",", quotechar='"')
next(csv_reader, None) # Skip CSV header line
# csv_reader = csv.reader(csv_fh, delimiter=",", quotechar='"')
# next(csv_reader, None) # Skip CSV header line
# results = [MossResult(*row) for row in csv_reader]
results = [MossResult(*row) for row in csv_reader]
results = list(csv_reader)
logging.debug("Read {} results".format(len(results)))
......@@ -64,7 +86,7 @@ def include(result, percentage_threshold, lines_threshold):
percentage_threshold <= get_weight(result) and
lines_threshold <= result.lines)
def create_results_graph(results, percentage_threshold, lines_threshold):
def get_results_graph(results, percentage_threshold, lines_threshold):
graph = nx.Graph()
logging.debug("Creating graph from {} initial results".format(len(results)))
......@@ -79,13 +101,16 @@ def create_results_graph(results, percentage_threshold, lines_threshold):
color = get_color(weight)
attributes = {
# Attributes for GraphViz
"color": color,
"penwidth": 2,
"label": "{0}% ({1})".format(weight, result.lines),
"labelURL": result.match_file,
"URL": result.match_file,
"target": "match",
"fontcolor": color
"fontcolor": color,
# Attributes for internal bookkeeping
"_result": result
}
graph.add_weighted_edges_from([edge], **attributes)
......@@ -97,7 +122,7 @@ def create_results_graph(results, percentage_threshold, lines_threshold):
return graph
def write_cluster_files(subgraph, index, cluster_dot_file, create_svg_files):
def create_cluster_dot_and_svg_files(subgraph, index, cluster_dot_file, cluster_svg_file=None):
logging.debug(
"Writing cluster {} with {}/{} nodes/edge to file {}".format(
index,
......@@ -107,40 +132,73 @@ def write_cluster_files(subgraph, index, cluster_dot_file, create_svg_files):
nx.drawing.nx_pydot.write_dot(subgraph, cluster_dot_file)
if cluster_svg_file:
logging.debug("Calling dot to create SVG {} file from {}".format(cluster_svg_file, cluster_dot_file))
subprocess.run(["dot", "-Tsvg", "-o{}".format(cluster_svg_file), cluster_dot_file])
def create_clusters(graph, cluster_file_pattern, create_svg_files):
    """Split the graph into connected components and write one file set each.

    Args:
        graph: Graph whose edges carry the originating result under the
            ``"_result"`` edge attribute.
        cluster_file_pattern: Format string with two placeholders, filled with
            the cluster index and the file extension (``"dot"`` or ``"svg"``).
        create_svg_files: If true, an SVG file is requested for each cluster
            in addition to the DOT file.

    Returns:
        A list with one ``ClusterEntry`` per edge of every cluster. Clusters
        are numbered from 0 in order of decreasing node count.
    """
    logging.info("Computing connected component (CC) clusters")
    clusters = sorted(nx.connected_components(graph), key=len, reverse=True)

    logging.info(
        "Found {} CC clusters, will write them to files {}".format(
            len(clusters),
            cluster_file_pattern.format("#", "dot")))

    cluster_entries = []

    for index, cluster in enumerate(clusters):
        # Copy so that the cluster is an independent graph, not a view.
        subgraph = graph.subgraph(cluster).copy()

        dot_file = cluster_file_pattern.format(index, "dot")
        # A svg_file of None tells the helper not to create an SVG.
        svg_file = cluster_file_pattern.format(index, "svg") if create_svg_files else None

        create_cluster_dot_and_svg_files(subgraph, index, dot_file, svg_file)

        cluster_entries.extend(
            ClusterEntry(index, data["_result"], dot_file, svg_file)
            for (_, _, data) in subgraph.edges(data=True))

    return cluster_entries
def create_summary_csv_file(cluster_entries, summary_csv_file):
    """Write all cluster entries to a CSV file, preceded by a header row.

    Args:
        cluster_entries: Sequence of objects providing ``flat_headers()`` and
            ``flat_data()``. The header row is taken from the first entry.
        summary_csv_file: Path of the CSV file to write.

    If ``cluster_entries`` is empty, no file is written (there is no entry to
    take the header row from).
    """
    if not cluster_entries:
        logging.info(
            "No cluster entries, not writing summary file {}".format(summary_csv_file))
        return

    logging.info("Writing summary file {}".format(summary_csv_file))

    # newline="" lets the csv module control line endings itself.
    with open(summary_csv_file, "w", newline="") as csv_fh:
        csv_writer = csv.writer(csv_fh)
        csv_writer.writerow(cluster_entries[0].flat_headers())
        csv_writer.writerows(entry.flat_data() for entry in cluster_entries)
def main(
        results_csv_file=DEFAULT_RESULTS_CSV_FILE,
        total_graph_dot_file=DEFAULT_TOTAL_GRAPH_DOT_FILE,
        cluster_file_pattern=DEFAULT_CLUSTER_FILE_PATTERN,
        percentage_threshold=DEFAULT_THRESHOLD_PERCENTAGE,
        lines_threshold=DEFAULT_THRESHOLD_LINES,
        create_svg_files=DEFAULT_CREATE_SVG_FILES,
        summary_csv_file=DEFAULT_SUMMARY_CSV_FILE):
    """Read the results CSV, build the graph, and write all output files.

    Steps, in order: read the results, build the thresholded results graph,
    write the total graph as a DOT file, write the per-cluster files, and
    write the summary CSV.

    Args:
        results_csv_file: Path of the results CSV file to read.
        total_graph_dot_file: Path of the DOT file for the whole graph.
        cluster_file_pattern: Two-placeholder format string (cluster index,
            file extension) naming the per-cluster files.
        percentage_threshold: Percentage threshold passed to the graph
            construction.
        lines_threshold: Lines threshold passed to the graph construction.
        create_svg_files: Whether to also create an SVG file per cluster.
        summary_csv_file: Path of the summary CSV file to write.
    """
    logutils.configure_level_and_format()

    results = read_results_from_csv_file(results_csv_file)
    graph = get_results_graph(results, percentage_threshold, lines_threshold)

    logging.info("Writing total graph to {}".format(total_graph_dot_file))
    nx.drawing.nx_pydot.write_dot(graph, total_graph_dot_file)

    # Cluster computation and per-cluster file output both live in
    # create_clusters; its entries feed the summary CSV.
    cluster_entries = create_clusters(graph, cluster_file_pattern, create_svg_files)
    create_summary_csv_file(cluster_entries, summary_csv_file)
if __name__ == "__main__":
......
......@@ -19,8 +19,10 @@ setup(
'mosspy',
'lxml',
'bs4',
'dataclass-csv',
'pydot',
'networkx'
'networkx',
'Jinja2'
],
# scripts=['bin/mossutils-moss'],
entry_points = {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment