......@@ -17,10 +17,12 @@ output_file_version_counter = 1
def copy_static_files_to_working_directory(output_dir):
os.makedirs(output_dir, exist_ok=True)
files = ["style.css", "script.js", "sorttable.js"]
for file in files:
logging.debug("Copying {} to current working directory".format(file))
src_file = os.path.join("data", file)
dest_file = os.path.join(output_dir, file)
data = pkgutil.get_data(__name__, src_file)
......@@ -29,25 +31,33 @@ def copy_static_files_to_working_directory(output_dir):
def parse_original_html_report(input_report_file):
    """Parse the original Moss HTML report into a BeautifulSoup document.

    Args:
        input_report_file: Path of the Moss report file to read.

    Returns:
        The ``BeautifulSoup`` document built from the file's contents.
    """
    logging.info("Reading Moss report file {}".format(input_report_file))
    with open(input_report_file) as input_fh:
        # We use BeautifulSoup because it allows easy DOM manipulation and has
        # a nice pretty-printing feature. The lxml parser is used because it is
        # able to parse the somewhat broken HTML report file that Moss generates.
        doc = BeautifulSoup(input_fh, features="lxml")
    return doc
def prettyprint_html_to_file(doc, file):
    """Write the pretty-printed form of *doc* to *file*.

    Args:
        doc: Object whose ``prettify()`` method returns the text to write
            (a BeautifulSoup document in this module).
        file: Path of the output file; an existing file is overwritten.
    """
    logging.debug("Writing report to {}".format(file))
    # Encode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding and non-ASCII report content
    # cannot fail to encode.
    with open(file, "w", encoding="utf-8") as fh:
        fh.write(doc.prettify())
def prettyprint_html_to_versioned_file(doc, filename_pattern):
    """Write *doc* to the next file of a numbered series.

    The current value of the module-level ``output_file_version_counter`` is
    substituted into ``filename_pattern`` with ``str.format``; the counter is
    then incremented so that the next call yields a different file name.

    Args:
        doc: Document passed on to ``prettyprint_html_to_file``.
        filename_pattern: Format string with one positional placeholder for
            the version number.
    """
    global output_file_version_counter
    file = filename_pattern.format(output_file_version_counter)
    output_file_version_counter += 1
    prettyprint_html_to_file(doc, file)
def make_report_table_sortable(doc, static_dir):"Making report table sortable")
doc.body.table["class"] = "sortable"
# <script src="<static_dir>/sorttable.js"></script>
......@@ -69,11 +79,16 @@ def make_report_table_sortable(doc, static_dir):
def localize_match_links(doc, input_report_subdir):
# E.g. http://moss.stanford.edu/results/1/1234567890/match0.html
url_pattern = r"http://moss\.stanford\.edu/results/\d+/\d+/(match.*\.html)"
# E.g. ./12-345-678/main.cpp (77%)
text_pattern = r"\./([\d-]+)/.* (\(\d+%\))"
for row in doc.find_all("tr"):"Localising links to match files")
rows = doc.find_all("tr")
logging.debug("Considering {} rows, including table header".format(len(rows)))
for row in rows:
for a in row.find_all("a"):
# Change remote URLs to local ones
url_match =, a["href"])
......@@ -91,18 +106,23 @@ def get_match_percentage(match_text):
percentage_pattern = r"\((\d+)%\)$"
percentage_string =, match_text).group(1)
return int(percentage_string)
def add_average_percentage_column(doc):"Adding average percentage column")
ths = doc.find_all("th")
th = doc.new_tag("th")
th.string = "Avg. %"
for row in doc.find_all("tr")[1:]: # Skip first TR, since table head
rows = doc.find_all("tr")[1:] # Skip first TR, since table head
logging.debug("Considering {} rows, excluding table header".format(len(rows)))
for row in rows:
cols = row.find_all("td")
first_match_text = cols[0].get_text().strip()
second_match_text = cols[1].get_text().strip()
......@@ -112,16 +132,18 @@ def add_average_percentage_column(doc):
td = doc.new_tag("td")
td.string = str(avg_percentage)
row.insert(2, td)
row.insert(2, td)
def write_result_table_to_csv_file(doc, csv_file):"Writing report data to CSV file {}".format(csv_file))
# E.g. 12-345-678 (77%)
text_pattern = r"([\d-]+) \((\d+)%\)"
with open(csv_file, "w") as csv_fh:
rows = doc.find_all("tr")
rows = doc.find_all("tr")[1:] # Skip first TR, since table head
logging.debug("Considering {} rows, excluding table header".format(len(rows)))
# column_heads = [th.get_text().strip() for th in rows[0].find_all("th")]
column_heads = [
"id1", "percentage1",
"id2", "percentage2",
......@@ -133,26 +155,25 @@ def write_result_table_to_csv_file(doc, csv_file):
for row in rows[1:]:
for td in row.find_all("td"):
tds = [td for td in row.find_all("td")]
tds_text = [td.get_text().strip() for td in tds]
for row in rows:
tds = [td for td in row.find_all("td")]
tds_text = [td.get_text().strip() for td in tds]
file1_match =, tds_text[0])
file2_match =, tds_text[1])
file1_match =, tds_text[0])
file2_match =, tds_text[1])
col_data = [,,,,
col_data = [,,,,
def main(
......@@ -161,16 +182,21 @@ def main(
# Copy static web files (style.css etc.), shipped with this package,
# to the current working directory
## TODO: Insert thead and tbody tags. Makes subsequent steps safer, since table
## rows inside the body can be selected explicitly.
# Parse original Moss report. Should fix the broken HTML that Moss generates.
doc = parse_original_html_report(input_report_file)
# Save fixed (but otherwise unchanged) report in a file
prettyprint_html_to_versioned_file(doc, DEFAULT_OUTPUT_REPORT_VERSIONED_FILE_PATTERN)
make_report_table_sortable(doc, static_dir)
localize_match_links(doc, input_report_subdir)
......@@ -181,4 +207,4 @@ def main(
if __name__ == "__main__":
