revise.py 6.53 KB
Newer Older
1 2
import csv
import logging
import os
import pkgutil
import re

# import argparse

from bs4 import BeautifulSoup

from .utils import logging as logutils

9
# Default file and directory names used by the report-revision pipeline.
DEFAULT_INPUT_REPORT_FILE = "moss-report.html"
DEFAULT_INPUT_REPORT_SUBDIR = "_moss-report"
DEFAULT_OUTPUT_REPORT_FILE = "moss-report-revised.html"
DEFAULT_OUTPUT_REPORT_VERSIONED_FILE_PATTERN = "moss-report-revised.v{}.html"
DEFAULT_STATIC_DIR = "_static"
DEFAULT_RESULTS_CSV_FILE = "moss-report.csv"

# Monotonically increasing version number for intermediate report snapshots
# (bumped by prettyprint_html_to_versioned_file).
output_file_version_counter = 1

def copy_static_files_to_working_directory(output_dir):
  # Copy the static web assets bundled with this package (via pkgutil)
  # into output_dir, creating the directory if needed.
  os.makedirs(output_dir, exist_ok=True)

  for filename in ("style.css", "script.js", "sorttable.js"):
    logging.debug("Copying {} to current working directory".format(filename))

    payload = pkgutil.get_data(__name__, os.path.join("data", filename))

    with open(os.path.join(output_dir, filename), "wb") as out_fh:
      out_fh.write(payload)
32

33
def parse_original_html_report(input_report_file):
  # Parse the raw Moss HTML report and return it as a BeautifulSoup document.
  logging.info("Reading Moss report file {}".format(input_report_file))

  with open(input_report_file) as input_fh:
    # BeautifulSoup gives us easy DOM manipulation and pretty-printing.
    # The lxml parser is chosen because it copes with the somewhat broken
    # HTML that Moss generates.
    parsed_doc = BeautifulSoup(input_fh, features="lxml")

  return parsed_doc
43

44
def prettyprint_html_to_file(doc, file):
  """Pretty-print the BeautifulSoup document ``doc`` into ``file``.

  The output is written as UTF-8 explicitly (the original code hinted at
  this with a trailing ``# "utf-8"`` comment but never passed it), so the
  result no longer depends on the platform's default locale encoding and
  cannot raise UnicodeEncodeError on non-ASCII report content.
  """
  logging.debug("Writing report to {}".format(file))

  with open(file, "w", encoding="utf-8") as fh:
    fh.write(doc.prettify())
49

50
def prettyprint_html_to_versioned_file(doc, filename_pattern):
  # Write `doc` to a versioned snapshot file, bumping the module-level
  # counter so each successive snapshot gets a distinct name.
  global output_file_version_counter

  versioned_file = filename_pattern.format(output_file_version_counter)
  output_file_version_counter += 1

  prettyprint_html_to_file(doc, versioned_file)
57

58
def make_report_table_sortable(doc, static_dir):
  # Wire up sorttable.js plus our own script and stylesheet so the results
  # table becomes click-to-sort in the browser.
  logging.info("Making report table sortable")

  doc.body.table["class"] = "sortable"

  # <script src="<static_dir>/sorttable.js"></script> -- goes in <head>
  sorttable_script = doc.new_tag("script")
  sorttable_script["src"] = "{}/sorttable.js".format(static_dir)
  doc.html.head.append(sorttable_script)

  # <script src="<static_dir>/script.js"></script> -- appended to <body>
  custom_script = doc.new_tag("script")
  custom_script["src"] = "{}/script.js".format(static_dir)
  doc.html.body.append(custom_script)

  # <link href="<static_dir>/style.css" rel="stylesheet"> -- goes in <head>
  stylesheet_link = doc.new_tag("link")
  stylesheet_link["href"] = "{}/style.css".format(static_dir)
  stylesheet_link["rel"] = "stylesheet"
  doc.html.head.append(stylesheet_link)
78

79 80 81
def localize_match_links(doc, input_report_subdir):
  """Rewrite remote Moss match links in ``doc`` to local ones.

  Each matching anchor's href is pointed at ``input_report_subdir``, the
  link is set to open in a new tab, and the link text is shortened to
  "<id> (<percentage>)". Anchors whose href or text does not match the
  expected Moss patterns are skipped instead of crashing the whole run
  (the original code called ``.group()`` on a possibly-None match).
  """
  # E.g. http://moss.stanford.edu/results/8/7282327060561/match0.html
  url_pattern = r"http://moss\.stanford\.edu/results/\d+/\d+/(match.*\.html)"

  # E.g. ./12-345-678/main.cpp (77%)
  text_pattern = r"\./([\d-]+)/.* (\(\d+%\))"

  logging.info("Localising links to match files")

  rows = doc.find_all("tr")
  logging.debug("Considering {} rows, including table header".format(len(rows)))

  for row in rows:
    for a in row.find_all("a"):
      # Change remote URLs to local ones
      url_match = re.search(url_pattern, a["href"])
      if url_match is None:
        # Not a Moss match link (e.g. already localized) -- leave untouched.
        continue
      a["href"] = "./{}/{}".format(input_report_subdir, url_match.group(1))

      # Open links in a new tab/window
      a["target"] = "_blank"

      # Strip away unnecessary link text, keeping only "<id> (<pct>%)"
      text_match = re.search(text_pattern, a.get_text().strip())
      if text_match is not None:
        a.string = "{} {}".format(text_match.group(1), text_match.group(2))

def get_match_percentage(match_text):
  # Extract the integer percentage from text that ends in "(NN%)",
  # e.g. "12-345-678 (77%)" -> 77.
  percentage_pattern = r"\((\d+)%\)$"

  match = re.search(percentage_pattern, match_text)
  return int(match.group(1))

112
def add_average_percentage_column(doc):
  # Insert an "Avg. %" column holding the mean of the two match percentages.
  logging.info("Adding average percentage column")

  # New header cell goes right after the second <th>.
  header_cells = doc.find_all("th")
  avg_header = doc.new_tag("th")
  avg_header.string = "Avg. %"
  header_cells[1].insert_after(avg_header)

  data_rows = doc.find_all("tr")[1:] # Skip first TR, since table head
  logging.debug("Considering {} rows, excluding table header".format(len(data_rows)))

  for data_row in data_rows:
    cells = data_row.find_all("td")

    left_percentage = get_match_percentage(cells[0].get_text().strip())
    right_percentage = get_match_percentage(cells[1].get_text().strip())
    mean_percentage = (left_percentage + right_percentage) / 2

    avg_cell = doc.new_tag("td")
    avg_cell.string = str(mean_percentage)
    # NOTE(review): insert position counts ALL children of the <tr>,
    # including whitespace text nodes -- preserved from the original;
    # confirm placement against an actual Moss report.
    data_row.insert(2, avg_cell)
136

137
def write_result_table_to_csv_file(doc, csv_file):
  """Extract the result table from ``doc`` and write it to ``csv_file``.

  Columns: id and percentage of both submissions, the average percentage,
  the matched line count, and the (possibly localized) match-file link.

  Uses the stdlib ``csv`` module instead of hand-rolled ``",".join`` so
  fields containing commas or quotes are escaped correctly. ``newline=""``
  is required by the csv module; ``lineterminator="\\n"`` keeps the output
  byte-identical to the previous implementation for plain fields.
  """
  logging.info("Writing report data to CSV file {}".format(csv_file))

  # E.g. 12-345-678 (77%)
  text_pattern = r"([\d-]+) \((\d+)%\)"

  with open(csv_file, "w", newline="") as csv_fh:
    writer = csv.writer(csv_fh, lineterminator="\n")

    rows = doc.find_all("tr")[1:] # Skip first TR, since table head
    logging.debug("Considering {} rows, excluding table header".format(len(rows)))

    writer.writerow([
      "id1", "percentage1",
      "id2", "percentage2",
      "percentage_avg",
      "lines",
      "match_file",
    ])

    for row in rows:
      tds = row.find_all("td")
      tds_text = [td.get_text().strip() for td in tds]

      file1_match = re.search(text_pattern, tds_text[0])
      file2_match = re.search(text_pattern, tds_text[1])

      writer.writerow([
        file1_match.group(1),
        file1_match.group(2),
        file2_match.group(1),
        file2_match.group(2),
        tds_text[2],
        tds_text[3],
        tds[0].a["href"],
      ])
177

178 179
def main(
    input_report_file=DEFAULT_INPUT_REPORT_FILE,
    input_report_subdir=DEFAULT_INPUT_REPORT_SUBDIR,
    output_report_file=DEFAULT_OUTPUT_REPORT_FILE,
    static_dir=DEFAULT_STATIC_DIR,
    results_csv_file=DEFAULT_RESULTS_CSV_FILE):
  """Revise a raw Moss report: fix its HTML, make the table sortable,
  localize match links, add an average-percentage column, and export the
  result table to CSV."""
  logutils.configure_level_and_format()

  # Ship the packaged static web assets (style.css etc.) next to the report.
  copy_static_files_to_working_directory(static_dir)

  ## TODO: Insert thead and tbody tags. Makes subsequent steps safer, since table
  ##       rows inside the body can be selected explicitly.

  # Parsing also repairs the broken HTML that Moss generates.
  doc = parse_original_html_report(input_report_file)

  # Keep a snapshot of the repaired-but-otherwise-unchanged report.
  prettyprint_html_to_versioned_file(doc, DEFAULT_OUTPUT_REPORT_VERSIONED_FILE_PATTERN)

  # DOM transformations, in order.
  make_report_table_sortable(doc, static_dir)
  localize_match_links(doc, input_report_subdir)
  add_average_percentage_column(doc)

  # Exports: machine-readable CSV, then the final revised HTML report.
  write_result_table_to_csv_file(doc, results_csv_file)
  prettyprint_html_to_file(doc, output_report_file)


if __name__ == "__main__":
  main()