# revise.py — post-processes a raw Moss HTML report (sortable table, local
# links, average-percentage column, CSV export).
import re
import logging
# import argparse
import pkgutil
import os
from bs4 import BeautifulSoup
from .utils import logging as logutils

9
DEFAULT_INPUT_REPORT_FILE="moss-report.html"
10
DEFAULT_INPUT_REPORT_SUBDIR="_moss-report"
11 12 13
DEFAULT_OUTPUT_REPORT_FILE="moss-report-revised.html"
DEFAULT_OUTPUT_REPORT_VERSIONED_FILE_PATTERN="moss-report-revised.v{}.html"
DEFAULT_STATIC_DIR="_static"
14
DEFAULT_RESULTS_CSV_FILE="moss-report.csv"
15 16 17 18 19

output_file_version_counter = 1

def copy_static_files_to_working_directory(output_dir):
  """Copy the packaged static web assets (CSS/JS) into output_dir.

  The directory is created first if it does not exist yet.
  """
  os.makedirs(output_dir, exist_ok=True)

  for asset in ("style.css", "script.js", "sorttable.js"):
    logging.debug("Copying {} to current working directory".format(asset))

    # The assets ship inside this package under data/; pkgutil reads them
    # relative to this module, which also works for zipped installs.
    payload = pkgutil.get_data(__name__, os.path.join("data", asset))

    with open(os.path.join(output_dir, asset), "wb") as out_fh:
      out_fh.write(payload)
def parse_original_html_report(input_report_file):
  """Parse the original Moss HTML report and return a BeautifulSoup document.

  We use BeautifulSoup because it allows easy DOM manipulation and has a nice
  pretty-printing feature. The lxml parser is used because it is able to parse
  the somewhat broken HTML report file that Moss generates.
  """
  logging.info("Reading Moss report file {}".format(input_report_file))

  with open(input_report_file) as input_fh:
    return BeautifulSoup(input_fh, features="lxml")
def prettyprint_html_to_file(doc, file):
  """Pretty-print an HTML document to a file.

  Args:
    doc: a BeautifulSoup document (anything exposing prettify()).
    file: destination path; an existing file is overwritten.
  """
  logging.debug("Writing report to {}".format(file))

  # Write as UTF-8 explicitly: the report may contain non-ASCII characters and
  # the platform default encoding (e.g. cp1252 on Windows) could fail on them.
  # (The original code carried a dangling '# "utf-8"' note here.)
  with open(file, "w", encoding="utf-8") as fh:
    fh.write(doc.prettify())
def prettyprint_html_to_versioned_file(doc, filename_pattern):
  """Write doc to the next versioned file name and advance the counter.

  filename_pattern must contain one '{}' placeholder for the version number.
  """
  global output_file_version_counter

  versioned_name = filename_pattern.format(output_file_version_counter)
  output_file_version_counter += 1

  prettyprint_html_to_file(doc, versioned_name)
def make_report_table_sortable(doc, static_dir):
  """Hook the report table up to sorttable.js and attach the static assets."""
  logging.info("Making report table sortable")

  # sorttable.js acts on tables carrying the "sortable" class
  doc.body.table["class"] = "sortable"

  # <script src="<static_dir>/sorttable.js"></script> — added to <head>
  sorttable_script = doc.new_tag("script")
  sorttable_script["src"] = "{}/sorttable.js".format(static_dir)
  doc.html.head.append(sorttable_script)

  # <script src="<static_dir>/script.js"></script> — added at end of <body>
  custom_script = doc.new_tag("script")
  custom_script["src"] = "{}/script.js".format(static_dir)
  doc.html.body.append(custom_script)

  # <link href="<static_dir>/style.css" rel="stylesheet"> — added to <head>
  stylesheet = doc.new_tag("link")
  stylesheet["href"] = "{}/style.css".format(static_dir)
  stylesheet["rel"] = "stylesheet"
  doc.html.head.append(stylesheet)
def localize_match_links(doc, input_report_subdir):
  """Rewrite remote moss.stanford.edu match links to local copies.

  Each link is pointed at input_report_subdir, opened in a new tab, and its
  text is shortened to "<id> (<percentage>)". Raises RuntimeError when a link
  URL or its text does not have the expected shape.
  """
  # E.g. http://moss.stanford.edu/results/8/7282327060561/match0.html
  url_pattern = r"http://moss\.stanford\.edu/results/\d+/\d+/(match.*\.html)"

  # E.g. ./12-345-678/main.cpp (77%)
  #      ./some/dir/12-345-678/main.cpp (77%)
  text_pattern = r".*?/([\d-]+)/.* (\(\d+%\))"

  logging.info("Localising links to match files")

  rows = doc.find_all("tr")
  logging.debug("Considering {} rows, including table header".format(len(rows)))

  for row in rows:
    for anchor in row.find_all("a"):
      # Change remote URLs to local ones
      url_match = re.search(url_pattern, anchor["href"])
      if url_match is None:
        raise RuntimeError("Failure while localising match links in the Moss report. Failed to match link '{}' against regex '{}'".format(anchor["href"], url_pattern))

      anchor["href"] = "./{}/{}".format(input_report_subdir, url_match.group(1))

      # Open links in a new tab/window
      anchor["target"] = "_blank"

      # Strip away unnecessary link text
      stripped_text = anchor.get_text().strip()
      text_match = re.search(text_pattern, stripped_text)
      if text_match is None:
        raise RuntimeError("Failure while localising match links in the Moss report. Failed to match link text '{}' against regex '{}'".format(stripped_text, text_pattern))

      anchor.string = "{} {}".format(text_match.group(1), text_match.group(2))

def get_match_percentage(match_text):
  """Return the trailing "(NN%)" percentage of a match-link text as an int."""
  percentage_pattern = r"\((\d+)%\)$"

  # Like the original, a non-matching text raises AttributeError (group on None)
  match = re.search(percentage_pattern, match_text)
  return int(match.group(1))
def add_average_percentage_column(doc):
  """Insert an "Avg. %" column holding the mean of each row's two percentages."""
  logging.info("Adding average percentage column")

  # New header cell, placed right after the second existing header
  header_cells = doc.find_all("th")
  avg_header = doc.new_tag("th")
  avg_header.string = "Avg. %"
  header_cells[1].insert_after(avg_header)

  rows = doc.find_all("tr")[1:] # Skip first TR, since table head
  logging.debug("Considering {} rows, excluding table header".format(len(rows)))

  for row in rows:
    cells = row.find_all("td")

    percentage_a = get_match_percentage(cells[0].get_text().strip())
    percentage_b = get_match_percentage(cells[1].get_text().strip())

    avg_cell = doc.new_tag("td")
    avg_cell.string = str((percentage_a + percentage_b) / 2)
    row.insert(2, avg_cell)
def write_result_table_to_csv_file(doc, csv_file):
  """Export the report table to csv_file as comma-separated values.

  Columns: both submission ids and percentages, the average percentage, the
  matched line count and the (already localised) match-file link.
  """
  logging.info("Writing report data to CSV file {}".format(csv_file))

  # E.g. 12-345-678 (77%)
  text_pattern = r"([\d-]+) \((\d+)%\)"

  with open(csv_file, "w") as csv_fh:
    rows = doc.find_all("tr")[1:] # Skip first TR, since table head
    logging.debug("Considering {} rows, excluding table header".format(len(rows)))

    csv_fh.write(",".join([
      "id1", "percentage1",
      "id2", "percentage2",
      "percentage_avg",
      "lines",
      "match_file"
    ]))
    csv_fh.write("\n")

    for row in rows:
      cells = row.find_all("td")
      cell_texts = [cell.get_text().strip() for cell in cells]

      match_a = re.search(text_pattern, cell_texts[0])
      match_b = re.search(text_pattern, cell_texts[1])

      record = [
        match_a.group(1),
        match_a.group(2),
        match_b.group(1),
        match_b.group(2),
        cell_texts[2],
        cell_texts[3],
        cells[0].a["href"]
      ]

      csv_fh.write(",".join(record))
      csv_fh.write("\n")
def main(
    input_report_file=DEFAULT_INPUT_REPORT_FILE,
    input_report_subdir=DEFAULT_INPUT_REPORT_SUBDIR,
    output_report_file=DEFAULT_OUTPUT_REPORT_FILE,
    static_dir=DEFAULT_STATIC_DIR,
    results_csv_file=DEFAULT_RESULTS_CSV_FILE):
  """Run the full report-revision pipeline.

  Steps: copy static assets, re-parse (and thereby fix) the Moss HTML, snapshot
  it to a versioned file, transform the document in place, then write the CSV
  and the final revised report.
  """
  logutils.configure_level_and_format()

  # Copy static web files (style.css etc.), shipped with this package,
  # to the current working directory
  copy_static_files_to_working_directory(static_dir)

  ## TODO: Insert thead and tbody tags. Makes subsequent steps safer, since table
  ##       rows inside the body can be selected explicitly.

  # Parse original Moss report. Should fix the broken HTML that Moss generates.
  doc = parse_original_html_report(input_report_file)

  # Save fixed (but otherwise unchanged) report in a file
  prettyprint_html_to_versioned_file(doc, DEFAULT_OUTPUT_REPORT_VERSIONED_FILE_PATTERN)

  # In-place document transformations; order preserved from the original code.
  make_report_table_sortable(doc, static_dir)
  localize_match_links(doc, input_report_subdir)
  add_average_percentage_column(doc)

  write_result_table_to_csv_file(doc, results_csv_file)

  prettyprint_html_to_file(doc, output_report_file)


if __name__ == "__main__":
  main()