From c84ce8ee4504247b770eac2ee9998afb8f902be9 Mon Sep 17 00:00:00 2001 From: Georg Teufelberger Date: Mon, 27 Sep 2021 12:58:32 +0200 Subject: [PATCH 1/3] Skip lecture if we cannot parse JSON Lectures with no video uploaded yet will give a 404 HTML site even on the `*.series-metadata.json` path. Naturally this will cause the JSON parsing to fail. We still might want to download other lectures though, so we just skip this particular one. --- vo-scraper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vo-scraper.py b/vo-scraper.py index 17deca9..eadc3f4 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -467,7 +467,13 @@ def vo_scrapper(vo_link, video_quality, user, passw): # Get lecture metadata for episode list r = requests.get(vo_link + series_metadata_suffix, headers={'User-Agent': user_agent}) - vo_json_data = json.loads(r.text) + # Try reading the received data as JSON. + # If it fails, e.g. due to no lectures having been uploaded yet, we skip this lecture + try: + vo_json_data = json.loads(r.text) + except json.decoder.JSONDecodeError: + print_information(f"Could not get metadata for {vo_link}.html, skipping", type='warning') + return list() # Return an empty list # Increase counter for stats link_counter += len(vo_json_data['episodes']) -- GitLab From 02ed56df5996a95b3356d1e385054f2bd24fc43e Mon Sep 17 00:00:00 2001 From: Georg Teufelberger Date: Tue, 28 Sep 2021 17:44:22 +0200 Subject: [PATCH 2/3] Remove `www.` prefix from domain name Authentication later on will break if the referer is `https://www.video.ethz.ch/...` instead of `https://video.ethz.ch/...` --- vo-scraper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vo-scraper.py b/vo-scraper.py index eadc3f4..410a2e8 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -465,6 +465,10 @@ def vo_scrapper(vo_link, video_quality, user, passw): # Remove `.html` file extension vo_link = vo_link.replace(".html", "") + # Remove `www.` prefix from domain name + # If in link used as 
a referer during the authentication it causes a failure + vo_link = vo_link.replace("www.", "") + # Get lecture metadata for episode list r = requests.get(vo_link + series_metadata_suffix, headers={'User-Agent': user_agent}) # Try reading the received data as JSON. -- GitLab From 917c2b569e0201eab5c3c5a27337e7b204eac46a Mon Sep 17 00:00:00 2001 From: Georg Teufelberger Date: Tue, 28 Sep 2021 17:47:35 +0200 Subject: [PATCH 3/3] Bump version number --- VERSION | 2 +- vo-scraper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 56fea8a..13d683c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0 \ No newline at end of file +3.0.1 \ No newline at end of file diff --git a/vo-scraper.py b/vo-scraper.py index 410a2e8..1e8f2ad 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -55,7 +55,7 @@ gitlab_repo_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/" gitlab_issue_page = gitlab_repo_page + "issues" gitlab_changelog_page = gitlab_repo_page + "-/tags/v" remote_version_link = gitlab_repo_page + "raw/master/VERSION" -program_version = '3.0.0' +program_version = '3.0.1' # For web requests user_agent = 'Mozilla/5.0' -- GitLab