From 9c378bb60f0598913d80a04fa9566360eb71faf2 Mon Sep 17 00:00:00 2001 From: Georg Teufelberger Date: Fri, 13 Mar 2020 23:24:17 +0100 Subject: [PATCH 1/8] Combine print function for available and selected episodes --- vo-scraper.py | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/vo-scraper.py b/vo-scraper.py index 40e26f5..54d2a82 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -186,20 +186,18 @@ def acquire_login_cookie(protection, vo_link, user, passw): return cookie_jar -def pretty_print_lectures(vo_json_data): - """Prints the available episodes of a lecture""" - global link_counter - +def pretty_print_episodes(vo_json_data, selected): + """Prints the episode numbers that match `selected`""" + # Get length of longest strings for nice formatting when printing nr_length = len(" Nr.") max_title_length = max([len(episode['title']) for episode in vo_json_data['episodes']]) max_lecturer_length = max([len(str(episode['createdBy'])) for episode in vo_json_data['episodes']]) - # Print available episodes - print_information(" Nr." 
+ " | " + "Name".ljust(max_title_length) + " | " + "Lecturer".ljust(max_lecturer_length) + " | "+ "Date") - counter = 0 - for episode in vo_json_data['episodes']: + # Print the selected episodes + for episode_nr in selected: + episode = vo_json_data['episodes'][episode_nr] print_information( - "%3d".ljust(nr_length) % counter + "%3d".ljust(nr_length) % episode_nr + " | " + episode['title'].ljust(max_title_length) + " | " + @@ -207,21 +205,7 @@ def pretty_print_lectures(vo_json_data): + " | " + episode['createdAt'][:-6] ) - counter += 1 - link_counter += 1 -def pretty_print_selection(vo_json_data, choice): - """Prints the user selected episodes in a nice way """ - - # Get length of longest strings for nice formatting when printing - max_title_length = max([len(episode['title']) for episode in vo_json_data['episodes']]) - max_lecturer_length = max([len(str(episode['createdBy'])) for episode in vo_json_data['episodes']]) - - # Print the selected episodes - print_information("You selected:") - for item_nr in choice: - item = vo_json_data['episodes'][item_nr] - print_information(" - %2d" % item_nr + " " + item['title'].ljust(max_title_length) + " " + str(item['createdBy']).ljust(max_lecturer_length) + " " + item['createdAt'][:-6]) def vo_scrapper(vo_link, user, passw): """ @@ -243,6 +227,8 @@ def vo_scrapper(vo_link, user, passw): global video_info_prefix global directory_prefix + global link_counter + # remove `.html` file extension if vo_link.endswith('.html'): vo_link = vo_link[:-5] @@ -251,8 +237,11 @@ def vo_scrapper(vo_link, user, passw): r = requests.get(vo_link + series_metadata_suffix, headers={'User-Agent': user_agent}) vo_json_data = json.loads(r.text) + # Increase counter for stats + link_counter += len(vo_json_data['episodes']) + # Print available lectures - pretty_print_lectures(vo_json_data) + pretty_print_episodes(vo_json_data, range(len(vo_json_data['episodes']))) # get video selections choice = list() @@ -275,7 +264,8 @@ def vo_scrapper(vo_link, user, 
passw): print_information("No videos selected") return # nothing to do anymore else: - pretty_print_selection(vo_json_data, choice) + print_information("You selected:") + pretty_print_episodes(vo_json_data, choice) print() # check whether lecture requires login and get credentials if necessary -- GitLab From 3d76ebe42206fe24264872cfde7afcb094cdb40e Mon Sep 17 00:00:00 2001 From: Georg Teufelberger Date: Sat, 14 Mar 2020 12:23:33 +0100 Subject: [PATCH 2/8] Change first character of comments to uppercase (to unify code appearance) --- vo-scraper.py | 99 +++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/vo-scraper.py b/vo-scraper.py index 54d2a82..eff4b48 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -15,7 +15,7 @@ Check README.md and LICENSE before using this program. # |_| # ======================================================================== -#import urllib.request, urllib.parse, os, sys, http.client +# Import urllib.request, urllib.parse, os, sys, http.client import urllib.request, os, sys, http.client from urllib.request import Request, urlopen from sys import platform @@ -24,13 +24,14 @@ import argparse # For parsing commandline arguments import getpass # For getting the user password -# check whether `requests` is installed +# Check whether `requests` is installed try: import requests except: print_information("Required package `requests` is missing, try installing with `pip3 install requests`", type='error') sys.exit(1) +# Check whether `webbrowser` is installed try: import webbrowser # only used to open the user's browser when reporting a bug except: @@ -45,24 +46,28 @@ except: # # ======================================================================== +# Links to repo +gitlab_repo_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/" +gitlab_issue_page = gitlab_repo_page+"issues" + +remote_version_link = gitlab_repo_page+"raw/master/VERSION" program_version = '1.0' -remote_version_link = 
"https://gitlab.ethz.ch/tgeorg/vo-scraper/raw/master/VERSION" +# For web requests user_agent = 'Mozilla/5.0' cookie_jar = requests.cookies.RequestsCookieJar() -#for stats +# For stats link_counter = 0 download_counter = 0 skip_counter = 0 +# series_metadata_suffix = ".series-metadata.json" video_info_prefix = "https://video.ethz.ch/.episode-video.json?recordId=" directory_prefix = "Lecture Recordings/" -gitlab_repo_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/" -gitlab_issue_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/issues" - +# Default quality video_quality = "high" download_all = False @@ -110,10 +115,10 @@ def print_information(str, type='info', verbose_only=False): if not verbose_only: if type == 'info' and not verbose: - # print without tag + # Print without tag print(str) else: - # print with tag + # Print with tag print(print_type_dict[type], str) elif verbose: # Always print with tag @@ -132,7 +137,7 @@ def acquire_login_cookie(protection, vo_link, user, passw): """Gets login-cookie by sending user credentials to login server""" global user_agent - # setup cookie_jar + # Setup cookie_jar cookie_jar = requests.cookies.RequestsCookieJar() if protection == "ETH": @@ -140,14 +145,14 @@ def acquire_login_cookie(protection, vo_link, user, passw): while True: (user, passw) = get_credentials(user, passw) - # setup headers and content to send + # Setup headers and content to send headers = { "Content-Type": "application/x-www-form-urlencoded", "CSRF-Token": "undefined", 'User-Agent': user_agent} data = { "__charset__": "utf-8", "j_validate": True, "j_username": user, "j_password": passw} - # request login-cookie + # Request login-cookie r = requests.post("https://video.ethz.ch/j_security_check", headers=headers, data=data) - # put login cookie in cookie_jar + # Put login cookie in cookie_jar cookie_jar = r.cookies if cookie_jar: break @@ -161,14 +166,14 @@ def acquire_login_cookie(protection, vo_link, user, passw): while True: (user, passw) = 
get_credentials(user, passw) - # setup headers and content to send + # Setup headers and content to send headers = {"Referer": vo_link+".html", "User-Agent":user_agent} data = { "__charset__": "utf-8", "username": user, "password": passw } - # get login cookie + # Get login cookie r = requests.post(vo_link+".series-login.json", headers=headers, data=data) - # put login cookie in cookie_jar + # Put login cookie in cookie_jar cookie_jar = r.cookies if cookie_jar: break @@ -229,11 +234,11 @@ def vo_scrapper(vo_link, user, passw): global link_counter - # remove `.html` file extension + # Remove `.html` file extension if vo_link.endswith('.html'): vo_link = vo_link[:-5] - # get lecture metadata for episode list + # Get lecture metadata for episode list r = requests.get(vo_link + series_metadata_suffix, headers={'User-Agent': user_agent}) vo_json_data = json.loads(r.text) @@ -243,13 +248,13 @@ def vo_scrapper(vo_link, user, passw): # Print available lectures pretty_print_episodes(vo_json_data, range(len(vo_json_data['episodes']))) - # get video selections + # Get video selections choice = list() if download_all: - # add all available videos to the selected + # Add all available videos to the selected choice = list(range(len(vo_json_data['episodes']))) else: - # let user pick videos + # Let user pick videos try: choice = [int(x) for x in input( "Enter numbers of the above lectures you want to download separated by space (e.g. 
0 5 12 14)\nJust press enter if you don't want to download anything from this lecture\n" @@ -259,16 +264,16 @@ def vo_scrapper(vo_link, user, passw): print_information("Exiting...") sys.exit() - # print the user's choice + # Print the user's choice if not choice: print_information("No videos selected") - return # nothing to do anymore + return # Nothing to do anymore else: print_information("You selected:") pretty_print_episodes(vo_json_data, choice) print() - # check whether lecture requires login and get credentials if necessary + # Check whether lecture requires login and get credentials if necessary print_information("Protection: " + vo_json_data["protection"], verbose_only=True) if vo_json_data["protection"] != "NONE": try: @@ -278,20 +283,20 @@ def vo_scrapper(vo_link, user, passw): print_information("Keyboard interrupt detected, skipping lecture", type='warning') return - # collect links and download them + # Collect links and download them for item_nr in choice: - # get link to video metadata json file + # Get link to video metadata json file item = vo_json_data['episodes'][item_nr] video_info_link = video_info_prefix+item['id'] - # download the video metadata file - # use login-cookie if provided otherwise make request without cookie + # Download the video metadata file + # Use login-cookie if provided otherwise make request without cookie if(cookie_jar): r = requests.get(video_info_link, cookies=cookie_jar, headers={'User-Agent': user_agent}) else: r = requests.get(video_info_link, headers={'User-Agent': user_agent}) if(r.status_code == 401): - # the lecture requires a login + # The lecture requires a login print_information("Received 401 response. 
The following lecture requires a valid login cookie:", type='error') item = vo_json_data['episodes'][item_nr] print_information("%2d" % item_nr + " " + item['title'] + " " + str(item['createdBy']) + " " + item['createdAt'][:-6], type='error') @@ -301,7 +306,7 @@ def vo_scrapper(vo_link, user, passw): video_json_data = json.loads(r.text) - # put available versions in list for sorting by video quality + # Put available versions in list for sorting by video quality counter = 0 versions = list() print_information("Available versions:", verbose_only=True) @@ -312,19 +317,19 @@ def vo_scrapper(vo_link, user, passw): versions.sort(key=lambda tup: tup[1]) # Now it's sorted: low -> medium -> high - # get video src url from json + # Get video src url from json video_src_link = video_json_data['streams'][0]['sources']['mp4'][versions[quality_dict[video_quality]][0]]['src'] lecture_titel = vo_json_data['title'] video_title = vo_json_data["episodes"][item_nr]["title"] - # if video and lecture title overlap, remove lecture title from video title + # If video and lecture title overlap, remove lecture title from video title if video_title.startswith(lecture_titel): video_title = video_title[len(lecture_titel):] - # append date + # Append date video_title = item['createdAt'][:-6]+video_title - # create directory for video if it does not already exist + # Create directory for video if it does not already exist directory = directory_prefix + lecture_titel +"/" if not os.path.isdir(directory): os.makedirs(directory) @@ -332,20 +337,20 @@ def vo_scrapper(vo_link, user, passw): else: print_information("This folder already exists: " + directory, verbose_only=True) - # filename is `directory/