diff --git a/VERSION b/VERSION index d3827e75a5cadb9fe4a27e1cb9b6d192e7323120..b123147e2a162f34cf377f8b63d99b0e6f8887c2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.0 +1.1 \ No newline at end of file diff --git a/vo-scraper.py b/vo-scraper.py index 40e26f57e7a22b83f3cd1e69169d7dafd4500656..a4ae0647a348492bb9d6e765d10beb394e6769f7 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -15,7 +15,7 @@ Check README.md and LICENSE before using this program. # |_| # ======================================================================== -#import urllib.request, urllib.parse, os, sys, http.client +# Import urllib.request, urllib.parse, os, sys, http.client import urllib.request, os, sys, http.client from urllib.request import Request, urlopen from sys import platform @@ -24,13 +24,14 @@ import argparse # For parsing commandline arguments import getpass # For getting the user password -# check whether `requests` is installed +# Check whether `requests` is installed try: import requests except: print_information("Required package `requests` is missing, try installing with `pip3 install requests`", type='error') sys.exit(1) +# Check whether `webbrowser` is installed try: import webbrowser # only used to open the user's browser when reporting a bug except: @@ -45,24 +46,28 @@ except: # # ======================================================================== -program_version = '1.0' -remote_version_link = "https://gitlab.ethz.ch/tgeorg/vo-scraper/raw/master/VERSION" +# Links to repo +gitlab_repo_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/" +gitlab_issue_page = gitlab_repo_page+"issues" +gitlab_changelog_page = gitlab_repo_page+"-/tags/v" +remote_version_link = gitlab_repo_page+"raw/master/VERSION" +program_version = '1.1' +# For web requests user_agent = 'Mozilla/5.0' cookie_jar = requests.cookies.RequestsCookieJar() -#for stats +# For stats link_counter = 0 download_counter = 0 skip_counter = 0 +# series_metadata_suffix = ".series-metadata.json" video_info_prefix = "https://video.ethz.ch/.episode-video.json?recordId=" directory_prefix = "Lecture Recordings/" -gitlab_repo_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/" -gitlab_issue_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/issues" - +# Default quality video_quality = "high" download_all = False @@ -110,17 +115,22 @@ def print_information(str, type='info', verbose_only=False): if not verbose_only: if type == 'info' and not verbose: - # print without tag + # Print without tag print(str) else: - # print with tag + # Print with tag print(print_type_dict[type], str) elif verbose: # Always print with tag print(print_type_dict[type],str) def get_credentials(user, passw): - """Gets user credentials and returns them""" + """Gets user credentials and returns them + + Keyword arguments: + user -- The username passed from a text file + passw -- The password passed from a text file + """ if not user: user = input("Enter your username: ") if not passw: @@ -129,10 +139,17 @@ def get_credentials(user, passw): return(user, passw) def acquire_login_cookie(protection, vo_link, user, passw): - """Gets login-cookie by sending user credentials to login server""" + """Gets login-cookie by sending user credentials to login server + + Keyword arguments: + protection -- The type of login the lecture requires (NETHZ or custom password) + vo_link -- The link to the lecture + user -- The username passed from a text file + passw -- The password passed from a text file + """ global user_agent - # setup cookie_jar + # Setup cookie_jar cookie_jar = requests.cookies.RequestsCookieJar() if protection == "ETH": @@ -140,14 +157,14 @@ def acquire_login_cookie(protection, vo_link, user, passw): while True: (user, passw) = get_credentials(user, passw) - # setup headers and content to send + # Setup headers and content to send headers = { "Content-Type": "application/x-www-form-urlencoded", "CSRF-Token": "undefined", 'User-Agent': user_agent} data = { "__charset__": "utf-8", "j_validate": True, "j_username": user, "j_password": passw} - # request login-cookie + # Request login-cookie r = requests.post("https://video.ethz.ch/j_security_check", headers=headers, data=data) - # put login cookie in cookie_jar + # Put login cookie in cookie_jar cookie_jar = r.cookies if cookie_jar: break @@ -161,14 +178,14 @@ def acquire_login_cookie(protection, vo_link, user, passw): while True: (user, passw) = get_credentials(user, passw) - # setup headers and content to send + # Setup headers and content to send headers = {"Referer": vo_link+".html", "User-Agent":user_agent} data = { "__charset__": "utf-8", "username": user, "password": passw } - # get login cookie + # Get login cookie r = requests.post(vo_link+".series-login.json", headers=headers, data=data) - # put login cookie in cookie_jar + # Put login cookie in cookie_jar cookie_jar = r.cookies if cookie_jar: break @@ -186,20 +203,18 @@ def acquire_login_cookie(protection, vo_link, user, passw): return cookie_jar -def pretty_print_lectures(vo_json_data): - """Prints the available episodes of a lecture""" - global link_counter - +def pretty_print_episodes(vo_json_data, selected): + """Prints the episode numbers that match `selected`""" + # Get length of longest strings for nice formatting when printing nr_length = len(" Nr.") max_title_length = max([len(episode['title']) for episode in vo_json_data['episodes']]) max_lecturer_length = max([len(str(episode['createdBy'])) for episode in vo_json_data['episodes']]) - # Print available episodes - print_information(" Nr." + " | " + "Name".ljust(max_title_length) + " | " + "Lecturer".ljust(max_lecturer_length) + " | "+ "Date") - counter = 0 - for episode in vo_json_data['episodes']: + # Print the selected episodes + for episode_nr in selected: + episode = vo_json_data['episodes'][episode_nr] print_information( - "%3d".ljust(nr_length) % counter + "%3d".ljust(nr_length) % episode_nr + " | " + episode['title'].ljust(max_title_length) + " | " + @@ -207,27 +222,18 @@ def pretty_print_lectures(vo_json_data): + " | " + episode['createdAt'][:-6] ) - counter += 1 - link_counter += 1 - -def pretty_print_selection(vo_json_data, choice): - """Prints the user selected episodes in a nice way """ - - # Get length of longest strings for nice formatting when printing - max_title_length = max([len(episode['title']) for episode in vo_json_data['episodes']]) - max_lecturer_length = max([len(str(episode['createdBy'])) for episode in vo_json_data['episodes']]) - # Print the selected episodes - print_information("You selected:") - for item_nr in choice: - item = vo_json_data['episodes'][item_nr] - print_information(" - %2d" % item_nr + " " + item['title'].ljust(max_title_length) + " " + str(item['createdBy']).ljust(max_lecturer_length) + " " + item['createdAt'][:-6]) def vo_scrapper(vo_link, user, passw): """ Gets the list of all available videos for a lecture. Allows user to select multiple videos. Afterwards passes the links to the video source to `downloader()` + + Keyword arguments: + vo_link -- The link to the lecture + user -- The username passed from a text file + passw -- The password passed from a text file """ global user_agent global download_all @@ -243,24 +249,29 @@ def vo_scrapper(vo_link, user, passw): global video_info_prefix global directory_prefix - # remove `.html` file extension + global link_counter + + # Remove `.html` file extension if vo_link.endswith('.html'): vo_link = vo_link[:-5] - # get lecture metadata for episode list + # Get lecture metadata for episode list r = requests.get(vo_link + series_metadata_suffix, headers={'User-Agent': user_agent}) vo_json_data = json.loads(r.text) + # Increase counter for stats + link_counter += len(vo_json_data['episodes']) + # Print available lectures - pretty_print_lectures(vo_json_data) + pretty_print_episodes(vo_json_data, range(len(vo_json_data['episodes']))) - # get video selections + # Get video selections choice = list() if download_all: - # add all available videos to the selected + # Add all available videos to the selected choice = list(range(len(vo_json_data['episodes']))) else: - # let user pick videos + # Let user pick videos try: choice = [int(x) for x in input( "Enter numbers of the above lectures you want to download separated by space (e.g. 0 5 12 14)\nJust press enter if you don't want to download anything from this lecture\n" @@ -270,15 +281,16 @@ def vo_scrapper(vo_link, user, passw): print_information("Exiting...") sys.exit() - # print the user's choice + # Print the user's choice if not choice: print_information("No videos selected") - return # nothing to do anymore + return # Nothing to do anymore else: - pretty_print_selection(vo_json_data, choice) + print_information("You selected:") + pretty_print_episodes(vo_json_data, choice) print() - # check whether lecture requires login and get credentials if necessary + # Check whether lecture requires login and get credentials if necessary print_information("Protection: " + vo_json_data["protection"], verbose_only=True) if vo_json_data["protection"] != "NONE": try: @@ -288,20 +300,20 @@ def vo_scrapper(vo_link, user, passw): print_information("Keyboard interrupt detected, skipping lecture", type='warning') return - # collect links and download them + # Collect links and download them for item_nr in choice: - # get link to video metadata json file + # Get link to video metadata json file item = vo_json_data['episodes'][item_nr] video_info_link = video_info_prefix+item['id'] - # download the video metadata file - # use login-cookie if provided otherwise make request without cookie + # Download the video metadata file + # Use login-cookie if provided otherwise make request without cookie if(cookie_jar): r = requests.get(video_info_link, cookies=cookie_jar, headers={'User-Agent': user_agent}) else: r = requests.get(video_info_link, headers={'User-Agent': user_agent}) if(r.status_code == 401): - # the lecture requires a login + # The lecture requires a login print_information("Received 401 response. The following lecture requires a valid login cookie:", type='error') item = vo_json_data['episodes'][item_nr] print_information("%2d" % item_nr + " " + item['title'] + " " + str(item['createdBy']) + " " + item['createdAt'][:-6], type='error') @@ -311,7 +323,7 @@ def vo_scrapper(vo_link, user, passw): video_json_data = json.loads(r.text) - # put available versions in list for sorting by video quality + # Put available versions in list for sorting by video quality counter = 0 versions = list() print_information("Available versions:", verbose_only=True) @@ -322,19 +334,19 @@ def vo_scrapper(vo_link, user, passw): versions.sort(key=lambda tup: tup[1]) # Now it's sorted: low -> medium -> high - # get video src url from json + # Get video src url from json video_src_link = video_json_data['streams'][0]['sources']['mp4'][versions[quality_dict[video_quality]][0]]['src'] lecture_titel = vo_json_data['title'] video_title = vo_json_data["episodes"][item_nr]["title"] - # if video and lecture title overlap, remove lecture title from video title + # If video and lecture title overlap, remove lecture title from video title if video_title.startswith(lecture_titel): video_title = video_title[len(lecture_titel):] - # append date + # Append date video_title = item['createdAt'][:-6]+video_title - # create directory for video if it does not already exist + # Create directory for video if it does not already exist directory = directory_prefix + lecture_titel +"/" if not os.path.isdir(directory): os.makedirs(directory) @@ -342,35 +354,40 @@ def vo_scrapper(vo_link, user, passw): else: print_information("This folder already exists: " + directory, verbose_only=True) - # filename is `directory/