diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..56ffde9326a75d15501c77e99be9c8358af96c1a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+Lecture Recordings
diff --git a/README.md b/README.md
index 8d384ccacb03e4ba4c232154bc2e1021c158ec38..4881814d3bd7ccf38c8b150e34ed3e6ffee22b5c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-#### ⚠ **Important**: In order to not overburden ETH servers during the current situation, I highly recommend only downloading videos outside of peak hours, i.e. early in the morning or late at night ⚠
+#### ⚠ **Important**: In order not to overload ETH servers (like UZH) during the current situation, I highly recommend downloading videos only outside of peak hours, i.e. early in the morning or late at night ⚠
 
 ***
 
diff --git a/vo-scraper.py b/vo-scraper.py
index d496ba7e14055ccaaab1588a8f01478eeff7ea0b..ac008162a9c41b162c475aaffaa28dcef0a40e59 100755
--- a/vo-scraper.py
+++ b/vo-scraper.py
@@ -19,6 +19,9 @@ Check README.md and LICENSE before using this program.
 import urllib.request, os, sys, http.client
 from urllib.request import Request, urlopen
 from sys import platform
+from multiprocessing import Pool
+import multiprocessing as mp
+import queue, traceback  # queue.Empty for draining mp.Queue; traceback for worker error reporting
 import json       # For handling json files
 import argparse   # For parsing commandline arguments
 import getpass    # For getting the user password
@@ -77,6 +80,7 @@ video_quality = "high"
 download_all = False
 verbose = False
 print_src = False
+use_multithread = True
 
 # Location of text files
 file_to_print_src_to = ""
@@ -116,6 +120,8 @@ def print_information(str, type='info', verbose_only=False):
     type         -- The type of information: {info, warning, error}
     verbose_only -- If true the string will only be printed when the verbose flag is set.
                     Useful for printing debugging info.
+
+    returns: True if line(s) printed, False otherwise
     """
     global print_type_dict
 
@@ -126,9 +132,12 @@ def print_information(str, type='info', verbose_only=False):
         else:
             # Print with tag
             print(print_type_dict[type], str)
+            return True
     elif verbose:
         # Always print with tag
         print(print_type_dict[type],str)
+        return True
+    return False
 
 def get_credentials(user, passw):
     """Gets user credentials and returns them
@@ -371,7 +380,10 @@ def vo_scrapper(vo_link, user, passw):
     local_video_src_collection = list()
 
     # Collect links for download
+    print(f"\r\nInitializing {vo_link}, please wait...")
+    print("[" + " " * len(choice) + "]\r[", end="")
     for item_nr in choice:
+        print("*", end='')
         # Get link to video metadata json file
         item = vo_json_data['episodes'][item_nr]
         video_info_link = video_info_prefix+item['id']
@@ -429,7 +441,9 @@ def vo_scrapper(vo_link, user, passw):
         file_name = directory+episode_title+"_"+video_quality+".mp4"
         print_information(file_name, verbose_only=True)
 
-        local_video_src_collection.append((file_name, video_src_link, episode_name))
+        response_head = requests.head(video_src_link)
+        local_video_src_collection.append((file_name, video_src_link, episode_name, int(response_head.headers.get('content-length'))))
+        sys.stdout.flush()
 
     return local_video_src_collection
 
@@ -669,6 +683,238 @@ def apply_args(args):
         history_file = args.history
         print_information("History file location: " + history_file, verbose_only= True)
 
+#### MULTITHREADING BEGIN @author DBI
+def download_multithreaded(links):
+    global downloader_statuses, print_queue, old_stats
+    downloader_statuses = []  # global structure holding one shared [status|bytes_downloaded, file_size] pair per download
+    print_queue = mp.Queue(len(links)*2)  # global buffer for worker-to-main-thread communication/output => logging
+    old_stats = None          # forces the first print_results_table() call to do a full redraw
+    downloader_jobs = []      # local structure holding the relevant info per download
+    infolines = []            # local buffer of all lines printed by any worker process
+    n_workers = 5             # maximum number of workers to spawn, further bounded below by the number of links
+
+    # Build data structures
+    for idx, (file_name, video_src_link, episode_name, video_size) in enumerate(links):
+        # 'q' (64-bit) rather than 'i': lecture recordings can exceed the 2 GiB range of a 32-bit int
+        downloader_statuses.append(mp.Array('q', [-2, int(video_size)]))
+        downloader_jobs.append((file_name, video_src_link, episode_name, idx))
+
+    n_workers = min(len(downloader_jobs), n_workers)
+    os.system('cls' if os.name == 'nt' else 'clear')  # clear console
+    with Pool(processes=n_workers) as pool:
+        running, successful, error = [], [], []
+        try:
+            results = [pool.apply_async(multithread_downloader_worker, job, error_callback=lambda exc: infolines.append(((str(exc) + traceback.format_exc(),), {}))) for job in downloader_jobs]
+            done = False
+            flipper = 0
+            while not done:
+                running, successful, error = [], [], []
+                for i in range(len(results)):
+                    if results[i].ready():
+                        if results[i].successful():
+                            successful.append(i)
+                        else:
+                            error.append(i)
+                    else:
+                        running.append(i)
+                done = (len(successful) + len(error)) == len(downloader_jobs)
+                print_results_table(downloader_jobs, results, running, successful, error, infolines, flipper % 60 == 0)
+                flipper += 1
+                try:
+                    while True:  # roughly equivalent to time.sleep(0.2), but drains the queue while waiting
+                        infolines.append(print_queue.get(True, 0.2))
+                except queue.Empty:
+                    pass
+        except KeyboardInterrupt:
+            print_results_table(downloader_jobs, results, running, successful, error, infolines, True, True)
+            print("STOPPING, please be patient")
+            pool.terminate()
+
+        # cleanup (counters updated within function)
+        print_results_table(downloader_jobs, results, running, successful, error, infolines, True, True)
+
+# courtesy of https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
+def sizeof_fmt(num, suffix='B'):
+    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
+        if abs(num) < 1024.0:
+            return "%3.1f%s%s" % (num, unit, suffix)
+        num /= 1024.0
+    return "%.1f%s%s" % (num, 'Yi', suffix)
+
+def print_results_table(jobs, results, running, successful, error, infolines, full=False, last=False):
+    global download_counter, skip_counter
+    global downloader_statuses
+    global old_stats
+
+    new_stats = (len(running), len(successful), len(error), len(infolines))
+    download_counter = sum(1 for job in successful if results[job].get())
+    skip_counter = len(successful) - download_counter
+    full = full or old_stats != new_stats or last
+    if full:
+        os.system('cls' if os.name == 'nt' else 'clear')  # clear console
+        print("="*10 + " STATUS " + "="*10)
+        print("Errors: %d" % len(error))
+        print("Completed: %d" % len(successful))
+        for job in successful:
+            data = jobs[job]
+            progress, total = downloader_statuses[data[3]]
+            print("{0}:\t{1}\t({2})".format(
+                ': '.join(data[0].split('/')[-2:]),
+                "Skipped" if progress == -1 else "Completed",
+                sizeof_fmt(total)
+            ))
+    else:
+        print("\r\x1B[%dA" % (len(running)+1), end='')
+    print("Downloading (%d / %d) ... (%s / %s total)\x1B[K" % (
+        len(running),
+        len(jobs),
+        sizeof_fmt(sum(max(x, 0) for x, _ in downloader_statuses)),  # clamp the -2/-1 status markers to 0
+        sizeof_fmt(sum(x for _, x in downloader_statuses)))
+    )
+    for job in running:
+        data = jobs[job]
+        progress, total = downloader_statuses[data[3]]
+        if progress == -2:
+            print("{0}:\tQueued, {1}\x1B[K".format(': '.join(data[0].split('/')[-2:]), sizeof_fmt(total)))
+        else:
+            bars = int(50 * progress / total)
+            print("\x1B[K{0}:\t[{1}{2}{3}]\t({4} / {5})".format(
+                ': '.join(data[0].split('/')[-2:]),
+                '=' * max(bars-1, 0), '>' if 0 < bars < 50 else '', ' ' * (50-bars),
+                sizeof_fmt(progress),
+                sizeof_fmt(total)
+            ))
+    if full:
+        #print("\x1B7", end="")  # save cursor
+        print("Additional Info:")
+        up = 0
+        for args, kwds in infolines:
+            up += 1 if print_information(*args, **kwds) else 0
+        #print("\x1B8", end="")  # restore cursor
+        if not last:
+            print("\r\x1B[%dA" % (up+1), end='')
+    old_stats = new_stats
+
+def multithread_downloader_worker(file_name, video_src_link, episode_name, download_id):
+    """Downloads the video and reports progress via the shared status array
+
+    Keyword arguments:
+    file_name      -- Name of the file to write the data to
+    video_src_link -- The link to download the data from
+    episode_name   -- Name of the episode
+    download_id    -- ID/index of the current job
+
+    returns: False if skipped, True otherwise
+    """
+    global print_src
+    global file_to_print_src_to
+
+    global downloader_statuses, print_queue  # grant access to shared variables
+
+    com_q = print_queue  # set up alias for print_queue
+    progress_array = downloader_statuses[download_id]  # shared progress indicator for this job
+    nice_fname = ': '.join(file_name.split('/')[-2:])
+    com_q.put(((f"{nice_fname} [{download_id}] PID: " + str(os.getpid()), ), {'verbose_only': True}))
+
+    # Check for print_src flag
+    if print_src:
+        # Print to file if given
+        if file_to_print_src_to:
+            com_q.put((("Printing " + video_src_link + " to file: " + file_to_print_src_to, ), {'verbose_only': True}))
+            with open(file_to_print_src_to, "a") as f:
+                f.write(video_src_link+"\n")  # TODO Check if this works
+        else:
+            com_q.put(((video_src_link, ), {}))
+        return False  # nothing is downloaded in print-src mode
+
+    # Otherwise download video
+    com_q.put(((nice_fname + " Video source: " + video_src_link, ), {'verbose_only': True}))
+
+    # get metadata for file
+    response_head = requests.head(video_src_link)
+    total_length = int(response_head.headers.get('content-length'))
+    partial_allowed = response_head.headers.get('accept-ranges') == 'bytes'  # can we resume a download?
+    offset = 0    # download start position
+    skip = False  # whether the current file shall be skipped
+    progress_array[0] = -1            # -1 marks the job as skipped; overwritten with byte progress once downloading starts
+    progress_array[1] = total_length  # just to be sure, update size
+
+    # Check history file (if one has been specified) whether episode has already been downloaded
+    if history_file:
+        try:
+            with open(history_file, "r") as file:
+                if video_src_link in [line.rstrip('\n') for line in file.readlines()]:
+                    com_q.put((("download skipped - file already recorded in history: " + episode_name, ), {}))
+                    # TODO check for different size?
+                    return False
+                else:
+                    com_q.put((("Link has not yet been recorded in history file", ), {'verbose_only': True}))
+        except FileNotFoundError:
+            com_q.put(((f"No history file found at specified location: {history_file}", ), {'verbose_only': True, 'type': 'warning'}))
+
+    # Create directory for video if it does not already exist
+    directory = os.path.dirname(os.path.abspath(file_name))
+    if not os.path.isdir(directory):
+        os.makedirs(directory)
+        com_q.put(((f"This folder was generated: {directory}", ), {'verbose_only': True}))
+    else:
+        com_q.put(((f"This folder already exists: {directory}", ), {'verbose_only': True}))
+
+    # Check if the file already exists and compare file sizes
+    if os.path.isfile(file_name):
+        fsize = os.stat(file_name).st_size
+        if fsize >= total_length:
+            com_q.put(((f"{nice_fname} download skipped - file already exists", ), {}))
+        else:
+            com_q.put(((f"{nice_fname} file may be corrupted: smaller than video length", ), {}))
+        skip = True
+    elif os.path.isfile(file_name+".part"):
+        fsize = os.stat(file_name+".part").st_size
+        if fsize < total_length:
+            com_q.put(((nice_fname + " incomplete video file already exists - resuming download of %s.part at %s" % (file_name.split('/')[-1], sizeof_fmt(fsize)), ), {}))
+            offset = fsize
+        else:
+            com_q.put(((nice_fname + " complete part file found: %s.part, fixing up" % file_name.split('/')[-1], ), {}))
+            os.rename(file_name+".part", file_name)
+            skip = True
+
+    # Otherwise download it
+    if not skip:
+        # cf.: https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
+        if offset and not partial_allowed:
+            offset = 0  # server cannot resume; restart from the beginning
+        # open in "r+b" when resuming: "wb" would truncate the partial file before the seek
+        with open(file_name+".part", "r+b" if offset else "wb") as f:
+            addl_headers = {}
+            if partial_allowed and offset != 0:
+                addl_headers['Range'] = "bytes=%d-" % offset
+            f.seek(offset, os.SEEK_SET)
+            response = requests.get(video_src_link, stream=True, headers=addl_headers)
+            total_length = response.headers.get('content-length')
+
+            #print_information("Downloading " + file_name.split('/')[-1] + " (%.2f" % (int(total_length)/1024/1024) + " MiB)")
+
+            if total_length is None:  # We received no content length header
+                f.write(response.content)
+            else:
+                # Download the file and keep the shared progress indicator up to date
+                total_length = int(total_length)
+                dl = offset
+                for data in response.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    progress_array[0] = dl  # update progress in the shared variable
+
+        # Remove `.part` suffix from file name
+        os.rename(file_name+".part", file_name)
+
+    if history_file:
+        # Regardless whether we just downloaded the file or it already exists on disk, we want to add it to the history file
+        with open(history_file, "a") as file:
+            file.write(video_src_link + '\n')
+
+    return not skip
+
+#### MULTITHREADING END
 
 def setup_arg_parser():
     """Sets the parser up to handle all possible flags"""
@@ -821,13 +1067,16 @@ if __name__ == '__main__':
     print_information(video_src_collection, verbose_only=True)
 
     # Strip illegal characters:
-    video_src_collection = [(remove_illegal_characters(file_name), video_src_link, episode_name) for (file_name, video_src_link, episode_name) in video_src_collection]
+    video_src_collection = [(remove_illegal_characters(file_name), video_src_link, episode_name, video_size) for (file_name, video_src_link, episode_name, video_size) in video_src_collection]
 
     # Download selected episodes
-    for (file_name, video_src_link, episode_name) in video_src_collection:
-        downloader(file_name, video_src_link, episode_name)
+    if use_multithread:
+        download_multithreaded(video_src_collection)
+    else:
+        for (file_name, video_src_link, episode_name, video_size) in video_src_collection:
+            downloader(file_name, video_src_link, episode_name)
 
     # Print summary and exit
     print_information(str(link_counter) + " files found, " + str(download_counter) + " downloaded and " + str(skip_counter) + " skipped")
     if platform == "win32":
-        input('\nEOF') # So Windows users also see the output (apparently)
+        input('\nEOF') # So Windows users also see the output (apparently)
\ No newline at end of file
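Review note on the concurrency design: the patch shares one `mp.Array` slot per job (progress bytes plus file size) and an `mp.Queue` for log lines, relying on the `Pool` children inheriting these globals via fork, while the main process stays the sole writer to the terminal. Below is a minimal, self-contained sketch of that pattern; the names (`init_worker`, `fake_download`, `JOBS`) are hypothetical and not part of the patch.

```python
# Sketch only: same coordination pattern as the patch (shared Array per job +
# Queue for logs), but passing the shared objects through the Pool initializer
# instead of relying on fork-inherited module globals.
import multiprocessing as mp
import queue
import time
from multiprocessing import Pool

def init_worker(shared_statuses, log_queue):
    # Runs once per worker process; makes the shared structures visible there.
    global statuses, logs
    statuses = shared_statuses
    logs = log_queue

def fake_download(job_id, total):
    # Stand-in for multithread_downloader_worker: never print directly --
    # report progress via the shared array and log via the queue.
    logs.put("job %d started (pid %d)" % (job_id, mp.current_process().pid))
    done = 0
    while done < total:
        done = min(done + 7, total)
        statuses[job_id][0] = done  # progress slot, like downloader_statuses
        time.sleep(0.01)
    return True

if __name__ == '__main__':
    JOBS = [("a.mp4", 50), ("b.mp4", 80), ("c.mp4", 30)]  # hypothetical sizes
    shared = [mp.Array('q', [0, total]) for _, total in JOBS]
    logs = mp.Queue()
    with Pool(processes=2, initializer=init_worker, initargs=(shared, logs)) as pool:
        results = [pool.apply_async(fake_download, (i, total))
                   for i, (_, total) in enumerate(JOBS)]
        while not all(r.ready() for r in results):
            # The main process owns the terminal: render progress, drain logs.
            print(" | ".join("%s: %d/%d" % (name, s[0], s[1])
                             for (name, _), s in zip(JOBS, shared)))
            try:
                while True:
                    print(logs.get(timeout=0.2))
            except queue.Empty:
                pass
        print("done:", [r.get() for r in results])
    # drain any log lines that arrived after the last render
    try:
        while True:
            print(logs.get_nowait())
    except queue.Empty:
        pass
```

Passing the shared objects via `initializer`/`initargs` at pool creation, rather than through plain globals as the patch does, should also keep this working under Windows' `spawn` start method, where fork-inherited globals are not available; that may be worth folding into the patch itself.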