#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'''
Make sure you have `requests` -> pip3 install requests

Check README.md and LICENSE before using this program.
'''
# ========================================================================
#  ___                                      _
# |_ _|  _ __ ___    _ __     ___    _ __  | |_   ___
#  | |  | '_ ` _ \  | '_ \   / _ \  | '__| | __| / __|
#  | |  | | | | | | | |_) | | (_) | | |    | |_  \__ \
# |___| |_| |_| |_| | .__/   \___/  |_|     \__| |___/
#                   |_|
# ========================================================================

# Import urllib.request, os, sys, http.client
import urllib.request, os, sys, http.client
from urllib.request import Request, urlopen
from sys import platform
import json     # For handling json files
import argparse # For parsing commandline arguments
24
import getpass  # For getting the user password
Georg Teufelberger's avatar
Georg Teufelberger committed
25
26


27
# Check whether `requests` is installed
Georg Teufelberger's avatar
Georg Teufelberger committed
28
29
30
31
32
33
try:
    import requests
except ImportError:
    # `print_information` is defined further down in this file and does not
    # exist yet when this statement runs, so calling it here would raise a
    # NameError instead of showing the hint. Report directly on stderr.
    print("(ERR) Required package `requests` is missing, try installing with `pip3 install requests`", file=sys.stderr)
    sys.exit(1)

34
# Check whether `webbrowser` is installed
Georg Teufelberger's avatar
Georg Teufelberger committed
35
36
37
38
39
40
try:
    import webbrowser  # only used to open the user's browser when reporting a bug
except ImportError:
    # `print_information` is not defined yet at this point of the module, so
    # print the warning directly. Bind the name to None so later references
    # fail in a predictable way instead of with a NameError.
    webbrowser = None
    print("(WRN) Failed to import `webbrowser`. It is however not required for downloading videos", file=sys.stderr)

# ========================================================================
#   ____   _           _               _
#  / ___| | |   ___   | |__     __ _  | |   __   __   __ _   _ __   ___
# | |  _  | |  / _ \  | '_ \   / _` | | |   \ \ / /  / _` | | '__| / __|
# | |_| | | | | (_) | | |_) | | (_| | | |    \ V /  | (_| | | |    \__ \
#  \____| |_|  \___/  |_.__/   \__,_| |_|     \_/    \__,_| |_|    |___/
#
# ========================================================================

49
50
51
# Links to repo
gitlab_repo_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/"
gitlab_issue_page = gitlab_repo_page+"issues"
gitlab_changelog_page = gitlab_repo_page+"-/tags/v"  # the version number gets appended to this
remote_version_link = gitlab_repo_page+"raw/master/VERSION"
program_version = '1.2.1'

# For web requests
user_agent = 'Mozilla/5.0'
cookie_jar = requests.cookies.RequestsCookieJar()

# Store video sources in global list
video_src_collection = list()

# For stats
link_counter = 0
download_counter = 0
skip_counter = 0

# URL pieces for the metadata files and the root folder for downloads
series_metadata_suffix = ".series-metadata.json"
video_info_prefix = "https://video.ethz.ch/.episode-video.json?recordId="
directory_prefix = "Lecture Recordings" + os.sep

# Default quality
video_quality = "high"

# Boolean flags
download_all = False
verbose = False
print_src = False

# Location of text files
file_to_print_src_to = ""
history_file = ""

# Maps the quality name to its position in the list of available versions
# once that list has been sorted from highest to lowest resolution
quality_dict = {
    'high'  : 0,
    'medium': 1,
    'low'   : 2
}

class bcolors:
    """Escape sequences used to colour the tags printed in front of messages."""
    INFO = '\033[94m'     # colour of the `INF` tag
    ERROR = '\033[91m'    # colour of the `ERR` tag
    WARNING = '\033[93m'  # colour of the `WRN` tag
    ENDC = '\033[0m'      # emitted after every tag to end the colouring

# Maps the message type to its coloured, parenthesised tag
print_type_dict = {
    kind: f"({colour}{tag}{bcolors.ENDC})"
    for kind, colour, tag in (
        ('info', bcolors.INFO, 'INF'),
        ('warning', bcolors.WARNING, 'WRN'),
        ('error', bcolors.ERROR, 'ERR'),
    )
}

# ===============================================================
#  _____                          _     _
# |  ___|  _   _   _ __     ___  | |_  (_)   ___    _ __    ___
# | |_    | | | | | '_ \   / __| | __| | |  / _ \  | '_ \  / __|
# |  _|   | |_| | | | | | | (__  | |_  | | | (_) | | | | | \__ \
# |_|      \__,_| |_| |_|  \___|  \__| |_|  \___/  |_| |_| |___/
#
# ===============================================================

def print_information(str, type='info', verbose_only=False):
    """Print provided string, prefixed with a tag where appropriate.

    Plain `info` messages are printed without a tag unless the verbose flag
    is set; everything else is printed with its tag from `print_type_dict`.

    Keyword arguments:
    str          -- The message to print (the name shadows the builtin inside
                    this function; it is kept so existing callers keep working)
    type         -- The type of information: {info, warning, error}
    verbose_only -- If true the string will only be printed when the verbose flag is set.
                    Useful for printing debugging info.
    """
    if verbose_only and not verbose:
        # Debugging output is suppressed unless the verbose flag is set
        return

    if type == 'info' and not verbose:
        # Print without tag
        print(str)
        return

    # Print with tag; an unknown type gets a plain tag instead of raising a KeyError
    print(print_type_dict.get(type, f"({type})"), str)
132

133
def get_credentials(user, passw):
    """Gets user credentials and returns them

    Values that were already provided are returned unchanged; the user is
    only prompted for the ones that are empty.

    Keyword arguments:
    user  -- The username passed from a text file
    passw -- The password passed from a text file

    Returns:
    A tuple (user, passw)
    """
    if not user:
        user = input("Enter your username: ")
    if not passw:
        # getpass does not echo the typed password
        passw = getpass.getpass()

    return (user, passw)

147
def acquire_login_cookie(protection, vo_link, user, passw):
    """Gets login-cookie by sending user credentials to login server

    Keeps asking for credentials until the server answers with a non-empty
    set of cookies.

    Keyword arguments:
    protection  -- The type of login the lecture requires (NETHZ or custom password)
    vo_link     -- The link to the lecture
    user        -- The username passed from a text file
    passw       -- The password passed from a text file

    Returns:
    The cookies received from the login server
    """
    if protection == "ETH":
        print_information("This lecture requires a NETHZ login")
        login_url = "https://video.ethz.ch/j_security_check"
    elif protection == "PWD":
        print_information("This lecture requires a CUSTOM login. Check the lecture's website or your emails for the credentials.")
        login_url = vo_link+".series-login.json"
    else:
        print_information("Unknown protection type: " + protection, type='error')
        print_information("Please report this to the project's GitLab issue page!", type='error')
        report_bug()
        # `report_bug` exits the program; should it ever return, hand back an empty jar
        return requests.cookies.RequestsCookieJar()

    headers = {"User-Agent": user_agent, "Referer": vo_link+".html"}

    while True:
        (user, passw) = get_credentials(user, passw)

        # The two login types expect differently named form fields
        if protection == "ETH":
            data = {"__charset__": "utf-8", "j_validate": True, "j_username": user, "j_password": passw}
        else:
            data = {"__charset__": "utf-8", "username": user, "password": passw}

        # Request login-cookie
        r = requests.post(login_url, headers=headers, data=data)
        print_information(f"Received response: {r.status_code}", verbose_only=True)

        # An empty cookie jar is treated as a failed login
        login_cookies = r.cookies
        if login_cookies:
            break

        print_information("Wrong username or password, please try again", type='warning')
        (user, passw) = ('', '')  # Reset passed credentials to not end up in loop if wrong credentials were passed

    print_information("Acquired cookie:", verbose_only=True)
    print_information(login_cookies, verbose_only=True)

    return login_cookies
212

213
214
215
def pretty_print_episodes(vo_json_data, selected):
    """Prints the episode numbers that match `selected`

    Keyword arguments:
    vo_json_data -- The series metadata; its `episodes` list is read here
    selected     -- Iterable with the indices of the episodes to print
    """
    episodes = vo_json_data['episodes']
    separator = " | "

    # Get length of longest strings for nice formatting when printing.
    # `default=0` keeps this from raising on a series without episodes.
    nr_length = len(" Nr.")
    max_date_length = max((len(str(episode['createdAt'][:-6])) for episode in episodes), default=0)
    max_title_length = max((len(episode['title']) for episode in episodes), default=0)
    max_lecturer_length = max((len(str(episode['createdBy'])) for episode in episodes), default=0)

    # Print header
    print_information(separator.join([
        " Nr.",
        "Date".ljust(max_date_length),
        "Name".ljust(max_title_length),
        "Lecturer".ljust(max_lecturer_length),
    ]))

    # Print the selected episodes
    for episode_nr in selected:
        episode = episodes[episode_nr]
        print_information(separator.join([
            "%3d".ljust(nr_length) % episode_nr,
            episode['createdAt'][:-6].ljust(max_date_length),  # drop the last six characters of the timestamp
            episode['title'].ljust(max_title_length),
            str(episode['createdBy']).ljust(max_lecturer_length),
        ]))
Georg Teufelberger's avatar
Georg Teufelberger committed
244

245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
def make_range(item, max_episode_number):
    """
    Turns a range expression entered by the user into a `range` of episode numbers

    Keyword arguments:
    item               -- a string in the form of 'x..z' or 'x..y..z'
    max_episode_number -- The highest episode number to have an upperbound for the range of episodes

    Returns:
    A range from x to z (inclusive), with step size y, 1 if y wasn't provided.
    A missing x defaults to 0, a missing z to `max_episode_number`, and both
    bounds are clamped to 0..max_episode_number.

    Raises:
    ValueError -- if `item` is not a valid range expression or the step is 0
    """
    parts = item.split('..')
    if len(parts) == 2:
        # user passed something like 'x..z', so step size is 1
        lower_bound, upper_bound = parts
        step = ''
    elif len(parts) == 3:
        # user passed something like 'x..y..z', so step size is y
        lower_bound, step, upper_bound = parts
    else:
        raise ValueError("Invalid range '" + item + "', expected 'x..z' or 'x..y..z'")

    try:
        # set the bounds to the outer limits if no number was passed
        lower_bound = int(lower_bound) if lower_bound else 0
        upper_bound = int(upper_bound) if upper_bound else max_episode_number
        step = int(step) if step else 1
    except ValueError:
        raise ValueError("Invalid range '" + item + "', bounds and step have to be whole numbers") from None

    if step == 0:
        raise ValueError("Invalid range '" + item + "', step size must not be 0")

    # Keep the result inside the valid episode numbers. Values beyond the last
    # episode would cause an IndexError later on and negative values would
    # silently select episodes counted from the end of the list.
    lower_bound = max(lower_bound, 0)
    upper_bound = min(upper_bound, max_episode_number)

    return range(lower_bound, upper_bound+1, step)

def get_user_choice(max_episode_number):
    """
    Prompts the user to pick multiple episodes and returns them

    Entries can be single numbers or ranges as understood by `make_range`.
    Entries that cannot be parsed or that lie outside of 0..max_episode_number
    are skipped with a warning instead of aborting the program.

    Keyword arguments:
    max_episode_number -- The highest episode number to have an upperbound for the range of episodes

    Returns:
    A sorted list containing the user picked choices without duplicates
    """
    # Prompt user
    user_input = input(
        "Enter numbers of the above lectures you want to download separated by space (e.g. 0 5 12 14)\nJust press enter if you don't want to download anything from this lecture\n"
    ).split()

    # A set makes the elements unique
    choice = set()
    for elem in user_input:
        try:
            if '..' in elem:
                candidates = make_range(elem, max_episode_number)
            else:
                # `int` decides what counts as a number; `str.isnumeric` also
                # accepts characters such as '²' which `int` rejects
                candidates = [int(elem)]
        except ValueError:
            print_information("Ignoring invalid selection: " + elem, type='warning')
            continue

        valid = [nr for nr in candidates if 0 <= nr <= max_episode_number]
        if len(valid) != len(candidates):
            print_information("Ignoring episode numbers that do not exist in selection: " + elem, type='warning')
        choice.update(valid)

    # sort them, to download in order and not randomly
    return sorted(choice)
Georg Teufelberger's avatar
Georg Teufelberger committed
297

298
def vo_scrapper(vo_link, user, passw):
    """
    Gets the list of all available videos for a lecture.
    Allows user to select multiple videos.
    Returns the selected episodes

    Keyword arguments:
    vo_link -- The link to the lecture
    user    -- The username passed from a text file
    passw   -- The password passed from a text file

    Returns:
    A list with one tuple (file_name, video_src_link, episode_name) per
    selected episode whose source link could be determined. The list is empty
    if nothing was selected or the login was interrupted.
    """
    global link_counter

    request_headers = {'User-Agent': user_agent}

    # Remove `.html` file extension
    if vo_link.endswith('.html'):
        vo_link = vo_link[:-5]

    # Get lecture metadata for episode list
    r = requests.get(vo_link + series_metadata_suffix, headers=request_headers)
    vo_json_data = json.loads(r.text)
    episodes = vo_json_data['episodes']

    # Increase counter for stats
    link_counter += len(episodes)

    if not episodes:
        # Nothing to list or select; avoids calling max() on an empty sequence below
        print_information("No videos selected")
        return list()

    # Print available lectures
    pretty_print_episodes(vo_json_data, range(len(episodes)))

    # Get video selections
    if download_all:
        # Add all available videos to the selected
        choice = list(range(len(episodes)))
    else:
        # Let user pick videos
        try:
            choice = get_user_choice(len(episodes) - 1)
        except KeyboardInterrupt:
            print()
            print_information("Exiting...")
            sys.exit()

    # Print the user's choice
    if not choice:
        print_information("No videos selected")
        return list() # Nothing to do anymore

    print_information("You selected:")
    pretty_print_episodes(vo_json_data, choice)
    print()

    # Check whether lecture requires login and get credentials if necessary
    print_information("Protection: " + vo_json_data["protection"], verbose_only=True)
    if vo_json_data["protection"] != "NONE":
        try:
            cookie_jar.update(acquire_login_cookie(vo_json_data["protection"], vo_link, user, passw))
        except KeyboardInterrupt:
            print()
            print_information("Keyboard interrupt detected, skipping lecture", type='warning')
            # Return an empty list like every other exit path does, rather than None
            return list()

    local_video_src_collection = list()

    # Collect links for download
    for item_nr in choice:
        # Get link to video metadata json file
        item = episodes[item_nr]
        video_info_link = video_info_prefix+item['id']

        # Download the video metadata file
        # Use login-cookie if provided otherwise make request without cookie
        if cookie_jar:
            r = requests.get(video_info_link, cookies=cookie_jar, headers=request_headers)
        else:
            r = requests.get(video_info_link, headers=request_headers)

        if r.status_code == 401:
            # The lecture requires a login
            print_information("Received 401 response. The following lecture requires a valid login cookie:", type='error')
            print_information("%2d" % item_nr + " " + item['title'] + " " + str(item['createdBy']) + " " + item['createdAt'][:-6], type='error')
            print_information("Make sure your token is valid. See README.md on how to acquire it.", type='error')
            print()
            continue
        video_json_data = json.loads(r.text)

        # Put available versions in list for sorting by video quality
        mp4_sources = video_json_data['streams'][0]['sources']['mp4']
        versions = list()
        print_information("Available versions:", verbose_only=True)
        for counter, vid_version in enumerate(mp4_sources):
            resolution = vid_version['res']
            versions.append((counter, resolution['w']*resolution['h']))
            print_information(str(counter) + ": " + "%4d" %resolution['w'] + "x" + "%4d" %resolution['h'], verbose_only=True)
        versions.sort(key=lambda tup: tup[1], reverse=True)
        # Now it's sorted: high -> medium -> low

        # Get video src url from json
        try: # try/except block to handle cases were not all three types of quality exist
            video_src_link = mp4_sources[versions[quality_dict[video_quality]][0]]['src']
        except IndexError:
            print_information("Requested quality \"" + video_quality + "\" not available. Skipping episode!", type='error')
            continue

        lecture_title = vo_json_data['title']
        episode_title = item["title"]

        # If video and lecture title overlap, remove lecture title from video title
        if episode_title.startswith(lecture_title):
            episode_title = episode_title[len(lecture_title):]

        # Extract episode name before adding the date to episode_title
        episode_date = item['createdAt'][:-6]
        episode_name = episode_date + " " + lecture_title + episode_title

        # Append date
        episode_title = episode_date+episode_title

        # Generate a pseudo hash by using part of the filename of the online version (which appears to be a UUID)
        pseudo_hash = video_src_link.replace('https://oc-vp-dist-downloads.ethz.ch/mh_default_org/oaipmh-mmp/','')[:8]
        print_information(pseudo_hash, verbose_only=True)

        # Filename is `directory/<video date (YYYY-MM-DD)><leftovers from video title>_<quality>-<pseudo_hash>.mp4`
        directory = directory_prefix + lecture_title + os.sep
        file_name = directory+episode_title+"_"+video_quality+"-"+pseudo_hash+".mp4"
        print_information(file_name, verbose_only=True)

        local_video_src_collection.append((file_name, video_src_link, episode_name))

    return local_video_src_collection
Georg Teufelberger's avatar
Georg Teufelberger committed
440

Georg Teufelberger's avatar
Georg Teufelberger committed
441
def downloader(file_name, video_src_link, episode_name):
    """Downloads the video and gives progress information

    If the `print_src` flag is set, the source link is only printed (or
    appended to `file_to_print_src_to`) and nothing is downloaded.

    Keyword arguments:
    file_name      -- Name of the file to write the data to
    video_src_link -- The link to download the data from
    episode_name   -- Name of the episode
    """
    global download_counter
    global skip_counter

    # Check for print_src flag
    if print_src:
        # Print to file if given
        if file_to_print_src_to:
            print_information("Printing " + video_src_link + " to file: "+ file_to_print_src_to, verbose_only=True)
            with open(file_to_print_src_to,"a") as f:
                f.write(video_src_link+"\n")
        else:
            print_information(video_src_link)
        return

    # Otherwise download video
    print_information("Video source: " + video_src_link, verbose_only=True)

    # Check history file (if one has been specified) whether episode has already been downloaded
    if history_file:
        try:
            with open(history_file, "r") as file:
                if video_src_link in [line.rstrip('\n') for line in file.readlines()]:
                    print_information("download skipped - file already recorded in history: " + episode_name)
                    skip_counter += 1
                    return
                else:
                    print_information("Link has not yet been recorded in history file", verbose_only=True)
        except FileNotFoundError:
            print_information("No history file found at specified location: " + history_file, type='warning', verbose_only=True)

    # Create directory for video if it does not already exist
    directory = os.path.dirname(os.path.abspath(file_name))
    if not os.path.isdir(directory):
        os.makedirs(directory)
        print_information("This folder was generated: " + directory, verbose_only=True)
    else:
        print_information("This folder already exists: " + directory, verbose_only=True)

    # Check if file already exists
    if os.path.isfile(file_name):
        print_information("download skipped - file already exists: " + episode_name)
        skip_counter += 1
    # Otherwise download it
    else:
        # cf.: https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
        # The data is written to a `.part` file which is renamed once the download is complete
        with open(file_name+".part", "wb") as f:
            response = requests.get(video_src_link, stream=True)
            total_length = response.headers.get('content-length')

            if total_length is None: # We received no content length header
                # The size can only be reported when the header is present;
                # converting None with int() would raise a TypeError
                print_information("Downloading " + episode_name + " (unknown size)")
                f.write(response.content)
            else:
                total_length = int(total_length)
                print_information("Downloading " + episode_name + " (%.2f" % (total_length/1024/1024) + " MiB)")

                # Download file and show progress bar
                dl = 0
                for data in response.iter_content(chunk_size=4096):
                    dl += len(data)
                    f.write(data)
                    # A reported length of 0 would otherwise divide by zero
                    done = int(50 * dl / total_length) if total_length else 50
                    sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
                    sys.stdout.flush()
        print()

        # Remove `.part` suffix from file name
        os.rename(file_name+".part", file_name)
        print_information("Downloaded file: " + episode_name)
        download_counter += 1

    if history_file:
        # Regardless whether we just downloaded the file or it already exists on disk, we want to add it to the history file
        with open(history_file, "a") as file:
            file.write(video_src_link + '\n')
Georg Teufelberger's avatar
Georg Teufelberger committed
525
526

def check_connection():
    """Checks connection to video.ethz.ch and if it fails then also to the internet

    Returns normally if video.ethz.ch can be reached. Otherwise an error is
    printed and the program exits with status 1.
    """
    # Make sure the exception type used below is loaded explicitly
    import urllib.error

    try:
        print_information("Checking connection to video.ethz.ch", verbose_only=True)
        req = urllib.request.Request('https://video.ethz.ch/', headers={'User-Agent': user_agent})
        # Only reachability matters; the response is closed right away
        with urllib.request.urlopen(req):
            pass
    except urllib.error.URLError:
        try:
            print_information("There seems to be no connection to video.ethz.ch", type='error')
            print_information("Checking connection to the internet by connecting to duckduckgo.com", verbose_only=True)
            with urllib.request.urlopen('https://www.duckduckgo.com'):
                pass
        except urllib.error.URLError:
            print_information("There seems to be no internet connection - please connect to the internet and try again.", type='error')
        # Without video.ethz.ch there is nothing to do, whether or not the internet is reachable
        sys.exit(1)

541
def report_bug():
    """Opens GitLab issue page in browser

    Prints the link, waits for the user to confirm with enter and then tries
    to open it in the browser. The program exits afterwards in every case.
    """
    print_information(gitlab_issue_page)
    try:
        input("Press enter to open the link in your browser or Ctrl+C to exit.")
    except (KeyboardInterrupt, EOFError):
        # User declined; finish the prompt line
        print()
    else:
        try:
            webbrowser.open(gitlab_issue_page)
        except Exception:
            # Opening the browser is best effort only (the `webbrowser` import
            # is optional); the link has already been printed above
            print()
    print_information("Exiting...")
    sys.exit(0)

552
553
554
555
556
557
558
559
560
561
def version_tuple(version):
    """Converts a dotted version string such as `1.2.1` into a tuple of integers"""
    return tuple(int(component) for component in version.split('.'))

def check_update():
    """
    Checks for a new version of the scraper and prompts the user if a new version is available

    A failed check never stops the scraper: a warning is printed and the
    program continues with the current version.
    """
    print_information("Checking for update", verbose_only=True)

    # try/except block to not crash the scraper just because it couldn't connect to server holding the version number
    try:
        r = requests.get(remote_version_link)
        if r.status_code != 200:
            # We couldn't get the file for some reason
            raise ValueError("unexpected status code " + str(r.status_code))

        # Loading the version number succeeded
        remote_version = version_tuple(r.text)
        local_version = version_tuple(program_version)
    except Exception:
        # Update check failed
        print_information("Update check failed, skipping...", type='warning')
        # Note: We don't want the scraper to fail because it couldn't check for a new version so we continue regardless
        return

    if remote_version > local_version:
        # There's an update available, prompt the user
        remote_version_string = '.'.join(map(str,remote_version))
        print_information("A new version of the VO-scraper is available: " + remote_version_string, type='warning')
        print_information("You have version: " + '.'.join(map(str,local_version)), type='warning')
        print_information("You can download the new version from here: " + gitlab_repo_page, type='warning')
        print_information("The changelog can be found here: " + gitlab_changelog_page + remote_version_string, type='warning')
        print_information("Press enter to continue with the current version", type='warning')
        try:
            input()
        except (KeyboardInterrupt, EOFError):
            # Continue as well, but don't report this as a failed update check
            print()
    else:
        # We are up to date
        print_information("Scraper is up to date according to version number in remote repo.", verbose_only=True)

596
def read_links_from_file(file):
    """Reads the links from a text file

    Each link should be on a separate line
    Lines starting with `#` will be ignored
    Empty lines and lines containing only whitespace will be ignored
    Username and password can be added at the end of the link separated by a space

    Keyword arguments:
    file -- Path of the text file to read the links from

    Returns a list with one string per remaining line, without surrounding whitespace.
    If `file` does not exist an error is printed and an empty list is returned.
    """
    links = list()
    if os.path.isfile(file):
        # Read provided file line by line
        with open(file, "r") as myfile:
            for line in myfile:
                # Strip surrounding whitespace (this includes the newline character) first,
                # so that indented comments and whitespace-only lines are recognised as such
                # and are not handed on as if they were links
                line = line.strip()

                # Skip empty lines and comments
                if not line or line.startswith('#'):
                    continue

                # Add link from file to the list of links to look at
                links.append(line)
    else:
        print_information("No file with name \"" + file +"\" found", type='error')
    return links

Georg Teufelberger's avatar
Georg Teufelberger committed
623
624
625
626
627
628
629
def apply_args(args):
    """Applies the provided command line arguments to the global settings

    The following are handled here:
     - verbose
     - bug
     - all
     - quality
     - print-source
     - destination
     - history

    Keyword arguments:
    args -- The namespace object returned by the argument parser
    """

    global verbose
    global download_all
    global video_quality
    global print_src
    global file_to_print_src_to
    global directory_prefix
    global history_file

    # Enable verbose for debugging
    verbose = args.verbose
    print_information("Verbose enabled", verbose_only=True)

    # Check if user wants to submit bug report
    if args.bug:
        print_information("If you found a bug you can raise an issue here: ")
        report_bug()

    # Set global variable according to input
    download_all = args.all
    video_quality = args.quality

    # Check for printing flag
    # argparse stores the value of `--print-source` in the attribute `print_source`.
    # The older attribute name `print_src` is still accepted as a fallback.
    # The attribute is only looked up with `hasattr` because it may be absent from the namespace.
    for attribute in ('print_source', 'print_src'):
        if hasattr(args, attribute):
            print_src = True
            # Store where to print video source, if a file name was given
            target_file = getattr(args, attribute)
            if target_file:
                file_to_print_src_to = target_file
            break

    # Check for destination flag
    if args.destination:
        directory_prefix = args.destination
        print_information("The user passed directory is: " + directory_prefix, verbose_only=True)
        if not directory_prefix.endswith(os.sep):
            # Add trailing (back)slash as the user might have forgotten it
            directory_prefix += os.sep
            print_information("Added missing slash: " + directory_prefix, verbose_only=True)

    # Store where to read/print history
    if args.history:
        history_file = args.history
        print_information("History file location: " + history_file, verbose_only=True)

677

Georg Teufelberger's avatar
Georg Teufelberger committed
678
def setup_arg_parser():
    """Builds the argument parser that knows every flag of the scraper

    Returns the configured `argparse.ArgumentParser`.
    """
    # Options shared by every simple on/off switch
    switch = {"action": "store_true"}

    # One entry per argument: (names passed to the parser, keyword options).
    # The order here is the order in which the arguments get registered.
    argument_table = (
        (("lecture_link",), {
            "nargs": '*',
            "help": "A link for each lecture you want to download videos from. The link should look like: https://video.ethz.ch/lectures/<department>/<year>/<spring or autumn>/<Number>.html",
        }),
        (("-a", "--all"), dict(
            switch,
            help="Download all videos of the specified lecture. Already downloaded video will be skipped.",
        )),
        (("-b", "--bug"), dict(
            switch,
            help="Print link to GitLab issue page and open it in browser.",
        )),
        (("-d", "--destination"), {
            "help": 'Directory where to save the files to. By default this is the folder "Lecture Recordings/" of the current working directory.',
        }),
        (("-f", "--file"), {
            "help": "A file with links to all the lectures you want to download. Each lecture link should be on a new line. See README.md for details.",
        }),
        (("-hs", "--history"), {
            "help": "A file to which the scraper saves the IDs of downloaded videos to. The scraper will skip downloads if the corresponding ID exists in the specified file.",
        }),
        (("-p", "--print-source"), {
            "metavar": "FILE",
            "nargs": "?",
            # With a suppressed default the attribute only exists if the flag was passed
            "default": argparse.SUPPRESS,
            "help": "Prints the source link for each video but doesn't download it. Follow with filename to print to that file instead. Useful if you want to use your own tool to download the video.",
        }),
        (("-q", "--quality"), {
            "choices": ['high', 'medium', 'low'],
            "default": 'high',
            "help": 'Select video quality. Accepted values are "high" (1920x1080), "medium" (1280x720), and "low" (640x360). Default is "high"',
        }),
        (("-sc", "--skip-connection-check"), dict(
            switch,
            help="Skip checking whether there's a connection to video.ethz.ch or the internet in general.",
        )),
        (("-su", "--skip-update-check"), dict(
            switch,
            help="Skip checking whether there's an update available for the scraper",
        )),
        (("-v", "--verbose"), dict(
            switch,
            help="Print additional debugging information.",
        )),
        (("--version",), dict(
            switch,
            help="Print version number and exit.",
        )),
    )

    arg_parser = argparse.ArgumentParser()
    for names, options in argument_table:
        arg_parser.add_argument(*names, **options)
    return arg_parser

744
745
746
747
748
749
750
751
752
def print_usage():
    """Explains how to call the scraper and shows two example commands"""
    usage_lines = (
        "You haven't added any lecture links! To download a lecture video you need to pass a link to the lecture, e.g.:",
        "    \"python3 vo-scraper.py https://video.ethz.ch/lectures/d-infk/2019/spring/252-0028-00L.html\"",
        "",
        "You can also pass optional arguments. For example, the following command downloads all lectures of \"Design of Digital Circuits\" from the year 2019 in low quality:",
        "    \"python3 vo-scraper.py --quality low --all https://video.ethz.ch/lectures/d-infk/2019/spring/252-0028-00L.html\"",
        "",
        "To see all possible arguments run \"python3 vo-scraper.py --help\"",
    )
    # Every entry is printed as its own message, in the order given above
    for usage_line in usage_lines:
        print_information(usage_line)
Georg Teufelberger's avatar
Georg Teufelberger committed
753

754
755
756
757
758
759
760
761
762
763
def remove_illegal_characters(str):
    """Returns a copy of the input string without the characters that some file systems (e.g. NTFS) reject

    Keyword arguments:
    str -- The string from which to remove the characters
    """
    forbidden = frozenset('?<>:*|"^')
    # Keep every character that is not in the forbidden set, in its original order
    return ''.join(char for char in str if char not in forbidden)

# ===============================================================
#  __  __           _
# |  \/  |   __ _  (_)  _ __
# | |\/| |  / _` | | | | '_ \
# | |  | | | (_| | | | | | | |
# |_|  |_|  \__,_| |_| |_| |_|
#
# ===============================================================

if __name__ == '__main__':
    # Setup parser
    parser = setup_arg_parser()
    args = parser.parse_args()

    # Check for version flag
    if args.version:
        print_information(program_version)
        sys.exit()

    # Apply commands from input
    apply_args(args)

    # Collect lecture links, first from the file (if one was passed) ...
    links = list()
    if args.file:
        links += read_links_from_file(args.file)

    # ... then append links passed through the command line:
    links += args.lecture_link

    # Print basic usage and exit if no lecture links are passed
    if not links:
        print_usage()
        sys.exit()

    # Extract username and password from "link"
    # Splitting on any run of whitespace keeps repeated spaces or tabs from producing empty fields.
    # Padding with three empty strings guarantees tuples of size 3, where user and pw can be empty
    lecture_objects = [tuple((link.split() + ['', '', ''])[:3]) for link in links]

    # Connection check
    if not args.skip_connection_check:
        check_connection()
    else:
        print_information("Connection check skipped.", verbose_only=True)

    # Update check
    if not args.skip_update_check:
        check_update()
    else:
        print_information("Update check skipped.", verbose_only=True)

    # Run scraper for every link provided to get video sources for each episode
    for (link, user, password) in lecture_objects:
        print_information("Currently selected: " + link, verbose_only=True)
        if "video.ethz.ch" not in link:
            print_information("Looks like the provided link does not go to 'video.ethz.ch' and has therefore been skipped. Make sure that it is correct: " + link, type='warning')
        else:
            video_src_collection += vo_scrapper(link, user, password)
        print()

    # Print collected episodes
    print_information(video_src_collection, verbose_only=True)

    # Strip illegal characters:
    video_src_collection = [(remove_illegal_characters(file_name), video_src_link, episode_name) for (file_name, video_src_link, episode_name) in video_src_collection]

    # Download selected episodes
    for (file_name, video_src_link, episode_name) in video_src_collection:
        downloader(file_name, video_src_link, episode_name)

    # Print summary and exit
    print_information(str(link_counter) + " files found, " + str(download_counter) + " downloaded and " + str(skip_counter) + " skipped")
    if platform == "win32":
        input('\nEOF') # So Windows users also see the output (apparently)