To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

youtubedl.go 15.9 KB
Newer Older
1
package youtubedl
2
3

import (
4
	"context"
5
	"encoding/json"
6
7
	"encoding/xml"
	"errors"
8
9
10
	"fmt"
	"github.com/asticode/go-astisub"
	log "github.com/sirupsen/logrus"
11
	"gitlab.ethz.ch/chgerber/MessageComposition/src/pkg/util"
Christof Gerber's avatar
Christof Gerber committed
12
	"gitlab.ethz.ch/chgerber/annotation/v2"
13
	"gitlab.ethz.ch/chgerber/monitor"
Christof Gerber's avatar
Christof Gerber committed
14
	"go.mongodb.org/mongo-driver/mongo"
15
16
17
	"google.golang.org/api/googleapi/transport"
	"google.golang.org/api/option"
	"google.golang.org/api/youtube/v3"
18
	"io/ioutil"
19
	"net/http"
20
21
	"net/url"
	"os"
22
	"os/exec"
23
24
	"path/filepath"
	"regexp"
25
26
27
	"strings"
)

28
29
30
31
32
33
34
// Subtitle is a wrapper type for YouTube subtitles that can be parsed by astisub
type Subtitle struct {
	*astisub.Subtitles
}

// Annotations turns the subtitles into annotations with the src field set as given
// Method of the annotation.Caption IF
35
func (s *Subtitle) Annotations(src string) []*annotation.Annotation {
36
37
38
	return annotation.SubToAnnotation(s.Subtitles, src)
}

39
// SubtitleFormat type describes a subtitle type (e.g. `vtt`).
type SubtitleFormat string

// String returns the format name as a plain string.
func (sf SubtitleFormat) String() string {
	return string(sf)
}

const (
	// SubVTT caption of type vtt
	SubVTT SubtitleFormat = "vtt"
	// SubSRT caption of type srt
	SubSRT SubtitleFormat = "srt"
	// SubSSA caption of type ssa
	SubSSA SubtitleFormat = "ssa"
	// SubSTL caption of type stl
	SubSTL SubtitleFormat = "stl"
	// SubTTML caption of type ttml
	SubTTML SubtitleFormat = "ttml"
	// SubSRV3 caption of the youtube xml timedtext type with file extension .srv3
	SubSRV3 SubtitleFormat = "srv3"
)

// SupportedSubs is a list in preferred order of subtitle types that are supported to be parsed.
var SupportedSubs = []SubtitleFormat{
	SubSRV3,
	SubSRT,
	SubTTML,
	SubSSA,
	SubSTL,
	// SubVTT is intentionally left out.
	// TODO Timing does not work with YouTube subs of this type. -> Fix parsing
}

71
// ErrYoutubeDLLangNotSupported is thrown when language not supported.
// The string value of the error is the message returned by Error.
type ErrYoutubeDLLangNotSupported string

// Error implements the error interface.
func (e ErrYoutubeDLLangNotSupported) Error() string {
	return string(e)
}

78
79
80
81
82
83
84
// ErrYoutubeVideoUnavailable is the error type used to report that a video is unavailable.
// The string value of the error is the message returned by Error.
type ErrYoutubeVideoUnavailable string

// Error implements the error interface.
func (u ErrYoutubeVideoUnavailable) Error() string {
	return string(u)
}

85
// ErrYoutubeDL is the general error type of the youtubedl pkg.
// The string value of the error is the message returned by Error.
type ErrYoutubeDL string

// Error implements the error interface.
func (e ErrYoutubeDL) Error() string {
	return string(e)
}

92
93
func runYoutubeDL(args []string) (output []byte, err error) {
	cmd := exec.Command("youtube-dl", args...)
94
	log.Debug(args)
95
	output, err = cmd.CombinedOutput()
96
	log.Trace(string(output))
97
98
	if err != nil && strings.Contains(string(output), "video is unavailable") {
		return nil, ErrYoutubeVideoUnavailable(string(output))
99
	}
100
	return output, err
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
}

// getVideoInfo runs `youtube-dl --skip-download -J` for the video with the
// given id and decodes the JSON document it prints into a generic map.
//
// Only the last non-blank line of the output is decoded. runYoutubeDL returns
// stdout and stderr combined, so other lines may precede the JSON document.
//
// It returns an ErrYoutubeDL when youtube-dl printed nothing, the error of
// runYoutubeDL when the command failed, or the JSON decoding error. info is nil
// whenever an error is returned.
func getVideoInfo(id string) (info map[string]interface{}, err error) {
	args := []string{
		"--skip-download",
		"-J",
		"https://youtu.be/" + id,
	}

	output, err := runYoutubeDL(args)
	if err != nil {
		return nil, err
	}

	// strings.Split never yields an empty slice, so an empty response has to
	// be detected on the trimmed text itself.
	trimmed := strings.Trim(string(output), "\n\t ")
	if trimmed == "" {
		return nil, ErrYoutubeDL("youtube-dl response decoding error")
	}

	lines := strings.Split(trimmed, "\n")
	if err = json.Unmarshal([]byte(lines[len(lines)-1]), &info); err != nil {
		return nil, err
	}

	return info, nil
}

130
131
132
// SubList stores two maps[language][sub-formats].
// AutomaticCaptions are the automatically generated youtube captions.
// Subtitles are the proper Subtitles.
type SubList struct {
	AutomaticCaptions map[string][]string
	Subtitles         map[string][]string
}

// NewSubList creates a new reference to a SubList with empty members.
// Both maps are allocated, so they can be written to right away.
func NewSubList() *SubList {
	return &SubList{
		AutomaticCaptions: make(map[string][]string),
		Subtitles:         make(map[string][]string),
	}
}

146
// subListWhitespace matches a run of whitespace. It is compiled once at
// package level instead of once per parsed line.
var subListWhitespace = regexp.MustCompile(`\s+`)

// listSubs runs `youtube-dl --skip-download --list-subs` for the video with the
// given id and returns the subtitle and automatic caption formats it reports,
// keyed by language.
//
// The error of runYoutubeDL is returned unchanged when the command fails.
func listSubs(id string) (*SubList, error) {
	args := []string{
		"--skip-download",
		"--list-subs",
		"https://youtu.be/" + id,
	}

	output, err := runYoutubeDL(args)
	if err != nil {
		return nil, err
	}

	return parseSubList(string(output)), nil
}

// parseSubList extracts the language/format tables from the text printed by
// `youtube-dl --list-subs`.
//
// Lines are matched case-insensitively:
//   - lines containing "youtube" are ignored,
//   - "available automatic captions" and "available subtitles" start the
//     respective table,
//   - lines containing "language formats" (the table header) are ignored,
//   - every other line inside a table is read as a language followed by one or
//     more format names, separated by whitespace and/or commas.
//
// Rows with fewer than two fields and all lines before the first table are
// skipped.
func parseSubList(output string) *SubList {
	const (
		autoSection = "automatic captions"
		subsSection = "available subtitles"
	)

	subs := NewSubList()
	section := ""

	for _, line := range strings.Split(output, "\n") {
		lower := strings.ToLower(line)
		switch {
		case strings.Contains(lower, "youtube"):
			continue
		case strings.Contains(lower, "available automatic captions"):
			section = autoSection
			continue
		case strings.Contains(lower, "available subtitles"):
			section = subsSection
			continue
		case strings.Contains(lower, "language formats"):
			continue
		case section == "":
			continue
		}

		// Turn commas into separators first so that "vtt,ttml" is not fused
		// into one token, then collapse whitespace and trim so that an
		// indented row does not produce an empty language key.
		row := strings.ReplaceAll(line, ",", " ")
		row = strings.TrimSpace(subListWhitespace.ReplaceAllString(row, " "))

		fields := strings.Split(row, " ")
		if len(fields) < 2 {
			continue
		}

		switch section {
		case autoSection:
			subs.AutomaticCaptions[fields[0]] = fields[1:]
		case subsSection:
			subs.Subtitles[fields[0]] = fields[1:]
		}
	}

	return subs
}

198
// findSub return nil when subtitle in the requested language is not available in any format
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
func findSub(id string, language string) (formats []string, err error) {
	subList, err := listSubs(id)
	if err != nil {
		return nil, err
	}
	if formats, ok := subList.Subtitles[language]; ok {
		return formats, nil
	}
	return nil, nil
}

// findAutomaticCaption returns the formats in which an automatic caption of the
// given language is listed for the video id.
// The result is nil (with a nil error) when no automatic caption exists for
// that language; an error of listSubs is passed through.
func findAutomaticCaption(id string, language string) (formats []string, err error) {
	available, err := listSubs(id)
	if err != nil {
		return nil, err
	}

	// A missing language key evaluates to the nil slice.
	return available.AutomaticCaptions[language], nil
}

// downloadSubtitle asks youtube-dl to fetch the subtitle of the given youtube
// video in the given language and format, passing file as the output template
// (`-o`). The subtitle is saved as $file.$language.$format.
//
// auto selects the automatic caption (`--write-auto-sub`) instead of the
// regular subtitle (`--write-sub`).
//
// Besides errors of runYoutubeDL, an ErrYoutubeDL is returned when the
// youtube-dl output reports that the language or the format is not available.
func downloadSubtitle(auto bool, id string, language string, format string, file string) error {
	args := []string{"--skip-download"}
	if auto {
		args = append(args, "--write-auto-sub")
	} else {
		args = append(args, "--write-sub")
	}
	args = append(args,
		"--sub-lang", language,
		"--sub-format", format,
		"https://youtu.be/"+id,
		"-o", file,
	)

	output, err := runYoutubeDL(args)
	if err != nil {
		return err
	}

	// youtube-dl can exit successfully and still report a missing subtitle,
	// so the output text is inspected as well.
	report := strings.ToLower(string(output))
	switch {
	case strings.Contains(report, "subtitles not available"):
		return ErrYoutubeDL(fmt.Sprintf("subtitles language %s not available", language))
	case strings.Contains(report, "no subtitle format found"):
		//TODO delete the alternative subtitle file that it downloads automatically
		return ErrYoutubeDL(fmt.Sprintf("subtitle format %s not found", format))
	}

	return nil
}

// GetSubtitle downloads the subtitle of the video id in the passed language.
// Throws an error if subtitle not found.
259
// Downlaods the automatic caption when auto is true.
260
func GetSubtitle(auto bool, id string, language string) (sub annotation.Caption, err error) {
261

262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
	var formats []string
	if auto == false {
		formats, err = findSub(id, language)
		if err != nil {
			return nil, err
		}

		if formats == nil {
			return nil, ErrYoutubeDLLangNotSupported(fmt.Sprintf("No subtitle with language %s for video %s", language, id))
		}
	} else {
		formats, err = findAutomaticCaption(id, language)
		if err != nil {
			return nil, err
		}

		if formats == nil {
			return nil, ErrYoutubeDLLangNotSupported(fmt.Sprintf("No automatic caption with language %s for video %s", language, id))
		}
281
282
	}

283
284
285
286
287
288
	log.WithFields(log.Fields{
		"videoID":  id,
		"language": language,
		"auto":     auto,
		"types":    formats,
	}).Tracef("Available subtitle types for this video")
289

290
	for _, preferredSubType := range SupportedSubs {
291

292
		for _, format := range formats {
293

294
295
296
297
298
299
			if preferredSubType.String() == format {

				filename := filepath.Join("/tmp", "sub-"+id)
				file := filename + "." + language + "." + format

				defer os.Remove(file)
300
				err := downloadSubtitle(auto, id, language, format, filename)
301
302
303
304
305
				if err != nil {
					log.Warn(err)
					continue
				}

306
				sub, err := parseSub(file, preferredSubType)
307
308
309
310
				if err != nil {
					log.Warn(err)
					continue
				}
311
312

				return sub, nil
313
			}
314
		}
315
316
317
318
319
320

		log.WithFields(log.Fields{"videoID": id,
			"language": language,
			"type":     preferredSubType.String(),
		}).Debugf("Preferred subtitle type not available for this video")

321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
	}

	return nil, ErrYoutubeDL("No available subtitle is supported")
}

// GetVideoURL retrieves the videoplayback url of the video and audio stream
func GetVideoURL(id string) (video *url.URL, audio *url.URL, err error) {

	args := []string{
		"-g",
		"https://www.youtube.com/watch?v=" + id,
	}

	// run command
	cmd := exec.Command("youtube-dl", args...)
336
	out, err := cmd.Output()
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
	if err != nil {
		return nil, nil, err
	}
	links := strings.Split(strings.Trim(string(out), "\n\t "), "\n")
	if len(links) != 2 {
		return nil, nil, ErrYoutubeDL(fmt.Sprintf("Expected two links but got %v", len(links)))
	}
	video, err = url.Parse(links[0])
	if err != nil {
		return nil, nil, err
	}
	audio, err = url.Parse(links[1])
	if err != nil {
		return nil, nil, err
	}
	return video, audio, nil
353
}
354

355
356
// GetPlaylistVideos returns a list of youtube video id's that are in the given playlist.
// The response is limited to the first 50 videos.
357
358
359
360
361
362
363
364
365
366
func GetPlaylistVideos(playListID string, apiKey string) (members []string, err error) {
	client := &http.Client{
		Transport: &transport.APIKey{Key: apiKey},
	}

	service, err := youtube.NewService(context.Background(), option.WithHTTPClient(client))
	if err != nil {
		return nil, err
	}
	// Make the API call to YouTube.
367
368
369
370
	call := service.PlaylistItems.List("id,contentDetails").
		PlaylistId(playListID).
		MaxResults(50)

371
372
373
374
375
376
377
378
379
380
381
	response, err := call.Do()
	if err != nil {
		return nil, err
	}

	for _, item := range response.Items {
		members = append(members, item.ContentDetails.VideoId)
	}

	return members, nil
}
382

383
// LoadYouTubeVideos downloads the subtitles of the specified YouTube videos and creates annotations in the specified mongo db collection
384
// user supplied YouTube subtitles are preferred over auto-generated ones
385
386
387
388
func LoadYouTubeVideos(videoIDs []string, language string, collection *mongo.Collection) error {
	for _, v := range videoIDs {
		err := loadYouTubeAnnotationsVideo(v, language, collection)
		if err != nil {
389
390
			log.Warn(err)
			continue
391
392
393
394
395
396
		}
	}
	return nil
}

// loadYouTubeAnnotationsVideo downloads the subtitle of one YouTube video in
// the given language, converts it into annotations with the source
// "youtube://<videoID>" and uploads them to the given mongo db collection.
//
// The manual subtitle is tried first. Only when GetSubtitle reports an
// ErrYoutubeDLLangNotSupported is the automatic caption downloaded instead;
// every other error is returned to the caller.
func loadYouTubeAnnotationsVideo(videoID string, language string, collection *mongo.Collection) error {
	// prioritize manual subtitle before downloading automatic youtube caption (autogenerated)
	subs, err := GetSubtitle(false, videoID, language)
	if err != nil {
		var langErr ErrYoutubeDLLangNotSupported
		if !errors.As(err, &langErr) {
			return err
		}

		log.WithFields(log.Fields{"videoID": videoID, "language": language}).Trace("No manual subtitle found")

		// download automatic caption
		subs, err = GetSubtitle(true, videoID, language)
		if err != nil {
			return err
		}
	}

	annotations := subs.Annotations("youtube://" + videoID)

	log.WithFields(log.Fields{"videoID": videoID, "language": language, "quantity": len(annotations)}).Trace("Subtitle items found")

	return annotation.UploadToDB(annotations, collection)
}

// LoadYouTubeAnnotationsPlaylist downloads the subtitles of the specified YouTube playlist and loads them to the specified mongo db collection
// user supplied YouTube subtitles are preferred over auto-generated ones
Christof Gerber's avatar
Christof Gerber committed
431
func LoadYouTubeAnnotationsPlaylist(playlistID string, language string, collection *mongo.Collection) error {
432
433
434
435
436
437
438
439
440
	defer monitor.Elapsed()()

	members, err := GetPlaylistVideos(playlistID, os.Getenv("YOUTUBE_DATA_API_KEY"))
	if err != nil {
		return err
	}
	log.WithFields(log.Fields{"playlistID": playlistID, "language": language, "quantity": len(members)}).Trace("Videos found in playlist")

	for _, videoID := range members {
441
		err := loadYouTubeAnnotationsVideo(videoID, language, collection)
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
		if err != nil {
			log.Warning(err)
			continue
		}
	}

	return nil
}

// TimedText describes a parsed YouTube caption file type of .srv3 format (xml).
// It maps the root element <timedtext>.
type TimedText struct {
	XMLName xml.Name `xml:"timedtext"`
	Format  string   `xml:"format,attr,omitempty"`
	Body    Body     `xml:"body"`
}

// Body contains all the paragraphs (the <p> children of <body>).
type Body struct {
	Paragraphs []Paragraph `xml:"p"`
}

// Paragraph is one caption line (sequence of words=segments).
// Time [ms] is relative to the start of the caption (t=0).
// Duration is read from the `d` attribute; Time+Duration is used as the end of
// the paragraph when annotations are built.
// Value holds the character data directly inside <p>, Segments its <s> children.
type Paragraph struct {
	Time     int       `xml:"t,attr"`
	Duration int       `xml:"d,attr"`
	Value    string    `xml:",chardata"`
	Segments []Segment `xml:"s,omitempty"`
}

// Segment is one word/term with the Time [ms] (relative to the Paragraph.Time) and Duration [ms].
// NOTE(review): Duration is read from the `ac` attribute and is not used
// anywhere in this file — confirm that `ac` really carries a duration.
type Segment struct {
	Time     int    `xml:"t,attr,omitempty"`
	Duration int    `xml:"ac,attr"`
	Value    string `xml:",chardata"`
}

480
481
482
483
484
485
486
487
488
// SegmentsText builds the text of the paragraph from its segments: every
// segment value is stripped of surrounding whitespace and the resulting words
// are combined by util.WordsToString.
func (p *Paragraph) SegmentsText() string {
	var words []string
	for i := range p.Segments {
		words = append(words, strings.TrimSpace(p.Segments[i].Value))
	}
	return util.WordsToString(words)
}

489
490
491
492
493
494
495
496
497
498
499
500
501
// parseSRV3 decodes the raw bytes of a youtubedl subtitle of type .srv3 (xml)
// into a TimedText. The xml decoding error is returned unchanged, in which case
// the result is nil.
func parseSRV3(file []byte) (*TimedText, error) {
	decoded := new(TimedText)
	if err := xml.Unmarshal(file, decoded); err != nil {
		return nil, err
	}
	return decoded, nil
}

// Annotations turns the TimedText into annotations with the src field set as given
// Method of the annotation.Caption IF
Christof Gerber's avatar
Christof Gerber committed
502
503
504
// Args default values
// 1. minWindowLength := 1
// 2. maxWindowLength := 5
505
func (s *TimedText) Annotations(src string) []*annotation.Annotation {
Christof Gerber's avatar
Christof Gerber committed
506

507
	return fromTimedText(s, src)
508
509
}

510
511
512
// fromTimedText creates one Annotation for each Paragraph
// TODO find smart way which segments to combine into one annotation (e.g. when t_pause > x || len(segments) > 10)
func fromTimedText(subtitle *TimedText, src string) []*annotation.Annotation {
513

514
	var annos []*annotation.Annotation
515

516
	count := 0
517
518
	for paragraphNum, paragraph := range subtitle.Body.Paragraphs {

519
520
521
522
523
524
		// if non-empty paragraph
		if len(paragraph.Segments) > 0 || strings.Join(strings.Fields(paragraph.Value), " ") != "" {
			count++

			// Create annotation for the paragraph
			a := annotation.Annotation{Src: src, Subtitle: annotation.Subtitle{Count: count}}
525

526
527
528
529
530
531
532
533
534
535
536
537
538
			// handle when current paragraph subtitle stays displayed even when next one appears
			var endTime int
			if paragraphNum < len(subtitle.Body.Paragraphs)-1 {
				if paragraph.Time+paragraph.Duration > subtitle.Body.Paragraphs[paragraphNum+1].Time {
					endTime = subtitle.Body.Paragraphs[paragraphNum+1].Time
				} else {
					endTime = paragraph.Time + paragraph.Duration
				}

			} else {
				endTime = paragraph.Time + paragraph.Duration
			}

539
540
			a.Subtitle.Start = paragraph.Time
			a.Subtitle.End = endTime
541

542
543
544
545
			// Set segments of annotation
			// Handle automatic capti0xc00046af50,ons with segments (word granularity timing)
			var segments []annotation.TimedText
			for segmentIdx, segment := range paragraph.Segments {
546
547

				// Compute the start time of the current window
548
				startTime := paragraph.Time + segment.Time
549
550
551

				var endTime int
				// if current segment not last one
552
553
				if segmentIdx < len(paragraph.Segments)-1 {
					endTime = paragraph.Time + paragraph.Segments[segmentIdx+1].Time
554
				} else {
555
					endTime = a.Subtitle.End
556
				}
557

558
559
560
561
				s := annotation.TimedText{
					Text:  strings.TrimSpace(segment.Value),
					Start: startTime,
					End:   endTime,
562
563
				}

564
565
566
567
568
569
570
571
				segments = append(segments, s)
			}

			// set paragraph value if non-empty and no segments present and
			if len(paragraph.Segments) == 0 {
				a.Subtitle.Text = paragraph.Value
			} else {
				a.Subtitle.Text = paragraph.SegmentsText()
572
			}
573
574
575
576

			a.Subtitle.Segments = segments

			annos = append(annos, &a)
577
578
579
		}
	}

580
	return annos
581
}
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616

// parseSub reads the subtitle file at the given path and returns it as an
// annotation.Caption.
//
// Files of format SubSRV3 are read completely and decoded by parseSRV3 into a
// TimedText. Every other format is opened with astisub.OpenFile and wrapped in
// a Subtitle. Read and parse errors are returned unchanged.
func parseSub(file string, format SubtitleFormat) (annotation.Caption, error) {
	if format != SubSRV3 {
		parsed, err := astisub.OpenFile(file)
		if err != nil {
			return nil, err
		}
		return &Subtitle{parsed}, nil
	}

	// srv3 is not handled by astisub here; read the file and parse the xml ourselves.
	raw, err := ioutil.ReadFile(file)
	if err != nil {
		return nil, err
	}

	timedText, err := parseSRV3(raw)
	if err != nil {
		return nil, err
	}
	return timedText, nil
}