To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

Commit 8106b6d0 authored by Christof Gerber
Browse files

Ordered preference for youtubedl subs. TimedText to annotations with window slide approach

parent 7f6c34a0
Pipeline #35956 passed with stages
in 7 minutes
......@@ -3,17 +3,12 @@ module gitlab.ethz.ch/chgerber/youtubedl
go 1.12
require (
github.com/asticode/go-astiamqp v1.0.0 // indirect
github.com/asticode/go-astisub v0.0.0-20181231080834-e2ca1c7ce8f4
github.com/asticode/go-astits v0.0.0-20190105100228-f4a041fc41e5 // indirect
github.com/pkg/errors v0.8.1 // indirect
github.com/sirupsen/logrus v1.4.1
github.com/asticode/go-astisub v0.0.0-20190514140258-c0ed7925c393
github.com/pierrec/lz4 v2.0.5+incompatible // indirect
github.com/sirupsen/logrus v1.4.2
github.com/stretchr/testify v1.3.0
golang.org/x/crypto v0.0.0-20190513172903-22d7a77e9e5f // indirect
golang.org/x/lint v0.0.0-20190409202823-959b441ac422 // indirect
golang.org/x/net v0.0.0-20190522155817-f3200d17e092 // indirect
golang.org/x/sys v0.0.0-20190528183647-3626398d7749 // indirect
golang.org/x/text v0.3.2 // indirect
golang.org/x/tools v0.0.0-20190529010454-aa71c3f32488 // indirect
google.golang.org/api v0.3.2
gitlab.ethz.ch/chgerber/MessageComposition v0.0.0-20190606100759-1b591f45e7e5
gitlab.ethz.ch/chgerber/annotation v0.0.0-20190607113039-396e9d537c9b
gitlab.ethz.ch/chgerber/monitor v0.0.0-20190527191251-2bb9dd731340
google.golang.org/api v0.5.0
)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -3,9 +3,14 @@ package youtubedl
import (
"context"
"encoding/json"
"encoding/xml"
"errors"
"fmt"
"github.com/asticode/go-astisub"
log "github.com/sirupsen/logrus"
"gitlab.ethz.ch/chgerber/MessageComposition/src/pkg/util"
"gitlab.ethz.ch/chgerber/annotation"
"gitlab.ethz.ch/chgerber/monitor"
"google.golang.org/api/googleapi/transport"
"google.golang.org/api/option"
"google.golang.org/api/youtube/v3"
......@@ -15,9 +20,21 @@ import (
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
)
// Subtitle is a wrapper type for YouTube subtitles that can be parsed by astisub.
// It embeds *astisub.Subtitles, so the embedded value's fields and methods are
// promoted to Subtitle.
type Subtitle struct {
*astisub.Subtitles
}
// Annotations converts the wrapped astisub subtitles into annotations whose
// source is set to src. The conversion itself is delegated to
// annotation.SubToAnnotation.
// It is the method required by the annotation.Caption interface.
func (s *Subtitle) Annotations(src string) []*annotation.Annotation {
	parsed := s.Subtitles
	return annotation.SubToAnnotation(parsed, src)
}
// SubtitleFormat names a subtitle file format by its file extension
// (e.g. `vtt`, `srt` or `srv3`).
type SubtitleFormat string
......@@ -36,15 +53,18 @@ const (
SubSTL SubtitleFormat = "stl"
// SubTTML caption of type ttml
SubTTML SubtitleFormat = "ttml"
// SubSRV3 caption of the youtube xml timedtext type with file extension .srv3
SubSRV3 SubtitleFormat = "srv3"
)
// SupportedSubs contains all supported subtitle formats as key. Value is an empty string.
var SupportedSubs = map[SubtitleFormat]string{
//SubVTT: "", // TODO support vtt from YouTube with timestamp tags
SubSRT: "",
SubSSA: "",
SubSTL: "",
SubTTML: "",
// SupportedSubs lists the subtitle formats that can be parsed, ordered from
// most preferred to least preferred.
// NOTE(review): SubSRV3 and SubVTT are commented out and therefore not part of
// the list; only the VTT entry states a reason.
var SupportedSubs = []SubtitleFormat{
//SubSRV3,
SubSRT,
SubTTML,
SubSSA,
SubSTL,
//SubVTT TODO Timing does not work with YouTube subs of this type. -> Fix parsing
}
// ErrYoutubeDLLangNotSupported is thrown when language not supported
......@@ -236,7 +256,7 @@ func downloadSubtitle(auto bool, id string, language string, format string, file
// GetSubtitle downloads the subtitle of the video id in the passed language.
// Throws an error if subtitle not found.
// Downloads the automatic caption when auto is true.
func GetSubtitle(auto bool, id string, language string) (sub *astisub.Subtitles, err error) {
func GetSubtitle(auto bool, id string, language string) (sub annotation.Caption, err error) {
var formats []string
if auto == false {
......@@ -259,28 +279,43 @@ func GetSubtitle(auto bool, id string, language string) (sub *astisub.Subtitles,
}
}
for _, format := range formats {
log.WithFields(log.Fields{
"videoID": id,
"language": language,
"auto": auto,
"types": formats,
}).Tracef("Available subtitle types for this video")
if _, ok := SupportedSubs[SubtitleFormat(format)]; !ok {
log.Infof("Subtitle %s is not supported", format)
continue
}
filename := filepath.Join("/tmp", "sub-"+id)
file := filename + "." + language + "." + format
for _, preferredSubType := range SupportedSubs {
err := downloadSubtitle(auto, id, language, format, filename)
defer os.Remove(file)
if err != nil {
log.Warn(err)
continue
}
for _, format := range formats {
sub, err := astisub.OpenFile(file)
if err != nil {
log.Warn(err)
continue
if preferredSubType.String() == format {
filename := filepath.Join("/tmp", "sub-"+id)
file := filename + "." + language + "." + format
err := downloadSubtitle(auto, id, language, format, filename)
defer os.Remove(file)
if err != nil {
log.Warn(err)
continue
}
sub, err := astisub.OpenFile(file)
if err != nil {
log.Warn(err)
continue
}
return &Subtitle{sub}, nil
}
}
return sub, nil
log.WithFields(log.Fields{"videoID": id,
"language": language,
"type": preferredSubType.String(),
}).Debugf("Preferred subtitle type not available for this video")
}
return nil, ErrYoutubeDL("No available subtitle is supported")
......@@ -342,3 +377,158 @@ func GetPlaylistVideos(playListID string, apiKey string) (members []string, err
return members, nil
}
// LoadYouTubeAnnotationsVideo downloads the subtitles of the specified YouTube video and creates annotations in the specified mongo db collection
// user supplied YouTube subtitles are preferred over auto-generated ones
func LoadYouTubeAnnotationsVideo(videoID string, language string, mongoCollection string) error {
var subs annotation.Caption
err := errors.New("")
// priorize manual subtitle before downloading automatic youtube caption (autogenerated)
subs, err = GetSubtitle(false, videoID, language)
if err != nil {
_, ok := err.(ErrYoutubeDLLangNotSupported)
if ok {
log.WithFields(log.Fields{"videoID": videoID, "language": language}).Trace("No manual subtitle found")
// download automatic caption
subs, err = GetSubtitle(true, videoID, language)
if err != nil {
return err
}
}
}
annotations := subs.Annotations("youtube://" + videoID)
log.WithFields(log.Fields{"videoID": videoID, "language": language, "quantity": len(annotations)}).Trace("Subtitle items found")
mongoPort, err := strconv.Atoi(os.Getenv("MONGO_PORT"))
collection, err := annotation.ConnectMongoDBCollection(os.Getenv("MONGO_HOST"), mongoPort, os.Getenv("MONGO_DB_NAME"), mongoCollection)
if err != nil {
return err
}
err = annotation.UploadToDB(annotations, collection)
if err != nil {
return err
}
return nil
}
// LoadYouTubeAnnotationsPlaylist loads the annotations of every video in the
// given YouTube playlist into the specified mongo db collection. Per video,
// user supplied subtitles are preferred over auto-generated ones.
//
// The playlist members are looked up with the API key taken from the
// YOUTUBE_DATA_API_KEY environment variable; a failing lookup is returned as
// error. A video that fails to load is logged as a warning and skipped, so a
// single failure does not abort the rest of the playlist.
func LoadYouTubeAnnotationsPlaylist(playlistID string, language string, mongoCollection string) error {
	defer monitor.Elapsed()()

	apiKey := os.Getenv("YOUTUBE_DATA_API_KEY")
	videoIDs, err := GetPlaylistVideos(playlistID, apiKey)
	if err != nil {
		return err
	}

	fields := log.Fields{"playlistID": playlistID, "language": language, "quantity": len(videoIDs)}
	log.WithFields(fields).Trace("Videos found in playlist")

	for i := range videoIDs {
		if loadErr := LoadYouTubeAnnotationsVideo(videoIDs[i], language, mongoCollection); loadErr != nil {
			log.Warning(loadErr)
		}
	}
	return nil
}
// TimedText describes a parsed YouTube caption file of the .srv3 format (xml).
type TimedText struct {
// XMLName binds the struct to the <timedtext> root element.
XMLName xml.Name `xml:"timedtext"`
// Format is read from the `format` attribute of the root element.
Format string `xml:"format,attr,omitempty"`
// Body is read from the <body> child element and holds the paragraphs.
Body Body `xml:"body"`
}
// Body contains all the paragraphs of a TimedText document.
type Body struct {
// Paragraphs collects every <p> child element.
Paragraphs []Paragraph `xml:"p"`
}
// Paragraph is one caption line (a sequence of words = segments).
// Time [ms] is relative to the start of the caption (t=0).
// Duration is read from the `d` attribute.
// NOTE(review): presumably also in milliseconds like Time — TODO confirm.
type Paragraph struct {
// Time is read from the `t` attribute.
Time int `xml:"t,attr"`
// Duration is read from the `d` attribute.
Duration int `xml:"d,attr"`
// Segments collects every <s> child element of the paragraph.
Segments []Segment `xml:"s,omitempty"`
}
// Segment is one word/term with its Time [ms] (relative to the Paragraph.Time) and Duration [ms].
type Segment struct {
// Time is read from the `t` attribute.
Time int `xml:"t,attr,omitempty"`
// Duration is read from the `ac` attribute.
// NOTE(review): confirm that `ac` really carries the duration of the segment.
// The trailing comma in the tag adds an empty option, which appears to be
// harmless for decoding — TODO confirm and drop it.
Duration int `xml:"ac,attr,"`
// Value is the character data of the element, i.e. the text of the segment.
Value string `xml:",chardata"`
}
// parseSRV3 unmarshals the raw XML content of a youtubedl subtitle of type
// .srv3 into a TimedText.
// It returns nil together with the xml.Unmarshal error when decoding fails.
func parseSRV3(file []byte) (*TimedText, error) {
	parsed := new(TimedText)
	if err := xml.Unmarshal(file, parsed); err != nil {
		return nil, err
	}
	return parsed, nil
}
// Annotations converts the TimedText into annotations whose source is set to
// src, using fromTimedText with a sliding window of at most three segments.
// It is the method required by the annotation.Caption interface.
func (s *TimedText) Annotations(src string) []*annotation.Annotation {
	// Maximum number of consecutive segments combined into one annotation.
	const windowLength = 3
	return fromTimedText(s, windowLength, src)
}
// fromTimedText creates annotations from a TimedText with the window slide
// approach: for every segment of a paragraph it emits one annotation for each
// window of 1..windowLength consecutive segments starting at that segment.
// Windows never cross a paragraph boundary; a window that would run past the
// last segment of its paragraph is not emitted.
//
// Every annotation carries src as source, the paragraph index as Count, the
// joined segment texts as Text, and a time span that starts at the first
// segment of the window and ends at the end of its last segment. Segment
// times are offset by the paragraph time.
func fromTimedText(subtitle *TimedText, windowLength int, src string) []*annotation.Annotation {
	var result []*annotation.Annotation
	for count, p := range subtitle.Body.Paragraphs {
		segments := p.Segments
		for first := range segments {
			// The longest window starting here is limited by both the
			// requested length and the segments left in this paragraph.
			longest := len(segments) - first
			if windowLength < longest {
				longest = windowLength
			}
			for size := 1; size <= longest; size++ {
				window := segments[first : first+size]

				// Collect the text of every segment inside the window.
				words := make([]string, 0, size)
				for _, seg := range window {
					words = append(words, seg.Value)
				}

				// The window spans from the start of its first segment to
				// the end of its last segment.
				head, tail := window[0], window[size-1]
				result = append(result, &annotation.Annotation{
					Src: src,
					Subtitle: annotation.Subtitle{
						Count: count,
						Text:  util.WordsToString(words),
						Start: p.Time + head.Time,
						End:   p.Time + tail.Time + tail.Duration,
					},
				})
			}
		}
	}
	return result
}
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment