To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

Commit 205c6c06 authored by Christof Gerber's avatar Christof Gerber
Browse files

Support also srv3 parsing for manual youtube subs

parent 8106b6d0
Pipeline #35960 passed with stages
in 7 minutes and 16 seconds
This diff is collapsed.
......@@ -14,6 +14,7 @@ import (
"google.golang.org/api/googleapi/transport"
"google.golang.org/api/option"
"google.golang.org/api/youtube/v3"
"io/ioutil"
"net/http"
"net/url"
"os"
......@@ -59,7 +60,7 @@ const (
// SupportedSubs is a list in preferred order of subtitle types that are supported to be parsed
var SupportedSubs = []SubtitleFormat{
//SubSRV3,
SubSRV3,
SubSRT,
SubTTML,
SubSSA,
......@@ -295,19 +296,20 @@ func GetSubtitle(auto bool, id string, language string) (sub annotation.Caption,
filename := filepath.Join("/tmp", "sub-"+id)
file := filename + "." + language + "." + format
err := downloadSubtitle(auto, id, language, format, filename)
defer os.Remove(file)
err := downloadSubtitle(auto, id, language, format, filename)
if err != nil {
log.Warn(err)
continue
}
sub, err := astisub.OpenFile(file)
sub, err := parseSub(file, preferredSubType)
if err != nil {
log.Warn(err)
continue
}
return &Subtitle{sub}, nil
return sub, nil
}
}
......@@ -457,6 +459,7 @@ type Body struct {
// Paragraph is a single caption paragraph of a TimedText body as decoded
// from XML. A paragraph either carries its text directly (Value) or is split
// into Segments.
type Paragraph struct {
// Time is read from the "t" attribute and is used as the start of the
// resulting annotation. NOTE(review): presumably milliseconds - confirm.
Time int `xml:"t,attr"`
// Duration is read from the "d" attribute; Time + Duration is used as the
// end of the resulting annotation.
Duration int `xml:"d,attr"`
// Value is the character data of the paragraph element. It is used as the
// caption text when the paragraph has no segments (manual captions).
Value string `xml:",chardata"`
// Segments are the "s" child elements of the paragraph, present for
// automatic captions with word granularity timing.
Segments []Segment `xml:"s,omitempty"`
}
......@@ -493,6 +496,24 @@ func fromTimedText(subtitle *TimedText, windowLength int, src string) []*annotat
for paragraphNum, paragraph := range subtitle.Body.Paragraphs {
// Handle manual captions which have no segments (no word granularity timing)
if len(paragraph.Segments) == 0 && paragraph.Value != "" {
// Create annotation for the paragraph
a := &annotation.Annotation{
Src: src,
Subtitle: annotation.Subtitle{
Count: paragraphNum,
Text: paragraph.Value,
Start: paragraph.Time,
End: paragraph.Time + paragraph.Duration,
},
}
windowAnnotations = append(windowAnnotations, a)
}
// Handle automatic captions with segments (word granularity timing)
for startSegmentIdx := range paragraph.Segments {
for wLength := 1; wLength <= windowLength; wLength++ {
......@@ -532,3 +553,38 @@ func fromTimedText(subtitle *TimedText, windowLength int, src string) []*annotat
return windowAnnotations
}
// parseSub loads the subtitle file at path file and parses it according to
// format.
//
// SubSRV3 files are read completely into memory and handed to parseSRV3.
// Every other format is delegated to astisub.OpenFile and the result is
// wrapped in a Subtitle. On failure the returned caption is nil and the
// underlying error is passed through unchanged.
func parseSub(file string, format SubtitleFormat) (annotation.Caption, error) {
	if format == SubSRV3 {
		// ioutil.ReadFile opens, reads and closes the file in one call, so no
		// file handle has to be managed here.
		byteValue, err := ioutil.ReadFile(file)
		if err != nil {
			return nil, err
		}

		timedText, err := parseSRV3(byteValue)
		if err != nil {
			// Return a literal nil so the caller never receives a non-nil
			// interface wrapping a nil pointer.
			return nil, err
		}
		return timedText, nil
	}

	sub, err := astisub.OpenFile(file)
	if err != nil {
		return nil, err
	}
	return &Subtitle{sub}, nil
}
......@@ -162,7 +162,7 @@ func TestYouTubeSRV3Loader(t *testing.T) {
}
func TestTimedTextToAnnotation(t *testing.T) {
func TestTimedTextToAnnotationAutomaticCaption(t *testing.T) {
xmlFile, err := os.Open("testData/testSubYouTube-VIlLpnJJl_4.en.srv3")
assert.Equal(t, nil, err)
// defer the closing of our xmlFile so that we can parse it later on
......@@ -201,9 +201,9 @@ func TestTimedTextToAnnotation(t *testing.T) {
Src: "youtube://VIlLpnJJl_4",
Subtitle: annotation.Subtitle{
Count: 2,
Text: "12 hours a",
Text: "12 hours",
Start: 6120,
End: 6821,
End: 6672,
},
}
assert.Equal(t, aExpect, *a[32])
......@@ -255,3 +255,72 @@ func TestNumberOfProducedAnnotations(t *testing.T) {
}
}
func TestSRV3parseManualCaption(t *testing.T) {
xmlFile, err := os.Open("testData/subManual-OsFEV35tWsg.en.srv3")
assert.Equal(t, nil, err)
// defer the closing of our xmlFile so that we can parse it later on
defer xmlFile.Close()
// read our opened xmlFile as a byte array.
byteValue, err := ioutil.ReadAll(xmlFile)
assert.Equal(t, nil, err)
timedText, err := parseSRV3(byteValue)
assert.Equal(t, nil, err)
assert.Equal(t, 12794, timedText.Body.Paragraphs[0].Time)
assert.Equal(t, 3556, timedText.Body.Paragraphs[0].Duration)
assert.Equal(t, "Philosophers, dramatists, theologians", timedText.Body.Paragraphs[0].Value)
assert.Equal(t, 18604, timedText.Body.Paragraphs[2].Time)
assert.Equal(t, 1532, timedText.Body.Paragraphs[2].Duration)
assert.Equal(t, "what makes people go wrong?", timedText.Body.Paragraphs[2].Value)
}
func TestTimedTextToAnnotationManualCaption(t *testing.T) {
xmlFile, err := os.Open("testData/subManual-OsFEV35tWsg.en.srv3")
assert.Equal(t, nil, err)
// defer the closing of our xmlFile so that we can parse it later on
defer xmlFile.Close()
// read our opened xmlFile as a byte array.
byteValue, err := ioutil.ReadAll(xmlFile)
assert.Equal(t, nil, err)
timedText, err := parseSRV3(byteValue)
assert.Equal(t, nil, err)
a := timedText.Annotations("youtube://OsFEV35tWsg")
aExpect := annotation.Annotation{
Src: "youtube://OsFEV35tWsg",
Subtitle: annotation.Subtitle{
Count: 0,
Text: "Philosophers, dramatists, theologians",
Start: 12794,
End: 16350,
},
}
assert.Equal(t, aExpect, *a[0])
aExpect = annotation.Annotation{
Src: "youtube://OsFEV35tWsg",
Subtitle: annotation.Subtitle{
Count: 15,
Text: "with them on the good side,\nthe others on the bad side --",
Start: 56652,
End: 59325,
},
}
assert.Equal(t, aExpect, *a[15])
aExpect = annotation.Annotation{
Src: "youtube://OsFEV35tWsg",
Subtitle: annotation.Subtitle{
Count: 32,
Text: "And apparently, he disobeyed God,",
Start: 104661,
End: 107866,
},
}
assert.Equal(t, aExpect, *a[32])
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment