From 4436efbb8aa3dd2e966d40c06d2676c35818c17a Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 16 Sep 2024 22:50:10 +0200 Subject: [PATCH] enhancement: better ytdlp integration --- internal/pkg/crawl/capture.go | 47 ++++++++++++++- .../pkg/crawl/dependencies/ytdlp/model.go | 12 ++++ .../pkg/crawl/dependencies/ytdlp/parse.go | 42 +++++++++++++ .../pkg/crawl/dependencies/ytdlp/server.go | 2 +- .../pkg/crawl/dependencies/ytdlp/ytdlp.go | 59 +++++++++++-------- internal/pkg/crawl/extractor/xml_test.go | 5 +- .../pkg/crawl/sitespecific/youtube/youtube.go | 44 ++++++-------- .../sitespecific/youtube/youtube_test.go | 10 ++-- 8 files changed, 160 insertions(+), 61 deletions(-) create mode 100644 internal/pkg/crawl/dependencies/ytdlp/parse.go diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index 15796462..de9b6147 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -2,6 +2,7 @@ package crawl import ( "errors" + "fmt" "io" "net/http" "net/url" @@ -10,6 +11,7 @@ import ( "time" "github.com/PuerkitoBio/goquery" + "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook" @@ -343,20 +345,59 @@ func (c *Crawl) Capture(item *queue.Item) error { // If it was a YouTube watch page, we potentially want to run it through the YouTube extractor // TODO: support other watch page URLs if !c.NoYTDLP && youtube.IsYouTubeWatchPage(item.URL) { - URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body) + streamURLs, metaURLs, rawJSON, HTTPHeaders, err := ytdlp.Parse(resp.Body) if err != nil { c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page") return err } resp.Body.Close() + // Capture the 2 stream URLs for the video + var streamErrs []error + var streamWg sync.WaitGroup + + for _, streamURL := range streamURLs { + streamWg.Add(1) + go func(streamURL *url.URL) { + defer streamWg.Done() + resp, err := c.executeGET(item, &http.Request{ + Method: "GET", + URL: streamURL, + }, false) + if err != nil { + streamErrs = append(streamErrs, fmt.Errorf("error executing GET request for %s: %w", streamURL, err)) + return + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + streamErrs = append(streamErrs, fmt.Errorf("invalid status code for %s: %s", streamURL, resp.Status)) + return + } + + _, err = io.Copy(io.Discard, resp.Body) + if err != nil { + streamErrs = append(streamErrs, fmt.Errorf("error reading response body for %s: %w", streamURL, err)) + } + }(streamURL) + } + + streamWg.Wait() + + if len(streamErrs) > 0 { + for _, err := range streamErrs { + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while capturing stream URL") + } + return fmt.Errorf("errors occurred while capturing stream URLs: %v", streamErrs) + } + // Write the metadata record for the video if rawJSON != "" { c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON) } - if len(URLs) > 0 { - c.captureAssets(item, URLs, resp.Cookies(), HTTPHeaders) + if len(metaURLs) > 0 { + c.captureAssets(item, metaURLs, resp.Cookies(), HTTPHeaders) } return nil diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go index d9856878..c0e2c503 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/model.go +++ b/internal/pkg/crawl/dependencies/ytdlp/model.go @@ -8,6 +8,18 @@ type Subtitle struct { type Video struct { ID string `json:"id"` + Title string `json:"title"` + Channel string `json:"channel"` + ChannelID string `json:"channel_id"` + ChannelURL string `json:"channel_url"` + Description string `json:"description"` + Timestamp int `json:"timestamp"` + Duration float64 `json:"duration"` + ViewCount float64 `json:"view_count"` + Tags []string `json:"tags"` + Categories []string `json:"categories"` + Thumbnail string `json:"thumbnail"` + Language string `json:"language"` IsLive bool `json:"is_live"` Subtitles map[string][]Subtitle `json:"subtitles"` RequestedFormats []struct { diff --git a/internal/pkg/crawl/dependencies/ytdlp/parse.go b/internal/pkg/crawl/dependencies/ytdlp/parse.go new file mode 100644 index 00000000..1c905a66 --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/parse.go @@ -0,0 +1,42 @@ +package ytdlp + +import ( + "io" + "net/url" +) + +func Parse(body io.ReadCloser) (streamURLs, metaURLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) { + // Create a temporary server to serve the body and call ytdlp on it + port, stopChan, err := serveBody(body) + if err != nil { + return streamURLs, metaURLs, rawJSON, HTTPHeaders, err + } + defer close(stopChan) + + // Call ytdlp on the temporary server + rawStreamURLs, rawMetaURLs, rawJSON, HTTPHeaders, err := getJSON(port) + if err != nil { + return streamURLs, metaURLs, rawJSON, HTTPHeaders, err + } + + // Range over rawStreamURLs and rawMetaURLs to parse them as url.URL in videoURLs and metaURLs + for _, urlString := range rawStreamURLs { + URL, err := url.Parse(urlString) + if err != nil { + return streamURLs, metaURLs, rawJSON, HTTPHeaders, err + } + + streamURLs = append(streamURLs, URL) + } + + for _, urlString := range rawMetaURLs { + URL, err := url.Parse(urlString) + if err != nil { + return streamURLs, metaURLs, rawJSON, HTTPHeaders, err + } + + metaURLs = append(metaURLs, URL) + } + + return streamURLs, metaURLs, rawJSON, HTTPHeaders, nil +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/server.go b/internal/pkg/crawl/dependencies/ytdlp/server.go index 334c7ee4..4d0e34c4 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/server.go +++ b/internal/pkg/crawl/dependencies/ytdlp/server.go @@ -7,7 +7,7 @@ import ( "strings" ) -func ServeBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) { +func serveBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) { stopChan = make(chan struct{}) portChan := make(chan int) diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go index dd018b98..1d628247 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -8,11 +8,11 @@ import ( "strconv" ) -func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]string, err error) { +func getJSON(port int) (streamURLs, metaURLs []string, rawJSON string, HTTPHeaders map[string]string, err error) { HTTPHeaders = make(map[string]string) // Prepare the command - cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port), "-f", "bv[protocol=https]+ba[protocol=https]") + cmd := exec.Command("yt-dlp", "http://localhost:"+strconv.Itoa(port), "--dump-json", "-f", "bv[protocol=https]+ba[protocol=https]") // Buffers to capture stdout and stderr var stdout, stderr bytes.Buffer @@ -22,7 +22,7 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]st // Run the command err = cmd.Run() if err != nil { - return URLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) + return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) } output := stdout.String() @@ -31,20 +31,7 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]st var video Video err = json.Unmarshal([]byte(output), &video) if err != nil { - return nil, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) - } - - // Get all subtitles (not automatic captions) - var subtitleURLs []string - for _, subtitle := range video.Subtitles { - for _, sub := range subtitle { - subtitleURLs = append(subtitleURLs, sub.URL) - } - } - - // Get all thumbnail URLs - for _, thumbnail := range video.Thumbnails { - URLs = append(URLs, thumbnail.URL) + return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) } // Get the manifest URL for the best video & audio quality @@ -53,24 +40,50 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]st if len(video.RequestedFormats) > 0 { HTTPHeaders = video.RequestedFormats[0].HTTPHeaders for _, format := range video.RequestedFormats { - URLs = append(URLs, format.URL+"&video_id="+video.ID) + // Choose stream_type= + // If acodec == "none" and vcodec != "none", it's "video" + // If acodec != "none" and vcodec == "none", it's "audio" + // If acodec != "none" and vcodec != "none", we don't specify stream_type + var streamType string + if format.Acodec == "none" && format.Vcodec != "none" { + streamType = "video" + } else if format.Acodec != "none" && format.Vcodec == "none" { + streamType = "audio" + } + + var URL = format.URL + "&video_id=" + video.ID + if streamType != "" { + URL += "&stream_type=" + streamType + } + + streamURLs = append(streamURLs, URL) } } } + // Get all subtitles (not automatic captions) + for _, subtitle := range video.Subtitles { + for _, sub := range subtitle { + metaURLs = append(metaURLs, sub.URL) + } + } + + // Get all thumbnail URLs + for _, thumbnail := range video.Thumbnails { + metaURLs = append(metaURLs, thumbnail.URL) + } + // Get the storyboards for _, format := range video.Formats { if format.FormatNote == "storyboard" { - URLs = append(URLs, format.URL) + metaURLs = append(metaURLs, format.URL) for _, fragment := range format.Fragments { - URLs = append(URLs, fragment.URL) + metaURLs = append(metaURLs, fragment.URL) } } } - URLs = append(URLs, subtitleURLs...) - - return URLs, output, HTTPHeaders, nil + return streamURLs, metaURLs, output, HTTPHeaders, nil } func FindPath() (string, bool) { diff --git a/internal/pkg/crawl/extractor/xml_test.go b/internal/pkg/crawl/extractor/xml_test.go index da732228..40b29cc3 100644 --- a/internal/pkg/crawl/extractor/xml_test.go +++ b/internal/pkg/crawl/extractor/xml_test.go @@ -63,8 +63,7 @@ func TestXML(t *testing.T) { Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)), } - gotURLs, err := XML(resp) - + gotURLs, _, err := XML(resp) if (err != nil) != tt.wantErr { t.Errorf("XML() error = %v, wantErr %v", err, tt.wantErr) return @@ -83,7 +82,7 @@ func TestXMLBodyReadError(t *testing.T) { } resp.Body.Close() // Close the body to simulate a read error - _, err := XML(resp) + _, _, err := XML(resp) if err == nil { t.Errorf("XML() expected error, got nil") } diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/internal/pkg/crawl/sitespecific/youtube/youtube.go index c11c6fc1..e692b0c1 100644 --- a/internal/pkg/crawl/sitespecific/youtube/youtube.go +++ b/internal/pkg/crawl/sitespecific/youtube/youtube.go @@ -1,42 +1,32 @@ package youtube import ( - "io" "net/url" "strings" - - "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" ) func IsYouTubeWatchPage(URL *url.URL) bool { return strings.Contains(URL.Host, "youtube.com") && (strings.Contains(URL.Path, "/watch") || strings.Contains(URL.Path, "/v/")) } -func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) { - HTTPHeaders = make(map[string]string) - - // Create a temporary server to serve the body and call ytdlp on it - port, stopChan, err := ytdlp.ServeBody(body) - if err != nil { - return nil, rawJSON, HTTPHeaders, err - } - defer close(stopChan) +// func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) { +// HTTPHeaders = make(map[string]string) - // Call ytdlp on the temporary server - rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON(port) - if err != nil { - return nil, rawJSON, HTTPHeaders, err - } +// // Call ytdlp on the temporary server +// rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON() +// if err != nil { +// return nil, rawJSON, HTTPHeaders, err +// } - // Parse the URLs - for _, urlString := range rawURLs { - URL, err := url.Parse(urlString) - if err != nil { - return nil, rawJSON, HTTPHeaders, err - } +// // Parse the URLs +// for _, urlString := range rawURLs { +// URL, err := url.Parse(urlString) +// if err != nil { +// return nil, rawJSON, HTTPHeaders, err +// } - URLs = append(URLs, URL) - } +// URLs = append(URLs, URL) +// } - return URLs, rawJSON, HTTPHeaders, nil -} +// return URLs, rawJSON, HTTPHeaders, nil +// } diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go index 5a86ade0..5a7c43d8 100644 --- a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go +++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go @@ -3,6 +3,8 @@ package youtube import ( "os" "testing" + + "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" ) func TestParse(t *testing.T) { @@ -14,7 +16,7 @@ func TestParse(t *testing.T) { defer f.Close() // Parse the video - URLs, rawJSON, _, err := Parse(f) + streamURLs, metaURLs, rawJSON, _, err := ytdlp.Parse(f) if err != nil { t.Fatal(err) } @@ -25,8 +27,8 @@ func TestParse(t *testing.T) { } // Check the number of URLs - expected := 204 - if len(URLs) != expected { - t.Fatalf("Expected %d URLs, got %d", expected, len(URLs)) + expected := 174 + if len(streamURLs)+len(metaURLs) != expected { + t.Fatalf("Expected %d URLs, got %d", expected, len(streamURLs)+len(metaURLs)) } }