Skip to content

Commit

Permalink
enhancement: better ytdlp integration
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Sep 16, 2024
1 parent f526b93 commit 4436efb
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 61 deletions.
47 changes: 44 additions & 3 deletions internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package crawl

import (
"errors"
"fmt"
"io"
"net/http"
"net/url"
Expand All @@ -10,6 +11,7 @@ import (
"time"

"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook"
Expand Down Expand Up @@ -343,20 +345,59 @@ func (c *Crawl) Capture(item *queue.Item) error {
// If it was a YouTube watch page, we potentially want to run it through the YouTube extractor
// TODO: support other watch page URLs
if !c.NoYTDLP && youtube.IsYouTubeWatchPage(item.URL) {
URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body)
streamURLs, metaURLs, rawJSON, HTTPHeaders, err := ytdlp.Parse(resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page")
return err
}
resp.Body.Close()

// Capture the 2 stream URLs for the video
var streamErrs []error
var streamWg sync.WaitGroup

for _, streamURL := range streamURLs {
streamWg.Add(1)
go func(streamURL *url.URL) {
defer streamWg.Done()
resp, err := c.executeGET(item, &http.Request{
Method: "GET",
URL: streamURL,
}, false)
if err != nil {
streamErrs = append(streamErrs, fmt.Errorf("error executing GET request for %s: %w", streamURL, err))
return
}
defer resp.Body.Close()

if resp.StatusCode != 200 {
streamErrs = append(streamErrs, fmt.Errorf("invalid status code for %s: %s", streamURL, resp.Status))
return
}

_, err = io.Copy(io.Discard, resp.Body)
if err != nil {
streamErrs = append(streamErrs, fmt.Errorf("error reading response body for %s: %w", streamURL, err))
}
}(streamURL)
}

streamWg.Wait()

if len(streamErrs) > 0 {
for _, err := range streamErrs {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while capturing stream URL")
}
return fmt.Errorf("errors occurred while capturing stream URLs: %v", streamErrs)
}

// Write the metadata record for the video
if rawJSON != "" {
c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON)
}

if len(URLs) > 0 {
c.captureAssets(item, URLs, resp.Cookies(), HTTPHeaders)
if len(metaURLs) > 0 {
c.captureAssets(item, metaURLs, resp.Cookies(), HTTPHeaders)
}

return nil
Expand Down
12 changes: 12 additions & 0 deletions internal/pkg/crawl/dependencies/ytdlp/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@ type Subtitle struct {

type Video struct {
ID string `json:"id"`
Title string `json:"title"`
Channel string `json:"channel"`
ChannelID string `json:"channel_id"`
ChannelURL string `json:"channel_url"`
Description string `json:"description"`
Timestamp int `json:"timestamp"`
Duration float64 `json:"duration"`
ViewCount float64 `json:"view_count"`
Tags []string `json:"tags"`
Categories []string `json:"categories"`
Thumbnail string `json:"thumbnail"`
Language string `json:"language"`
IsLive bool `json:"is_live"`
Subtitles map[string][]Subtitle `json:"subtitles"`
RequestedFormats []struct {
Expand Down
42 changes: 42 additions & 0 deletions internal/pkg/crawl/dependencies/ytdlp/parse.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package ytdlp

import (
	"fmt"
	"io"
	"net/url"
)

// Parse hands the given page body to yt-dlp and returns the URLs it reported,
// parsed into *url.URL values.
//
// The body is passed to serveBody, which returns a port and a stop channel;
// getJSON is then called with that port. The stop channel is closed when Parse
// returns (presumably shutting the temporary server down — confirm against
// serveBody).
//
// Returns:
//   - streamURLs: the parsed stream URLs returned by getJSON.
//   - metaURLs: the parsed metadata URLs returned by getJSON.
//   - rawJSON: the raw JSON string returned by getJSON.
//   - HTTPHeaders: the HTTP headers returned by getJSON.
//   - err: non-nil if serving the body, running getJSON, or parsing any URL
//     fails. On error every other result is its zero value, so callers never
//     see a half-filled result set.
func Parse(body io.ReadCloser) (streamURLs, metaURLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) {
	// Create a temporary server to serve the body and call ytdlp on it
	port, stopChan, err := serveBody(body)
	if err != nil {
		return nil, nil, "", nil, fmt.Errorf("serving body for yt-dlp: %w", err)
	}
	defer close(stopChan)

	// Call ytdlp on the temporary server
	rawStreamURLs, rawMetaURLs, rawJSON, HTTPHeaders, err := getJSON(port)
	if err != nil {
		return nil, nil, "", nil, fmt.Errorf("running yt-dlp: %w", err)
	}

	// Parse rawStreamURLs and rawMetaURLs as url.URL into streamURLs and metaURLs
	streamURLs, err = parseRawURLs(rawStreamURLs)
	if err != nil {
		return nil, nil, "", nil, fmt.Errorf("parsing stream URL: %w", err)
	}

	metaURLs, err = parseRawURLs(rawMetaURLs)
	if err != nil {
		return nil, nil, "", nil, fmt.Errorf("parsing meta URL: %w", err)
	}

	return streamURLs, metaURLs, rawJSON, HTTPHeaders, nil
}

// parseRawURLs parses every string in rawURLs with url.Parse, preserving order.
// It returns a nil slice for empty input and stops at the first parse error,
// returning that error and no URLs.
func parseRawURLs(rawURLs []string) ([]*url.URL, error) {
	if len(rawURLs) == 0 {
		return nil, nil
	}

	parsed := make([]*url.URL, 0, len(rawURLs))
	for _, rawURL := range rawURLs {
		u, err := url.Parse(rawURL)
		if err != nil {
			return nil, err
		}

		parsed = append(parsed, u)
	}

	return parsed, nil
}
2 changes: 1 addition & 1 deletion internal/pkg/crawl/dependencies/ytdlp/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"strings"
)

func ServeBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) {
func serveBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) {
stopChan = make(chan struct{})
portChan := make(chan int)

Expand Down
59 changes: 36 additions & 23 deletions internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ import (
"strconv"
)

func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]string, err error) {
func getJSON(port int) (streamURLs, metaURLs []string, rawJSON string, HTTPHeaders map[string]string, err error) {
HTTPHeaders = make(map[string]string)

// Prepare the command
cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port), "-f", "bv[protocol=https]+ba[protocol=https]")
cmd := exec.Command("yt-dlp", "http://localhost:"+strconv.Itoa(port), "--dump-json", "-f", "bv[protocol=https]+ba[protocol=https]")

// Buffers to capture stdout and stderr
var stdout, stderr bytes.Buffer
Expand All @@ -22,7 +22,7 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]st
// Run the command
err = cmd.Run()
if err != nil {
return URLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
}

output := stdout.String()
Expand All @@ -31,20 +31,7 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]st
var video Video
err = json.Unmarshal([]byte(output), &video)
if err != nil {
return nil, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
}

// Get all subtitles (not automatic captions)
var subtitleURLs []string
for _, subtitle := range video.Subtitles {
for _, sub := range subtitle {
subtitleURLs = append(subtitleURLs, sub.URL)
}
}

// Get all thumbnail URLs
for _, thumbnail := range video.Thumbnails {
URLs = append(URLs, thumbnail.URL)
return streamURLs, metaURLs, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
}

// Get the manifest URL for the best video & audio quality
Expand All @@ -53,24 +40,50 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders map[string]st
if len(video.RequestedFormats) > 0 {
HTTPHeaders = video.RequestedFormats[0].HTTPHeaders
for _, format := range video.RequestedFormats {
URLs = append(URLs, format.URL+"&video_id="+video.ID)
// Choose stream_type=
// If acodec == "none" and vcodec != "none", it's "video"
// If acodec != "none" and vcodec == "none", it's "audio"
// If acodec != "none" and vcodec != "none", we don't specify stream_type
var streamType string
if format.Acodec == "none" && format.Vcodec != "none" {
streamType = "video"
} else if format.Acodec != "none" && format.Vcodec == "none" {
streamType = "audio"
}

var URL = format.URL + "&video_id=" + video.ID
if streamType != "" {
URL += "&stream_type=" + streamType
}

streamURLs = append(streamURLs, URL)
}
}
}

// Get all subtitles (not automatic captions)
for _, subtitle := range video.Subtitles {
for _, sub := range subtitle {
metaURLs = append(metaURLs, sub.URL)
}
}

// Get all thumbnail URLs
for _, thumbnail := range video.Thumbnails {
metaURLs = append(metaURLs, thumbnail.URL)
}

// Get the storyboards
for _, format := range video.Formats {
if format.FormatNote == "storyboard" {
URLs = append(URLs, format.URL)
metaURLs = append(metaURLs, format.URL)
for _, fragment := range format.Fragments {
URLs = append(URLs, fragment.URL)
metaURLs = append(metaURLs, fragment.URL)
}
}
}

URLs = append(URLs, subtitleURLs...)

return URLs, output, HTTPHeaders, nil
return streamURLs, metaURLs, output, HTTPHeaders, nil
}

func FindPath() (string, bool) {
Expand Down
5 changes: 2 additions & 3 deletions internal/pkg/crawl/extractor/xml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ func TestXML(t *testing.T) {
Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
}

gotURLs, err := XML(resp)

gotURLs, _, err := XML(resp)
if (err != nil) != tt.wantErr {
t.Errorf("XML() error = %v, wantErr %v", err, tt.wantErr)
return
Expand All @@ -83,7 +82,7 @@ func TestXMLBodyReadError(t *testing.T) {
}
resp.Body.Close() // Close the body to simulate a read error

_, err := XML(resp)
_, _, err := XML(resp)
if err == nil {
t.Errorf("XML() expected error, got nil")
}
Expand Down
44 changes: 17 additions & 27 deletions internal/pkg/crawl/sitespecific/youtube/youtube.go
Original file line number Diff line number Diff line change
@@ -1,42 +1,32 @@
package youtube

import (
"io"
"net/url"
"strings"

"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
)

// IsYouTubeWatchPage reports whether URL points at a YouTube watch page: the
// host must be youtube.com or one of its subdomains, and the path must contain
// "/watch" or "/v/".
//
// The host is compared by hostname (port stripped, lower-cased, trailing dot
// removed) rather than by substring, so look-alike hosts such as
// "notyoutube.com" or "youtube.com.example.org" are rejected. A nil URL is
// reported as false instead of panicking.
func IsYouTubeWatchPage(URL *url.URL) bool {
	if URL == nil {
		return false
	}

	host := strings.TrimSuffix(strings.ToLower(URL.Hostname()), ".")
	if host != "youtube.com" && !strings.HasSuffix(host, ".youtube.com") {
		return false
	}

	return strings.Contains(URL.Path, "/watch") || strings.Contains(URL.Path, "/v/")
}

func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) {
HTTPHeaders = make(map[string]string)

// Create a temporary server to serve the body and call ytdlp on it
port, stopChan, err := ytdlp.ServeBody(body)
if err != nil {
return nil, rawJSON, HTTPHeaders, err
}
defer close(stopChan)
// func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders map[string]string, err error) {
// HTTPHeaders = make(map[string]string)

// Call ytdlp on the temporary server
rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON(port)
if err != nil {
return nil, rawJSON, HTTPHeaders, err
}
// // Call ytdlp on the temporary server
// rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON()
// if err != nil {
// return nil, rawJSON, HTTPHeaders, err
// }

// Parse the URLs
for _, urlString := range rawURLs {
URL, err := url.Parse(urlString)
if err != nil {
return nil, rawJSON, HTTPHeaders, err
}
// // Parse the URLs
// for _, urlString := range rawURLs {
// URL, err := url.Parse(urlString)
// if err != nil {
// return nil, rawJSON, HTTPHeaders, err
// }

URLs = append(URLs, URL)
}
// URLs = append(URLs, URL)
// }

return URLs, rawJSON, HTTPHeaders, nil
}
// return URLs, rawJSON, HTTPHeaders, nil
// }
10 changes: 6 additions & 4 deletions internal/pkg/crawl/sitespecific/youtube/youtube_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package youtube
import (
"os"
"testing"

"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
)

func TestParse(t *testing.T) {
Expand All @@ -14,7 +16,7 @@ func TestParse(t *testing.T) {
defer f.Close()

// Parse the video
URLs, rawJSON, _, err := Parse(f)
streamURLs, metaURLs, rawJSON, _, err := ytdlp.Parse(f)
if err != nil {
t.Fatal(err)
}
Expand All @@ -25,8 +27,8 @@ func TestParse(t *testing.T) {
}

// Check the number of URLs
expected := 204
if len(URLs) != expected {
t.Fatalf("Expected %d URLs, got %d", expected, len(URLs))
expected := 174
if len(streamURLs)+len(metaURLs) != expected {
t.Fatalf("Expected %d URLs, got %d", expected, len(streamURLs)+len(metaURLs))
}
}

0 comments on commit 4436efb

Please sign in to comment.