From 7c95917040722ff71739e7a6349691d3a6dc2334 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 12 Aug 2024 16:58:45 +0200 Subject: [PATCH 01/12] add: yt-dlp support to gather YouTube URLs from watch pages --- cmd/get.go | 4 + config/config.go | 4 + internal/pkg/crawl/assets.go | 108 ++++++++++++++++ internal/pkg/crawl/capture.go | 120 +++--------------- internal/pkg/crawl/config.go | 12 +- internal/pkg/crawl/crawl.go | 19 ++- .../pkg/crawl/dependencies/ytdlp/model.go | 11 ++ .../pkg/crawl/dependencies/ytdlp/parse.go | 42 ++++++ .../pkg/crawl/dependencies/ytdlp/server.go | 46 +++++++ .../pkg/crawl/dependencies/ytdlp/ytdlp.go | 65 ++++++++++ .../pkg/crawl/sitespecific/youtube/youtube.go | 35 +++++ .../sitespecific/youtube/youtube_test.go | 27 ++++ .../sitespecific/youtube/youtube_test.html | 88 +++++++++++++ 13 files changed, 475 insertions(+), 106 deletions(-) create mode 100644 internal/pkg/crawl/dependencies/ytdlp/model.go create mode 100644 internal/pkg/crawl/dependencies/ytdlp/parse.go create mode 100644 internal/pkg/crawl/dependencies/ytdlp/server.go create mode 100644 internal/pkg/crawl/dependencies/ytdlp/ytdlp.go create mode 100644 internal/pkg/crawl/sitespecific/youtube/youtube.go create mode 100644 internal/pkg/crawl/sitespecific/youtube/youtube_test.go create mode 100644 internal/pkg/crawl/sitespecific/youtube/youtube_test.html diff --git a/cmd/get.go b/cmd/get.go index 78e1a488..f73bd1dc 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -84,6 +84,10 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().String("es-password", "", "ElasticSearch password to use for indexing crawl logs.") getCmd.PersistentFlags().String("es-index-prefix", "zeno", "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`") + // Dependencies flags + getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.") + getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.") + // Alias support // As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually // This is a workaround to allow users to use `--hops` instead of `--max-hops` for example diff --git a/config/config.go b/config/config.go index 298da11d..08745d6d 100644 --- a/config/config.go +++ b/config/config.go @@ -76,6 +76,10 @@ type Config struct { NoStdoutLogging bool `mapstructure:"no-stdout-log"` NoHandover bool `mapstructure:"no-handover"` NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"` + + // Dependencies + NoYTDLP bool `mapstructure:"no-ytdlp"` + YTDLPPath string `mapstructure:"ytdlp-path"` } var ( diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index f2783e77..541628cf 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -1,16 +1,124 @@ package crawl import ( + "io" + "net/http" "net/url" "regexp" "strings" + "sync/atomic" "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/remeh/sizedwaitgroup" ) +func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { + var resp *http.Response + + // Prepare GET request + req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) + if err != nil { + return err + } + + req.Header.Set("Referer", utils.URLToString(item.ParentURL)) + req.Header.Set("User-Agent", c.UserAgent) + + // 
Apply cookies obtained from the original URL captured + for i := range cookies { + req.AddCookie(cookies[i]) + } + + resp, err = c.executeGET(item, req, false) + if err != nil && err.Error() == "URL from redirection has already been seen" { + return nil + } else if err != nil { + return err + } + defer resp.Body.Close() + + // needed for WARC writing + io.Copy(io.Discard, resp.Body) + + return nil +} + +func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie) { + // TODO: implement a counter for the number of assets + // currently being processed + // c.Frontier.QueueCount.Incr(int64(len(assets))) + swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) + excluded := false + + for _, asset := range assets { + // TODO: implement a counter for the number of assets + // currently being processed + // c.Frontier.QueueCount.Incr(-1) + + // Just making sure we do not over archive by archiving the original URL + if utils.URLToString(item.URL) == utils.URLToString(asset) { + continue + } + + // We ban googlevideo.com URLs because they are heavily rate limited by default, and + // we don't want the crawler to spend an innapropriate amount of time archiving them + if strings.Contains(item.URL.Host, "googlevideo.com") { + continue + } + + // If the URL match any excluded string, we ignore it + for _, excludedString := range c.ExcludedStrings { + if strings.Contains(utils.URLToString(asset), excludedString) { + excluded = true + break + } + } + + if excluded { + excluded = false + continue + } + + swg.Add() + c.URIsPerSecond.Incr(1) + + go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { + defer swg.Done() + + // Create the asset's item + newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) + if err != nil { + c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ + "parentHop": item.Hop, + "parentUrl": utils.URLToString(item.URL), + "type": "asset", + })).Error("error while creating asset item") + return + } + + // Capture the asset + err = c.captureAsset(newAsset, cookies) + if err != nil { + c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ + "parentHop": item.Hop, + "parentUrl": utils.URLToString(item.URL), + "type": "asset", + })).Error("error while capturing asset") + return + } + + // If we made it to this point, it means that the asset have been crawled successfully, + // then we can increment the locallyCrawled variable + atomic.AddUint64(&item.LocallyCrawled, 1) + }(asset, &swg) + } + + swg.Wait() +} + func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) { var rawAssets []string diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index 856d2852..b33a33c3 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -8,7 +8,6 @@ import ( "net/url" "strings" "sync" - "sync/atomic" "time" "github.com/PuerkitoBio/goquery" @@ -20,9 +19,9 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/tiktok" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/truthsocial" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/vk" + "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/youtube" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/utils" - "github.com/remeh/sizedwaitgroup" ) func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bool) (resp *http.Response, err 
error) { @@ -188,37 +187,6 @@ func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bo return resp, nil } -func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { - var resp *http.Response - - // Prepare GET request - req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) - if err != nil { - return err - } - - req.Header.Set("Referer", utils.URLToString(item.ParentURL)) - req.Header.Set("User-Agent", c.UserAgent) - - // Apply cookies obtained from the original URL captured - for i := range cookies { - req.AddCookie(cookies[i]) - } - - resp, err = c.executeGET(item, req, false) - if err != nil && err.Error() == "URL from redirection has already been seen" { - return nil - } else if err != nil { - return err - } - defer resp.Body.Close() - - // needed for WARC writing - io.Copy(io.Discard, resp.Body) - - return nil -} - // Capture capture the URL and return the outlinks func (c *Crawl) Capture(item *queue.Item) error { var ( @@ -370,6 +338,22 @@ func (c *Crawl) Capture(item *queue.Item) error { } defer resp.Body.Close() + // If it was a YouTube watch page, we potentially want to run it through the YouTube extractor + // TODO: support other watch page URLs + if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP { + URLs, err := youtube.Parse(resp.Body) + if err != nil { + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page") + return err + } + + if len(URLs) > 0 { + c.captureAssets(item, URLs, resp.Cookies()) + } + + return nil + } + // Scrape potential URLs from Link HTTP header var ( links = Parse(resp.Header.Get("link")) @@ -577,76 +561,8 @@ func (c *Crawl) Capture(item *queue.Item) error { } } - // TODO: implement a counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(int64(len(assets))) - swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) - excluded := false - - for _, asset := range assets { - // TODO: implement a counter for the number of assets - // currently being processed - // c.Frontier.QueueCount.Incr(-1) - - // Just making sure we do not over archive by archiving the original URL - if utils.URLToString(item.URL) == utils.URLToString(asset) { - continue - } - - // We ban googlevideo.com URLs because they are heavily rate limited by default, and - // we don't want the crawler to spend an innapropriate amount of time archiving them - if strings.Contains(item.URL.Host, "googlevideo.com") { - continue - } - - // If the URL match any excluded string, we ignore it - for _, excludedString := range c.ExcludedStrings { - if strings.Contains(utils.URLToString(asset), excludedString) { - excluded = true - break - } - } - - if excluded { - excluded = false - continue - } - - swg.Add() - c.URIsPerSecond.Incr(1) - - go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { - defer swg.Done() - - // Create the asset's item - newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) - if err != nil { - c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - })).Error("error while creating asset item") - return - } - - // Capture the asset - err = c.captureAsset(newAsset, resp.Cookies()) - if err != nil { - c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ - "parentHop": item.Hop, - "parentUrl": utils.URLToString(item.URL), - "type": "asset", - 
})).Error("error while capturing asset") - return - } - - // If we made it to this point, it means that the asset have been crawled successfully, - // then we can increment the locallyCrawled variable - atomic.AddUint64(&item.LocallyCrawled, 1) - }(asset, &swg) - } + c.captureAssets(item, assets, resp.Cookies()) - swg.Wait() return err } diff --git a/internal/pkg/crawl/config.go b/internal/pkg/crawl/config.go index 5e573144..2dff2270 100644 --- a/internal/pkg/crawl/config.go +++ b/internal/pkg/crawl/config.go @@ -97,7 +97,7 @@ type Crawl struct { CDXDedupeServer string WARCFullOnDisk bool WARCPoolSize int - WARCDedupSize int + WARCDedupeSize int DisableLocalDedupe bool CertValidation bool WARCCustomCookie string @@ -116,6 +116,10 @@ type Crawl struct { HQProducerChannel chan *queue.Item HQChannelsWg *sync.WaitGroup HQRateLimitingSendBack bool + + // Dependencies + NoYTDLP bool + YTDLPPath string } func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { @@ -231,7 +235,7 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.CertValidation = config.CertValidation c.WARCFullOnDisk = config.WARCOnDisk c.WARCPoolSize = config.WARCPoolSize - c.WARCDedupSize = config.WARCDedupeSize + c.WARCDedupeSize = config.WARCDedupeSize c.WARCCustomCookie = config.CDXCookie c.API = config.API @@ -246,6 +250,10 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) { c.PrometheusMetrics.Prefix = config.PrometheusPrefix } + // Dependencies + c.NoYTDLP = config.NoYTDLP + c.YTDLPPath = config.YTDLPPath + if config.UserAgent != "Zeno" { c.UserAgent = config.UserAgent } else { diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go index 5d8e5af2..7e02983c 100644 --- a/internal/pkg/crawl/crawl.go +++ b/internal/pkg/crawl/crawl.go @@ -8,6 +8,7 @@ import ( "git.archive.org/wb/gocrawlhq" "github.com/CorentinB/warc" + "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/seencheck" "github.com/internetarchive/Zeno/internal/pkg/utils" @@ -67,9 +68,9 @@ func (c *Crawl) Start() (err error) { // Init WARC rotator settings rotatorSettings := c.initWARCRotatorSettings() - dedupeOptions := warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, SizeThreshold: c.WARCDedupSize} + dedupeOptions := warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, SizeThreshold: c.WARCDedupeSize} if c.CDXDedupeServer != "" { - dedupeOptions = warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, CDXDedupe: true, CDXURL: c.CDXDedupeServer, CDXCookie: c.WARCCustomCookie, SizeThreshold: c.WARCDedupSize} + dedupeOptions = warc.DedupeOptions{LocalDedupe: !c.DisableLocalDedupe, CDXDedupe: true, CDXURL: c.CDXDedupeServer, CDXCookie: c.WARCCustomCookie, SizeThreshold: c.WARCDedupeSize} } // Init the HTTP client responsible for recording HTTP(s) requests / responses @@ -125,6 +126,20 @@ func (c *Crawl) Start() (err error) { go c.startAPI() } + // Verify that dependencies exist on the system + if !c.NoYTDLP { + // If a yt-dlp path is specified, we use it, + // otherwise we try to find yt-dlp on the system + if c.YTDLPPath == "" { + path, found := ytdlp.FindPath() + if !found { + c.Log.Warn("yt-dlp not found on the system, please install it or specify the path in the configuration if you wish to use it") + } else { + c.YTDLPPath = path + } + } + } + // Parse input cookie file if specified if c.CookieFile != "" { cookieJar, err := cookiejar.NewFileJar(c.CookieFile, nil) diff --git 
a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go new file mode 100644 index 00000000..51d990d4 --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/model.go @@ -0,0 +1,11 @@ +package ytdlp + +type Video struct { + IsLive bool `json:"is_live"` + RequestedFormats []struct { + URL string `json:"url"` + } `json:"requested_formats"` + Thumbnails []struct { + URL string `json:"url"` + } `json:"thumbnails"` +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/parse.go b/internal/pkg/crawl/dependencies/ytdlp/parse.go new file mode 100644 index 00000000..ab49b680 --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/parse.go @@ -0,0 +1,42 @@ +package ytdlp + +import ( + "encoding/json" + "fmt" +) + +type SubtitleInfo struct { + Ext string `json:"ext"` + URL string `json:"url"` + Name string `json:"name"` +} + +// parseSubtitles parses the subtitles from the yt-dlp JSON output, +// it's needed because the subtitles are not given as a proper array or objects +func parseSubtitles(jsonData string) ([]string, error) { + var data map[string]json.RawMessage + err := json.Unmarshal([]byte(jsonData), &data) + if err != nil { + return nil, fmt.Errorf("error unmarshaling outer JSON: %v", err) + } + + subtitlesRaw, ok := data["subtitles"] + if !ok { + return nil, nil + } + + var subtitles map[string][]SubtitleInfo + err = json.Unmarshal(subtitlesRaw, &subtitles) + if err != nil { + return nil, fmt.Errorf("error unmarshaling subtitles JSON: %v", err) + } + + var URLs []string + for _, langSubtitles := range subtitles { + for _, subtitle := range langSubtitles { + URLs = append(URLs, subtitle.URL) + } + } + + return URLs, nil +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/server.go b/internal/pkg/crawl/dependencies/ytdlp/server.go new file mode 100644 index 00000000..334c7ee4 --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/server.go @@ -0,0 +1,46 @@ +package ytdlp + +import ( + "io" + "net" + "net/http" + "strings" +) + +func ServeBody(body io.ReadCloser) (port int, stopChan chan struct{}, err error) { + stopChan = make(chan struct{}) + portChan := make(chan int) + + bodyBytes, err := io.ReadAll(body) + if err != nil { + return 0, nil, err + } + + // Start the server + go func() { + // Serve the body on the random port + listener, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + panic(err) + } + defer listener.Close() + + portChan <- listener.Addr().(*net.TCPAddr).Port + + go func() { + <-stopChan + listener.Close() + }() + + // Create a handler that will serve the body on / + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Write(bodyBytes) + }) + + if err := http.Serve(listener, handler); err != nil && !strings.Contains(err.Error(), "use of closed network connection") { + return + } + }() + + return <-portChan, stopChan, nil +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go new file mode 100644 index 00000000..75afc1c0 --- /dev/null +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -0,0 +1,65 @@ +package ytdlp + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "strconv" +) + +func GetJSON(port int) (URLs []string, err error) { + // Prepare the command + cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port)) + + // Buffers to capture stdout and stderr + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + // Run the command + err = cmd.Run() + if err != nil { + 
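// yt-dlp writes its diagnostics to stderr, so fold the captured stderr into
// the returned error; without it a failed extraction is opaque in the crawl logs.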
return URLs, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) + } + + output := stdout.String() + + // Find subtitles + subtitleURLs, err := parseSubtitles(output) + if err != nil { + return nil, err + } + + // Parse the output as a Video object + var video Video + err = json.Unmarshal([]byte(output), &video) + if err != nil { + return nil, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) + } + + // Get all thumbnail URLs + for _, thumbnail := range video.Thumbnails { + URLs = append(URLs, thumbnail.URL) + } + + // Get the manifest URL for the best video & audio quality + // Note: we do not archive live streams + if !video.IsLive { + for format := range video.RequestedFormats { + URLs = append(URLs, video.RequestedFormats[format].URL) + } + } + + URLs = append(URLs, subtitleURLs...) + + return URLs, nil +} + +func FindPath() (string, bool) { + path, err := exec.LookPath("yt-dlp") + if err != nil { + return "", false + } + return path, true +} diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/internal/pkg/crawl/sitespecific/youtube/youtube.go new file mode 100644 index 00000000..888a5b08 --- /dev/null +++ b/internal/pkg/crawl/sitespecific/youtube/youtube.go @@ -0,0 +1,35 @@ +package youtube + +import ( + "io" + "net/url" + + "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" +) + +func Parse(body io.ReadCloser) (URLs []*url.URL, err error) { + // Create a temporary server to serve the body and call ytdlp on it + port, stopChan, err := ytdlp.ServeBody(body) + if err != nil { + return nil, err + } + defer close(stopChan) + + // Call ytdlp on the temporary server + rawURLs, err := ytdlp.GetJSON(port) + if err != nil { + return nil, err + } + + // Parse the URLs + for _, urlString := range rawURLs { + URL, err := url.Parse(urlString) + if err != nil { + return nil, err + } + + URLs = append(URLs, URL) + } + + return URLs, nil +} diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go new file mode 100644 index 00000000..44449b0d --- /dev/null +++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go @@ -0,0 +1,27 @@ +package youtube + +import ( + "os" + "testing" +) + +func TestParse(t *testing.T) { + // Make io.ReadCloser from the youtube_test.html file + f, err := os.Open("youtube_test.html") + if err != nil { + t.Fatal(err) + } + defer f.Close() + + // Parse the video + URLs, err := Parse(f) + if err != nil { + t.Fatal(err) + } + + // Check the number of URLs + expected := 146 + if len(URLs) != expected { + t.Fatalf("Expected %d URLs, got %d", expected, len(URLs)) + } +} diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.html b/internal/pkg/crawl/sitespecific/youtube/youtube_test.html new file mode 100644 index 00000000..77474015 --- /dev/null +++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.html @@ -0,0 +1,88 @@ +$10,000 Every Day You Survive In A Grocery Store - YouTube
[fixture truncated: the remainder of youtube_test.html is the captured watch-page markup and YouTube's localized (French) footer links]
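Review note: taken together, the pieces in this patch compose as follows — Capture spots a youtube.com /watch URL, youtube.Parse re-serves the already-fetched body on a loopback port (ytdlp.ServeBody), and ytdlp.GetJSON shells out to yt-dlp against that port, harvesting the thumbnail, format, and subtitle URLs that then get captured as assets. A minimal sketch of driving the extractor directly, using the Parse signature as of this patch; it assumes yt-dlp is on PATH, uses a hypothetical saved watch page, and would need to live inside the Zeno module since the package is internal:

package main

import (
	"fmt"
	"os"

	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/youtube"
)

func main() {
	// A saved watch page stands in for the crawler's HTTP response body
	// (os.Open returns an io.ReadCloser, which is all Parse needs).
	f, err := os.Open("watch.html") // hypothetical local copy
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Parse serves the body on 127.0.0.1, points yt-dlp at it,
	// and returns every URL yt-dlp reports.
	URLs, err := youtube.Parse(f)
	if err != nil {
		panic(err)
	}

	for _, u := range URLs {
		fmt.Println(u.String())
	}
}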
\ No newline at end of file From 6fa843a91da1651dd83b95bb6cf6c6bd943327ab Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Sun, 8 Sep 2024 19:42:37 +0200 Subject: [PATCH 02/12] [site/yt] add: format selection & metadata record --- internal/pkg/crawl/assets.go | 108 ++++++++++++++++++ internal/pkg/crawl/capture.go | 7 +- .../pkg/crawl/dependencies/ytdlp/model.go | 50 +++++++- .../pkg/crawl/dependencies/ytdlp/ytdlp.go | 32 ++++-- .../pkg/crawl/sitespecific/youtube/youtube.go | 12 +- 5 files changed, 190 insertions(+), 19 deletions(-) diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index fed41119..45da590e 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -1,20 +1,128 @@ package crawl import ( + "io" + "net/http" "net/url" "regexp" "strings" + "sync/atomic" "github.com/PuerkitoBio/goquery" "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor" "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" "github.com/internetarchive/Zeno/internal/pkg/queue" "github.com/internetarchive/Zeno/internal/pkg/utils" + "github.com/remeh/sizedwaitgroup" ) var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) +func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { + var resp *http.Response + + // Prepare GET request + req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) + if err != nil { + return err + } + + req.Header.Set("Referer", utils.URLToString(item.ParentURL)) + req.Header.Set("User-Agent", c.UserAgent) + + // Apply cookies obtained from the original URL captured + for i := range cookies { + req.AddCookie(cookies[i]) + } + + resp, err = c.executeGET(item, req, false) + if err != nil && err.Error() == "URL from redirection has already been seen" { + return nil + } else if err != nil { + return err + } + defer resp.Body.Close() + + // needed for WARC writing + io.Copy(io.Discard, resp.Body) + + return nil +} + +func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie) { + // TODO: implement a counter for the number of assets + // currently being processed + // c.Frontier.QueueCount.Incr(int64(len(assets))) + swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) + excluded := false + + for _, asset := range assets { + // TODO: implement a counter for the number of assets + // currently being processed + // c.Frontier.QueueCount.Incr(-1) + + // Just making sure we do not over archive by archiving the original URL + if utils.URLToString(item.URL) == utils.URLToString(asset) { + continue + } + + // We ban googlevideo.com URLs because they are heavily rate limited by default, and + // we don't want the crawler to spend an innapropriate amount of time archiving them + if strings.Contains(item.URL.Host, "googlevideo.com") { + continue + } + + // If the URL match any excluded string, we ignore it + for _, excludedString := range c.ExcludedStrings { + if strings.Contains(utils.URLToString(asset), excludedString) { + excluded = true + break + } + } + + if excluded { + excluded = false + continue + } + + swg.Add() + c.URIsPerSecond.Incr(1) + + go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { + defer swg.Done() + + // Create the asset's item + newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) + if err != nil { + c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ + "parentHop": item.Hop, + "parentUrl": 
utils.URLToString(item.URL), + "type": "asset", + })).Error("error while creating asset item") + return + } + + // Capture the asset + err = c.captureAsset(newAsset, cookies) + if err != nil { + c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ + "parentHop": item.Hop, + "parentUrl": utils.URLToString(item.URL), + "type": "asset", + })).Error("error while capturing asset") + return + } + + // If we made it to this point, it means that the asset have been crawled successfully, + // then we can increment the locallyCrawled variable + atomic.AddUint64(&item.LocallyCrawled, 1) + }(asset, &swg) + } + + swg.Wait() +} + func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) { var rawAssets []string var URL = utils.URLToString(item.URL) diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index 20a33a5f..a5e97423 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -343,7 +343,7 @@ func (c *Crawl) Capture(item *queue.Item) error { // If it was a YouTube watch page, we potentially want to run it through the YouTube extractor // TODO: support other watch page URLs if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP { - URLs, err := youtube.Parse(resp.Body) + URLs, rawJSON, err := youtube.Parse(resp.Body) if err != nil { c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page") return err @@ -353,6 +353,11 @@ func (c *Crawl) Capture(item *queue.Item) error { c.captureAssets(item, URLs, resp.Cookies()) } + // Write the metadata record for the video + if rawJSON != "" { + c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dl", rawJSON) + } + return nil } diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go index 51d990d4..29892fb4 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/model.go +++ b/internal/pkg/crawl/dependencies/ytdlp/model.go @@ -1,10 +1,52 @@ package ytdlp type Video struct { - IsLive bool `json:"is_live"` - RequestedFormats []struct { - URL string `json:"url"` - } `json:"requested_formats"` + ID string `json:"id"` + IsLive bool `json:"is_live"` + Formats []struct { + Acodec string `json:"acodec"` + AspectRatio float64 `json:"aspect_ratio"` + AudioExt string `json:"audio_ext"` + Columns float64 `json:"columns,omitempty"` + Ext string `json:"ext"` + Format string `json:"format"` + FormatID string `json:"format_id"` + FormatNote string `json:"format_note"` + Fps float64 `json:"fps"` + Fragments []struct { + Duration float64 `json:"duration"` + URL string `json:"url"` + } `json:"fragments,omitempty"` + Height float64 `json:"height"` + HTTPHeaders struct { + Accept string `json:"Accept"` + AcceptLanguage string `json:"Accept-Language"` + SecFetchMode string `json:"Sec-Fetch-Mode"` + UserAgent string `json:"User-Agent"` + } `json:"http_headers"` + Protocol string `json:"protocol"` + Resolution string `json:"resolution"` + Rows float64 `json:"rows,omitempty"` + URL string `json:"url"` + Vcodec string `json:"vcodec"` + VideoExt string `json:"video_ext"` + Width float64 `json:"width"` + Abr float64 `json:"abr,omitempty"` + Asr float64 `json:"asr,omitempty"` + AudioChannels float64 `json:"audio_channels,omitempty"` + Container string `json:"container,omitempty"` + DynamicRange interface{} `json:"dynamic_range,omitempty"` + Filesize float64 `json:"filesize,omitempty"` + 
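// HasDrm is set by yt-dlp when a format is DRM-protected and therefore not directly fetchable.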
HasDrm bool `json:"has_drm,omitempty"` + Language string `json:"language,omitempty"` + LanguagePreference float64 `json:"language_preference,omitempty"` + Preference interface{} `json:"preference,omitempty"` + Quality float64 `json:"quality,omitempty"` + SourcePreference float64 `json:"source_preference,omitempty"` + Tbr float64 `json:"tbr,omitempty"` + Vbr float64 `json:"vbr,omitempty"` + FilesizeApprox float64 `json:"filesize_approx,omitempty"` + } `json:"formats"` Thumbnails []struct { URL string `json:"url"` } `json:"thumbnails"` diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go index 75afc1c0..2e2ccbc9 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -6,11 +6,12 @@ import ( "fmt" "os/exec" "strconv" + "strings" ) -func GetJSON(port int) (URLs []string, err error) { +func GetJSON(port int) (URLs []string, rawJSON string, err error) { // Prepare the command - cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port)) + cmd := exec.Command("yt-dlp", "--dump-json", "-f", "18", "http://localhost:"+strconv.Itoa(port)) // Buffers to capture stdout and stderr var stdout, stderr bytes.Buffer @@ -20,7 +21,7 @@ func GetJSON(port int) (URLs []string, err error) { // Run the command err = cmd.Run() if err != nil { - return URLs, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) + return URLs, rawJSON, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) } output := stdout.String() @@ -28,14 +29,14 @@ func GetJSON(port int) (URLs []string, err error) { // Find subtitles subtitleURLs, err := parseSubtitles(output) if err != nil { - return nil, err + return nil, rawJSON, fmt.Errorf("error parsing subtitles: %v", err) } // Parse the output as a Video object var video Video err = json.Unmarshal([]byte(output), &video) if err != nil { - return nil, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) + return nil, rawJSON, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) } // Get all thumbnail URLs @@ -46,14 +47,29 @@ func GetJSON(port int) (URLs []string, err error) { // Get the manifest URL for the best video & audio quality // Note: we do not archive live streams if !video.IsLive { - for format := range video.RequestedFormats { - URLs = append(URLs, video.RequestedFormats[format].URL) + // Find the best format for the video in the formats that + // use the "https" protocol and don't contain "only" in their name (to avoid audio or video-only formats) + // and don't contain "_dash" in their container (to avoid DASH formats) + var bestFormatQuality float64 + var bestFormatPosition int + for i, format := range video.Formats { + if (bestFormatQuality == 0 || format.Quality > bestFormatQuality) && + format.Protocol == "https" && + !strings.Contains(format.Format, "only") && + !strings.Contains(format.Container, "_dash") { + bestFormatQuality = format.Quality + bestFormatPosition = i + } } + + URLs = append(URLs, + video.Formats[bestFormatPosition].URL+"&video_id="+video.ID, + video.Formats[bestFormatPosition].URL) } URLs = append(URLs, subtitleURLs...) 
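// (For reference, yt-dlp format descriptions look like "18 - 640x360 (360p)" or
// "251 - audio only (medium)", which is what the "only" filter above keys on.)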
- return URLs, nil + return URLs, output, nil } func FindPath() (string, bool) { diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/internal/pkg/crawl/sitespecific/youtube/youtube.go index 888a5b08..6424000f 100644 --- a/internal/pkg/crawl/sitespecific/youtube/youtube.go +++ b/internal/pkg/crawl/sitespecific/youtube/youtube.go @@ -7,29 +7,29 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" ) -func Parse(body io.ReadCloser) (URLs []*url.URL, err error) { +func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, err error) { // Create a temporary server to serve the body and call ytdlp on it port, stopChan, err := ytdlp.ServeBody(body) if err != nil { - return nil, err + return nil, rawJSON, err } defer close(stopChan) // Call ytdlp on the temporary server - rawURLs, err := ytdlp.GetJSON(port) + rawURLs, rawJSON, err := ytdlp.GetJSON(port) if err != nil { - return nil, err + return nil, rawJSON, err } // Parse the URLs for _, urlString := range rawURLs { URL, err := url.Parse(urlString) if err != nil { - return nil, err + return nil, rawJSON, err } URLs = append(URLs, URL) } - return URLs, nil + return URLs, rawJSON, nil } From 6bdb2cf4a12c2ea4f29d97b95cd5b0ad1ec16669 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 9 Sep 2024 11:41:40 +0200 Subject: [PATCH 03/12] [ext/m3u8] initial commit --- go.mod | 1 + go.sum | 2 + internal/pkg/crawl/assets.go | 17 ++++--- internal/pkg/crawl/capture.go | 5 +++ .../pkg/crawl/dependencies/ytdlp/model.go | 44 ++++++++++++++++++- .../pkg/crawl/dependencies/ytdlp/ytdlp.go | 32 ++++---------- internal/pkg/crawl/extractor/m3u8.go | 44 +++++++++++++++++++ 7 files changed, 113 insertions(+), 32 deletions(-) create mode 100644 internal/pkg/crawl/extractor/m3u8.go diff --git a/go.mod b/go.mod index 81d31a1e..33eec97f 100644 --- a/go.mod +++ b/go.mod @@ -49,6 +49,7 @@ require ( github.com/golang/snappy v0.0.4 // indirect github.com/gomodule/redigo v1.9.2 // indirect github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect + github.com/grafov/m3u8 v0.12.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect diff --git a/go.sum b/go.sum index a53406d6..675ea40a 100644 --- a/go.sum +++ b/go.sum @@ -70,6 +70,8 @@ github.com/grafana/pyroscope-go v1.1.2 h1:7vCfdORYQMCxIzI3NlYAs3FcBP760+gWuYWOyi github.com/grafana/pyroscope-go v1.1.2/go.mod h1:HSSmHo2KRn6FasBA4vK7BMiQqyQq8KSuBKvrhkXxYPU= github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg= github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU= +github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4= +github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index 45da590e..985ae785 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -44,6 +44,17 @@ func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { } defer resp.Body.Close() + if strings.Contains(resp.Header.Get("Content-Type"), "vnd.apple.mpegurl") { + assets, err := 
extractor.M3U8(resp) + if err != nil { + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") + } + + c.captureAssets(item, assets, cookies) + + return nil + } + // needed for WARC writing io.Copy(io.Discard, resp.Body) @@ -67,12 +78,6 @@ func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*ht continue } - // We ban googlevideo.com URLs because they are heavily rate limited by default, and - // we don't want the crawler to spend an innapropriate amount of time archiving them - if strings.Contains(item.URL.Host, "googlevideo.com") { - continue - } - // If the URL match any excluded string, we ignore it for _, excludedString := range c.ExcludedStrings { if strings.Contains(utils.URLToString(asset), excludedString) { diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index a5e97423..69e000b7 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -392,6 +392,11 @@ func (c *Crawl) Capture(item *queue.Item) error { if err != nil { c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON") } + } else if strings.Contains(resp.Header.Get("Content-Type"), "vnd.apple.mpegurl") { + assets, err = extractor.M3U8(resp) + if err != nil { + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") + } } else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) { // If the response isn't a text/*, we do not scrape it. // We also aren't going to scrape if assets and outlinks are turned off. diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go index 29892fb4..bec84aa9 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/model.go +++ b/internal/pkg/crawl/dependencies/ytdlp/model.go @@ -1,8 +1,48 @@ package ytdlp type Video struct { - ID string `json:"id"` - IsLive bool `json:"is_live"` + ID string `json:"id"` + IsLive bool `json:"is_live"` + RequestedFormats []struct { + Acodec string `json:"acodec"` + AspectRatio float64 `json:"aspect_ratio"` + Asr interface{} `json:"asr"` + AudioChannels interface{} `json:"audio_channels"` + AudioExt string `json:"audio_ext"` + Container string `json:"container"` + DynamicRange string `json:"dynamic_range"` + Ext string `json:"ext"` + Filesize float64 `json:"filesize"` + Format string `json:"format"` + FormatID string `json:"format_id"` + FormatNote string `json:"format_note"` + Fps float64 `json:"fps"` + Fragments []struct { + URL string `json:"url"` + } `json:"fragments"` + HasDrm bool `json:"has_drm"` + Height float64 `json:"height"` + HTTPHeaders struct { + Accept string `json:"Accept"` + AcceptLanguage string `json:"Accept-Language"` + SecFetchMode string `json:"Sec-Fetch-Mode"` + UserAgent string `json:"User-Agent"` + } `json:"http_headers"` + Language interface{} `json:"language"` + LanguagePreference float64 `json:"language_preference"` + Preference interface{} `json:"preference"` + Protocol string `json:"protocol"` + Quality float64 `json:"quality"` + Resolution string `json:"resolution"` + SourcePreference float64 `json:"source_preference"` + Tbr float64 `json:"tbr"` + URL string `json:"url"` + Vbr float64 `json:"vbr,omitempty"` + Vcodec string `json:"vcodec"` + VideoExt string `json:"video_ext"` + Width float64 `json:"width"` + Abr float64 `json:"abr,omitempty"` + } `json:"requested_formats"` Formats []struct { Acodec string 
`json:"acodec"` AspectRatio float64 `json:"aspect_ratio"` diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go index 2e2ccbc9..2ae31efb 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -6,12 +6,11 @@ import ( "fmt" "os/exec" "strconv" - "strings" ) func GetJSON(port int) (URLs []string, rawJSON string, err error) { // Prepare the command - cmd := exec.Command("yt-dlp", "--dump-json", "-f", "18", "http://localhost:"+strconv.Itoa(port)) + cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port)) // Buffers to capture stdout and stderr var stdout, stderr bytes.Buffer @@ -27,10 +26,10 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) { output := stdout.String() // Find subtitles - subtitleURLs, err := parseSubtitles(output) - if err != nil { - return nil, rawJSON, fmt.Errorf("error parsing subtitles: %v", err) - } + // subtitleURLs, err := parseSubtitles(output) + // if err != nil { + // return nil, rawJSON, fmt.Errorf("error parsing subtitles: %v", err) + // } // Parse the output as a Video object var video Video @@ -47,27 +46,12 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) { // Get the manifest URL for the best video & audio quality // Note: we do not archive live streams if !video.IsLive { - // Find the best format for the video in the formats that - // use the "https" protocol and don't contain "only" in their name (to avoid audio or video-only formats) - // and don't contain "_dash" in their container (to avoid DASH formats) - var bestFormatQuality float64 - var bestFormatPosition int - for i, format := range video.Formats { - if (bestFormatQuality == 0 || format.Quality > bestFormatQuality) && - format.Protocol == "https" && - !strings.Contains(format.Format, "only") && - !strings.Contains(format.Container, "_dash") { - bestFormatQuality = format.Quality - bestFormatPosition = i - } + for _, format := range video.RequestedFormats { + URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID) } - - URLs = append(URLs, - video.Formats[bestFormatPosition].URL+"&video_id="+video.ID, - video.Formats[bestFormatPosition].URL) } - URLs = append(URLs, subtitleURLs...) + //URLs = append(URLs, subtitleURLs...) 
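// Subtitle queuing is parked along with the parseSubtitles call above;
// it is restored in the next patch once header forwarding is in place.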
return URLs, output, nil } diff --git a/internal/pkg/crawl/extractor/m3u8.go b/internal/pkg/crawl/extractor/m3u8.go new file mode 100644 index 00000000..8bc32bbc --- /dev/null +++ b/internal/pkg/crawl/extractor/m3u8.go @@ -0,0 +1,44 @@ +package extractor + +import ( + "net/http" + "net/url" + + "github.com/grafov/m3u8" +) + +func M3U8(resp *http.Response) (URLs []*url.URL, err error) { + p, listType, err := m3u8.DecodeFrom(resp.Body, true) + if err != nil { + panic(err) + } + + var rawURLs []string + switch listType { + case m3u8.MEDIA: + mediapl := p.(*m3u8.MediaPlaylist) + + for _, segment := range mediapl.Segments { + if segment != nil { + rawURLs = append(rawURLs, segment.URI) + } + } + case m3u8.MASTER: + masterpl := p.(*m3u8.MasterPlaylist) + + for _, variant := range masterpl.Variants { + if variant != nil { + rawURLs = append(rawURLs, variant.URI) + } + } + } + + for _, rawURL := range rawURLs { + URL, err := url.Parse(rawURL) + if err == nil { + URLs = append(URLs, URL) + } + } + + return URLs, err +} From 03ec807bdba050468b2e6b875c190c09fee8a122 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 9 Sep 2024 15:59:20 +0200 Subject: [PATCH 04/12] fix: remove default global HTTP timeout --- cmd/get.go | 2 +- go.mod | 4 +- go.sum | 4 +- internal/pkg/crawl/assets.go | 41 +++++++++++-------- internal/pkg/crawl/capture.go | 28 +++++++++++-- internal/pkg/crawl/crawl.go | 8 +++- .../pkg/crawl/dependencies/ytdlp/model.go | 18 ++++---- .../pkg/crawl/dependencies/ytdlp/ytdlp.go | 31 +++++++++----- .../pkg/crawl/sitespecific/youtube/youtube.go | 12 +++--- 9 files changed, 97 insertions(+), 51 deletions(-) diff --git a/cmd/get.go b/cmd/get.go index 794ac12c..2b73b6c6 100644 --- a/cmd/get.go +++ b/cmd/get.go @@ -43,7 +43,7 @@ func getCMDsFlags(getCmd *cobra.Command) { getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.") getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.") getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.") - getCmd.PersistentFlags().Int("http-timeout", 30, "Number of seconds to wait before timing out a request.") + getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.") getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.") getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from") getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.") diff --git a/go.mod b/go.mod index 33eec97f..1e24390d 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.23.1 require ( git.archive.org/wb/gocrawlhq v1.2.7 - github.com/CorentinB/warc v0.8.44 + github.com/CorentinB/warc v0.8.45 github.com/PuerkitoBio/goquery v1.9.2 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 github.com/clbanning/mxj/v2 v2.7.0 @@ -14,6 +14,7 @@ require ( github.com/gosuri/uilive v0.0.4 github.com/gosuri/uitable v0.0.4 github.com/grafana/pyroscope-go v1.1.2 + github.com/grafov/m3u8 v0.12.0 github.com/paulbellamy/ratecounter v0.2.0 github.com/philippgille/gokv/leveldb v0.7.0 github.com/prometheus/client_golang v1.20.2 @@ -49,7 +50,6 @@ require ( 
github.com/golang/snappy v0.0.4 // indirect github.com/gomodule/redigo v1.9.2 // indirect github.com/grafana/pyroscope-go/godeltaprof v0.1.8 // indirect - github.com/grafov/m3u8 v0.12.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect diff --git a/go.sum b/go.sum index 675ea40a..952c69d1 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ git.archive.org/wb/gocrawlhq v1.2.7 h1:+LGu6hcG4xpyHFvmk3TCTEmU90wwWj1RW9PPqWVx9TQ= git.archive.org/wb/gocrawlhq v1.2.7/go.mod h1:ursn4DkepW9Z6kKMp5qfeZc2+75gcSBmFgoIWGt2sWA= -github.com/CorentinB/warc v0.8.44 h1:dxtImoHbCDQh84yp6XSnHiBP/MGQypJNw9Hovg2zA+Y= -github.com/CorentinB/warc v0.8.44/go.mod h1:V9uPnP4mv6t1VgqrOSJK4wkPajVxhNz5GTrfcIALOXU= +github.com/CorentinB/warc v0.8.45 h1:AqhjgyLyvF2FKj4iI0nAaLGNmoS9wMEVFw4I3Uf9qr8= +github.com/CorentinB/warc v0.8.45/go.mod h1:V9uPnP4mv6t1VgqrOSJK4wkPajVxhNz5GTrfcIALOXU= github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index 985ae785..faac6a7e 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -19,7 +19,7 @@ import ( var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) -func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { +func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error { var resp *http.Response // Prepare GET request @@ -28,8 +28,16 @@ func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { return err } - req.Header.Set("Referer", utils.URLToString(item.ParentURL)) - req.Header.Set("User-Agent", c.UserAgent) + // If headers are passed, apply them to the request + // else, apply the default headers + if headers == nil { + for key, value := range headers { + req.Header.Set(key, value) + } + } else { + req.Header.Set("Referer", utils.URLToString(item.ParentURL)) + req.Header.Set("User-Agent", c.UserAgent) + } // Apply cookies obtained from the original URL captured for i := range cookies { @@ -45,23 +53,24 @@ func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error { defer resp.Body.Close() if strings.Contains(resp.Header.Get("Content-Type"), "vnd.apple.mpegurl") { - assets, err := extractor.M3U8(resp) - if err != nil { - c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") - } - - c.captureAssets(item, assets, cookies) - - return nil + // assets, err := extractor.M3U8(resp) + // if err != nil { + // c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") + // } + // resp.Body.Close() + + // c.captureAssets(item, assets, cookies) + + // return nil + } else { + // needed for WARC writing + io.Copy(io.Discard, resp.Body) } - // needed for WARC writing - io.Copy(io.Discard, resp.Body) - return nil } -func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie) { +func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) { // TODO: implement a counter for the number of assets // currently being processed // 
c.Frontier.QueueCount.Incr(int64(len(assets))) @@ -109,7 +118,7 @@ func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*ht } // Capture the asset - err = c.captureAsset(newAsset, cookies) + err = c.captureAsset(newAsset, cookies, headers) if err != nil { c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ "parentHop": item.Hop, diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index 69e000b7..b9d744b4 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -343,14 +343,36 @@ func (c *Crawl) Capture(item *queue.Item) error { // If it was a YouTube watch page, we potentially want to run it through the YouTube extractor // TODO: support other watch page URLs if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP { - URLs, rawJSON, err := youtube.Parse(resp.Body) + URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body) if err != nil { c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page") return err } + resp.Body.Close() + + // Build the cookies + // cookies := append([]*http.Cookie{}, &http.Cookie{ + // Name: "Accept", + // Value: HTTPHeaders.Accept, + // }, &http.Cookie{ + // Name: "Accept-Language", + // Value: HTTPHeaders.AcceptLanguage, + // }, &http.Cookie{ + // Name: "Sec-Fetch-Mode", + // Value: HTTPHeaders.SecFetchMode, + // }, &http.Cookie{ + // Name: "User-Agent", + // Value: HTTPHeaders.UserAgent, + // }) + + var headers = make(map[string]string) + headers["Accept"] = HTTPHeaders.Accept + headers["Accept-Language"] = HTTPHeaders.AcceptLanguage + headers["Sec-Fetch-Mode"] = HTTPHeaders.SecFetchMode + headers["User-Agent"] = HTTPHeaders.UserAgent if len(URLs) > 0 { - c.captureAssets(item, URLs, resp.Cookies()) + c.captureAssets(item, URLs, resp.Cookies(), headers) } // Write the metadata record for the video @@ -545,7 +567,7 @@ func (c *Crawl) Capture(item *queue.Item) error { } } - c.captureAssets(item, assets, resp.Cookies()) + c.captureAssets(item, assets, resp.Cookies(), nil) return err } diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go index 3d41f1f8..2420db8e 100644 --- a/internal/pkg/crawl/crawl.go +++ b/internal/pkg/crawl/crawl.go @@ -121,8 +121,12 @@ func (c *Crawl) Start() (err error) { } }() - c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second - c.Log.Info("HTTP client timeout set", "timeout", c.HTTPTimeout) + if c.HTTPTimeout > 0 { + c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second + c.Log.Info("Global HTTP client timeout set", "timeout", c.HTTPTimeout) + } else { + c.Log.Info("Global HTTP client timeout not set") + } if c.Proxy != "" { proxyHTTPClientSettings := HTTPClientSettings diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go index bec84aa9..5e6b5075 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/model.go +++ b/internal/pkg/crawl/dependencies/ytdlp/model.go @@ -20,14 +20,9 @@ type Video struct { Fragments []struct { URL string `json:"url"` } `json:"fragments"` - HasDrm bool `json:"has_drm"` - Height float64 `json:"height"` - HTTPHeaders struct { - Accept string `json:"Accept"` - AcceptLanguage string `json:"Accept-Language"` - SecFetchMode string `json:"Sec-Fetch-Mode"` - UserAgent string `json:"User-Agent"` - } `json:"http_headers"` + HasDrm bool `json:"has_drm"` + Height float64 `json:"height"` + HTTPHeaders HTTPHeaders `json:"http_headers"` Language interface{} 
`json:"language"` LanguagePreference float64 `json:"language_preference"` Preference interface{} `json:"preference"` @@ -91,3 +86,10 @@ type Video struct { URL string `json:"url"` } `json:"thumbnails"` } + +type HTTPHeaders struct { + Accept string `json:"Accept"` + AcceptLanguage string `json:"Accept-Language"` + SecFetchMode string `json:"Sec-Fetch-Mode"` + UserAgent string `json:"User-Agent"` +} diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go index 2ae31efb..23dad79d 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -8,7 +8,7 @@ import ( "strconv" ) -func GetJSON(port int) (URLs []string, rawJSON string, err error) { +func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, err error) { // Prepare the command cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port)) @@ -20,22 +20,22 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) { // Run the command err = cmd.Run() if err != nil { - return URLs, rawJSON, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) + return URLs, rawJSON, HTTPHeaders, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String()) } output := stdout.String() // Find subtitles - // subtitleURLs, err := parseSubtitles(output) - // if err != nil { - // return nil, rawJSON, fmt.Errorf("error parsing subtitles: %v", err) - // } + subtitleURLs, err := parseSubtitles(output) + if err != nil { + return nil, rawJSON, HTTPHeaders, fmt.Errorf("error parsing subtitles: %v", err) + } // Parse the output as a Video object var video Video err = json.Unmarshal([]byte(output), &video) if err != nil { - return nil, rawJSON, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) + return nil, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) } // Get all thumbnail URLs @@ -46,14 +46,23 @@ func GetJSON(port int) (URLs []string, rawJSON string, err error) { // Get the manifest URL for the best video & audio quality // Note: we do not archive live streams if !video.IsLive { - for _, format := range video.RequestedFormats { - URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID) + if len(video.RequestedFormats) > 0 { + HTTPHeaders = video.RequestedFormats[0].HTTPHeaders + for _, format := range video.RequestedFormats { + URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID) + } } } - //URLs = append(URLs, subtitleURLs...) + // write output to a .json file (debug) + // err = ioutil.WriteFile("output.json", []byte(output), 0644) + // if err != nil { + // return nil, rawJSON, HTTPHeaders, fmt.Errorf("error writing output.json: %v", err) + // } + + URLs = append(URLs, subtitleURLs...) 
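// The raw yt-dlp JSON still flows back to the caller, which writes it out
// as an application/json metadata record for the watch page.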
- return URLs, output, nil + return URLs, output, HTTPHeaders, nil } func FindPath() (string, bool) { diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/internal/pkg/crawl/sitespecific/youtube/youtube.go index 6424000f..643ae052 100644 --- a/internal/pkg/crawl/sitespecific/youtube/youtube.go +++ b/internal/pkg/crawl/sitespecific/youtube/youtube.go @@ -7,29 +7,29 @@ import ( "github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp" ) -func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, err error) { +func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders ytdlp.HTTPHeaders, err error) { // Create a temporary server to serve the body and call ytdlp on it port, stopChan, err := ytdlp.ServeBody(body) if err != nil { - return nil, rawJSON, err + return nil, rawJSON, HTTPHeaders, err } defer close(stopChan) // Call ytdlp on the temporary server - rawURLs, rawJSON, err := ytdlp.GetJSON(port) + rawURLs, rawJSON, HTTPHeaders, err := ytdlp.GetJSON(port) if err != nil { - return nil, rawJSON, err + return nil, rawJSON, HTTPHeaders, err } // Parse the URLs for _, urlString := range rawURLs { URL, err := url.Parse(urlString) if err != nil { - return nil, rawJSON, err + return nil, rawJSON, HTTPHeaders, err } URLs = append(URLs, URL) } - return URLs, rawJSON, nil + return URLs, rawJSON, HTTPHeaders, nil } From f2108932ff42edf46f9b52b83fc2b7ba3e0b312f Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 9 Sep 2024 18:25:30 +0200 Subject: [PATCH 05/12] [site/yt] wip: fix tests --- internal/pkg/crawl/sitespecific/youtube/youtube_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go index 44449b0d..a881c23b 100644 --- a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go +++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go @@ -14,11 +14,16 @@ func TestParse(t *testing.T) { defer f.Close() // Parse the video - URLs, err := Parse(f) + URLs, rawJSON, _, err := Parse(f) if err != nil { t.Fatal(err) } + // Check the raw JSON + if rawJSON == "" { + t.Fatal("Expected non-empty raw JSON") + } + // Check the number of URLs expected := 146 if len(URLs) != expected { From c221e8863880045ad734423d93b0a21edf6cb08e Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 10 Sep 2024 00:02:04 +0200 Subject: [PATCH 06/12] chores: small refactoring --- internal/pkg/crawl/assets.go | 31 ++++++++++++---------------- internal/pkg/crawl/capture.go | 2 +- internal/pkg/crawl/extractor/m3u8.go | 8 ++++++- internal/pkg/crawl/finish.go | 2 +- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index faac6a7e..408c6511 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -28,15 +28,14 @@ func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers m return err } + req.Header.Set("Referer", utils.URLToString(item.ParentURL)) + req.Header.Set("User-Agent", c.UserAgent) + // If headers are passed, apply them to the request - // else, apply the default headers - if headers == nil { + if headers != nil { for key, value := range headers { req.Header.Set(key, value) } - } else { - req.Header.Set("Referer", utils.URLToString(item.ParentURL)) - req.Header.Set("User-Agent", c.UserAgent) } // Apply cookies obtained from the original URL captured @@ -52,21 +51,17 @@ func (c *Crawl) captureAsset(item *queue.Item, 
cookies []*http.Cookie, headers m
 	}
 	defer resp.Body.Close()
 
-	if strings.Contains(resp.Header.Get("Content-Type"), "vnd.apple.mpegurl") {
-		// assets, err := extractor.M3U8(resp)
-		// if err != nil {
-		// 	c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
-		// }
-		// resp.Body.Close()
-
-		// c.captureAssets(item, assets, cookies)
-
-		// return nil
-	} else {
-		// needed for WARC writing
-		io.Copy(io.Discard, resp.Body)
+	if extractor.IsM3U8(resp) {
+		assets, err := extractor.M3U8(resp)
+		if err == nil {
+			c.captureAssets(item, assets, cookies, headers)
+		} else {
+			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
+		}
 	}
 
+	io.Copy(io.Discard, resp.Body)
+
 	return nil
 }
diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go
index b9d744b4..c7aefcf0 100644
--- a/internal/pkg/crawl/capture.go
+++ b/internal/pkg/crawl/capture.go
@@ -414,7 +414,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
 		if err != nil {
 			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON")
 		}
-	} else if strings.Contains(resp.Header.Get("Content-Type"), "vnd.apple.mpegurl") {
+	} else if extractor.IsM3U8(resp) {
 		assets, err = extractor.M3U8(resp)
 		if err != nil {
 			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
diff --git a/internal/pkg/crawl/extractor/m3u8.go b/internal/pkg/crawl/extractor/m3u8.go
index 8bc32bbc..8ba93bae 100644
--- a/internal/pkg/crawl/extractor/m3u8.go
+++ b/internal/pkg/crawl/extractor/m3u8.go
@@ -3,14 +3,20 @@ package extractor
 import (
 	"net/http"
 	"net/url"
+	"strings"
 
 	"github.com/grafov/m3u8"
 )
 
+func IsM3U8(resp *http.Response) bool {
+	return strings.Contains(resp.Header.Get("Content-Type"), "application/vnd.apple.mpegurl") ||
+		strings.Contains(resp.Header.Get("Content-Type"), "application/x-mpegURL")
+}
+
 func M3U8(resp *http.Response) (URLs []*url.URL, err error) {
 	p, listType, err := m3u8.DecodeFrom(resp.Body, true)
 	if err != nil {
-		panic(err)
+		return URLs, err
 	}
 
 	var rawURLs []string
diff --git a/internal/pkg/crawl/finish.go b/internal/pkg/crawl/finish.go
index b34d920e..e5b49d7e 100644
--- a/internal/pkg/crawl/finish.go
+++ b/internal/pkg/crawl/finish.go
@@ -8,7 +8,7 @@ import (
 )
 
 // catchFinish is running in the background and detect when the crawl need to be terminated
-// because it won't crawl anything more. This doesn't apply for Kafka-powered crawls.
+// because it won't crawl anything more. This doesn't apply for HQ-powered crawls.
 func (crawl *Crawl) catchFinish() {
 	for crawl.CrawledSeeds.Value()+crawl.CrawledAssets.Value() <= 0 {
 		time.Sleep(1 * time.Second)
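A note on the M3U8 path introduced in this commit: grafov/m3u8 decodes both media playlists (segment lists) and master playlists (variant lists), so an extractor has to walk whichever one it got back. The sketch below illustrates that walk; it is not the extractor's exact code, and resolving relative URIs against a base URL is an assumption about what a caller needs before queueing the results as assets.

package main

import (
	"fmt"
	"net/url"
	"strings"

	"github.com/grafov/m3u8"
)

// extractM3U8URIs collects segment or variant URIs from a playlist and
// resolves them against base so they can be queued as absolute asset URLs.
func extractM3U8URIs(manifest string, base *url.URL) ([]*url.URL, error) {
	p, listType, err := m3u8.DecodeFrom(strings.NewReader(manifest), true)
	if err != nil {
		return nil, err
	}

	var raw []string
	switch listType {
	case m3u8.MEDIA:
		for _, seg := range p.(*m3u8.MediaPlaylist).Segments {
			// The segments slice is preallocated, so trailing entries can be nil
			if seg != nil {
				raw = append(raw, seg.URI)
			}
		}
	case m3u8.MASTER:
		for _, variant := range p.(*m3u8.MasterPlaylist).Variants {
			if variant != nil {
				raw = append(raw, variant.URI)
			}
		}
	}

	URLs := make([]*url.URL, 0, len(raw))
	for _, r := range raw {
		u, err := url.Parse(r)
		if err != nil {
			// Skip unparsable URIs instead of failing the whole playlist
			continue
		}
		URLs = append(URLs, base.ResolveReference(u))
	}

	return URLs, nil
}

func main() {
	// A minimal, hypothetical media playlist used only for illustration
	const manifest = `#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:10
#EXTINF:10.0,
segment0.ts
#EXTINF:10.0,
segment1.ts
#EXT-X-ENDLIST`

	base, _ := url.Parse("https://example.com/stream/playlist.m3u8")

	URLs, err := extractM3U8URIs(manifest, base)
	if err != nil {
		fmt.Println(err)
		return
	}

	for _, u := range URLs {
		fmt.Println(u) // e.g. https://example.com/stream/segment0.ts
	}
}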
From 83ddefbb75ff2cbf72111d2903fd1b313386eea8 Mon Sep 17 00:00:00 2001
From: Corentin Barreau
Date: Tue, 10 Sep 2024 00:13:25 +0200
Subject: [PATCH 07/12] [site/yt] fix test

---
 internal/pkg/crawl/crawl.go                             | 2 +-
 internal/pkg/crawl/sitespecific/youtube/youtube_test.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go
index 2420db8e..a7549136 100644
--- a/internal/pkg/crawl/crawl.go
+++ b/internal/pkg/crawl/crawl.go
@@ -125,7 +125,7 @@ func (c *Crawl) Start() (err error) {
 		c.Client.Timeout = time.Duration(c.HTTPTimeout) * time.Second
 		c.Log.Info("Global HTTP client timeout set", "timeout", c.HTTPTimeout)
 	} else {
-		c.Log.Info("Global HTTP client timeout not set")
+		c.Log.Info("Global HTTP client timeout not set (defaulting to infinite)")
 	}
 
 	if c.Proxy != "" {
diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go
index a881c23b..b5a4cffb 100644
--- a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go
+++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go
@@ -25,7 +25,7 @@ func TestParse(t *testing.T) {
 	}
 
 	// Check the number of URLs
-	expected := 146
+	expected := 148
 	if len(URLs) != expected {
 		t.Fatalf("Expected %d URLs, got %d", expected, len(URLs))
 	}

From 91c4b852326678a5f39778f24169465632e690ab Mon Sep 17 00:00:00 2001
From: Corentin Barreau
Date: Tue, 10 Sep 2024 00:17:16 +0200
Subject: [PATCH 08/12] ytdlp: remove useless subtitles parsing function

---
 .../pkg/crawl/dependencies/ytdlp/model.go     | 11 ++++-
 .../pkg/crawl/dependencies/ytdlp/parse.go     | 42 -------------------
 .../pkg/crawl/dependencies/ytdlp/ytdlp.go     | 14 ++++---
 3 files changed, 17 insertions(+), 50 deletions(-)
 delete mode 100644 internal/pkg/crawl/dependencies/ytdlp/parse.go

diff --git a/internal/pkg/crawl/dependencies/ytdlp/model.go b/internal/pkg/crawl/dependencies/ytdlp/model.go
index 5e6b5075..ea7d892d 100644
--- a/internal/pkg/crawl/dependencies/ytdlp/model.go
+++ b/internal/pkg/crawl/dependencies/ytdlp/model.go
@@ -1,8 +1,15 @@
 package ytdlp
 
+type Subtitle struct {
+	Ext  string `json:"ext"`
+	URL  string `json:"url"`
+	Name string `json:"name"`
+}
+
 type Video struct {
-	ID     string `json:"id"`
-	IsLive bool   `json:"is_live"`
+	ID        string                `json:"id"`
+	IsLive    bool                  `json:"is_live"`
+	Subtitles map[string][]Subtitle `json:"subtitles"`
 	RequestedFormats []struct {
 		Acodec             string  `json:"acodec"`
 		AspectRatio        float64 `json:"aspect_ratio"`
diff --git a/internal/pkg/crawl/dependencies/ytdlp/parse.go b/internal/pkg/crawl/dependencies/ytdlp/parse.go
deleted file mode 100644
index ab49b680..00000000
--- a/internal/pkg/crawl/dependencies/ytdlp/parse.go
+++ /dev/null
@@ -1,42 +0,0 @@
-package ytdlp
-
-import (
-	"encoding/json"
-	"fmt"
-)
-
-type SubtitleInfo struct {
-	Ext  string `json:"ext"`
-	URL  string `json:"url"`
-	Name string `json:"name"`
-}
-
-// parseSubtitles parses the subtitles from the yt-dlp JSON output,
-// it's needed because the subtitles are not given as a proper array or objects
-func parseSubtitles(jsonData string) ([]string, error) {
-	var data map[string]json.RawMessage
-	err := json.Unmarshal([]byte(jsonData), &data)
-	if err != nil {
-		return nil, fmt.Errorf("error unmarshaling outer JSON: %v", err)
-	}
-
-	subtitlesRaw, ok := data["subtitles"]
-	if !ok {
-		return nil, nil
-	}
-
-	var subtitles map[string][]SubtitleInfo
-	err =
json.Unmarshal(subtitlesRaw, &subtitles) - if err != nil { - return nil, fmt.Errorf("error unmarshaling subtitles JSON: %v", err) - } - - var URLs []string - for _, langSubtitles := range subtitles { - for _, subtitle := range langSubtitles { - URLs = append(URLs, subtitle.URL) - } - } - - return URLs, nil -} diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go index 23dad79d..8cb673f4 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -25,12 +25,6 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, output := stdout.String() - // Find subtitles - subtitleURLs, err := parseSubtitles(output) - if err != nil { - return nil, rawJSON, HTTPHeaders, fmt.Errorf("error parsing subtitles: %v", err) - } - // Parse the output as a Video object var video Video err = json.Unmarshal([]byte(output), &video) @@ -38,6 +32,14 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, return nil, rawJSON, HTTPHeaders, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err) } + // Get all subtitles (not automatic captions) + var subtitleURLs []string + for _, subtitle := range video.Subtitles { + for _, sub := range subtitle { + subtitleURLs = append(subtitleURLs, sub.URL) + } + } + // Get all thumbnail URLs for _, thumbnail := range video.Thumbnails { URLs = append(URLs, thumbnail.URL) From 21847cee5f9a576666168a6d4b30aafc3eb05ea2 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 10 Sep 2024 00:34:08 +0200 Subject: [PATCH 09/12] m3u8: handle content-type case insensitively --- internal/pkg/crawl/extractor/m3u8.go | 5 ++--- internal/pkg/crawl/extractor/utils.go | 9 +++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/internal/pkg/crawl/extractor/m3u8.go b/internal/pkg/crawl/extractor/m3u8.go index 8ba93bae..a3b3ee32 100644 --- a/internal/pkg/crawl/extractor/m3u8.go +++ b/internal/pkg/crawl/extractor/m3u8.go @@ -3,14 +3,13 @@ package extractor import ( "net/http" "net/url" - "strings" "github.com/grafov/m3u8" ) func IsM3U8(resp *http.Response) bool { - return strings.Contains(resp.Header.Get("Content-Type"), "application/vnd.apple.mpegurl") || - strings.Contains(resp.Header.Get("Content-Type"), "application/x-mpegURL") + return isContentType(resp.Header.Get("Content-Type"), "application/vnd.apple.mpegurl") || + isContentType(resp.Header.Get("Content-Type"), "application/x-mpegURL") } func M3U8(resp *http.Response) (URLs []*url.URL, err error) { diff --git a/internal/pkg/crawl/extractor/utils.go b/internal/pkg/crawl/extractor/utils.go index 3d8ee94d..bf01e1d8 100644 --- a/internal/pkg/crawl/extractor/utils.go +++ b/internal/pkg/crawl/extractor/utils.go @@ -3,8 +3,17 @@ package extractor import ( "net/url" "sort" + "strings" ) +func isContentType(header, targetContentType string) bool { + // Lowercase the header and target content type for case-insensitive comparison + header = strings.ToLower(header) + targetContentType = strings.ToLower(targetContentType) + + return strings.Contains(header, targetContentType) +} + // compareURLs compares two slices of *url.URL func compareURLs(a, b []*url.URL) bool { if len(a) != len(b) { From 59c402fb95ab4e9d558e616fca34725b030b3d62 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Tue, 10 Sep 2024 02:05:22 +0200 Subject: [PATCH 10/12] chore: small refactoring --- internal/pkg/crawl/capture.go | 19 ++----------------- .../pkg/crawl/sitespecific/youtube/youtube.go | 5 +++++ 
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go
index c7aefcf0..504edd80 100644
--- a/internal/pkg/crawl/capture.go
+++ b/internal/pkg/crawl/capture.go
@@ -342,7 +342,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
 
 	// If it was a YouTube watch page, we potentially want to run it through the YouTube extractor
 	// TODO: support other watch page URLs
-	if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP {
+	if !c.NoYTDLP && youtube.IsYouTubeWatchPage(item.URL) {
 		URLs, rawJSON, HTTPHeaders, err := youtube.Parse(resp.Body)
 		if err != nil {
 			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page")
@@ -350,21 +350,6 @@ func (c *Crawl) Capture(item *queue.Item) error {
 		}
 		resp.Body.Close()
 
-		// Build the cookies
-		// cookies := append([]*http.Cookie{}, &http.Cookie{
-		// 	Name:  "Accept",
-		// 	Value: HTTPHeaders.Accept,
-		// }, &http.Cookie{
-		// 	Name:  "Accept-Language",
-		// 	Value: HTTPHeaders.AcceptLanguage,
-		// }, &http.Cookie{
-		// 	Name:  "Sec-Fetch-Mode",
-		// 	Value: HTTPHeaders.SecFetchMode,
-		// }, &http.Cookie{
-		// 	Name:  "User-Agent",
-		// 	Value: HTTPHeaders.UserAgent,
-		// })
-
 		var headers = make(map[string]string)
 		headers["Accept"] = HTTPHeaders.Accept
 		headers["Accept-Language"] = HTTPHeaders.AcceptLanguage
@@ -377,7 +362,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
 
 	// Write the metadata record for the video
 	if rawJSON != "" {
-		c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dl", rawJSON)
+		c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON)
 	}
 
 	return nil
diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube.go b/internal/pkg/crawl/sitespecific/youtube/youtube.go
index 643ae052..ab5059db 100644
--- a/internal/pkg/crawl/sitespecific/youtube/youtube.go
+++ b/internal/pkg/crawl/sitespecific/youtube/youtube.go
@@ -3,10 +3,15 @@ package youtube
 import (
 	"io"
 	"net/url"
+	"strings"
 
 	"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
 )
 
+func IsYouTubeWatchPage(URL *url.URL) bool {
+	return strings.Contains(URL.Host, "youtube.com") && (strings.Contains(URL.Path, "/watch") || strings.Contains(URL.Path, "/v/"))
+}
+
 func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, HTTPHeaders ytdlp.HTTPHeaders, err error) {
 	// Create a temporary server to serve the body and call ytdlp on it
 	port, stopChan, err := ytdlp.ServeBody(body)
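IsYouTubeWatchPage added above keys off the host plus the /watch and /v/ path forms. An illustrative table test for the predicate (hypothetical, not part of this series):

package youtube

import (
	"net/url"
	"testing"
)

func TestIsYouTubeWatchPage(t *testing.T) {
	cases := []struct {
		raw  string
		want bool
	}{
		{"https://www.youtube.com/watch?v=jNQXAC9IVRw", true},
		{"https://youtube.com/v/jNQXAC9IVRw", true},
		{"https://www.youtube.com/playlist?list=PL0", false},
		{"https://example.com/watch", false},
	}

	for _, c := range cases {
		URL, err := url.Parse(c.raw)
		if err != nil {
			t.Fatal(err)
		}
		if got := IsYouTubeWatchPage(URL); got != c.want {
			t.Errorf("IsYouTubeWatchPage(%q) = %v, want %v", c.raw, got, c.want)
		}
	}
}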
From 5178cb3756d6be83f7b6a1f103c296f11946196a Mon Sep 17 00:00:00 2001
From: Corentin Barreau
Date: Tue, 10 Sep 2024 02:10:10 +0200
Subject: [PATCH 11/12] ytdlp: add dubbed audio streams

---
 internal/pkg/crawl/dependencies/ytdlp/ytdlp.go          | 8 ++++++++
 internal/pkg/crawl/sitespecific/youtube/youtube_test.go | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
index 8cb673f4..fd0a6414 100644
--- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
+++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"os/exec"
 	"strconv"
+	"strings"
 )
 
 func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, err error) {
@@ -56,6 +57,13 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders,
 		}
 	}
 
+	// Get all dubbed audio URLs
+	for _, audio := range video.Formats {
+		if strings.Contains(audio.FormatNote, "dubbed") {
+			URLs = append(URLs, audio.URL, audio.URL+"&video_id="+video.ID)
+		}
+	}
+
 	// write output to a .json file (debug)
 	// err = ioutil.WriteFile("output.json", []byte(output), 0644)
 	// if err != nil {
diff --git a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go
index b5a4cffb..5a86ade0 100644
--- a/internal/pkg/crawl/sitespecific/youtube/youtube_test.go
+++ b/internal/pkg/crawl/sitespecific/youtube/youtube_test.go
@@ -25,7 +25,7 @@ func TestParse(t *testing.T) {
 	}
 
 	// Check the number of URLs
-	expected := 148
+	expected := 204
 	if len(URLs) != expected {
 		t.Fatalf("Expected %d URLs, got %d", expected, len(URLs))
 	}

From d2cdb3cc4806b8cef38744d40f5317b7f3483cc5 Mon Sep 17 00:00:00 2001
From: Corentin Barreau
Date: Thu, 12 Sep 2024 03:51:11 +0200
Subject: [PATCH 12/12] ytdlp: format selection & refactoring

---
 internal/pkg/crawl/assets.go                  | 27 +++++++-----
 internal/pkg/crawl/capture.go                 | 12 ++---
 .../pkg/crawl/dependencies/ytdlp/ytdlp.go     | 22 ++++------
 internal/pkg/crawl/exclusion.go               | 44 +++++++++++++++++++
 internal/pkg/crawl/outlinks.go                | 34 ++-------------
 internal/pkg/crawl/utils.go                   | 21 ---------
 internal/pkg/crawl/worker.go                  |  2 +-
 7 files changed, 80 insertions(+), 82 deletions(-)
 create mode 100644 internal/pkg/crawl/exclusion.go

diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go
index 408c6511..a605215e 100644
--- a/internal/pkg/crawl/assets.go
+++ b/internal/pkg/crawl/assets.go
@@ -315,7 +315,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
 			if err != nil {
 				c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
 			} else {
-				rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
+				rawAssets = append(rawAssets, URLsFromJSON...)
 			}
 		}
 	}
@@ -391,21 +391,26 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
 	// Turn strings into url.URL
 	assets = append(assets, utils.StringSliceToURLSlice(rawAssets)...)
 
-	// Ensure that excluded hosts aren't in the assets.
- assets = c.excludeHosts(assets) - - // Go over all assets and outlinks and make sure they are absolute links - assets = utils.MakeAbsolute(base, assets) + // Ensure that no asset that would be excluded is added to the list, + // remove all fragments, and make sure that all assets are absolute URLs + assets = c.cleanURLs(base, assets) return utils.DedupeURLs(assets), nil } -func removeGoogleVideoURLs(input []string) (output []string) { - for _, i := range input { - if !strings.Contains(i, "googlevideo.com") { - output = append(output, i) +func (c *Crawl) cleanURLs(base *url.URL, URLs []*url.URL) (output []*url.URL) { + // Remove excluded URLs + for _, URL := range URLs { + if !c.isExcluded(URL) { + output = append(output, URL) } } - return output + // Make all URLs absolute + if base != nil { + output = utils.MakeAbsolute(base, output) + } + + // Remove fragments + return utils.RemoveFragments(output) } diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go index 504edd80..31be9f1e 100644 --- a/internal/pkg/crawl/capture.go +++ b/internal/pkg/crawl/capture.go @@ -350,6 +350,11 @@ func (c *Crawl) Capture(item *queue.Item) error { } resp.Body.Close() + // Write the metadata record for the video + if rawJSON != "" { + c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON) + } + var headers = make(map[string]string) headers["Accept"] = HTTPHeaders.Accept headers["Accept-Language"] = HTTPHeaders.AcceptLanguage @@ -360,11 +365,6 @@ func (c *Crawl) Capture(item *queue.Item) error { c.captureAssets(item, URLs, resp.Cookies(), headers) } - // Write the metadata record for the video - if rawJSON != "" { - c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dlp", rawJSON) - } - return nil } @@ -484,7 +484,7 @@ func (c *Crawl) Capture(item *queue.Item) error { } // Extract outlinks - outlinks, err := extractOutlinks(base, doc) + outlinks, err := c.extractOutlinks(base, doc) if err != nil { c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks") return err diff --git a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go index fd0a6414..0a4f5fbe 100644 --- a/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go +++ b/internal/pkg/crawl/dependencies/ytdlp/ytdlp.go @@ -6,12 +6,11 @@ import ( "fmt" "os/exec" "strconv" - "strings" ) func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, err error) { // Prepare the command - cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port)) + cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port), "-f", "bv[protocol=https]+ba[protocol=https]") // Buffers to capture stdout and stderr var stdout, stderr bytes.Buffer @@ -52,24 +51,21 @@ func GetJSON(port int) (URLs []string, rawJSON string, HTTPHeaders HTTPHeaders, if len(video.RequestedFormats) > 0 { HTTPHeaders = video.RequestedFormats[0].HTTPHeaders for _, format := range video.RequestedFormats { - URLs = append(URLs, format.URL, format.URL+"&video_id="+video.ID) + URLs = append(URLs, format.URL+"&video_id="+video.ID) } } } - // Get all dubbed audio URLs - for _, audio := range video.Formats { - if strings.Contains(audio.FormatNote, "dubbed") { - URLs = append(URLs, audio.URL, audio.URL+"&video_id="+video.ID) + // Get the storyboards + for _, format := range video.Formats { + if format.FormatNote == "storyboard" { + URLs = append(URLs, format.URL) + 
for _, fragment := range format.Fragments {
+				URLs = append(URLs, fragment.URL)
+			}
 		}
 	}
 
-	// write output to a .json file (debug)
-	// err = ioutil.WriteFile("output.json", []byte(output), 0644)
-	// if err != nil {
-	// 	return nil, rawJSON, HTTPHeaders, fmt.Errorf("error writing output.json: %v", err)
-	// }
-
 	URLs = append(URLs, subtitleURLs...)
 
 	return URLs, output, HTTPHeaders, nil
diff --git a/internal/pkg/crawl/exclusion.go b/internal/pkg/crawl/exclusion.go
new file mode 100644
index 00000000..192a19f6
--- /dev/null
+++ b/internal/pkg/crawl/exclusion.go
@@ -0,0 +1,44 @@
+package crawl
+
+import (
+	"net/url"
+	"strings"
+
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
+)
+
+func (c *Crawl) isExcluded(URL *url.URL) bool {
+	// If Zeno is run with the --include-host flag,
+	// only URLs from the included hosts are crawled
+	if !c.isHostIncluded(URL) {
+		return true
+	}
+
+	// Verify if the URL is excluded by the host
+	// (--exclude-host flag)
+	if c.isHostExcluded(URL) {
+		return true
+	}
+
+	// Verify if the URL is excluded by the --exclude-string flag
+	for _, excludedString := range c.ExcludedStrings {
+		if strings.Contains(utils.URLToString(URL), excludedString) {
+			return true
+		}
+	}
+
+	return false
+}
+
+func (c *Crawl) isHostExcluded(URL *url.URL) bool {
+	return utils.StringInSlice(URL.Host, c.ExcludedHosts)
+}
+
+func (c *Crawl) isHostIncluded(URL *url.URL) bool {
+	// If no hosts are included, all hosts are included
+	if len(c.IncludedHosts) == 0 {
+		return true
+	}
+
+	return utils.StringInSlice(URL.Host, c.IncludedHosts)
+}
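The isExcluded predicate above centralizes checks that the rest of this commit strips out of outlinks.go, utils.go, and assets.go. A hypothetical test of its intended semantics (not part of the patch), assuming the Crawl fields used elsewhere in the series:

package crawl

import (
	"net/url"
	"testing"
)

func TestIsExcluded(t *testing.T) {
	c := &Crawl{
		IncludedHosts:   []string{"example.com"},
		ExcludedStrings: []string{"/ads/"},
	}

	included, _ := url.Parse("https://example.com/page")
	outside, _ := url.Parse("https://other.org/page")
	banner, _ := url.Parse("https://example.com/ads/banner.js")

	// An included host passes through
	if c.isExcluded(included) {
		t.Error("expected example.com URL to be kept")
	}

	// With an inclusion list set, every other host is excluded
	if !c.isExcluded(outside) {
		t.Error("expected other.org URL to be excluded")
	}

	// Excluded strings still apply within included hosts
	if !c.isExcluded(banner) {
		t.Error("expected /ads/ URL to be excluded")
	}
}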
diff --git a/internal/pkg/crawl/outlinks.go b/internal/pkg/crawl/outlinks.go
index b838b803..e66c02fa 100644
--- a/internal/pkg/crawl/outlinks.go
+++ b/internal/pkg/crawl/outlinks.go
@@ -10,7 +10,7 @@ import (
 	"github.com/internetarchive/Zeno/internal/pkg/utils"
 )
 
-func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, err error) {
+func (c *Crawl) extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL, err error) {
 	var rawOutlinks []string
 
 	// Extract outlinks
@@ -43,11 +43,9 @@ func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL,
 	textOutlinks := extractLinksFromText(doc.Find("body").RemoveFiltered("script").Text())
 	outlinks = append(outlinks, textOutlinks...)
 
-	// Go over all outlinks and make sure they are absolute links
-	outlinks = utils.MakeAbsolute(base, outlinks)
-
-	// Hash (or fragment) URLs are navigational links pointing to the exact same page as such, they should not be treated as new outlinks.
-	outlinks = utils.RemoveFragments(outlinks)
+	// Ensure that no outlink that would be excluded is added to the list,
+	// remove all fragments, and make sure that all outlinks are absolute URLs
+	outlinks = c.cleanURLs(base, outlinks)
 
 	return utils.DedupeURLs(outlinks), nil
 }
@@ -55,33 +53,9 @@ func extractOutlinks(base *url.URL, doc *goquery.Document) (outlinks []*url.URL,
 func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *queue.Item, wg *sync.WaitGroup) {
 	defer wg.Done()
 
-	var excluded bool
-
 	// Send the outlinks to the pool of workers
 	var items = make([]*queue.Item, 0, len(outlinks))
 	for _, outlink := range outlinks {
-		outlink := outlink
-
-		// If the host of the outlink is in the host exclusion list, or the host is not in the host inclusion list
-		// if one is specified, we ignore the outlink
-		if utils.StringInSlice(outlink.Host, c.ExcludedHosts) || !c.checkIncludedHosts(outlink.Host) {
-			continue
-		}
-
-		// If the outlink match any excluded string, we ignore it
-		for _, excludedString := range c.ExcludedStrings {
-			if strings.Contains(utils.URLToString(outlink), excludedString) {
-				excluded = true
-				break
-			}
-		}
-
-		if excluded {
-			excluded = false
-			continue
-		}
-
-		// Seencheck the outlink
 		if c.UseSeencheck {
 			if c.Seencheck.SeencheckURL(utils.URLToString(outlink), "seed") {
 				continue
diff --git a/internal/pkg/crawl/utils.go b/internal/pkg/crawl/utils.go
index da9ccbe4..d59434cd 100644
--- a/internal/pkg/crawl/utils.go
+++ b/internal/pkg/crawl/utils.go
@@ -39,15 +39,6 @@ func (c *Crawl) crawlSpeedLimiter() {
 	}
 }
 
-func (c *Crawl) checkIncludedHosts(host string) bool {
-	// If no hosts are included, all hosts are included
-	if len(c.IncludedHosts) == 0 {
-		return true
-	}
-
-	return utils.StringInSlice(host, c.IncludedHosts)
-}
-
 func (c *Crawl) handleCrawlPause() {
 	for {
 		spaceLeft := float64(utils.GetFreeDiskSpace(c.JobPath).Avail) / float64(GB)
@@ -65,18 +56,6 @@ func (c *Crawl) handleCrawlPause() {
 	}
 }
 
-func (c *Crawl) excludeHosts(URLs []*url.URL) (output []*url.URL) {
-	for _, URL := range URLs {
-		if utils.StringInSlice(URL.Host, c.ExcludedHosts) || !c.checkIncludedHosts(URL.Host) {
-			continue
-		} else {
-			output = append(output, URL)
-		}
-	}
-
-	return output
-}
-
 func extractLinksFromText(source string) (links []*url.URL) {
 	// Extract links and dedupe them
 	rawLinks := utils.DedupeStrings(regexOutlinks.FindAllString(source, -1))
diff --git a/internal/pkg/crawl/worker.go b/internal/pkg/crawl/worker.go
index b1906f5c..2c790adf 100644
--- a/internal/pkg/crawl/worker.go
+++ b/internal/pkg/crawl/worker.go
@@ -114,7 +114,7 @@ func (w *Worker) Run() {
 			w.state.lastAction = "got item"
 
 			// If the host of the item is in the host exclusion list, we skip it
-			if utils.StringInSlice(item.URL.Host, w.pool.Crawl.ExcludedHosts) || !w.pool.Crawl.checkIncludedHosts(item.URL.Host) {
+			if utils.StringInSlice(item.URL.Host, w.pool.Crawl.ExcludedHosts) || !w.pool.Crawl.isHostIncluded(item.URL) {
				if w.pool.Crawl.UseHQ {
					w.state.lastAction = "skipping item because of host exclusion"
					// If we are using the HQ, we want to mark the item as done
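End to end, the series makes Zeno shell out to yt-dlp for watch pages: serve the page body locally, run yt-dlp --dump-json against it with the HTTPS-only format selection, and queue every URL found in the metadata. A self-contained sketch of that invocation follows; a public video URL stands in for the crawler's local proxy server, and the struct keeps only the fields this sketch needs.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"os/exec"
)

// video mirrors a small slice of yt-dlp's --dump-json output.
type video struct {
	ID               string `json:"id"`
	RequestedFormats []struct {
		URL string `json:"url"`
	} `json:"requested_formats"`
}

func main() {
	// Same flags as GetJSON: dump metadata as JSON, prefer HTTPS-only
	// best-video + best-audio formats
	cmd := exec.Command("yt-dlp", "--dump-json",
		"-f", "bv[protocol=https]+ba[protocol=https]",
		"https://www.youtube.com/watch?v=jNQXAC9IVRw")

	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		fmt.Printf("yt-dlp error: %v: %s\n", err, stderr.String())
		return
	}

	var v video
	if err := json.Unmarshal(stdout.Bytes(), &v); err != nil {
		fmt.Printf("unmarshal error: %v\n", err)
		return
	}

	// The direct media URLs that the crawler would queue as assets
	for _, format := range v.RequestedFormats {
		fmt.Println(format.URL)
	}
}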