[site/yt] add: format selection & metadata record
CorentinB committed Sep 8, 2024
1 parent eea0270 commit 6fa843a
Showing 5 changed files with 190 additions and 19 deletions.
108 changes: 108 additions & 0 deletions internal/pkg/crawl/assets.go
@@ -1,20 +1,128 @@
package crawl

import (
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync/atomic"

	"github.com/PuerkitoBio/goquery"
	"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
	"github.com/internetarchive/Zeno/internal/pkg/queue"
	"github.com/internetarchive/Zeno/internal/pkg/utils"
	"github.com/remeh/sizedwaitgroup"
)

var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)

func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie) error {
	var resp *http.Response

	// Prepare GET request
	req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
	if err != nil {
		return err
	}

	req.Header.Set("Referer", utils.URLToString(item.ParentURL))
	req.Header.Set("User-Agent", c.UserAgent)

	// Apply the cookies obtained from the originally captured URL
	for i := range cookies {
		req.AddCookie(cookies[i])
	}

	resp, err = c.executeGET(item, req, false)
	if err != nil && err.Error() == "URL from redirection has already been seen" {
		return nil
	} else if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Consume the body entirely so the complete response gets written to the WARC
	io.Copy(io.Discard, resp.Body)

	return nil
}

func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie) {
	// TODO: implement a counter for the number of assets
	// currently being processed
	// c.Frontier.QueueCount.Incr(int64(len(assets)))
	swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets))
	excluded := false

	for _, asset := range assets {
		// TODO: implement a counter for the number of assets
		// currently being processed
		// c.Frontier.QueueCount.Incr(-1)

		// Make sure we don't over-archive by re-capturing the original URL
		if utils.URLToString(item.URL) == utils.URLToString(asset) {
			continue
		}

		// We ban googlevideo.com URLs because they are heavily rate limited by default, and
		// we don't want the crawler to spend an inappropriate amount of time archiving them
		if strings.Contains(asset.Host, "googlevideo.com") {
			continue
		}

		// If the URL matches any excluded string, we ignore it
		for _, excludedString := range c.ExcludedStrings {
			if strings.Contains(utils.URLToString(asset), excludedString) {
				excluded = true
				break
			}
		}

		if excluded {
			excluded = false
			continue
		}

		swg.Add()
		c.URIsPerSecond.Incr(1)

		go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) {
			defer swg.Done()

			// Create the asset's item
			newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false)
			if err != nil {
				c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{
					"parentHop": item.Hop,
					"parentUrl": utils.URLToString(item.URL),
					"type":      "asset",
				})).Error("error while creating asset item")
				return
			}

			// Capture the asset
			err = c.captureAsset(newAsset, cookies)
			if err != nil {
				c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{
					"parentHop": item.Hop,
					"parentUrl": utils.URLToString(item.URL),
					"type":      "asset",
				})).Error("error while capturing asset")
				return
			}

			// If we made it to this point, the asset has been crawled successfully,
			// so we can increment the LocallyCrawled counter
			atomic.AddUint64(&item.LocallyCrawled, 1)
		}(asset, &swg)
	}

	swg.Wait()
}

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
	var rawAssets []string
	var URL = utils.URLToString(item.URL)
…
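Note: captureAssets bounds its concurrency with github.com/remeh/sizedwaitgroup instead of a bare sync.WaitGroup. A minimal, standalone sketch of that pattern (the limit of 3 and the print stub are placeholders, not Zeno code):

```go
package main

import (
	"fmt"

	"github.com/remeh/sizedwaitgroup"
)

func main() {
	// At most 3 goroutines in flight, standing in for c.MaxConcurrentAssets.
	swg := sizedwaitgroup.New(3)

	for i := 0; i < 10; i++ {
		swg.Add() // blocks until a slot frees up
		go func(n int) {
			defer swg.Done()
			fmt.Println("capturing asset", n) // stand-in for c.captureAsset
		}(i)
	}

	// Wait for all in-flight captures, like the end of captureAssets.
	swg.Wait()
}
```

Because Add() blocks once the limit is reached, the enqueueing loop itself applies backpressure; no separate semaphore is needed.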
7 changes: 6 additions & 1 deletion internal/pkg/crawl/capture.go
@@ -343,7 +343,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
	// If it was a YouTube watch page, we potentially want to run it through the YouTube extractor
	// TODO: support other watch page URLs
	if strings.Contains(item.URL.Host, "youtube.com") && strings.Contains(item.URL.Path, "/watch") && !c.NoYTDLP {
		URLs, err := youtube.Parse(resp.Body)
		URLs, rawJSON, err := youtube.Parse(resp.Body)
		if err != nil {
			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing YouTube watch page")
			return err
@@ -353,6 +353,11 @@
		c.captureAssets(item, URLs, resp.Cookies())
	}

	// Write the metadata record for the video
	if rawJSON != "" {
		c.Client.WriteMetadataRecord(utils.URLToString(item.URL), "application/json;generator=youtube-dl", rawJSON)
	}

	return nil
}

…
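Note: the behavioral addition in Capture is that the raw yt-dlp JSON is now persisted as a WARC metadata record next to the page capture. A self-contained sketch of the call shape (the interface and stub below are illustrative only; the real writer is Zeno's WARC client, with the signature assumed from the call in the diff):

```go
package main

import "fmt"

// metadataWriter mirrors the call shape used in capture.go above; the real
// implementation lives in Zeno's WARC client (assumption, not shown here).
type metadataWriter interface {
	WriteMetadataRecord(targetURI, contentType, payload string)
}

type stubClient struct{}

func (stubClient) WriteMetadataRecord(targetURI, contentType, payload string) {
	fmt.Printf("metadata record for %s (%s): %d bytes\n", targetURI, contentType, len(payload))
}

func main() {
	var client metadataWriter = stubClient{}

	// Stand-in for the JSON returned by youtube.Parse.
	rawJSON := `{"id":"dQw4w9WgXcQ","is_live":false}`

	if rawJSON != "" {
		client.WriteMetadataRecord(
			"https://www.youtube.com/watch?v=dQw4w9WgXcQ", // WARC-Target-URI
			"application/json;generator=youtube-dl",       // record content type
			rawJSON, // record payload
		)
	}
}
```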
50 changes: 46 additions & 4 deletions internal/pkg/crawl/dependencies/ytdlp/model.go
@@ -1,10 +1,52 @@
package ytdlp

type Video struct {
	IsLive bool `json:"is_live"`
	RequestedFormats []struct {
		URL string `json:"url"`
	} `json:"requested_formats"`
	ID      string `json:"id"`
	IsLive  bool   `json:"is_live"`
	Formats []struct {
		Acodec      string  `json:"acodec"`
		AspectRatio float64 `json:"aspect_ratio"`
		AudioExt    string  `json:"audio_ext"`
		Columns     float64 `json:"columns,omitempty"`
		Ext         string  `json:"ext"`
		Format      string  `json:"format"`
		FormatID    string  `json:"format_id"`
		FormatNote  string  `json:"format_note"`
		Fps         float64 `json:"fps"`
		Fragments   []struct {
			Duration float64 `json:"duration"`
			URL      string  `json:"url"`
		} `json:"fragments,omitempty"`
		Height      float64 `json:"height"`
		HTTPHeaders struct {
			Accept         string `json:"Accept"`
			AcceptLanguage string `json:"Accept-Language"`
			SecFetchMode   string `json:"Sec-Fetch-Mode"`
			UserAgent      string `json:"User-Agent"`
		} `json:"http_headers"`
		Protocol           string      `json:"protocol"`
		Resolution         string      `json:"resolution"`
		Rows               float64     `json:"rows,omitempty"`
		URL                string      `json:"url"`
		Vcodec             string      `json:"vcodec"`
		VideoExt           string      `json:"video_ext"`
		Width              float64     `json:"width"`
		Abr                float64     `json:"abr,omitempty"`
		Asr                float64     `json:"asr,omitempty"`
		AudioChannels      float64     `json:"audio_channels,omitempty"`
		Container          string      `json:"container,omitempty"`
		DynamicRange       interface{} `json:"dynamic_range,omitempty"`
		Filesize           float64     `json:"filesize,omitempty"`
		HasDrm             bool        `json:"has_drm,omitempty"`
		Language           string      `json:"language,omitempty"`
		LanguagePreference float64     `json:"language_preference,omitempty"`
		Preference         interface{} `json:"preference,omitempty"`
		Quality            float64     `json:"quality,omitempty"`
		SourcePreference   float64     `json:"source_preference,omitempty"`
		Tbr                float64     `json:"tbr,omitempty"`
		Vbr                float64     `json:"vbr,omitempty"`
		FilesizeApprox     float64     `json:"filesize_approx,omitempty"`
	} `json:"formats"`
	Thumbnails []struct {
		URL string `json:"url"`
	} `json:"thumbnails"`
…
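Note: the expanded Video struct mirrors the JSON emitted by `yt-dlp --dump-json`. A trimmed, runnable sketch of decoding such output (the JSON fragment is fabricated for illustration):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of the ytdlp.Video model above, just enough for the demo.
type Video struct {
	ID      string `json:"id"`
	IsLive  bool   `json:"is_live"`
	Formats []struct {
		FormatID string  `json:"format_id"`
		Protocol string  `json:"protocol"`
		Quality  float64 `json:"quality"`
		URL      string  `json:"url"`
	} `json:"formats"`
}

func main() {
	// Hypothetical fragment of `yt-dlp --dump-json` output.
	raw := `{"id":"abc123","is_live":false,"formats":[{"format_id":"18","protocol":"https","quality":2,"url":"https://example.com/v.mp4"}]}`

	var v Video
	if err := json.Unmarshal([]byte(raw), &v); err != nil {
		panic(err)
	}
	fmt.Println(v.ID, v.Formats[0].FormatID, v.Formats[0].URL)
}
```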
32 changes: 24 additions & 8 deletions internal/pkg/crawl/dependencies/ytdlp/ytdlp.go
@@ -6,11 +6,12 @@ import (
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)

func GetJSON(port int) (URLs []string, err error) {
func GetJSON(port int) (URLs []string, rawJSON string, err error) {
	// Prepare the command
	cmd := exec.Command("yt-dlp", "--dump-json", "http://localhost:"+strconv.Itoa(port))
	cmd := exec.Command("yt-dlp", "--dump-json", "-f", "18", "http://localhost:"+strconv.Itoa(port))

	// Buffers to capture stdout and stderr
	var stdout, stderr bytes.Buffer
@@ -20,22 +21,22 @@
	// Run the command
	err = cmd.Run()
	if err != nil {
		return URLs, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
		return URLs, rawJSON, fmt.Errorf("yt-dlp error: %v\nstderr: %s", err, stderr.String())
	}

	output := stdout.String()

	// Find subtitles
	subtitleURLs, err := parseSubtitles(output)
	if err != nil {
		return nil, err
		return nil, rawJSON, fmt.Errorf("error parsing subtitles: %v", err)
	}

	// Parse the output as a Video object
	var video Video
	err = json.Unmarshal([]byte(output), &video)
	if err != nil {
		return nil, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
		return nil, rawJSON, fmt.Errorf("error unmarshaling yt-dlp JSON: %v", err)
	}

	// Get all thumbnail URLs
@@ -46,14 +47,29 @@
	// Get the manifest URL for the best video & audio quality
	// Note: we do not archive live streams
	if !video.IsLive {
		for format := range video.RequestedFormats {
			URLs = append(URLs, video.RequestedFormats[format].URL)
		// Find the best format for the video among the formats that
		// use the "https" protocol, don't contain "only" in their name (to avoid
		// audio-only or video-only formats), and don't contain "_dash" in their
		// container (to avoid DASH formats)
		var bestFormatQuality float64
		var bestFormatPosition int
		for i, format := range video.Formats {
			if (bestFormatQuality == 0 || format.Quality > bestFormatQuality) &&
				format.Protocol == "https" &&
				!strings.Contains(format.Format, "only") &&
				!strings.Contains(format.Container, "_dash") {
				bestFormatQuality = format.Quality
				bestFormatPosition = i
			}
		}

		URLs = append(URLs,
			video.Formats[bestFormatPosition].URL+"&video_id="+video.ID,
			video.Formats[bestFormatPosition].URL)
	}

	URLs = append(URLs, subtitleURLs...)

	return URLs, nil
	return URLs, output, nil
}

func FindPath() (string, bool) {
…
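Note: the new selection loop replaces reliance on yt-dlp's requested_formats: it scans the full formats list for the highest-quality progressive HTTPS entry, and the command now also passes -f 18 (historically YouTube's 360p progressive MP4, an assumption about intent). A standalone sketch of the same selection logic on fabricated data:

```go
package main

import (
	"fmt"
	"strings"
)

type format struct {
	Format    string
	Container string
	Protocol  string
	Quality   float64
	URL       string
}

// bestFormat mirrors the selection loop in GetJSON: highest Quality among
// https formats that are neither audio/video-only nor DASH.
func bestFormat(formats []format) int {
	var bestQuality float64
	var best int
	for i, f := range formats {
		if (bestQuality == 0 || f.Quality > bestQuality) &&
			f.Protocol == "https" &&
			!strings.Contains(f.Format, "only") &&
			!strings.Contains(f.Container, "_dash") {
			bestQuality = f.Quality
			best = i
		}
	}
	return best
}

func main() {
	formats := []format{
		{Format: "249 - audio only", Protocol: "https", Quality: 1, URL: "https://example.com/a"},
		{Format: "18 - 360p", Container: "mp4", Protocol: "https", Quality: 2, URL: "https://example.com/b"},
		{Format: "137 - 1080p", Container: "mp4_dash", Protocol: "https", Quality: 9, URL: "https://example.com/c"},
	}
	// Prints https://example.com/b: the audio-only and DASH entries are skipped.
	fmt.Println(formats[bestFormat(formats)].URL)
}
```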
12 changes: 6 additions & 6 deletions internal/pkg/crawl/sitespecific/youtube/youtube.go
@@ -7,29 +7,29 @@ import (
	"github.com/internetarchive/Zeno/internal/pkg/crawl/dependencies/ytdlp"
)

func Parse(body io.ReadCloser) (URLs []*url.URL, err error) {
func Parse(body io.ReadCloser) (URLs []*url.URL, rawJSON string, err error) {
	// Create a temporary server to serve the body and call ytdlp on it
	port, stopChan, err := ytdlp.ServeBody(body)
	if err != nil {
		return nil, err
		return nil, rawJSON, err
	}
	defer close(stopChan)

	// Call ytdlp on the temporary server
	rawURLs, err := ytdlp.GetJSON(port)
	rawURLs, rawJSON, err := ytdlp.GetJSON(port)
	if err != nil {
		return nil, err
		return nil, rawJSON, err
	}

	// Parse the URLs
	for _, urlString := range rawURLs {
		URL, err := url.Parse(urlString)
		if err != nil {
			return nil, err
			return nil, rawJSON, err
		}

		URLs = append(URLs, URL)
	}

	return URLs, nil
	return URLs, rawJSON, nil
}
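Note: callers of Parse now receive the raw yt-dlp JSON alongside the extracted URLs. A hypothetical call site (sketch only: these packages are internal to the Zeno module and yt-dlp must be on PATH, so this is not runnable standalone):

```go
package main

import (
	"fmt"
	"net/http"

	"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/youtube"
)

func main() {
	resp, err := http.Get("https://www.youtube.com/watch?v=jNQXAC9IVRw")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Parse serves the body to a local yt-dlp process and returns both the
	// extracted media/subtitle URLs and the raw --dump-json output.
	URLs, rawJSON, err := youtube.Parse(resp.Body)
	if err != nil {
		panic(err)
	}

	fmt.Printf("%d URLs extracted, %d bytes of yt-dlp JSON\n", len(URLs), len(rawJSON))
}
```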
