From 7bdc09132ffa00e99858d7cb8709f7c0ccdb212f Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Mon, 8 Jul 2024 15:51:39 -0400 Subject: [PATCH] Allow crawl space threshold to be set on CLI, report space avail (#62) * Allow crawl space threshold to be set on CLI, report space avail Closes #61 * Add tweaks/suggestions by @CorentinB in #62 --- cmd/cmd.go | 7 +++++++ cmd/utils.go | 1 + config/config.go | 1 + internal/pkg/crawl/crawl.go | 1 + internal/pkg/crawl/utils.go | 7 +++++-- 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 5ffb177d..57ee458a 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -169,6 +169,13 @@ var GlobalFlags = []cli.Flag{ Usage: "Number of seconds until the crawl will automatically panic itself. Default to crawl-time-limit + (crawl-time-limit / 10)", Destination: &config.App.Flags.MaxCrawlTimeLimit, }, + &cli.IntFlag{ + Name: "min-space-required", + Aliases: []string{"msr"}, + Value: 20, + Usage: "Minimum space (GB) required to start crawl", + Destination: &config.App.Flags.MinSpaceRequired, + }, // Proxy flags &cli.StringFlag{ diff --git a/cmd/utils.go b/cmd/utils.go index 711ea27f..c3a4dde1 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -161,6 +161,7 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl { c.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version[:7] + " warc/" + version.WarcVersion } c.Headless = flags.Headless + c.MinSpaceRequired = flags.MinSpaceRequired c.CookieFile = flags.CookieFile c.KeepCookies = flags.KeepCookies diff --git a/config/config.go b/config/config.go index 112d0e18..18eb3f5c 100644 --- a/config/config.go +++ b/config/config.go @@ -13,6 +13,7 @@ type Flags struct { JSON bool LiveStats bool Debug bool + MinSpaceRequired int DisabledHTMLTags cli.StringSlice ExcludedHosts cli.StringSlice diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go index 802dfb1d..c4880db2 100644 --- a/internal/pkg/crawl/crawl.go +++ b/internal/pkg/crawl/crawl.go @@ -72,6 +72,7 @@ type Crawl struct { Seencheck bool Workers int RandomLocalIP bool + MinSpaceRequired int // Cookie-related settings CookieFile string diff --git a/internal/pkg/crawl/utils.go b/internal/pkg/crawl/utils.go index 1cb63b90..ce6bc230 100644 --- a/internal/pkg/crawl/utils.go +++ b/internal/pkg/crawl/utils.go @@ -1,6 +1,7 @@ package crawl import ( + "fmt" "net/url" "regexp" "strconv" @@ -52,8 +53,10 @@ func (c *Crawl) checkIncludedHosts(host string) bool { func (c *Crawl) handleCrawlPause() { for { - if float64(utils.GetFreeDiskSpace(c.JobPath).Avail)/float64(GB) <= 20 { - logrus.Errorln("Not enough disk space. Please free some space and restart the crawler.") + spaceLeft := float64(utils.GetFreeDiskSpace(c.JobPath).Avail) / float64(GB) + if spaceLeft <= float64(c.MinSpaceRequired) { + logrus.Errorln(fmt.Sprintf("Not enough disk space: %d GB required, %f GB available. "+ + "Please free some space for the crawler to resume.", c.MinSpaceRequired, spaceLeft)) c.Paused.Set(true) c.Frontier.Paused.Set(true) } else {