Skip to content

Commit

Permalink
Allow the crawl disk-space threshold to be set on the CLI, and report available space (#62)
Browse files Browse the repository at this point in the history
* Allow the crawl disk-space threshold to be set on the CLI, and report available space

Closes #61

* Add tweaks/suggestions by @CorentinB in #62
  • Loading branch information
machawk1 authored Jul 8, 2024
1 parent 7447f81 commit 7bdc091
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 2 deletions.
7 changes: 7 additions & 0 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,13 @@ var GlobalFlags = []cli.Flag{
Usage: "Number of seconds until the crawl will automatically panic itself. Default to crawl-time-limit + (crawl-time-limit / 10)",
Destination: &config.App.Flags.MaxCrawlTimeLimit,
},
&cli.IntFlag{
Name: "min-space-required",
Aliases: []string{"msr"},
Value: 20,
Usage: "Minimum space (GB) required to start crawl",
Destination: &config.App.Flags.MinSpaceRequired,
},

// Proxy flags
&cli.StringFlag{
Expand Down
1 change: 1 addition & 0 deletions cmd/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
c.UserAgent = "Mozilla/5.0 (compatible; archive.org_bot +http://archive.org/details/archive.org_bot) Zeno/" + version.Version[:7] + " warc/" + version.WarcVersion
}
c.Headless = flags.Headless
c.MinSpaceRequired = flags.MinSpaceRequired

c.CookieFile = flags.CookieFile
c.KeepCookies = flags.KeepCookies
Expand Down
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ type Flags struct {
JSON bool
LiveStats bool
Debug bool
MinSpaceRequired int

DisabledHTMLTags cli.StringSlice
ExcludedHosts cli.StringSlice
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/crawl/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ type Crawl struct {
Seencheck bool
Workers int
RandomLocalIP bool
MinSpaceRequired int

// Cookie-related settings
CookieFile string
Expand Down
7 changes: 5 additions & 2 deletions internal/pkg/crawl/utils.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package crawl

import (
"fmt"
"net/url"
"regexp"
"strconv"
Expand Down Expand Up @@ -52,8 +53,10 @@ func (c *Crawl) checkIncludedHosts(host string) bool {

func (c *Crawl) handleCrawlPause() {
for {
if float64(utils.GetFreeDiskSpace(c.JobPath).Avail)/float64(GB) <= 20 {
logrus.Errorln("Not enough disk space. Please free some space and restart the crawler.")
spaceLeft := float64(utils.GetFreeDiskSpace(c.JobPath).Avail) / float64(GB)
if spaceLeft <= float64(c.MinSpaceRequired) {
logrus.Errorln(fmt.Sprintf("Not enough disk space: %d GB required, %f GB available. "+
"Please free some space for the crawler to resume.", c.MinSpaceRequired, spaceLeft))
c.Paused.Set(true)
c.Frontier.Paused.Set(true)
} else {
Expand Down

0 comments on commit 7bdc091

Please sign in to comment.