diff --git a/cmd/utils.go b/cmd/utils.go index 873aca12..c388f4e3 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -57,6 +57,7 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl { RotateLogFile: true, RotateElasticSearchIndex: true, ElasticsearchConfig: elasticSearchConfig, + LiveStats: flags.LiveStats, }) if err != nil { fmt.Println(err) diff --git a/internal/pkg/crawl/hq.go b/internal/pkg/crawl/hq.go index f100e5bd..d8815cb6 100644 --- a/internal/pkg/crawl/hq.go +++ b/internal/pkg/crawl/hq.go @@ -38,10 +38,11 @@ func (c *Crawl) HQWebsocket() { GoVersion: utils.GetVersion().GoVersion, }) if err != nil { - logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending identify payload to crawl HQ, trying to reconnect..") + c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending identify payload to crawl HQ, trying to reconnect..") + err = c.HQClient.InitWebsocketConn() if err != nil { - logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error initializing websocket connection to crawl HQ") + c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error initializing websocket connection to crawl HQ") } } @@ -72,7 +73,7 @@ func (c *Crawl) HQProducer() { for { _, err := c.HQClient.Discovered(discoveredArray, "seed", false, false) if err != nil { - logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending payload to crawl HQ, waiting 1s then retrying..") + c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") time.Sleep(time.Second) continue } @@ -87,7 +88,7 @@ func (c *Crawl) HQProducer() { for { _, err := c.HQClient.Discovered(discoveredArray, "seed", false, false) if err != nil { - logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending payload to crawl HQ, waiting 1s then retrying..") + c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..") time.Sleep(time.Second) continue } @@ -125,7 +126,9 @@ func (c *Crawl) HQProducer() { for { _, err := c.HQClient.Discovered([]gocrawlhq.URL{discoveredURL}, "seed", true, false) if err != nil { - logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending payload to crawl HQ, waiting 1s then retrying..") + c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ + "bypassSeencheck": discoveredItem.BypassSeencheck, + })).Error("error sending payload to crawl HQ, waiting 1s then retrying..") time.Sleep(time.Second) continue } @@ -177,6 +180,10 @@ func (c *Crawl) HQConsumer() { // get batch from crawl HQ batch, err := c.HQClient.Feed(HQBatchSize, c.HQStrategy) if err != nil { + if strings.Contains(err.Error(), "feed is empty") { + time.Sleep(time.Second) + } + c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{ "batchSize": HQBatchSize, })).Error("error getting new URLs from crawl HQ") diff --git a/internal/pkg/log/log.go b/internal/pkg/log/log.go index bf4584d4..f1fe2ac7 100644 --- a/internal/pkg/log/log.go +++ b/internal/pkg/log/log.go @@ -44,6 +44,7 @@ type Config struct { RotateLogFile bool ElasticsearchConfig *ElasticsearchConfig RotateElasticSearchIndex bool + LiveStats bool } // New creates a new Logger instance with the given configuration. @@ -60,10 +61,12 @@ func New(cfg Config) (*Logger, error) { var handlers []slog.Handler // Create stdout handler - stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ - Level: cfg.StdoutLevel, - }) - handlers = append(handlers, stdoutHandler) + if !cfg.LiveStats { + stdoutHandler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{ + Level: cfg.StdoutLevel, + }) + handlers = append(handlers, stdoutHandler) + } // Create file handler if FileOutput is specified if cfg.FileConfig != nil { diff --git a/internal/pkg/utils/url_string.go b/internal/pkg/utils/url_string.go index d9977937..22272e66 100644 --- a/internal/pkg/utils/url_string.go +++ b/internal/pkg/utils/url_string.go @@ -1,6 +1,8 @@ package utils import ( + "fmt" + "log/slog" "net/url" "strings" @@ -14,12 +16,12 @@ func URLToString(u *url.URL) string { u.RawQuery = q.Encode() u.Host, err = idna.ToASCII(u.Host) if err != nil { - LogWarning.Warningf("could not IDNA encode URL: %s", err) + slog.Warn(fmt.Sprintf("could not IDNA encode URL: %s", err)) } tempHost, err := idna.ToASCII(u.Hostname()) if err != nil { - LogWarning.Warningf("could not IDNA encode URL: %s", err) + slog.Warn(fmt.Sprintf("could not IDNA encode URL: %s", err)) tempHost = u.Hostname() } diff --git a/internal/pkg/utils/utils.go b/internal/pkg/utils/utils.go deleted file mode 100644 index 3cf97ae9..00000000 --- a/internal/pkg/utils/utils.go +++ /dev/null @@ -1,128 +0,0 @@ -package utils - -import ( - "fmt" - "io" - "os" - "path" - "time" - - "github.com/internetarchive/elogrus" - rotatelogs "github.com/lestrrat-go/file-rotatelogs" - "github.com/olivere/elastic/v7" - "github.com/sirupsen/logrus" -) - -var LogInfo, LogWarning, LogError *logrus.Logger - -// SetupLogging setup the logger for the crawl -func SetupLogging(jobPath string, liveStats bool, esURL string) (logInfo, logWarning, logError *logrus.Logger) { - var logsDirectory = path.Join(jobPath, "logs") - - hostname, err := os.Hostname() - if err != nil { - logrus.Panic(err) - } - - logInfo = logrus.New() - logWarning = logrus.New() - logError = logrus.New() - - //logInfo.SetFormatter(&logrus.JSONFormatter{}) - //logWarning.SetFormatter(&logrus.JSONFormatter{}) - //logError.SetFormatter(&logrus.JSONFormatter{}) - - if esURL != "" { - client, err := elastic.NewClient(elastic.SetURL(esURL)) - if err != nil { - logrus.Panic(err) - } - - go func() { - newHook: - hookInfo, err := elogrus.NewAsyncElasticHook(client, hostname, logrus.InfoLevel, "zeno-"+time.Now().Format("2006.01.02")) - if err != nil { - logrus.Error(err) - goto newHook - } - - hookWarning, err := elogrus.NewAsyncElasticHook(client, hostname, logrus.WarnLevel, "zeno-"+time.Now().Format("2006.01.02")) - if err != nil { - logrus.Error(err) - goto newHook - } - - hookError, err := elogrus.NewAsyncElasticHook(client, hostname, logrus.ErrorLevel, "zeno-"+time.Now().Format("2006.01.02")) - if err != nil { - logrus.Error(err) - goto newHook - } - - logInfo.Hooks.Add(hookInfo) - logWarning.Hooks.Add(hookWarning) - logError.Hooks.Add(hookError) - }() - } - - // Create logs directory for the job - os.MkdirAll(logsDirectory, os.ModePerm) - - // Initialize rotating loggers - pathInfo := path.Join(logsDirectory, "zeno_info") - pathWarning := path.Join(logsDirectory, "zeno_warning") - pathError := path.Join(logsDirectory, "zeno_error") - - writerInfo, err := rotatelogs.New( - fmt.Sprintf("%s_%s.log", pathInfo, "%Y%m%d%H%M%S"), - rotatelogs.WithRotationTime(time.Hour*6), - ) - if err != nil { - logrus.WithFields(logrus.Fields{ - "err": err.Error(), - }).Fatalln("failed to initialize info log file") - } - - if !liveStats { - logInfo.SetOutput(io.MultiWriter(writerInfo, os.Stdout)) - } else { - logInfo.SetOutput(writerInfo) - } - - writerWarning, err := rotatelogs.New( - fmt.Sprintf("%s_%s.log", pathWarning, "%Y%m%d%H%M%S"), - rotatelogs.WithRotationTime(time.Hour*6), - ) - if err != nil { - logrus.WithFields(logrus.Fields{ - "err": err.Error(), - }).Fatalln("failed to initialize warning log file") - } - - if !liveStats { - logWarning.SetOutput(io.MultiWriter(writerWarning, os.Stdout)) - } else { - logWarning.SetOutput(writerWarning) - } - - writerError, err := rotatelogs.New( - fmt.Sprintf("%s_%s.log", pathError, "%Y%m%d%H%M%S"), - rotatelogs.WithRotationTime(time.Hour*6), - ) - if err != nil { - logrus.WithFields(logrus.Fields{ - "err": err.Error(), - }).Fatalln("failed to initialize error log file") - } - - if !liveStats { - logError.SetOutput(io.MultiWriter(writerError, os.Stdout)) - } else { - logError.SetOutput(writerError) - } - - LogInfo = logInfo - LogWarning = logWarning - LogError = logError - - return logInfo, logWarning, logError -}