Fix sitemap lastmod and more
Eugene Medvedev committed Jun 11, 2019
1 parent a29ab51 commit 7ee8c93
Showing 5 changed files with 67 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ build/sicra -h
Usage of build/sicra:
-add-error
Add URL to sitemap, even if response error (default true)
Add URL to sitemap, even if response error (only for 5xx codes) (default true)
-async
Run async requests
-delay int
44 changes: 29 additions & 15 deletions main.go
@@ -11,7 +11,7 @@ import (
)

func main() {
addError := flag.Bool("add-error", true, "Add URL to sitemap, even if response error")
addError := flag.Bool("add-error", true, "Add URL to sitemap, even if response error (only for 5xx codes)")
asyncScan := flag.Bool("async", false, "Run async requests")
delay := flag.Int64("delay", 0, "Delay between requests in Millisecond")
maxDepth := flag.Int("max-depth", 0, "MaxDepth limits the recursion depth of visited URLs.")
@@ -49,24 +49,38 @@ func main() {
*skipNoIndex,
*verbose)

err = sicra.GenerateSiteMap(*outFile, scrape.AddedURLs)
if err != nil {
log.Fatal(err)
p := filepath.Dir(*outFile)
// generate sitemap.xml
if len(scrape.AddedURLs) > 0 {
err = sicra.GenerateSiteMap(*outFile, scrape.AddedURLs)
if err != nil {
log.Fatal(err)
}
}

// generate noindex.txt
if *skipNoIndex {
p := filepath.Dir(*outFile)
err = sicra.GenerateNoIndex(p+"/noindex.txt", scrape.NoIndexURLs)
if len(scrape.NoIndexURLs) > 0 {
err = sicra.GenerateTxt(p+"/noindex.txt", scrape.NoIndexURLs)
if err != nil {
log.Fatal(err)
}
}
}
// generate errors.txt
if len(scrape.ErrorURLs) > 0 {
err = sicra.GenerateTxt(p+"/errors.txt", scrape.ErrorURLs)
if err != nil {
log.Fatal(err)
}
}

fmt.Print(
"Request URLs: ", scrape.AllVisitURLsCount, "\n",
"Added URLs ", scrape.AddedURLsCount, "\n",
"No Index URLs ", scrape.NoIndexURLsCount, "\n",
"Response URLs ", scrape.ResponseURLsCount, "\n",
"Error URLs ", scrape.ErrorURLsCount, "\n",
)
// print stats
if *verbose {
fmt.Print(
"Request URLs: ", scrape.AllVisitURLsCount, "\n",
"Added URLs ", scrape.AddedURLsCount, "\n",
"No Index URLs ", scrape.NoIndexURLsCount, "\n",
"Response URLs ", scrape.ResponseURLsCount, "\n",
"Error URLs ", scrape.ErrorURLsCount, "\n",
)
}
}
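The restructured main above derives every output path from the sitemap location: filepath.Dir(*outFile) is computed once, noindex.txt and errors.txt are written into that same directory, and each file is only generated when its URL list is non-empty. A minimal, self-contained sketch of that path handling, using a hypothetical output path:

package main

import (
	"fmt"
	"path/filepath"
)

func main() {
	outFile := "./public/sitemap.xml" // hypothetical value of the output-file flag
	p := filepath.Dir(outFile)        // "public"
	fmt.Println(p + "/noindex.txt")   // public/noindex.txt
	fmt.Println(p + "/errors.txt")    // public/errors.txt
}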
13 changes: 11 additions & 2 deletions sicra/crawler.go
@@ -1,9 +1,11 @@
package sicra

import (
"fmt"
"log"
"net/url"
"regexp"
"strconv"
"time"

"github.com/gocolly/colly"
@@ -13,9 +15,10 @@ type scrapeURL struct {
AddedURLs []string
AddedURLsCount int
AllVisitURLsCount int
ErrorURLs []string
ErrorURLsCount int
NoIndexURLsCount int
NoIndexURLs []string
NoIndexURLsCount int
ResponseURLsCount int
}

@@ -68,13 +71,19 @@ func Crawler(

c.OnError(func(er *colly.Response, err error) {
requestURL := urlEscape(er.Request.URL.String())
r := regexp.MustCompile("^5[0-9]{1,2}$")
statusCode := strconv.Itoa(er.StatusCode)
strErr := fmt.Sprint(err)
if verbose {
log.Println("Error:", err, requestURL)
}
if addError {
add(requestURL, verbose, scrapeURLs)
if r.MatchString(statusCode) {
add(requestURL, verbose, scrapeURLs)
}
}
scrapeURLs.ErrorURLsCount++
scrapeURLs.ErrorURLs = append(scrapeURLs.ErrorURLs, statusCode+" "+strErr+" "+requestURL)
})

c.OnResponse(func(re *colly.Response) {
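The OnError handler above now only re-adds a failed URL when its status code is a server error: the code is converted with strconv.Itoa and matched against ^5[0-9]{1,2}$, and every error is also recorded in ErrorURLs as "status error url". A small sketch of that status-code filter, fed with the codes that test/error.conf below serves (for real HTTP status codes the regexp is equivalent to code >= 500 && code <= 599):

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

func main() {
	r := regexp.MustCompile("^5[0-9]{1,2}$")
	for _, code := range []int{403, 404, 444, 500, 502, 555} {
		// only 500, 502 and 555 are added back to the sitemap
		fmt.Println(code, "added:", r.MatchString(strconv.Itoa(code)))
	}
}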
5 changes: 3 additions & 2 deletions sicra/sitemap.go
@@ -24,7 +24,7 @@ func GenerateSiteMap(fileName string, urls []string) error {
for _, loc := range urls {
fh.WriteString(" " + "<url>\n")
fh.WriteString(" " + "<loc>" + loc + "</loc>\n")
fh.WriteString(" " + "<changefreq>" + currentTime + "</changefreq>\n")
fh.WriteString(" " + "<lastmod>" + currentTime + "</lastmod>\n")
fh.WriteString(" " + "<changefreq>hourly</changefreq>\n")
fh.WriteString(" " + "<priority>0.5</priority>\n")
fh.WriteString(" " + "</url>\n")
@@ -34,7 +34,8 @@ func GenerateSiteMap(fileName string, urls []string) error {
return nil
}

func GenerateNoIndex(fileName string, urls []string) error {
// GenerateTxt generates a txt file for the error URL list or the skipped noindex URLs
func GenerateTxt(fileName string, urls []string) error {
err := deleteFileIfExists(fileName)
if err != nil {
log.Fatal(err)
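With the fix above, each sitemap entry gains a <lastmod> element carrying the generation timestamp, while <changefreq> becomes the fixed value hourly. A sketch of the entry the WriteString calls now produce; the loc value is hypothetical and the exact format of currentTime is an assumption, since its construction is outside this diff:

package main

import (
	"fmt"
	"time"
)

func main() {
	loc := "https://example.com/page"              // hypothetical URL
	currentTime := time.Now().Format("2006-01-02") // assumed timestamp format
	fmt.Print(
		" <url>\n",
		" <loc>"+loc+"</loc>\n",
		" <lastmod>"+currentTime+"</lastmod>\n",
		" <changefreq>hourly</changefreq>\n",
		" <priority>0.5</priority>\n",
		" </url>\n",
	)
}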
23 changes: 23 additions & 0 deletions test/error.conf
@@ -0,0 +1,23 @@
server {
listen 8080;
server_name _;

location /403 {
return 403;
}
location /404 {
return 404;
}
location /500 {
return 500;
}
location /502 {
return 502;
}
location /444 {
return 444;
}
location /555 {
return 555;
}
}
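The new test/error.conf defines nginx locations that return fixed status codes, which makes it easy to exercise the 5xx filter locally. A sketch of probing those endpoints, assuming the config has been loaded into an nginx instance listening on localhost:8080:

package main

import (
	"fmt"
	"net/http"
)

func main() {
	for _, path := range []string{"/403", "/404", "/444", "/500", "/502", "/555"} {
		resp, err := http.Get("http://localhost:8080" + path)
		if err != nil {
			// nginx code 444 closes the connection without a response,
			// so /444 surfaces here as a transport error
			fmt.Println(path, "request error:", err)
			continue
		}
		resp.Body.Close()
		fmt.Println(path, "->", resp.StatusCode)
	}
}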
