Skip to content

Commit

Permalink
extractor/xml: better sitemap detection
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Sep 16, 2024
1 parent 4436efb commit 08ba828
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 7 deletions.
4 changes: 4 additions & 0 deletions internal/pkg/crawl/extractor/xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) {
return nil, sitemap, err
}

if strings.Contains(string(xmlBody), "sitemaps.org/schemas/sitemap/") {
sitemap = true
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
return nil, sitemap, err
Expand Down
54 changes: 47 additions & 7 deletions internal/pkg/crawl/extractor/xml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@ import (
"io"
"net/http"
"net/url"
"os"
"testing"
)

func TestXML(t *testing.T) {
tests := []struct {
name string
xmlBody string
wantURLs []*url.URL
wantErr bool
name string
xmlBody string
wantURLs []*url.URL
wantURLsCount int
wantErr bool
sitemap bool
}{
{
name: "Valid XML with URLs",
Expand All @@ -29,19 +32,22 @@ func TestXML(t *testing.T) {
{Scheme: "http", Host: "example.com"},
{Scheme: "https", Host: "example.org"},
},
sitemap: false,
wantErr: false,
},
{
name: "Empty XML",
xmlBody: `<root></root>`,
wantURLs: nil,
wantErr: false,
sitemap: false,
},
{
name: "Invalid XML",
xmlBody: `<root><unclosed>`,
wantURLs: nil,
wantErr: true,
sitemap: false,
},
{
name: "XML with invalid URL",
Expand All @@ -54,6 +60,14 @@ func TestXML(t *testing.T) {
{Scheme: "http", Host: "example.com"},
},
wantErr: false,
sitemap: false,
},
{
name: "Huge sitemap",
xmlBody: loadTestFile(t, "xml_test_sitemap.xml"),
wantURLsCount: 100002,
wantErr: false,
sitemap: true,
},
}

Expand All @@ -63,19 +77,45 @@ func TestXML(t *testing.T) {
Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
}

gotURLs, _, err := XML(resp)
gotURLs, sitemap, err := XML(resp)
if (err != nil) != tt.wantErr {
t.Errorf("XML() error = %v, wantErr %v", err, tt.wantErr)
return
}

if !compareURLs(gotURLs, tt.wantURLs) {
t.Errorf("XML() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
if tt.wantURLsCount != 0 {
if len(gotURLs) != tt.wantURLsCount {
t.Errorf("XML() gotURLs count = %v, want %v", len(gotURLs), tt.wantURLsCount)
}
}

if tt.wantURLs != nil {
if !compareURLs(gotURLs, tt.wantURLs) {
t.Errorf("XML() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
}
}

if tt.sitemap != sitemap {
t.Errorf("XML() sitemap = %v, want %v", sitemap, tt.sitemap)
}
})
}
}

// loadTestFile returns the full contents of the file at path as a string.
//
// It is a test helper: if the file cannot be opened or read, the calling
// test is failed immediately via t.Fatalf.
func loadTestFile(t *testing.T, path string) string {
	// Attribute failures to the caller's line rather than to this helper.
	t.Helper()

	f, err := os.Open(path)
	if err != nil {
		t.Fatalf("openFile() error = %v", err)
	}
	// Release the descriptor once the contents are read. The file is opened
	// read-only, so the error returned by Close carries no useful signal here.
	defer f.Close()

	b, err := io.ReadAll(f)
	if err != nil {
		t.Fatalf("readFile() error = %v", err)
	}

	return string(b)
}

func TestXMLBodyReadError(t *testing.T) {
resp := &http.Response{
Body: io.NopCloser(bytes.NewReader([]byte{})), // Empty reader to simulate EOF
Expand Down
2 changes: 2 additions & 0 deletions internal/pkg/crawl/extractor/xml_test_sitemap.xml

Large diffs are not rendered by default.

0 comments on commit 08ba828

Please sign in to comment.