Skip to content

Commit

Permalink
Fix attempts to parse xlsx as xml
Browse files Browse the repository at this point in the history
This is a hacky workaround for colly's handleOnXML behaviour, which treats
any response body whose content-type contains the substring "xml" as XML
and attempts to parse it. That assumption is wrong for docx and xlsx
files, which are not plain XML documents and therefore cause parsing
errors. Here we remove "xml" from those content types to prevent them
being parsed.
  • Loading branch information
theseanything committed Oct 19, 2023
1 parent 2a504ac commit 8797420
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
7 changes: 7 additions & 0 deletions internal/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ func responseHandler(r *colly.Response) {
log.Error().Err(err).Msg("Error attempting to visit link")
}
}
} else if strings.Contains(mediaType, "openxmlformats") {
// This is a hacky workaround for colly's handleOnXML behaviour, which
// treats any response body whose content-type contains the substring
// "xml" as XML and attempts to parse it. That assumption is wrong for
// docx and xlsx files, which are not plain XML documents and therefore
// cause parsing errors, so strip "xml" from the Content-Type header.
r.Headers.Set("Content-Type", strings.ReplaceAll(contentType, "xml", ""))
}

err = file.Save(r.Request.URL, contentType, r.Body)
Expand Down
11 changes: 11 additions & 0 deletions internal/crawler/crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ var routes = map[string]struct {
<body>
<a href="/child">Visit child</a>
<a href="/redirect">Visit redirect</a>
<a href="/spreadsheet.xlsx">Spreadsheet</a>
<a href="/external/redirect">Visit external redirect</a>
<img src="/assets/image.jpg">
<script src="assets/script.js"></script>
Expand Down Expand Up @@ -108,6 +109,11 @@ var routes = map[string]struct {
contentType: "image/png",
body: []byte{0xff, 0xd8, 0xff, 0xd9},
},
"/spreadsheet.xlsx": {
status: http.StatusOK,
contentType: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
body: []byte{0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x06, 0x00, 0x08, 0x00, 0x00, 0x00, 0x21, 0x00, 0x36, 0x9d},
},
"/child": {
status: http.StatusOK,
contentType: "text/html",
Expand Down Expand Up @@ -237,6 +243,11 @@ func TestRun(t *testing.T) {
filePath: "/assets/background.png",
expectedOutput: routes["/assets/background.png"].body,
},
{
name: "Test spreadsheet",
filePath: "/spreadsheet.xlsx",
expectedOutput: routes["/spreadsheet.xlsx"].body,
},
{
name: "Test child",
filePath: "/child.html",
Expand Down

0 comments on commit 8797420

Please sign in to comment.