Skip to content

Commit

Permalink
Merge pull request #9 from alphagov/fix-xlsx-errors
Browse files Browse the repository at this point in the history
Fix attempts to parse xlsx as xml
  • Loading branch information
theseanything authored Oct 19, 2023
2 parents 2a504ac + 8797420 commit 1570965
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 0 deletions.
7 changes: 7 additions & 0 deletions internal/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ func responseHandler(r *colly.Response) {
log.Error().Err(err).Msg("Error attempting to visit link")
}
}
} else if strings.Contains(mediaType, "openxmlformats") {
// This is a hacky workaround for colly's handleOnXML behaviour, which
// treats any response whose Content-Type contains the substring "xml" as
// XML and tries to parse it. That assumption is wrong for docx and xlsx
// files, which are not plain XML documents and so cause parsing errors.
// Removing "xml" from the header value avoids that substring match.
r.Headers.Set("Content-Type", strings.ReplaceAll(contentType, "xml", ""))
}

err = file.Save(r.Request.URL, contentType, r.Body)
Expand Down
11 changes: 11 additions & 0 deletions internal/crawler/crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ var routes = map[string]struct {
<body>
<a href="/child">Visit child</a>
<a href="/redirect">Visit redirect</a>
<a href="/spreadsheet.xlsx">Spreadsheet</a>
<a href="/external/redirect">Visit external redirect</a>
<img src="/assets/image.jpg">
<script src="assets/script.js"></script>
Expand Down Expand Up @@ -108,6 +109,11 @@ var routes = map[string]struct {
contentType: "image/png",
body: []byte{0xff, 0xd8, 0xff, 0xd9},
},
"/spreadsheet.xlsx": {
status: http.StatusOK,
contentType: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
body: []byte{0x50, 0x4b, 0x03, 0x04, 0x14, 0x00, 0x06, 0x00, 0x08, 0x00, 0x00, 0x00, 0x21, 0x00, 0x36, 0x9d},
},
"/child": {
status: http.StatusOK,
contentType: "text/html",
Expand Down Expand Up @@ -237,6 +243,11 @@ func TestRun(t *testing.T) {
filePath: "/assets/background.png",
expectedOutput: routes["/assets/background.png"].body,
},
{
name: "Test spreadsheet",
filePath: "/spreadsheet.xlsx",
expectedOutput: routes["/spreadsheet.xlsx"].body,
},
{
name: "Test child",
filePath: "/child.html",
Expand Down

0 comments on commit 1570965

Please sign in to comment.