Skip to content

Commit

Permalink
fix: .trim() urls from pretty-printed sitemap.xml files (#2709)
Browse files: browse the repository at this point in the history
Closes #2698
  • Loading branch information
barjin authored Oct 14, 2024
1 parent fcb098d commit 802a6fe
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 2 deletions.
4 changes: 2 additions & 2 deletions packages/utils/src/internals/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,12 @@ class SitemapXmlParser extends Transform {
private onText(text: string) {
if (this.currentTag === 'loc') {
if (this.rootTagName === 'sitemapindex') {
this.push({ type: 'sitemapUrl', url: text } satisfies SitemapItem);
this.push({ type: 'sitemapUrl', url: text.trim() } satisfies SitemapItem);
}

if (this.rootTagName === 'urlset') {
this.url ??= {};
this.url.loc = text;
this.url.loc = text.trim();
}
}

Expand Down
85 changes: 85 additions & 0 deletions packages/utils/test/sitemap.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,31 @@ describe('Sitemap', () => {
'</sitemapindex>',
].join('\n'),
)
.get('/sitemap_parent_pretty.xml')
.reply(
200,
[
'<?xml version="1.0" encoding="UTF-8"?>',
'<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<sitemap>',
`<loc>
http://not-exists.com/sitemap_child.xml
</loc>`,
`<lastmod>
2004-12-23
</lastmod>`,
'</sitemap>',
'<sitemap>',
`<loc>
http://not-exists.com/sitemap_child_2.xml?from=94937939985&amp;to=1318570721404
</loc>`,
`<lastmod>
2004-12-23
</lastmod>`,
'</sitemap>',
'</sitemapindex>',
].join('\n'),
)
.get('/not_actual_xml.xml')
.reply(
200,
Expand All @@ -141,6 +166,30 @@ describe('Sitemap', () => {
'</urlset>',
].join('\n'),
)
.get('/sitemap_pretty.xml')
.reply(
200,
[
'<?xml version="1.0" encoding="UTF-8"?>',
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<url>',
`<loc>
http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey
</loc>`,
`<lastmod>
2005-02-03
</lastmod>`,
`<changefreq>
monthly
</changefreq>`,
`<priority>
0.8
</priority>`,
'</url>',
'</urlset>',
].join('\n'),
)
.get('/sitemap.xml')
.reply(
200,
Expand Down Expand Up @@ -269,6 +318,42 @@ describe('Sitemap', () => {
);
});

// Regression test for #2698: the `/sitemap_pretty.xml` fixture puts the text of
// `<loc>` on its own line, so the raw text node is surrounded by newlines.
// The loaded URL list must contain the bare URL (no surrounding whitespace),
// with the `&amp;` entity from the fixture appearing as a plain `&`.
it('handles pretty-printed XML correctly', async () => {
const sitemap = await Sitemap.load('http://not-exists.com/sitemap_pretty.xml');
// Compared as sets, so ordering and duplicates in `sitemap.urls` are not checked here.
expect(new Set(sitemap.urls)).toEqual(new Set(['http://not-exists.com/catalog?item=80&desc=vacation_turkey']));
});

// Checks that the per-URL metadata of the pretty-printed `/sitemap_pretty.xml`
// fixture is still extracted when every element's text sits on its own line
// (`<lastmod>`, `<changefreq>` and `<priority>` are padded with newlines too).
it('extracts metadata from pretty-printed XML', async () => {
const items: SitemapUrl[] = [];

// `parseSitemap` is consumed as an async iterable; collect everything it yields.
for await (const item of parseSitemap([{ type: 'url', url: 'http://not-exists.com/sitemap_pretty.xml' }])) {
items.push(item);
}

// The fixture contains a single `<url>` entry, so exactly one item is expected.
expect(items).toHaveLength(1);
// `objectContaining` only pins the listed fields; extra fields on the item are allowed.
// NOTE(review): the parser change shown in this commit only trims `loc`; how the other
// padded values end up as a number / string / Date is not visible here — confirm in the parser.
expect(items).toContainEqual(
expect.objectContaining({
loc: 'http://not-exists.com/catalog?item=80&desc=vacation_turkey',
priority: 0.8,
changefreq: 'monthly',
lastmod: new Date('2005-02-03'),
}),
);
});

// Same regression as #2698, but for a sitemap index: the `/sitemap_parent_pretty.xml`
// fixture is a `<sitemapindex>` whose `<loc>` values (sitemap_child.xml and
// sitemap_child_2.xml) are padded with newlines.
it('handles pretty-printed nested sitemaps XML correctly', async () => {
const sitemap = await Sitemap.load('http://not-exists.com/sitemap_parent_pretty.xml');
// NOTE(review): the expected URLs presumably come from the two child sitemaps referenced
// by the index; their mock responses are outside this hunk — confirm against the fixtures.
// Compared as sets, so ordering and duplicates are not checked here.
expect(new Set(sitemap.urls)).toEqual(
new Set([
'http://not-exists.com/',
'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
'http://not-exists.com/catalog?item=83&desc=vacation_usa',
]),
);
});

it('loads sitemaps from string', async () => {
const sitemap = await Sitemap.fromXmlString(
[
Expand Down

0 comments on commit 802a6fe

Please sign in to comment.