From 802a6fea7b2125e2b36d740fc2d5d131de5d53ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 14 Oct 2024 13:23:30 +0200 Subject: [PATCH] fix: `.trim()` urls from pretty-printed sitemap.xml files (#2709) Closes #2698 --- packages/utils/src/internals/sitemap.ts | 4 +- packages/utils/test/sitemap.test.ts | 85 +++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/packages/utils/src/internals/sitemap.ts b/packages/utils/src/internals/sitemap.ts index 7c3cc1249018..471230c29f01 100644 --- a/packages/utils/src/internals/sitemap.ts +++ b/packages/utils/src/internals/sitemap.ts @@ -141,12 +141,12 @@ class SitemapXmlParser extends Transform { private onText(text: string) { if (this.currentTag === 'loc') { if (this.rootTagName === 'sitemapindex') { - this.push({ type: 'sitemapUrl', url: text } satisfies SitemapItem); + this.push({ type: 'sitemapUrl', url: text.trim() } satisfies SitemapItem); } if (this.rootTagName === 'urlset') { this.url ??= {}; - this.url.loc = text; + this.url.loc = text.trim(); } } diff --git a/packages/utils/test/sitemap.test.ts b/packages/utils/test/sitemap.test.ts index f1691e96e405..5b6225ddd77e 100644 --- a/packages/utils/test/sitemap.test.ts +++ b/packages/utils/test/sitemap.test.ts @@ -117,6 +117,31 @@ describe('Sitemap', () => { '', ].join('\n'), ) + .get('/sitemap_parent_pretty.xml') + .reply( + 200, + [ + '', + '', + '', + ` + http://not-exists.com/sitemap_child.xml + `, + ` + 2004-12-23 + `, + '', + '', + ` + http://not-exists.com/sitemap_child_2.xml?from=94937939985&to=1318570721404 + `, + ` + 2004-12-23 + `, + '', + '', + ].join('\n'), + ) .get('/not_actual_xml.xml') .reply( 200, @@ -141,6 +166,30 @@ describe('Sitemap', () => { '', ].join('\n'), ) + .get('/sitemap_pretty.xml') + .reply( + 200, + [ + '', + '', + '', + ` + http://not-exists.com/catalog?item=80&desc=vacation_turkey + `, + ` + 2005-02-03 + `, + ` + + monthly + `, + ` + 0.8 + `, + '', + '', + ].join('\n'), + ) .get('/sitemap.xml') .reply( 200, @@ -269,6 +318,42 @@ describe('Sitemap', () => { ); }); + it('handles pretty-printed XML correctly', async () => { + const sitemap = await Sitemap.load('http://not-exists.com/sitemap_pretty.xml'); + expect(new Set(sitemap.urls)).toEqual(new Set(['http://not-exists.com/catalog?item=80&desc=vacation_turkey'])); + }); + + it('extracts metadata from pretty-printed XML', async () => { + const items: SitemapUrl[] = []; + + for await (const item of parseSitemap([{ type: 'url', url: 'http://not-exists.com/sitemap_pretty.xml' }])) { + items.push(item); + } + + expect(items).toHaveLength(1); + expect(items).toContainEqual( + expect.objectContaining({ + loc: 'http://not-exists.com/catalog?item=80&desc=vacation_turkey', + priority: 0.8, + changefreq: 'monthly', + lastmod: new Date('2005-02-03'), + }), + ); + }); + + it('handles pretty-printed nested sitemaps XML correctly', async () => { + const sitemap = await Sitemap.load('http://not-exists.com/sitemap_parent_pretty.xml'); + expect(new Set(sitemap.urls)).toEqual( + new Set([ + 'http://not-exists.com/', + 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', + 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', + 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', + 'http://not-exists.com/catalog?item=83&desc=vacation_usa', + ]), + ); + }); + it('loads sitemaps from string', async () => { const sitemap = await Sitemap.fromXmlString( [