Skip to content

Commit

Permalink
Replace tripadvisor sitemap example with stormcrawler.apache.org content
Browse files Browse the repository at this point in the history
Replace guardian.rss feed with stormcrawler.apache.org
  • Loading branch information
rzo1 committed Nov 22, 2024
1 parent 60e195d commit 0ae250f
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 3,659 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void setupParserBolt() {
}

private void checkOutput() {
- Assertions.assertEquals(170, output.getEmitted(Constants.StatusStreamName).size());
+ Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0);
Assertions.assertEquals(3, fields.size());
}
Expand All @@ -51,7 +51,7 @@ void testFeedParsing() throws IOException {
Metadata metadata = new Metadata();
// specify that it is a Feed file
metadata.setValue(FeedParserBolt.isFeedKey, "true");
- parse("http://www.guardian.com/Feed.xml", "guardian.rss", metadata);
+ parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

Expand All @@ -66,7 +66,7 @@ void testFeedParsingNoMT() throws IOException {
Metadata metadata = new Metadata();
// set mime-type
metadata.setValue("http." + HttpHeaders.CONTENT_TYPE, "application/rss+xml");
- parse("http://www.guardian.com/feed.xml", "guardian.rss", metadata);
+ parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

Expand All @@ -78,7 +78,7 @@ void testFeedParsingDetextBytes() throws IOException {
bolt.prepare(
parserConfig, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
Metadata metadata = new Metadata();
- parse("http://www.guardian.com/feed.xml", "guardian.rss", metadata);
+ parse("https://stormcrawler.apache.org/rss.xml", "stormcrawler.rss", metadata);
checkOutput();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ void testSitemapIndexParsing() throws IOException {
// and its mime-type
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
- "http://www.tripadvisor.com/sitemap-index.xml",
- "tripadvisor.sitemap.index.xml",
+ "http://stormcrawler.apache.org/sitemap-index.xml",
+ "stormcrawler.sitemap.index.xml",
metadata);
for (List<Object> fields : output.getEmitted(Constants.StatusStreamName)) {
Metadata parsedMetadata = (Metadata) fields.get(1);
Expand All @@ -85,8 +85,11 @@ void testGzipSitemapParsing() throws IOException {
Metadata metadata = new Metadata();
// specify that it is a sitemap file
metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
- parse("https://www.tripadvisor.com/sitemap.xml.gz", "tripadvisor.sitemap.xml.gz", metadata);
- Assertions.assertEquals(50001, output.getEmitted(Constants.StatusStreamName).size());
+ parse(
+ "https://stormcrawler.apache.org/sitemap.xml.gz",
+ "stormcrawler.sitemap.xml.gz",
+ metadata);
+ Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
}

@Test
Expand Down
3,631 changes: 0 additions & 3,631 deletions core/src/test/resources/guardian.rss

This file was deleted.

58 changes: 58 additions & 0 deletions core/src/test/resources/stormcrawler.rss
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<rss version="2.0">
<channel>
<title>StormCrawler Website Updates</title>
<link>https://stormcrawler.apache.org/</link>
<description>Latest updates and information from the StormCrawler website.</description>
<language>en-us</language>
<lastBuildDate>Sat, 19 Oct 2024 11:21:53 +0000</lastBuildDate>
<pubDate>Sat, 19 Oct 2024 11:21:53 +0000</pubDate>

<item>
<title>StormCrawler Homepage</title>
<link>https://stormcrawler.apache.org/</link>
<description>Main page of the StormCrawler website.</description>
<pubDate>Sat, 19 Oct 2024 11:21:53 +0000</pubDate>
<guid>https://stormcrawler.apache.org/</guid>
</item>

<item>
<title>StormCrawler Index Page</title>
<link>https://stormcrawler.apache.org/index.html</link>
<description>Index page of the StormCrawler website.</description>
<pubDate>Sat, 19 Oct 2024 11:21:53 +0000</pubDate>
<guid>https://stormcrawler.apache.org/index.html</guid>
</item>

<item>
<title>StormCrawler Download</title>
<link>https://stormcrawler.apache.org/download/index.html</link>
<description>Download page for StormCrawler resources.</description>
<pubDate>Sat, 19 Oct 2024 11:21:53 +0000</pubDate>
<guid>https://stormcrawler.apache.org/download/index.html</guid>
</item>

<item>
<title>Getting Started with StormCrawler</title>
<link>https://stormcrawler.apache.org/getting-started/</link>
<description>Guide to getting started with StormCrawler.</description>
<pubDate>Sat, 19 Oct 2024 11:21:53 +0000</pubDate>
<guid>https://stormcrawler.apache.org/getting-started/</guid>
</item>

<item>
<title>StormCrawler FAQ</title>
<link>https://stormcrawler.apache.org/faq/</link>
<description>Frequently asked questions about StormCrawler.</description>
<pubDate>Sat, 19 Oct 2024 11:21:53 +0000</pubDate>
<guid>https://stormcrawler.apache.org/faq/</guid>
</item>

<item>
<title>StormCrawler Support</title>
<link>https://stormcrawler.apache.org/support/</link>
<description>Support information for StormCrawler users.</description>
<pubDate>Sat, 19 Oct 2024 11:21:53 +0000</pubDate>
<guid>https://stormcrawler.apache.org/support/</guid>
</item>
</channel>
</rss>
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,24 @@ specific language governing permissions and limitations
under the License.
-->
<sitemapindex
- xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+ xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd">
- <sitemap>
- <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806509-en_US-hotel_review-1686849999.xml.gz</loc>
- <lastmod>2023-06-15T17:26:39Z</lastmod>
- </sitemap>
- <sitemap>
- <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806530-en_US-hotel_review-1686850054.xml.gz</loc>
- <lastmod>2023-06-15T17:27:34Z</lastmod>
- </sitemap>
- <sitemap>
- <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1806537-en_US-hotel_review-1686850072.xml.gz</loc>
- <lastmod>2023-06-15T17:27:52Z</lastmod>
- </sitemap>
- <sitemap>
- <loc>https://www.tripadvisor.com/sitemap/2/en_US/sitemap-1841024-en_US-hotel_review-1694976638.xml.gz</loc>
- <lastmod>2023-09-17T18:50:38Z</lastmod>
- </sitemap>
- </sitemapindex>
+ <sitemap>
+ <loc>https://stormcrawler.apache.org/sitemap-001.xml.gz</loc>
+ <lastmod>2024-10-19T11:21:53Z</lastmod>
+ </sitemap>
+ <sitemap>
+ <loc>https://stormcrawler.apache.org/sitemap-002.xml.gz</loc>
+ <lastmod>2024-10-19T11:21:53Z</lastmod>
+ </sitemap>
+ <sitemap>
+ <loc>https://stormcrawler.apache.org/sitemap-003.xml.gz</loc>
+ <lastmod>2024-10-19T11:21:53Z</lastmod>
+ </sitemap>
+ <sitemap>
+ <loc>https://stormcrawler.apache.org/sitemap-004.xml.gz</loc>
+ <lastmod>2024-10-19T11:21:53Z</lastmod>
+ </sitemap>
+ </sitemapindex>
Binary file added core/src/test/resources/stormcrawler.sitemap.xml.gz
Binary file not shown.
Binary file removed core/src/test/resources/tripadvisor.sitemap.xml.gz
Binary file not shown.

0 comments on commit 0ae250f

Please sign in to comment.