diff --git a/archetype/src/main/resources/archetype-resources/README.md b/archetype/src/main/resources/archetype-resources/README.md index e973f08fb..9f4fce321 100644 --- a/archetype/src/main/resources/archetype-resources/README.md +++ b/archetype/src/main/resources/archetype-resources/README.md @@ -3,8 +3,7 @@ Have a look at the code and resources and modify them to your heart's content. # Prerequisites -You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. Alternatively, -the [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project contains resources for running Apache Storm on Docker. +You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. You also need to have an instance of URLFrontier running. See [the URLFrontier README](https://github.com/crawler-commons/url-frontier/tree/master/service); the easiest way is to use Docker, like so: diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java index b4da630ab..3f1477d17 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java @@ -509,7 +509,7 @@ public void run() { metadata = new Metadata(); } - // https://github.com/DigitalPebble/storm-crawler/issues/813 + // https://github.com/apache/incubator-stormcrawler/issues/813 metadata.remove("fetch.exception"); boolean asap = false; @@ -568,7 +568,7 @@ public void run() { } // has found sitemaps - // https://github.com/DigitalPebble/storm-crawler/issues/710 + // https://github.com/apache/incubator-stormcrawler/issues/710 // note: we don't care if the sitemap URLs where actually // kept boolean foundSitemap = (rules.getSitemaps().size() > 0); @@ -732,7 +732,7 @@ public void run() { mergedMD.setValue("_redirTo", redirection); } - // https://github.com/DigitalPebble/storm-crawler/issues/954 + // https://github.com/apache/incubator-stormcrawler/issues/954 if (allowRedirs() && StringUtils.isNotBlank(redirection)) { emitOutlink(fit.t, url, redirection, mergedMD); } diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 015403d06..17214a4d2 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -347,7 +347,7 @@ public void execute(Tuple tuple) { LOG.info("Found redir in {} to {}", url, redirection); metadata.setValue("_redirTo", redirection); - // https://github.com/DigitalPebble/storm-crawler/issues/954 + // https://github.com/apache/incubator-stormcrawler/issues/954 if (allowRedirs() && StringUtils.isNotBlank(redirection)) { emitOutlink(tuple, new URL(url), redirection, metadata); } diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java index 7c5ccfcc6..0f783d788 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java @@ -256,7 +256,7 @@ public void execute(Tuple input) { metadata = new Metadata(); } - // https://github.com/DigitalPebble/storm-crawler/issues/813 + // https://github.com/apache/incubator-stormcrawler/issues/813 metadata.remove("fetch.exception"); URL url; @@ -326,7 +326,7 @@ public void execute(Tuple input) { } // has found sitemaps - // https://github.com/DigitalPebble/storm-crawler/issues/710 + // https://github.com/apache/incubator-stormcrawler/issues/710 // note: we don't care if the sitemap URLs where actually // kept boolean foundSitemap = (rules.getSitemaps().size() > 0); diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java index 7550327c2..629bc976b 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java @@ -50,7 +50,7 @@ public class BasicURLNormalizer extends URLFilter { /** Nutch 1098 - finds URL encoded parts of the URL */ private static final Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})"); - /** https://github.com/DigitalPebble/storm-crawler/issues/401 * */ + /** https://github.com/apache/incubator-stormcrawler/issues/401 * */ private static final Pattern illegalEscapePattern = Pattern.compile("%u([0-9A-Fa-f]{4})"); // charset used for encoding URLs before escaping diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java index 671b9c0f1..50f528f21 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java @@ -112,7 +112,7 @@ public void loadJSONResources(InputStream inputStream) // if it contains a single object // jump directly to its content - // https://github.com/DigitalPebble/storm-crawler/issues/1013 + // https://github.com/apache/incubator-stormcrawler/issues/1013 if (rootNode.size() == 1 && rootNode.isObject()) { rootNode = rootNode.fields().next().getValue(); } diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java index 6670663e6..5beec2782 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java @@ -36,7 +36,7 @@ * * * Will be replaced by MetadataFilter to filter based on + * "https://github.com/apache/incubator-stormcrawler/issues/711">MetadataFilter to filter based on * multiple key values * * @since 1.14 diff --git a/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java b/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java index 04bf9bfe6..44d7a89f9 100644 --- a/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java @@ -207,7 +207,7 @@ public void execute(Tuple tuple) { if (!status.equals(Status.FETCH_ERROR)) { metadata.remove(Constants.fetchErrorCountParamName); } - // https://github.com/DigitalPebble/storm-crawler/issues/415 + // https://github.com/apache/incubator-stormcrawler/issues/415 // remove error related key values in case of success if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) { metadata.remove(Constants.STATUS_ERROR_CAUSE); diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java b/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java index f997957ff..b79163d81 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java @@ -58,7 +58,7 @@ public class ProtocolResponse { /** * @since 1.17 - * @see Issue 776 + * @see Issue 776 */ public static final String PROTOCOL_MD_PREFIX_PARAM = "protocol.md.prefix"; diff --git a/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java b/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java index b9a767a99..1ef8a7125 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java +++ b/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java @@ -186,7 +186,7 @@ private static String getCharsetFromMeta(byte buffer[], int maxlength) { int start = html.indexOf(" fields = output.getEmitted(Constants.StatusStreamName).get(0); Assertions.assertEquals(3, fields.size()); @@ -101,7 +101,7 @@ void testSitemapParsingWithImageExtensions() throws IOException { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.image.xml", + "stormcrawler.sitemap.extensions.image.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -120,7 +120,7 @@ void testSitemapParsingWithMobileExtensions() throws IOException { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.mobile.xml", + "stormcrawler.sitemap.extensions.mobile.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -139,7 +139,7 @@ void testSitemapParsingWithLinkExtensions() throws IOException { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.links.xml", + "stormcrawler.sitemap.extensions.links.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -158,7 +158,7 @@ void testSitemapParsingWithNewsExtensions() throws IOException { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.news.xml", + "stormcrawler.sitemap.extensions.news.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -177,7 +177,7 @@ void testSitemapParsingWithVideoExtensions() throws IOException { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.video.xml", + "stormcrawler.sitemap.extensions.video.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -203,7 +203,7 @@ void testSitemapParsingWithAllExtensions() throws IOException { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.all.xml", + "stormcrawler.sitemap.extensions.all.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -237,8 +237,8 @@ void testSitemapParsingNoMT() throws IOException { Metadata metadata = new Metadata(); // do not specify that it is a sitemap file // do not set the mimetype - parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); - Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size()); + parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata); + Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size()); // TODO test that the new links have the right metadata List fields = output.getEmitted(Constants.StatusStreamName).get(0); Assertions.assertEquals(3, fields.size()); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java index b9594cc31..250ea401a 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java @@ -289,7 +289,7 @@ void testLowerCasing() throws MalformedURLException { assertEquals(expectedResult, normalizedUrl, "Failed to filter query string"); } - // https://github.com/DigitalPebble/storm-crawler/issues/401 + // https://github.com/apache/incubator-stormcrawler/issues/401 @Test void testNonStandardPercentEncoding() throws MalformedURLException { URLFilter urlFilter = createFilter(false, false); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java index fdf68fb36..4ea88b5b5 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java @@ -53,7 +53,7 @@ void testDomainNotAllowed() throws MalformedURLException { String filterResult = createFilter().filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(null, filterResult); // allowed - url = new URL("http://stormcrawler.net/digitalpebble/"); + url = new URL("http://stormcrawler.net/bla/"); filterResult = createFilter().filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toString(), filterResult); } diff --git a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java index 9b73fc26b..d00bd4a60 100644 --- a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java +++ b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java @@ -87,7 +87,7 @@ void testBadCanonicalURL() throws Exception { config.put(AbstractIndexerBolt.urlFieldParamName, "url"); config.put(AbstractIndexerBolt.canonicalMetadataParamName, "canonical"); Metadata metadata = new Metadata(); - metadata.setValue("canonical", "htp://www.digitalpebble.com/"); + metadata.setValue("canonical", "htp://stormcrawler.apache.org/"); prepareIndexerBolt(config); index(URL, metadata); Map fields = ((DummyIndexer) bolt).returnFields(); diff --git a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java index de433d1c2..33f96dbbb 100644 --- a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java +++ b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java @@ -58,7 +58,7 @@ void testBasicExtraction() throws IOException { } @Test - // https://github.com/DigitalPebble/storm-crawler/issues/219 + // https://github.com/apache/incubator-stormcrawler/issues/219 void testScriptExtraction() throws IOException { prepareParserBolt("test.jsoupfilters.json"); parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); diff --git a/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java b/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java index 3a0a3956e..02abfab5e 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java @@ -28,7 +28,7 @@ import org.junit.jupiter.api.Test; /** - * @see https://github.com/DigitalPebble/storm-crawler/pull/653 * + * @see https://github.com/apache/incubator-stormcrawler/pull/653 * */ class StackOverflowTest extends ParsingTester { @@ -47,7 +47,7 @@ void testStackOverflow() throws IOException { } /** - * @see https://github.com/DigitalPebble/storm-crawler/issues/666 * + * @see https://github.com/apache/incubator-stormcrawler/issues/666 * */ @Test void testNamespaceExtraction() throws IOException { diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java index 408d85033..f74c34a63 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java @@ -40,7 +40,7 @@ void testSitemapSubdocuments() throws IOException { config.put("detect.mimetype", false); prepareParserBolt("test.subdocfilter.json", config); Metadata metadata = new Metadata(); - parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); - Assertions.assertEquals(6, output.getEmitted().size()); + parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata); + Assertions.assertEquals(7, output.getEmitted().size()); } } diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java index a15e08336..7a8077f37 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java @@ -48,7 +48,7 @@ void testBasicExtraction() throws IOException { } @Test - // https://github.com/DigitalPebble/storm-crawler/issues/219 + // https://github.com/apache/incubator-stormcrawler/issues/219 void testScriptExtraction() throws IOException { prepareParserBolt("test.parsefilters.json"); parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java index 9a706829b..a4d25cb60 100644 --- a/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java +++ b/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java @@ -40,7 +40,7 @@ void getProtocolTest() throws FileNotFoundException { // try single filter Metadata meta = new Metadata(); meta.setValue("js", "true"); - FilteredProtocol pf = superProto.getProtocolFor("https://digitalpebble.com", meta); + FilteredProtocol pf = superProto.getProtocolFor("https://stormcrawler.apache.org", meta); Assertions.assertEquals(pf.id, "second"); // no filter at all meta = new Metadata(); diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.news.xml b/core/src/test/resources/digitalpebble.sitemap.extensions.news.xml deleted file mode 100644 index 9243b66b8..000000000 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.news.xml +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - - http://digitalpebble.com/ - 2012-12-05T10:59:04+00:00 - monthly - 1.00 - - - The Example Times - en - - PressRelease, Blog - 2008-12-23 - Companies A, B in Merger Talks - business, merger, acquisition, A, B - NASDAQ:A, NASDAQ:B - - - - http://digitalpebble.com/index.html - 2012-12-05T10:59:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/solutions.html - 2012-09-06T16:53:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/references.html - 2014-04-16T14:40:10+00:00 - monthly - 0.80 - - - http://digitalpebble.com/contact.html - 2012-12-05T10:59:00+00:00 - monthly - 0.80 - - diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.video.xml b/core/src/test/resources/digitalpebble.sitemap.extensions.video.xml deleted file mode 100644 index 20a6a792b..000000000 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.video.xml +++ /dev/null @@ -1,79 +0,0 @@ - - - - - - - http://digitalpebble.com/ - 2012-12-05T10:59:04+00:00 - monthly - 1.00 - - http://www.example.com/thumbs/123.jpg - Grilling steaks for summer - Alkis shows you how to get perfectly done steaks every time - http://www.example.com/video123.flv - http://www.example.com/videoplayer.swf?video=123 - 600 - 2009-11-05T19:20:30+08:00 - 4.2 - 12345 - 2007-11-05T19:20:30+08:00 - sample_tag1 - sample_tag2 - yes - IE GB US CA - http://cooking.example.com - 1.99 - yes - GrillyMcGrillerson - no - - - - http://digitalpebble.com/index.html - 2012-12-05T10:59:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/solutions.html - 2012-09-06T16:53:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/references.html - 2014-04-16T14:40:10+00:00 - monthly - 0.80 - - - http://digitalpebble.com/contact.html - 2012-12-05T10:59:00+00:00 - monthly - 0.80 - - diff --git a/core/src/test/resources/digitalpebble.sitemap.xml b/core/src/test/resources/digitalpebble.sitemap.xml deleted file mode 100644 index 09cea4ba9..000000000 --- a/core/src/test/resources/digitalpebble.sitemap.xml +++ /dev/null @@ -1,57 +0,0 @@ - - - - - - - http://digitalpebble.com/ - 2012-12-05T10:59:04+00:00 - monthly - 1.00 - - - http://digitalpebble.com/index.html - 2012-12-05T10:59:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/solutions.html - 2012-09-06T16:53:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/references.html - 2014-04-16T14:40:10+00:00 - monthly - 0.80 - - - http://digitalpebble.com/contact.html - 2012-12-05T10:59:00+00:00 - monthly - 0.80 - - diff --git a/core/src/test/resources/fast.urlfilter.json b/core/src/test/resources/fast.urlfilter.json index d51953b89..866d8c62b 100644 --- a/core/src/test/resources/fast.urlfilter.json +++ b/core/src/test/resources/fast.urlfilter.json @@ -4,7 +4,7 @@ "patterns" : [ "DenyPathQuery \\.jpg" ] }, { "scope" : "domain:stormcrawler.net", - "patterns" : [ "AllowPath /digitalpebble/", "DenyPath .+" ] + "patterns" : [ "AllowPath /bla/", "DenyPath .+" ] }, { "scope" : "metadata:key=value", "patterns" : [ "DenyPath .+" ] diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.all.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml similarity index 80% rename from core/src/test/resources/digitalpebble.sitemap.extensions.all.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.all.xml index af3f14c74..6958b115a 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.all.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml @@ -76,28 +76,34 @@ under the License. no - - http://digitalpebble.com/index.html - 2012-12-05T10:59:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/solutions.html - 2012-09-06T16:53:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/references.html - 2014-04-16T14:40:10+00:00 - monthly - 0.80 - - - http://digitalpebble.com/contact.html - 2012-12-05T10:59:00+00:00 - monthly - 0.80 - + + https://stormcrawler.apache.org/ + 2024-10-19T11:21:53+00:00 + 1.00 + + + https://stormcrawler.apache.org/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/download/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/getting-started/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/faq/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/support/ + 2024-10-19T11:21:53+00:00 + 0.80 + diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.image.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml similarity index 65% rename from core/src/test/resources/digitalpebble.sitemap.extensions.image.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.image.xml index f5dd7bbb3..99ecb553c 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.image.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml @@ -38,28 +38,34 @@ under the License. https://creativecommons.org/licenses/by/4.0/legalcode - - http://digitalpebble.com/index.html - 2012-12-05T10:59:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/solutions.html - 2012-09-06T16:53:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/references.html - 2014-04-16T14:40:10+00:00 - monthly - 0.80 - - - http://digitalpebble.com/contact.html - 2012-12-05T10:59:00+00:00 - monthly - 0.80 - + + https://stormcrawler.apache.org/ + 2024-10-19T11:21:53+00:00 + 1.00 + + + https://stormcrawler.apache.org/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/download/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/getting-started/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/faq/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/support/ + 2024-10-19T11:21:53+00:00 + 0.80 + diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.links.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.links.xml similarity index 54% rename from core/src/test/resources/digitalpebble.sitemap.extensions.links.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.links.xml index 41382dce5..4d52b2845 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.links.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.links.xml @@ -24,36 +24,35 @@ under the License. http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns:xhtml="http://www.w3.org/1999/xhtml"> - - - http://digitalpebble.com/ - 2012-12-05T10:59:04+00:00 - monthly - 1.00 - - - - http://digitalpebble.com/index.html - 2012-12-05T10:59:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/solutions.html - 2012-09-06T16:53:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/references.html - 2014-04-16T14:40:10+00:00 - monthly - 0.80 - - - http://digitalpebble.com/contact.html - 2012-12-05T10:59:00+00:00 - monthly - 0.80 - + + https://stormcrawler.apache.org/ + 2024-10-19T11:21:53+00:00 + 1.00 + + + + https://stormcrawler.apache.org/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/download/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/getting-started/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/faq/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/support/ + 2024-10-19T11:21:53+00:00 + 0.80 + diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.mobile.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.mobile.xml similarity index 56% rename from core/src/test/resources/digitalpebble.sitemap.extensions.mobile.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.mobile.xml index 16351dc7e..685e302b4 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.mobile.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.mobile.xml @@ -24,36 +24,35 @@ under the License. http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0"> - - - http://digitalpebble.com/ - 2012-12-05T10:59:04+00:00 - monthly - 1.00 - - - - http://digitalpebble.com/index.html - 2012-12-05T10:59:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/solutions.html - 2012-09-06T16:53:04+00:00 - monthly - 0.80 - - - http://digitalpebble.com/references.html - 2014-04-16T14:40:10+00:00 - monthly - 0.80 - - - http://digitalpebble.com/contact.html - 2012-12-05T10:59:00+00:00 - monthly - 0.80 - + + https://stormcrawler.apache.org/ + 2024-10-19T11:21:53+00:00 + 1.00 + + + + https://stormcrawler.apache.org/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/download/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/getting-started/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/faq/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/support/ + 2024-10-19T11:21:53+00:00 + 0.80 + diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.news.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.news.xml new file mode 100644 index 000000000..7723c3c6f --- /dev/null +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.news.xml @@ -0,0 +1,70 @@ + + + + + + https://stormcrawler.apache.org/ + 2024-10-19T11:21:53+00:00 + monthly + 1.00 + + + The Example Times + en + + PressRelease, Blog + 2008-12-23 + Companies A, B in Merger Talks + business, merger, acquisition, A, B + NASDAQ:A, NASDAQ:B + + + + https://stormcrawler.apache.org/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/download/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/getting-started/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/faq/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/support/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.video.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.video.xml new file mode 100644 index 000000000..8023bdda1 --- /dev/null +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.video.xml @@ -0,0 +1,81 @@ + + + + + + + https://stormcrawler.apache.org/ + 2024-10-19T11:21:53+00:00 + monthly + 1.00 + + http://www.example.com/thumbs/123.jpg + Grilling steaks for summer + Alkis shows you how to get perfectly done steaks every time + http://www.example.com/video123.flv + http://www.example.com/videoplayer.swf?video=123 + 600 + 2009-11-05T19:20:30+08:00 + 4.2 + 12345 + 2007-11-05T19:20:30+08:00 + sample_tag1 + sample_tag2 + yes + IE GB US CA + http://cooking.example.com + 1.99 + yes + GrillyMcGrillerson + no + + + + https://stormcrawler.apache.org/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/download/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/getting-started/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/faq/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/support/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + diff --git a/core/src/test/resources/stormcrawler.sitemap.xml b/core/src/test/resources/stormcrawler.sitemap.xml new file mode 100644 index 000000000..7561b576d --- /dev/null +++ b/core/src/test/resources/stormcrawler.sitemap.xml @@ -0,0 +1,60 @@ + + + + + + + + https://stormcrawler.apache.org/ + 2024-10-19T11:21:53+00:00 + 1.00 + + + https://stormcrawler.apache.org/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/download/index.html + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/getting-started/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/faq/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + https://stormcrawler.apache.org/support/ + 2024-10-19T11:21:53+00:00 + 0.80 + + + + \ No newline at end of file diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java index ceb976c4c..d90c4c691 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java @@ -196,7 +196,7 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon return new BulkItemResponseToFailedFlag(bir, failed); }) .collect( - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 Collectors.groupingBy( idWithFailedFlagTuple -> idWithFailedFlagTuple.id, Collectors.toUnmodifiableList())); diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java index ee553106c..183bf15e5 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java @@ -306,7 +306,7 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon return new BulkItemResponseToFailedFlag(bir, failed); }) .collect( - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 Collectors.groupingBy( idWithFailedFlagTuple -> idWithFailedFlagTuple.id, Collectors.toUnmodifiableList())); diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java index 1f8ea55a5..a7708db3d 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java @@ -339,7 +339,7 @@ public void afterBulk(long executionId, BulkRequest request, BulkResponse respon return new BulkItemResponseToFailedFlag(bir, failed); }) .collect( - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 Collectors.groupingBy( idWithFailedFlagTuple -> idWithFailedFlagTuple.id, Collectors.toUnmodifiableList())); diff --git a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java index 60afe2f2c..a53047da6 100644 --- a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java +++ b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java @@ -114,7 +114,7 @@ private int lastIndex(String url, String text, Metadata metadata, long timeoutIn @Test @Timeout(value = 2, unit = TimeUnit.MINUTES) - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 void simultaneousCanonicals() throws ExecutionException, InterruptedException, TimeoutException { Metadata m1 = new Metadata(); diff --git a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java index b6e41f62d..6e738b0cf 100644 --- a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java +++ b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java @@ -129,7 +129,7 @@ private Future store(String url, Status status, Metadata metadata) { @Test @Timeout(value = 2, unit = TimeUnit.MINUTES) - // see https://github.com/DigitalPebble/storm-crawler/issues/885 + // see https://github.com/apache/incubator-stormcrawler/issues/885 void checkListKeyFromOpensearch() throws IOException, ExecutionException, InterruptedException, TimeoutException { String url = "https://www.url.net/something"; diff --git a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java index c41c1403c..f6196b87c 100644 --- a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java +++ b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java @@ -74,7 +74,7 @@ void testRecursiveDoc() throws IOException { /** * Checks that the mimetype whitelists are handled correctly * - * @see https://github.com/DigitalPebble/storm-crawler/issues/712 + * @see https://github.com/apache/incubator-stormcrawler/issues/712 */ void testMimeTypeWhileList() throws IOException { Map conf = new HashMap(); diff --git a/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java b/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java index 1a7c65c84..360b04a85 100644 --- a/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java +++ b/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java @@ -27,7 +27,7 @@ /* * At some point we have to write a mechanism to share the same ManagedChannel in the same runtime - * see: https://github.com/DigitalPebble/storm-crawler/pull/982#issuecomment-1175272094 + * see: https://github.com/apache/incubator-stormcrawler/pull/982#issuecomment-1175272094 */ final class ManagedChannelUtil { private ManagedChannelUtil() {} diff --git a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java index 7e786dc4c..d8c8ec661 100644 --- a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java +++ b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java @@ -74,7 +74,7 @@ public byte[] format(Tuple tuple) { /* * The request record ID is stored in the metadata so that a WARC * response record can later refer to it. Deactivated because of - * https://github.com/DigitalPebble/storm-crawler/issues/721 + * https://github.com/apache/incubator-stormcrawler/issues/721 */ // metadata.setValue("_request.warc_record_id_", mainID);