diff --git a/archetype/src/main/resources/archetype-resources/README.md b/archetype/src/main/resources/archetype-resources/README.md
index e973f08fb..9f4fce321 100644
--- a/archetype/src/main/resources/archetype-resources/README.md
+++ b/archetype/src/main/resources/archetype-resources/README.md
@@ -3,8 +3,7 @@ Have a look at the code and resources and modify them to your heart's content.
# Prerequisites
-You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. Alternatively,
-the [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project contains resources for running Apache Storm on Docker.
+You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help.
You also need to have an instance of URLFrontier running. See [the URLFrontier README](https://github.com/crawler-commons/url-frontier/tree/master/service); the easiest way is to use Docker, like so:
diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java
index b4da630ab..3f1477d17 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java
@@ -509,7 +509,7 @@ public void run() {
metadata = new Metadata();
}
- // https://github.com/DigitalPebble/storm-crawler/issues/813
+ // https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");
boolean asap = false;
@@ -568,7 +568,7 @@ public void run() {
}
// has found sitemaps
- // https://github.com/DigitalPebble/storm-crawler/issues/710
+ // https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs where actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
@@ -732,7 +732,7 @@ public void run() {
mergedMD.setValue("_redirTo", redirection);
}
- // https://github.com/DigitalPebble/storm-crawler/issues/954
+ // https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(fit.t, url, redirection, mergedMD);
}
diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
index 015403d06..17214a4d2 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
@@ -347,7 +347,7 @@ public void execute(Tuple tuple) {
LOG.info("Found redir in {} to {}", url, redirection);
metadata.setValue("_redirTo", redirection);
- // https://github.com/DigitalPebble/storm-crawler/issues/954
+ // https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(tuple, new URL(url), redirection, metadata);
}
diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java
index 7c5ccfcc6..0f783d788 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java
@@ -256,7 +256,7 @@ public void execute(Tuple input) {
metadata = new Metadata();
}
- // https://github.com/DigitalPebble/storm-crawler/issues/813
+ // https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");
URL url;
@@ -326,7 +326,7 @@ public void execute(Tuple input) {
}
// has found sitemaps
- // https://github.com/DigitalPebble/storm-crawler/issues/710
+ // https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs where actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java
index 7550327c2..629bc976b 100644
--- a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java
+++ b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java
@@ -50,7 +50,7 @@ public class BasicURLNormalizer extends URLFilter {
/** Nutch 1098 - finds URL encoded parts of the URL */
private static final Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})");
- /** https://github.com/DigitalPebble/storm-crawler/issues/401 * */
+ /** https://github.com/apache/incubator-stormcrawler/issues/401 * */
private static final Pattern illegalEscapePattern = Pattern.compile("%u([0-9A-Fa-f]{4})");
// charset used for encoding URLs before escaping
diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
index 671b9c0f1..50f528f21 100644
--- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
+++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
@@ -112,7 +112,7 @@ public void loadJSONResources(InputStream inputStream)
// if it contains a single object
// jump directly to its content
- // https://github.com/DigitalPebble/storm-crawler/issues/1013
+ // https://github.com/apache/incubator-stormcrawler/issues/1013
if (rootNode.size() == 1 && rootNode.isObject()) {
rootNode = rootNode.fields().next().getValue();
}
diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java
index 6670663e6..5beec2782 100644
--- a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java
+++ b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java
@@ -36,7 +36,7 @@
*
*
* Will be replaced by <a href=
- * "https://github.com/DigitalPebble/storm-crawler/issues/711">MetadataFilter</a> to filter based on
+ * "https://github.com/apache/incubator-stormcrawler/issues/711">MetadataFilter</a> to filter based on
* multiple key values
*
* @since 1.14
diff --git a/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java b/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java
index 04bf9bfe6..44d7a89f9 100644
--- a/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java
@@ -207,7 +207,7 @@ public void execute(Tuple tuple) {
if (!status.equals(Status.FETCH_ERROR)) {
metadata.remove(Constants.fetchErrorCountParamName);
}
- // https://github.com/DigitalPebble/storm-crawler/issues/415
+ // https://github.com/apache/incubator-stormcrawler/issues/415
// remove error related key values in case of success
if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) {
metadata.remove(Constants.STATUS_ERROR_CAUSE);
diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java b/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java
index f997957ff..b79163d81 100644
--- a/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java
+++ b/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java
@@ -58,7 +58,7 @@ public class ProtocolResponse {
/**
* @since 1.17
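- * @see <a href="https://github.com/DigitalPebble/storm-crawler/issues/776">Issue 776</a>
+ * @see <a href="https://github.com/apache/incubator-stormcrawler/issues/776">Issue 776</a>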
*/
public static final String PROTOCOL_MD_PREFIX_PARAM = "protocol.md.prefix";
diff --git a/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java b/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java
index b9a767a99..1ef8a7125 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java
@@ -186,7 +186,7 @@ private static String getCharsetFromMeta(byte buffer[], int maxlength) {
int start = html.indexOf(" fields = output.getEmitted(Constants.StatusStreamName).get(0);
Assertions.assertEquals(3, fields.size());
@@ -101,7 +101,7 @@ void testSitemapParsingWithImageExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
- "digitalpebble.sitemap.extensions.image.xml",
+ "stormcrawler.sitemap.extensions.image.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
@@ -120,7 +120,7 @@ void testSitemapParsingWithMobileExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
- "digitalpebble.sitemap.extensions.mobile.xml",
+ "stormcrawler.sitemap.extensions.mobile.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
@@ -139,7 +139,7 @@ void testSitemapParsingWithLinkExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
- "digitalpebble.sitemap.extensions.links.xml",
+ "stormcrawler.sitemap.extensions.links.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
@@ -158,7 +158,7 @@ void testSitemapParsingWithNewsExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
- "digitalpebble.sitemap.extensions.news.xml",
+ "stormcrawler.sitemap.extensions.news.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
@@ -177,7 +177,7 @@ void testSitemapParsingWithVideoExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
- "digitalpebble.sitemap.extensions.video.xml",
+ "stormcrawler.sitemap.extensions.video.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
@@ -203,7 +203,7 @@ void testSitemapParsingWithAllExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
- "digitalpebble.sitemap.extensions.all.xml",
+ "stormcrawler.sitemap.extensions.all.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
@@ -237,8 +237,8 @@ void testSitemapParsingNoMT() throws IOException {
Metadata metadata = new Metadata();
// do not specify that it is a sitemap file
// do not set the mimetype
- parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata);
- Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size());
+ parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata);
+ Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
// TODO test that the new links have the right metadata
List