Skip to content

Commit

Permalink
Remove references to digitalpebble.com (sitemaps, src issue refs)
Browse files Browse the repository at this point in the history
  • Loading branch information
rzo1 committed Nov 22, 2024
1 parent 553fb58 commit 60e195d
Show file tree
Hide file tree
Showing 38 changed files with 375 additions and 360 deletions.
3 changes: 1 addition & 2 deletions archetype/src/main/resources/archetype-resources/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ Have a look at the code and resources and modify them to your heart's content.

# Prerequisites

You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. Alternatively,
the [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project contains resources for running Apache Storm on Docker.
You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help.

You also need to have an instance of URLFrontier running. See [the URLFrontier README](https://github.com/crawler-commons/url-frontier/tree/master/service); the easiest way is to use Docker, like so:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ public void run() {
metadata = new Metadata();
}

// https://github.com/DigitalPebble/storm-crawler/issues/813
// https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");

boolean asap = false;
Expand Down Expand Up @@ -568,7 +568,7 @@ public void run() {
}

// has found sitemaps
// https://github.com/DigitalPebble/storm-crawler/issues/710
// https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs were actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
Expand Down Expand Up @@ -732,7 +732,7 @@ public void run() {
mergedMD.setValue("_redirTo", redirection);
}

// https://github.com/DigitalPebble/storm-crawler/issues/954
// https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(fit.t, url, redirection, mergedMD);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ public void execute(Tuple tuple) {
LOG.info("Found redir in {} to {}", url, redirection);
metadata.setValue("_redirTo", redirection);

// https://github.com/DigitalPebble/storm-crawler/issues/954
// https://github.com/apache/incubator-stormcrawler/issues/954
if (allowRedirs() && StringUtils.isNotBlank(redirection)) {
emitOutlink(tuple, new URL(url), redirection, metadata);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ public void execute(Tuple input) {
metadata = new Metadata();
}

// https://github.com/DigitalPebble/storm-crawler/issues/813
// https://github.com/apache/incubator-stormcrawler/issues/813
metadata.remove("fetch.exception");

URL url;
Expand Down Expand Up @@ -326,7 +326,7 @@ public void execute(Tuple input) {
}

// has found sitemaps
// https://github.com/DigitalPebble/storm-crawler/issues/710
// https://github.com/apache/incubator-stormcrawler/issues/710
// note: we don't care if the sitemap URLs were actually
// kept
boolean foundSitemap = (rules.getSitemaps().size() > 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public class BasicURLNormalizer extends URLFilter {
/** Nutch 1098 - finds URL encoded parts of the URL */
private static final Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})");

/** https://github.com/DigitalPebble/storm-crawler/issues/401 * */
/** https://github.com/apache/incubator-stormcrawler/issues/401 * */
private static final Pattern illegalEscapePattern = Pattern.compile("%u([0-9A-Fa-f]{4})");

// charset used for encoding URLs before escaping
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ public void loadJSONResources(InputStream inputStream)

// if it contains a single object
// jump directly to its content
// https://github.com/DigitalPebble/storm-crawler/issues/1013
// https://github.com/apache/incubator-stormcrawler/issues/1013
if (rootNode.size() == 1 && rootNode.isObject()) {
rootNode = rootNode.fields().next().getValue();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
* </pre>
*
* Will be replaced by <a href=
* "https://github.com/DigitalPebble/storm-crawler/issues/711">MetadataFilter to filter based on
* "https://github.com/apache/incubator-stormcrawler/issues/711">MetadataFilter to filter based on
* multiple key values</a>
*
* @since 1.14
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ public void execute(Tuple tuple) {
if (!status.equals(Status.FETCH_ERROR)) {
metadata.remove(Constants.fetchErrorCountParamName);
}
// https://github.com/DigitalPebble/storm-crawler/issues/415
// https://github.com/apache/incubator-stormcrawler/issues/415
// remove error related key values in case of success
if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) {
metadata.remove(Constants.STATUS_ERROR_CAUSE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public class ProtocolResponse {

/**
* @since 1.17
* @see <a href="https://github.com/DigitalPebble/storm-crawler/issues/776">Issue 776</a>
* @see <a href="https://github.com/apache/incubator-stormcrawler/issues/776">Issue 776</a>
*/
public static final String PROTOCOL_MD_PREFIX_PARAM = "protocol.md.prefix";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ private static String getCharsetFromMeta(byte buffer[], int maxlength) {
int start = html.indexOf("<meta charset=\"");
if (start != -1) {
int end = html.indexOf('"', start + 15);
// https://github.com/DigitalPebble/storm-crawler/issues/870
// https://github.com/apache/incubator-stormcrawler/issues/870
// try on a slightly larger section of text if it is trimmed
if (end == -1 && ((maxlength + 10) < buffer.length)) {
return getCharsetFromMeta(buffer, maxlength + 10);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ void testSitemapParsing() throws IOException {
metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
// and its mime-type
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata);
Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size());
parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata);
Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
// TODO test that the new links have the right metadata
List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0);
Assertions.assertEquals(3, fields.size());
Expand Down Expand Up @@ -101,7 +101,7 @@ void testSitemapParsingWithImageExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
"digitalpebble.sitemap.extensions.image.xml",
"stormcrawler.sitemap.extensions.image.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
Expand All @@ -120,7 +120,7 @@ void testSitemapParsingWithMobileExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
"digitalpebble.sitemap.extensions.mobile.xml",
"stormcrawler.sitemap.extensions.mobile.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
Expand All @@ -139,7 +139,7 @@ void testSitemapParsingWithLinkExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
"digitalpebble.sitemap.extensions.links.xml",
"stormcrawler.sitemap.extensions.links.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
Expand All @@ -158,7 +158,7 @@ void testSitemapParsingWithNewsExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
"digitalpebble.sitemap.extensions.news.xml",
"stormcrawler.sitemap.extensions.news.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
Expand All @@ -177,7 +177,7 @@ void testSitemapParsingWithVideoExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
"digitalpebble.sitemap.extensions.video.xml",
"stormcrawler.sitemap.extensions.video.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
Expand All @@ -203,7 +203,7 @@ void testSitemapParsingWithAllExtensions() throws IOException {
metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
parse(
"http://stormcrawler.apache.org/sitemap.xml",
"digitalpebble.sitemap.extensions.all.xml",
"stormcrawler.sitemap.extensions.all.xml",
metadata);
Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0);
Metadata parsedMetadata = (Metadata) values.get(1);
Expand Down Expand Up @@ -237,8 +237,8 @@ void testSitemapParsingNoMT() throws IOException {
Metadata metadata = new Metadata();
// do not specify that it is a sitemap file
// do not set the mimetype
parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata);
Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size());
parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata);
Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size());
// TODO test that the new links have the right metadata
List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0);
Assertions.assertEquals(3, fields.size());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ void testLowerCasing() throws MalformedURLException {
assertEquals(expectedResult, normalizedUrl, "Failed to filter query string");
}

// https://github.com/DigitalPebble/storm-crawler/issues/401
// https://github.com/apache/incubator-stormcrawler/issues/401
@Test
void testNonStandardPercentEncoding() throws MalformedURLException {
URLFilter urlFilter = createFilter(false, false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ void testDomainNotAllowed() throws MalformedURLException {
String filterResult = createFilter().filter(url, metadata, url.toExternalForm());
Assertions.assertEquals(null, filterResult);
// allowed
url = new URL("http://stormcrawler.net/digitalpebble/");
url = new URL("http://stormcrawler.net/bla/");
filterResult = createFilter().filter(url, metadata, url.toExternalForm());
Assertions.assertEquals(url.toString(), filterResult);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void testBadCanonicalURL() throws Exception {
config.put(AbstractIndexerBolt.urlFieldParamName, "url");
config.put(AbstractIndexerBolt.canonicalMetadataParamName, "canonical");
Metadata metadata = new Metadata();
metadata.setValue("canonical", "htp://www.digitalpebble.com/");
metadata.setValue("canonical", "htp://stormcrawler.apache.org/");
prepareIndexerBolt(config);
index(URL, metadata);
Map<String, String> fields = ((DummyIndexer) bolt).returnFields();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ void testBasicExtraction() throws IOException {
}

@Test
// https://github.com/DigitalPebble/storm-crawler/issues/219
// https://github.com/apache/incubator-stormcrawler/issues/219
void testScriptExtraction() throws IOException {
prepareParserBolt("test.jsoupfilters.json");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import org.junit.jupiter.api.Test;

/**
* @see https://github.com/DigitalPebble/storm-crawler/pull/653 *
* @see https://github.com/apache/incubator-stormcrawler/pull/653 *
*/
class StackOverflowTest extends ParsingTester {

Expand All @@ -47,7 +47,7 @@ void testStackOverflow() throws IOException {
}

/**
* @see https://github.com/DigitalPebble/storm-crawler/issues/666 *
* @see https://github.com/apache/incubator-stormcrawler/issues/666 *
*/
@Test
void testNamespaceExtraction() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void testSitemapSubdocuments() throws IOException {
config.put("detect.mimetype", false);
prepareParserBolt("test.subdocfilter.json", config);
Metadata metadata = new Metadata();
parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata);
Assertions.assertEquals(6, output.getEmitted().size());
parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata);
Assertions.assertEquals(7, output.getEmitted().size());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ void testBasicExtraction() throws IOException {
}

@Test
// https://github.com/DigitalPebble/storm-crawler/issues/219
// https://github.com/apache/incubator-stormcrawler/issues/219
void testScriptExtraction() throws IOException {
prepareParserBolt("test.parsefilters.json");
parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void getProtocolTest() throws FileNotFoundException {
// try single filter
Metadata meta = new Metadata();
meta.setValue("js", "true");
FilteredProtocol pf = superProto.getProtocolFor("https://digitalpebble.com", meta);
FilteredProtocol pf = superProto.getProtocolFor("https://stormcrawler.apache.org", meta);
Assertions.assertEquals(pf.id, "second");
// no filter at all
meta = new Metadata();
Expand Down
69 changes: 0 additions & 69 deletions core/src/test/resources/digitalpebble.sitemap.extensions.news.xml

This file was deleted.

Loading

0 comments on commit 60e195d

Please sign in to comment.