diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java
index e38a25818..bc397523b 100644
--- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java
+++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java
@@ -21,6 +21,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.regex.Pattern;
import org.apache.storm.Config;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
@@ -28,9 +29,9 @@
/**
* Protocol implementation that enables selection from a collection of sub-protocols using filters
- * based on each call's metadata
+ * based on each call's metadata and URL.
*
- *
Is configured like this
+ * <p>It is configured like this:
*
*
* protocol.delegator.config:
@@ -43,12 +44,15 @@
* - className: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
* filters:
* robots.txt:
- * - className: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
+ * regex:
+ * - \.pdf
+ * - \.doc
+ * - className: "com.digitalpebble.stormcrawler.protocol.selenium.SeleniumProtocol"
*
*
- * The last one in the list must not have filters as it is used as a default value. The protocols
- * are tried for matches in the order in which they are listed in the configuration. The first to
- * match gets used to fetch a URL.
+ * Typically, the last one in the list has neither filters nor regexes so that it acts as the
+ * default. The protocols are tried for matches in the order in which they are listed in the
+ * configuration. The first to match gets used to fetch a URL.
*
* A filter without value is valid, we just test for the presence of the key.
*
@@ -56,6 +60,9 @@
* files. This is automatically generated by the DelegatorProtocol, you don't need to add it to the
* metadata explicitly.
*
+ * <p>A regex counts as a hit if it is found anywhere in the URL; it does not have to match the
+ * entire URL. The operator logic applies to the regexes as well.
+ *
* @since 2.2
*/
public class DelegatorProtocol implements Protocol {
@@ -83,6 +90,8 @@ static class FilteredProtocol {
final List filters = new ArrayList<>();
final String id;
+ final List urlPatterns = new ArrayList<>();
+
enum Operator {
AND,
OR
@@ -99,7 +108,7 @@ Protocol getProtocolInstance() {
/** Filterless implementation * */
public FilteredProtocol(
@Nullable String id, @NotNull String protocolImpl, @NotNull Config config) {
- this(id, protocolImpl, config, null, null);
+ this(id, protocolImpl, config, null, null, null);
}
public FilteredProtocol(
@@ -107,7 +116,8 @@ public FilteredProtocol(
@NotNull String protocolImpl,
@NotNull Config config,
@Nullable Map filterImpls,
- @Nullable String op) {
+ @Nullable String op,
+ @Nullable List regexps) {
protoInstance =
InitialisationUtil.initializeFromQualifiedName(protocolImpl, Protocol.class);
@@ -123,15 +133,21 @@ public FilteredProtocol(
this.operator = Operator.valueOf(op);
}
+ // regular expressions
+ if (regexps != null) {
+ regexps.forEach(s -> urlPatterns.add(Pattern.compile(s)));
+ }
+
this.id = id;
// log filters found
LOG.info(
- "Loaded {} filters for {}; id {}; operator {}",
+ "Loaded {} filters for {}; id {}; operator {}; regexp {}",
filters.size(),
protocolImpl,
id,
- operator);
+ operator,
+ urlPatterns.size());
}
public ProtocolResponse getProtocolOutput(String url, Metadata metadata) throws Exception {
@@ -146,9 +162,9 @@ public void cleanup() {
protoInstance.cleanup();
}
- boolean isMatch(final Metadata metadata) {
- // if this FP has no filters - it can handle anything
- if (filters.isEmpty()) return true;
+ boolean isMatch(final String url, final Metadata metadata) {
+ // if this FP has neither filters nor regexps, it can handle anything
+ if (filters.isEmpty() && urlPatterns.isEmpty()) return true;
boolean atLeastOneMatch = false;
@@ -179,7 +195,18 @@ boolean isMatch(final Metadata metadata) {
else if (operator.equals(Operator.OR) && match) return true;
}
- // if we get to this point and the operator is AND is means everything has
+ // same approach with the URLs
+ for (Pattern p : urlPatterns) {
+ boolean found = p.asPredicate().test(url);
+ if (found) {
+ atLeastOneMatch = true;
+ }
+ // short-circuit: AND fails on the first miss, OR succeeds on the first hit
+ if (operator.equals(Operator.AND) && !found) return false;
+ else if (operator.equals(Operator.OR) && found) return true;
+ }
+
+ // if we get to this point and the operator is AND, it means everything has
// matched
// but if the operator is OR we need to check that something has matched at all
@@ -207,15 +234,21 @@ public void configure(@NotNull Config conf) {
final Object filters = subConf.get("filters");
final String operator = (String) subConf.get("operator");
final String id = (String) subConf.get("id");
+ final Object regexp = subConf.get("regex");
FilteredProtocol protocol;
- if (filters == null) {
+ if (filters == null && regexp == null) {
protocol = new FilteredProtocol(id, className, conf);
} else {
// noinspection unchecked
protocol =
new FilteredProtocol(
- id, className, conf, (Map) filters, operator);
+ id,
+ className,
+ conf,
+ (Map) filters,
+ operator,
+ (List) regexp);
}
protocols.add(protocol);
}
@@ -238,7 +271,7 @@ public void configure(@NotNull Config conf) {
final FilteredProtocol getProtocolFor(String url, Metadata metadata) {
for (FilteredProtocol p : protocols) {
- if (p.isMatch(metadata)) {
+ if (p.isMatch(url, metadata)) {
return p;
}
}
diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java
index 8c5ef6b31..ef10e7b04 100644
--- a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java
+++ b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java
@@ -88,6 +88,15 @@ public void getProtocolTest() throws FileNotFoundException {
pf = superProto.getProtocolFor("https://www.example-two.com", meta);
- Assert.assertEquals(pf.id, "third");
+ // URLs
+ meta = new Metadata();
+
+ pf = superProto.getProtocolFor("https://www.example-two.com/large.pdf", meta);
+
+ Assert.assertEquals(pf.id, "fourth");
+
+ pf = superProto.getProtocolFor("https://www.example-two.com/large.doc", meta);
+
+ Assert.assertEquals(pf.id, "fourth");
}
}
diff --git a/core/src/test/resources/delegator-conf.yaml b/core/src/test/resources/delegator-conf.yaml
index 48a1bf210..4422f3c09 100644
--- a/core/src/test/resources/delegator-conf.yaml
+++ b/core/src/test/resources/delegator-conf.yaml
@@ -17,6 +17,12 @@ config:
filters:
ping:
pong:
+ - className: "com.digitalpebble.stormcrawler.protocol.DummyProtocol"
+ id: "fourth"
+ operator: OR
+ regex:
+ - \.pdf
+ - \.doc
- className: "com.digitalpebble.stormcrawler.protocol.DummyProtocol"
id: "default"