diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java index e38a25818..bc397523b 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/DelegatorProtocol.java @@ -21,6 +21,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.regex.Pattern; import org.apache.storm.Config; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -28,9 +29,9 @@ /** * Protocol implementation that enables selection from a collection of sub-protocols using filters - * based on each call's metadata + * based on each call's metadata and URL. * - *

Is configured like this + *

It is configured like this: * *

  * protocol.delegator.config:
@@ -43,12 +44,15 @@
  * - className: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
  *   filters:
  *     robots.txt:
- * - className: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
+ *   regex:
+ *    - \.pdf
+ *    - \.doc
+ * - className: "com.digitalpebble.stormcrawler.protocol.selenium.SeleniumProtocol"
  * 
* - * The last one in the list must not have filters as it is used as a default value. The protocols - * are tried for matches in the order in which they are listed in the configuration. The first to - * match gets used to fetch a URL. + * Typically, the last one in the list should have neither filters nor regex entries, as it is + * used as the default. The protocols are tried for matches in the order in which they are listed + * in the configuration. The first to match gets used to fetch a URL. * *

A filter without value is valid, we just test for the presence of the key. * @@ -56,6 +60,9 @@ * files. This is automatically generated by the DelegatorProtocol, you don't need to add it to the * metadata explicitly. * + *

The regex are considered a hit if they are found in the URL, they do not have to match the + * entire URL. The operator logic applies to them as well. + * * @since 2.2 */ public class DelegatorProtocol implements Protocol { @@ -83,6 +90,8 @@ static class FilteredProtocol { final List filters = new ArrayList<>(); final String id; + final List urlPatterns = new ArrayList<>(); + enum Operator { AND, OR @@ -99,7 +108,7 @@ Protocol getProtocolInstance() { /** Filterless implementation * */ public FilteredProtocol( @Nullable String id, @NotNull String protocolImpl, @NotNull Config config) { - this(id, protocolImpl, config, null, null); + this(id, protocolImpl, config, null, null, null); } public FilteredProtocol( @@ -107,7 +116,8 @@ public FilteredProtocol( @NotNull String protocolImpl, @NotNull Config config, @Nullable Map filterImpls, - @Nullable String op) { + @Nullable String op, + @Nullable List regexps) { protoInstance = InitialisationUtil.initializeFromQualifiedName(protocolImpl, Protocol.class); @@ -123,15 +133,21 @@ public FilteredProtocol( this.operator = Operator.valueOf(op); } + // regular expressions + if (regexps != null) { + regexps.forEach(s -> urlPatterns.add(Pattern.compile(s))); + } + this.id = id; // log filters found LOG.info( - "Loaded {} filters for {}; id {}; operator {}", + "Loaded {} filters for {}; id {}; operator {}; regexp {}", filters.size(), protocolImpl, id, - operator); + operator, + urlPatterns.size()); } public ProtocolResponse getProtocolOutput(String url, Metadata metadata) throws Exception { @@ -146,9 +162,9 @@ public void cleanup() { protoInstance.cleanup(); } - boolean isMatch(final Metadata metadata) { - // if this FP has no filters - it can handle anything - if (filters.isEmpty()) return true; + boolean isMatch(final String url, final Metadata metadata) { + // if this FP has no filters nor regexps - it can handle anything + if (filters.isEmpty() && urlPatterns.isEmpty()) return true; boolean atLeastOneMatch = false; @@ -179,7 
+195,18 @@ boolean isMatch(final Metadata metadata) { else if (operator.equals(Operator.OR) && match) return true; } - // if we get to this point and the operator is AND is means everything has + // same approach with the URLs + for (Pattern p : urlPatterns) { + boolean found = p.asPredicate().test(url); + if (found) { + atLeastOneMatch = true; + } + // optimisation + if (operator.equals(Operator.AND) && !found) return false; + else if (operator.equals(Operator.OR) && found) return true; + } + + // if we get to this point and the operator is AND, it means everything has // matched // but if the operator is OR we need to check that something has matched at all @@ -207,15 +234,21 @@ public void configure(@NotNull Config conf) { final Object filters = subConf.get("filters"); final String operator = (String) subConf.get("operator"); final String id = (String) subConf.get("id"); + final Object regexp = subConf.get("regex"); FilteredProtocol protocol; - if (filters == null) { + if (filters == null && regexp == null) { protocol = new FilteredProtocol(id, className, conf); } else { // noinspection unchecked protocol = new FilteredProtocol( - id, className, conf, (Map) filters, operator); + id, + className, + conf, + (Map) filters, + operator, + (List) regexp); } protocols.add(protocol); } @@ -238,7 +271,7 @@ public void configure(@NotNull Config conf) { final FilteredProtocol getProtocolFor(String url, Metadata metadata) { for (FilteredProtocol p : protocols) { - if (p.isMatch(metadata)) { + if (p.isMatch(url, metadata)) { return p; } } diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java index 8c5ef6b31..ef10e7b04 100644 --- a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java +++ b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/DelegationProtocolTest.java @@ -88,6 +88,15 @@ public void 
getProtocolTest() throws FileNotFoundException { pf = superProto.getProtocolFor("https://www.example-two.com", meta); - Assert.assertEquals(pf.id, "third"); + // URLs + meta = new Metadata(); + + pf = superProto.getProtocolFor("https://www.example-two.com/large.pdf", meta); + + Assert.assertEquals(pf.id, "fourth"); + + pf = superProto.getProtocolFor("https://www.example-two.com/large.doc", meta); + + Assert.assertEquals(pf.id, "fourth"); } } diff --git a/core/src/test/resources/delegator-conf.yaml b/core/src/test/resources/delegator-conf.yaml index 48a1bf210..4422f3c09 100644 --- a/core/src/test/resources/delegator-conf.yaml +++ b/core/src/test/resources/delegator-conf.yaml @@ -17,6 +17,12 @@ config: filters: ping: pong: + - className: "com.digitalpebble.stormcrawler.protocol.DummyProtocol" + id: "fourth" + operator: OR + regex: + - \.pdf + - \.doc - className: "com.digitalpebble.stormcrawler.protocol.DummyProtocol" id: "default"