diff --git a/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/archetype/src/main/resources/archetype-resources/crawler-conf.yaml index 65a18280a..286a05159 100644 --- a/archetype/src/main/resources/archetype-resources/crawler-conf.yaml +++ b/archetype/src/main/resources/archetype-resources/crawler-conf.yaml @@ -115,6 +115,17 @@ config: indexer.url.fieldname: "url" indexer.text.fieldname: "content" indexer.canonical.name: "canonical" + # How to convert metadata key values into fields for indexing + # + # If no alias is specified with =alias, the key itself is used as the field name; + # for instance below, _domain_ and _format_ will be used + # as field names, whereas _title_ will be used for _parse.title_. + # You can specify the index of the value to store from the values array + # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to + # get the first value for the metadata _parse.title_ (by default, all the values are stored). + # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would + # index all the keys with _parse._ as a prefix. Note that in that case, you can't + # specify an alias with =, nor can you specify an index. 
indexer.md.mapping: - parse.title=title - parse.keywords=keywords diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/indexing/AbstractIndexerBolt.java b/core/src/main/java/com/digitalpebble/stormcrawler/indexing/AbstractIndexerBolt.java index 1cd62e997..ea731c947 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/indexing/AbstractIndexerBolt.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/indexing/AbstractIndexerBolt.java @@ -21,9 +21,12 @@ import crawlercommons.domains.PaidLevelDomain; import java.net.MalformedURLException; import java.net.URL; +import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; -import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.ArrayUtils; @@ -71,7 +74,7 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt { private String[] filterKeyValue = null; - private final Map metadata2field = new HashMap<>(); + private final List metadata2field = new ArrayList<>(); private String fieldNameForText = null; @@ -83,22 +86,46 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt { private boolean ignoreEmptyFields = false; - static class Key { + private static class Key { private final String key; + private final String alias; private final int index; + private final boolean glob; - public Key(String key, int index) { - this.key = key; + public Key(String key, int index, String alias) { this.index = index; + this.alias = alias; + if (key.endsWith("*")) { + this.key = key.substring(0, key.length() - 1); + this.glob = true; + // can't have an alias + // or an index + if (index != -1 || alias != null) { + throw new RuntimeException( + "Can't have a mapping for indexer.md.mapping with a glob and index or alias"); + } + } else { + this.key = key; + this.glob = false; + } } public String getKey() { return key; } + public String 
getAlias() { + // return the alias if set + return alias; + } + public int getIndex() { return index; } + + public boolean isGlob() { + return glob; + } } @Override @@ -137,7 +164,7 @@ public void prepare( } else { mapping = mapping.trim(); key = mapping; - value = mapping; + value = null; } int index = -1; Matcher match = indexValuePattern.matcher(key); @@ -145,7 +172,7 @@ public void prepare( index = Integer.parseInt(match.group(1)); key = key.substring(0, match.start()); } - metadata2field.put(new Key(key, index), value); + metadata2field.add(new Key(key, index, value)); LOG.info("Mapping key {} to field {}", key, value); } @@ -175,26 +202,42 @@ protected Map filterMetadata(Metadata meta) { Map fieldVals = new HashMap<>(); - for (Entry entry : metadata2field.entrySet()) { - Key key = entry.getKey(); - String[] values = meta.getValues(key.key); - // not found - if (values == null || values.length == 0) { - continue; - } - // check whether we want a specific value or all of them? - int index = key.index; - // want a value index that it outside the range given - if (index >= values.length) { - continue; - } - // store all values available - if (index == -1) { - fieldVals.put(entry.getValue(), values); + for (Key key : metadata2field) { + Set matchingKeys = new HashSet<>(); + // if it is a glob - look for all matching entries in the metadata + if (key.isGlob()) { + matchingKeys = meta.keySet(key.getKey()); + } else { + matchingKeys.add(key.getKey()); } - // or only the one we want - else { - fieldVals.put(entry.getValue(), new String[] {values[index]}); + + for (String matchingKey : matchingKeys) { + String[] values = meta.getValues(matchingKey); + String label = matchingKey; + + // won't be the case for globs + if (key.getAlias() != null) { + label = key.getAlias(); + } + + // not found + if (values == null || values.length == 0) { + continue; + } + // check whether we want a specific value or all of them? 
+ int index = key.index; + // the requested value index is outside the range given + if (index >= values.length) { + continue; + } + // store all values available + if (index == -1) { + fieldVals.put(label, values); + } + // or only the one we want + else { + fieldVals.put(label, new String[] {values[index]}); + } } } diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml index 0917cf175..b1e7995f6 100644 --- a/core/src/main/resources/crawler-default.yaml +++ b/core/src/main/resources/crawler-default.yaml @@ -264,6 +264,17 @@ config: indexer.text.fieldname: "content" indexer.text.maxlength: -1 indexer.canonical.name: "canonical" + # How to convert metadata key values into fields for indexing + # + # If no alias is specified with =alias, the key itself is used as the field name; + # for instance below, _domain_ and _format_ will be used + # as field names, whereas _title_ will be used for _parse.title_. + # You can specify the index of the value to store from the values array + # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to + # get the first value for the metadata _parse.title_ (by default, all the values are stored). + # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would + # index all the keys with _parse._ as a prefix. Note that in that case, you can't + # specify an alias with =, nor can you specify an index. 
indexer.md.mapping: - parse.title=title - parse.keywords=keywords diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/indexer/BasicIndexingTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/indexer/BasicIndexingTest.java index 4689bd5b6..809f274db 100644 --- a/core/src/test/java/com/digitalpebble/stormcrawler/indexer/BasicIndexingTest.java +++ b/core/src/test/java/com/digitalpebble/stormcrawler/indexer/BasicIndexingTest.java @@ -236,4 +236,25 @@ public void testEmptyFilterMetadata() throws Exception { new String[] {"url"}, fields.keySet().toArray()); } + + @Test + public void testGlobFilterMetadata() throws Exception { + Map config = new HashMap(); + config.put(AbstractIndexerBolt.urlFieldParamName, "url"); + List listKV = new ArrayList<>(); + listKV.add("parse.*"); + config.put(AbstractIndexerBolt.metadata2fieldParamName, listKV); + + prepareIndexerBolt(config); + + Metadata metadata = new Metadata(); + metadata.setValue("parse.title", "This is the title"); + metadata.setValue("parse.keywords", "keyword1, keyword2, keyword3"); + metadata.setValue("parse.description", "This is the description"); + + index(URL, metadata); + Map fields = ((DummyIndexer) bolt).returnFields(); + + Assert.assertEquals("Incorrect number of fields", 4, fields.keySet().size()); + } } diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml index 4045dbe26..0e2e543de 100644 --- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml +++ b/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml @@ -109,6 +109,17 @@ config: indexer.url.fieldname: "url" indexer.text.fieldname: "content" indexer.canonical.name: "canonical" + # How to convert metadata key values into fields for indexing + # + # If no alias is specified with =alias, the key itself is used as the field name; + # for instance 
below, _domain_ and _format_ will be used + # as field names, whereas _title_ will be used for _parse.title_. + # You can specify the index of the value to store from the values array + # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to + # get the first value for the metadata _parse.title_ (by default, all the values are stored). + # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would + # index all the keys with _parse._ as a prefix. Note that in that case, you can't + # specify an alias with =, nor can you specify an index. indexer.md.mapping: - parse.title=title - parse.keywords=keywords diff --git a/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml index 16a357130..9dc7eb589 100644 --- a/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml +++ b/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml @@ -109,6 +109,17 @@ config: indexer.url.fieldname: "url" indexer.text.fieldname: "content" indexer.canonical.name: "canonical" + # How to convert metadata key values into fields for indexing + # + # If no alias is specified with =alias, the key itself is used as the field name; + # for instance below, _domain_ and _format_ will be used + # as field names, whereas _title_ will be used for _parse.title_. + # You can specify the index of the value to store from the values array + # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to + # get the first value for the metadata _parse.title_ (by default, all the values are stored). + # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would + # index all the keys with _parse._ as a prefix. Note that in that case, you can't + # specify an alias with =, nor can you specify an index. indexer.md.mapping: - parse.title=title - parse.keywords=keywords