Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Glob field mapping for indexer.md.mapping #1130

Merged
merged 3 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions archetype/src/main/resources/archetype-resources/crawler-conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,17 @@ config:
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as
# the field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
# get the first value for the metadata _parse.title_.
# When no index is given, all the values found for the key are kept.
# Finally, you can end a key with a glob (*) to match all the keys sharing
# that prefix, e.g. _parse.*_ would index all the keys with _parse._ as a prefix.
# Note that in that case, you can't specify an alias with =, nor can you
# specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@
import crawlercommons.domains.PaidLevelDomain;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
Expand Down Expand Up @@ -71,7 +74,7 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt {

private String[] filterKeyValue = null;

private final Map<Key, String> metadata2field = new HashMap<>();
private final List<Key> metadata2field = new ArrayList<>();

private String fieldNameForText = null;

Expand All @@ -83,22 +86,46 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt {

private boolean ignoreEmptyFields = false;

static class Key {
private static class Key {
private final String key;
private final String alias;
private final int index;
private final boolean glob;

public Key(String key, int index) {
this.key = key;
public Key(String key, int index, String alias) {
this.index = index;
this.alias = alias;
if (key.endsWith("*")) {
this.key = key.substring(0, key.length() - 1);
this.glob = true;
// can't have an alias
// or an index
if (index != -1 || alias != null) {
throw new RuntimeException(
"Can't have a mapping for indexer.md.mapping with a glob and index or alias");
}
} else {
this.key = key;
this.glob = false;
}
}

public String getKey() {
return key;
}

public String getAlias() {
// return the alias if set
return alias;
}

public int getIndex() {
return index;
}

public boolean isGlob() {
return glob;
}
}

@Override
Expand Down Expand Up @@ -137,15 +164,15 @@ public void prepare(
} else {
mapping = mapping.trim();
key = mapping;
value = mapping;
value = null;
}
int index = -1;
Matcher match = indexValuePattern.matcher(key);
if (match.find()) {
index = Integer.parseInt(match.group(1));
key = key.substring(0, match.start());
}
metadata2field.put(new Key(key, index), value);
metadata2field.add(new Key(key, index, value));
LOG.info("Mapping key {} to field {}", key, value);
}

Expand Down Expand Up @@ -175,26 +202,42 @@ protected Map<String, String[]> filterMetadata(Metadata meta) {

Map<String, String[]> fieldVals = new HashMap<>();

for (Entry<Key, String> entry : metadata2field.entrySet()) {
Key key = entry.getKey();
String[] values = meta.getValues(key.key);
// not found
if (values == null || values.length == 0) {
continue;
}
// check whether we want a specific value or all of them?
int index = key.index;
// want a value index that it outside the range given
if (index >= values.length) {
continue;
}
// store all values available
if (index == -1) {
fieldVals.put(entry.getValue(), values);
for (Key key : metadata2field) {
Set<String> matchingKeys = new HashSet<>();
// if it is a glob - look for all matching entries in the metadata
if (key.isGlob()) {
matchingKeys = meta.keySet(key.getKey());
} else {
matchingKeys.add(key.getKey());
}
// or only the one we want
else {
fieldVals.put(entry.getValue(), new String[] {values[index]});

for (String matchingKey : matchingKeys) {
String[] values = meta.getValues(matchingKey);
String label = matchingKey;

// won't be the case for globs
if (key.getAlias() != null) {
label = key.getAlias();
}

// not found
if (values == null || values.length == 0) {
continue;
}
// check whether we want a specific value or all of them?
int index = key.index;
// want a value index that it outside the range given
if (index >= values.length) {
continue;
}
// store all values available
if (index == -1) {
fieldVals.put(label, values);
}
// or only the one we want
else {
fieldVals.put(label, new String[] {values[index]});
}
}
}

Expand Down
11 changes: 11 additions & 0 deletions core/src/main/resources/crawler-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,17 @@ config:
indexer.text.fieldname: "content"
indexer.text.maxlength: -1
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as
# the field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
# get the first value for the metadata _parse.title_.
# When no index is given, all the values found for the key are kept.
# Finally, you can end a key with a glob (*) to match all the keys sharing
# that prefix, e.g. _parse.*_ would index all the keys with _parse._ as a prefix.
# Note that in that case, you can't specify an alias with =, nor can you
# specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,25 @@ public void testEmptyFilterMetadata() throws Exception {
new String[] {"url"},
fields.keySet().toArray());
}

/**
 * Checks that a glob mapping ({@code parse.*}) indexes every metadata key sharing the prefix.
 *
 * <p>A glob mapping cannot carry an alias, so each matching metadata key is expected to be used
 * verbatim as the field name. Asserting on the names (and not only on the count) prevents the
 * test from passing when the right number of fields is produced under the wrong names.
 */
@Test
public void testGlobFilterMetadata() throws Exception {
    // raw Map kept on purpose: the signature of prepareIndexerBolt is not visible here
    Map config = new HashMap();
    config.put(AbstractIndexerBolt.urlFieldParamName, "url");
    List<String> listKV = new ArrayList<>();
    listKV.add("parse.*");
    config.put(AbstractIndexerBolt.metadata2fieldParamName, listKV);

    prepareIndexerBolt(config);

    Metadata metadata = new Metadata();
    metadata.setValue("parse.title", "This is the title");
    metadata.setValue("parse.keywords", "keyword1, keyword2, keyword3");
    metadata.setValue("parse.description", "This is the description");

    index(URL, metadata);
    Map<String, String> fields = ((DummyIndexer) bolt).returnFields();

    // the URL field plus one field per metadata key matched by the glob
    Assert.assertEquals("Incorrect number of fields", 4, fields.keySet().size());

    String[] expectedFields = {"url", "parse.title", "parse.keywords", "parse.description"};
    for (String expectedField : expectedFields) {
        Assert.assertTrue(
                "Missing field " + expectedField, fields.containsKey(expectedField));
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ config:
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as
# the field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
# get the first value for the metadata _parse.title_.
# When no index is given, all the values found for the key are kept.
# Finally, you can end a key with a glob (*) to match all the keys sharing
# that prefix, e.g. _parse.*_ would index all the keys with _parse._ as a prefix.
# Note that in that case, you can't specify an alias with =, nor can you
# specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ config:
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as
# the field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
# get the first value for the metadata _parse.title_.
# When no index is given, all the values found for the key are kept.
# Finally, you can end a key with a glob (*) to match all the keys sharing
# that prefix, e.g. _parse.*_ would index all the keys with _parse._ as a prefix.
# Note that in that case, you can't specify an alias with =, nor can you
# specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down
Loading