Skip to content

Commit

Permalink
Make match and context size configurable (address #128)
Browse files Browse the repository at this point in the history
Change-Id: Ieef96dd68adf4e3ce00f59fc21face545c2ce897
  • Loading branch information
margaretha committed May 21, 2024
1 parent e4a5eed commit 464ae45
Show file tree
Hide file tree
Showing 11 changed files with 294 additions and 60 deletions.
4 changes: 4 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.62.4 2024-05-17
- [feature] Make match and context size configurable (address #128,
diewald & margaretha)

0.62.3 2024-04-16
- [cleanup] Added getDocBitsSupplier to VirtualCorpusFilter (margaretha)
- [feature] Make VC cache location customizable (margaretha)
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

<groupId>de.ids-mannheim.korap.krill</groupId>
<artifactId>Krill</artifactId>
<version>0.62.3</version>
<version>0.62.4</version>
<packaging>jar</packaging>

<name>Krill</name>
Expand Down
11 changes: 11 additions & 0 deletions src/main/java/de/ids_mannheim/korap/Krill.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public class Krill extends Response {
private SpanQuery spanQuery;
private JsonNode request;

private int maxTokenMatchSize;
private final ObjectMapper mapper = new ObjectMapper();

/**
Expand Down Expand Up @@ -347,4 +348,14 @@ public JsonNode retrieveFieldValues (String corpusQuery, KrillIndex index,
VirtualCorpusResponse r = new VirtualCorpusResponse();
return r.createKoralQueryForField(fieldName, fieldValues);
}


/**
 * Get the maximum number of tokens a match may have, as set on
 * this Krill object.
 *
 * @return the maximum token match size. KrillIndex#search only
 *         applies it when it is greater than 0 and otherwise
 *         falls back to the size configured on the index.
 */
public int getMaxTokenMatchSize () {
    return this.maxTokenMatchSize;
}


/**
 * Set the maximum number of tokens a match may have for searches
 * run with this Krill object.
 *
 * @param maxMatchTokens
 *            the maximum token match size. KrillIndex#search
 *            only honours values greater than 0; any other value
 *            leaves the size configured on the index in effect.
 */
public void setMaxTokenMatchSize (int maxMatchTokens) {
    this.maxTokenMatchSize = maxMatchTokens;
}
};
54 changes: 40 additions & 14 deletions src/main/java/de/ids_mannheim/korap/KrillIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
Expand Down Expand Up @@ -175,22 +175,27 @@ public final class KrillIndex implements IndexInfo {
private HashMap termContexts;
private ObjectMapper mapper = new ObjectMapper();

private int maxTokenMatchSize;

// private ByteBuffer bbTerm;

// Some initializations ...
{
Properties prop = KrillProperties.loadDefaultProperties();
Properties info = KrillProperties.loadInfo();

if (info != null) {
this.version = info.getProperty("krill.version");
this.name = info.getProperty("krill.name");
};

// Check for auto commit value
String autoCommitStr = null;
if (prop != null)
if (prop != null) {
autoCommitStr = prop.getProperty("krill.index.commit.auto");

this.maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
}

if (autoCommitStr != null) {
try {
this.autoCommit = Integer.parseInt(autoCommitStr);
Expand Down Expand Up @@ -429,8 +434,15 @@ public int getAutoCommit () {
public void setAutoCommit (int value) {
this.autoCommit = value;
};



/**
 * Get the maximum number of tokens a match may have in this index.
 * The value serves as the fallback limit in search and in
 * getMatchInfo calls that do not pass an explicit limit.
 *
 * @return the maximum token match size of this index.
 */
public int getMaxTokenMatchSize () {
    return this.maxTokenMatchSize;
}

/**
 * Set the maximum number of tokens a match may have in this index,
 * replacing the value taken from KrillProperties during
 * initialization.
 *
 * @param maxMatchTokens
 *            the new maximum token match size of this index.
 */
public void setMaxTokenMatchSize (int maxMatchTokens) {
    this.maxTokenMatchSize = maxMatchTokens;
}

/**
* Update a document in the index as a {@link FieldDocument}
* if it already exists (based on the textSigle), otherwise
Expand Down Expand Up @@ -972,11 +984,20 @@ public Match getMatchInfo (String idString, String field, boolean info,
boolean includeSnippets, boolean includeTokens,
boolean includeHighlights, boolean extendToSentence)
throws QueryException {

return getMatchInfo(idString, field, info, foundry, layer, includeSpans,
includeSnippets, includeTokens, includeHighlights,
extendToSentence, maxTokenMatchSize);
};

public Match getMatchInfo (String idString, String field, boolean info,
List<String> foundry, List<String> layer, boolean includeSpans,
boolean includeSnippets, boolean includeTokens,
boolean includeHighlights, boolean extendToSentence,
int maxMatchTokens) throws QueryException {
if (DEBUG)
log.trace("Get info on {}", idString);

Match match = new Match(idString, includeHighlights);
Match match = new Match(maxMatchTokens, idString, includeHighlights);

if (this.getVersion() != null)
match.setVersion(this.getVersion());
Expand Down Expand Up @@ -1202,8 +1223,8 @@ else if (includeSpans) {
&& spanContext[0] < spanContext[1]) {

// Match needs to be cut!
if ((spanContext[1] - spanContext[0]) > match.getMaxMatchTokens()) {
int contextLength = match.getMaxMatchTokens() - match.getLength();
if ((spanContext[1] - spanContext[0]) > maxMatchTokens) {
int contextLength = maxMatchTokens - match.getLength();
int halfContext = contextLength / 2;

// This is the extended context calculated
Expand All @@ -1216,8 +1237,8 @@ else if (includeSpans) {
}
}

match.setStartPos(spanContext[0]);
match.setEndPos(spanContext[1]);
match.setStartPos(maxMatchTokens,spanContext[0]);
match.setEndPos(maxMatchTokens,spanContext[1]);
match.potentialStartPosChar = spanContext[2];
match.potentialEndPosChar = spanContext[3];
match.startMore = false;
Expand Down Expand Up @@ -1569,9 +1590,14 @@ public Result search (Krill ks) {
final Document doc = (fields != null)
? lreader.document(localDocID, fieldsSet)
: lreader.document(localDocID);


int maxMatchSize = maxTokenMatchSize;
if (ks.getMaxTokenMatchSize() > 0) {
maxMatchSize = ks.getMaxTokenMatchSize();
};

// Create new Match
final Match match = new Match(pto, localDocID,
final Match match = new Match(maxMatchSize, pto, localDocID,
spans.start(), spans.end());

// Add snippet if existing
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/de/ids_mannheim/korap/index/Indexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ public static void main (String[] argv) {
Options options = new Options();
options.addOption(Option.builder("c").longOpt("config")
.desc("configuration file (defaults to "
+ KrillProperties.defaultPropertiesLocation
+ KrillProperties.DEFAULT_PROPERTIES_LOCATION
+ ").")
.hasArg().argName("properties file").required().build());
options.addOption(Option.builder("i").longOpt("inputDir")
Expand Down
65 changes: 36 additions & 29 deletions src/main/java/de/ids_mannheim/korap/response/Match.java
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
package de.ids_mannheim.korap.response;

import static de.ids_mannheim.korap.util.KrillByte.unsignedByte;
import static de.ids_mannheim.korap.util.KrillString.codePointSubstring;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.slf4j.Logger;
Expand All @@ -18,13 +27,11 @@
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

import static de.ids_mannheim.korap.util.KrillByte.*;
import static de.ids_mannheim.korap.util.KrillString.codePointSubstring;
import de.ids_mannheim.korap.index.AbstractDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.query.SpanElementQuery;
Expand All @@ -33,6 +40,7 @@
import de.ids_mannheim.korap.response.match.MatchIdentifier;
import de.ids_mannheim.korap.response.match.PosIdentifier;
import de.ids_mannheim.korap.response.match.Relation;
import de.ids_mannheim.korap.util.KrillProperties;

/*
* The snippet building algorithm is quite complicated for now
Expand Down Expand Up @@ -84,8 +92,6 @@ public class Match extends AbstractDocument {

// Logger
private final static Logger log = LoggerFactory.getLogger(Match.class);

private static final int MAX_MATCH_TOKENS = 50;

// end marker of highlights that are pagebreaks
private static final int PB_MARKER = -99999;
Expand Down Expand Up @@ -151,18 +157,19 @@ public class Match extends AbstractDocument {
@JsonIgnore
public boolean startMore = true, endMore = true;

private Collection<byte[]> payload;
// private Collection<byte[]> payload;
private ArrayList<Highlight> highlight;
private LinkedList<int[]> span;

private PositionsToOffset positionsToOffset;
private boolean processed = false;



/**
* Constructs a new Match object.
* Todo: Maybe that's not necessary!
*
* @param maxTokenMatchSize
* The maximum number of tokens a match may have
* @param pto
* The PositionsToOffset object, containing relevant
* positional information for highlighting
Expand All @@ -177,12 +184,12 @@ public class Match extends AbstractDocument {
* @see #snippetBrackets()
* @see PositionsToOffset
*/
public Match (int maxTokenMatchSize, PositionsToOffset pto,
        int localDocID, int startPos, int endPos) {
    this.localDocID = localDocID;
    this.positionsToOffset = pto;

    // The start has to be set before the end, because setEndPos
    // clips the end position relative to the stored start position.
    this.setStartPos(maxTokenMatchSize, startPos);
    this.setEndPos(maxTokenMatchSize, endPos);
};


Expand All @@ -201,7 +208,7 @@ public Match (PositionsToOffset pto, int localDocID, int startPos,
* Boolean value indicating if possible provided
* highlight information should be ignored or not.
*/
public Match (String idString, boolean includeHighlights) {
public Match (int maxTokenMatchSize, String idString, boolean includeHighlights) {
MatchIdentifier id = new MatchIdentifier(idString);

if (id.getStartPos() > -1) {
Expand All @@ -215,8 +222,8 @@ public Match (String idString, boolean includeHighlights) {
this.addString("ID", id.getDocID());
// </legacy>

this.setStartPos(id.getStartPos());
this.setEndPos(id.getEndPos());
this.setStartPos(maxTokenMatchSize, id.getStartPos());
this.setEndPos(maxTokenMatchSize, id.getEndPos());

if (includeHighlights)
for (int[] pos : id.getPos()) {
Expand Down Expand Up @@ -506,11 +513,6 @@ public void addPagebreak (int start, int pagenumber) {
this.addHighlight(new Highlight(start, pagenumber));
};

@JsonIgnore
public int getMaxMatchTokens () {
return MAX_MATCH_TOKENS;
}

/**
* Get document id.
*/
Expand Down Expand Up @@ -575,10 +577,10 @@ public int getStartPos (int number) {
* The positional offset.
*/
@JsonIgnore
public void setStartPos (int pos) {
public void setStartPos (int maxTokenMatchSize, int pos) {
this.startPos = pos;
if (this.endPos != -1 && (this.endPos - pos) > MAX_MATCH_TOKENS) {
this.endPos = pos + MAX_MATCH_TOKENS;
if (this.endPos != -1 && (this.endPos - pos) > maxTokenMatchSize) {
this.endPos = pos + maxTokenMatchSize;
this.endCutted = true;
};
};
Expand Down Expand Up @@ -623,10 +625,15 @@ public int getEndPos (int number) {
* The positional offset.
*/
@JsonIgnore
public void setEndPos (int pos) {
if (this.startPos != -1 && (pos - this.startPos) > MAX_MATCH_TOKENS) {
pos = this.startPos + MAX_MATCH_TOKENS;
this.endCutted = true;
public void setEndPos (int maxTokenMatchSize, int pos) {
if (maxTokenMatchSize > KrillProperties.maxTokenMatchSize) {
maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
this.endCutted = true;
}

if (this.startPos != -1 && (pos - this.startPos) > maxTokenMatchSize) {
pos = this.startPos + maxTokenMatchSize;
this.endCutted = true;
};
this.endPos = pos;
};
Expand Down
Loading

0 comments on commit 464ae45

Please sign in to comment.