Skip to content

Commit

Permalink
Make search context configurable to set max token length (#128).
Browse files Browse the repository at this point in the history
Change-Id: I81eacce8fb0991ed2c2470950f36cfbc02b9ea7f
  • Loading branch information
margaretha committed May 15, 2024
1 parent da16975 commit afd2035
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 36 deletions.
3 changes: 2 additions & 1 deletion Changes
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
0.62.4 2024-05-7
0.62.4 2024-05-15
- [feature] Make match size configurable (address #128,
diewald & margaretha)
- [feature] Make context size configurable (#128)

0.62.3 2024-04-16
- [cleanup] Added getDocBitsSupplier to VirtualCorpusFilter (margaretha)
Expand Down
12 changes: 10 additions & 2 deletions src/main/java/de/ids_mannheim/korap/Krill.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
import de.ids_mannheim.korap.response.Response;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;
import de.ids_mannheim.korap.response.VirtualCorpusResponse;
import de.ids_mannheim.korap.util.KrillConfiguration;
import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;

/**
Expand Down Expand Up @@ -304,9 +304,17 @@ public Result apply () {
else if (this.index == null) {
kr.addError(601, "Unable to find index");
}

// Apply search
else {

KrillConfiguration config = getConfig();
if (config != null) {
KrillMeta meta = this.getMeta();
SearchContext context = meta.getContext();
context.updateContext(config);
}

// This contains meta and matches
kr = this.index.search(this);
// this.getCollection().setIndex(this.index);
Expand Down
65 changes: 42 additions & 23 deletions src/main/java/de/ids_mannheim/korap/response/SearchContext.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package de.ids_mannheim.korap.response;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.*;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;

import de.ids_mannheim.korap.util.KrillConfiguration;

public class SearchContext {
ObjectMapper mapper = new ObjectMapper();
Expand All @@ -25,14 +27,14 @@ public class SearchContext {


public SearchContext () {};


// EM: not used?
/**
 * Creates a span-based search context: flags the context as a span
 * type and stores the given span context value.
 *
 * @param spanContext
 *            the value kept as this context's span context
 */
public SearchContext (String spanContext) {
// mark the context as span-based
this.spanType = true;
this.spanContext = spanContext;
};


// EM: seems to be deprecated. used in a deprecated search method
public SearchContext (boolean leftTokenContext, short leftContext,
boolean rightTokenContext, short rightContext) {
this.spanType = false;
Expand Down Expand Up @@ -69,40 +71,62 @@ else if (spanContext.equals("paragraph")) {
return this;
};

/**
 * Applies the maximum context size from the given configuration to
 * both the left and the right side of this context.
 *
 * After the new maximum is set, the current length of every
 * token-based side is passed through {@code setLength} again, so the
 * length is re-checked against the new maximum. Sides that are not
 * token-based keep their length untouched.
 *
 * @param krillConfig
 *            configuration providing the maximum number of context
 *            tokens; must not be null
 */
public void updateContext (KrillConfiguration krillConfig) {
    final int maxTokens = krillConfig.getMaxContextTokens();

    // the same upper bound is used for both sides
    left.setMaxLength(maxTokens);
    right.setMaxLength(maxTokens);

    // re-apply the token lengths so they are validated against the
    // new upper bound
    if (left.isToken()) {
        final int currentLeft = left.getLength();
        left.setLength(currentLeft);
    }
    if (right.isToken()) {
        final int currentRight = right.getLength();
        right.setLength(currentRight);
    }
}


public class SearchContextSide {
private boolean type = true;
private short length = 6;
private short maxLength = 500;


private boolean isToken = true;
private int length = 6;
private int maxLength = 500;

public SearchContextSide () {}

/**
 * Returns the upper bound that {@code setLength} compares requested
 * lengths against.
 *
 * @return the current maximum length of this context side
 */
public int getMaxLength () {
return maxLength;
}
/**
 * Replaces the upper bound for the length of this context side.
 *
 * Only the bound itself is stored here; the length that is already
 * set is not touched by this method.
 *
 * @param maxLength
 *            the new maximum length
 */
public void setMaxLength (int maxLength) {
this.maxLength = maxLength;
}

public boolean isToken () {
return this.type;
return this.isToken;
};


public boolean isCharacter () {
return !(this.type);
return !(this.isToken);
};


public SearchContextSide setToken (boolean value) {
this.type = value;
this.isToken = value;
return this;
};


public SearchContextSide setCharacter (boolean value) {
this.type = !(value);
this.isToken = !(value);
return this;
};


public short getLength () {
public int getLength () {
return this.length;
};


public SearchContextSide setLength (short value) {
public SearchContextSide setLength (int value) {
if (value >= 0) {
if (value <= maxLength) {
this.length = value;
Expand All @@ -115,11 +139,6 @@ public SearchContextSide setLength (short value) {
};


public SearchContextSide setLength (int value) {
return this.setLength((short) value);
};


public void fromJson (JsonNode json) {
String type = json.get(0).asText();
if (type.equals("token")) {
Expand Down
105 changes: 105 additions & 0 deletions src/test/java/de/ids_mannheim/korap/index/TestMaxContext.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package de.ids_mannheim.korap.index;

import static de.ids_mannheim.korap.TestSimple.getJsonString;
import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.Properties;

import org.junit.BeforeClass;
import org.junit.Test;

import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;
import de.ids_mannheim.korap.util.KrillConfiguration;
import de.ids_mannheim.korap.util.KrillProperties;

/**
 * Tests that the maximum context size of a search can be restricted
 * through a {@link KrillConfiguration}, either set programmatically
 * or created from properties.
 */
public class TestMaxContext {
    private static KrillIndex index;
    private static String query;

    /**
     * Builds the index from a single wiki document and loads the
     * query shared by all tests.
     */
    @BeforeClass
    public static void init () throws IOException {
        index = new KrillIndex();
        String[] docIds = { "00001" };
        for (String docId : docIds) {
            String resource = "/wiki/" + docId + ".json.gz";
            index.addDoc(
                    TestMaxContext.class.getResourceAsStream(resource),
                    true);
        }
        index.commit();

        // the query asks for 5 tokens of left and right context
        String queryFile = TestMaxContext.class
                .getResource(
                        "/queries/position/sentence-contain-token.json")
                .getFile();
        query = getJsonString(queryFile);
    }

    /**
     * Runs the query without a configuration first, then again with
     * the maximum context limited to 2 tokens.
     */
    @Test
    public void searchWithContextTokenSize () throws IOException {
        Krill krill = new Krill(query);
        Result result = krill.apply(index);

        // no configuration: maximum of 500, requested length of 5
        assertContextSizes(result.getContext(), 500, 5);
        assertSnippets(result,
                "... eine durchschnittliche Häufigkeit von 6,51 %. [[Er ist damit der sechsthäufigste Buchstabe in deutschen Texten]]. Mit Ausnahme von Fremdwörtern und ...",
                "<span class=\"context-left\"><span class=\"more\"></span>eine durchschnittliche Häufigkeit von 6,51 %. </span><span class=\"match\"><mark>Er ist damit der sechsthäufigste Buchstabe in deutschen Texten</mark></span><span class=\"context-right\">. Mit Ausnahme von Fremdwörtern und<span class=\"more\"></span></span>");

        // limiting max context tokens
        int maxLength = 2;
        KrillConfiguration config = new KrillConfiguration();
        config.setMaxContextTokens(maxLength);
        krill.setConfig(config);

        result = krill.apply(index);

        assertContextSizes(result.getContext(), 2, 2);
        assertSnippets(result,
                "... von 6,51 %. [[Er ist damit der sechsthäufigste Buchstabe in deutschen Texten]]. Mit Ausnahme ...",
                "<span class=\"context-left\"><span class=\"more\"></span>von 6,51 %. </span><span class=\"match\"><mark>Er ist damit der sechsthäufigste Buchstabe in deutschen Texten</mark></span><span class=\"context-right\">. Mit Ausnahme<span class=\"more\"></span></span>");
    }

    /**
     * Runs the query with a configuration created from the default
     * properties; these are expected to limit the context to 3
     * tokens.
     */
    @Test
    public void searchWithProperties () throws IOException {
        Krill krill = new Krill(query);

        // limiting max context tokens
        Properties defaults = KrillProperties.loadDefaultProperties();
        krill.setConfig(
                KrillConfiguration.createNewConfiguration(defaults));

        Result result = krill.apply(index);

        assertContextSizes(result.getContext(), 3, 3);
        assertSnippets(result,
                "... Häufigkeit von 6,51 %. [[Er]<!>] ist damit der ...",
                "<span class=\"context-left\"><span class=\"more\"></span>Häufigkeit von 6,51 %. </span><span class=\"match\"><mark>Er</mark><span class=\"cutted\"></span></span><span class=\"context-right\"> ist damit der<span class=\"more\"></span></span>");
    }

    /**
     * Asserts that both sides of the context share the expected
     * maximum length and the expected current length.
     */
    private static void assertContextSizes (SearchContext context,
            int expectedMax, int expectedLength) {
        assertEquals(expectedMax, context.left.getMaxLength());
        assertEquals(expectedMax, context.right.getMaxLength());
        assertEquals(expectedLength, context.left.getLength());
        assertEquals(expectedLength, context.right.getLength());
    }

    /**
     * Asserts the bracket and HTML snippets of the first match.
     */
    private static void assertSnippets (Result result,
            String expectedBrackets, String expectedHtml) {
        assertEquals(expectedBrackets,
                result.getMatch(0).getSnippetBrackets());
        assertEquals(expectedHtml, result.getMatch(0).getSnippetHTML());
    }
}
18 changes: 10 additions & 8 deletions src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ public void searchJSONmatchSize () throws IOException {

Krill ks = new Krill(json);
Result kr = ks.apply(ki);
assertEquals(78, kr.getTotalResults());
assertEquals(3, kr.getTotalResults());

assertTrue(kr.getMatch(0).endCutted);

assertEquals(
"... sechsthäufigste Buchstabe in deutschen Texten. [[Mit Ausnahme]<!>] von Fremdwörtern und Namen ist ...",
"... eine durchschnittliche Häufigkeit von 6,51 %. [[Er ist]<!>] damit der sechsthäufigste Buchstabe in ...",
kr.getMatch(0).getSnippetBrackets());
assertEquals(
"<span class=\"context-left\"><span class=\"more\"></span>sechsthäufigste Buchstabe in deutschen Texten. </span><span class=\"match\"><mark>Mit Ausnahme</mark><span class=\"cutted\"></span></span><span class=\"context-right\"> von Fremdwörtern und Namen ist<span class=\"more\"></span></span>",
"<span class=\"context-left\"><span class=\"more\"></span>eine durchschnittliche Häufigkeit von 6,51 %. </span><span class=\"match\"><mark>Er ist</mark><span class=\"cutted\"></span></span><span class=\"context-right\"> damit der sechsthäufigste Buchstabe in<span class=\"more\"></span></span>",
kr.getMatch(0).getSnippetHTML());
}

Expand All @@ -67,10 +69,10 @@ public void testLimitingMatchInKrill () throws IOException {
Result kr = ks.apply(ki);

assertEquals(
"... sechsthäufigste Buchstabe in deutschen Texten. [[Mit Ausnahme von]<!>] Fremdwörtern und Namen ist das ...",
"... eine durchschnittliche Häufigkeit von 6,51 %. [[Er ist damit]<!>] der sechsthäufigste Buchstabe in deutschen ...",
kr.getMatch(0).getSnippetBrackets());
assertEquals(
"<span class=\"context-left\"><span class=\"more\"></span>sechsthäufigste Buchstabe in deutschen Texten. </span><span class=\"match\"><mark>Mit Ausnahme von</mark><span class=\"cutted\"></span></span><span class=\"context-right\"> Fremdwörtern und Namen ist das<span class=\"more\"></span></span>",
"<span class=\"context-left\"><span class=\"more\"></span>eine durchschnittliche Häufigkeit von 6,51 %. </span><span class=\"match\"><mark>Er ist damit</mark><span class=\"cutted\"></span></span><span class=\"context-right\"> der sechsthäufigste Buchstabe in deutschen<span class=\"more\"></span></span>",
kr.getMatch(0).getSnippetHTML());
};

Expand All @@ -86,12 +88,12 @@ public void testLimitingMatchWithProperties () throws IOException {
ks.setConfig(config);

Result kr = ks.apply(ki);

assertEquals(
"... sechsthäufigste Buchstabe in deutschen Texten. [[Mit]<!>] Ausnahme von Fremdwörtern und Namen ...",
"... Häufigkeit von 6,51 %. [[Er]<!>] ist damit der ...",
kr.getMatch(0).getSnippetBrackets());
assertEquals(
"<span class=\"context-left\"><span class=\"more\"></span>sechsthäufigste Buchstabe in deutschen Texten. </span><span class=\"match\"><mark>Mit</mark><span class=\"cutted\"></span></span><span class=\"context-right\"> Ausnahme von Fremdwörtern und Namen<span class=\"more\"></span></span>",
"<span class=\"context-left\"><span class=\"more\"></span>Häufigkeit von 6,51 %. </span><span class=\"match\"><mark>Er</mark><span class=\"cutted\"></span></span><span class=\"context-right\"> ist damit der<span class=\"more\"></span></span>",
kr.getMatch(0).getSnippetHTML());
};

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"meta":{
"startPage" : 2,
"startPage" : 1,
"count" : 5,
"context" : {
"left" : [ "token", 5 ],
Expand All @@ -24,7 +24,7 @@
"foundry": "opennlp",
"@type": "koral:term",
"match": "match:eq",
"key": "ADJA",
"key": "PROAV",
"layer": "p"
}
}
Expand Down

0 comments on commit afd2035

Please sign in to comment.