Skip to content

Commit

Permalink
Output binary doc values as hex array in SimpleTextCodec (#12987)
Browse files Browse the repository at this point in the history
Binary doc values were being written as raw bytes by SimpleTextCodec, even
though they may not be valid UTF-8 (i.e. they may not be "text"). This change
writes each value as a string representing an array of hexadecimal bytes
(for example "[de ad be ef]") and parses that string back when reading.
  • Loading branch information
msfroh authored and jpountz committed Jan 12, 2024
1 parent 7ad2507 commit 6e1ce4c
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ public synchronized BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOExce
IntFunction<BytesRef> values =
new IntFunction<BytesRef>() {
final BytesRefBuilder term = new BytesRefBuilder();
final BytesRefBuilder termByteArray = new BytesRefBuilder();

@Override
public BytesRef apply(int docID) {
Expand Down Expand Up @@ -329,9 +330,10 @@ public BytesRef apply(int docID) {
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
termByteArray.grow(len);
termByteArray.setLength(len);
in.readBytes(termByteArray.bytes(), 0, len);
term.copyBytes(SimpleTextUtil.fromBytesRefString(termByteArray.get().utf8ToString()));
return term.get();
} catch (IOException ioe) {
throw new RuntimeException(ioe);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ private void doAddBinaryField(FieldInfo field, DocValuesProducer valuesProducer)
int maxLength = 0;
BinaryDocValues values = valuesProducer.getBinary(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
maxLength = Math.max(maxLength, values.binaryValue().length);
maxLength = Math.max(maxLength, values.binaryValue().toString().length());
}
writeFieldEntry(field, DocValuesType.BINARY);

Expand Down Expand Up @@ -197,25 +197,24 @@ private void doAddBinaryField(FieldInfo field, DocValuesProducer valuesProducer)
values.nextDoc();
assert values.docID() >= i;
}
String stringVal = values.docID() == i ? values.binaryValue().toString() : null;
// write length
final int length = values.docID() != i ? 0 : values.binaryValue().length;
final int length = stringVal == null ? 0 : stringVal.length();
SimpleTextUtil.write(data, LENGTH);
SimpleTextUtil.write(data, encoder.format(length), scratch);
SimpleTextUtil.writeNewline(data);

// write bytes -- don't use SimpleText.write
// because it escapes:
if (values.docID() == i) {
BytesRef value = values.binaryValue();
data.writeBytes(value.bytes, value.offset, value.length);
// write bytes as hex array
if (stringVal != null) {
SimpleTextUtil.write(data, stringVal, scratch);
}

// pad to fit
for (int j = length; j < maxLength; j++) {
data.writeByte((byte) ' ');
}
SimpleTextUtil.writeNewline(data);
if (values.docID() != i) {
if (stringVal == null) {
SimpleTextUtil.write(data, "F", scratch);
} else {
SimpleTextUtil.write(data, "T", scratch);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ public static BytesRef fromBytesRefString(String s) {
throw new IllegalArgumentException(
"string " + s + " was not created from BytesRef.toString?");
}
if (s.length() == 2) {
return new BytesRef(BytesRef.EMPTY_BYTES);
}
String[] parts = s.substring(1, s.length() - 1).split(" ");
byte[] bytes = new byte[parts.length];
for (int i = 0; i < parts.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,23 @@
*/
package org.apache.lucene.codecs.simpletext;

import java.io.IOException;
import java.util.List;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.BytesRef;

/** Tests SimpleTextDocValuesFormat */
public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
Expand All @@ -27,4 +42,37 @@ public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
protected Codec getCodec() {
return codec;
}

/**
 * Indexes 100 documents that each carry a sorted, a sorted-set, a numeric and a random binary
 * doc values field, then reads every directory file whose name ends in {@code "dat"} and checks
 * that its contents render differently through {@link BytesRef#toString()} and {@link
 * Term#toString(BytesRef)}.
 *
 * <p>NOTE(review): the check relies on {@code Term.toString(BytesRef)} returning the decoded
 * text for valid UTF-8 input and falling back to the hex form otherwise -- confirm against the
 * {@code Term} Javadoc.
 *
 * @throws IOException if the directory or one of its files cannot be read
 */
public void testFileIsUTF8() throws IOException {
  try (Directory dir = newDirectory()) {
    IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
    try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, config)) {
      for (int docsLeft = 100; docsLeft > 0; docsLeft--) {
        // Fields are built one at a time, in a fixed order, so the draws from random() happen
        // in a predictable sequence.
        SortedDocValuesField sortedField =
            new SortedDocValuesField(
                "sortedVal", newBytesRef(TestUtil.randomSimpleString(random())));
        SortedSetDocValuesField sortedSetField =
            new SortedSetDocValuesField(
                "sortedSetVal", newBytesRef(TestUtil.randomSimpleString(random())));
        NumericDocValuesField numericField =
            new NumericDocValuesField("numberVal", random().nextLong());
        BinaryDocValuesField binaryField =
            new BinaryDocValuesField("binaryVal", TestUtil.randomBinaryTerm(random()));
        indexWriter.addDocument(List.of(sortedField, sortedSetField, numericField, binaryField));
      }
    }

    for (String fileName : dir.listAll()) {
      // Only the files whose names end in "dat" are inspected.
      if (!fileName.endsWith("dat")) {
        continue;
      }
      try (IndexInput in = dir.openChecksumInput(fileName, IOContext.READONCE)) {
        long fileLength = in.length();
        if (fileLength > 20_000) {
          // Avoid allocating a huge array if the length is wrong
          fail("Doc values should not be this large");
        }
        byte[] contents = new byte[(int) fileLength];
        in.readBytes(contents, 0, contents.length);

        // Compare the two renderings of the whole file.
        BytesRef wholeFile = new BytesRef(contents);
        String hexForm = wholeFile.toString();
        String termForm = Term.toString(wholeFile);
        assertNotEquals(hexForm, termForm);
      }
    }
  }
}
}

0 comments on commit 6e1ce4c

Please sign in to comment.