Output binary doc values as hex array in SimpleTextCodec (#12987)

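Binary doc values were previously written to the SimpleTextCodec data file as raw bytes, even though those bytes may not be valid UTF-8 (i.e. they may not be "text"). This change instead writes each value as the string form of its byte array (hexadecimal bytes, as produced by BytesRef.toString()), and the reader decodes that string back into bytes with SimpleTextUtil.fromBytesRefString().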
apache · Jan 12, 2024 · 6e1ce4c · 6e1ce4c
1 parent 7ad2507
commit 6e1ce4c
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 11 deletions.
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
@@ -302,6 +302,7 @@ public synchronized BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOExce
     IntFunction<BytesRef> values =
         new IntFunction<BytesRef>() {
           final BytesRefBuilder term = new BytesRefBuilder();
+          final BytesRefBuilder termByteArray = new BytesRefBuilder();
 
           @Override
           public BytesRef apply(int docID) {
@@ -329,9 +330,10 @@ public BytesRef apply(int docID) {
               } catch (ParseException pe) {
                 throw new CorruptIndexException("failed to parse int length", in, pe);
               }
-              term.grow(len);
-              term.setLength(len);
-              in.readBytes(term.bytes(), 0, len);
+              termByteArray.grow(len);
+              termByteArray.setLength(len);
+              in.readBytes(termByteArray.bytes(), 0, len);
+              term.copyBytes(SimpleTextUtil.fromBytesRefString(termByteArray.get().utf8ToString()));
               return term.get();
             } catch (IOException ioe) {
               throw new RuntimeException(ioe);

diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
@@ -169,7 +169,7 @@ private void doAddBinaryField(FieldInfo field, DocValuesProducer valuesProducer)
     int maxLength = 0;
     BinaryDocValues values = valuesProducer.getBinary(field);
     for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
-      maxLength = Math.max(maxLength, values.binaryValue().length);
+      maxLength = Math.max(maxLength, values.binaryValue().toString().length());
     }
     writeFieldEntry(field, DocValuesType.BINARY);
 
@@ -197,25 +197,24 @@ private void doAddBinaryField(FieldInfo field, DocValuesProducer valuesProducer)
         values.nextDoc();
         assert values.docID() >= i;
       }
+      String stringVal = values.docID() == i ? values.binaryValue().toString() : null;
       // write length
-      final int length = values.docID() != i ? 0 : values.binaryValue().length;
+      final int length = stringVal == null ? 0 : stringVal.length();
       SimpleTextUtil.write(data, LENGTH);
       SimpleTextUtil.write(data, encoder.format(length), scratch);
       SimpleTextUtil.writeNewline(data);
 
-      // write bytes -- don't use SimpleText.write
-      // because it escapes:
-      if (values.docID() == i) {
-        BytesRef value = values.binaryValue();
-        data.writeBytes(value.bytes, value.offset, value.length);
+      // write bytes as hex array
+      if (stringVal != null) {
+        SimpleTextUtil.write(data, stringVal, scratch);
       }
 
       // pad to fit
       for (int j = length; j < maxLength; j++) {
         data.writeByte((byte) ' ');
       }
       SimpleTextUtil.writeNewline(data);
-      if (values.docID() != i) {
+      if (stringVal == null) {
         SimpleTextUtil.write(data, "F", scratch);
       } else {
         SimpleTextUtil.write(data, "T", scratch);

diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextUtil.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextUtil.java
@@ -111,6 +111,9 @@ public static BytesRef fromBytesRefString(String s) {
       throw new IllegalArgumentException(
           "string " + s + " was not created from BytesRef.toString?");
     }
+    if (s.length() == 2) {
+      return new BytesRef(BytesRef.EMPTY_BYTES);
+    }
     String[] parts = s.substring(1, s.length() - 1).split(" ");
     byte[] bytes = new byte[parts.length];
     for (int i = 0; i < parts.length; i++) {

diff --git a/...ne/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java b/...ne/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java
@@ -16,8 +16,23 @@
  */
 package org.apache.lucene.codecs.simpletext;
 
+import java.io.IOException;
+import java.util.List;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.index.BaseDocValuesFormatTestCase;
+import org.apache.lucene.tests.index.RandomIndexWriter;
+import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.BytesRef;
 
 /** Tests SimpleTextDocValuesFormat */
 public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
@@ -27,4 +42,37 @@ public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase {
   protected Codec getCodec() {
     return codec;
   }
+
+  public void testFileIsUTF8() throws IOException {
+    try (Directory dir = newDirectory()) {
+      IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+      try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf)) {
+        for (int i = 0; i < 100; i++) {
+          writer.addDocument(
+              List.of(
+                  new SortedDocValuesField(
+                      "sortedVal", newBytesRef(TestUtil.randomSimpleString(random()))),
+                  new SortedSetDocValuesField(
+                      "sortedSetVal", newBytesRef(TestUtil.randomSimpleString(random()))),
+                  new NumericDocValuesField("numberVal", random().nextLong()),
+                  new BinaryDocValuesField("binaryVal", TestUtil.randomBinaryTerm(random()))));
+        }
+      }
+      for (String file : dir.listAll()) {
+        if (file.endsWith("dat")) {
+          try (IndexInput input = dir.openChecksumInput(file, IOContext.READONCE)) {
+            long length = input.length();
+            if (length > 20_000) {
+              // Avoid allocating a huge array if the length is wrong
+              fail("Doc values should not be this large");
+            }
+            byte[] bytes = new byte[(int) length];
+            input.readBytes(bytes, 0, bytes.length);
+            BytesRef bytesRef = new BytesRef(bytes);
+            assertNotEquals(bytesRef.toString(), Term.toString(bytesRef));
+          }
+        }
+      }
+    }
+  }
 }