apache · benwtrent · Apr 19, 2025 · Apr 3, 2025 · Apr 18, 2025 · Apr 18, 2025
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -103,7 +103,10 @@ Bug Fixes
 ---------------------
 
 * GITHUB#14522: Fix DISIDocIdStream::count so that it does not try to count beyond max.
-  (Chris Hegarty}
+  (Chris Hegarty)
+
+* GITHUB#14523: Fix TermOrdValComparator's competitive iterator to advance the sparse field's
+  iterator to at least the scoring window's offset before intoBitSet. (Ben Trent, Adrien Grand)
 
 ======================= Lucene 10.2.0 =======================
 

diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java
@@ -203,7 +203,7 @@ protected final int slowAdvance(int target) throws IOException {
    * @lucene.internal
    */
   public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
-    assert offset <= docID();
+    assert offset <= docID() : "offset=" + offset + " docID()=" + docID() + " upTo=" + upTo;
     for (int doc = docID(); doc < upTo; doc = nextDoc()) {
       bitSet.set(doc - offset);
     }

diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
@@ -524,17 +524,21 @@ public int advance(int target) throws IOException {
 
     @Override
     public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
+      upTo = Math.min(upTo, maxDoc);
       if (upTo <= doc) {
         return;
       }
       // Optimize the case when intersecting the competitive iterator is expensive, which is when it
       // hasn't nailed down a disjunction of competitive terms yet.
       if (disjunction == null) {
         if (docsWithField != null) {
+          // intoBitSet asserts offset <= docID(), so catch docsWithField up to offset first
+          if (docsWithField.docID() < offset) {
+            docsWithField.advance(offset);
+          }
           docsWithField.intoBitSet(upTo, bitSet, offset);
           doc = docsWithField.docID();
         } else {
-          upTo = Math.min(upTo, maxDoc);
           bitSet.set(doc - offset, upTo - offset);
           doc = upTo;
         }

diff --git a/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java b/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.comparators;
+
+import java.io.IOException;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.KeywordField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BulkScorer;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.LeafCollector;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedSetSelector;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopFieldCollectorManager;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.store.ByteBuffersDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+public class TestTermOrdValComparator extends LuceneTestCase {
+  // Regression test (issue 14517): sorts one sparse-field segment in two scoring windows.
+  public void testIntoBitSetBugIssue14517() throws IOException {
+    final int maxDoc = 5_000;
+    try (Directory dir = new ByteBuffersDirectory()) {
+      try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
+        // index many docs with unique "sort" values so that the competitive iterator is
+        // initialized with `docsWithField` rather than specific (< 1024) terms
+        for (int i = 0; i < maxDoc; ++i) {
+          Document doc = new Document();
+          // only even docs get the fields: sparse, so the iterator starts as `docsWithField`
+          if (i % 2 == 0) {
+            doc.add(new StringField("field", "value", Field.Store.NO));
+            doc.add(new KeywordField("sort", Integer.toString(i), Field.Store.NO));
+          }
+          w.addDocument(doc);
+        }
+        w.forceMerge(1); // merge down to one segment; only leaf 0 is scored below
+      }
+      try (DirectoryReader reader = DirectoryReader.open(dir)) {
+        LeafReaderContext context = reader.leaves().get(0);
+        IndexSearcher searcher = new IndexSearcher(reader);
+        Query query = new TermQuery(new Term("field", "value"));
+        Weight weight =
+            searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, RANDOM_MULTIPLIER);
+        SortField sortField = KeywordField.newSortField("sort", false, SortedSetSelector.Type.MIN);
+        sortField.setMissingValue(SortField.STRING_LAST);
+        Sort sort = new Sort(sortField);
+        Collector collector = new TopFieldCollectorManager(sort, 10, 10).newCollector();
+        LeafCollector leafCollector = collector.getLeafCollector(context);
+        BulkScorer bulkScorer = weight.bulkScorer(context);
+        // score two windows, [0, 22) and [22, maxDoc): splitting at this specific doc ID leaves
+        // the current doc of the competitive iterator and that of `docsWithField` out of sync,
+        // because the competitive iterator was just updated.
+        bulkScorer.score(leafCollector, null, 0, 22);
+        bulkScorer.score(leafCollector, null, 22, maxDoc); // no asserts: must simply not fail
+      }
+    }
+  }
+}