From b50f8f7fd67d2cd5ae9595fe607c06510fbdef8b Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Thu, 3 Apr 2025 07:52:04 -0400 Subject: [PATCH 1/8] Disable HNSW connectedComponents (#14214) --- .../java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java index 56f7bc62e448..04bbc7886f24 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java @@ -440,10 +440,13 @@ private static int getRandomGraphLevel(double ml, SplittableRandom random) { void finish() throws IOException { // System.out.println("finish " + frozen); - connectComponents(); + // TODO: Connect components can be exceptionally expensive, disabling + // see: https://github.com/apache/lucene/issues/14214 + // connectComponents(); frozen = true; } + @SuppressWarnings("unused") private void connectComponents() throws IOException { long start = System.nanoTime(); for (int level = 0; level < hnsw.numLevels(); level++) { From d85eb8994f27e424bfbef1e886be5a28df4c1cfc Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Fri, 18 Apr 2025 10:16:07 -0400 Subject: [PATCH 2/8] Addressing potential past iteration of intoBitSet --- .../apache/lucene/search/DenseConjunctionBulkScorer.java | 7 +++++-- .../java/org/apache/lucene/search/DocIdSetIterator.java | 3 ++- .../lucene/search/comparators/TermOrdValComparator.java | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java index 57ad593916cb..6cd86fd5c93b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java @@ -236,10 +236,13 @@ private void scoreWindowUsingBitSet( windowMatches.set(0, windowMax - windowBase); } else { DocIdSetIterator lead = iterators.get(0); + int intoBitSetBase = windowBase; if (lead.docID() < windowBase) { - lead.advance(windowBase); + intoBitSetBase = lead.advance(windowBase); + } + if (intoBitSetBase < windowMax) { + lead.intoBitSet(windowMax, windowMatches, intoBitSetBase); } - lead.intoBitSet(windowMax, windowMatches, windowBase); } if (acceptDocs != null) { diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java index abfc985b6884..559196bf8b52 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -263,7 +263,8 @@ protected final int slowAdvance(int target) throws IOException { * @lucene.internal */ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { - assert offset <= docID(); + assert offset <= docID() && offset <= upTo : + "offset=" + offset + " docID()=" + docID() + " upTo=" + upTo; for (int doc = docID(); doc < upTo; doc = nextDoc()) { bitSet.set(doc - offset); } diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index 2cff1fa1037d..cd56973c6bbc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -529,6 +529,7 @@ public int advance(int target) throws IOException { @Override public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + upTo = Math.min(upTo, maxDoc); if (upTo <= doc) { return; } @@ -539,7 +540,6 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept docsWithField.intoBitSet(upTo, bitSet, offset); doc = docsWithField.docID(); } else { - upTo = Math.min(upTo, maxDoc); bitSet.set(doc - offset, upTo - offset); doc = upTo; } From e1bd32fc5e0410be64541b48c86bfecdd2e61397 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Fri, 18 Apr 2025 10:36:36 -0400 Subject: [PATCH 3/8] address two other strange places for intoBitSet, adding checks that are similar to other impls --- .../lucene/search/DisjunctionDISIApproximation.java | 8 +++++++- .../lucene/search/comparators/NumericComparator.java | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java index a408e8f9b0ca..d5699ab8cb1d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -146,7 +146,13 @@ public int advance(int target) throws IOException { @Override public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { while (leadTop.doc < upTo) { - leadTop.approximation.intoBitSet(upTo, bitSet, offset); + int intoBitSetBase = offset; + if (leadTop.approximation.docID() < offset) { + intoBitSetBase = leadTop.approximation.advance(offset); + } + if (intoBitSetBase < upTo) { + leadTop.approximation.intoBitSet(upTo, bitSet, intoBitSetBase); + } leadTop.doc = leadTop.approximation.docID(); leadTop = leadIterators.updateTop(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java index a5d49a4b167a..089f1f237b8b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java @@ -434,6 +434,9 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept if (competitiveIterator.docID() < doc) { competitiveIterator.advance(doc); } + if (doc >= upTo) { + return; + } competitiveIterator.intoBitSet(upTo, bitSet, offset); doc = competitiveIterator.docID(); } From f692b63c43d9f49b6a4c8414600efa65bbde7dd8 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Fri, 18 Apr 2025 10:42:35 -0400 Subject: [PATCH 4/8] formatting fixes --- .../src/java/org/apache/lucene/search/DocIdSetIterator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java index dd3ba7264992..515d4ec0243f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -203,8 +203,8 @@ protected final int slowAdvance(int target) throws IOException { * @lucene.internal */ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { - assert offset <= docID() && offset <= upTo : - "offset=" + offset + " docID()=" + docID() + " upTo=" + upTo; + assert offset <= docID() && offset <= upTo + : "offset=" + offset + " docID()=" + docID() + " upTo=" + upTo; for (int doc = docID(); doc < upTo; doc = nextDoc()) { bitSet.set(doc - offset); } From c7d674f262bc06dee2737f6e5053c861690fe6ef Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Fri, 18 Apr 2025 13:41:35 -0400 Subject: [PATCH 5/8] reverting some changes --- .../apache/lucene/search/DenseConjunctionBulkScorer.java | 7 ++----- .../apache/lucene/search/DisjunctionDISIApproximation.java | 7 ++----- .../java/org/apache/lucene/search/DocIdSetIterator.java | 3 +-- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java index 6cd86fd5c93b..57ad593916cb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java @@ -236,13 +236,10 @@ private void scoreWindowUsingBitSet( windowMatches.set(0, windowMax - windowBase); } else { DocIdSetIterator lead = iterators.get(0); - int intoBitSetBase = windowBase; if (lead.docID() < windowBase) { - intoBitSetBase = lead.advance(windowBase); - } - if (intoBitSetBase < windowMax) { - lead.intoBitSet(windowMax, windowMatches, intoBitSetBase); + lead.advance(windowBase); } + lead.intoBitSet(windowMax, windowMatches, windowBase); } if (acceptDocs != null) { diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java index d5699ab8cb1d..39944df0bb2d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -146,13 +146,10 @@ public int advance(int target) throws IOException { @Override public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { while (leadTop.doc < upTo) { - int intoBitSetBase = offset; if (leadTop.approximation.docID() < offset) { - intoBitSetBase = leadTop.approximation.advance(offset); - } - if (intoBitSetBase < upTo) { - leadTop.approximation.intoBitSet(upTo, bitSet, intoBitSetBase); + leadTop.approximation.advance(offset); } + leadTop.approximation.intoBitSet(upTo, bitSet, offset); leadTop.doc = leadTop.approximation.docID(); leadTop = leadIterators.updateTop(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java index 515d4ec0243f..cfed223d165b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -203,8 +203,7 @@ protected final int slowAdvance(int target) throws IOException { * @lucene.internal */ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { - assert offset <= docID() && offset <= upTo - : "offset=" + offset + " docID()=" + docID() + " upTo=" + upTo; + assert offset <= docID() : "offset=" + offset + " docID()=" + docID() + " upTo=" + upTo; for (int doc = docID(); doc < upTo; doc = nextDoc()) { bitSet.set(doc - offset); } From 8102d8907e0c8137c259de26abf52e97c863c188 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Fri, 18 Apr 2025 15:55:06 -0400 Subject: [PATCH 6/8] always iterate docsWithField --- .../lucene/search/comparators/TermOrdValComparator.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index f09112a6f521..bc1a1d0668d3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -532,6 +532,10 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept // hasn't nailed down a disjunction of competitive terms yet. if (disjunction == null) { if (docsWithField != null) { + // Paranoid check, we need to be absolutely sure that the iterator is at least at offset + if (docsWithField.docID() < offset) { + docsWithField.advance(offset); + } docsWithField.intoBitSet(upTo, bitSet, offset); doc = docsWithField.docID(); } else { From f6ce8df752c79bfe08a31c47b59b55e72e0da2da Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Sat, 19 Apr 2025 09:16:02 -0400 Subject: [PATCH 7/8] removing unnecessary changes, adding test --- lucene/CHANGES.txt | 5 +- .../search/DisjunctionDISIApproximation.java | 3 - .../search/comparators/NumericComparator.java | 3 - .../comparators/TermOrdValComparator.java | 2 +- .../comparators/TestTermOrdValComparator.java | 87 +++++++++++++++++++ 5 files changed, 92 insertions(+), 8 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 309549a55b2c..f0e395ca45d5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -103,7 +103,10 @@ Bug Fixes --------------------- * GITHUB#14522: Fix DISIDocIdStream::count so that it does not try to count beyond max. - (Chris Hegarty} + (Chris Hegarty) + +* GITHUB#14523: Correct TermOrdValComparator competitive iterator so that it forces sparse + field iteration to be at least scoring window baseline when doing intoBitSet. (Ben Trent, Adrien Grand) ======================= Lucene 10.2.0 ======================= diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java index 39944df0bb2d..a408e8f9b0ca 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -146,9 +146,6 @@ public int advance(int target) throws IOException { @Override public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { while (leadTop.doc < upTo) { - if (leadTop.approximation.docID() < offset) { - leadTop.approximation.advance(offset); - } leadTop.approximation.intoBitSet(upTo, bitSet, offset); leadTop.doc = leadTop.approximation.docID(); leadTop = leadIterators.updateTop(); diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java index 089f1f237b8b..a5d49a4b167a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java @@ -434,9 +434,6 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept if (competitiveIterator.docID() < doc) { competitiveIterator.advance(doc); } - if (doc >= upTo) { - return; - } competitiveIterator.intoBitSet(upTo, bitSet, offset); doc = competitiveIterator.docID(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index bc1a1d0668d3..8683aee3c76e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -532,7 +532,7 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept // hasn't nailed down a disjunction of competitive terms yet. if (disjunction == null) { if (docsWithField != null) { - // Paranoid check, we need to be absolutely sure that the iterator is at least at offset + // we need to be absolutely sure that the iterator is at least at offset if (docsWithField.docID() < offset) { docsWithField.advance(offset); } diff --git a/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java b/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java new file mode 100644 index 000000000000..fd0354a4b4dc --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.comparators; + +import java.io.IOException; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KeywordField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BulkScorer; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedSetSelector; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopFieldCollectorManager; +import org.apache.lucene.search.Weight; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestTermOrdValComparator extends LuceneTestCase { + + public void testIntoBitSetBugIssue14517() throws IOException { + final int maxDoc = 5_000; + try (Directory dir = new ByteBuffersDirectory()) { + try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) { + // high max doc to have a high number of unique values so that the competitive iterator is + // initialized with `docsWithField` rather than specific (< 1024) terms + for (int i = 0; i < maxDoc; ++i) { + Document doc = new Document(); + // We need the field to be sparse, so that the competitive iterator is initialized with + // `docsWithField` + if (i % 2 == 0) { + doc.add(new StringField("field", "value", Field.Store.NO)); + doc.add(new KeywordField("sort", Integer.toString(i), Field.Store.NO)); + } + w.addDocument(doc); + } + w.forceMerge(1); + } + try (DirectoryReader reader = DirectoryReader.open(dir)) { + LeafReaderContext context = reader.leaves().get(0); + IndexSearcher searcher = new IndexSearcher(reader); + Query query = new TermQuery(new Term("field", "value")); + Weight weight = + searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, RANDOM_MULTIPLIER); + SortField sortField = KeywordField.newSortField("sort", false, SortedSetSelector.Type.MIN); + sortField.setMissingValue(SortField.STRING_LAST); + Sort sort = new Sort(sortField); + Collector collector = new TopFieldCollectorManager(sort, 10, 10).newCollector(); + LeafCollector leafCollector = collector.getLeafCollector(context); + BulkScorer bulkScorer = weight.bulkScorer(context); + // We need to split on this specific doc ID so that the current doc of the competitive + // iterator + // and the current doc of `docsWithField` are out of sync, because the competitive iterator + // was + // just updated. + bulkScorer.score(leafCollector, null, 0, 22); + bulkScorer.score(leafCollector, null, 22, maxDoc); + } + } + } +} From c39e7ae68a1794bd41608a1bc3c771153e65cff9 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Sat, 19 Apr 2025 12:07:42 -0400 Subject: [PATCH 8/8] formatting --- .../search/comparators/TestTermOrdValComparator.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java b/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java index fd0354a4b4dc..52738fc77c0b 100644 --- a/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java +++ b/lucene/core/src/test/org/apache/lucene/search/comparators/TestTermOrdValComparator.java @@ -52,8 +52,7 @@ public void testIntoBitSetBugIssue14517() throws IOException { // initialized with `docsWithField` rather than specific (< 1024) terms for (int i = 0; i < maxDoc; ++i) { Document doc = new Document(); - // We need the field to be sparse, so that the competitive iterator is initialized with - // `docsWithField` + // make the field to be sparse, so that the iterator is initialized with `docsWithField` if (i % 2 == 0) { doc.add(new StringField("field", "value", Field.Store.NO)); doc.add(new KeywordField("sort", Integer.toString(i), Field.Store.NO)); @@ -74,11 +73,9 @@ public void testIntoBitSetBugIssue14517() throws IOException { Collector collector = new TopFieldCollectorManager(sort, 10, 10).newCollector(); LeafCollector leafCollector = collector.getLeafCollector(context); BulkScorer bulkScorer = weight.bulkScorer(context); - // We need to split on this specific doc ID so that the current doc of the competitive - // iterator - // and the current doc of `docsWithField` are out of sync, because the competitive iterator - // was - // just updated. + // split on this specific doc ID so that the current doc of the competitive iterator + // and the current doc of `docsWithField` are out of sync, + // because the competitive iterator was just updated. bulkScorer.score(leafCollector, null, 0, 22); bulkScorer.score(leafCollector, null, 22, maxDoc); }