
Fix testIndexHasDuplicateData tests (#49786)

testIndexHasDuplicateData tests were failing occasionally
due to the approximate calculation in BKDReader.estimatePointCount,
where, if a node is a leaf, the number of points in it
is estimated as (maxPointsInLeafNode + 1) / 2.
As DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024, for the small indexes
used in tests the estimation could be really off.
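
To see the scale of the error, here is a minimal back-of-the-envelope sketch (plain Java, not Lucene code; the 1024 default and the 7000-doc index size come from this change and the old test):

```java
// Hedged sketch: why a per-leaf estimate of (maxPointsInLeafNode + 1) / 2
// can swamp a small test index. Illustrative arithmetic only, not Lucene code.
public class LeafEstimateSketch {
    public static void main(String[] args) {
        int maxPointsInLeafNode = 1024;                       // DEFAULT_MAX_POINTS_IN_LEAF_NODE
        long perLeafEstimate = (maxPointsInLeafNode + 1) / 2; // ~512 points per partially covered leaf
        int docsCount = 7000;                                 // index size used by the old test
        long duplThreshold = docsCount / 2;                   // "more than 50% duplicates" cutoff
        // A handful of partially covered leaves shifts the estimate by a large
        // fraction of the threshold, so the duplicate check could flip either way.
        System.out.println("per-leaf estimate = " + perLeafEstimate
                + ", 50% threshold = " + duplThreshold);
    }
}
```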

This rewrites the tests to make the max points in a leaf node
a small value, so that the tests control the estimation.
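
With maxPointsInLeafNode set to 40 in the new tests, the per-leaf estimate from the formula above is at most (40 + 1) / 2 = 20 points, so with 5000 docs and duplicate ratios of 0.7 and 0.3 the approximate counts should stay well clear of the 50% threshold.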

Closes #49703
Mayya Sharipova, 5 years ago
Commit 2ec1f6b4c4

+ 21 - 18
server/src/main/java/org/elasticsearch/search/query/QueryPhase.java

@@ -579,7 +579,7 @@ public class QueryPhase implements SearchPhase {
      * Returns true if more than 50% of data in the index have the same value
      * The evaluation is approximation based on finding the median value and estimating its count
      */
-    static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
+    private static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
         long docsNoDupl = 0; // number of docs in segments with NO duplicate data that would benefit optimization
         long docsDupl = 0; // number of docs in segments with duplicate data that would NOT benefit optimization
         for (LeafReaderContext lrc : reader.leaves()) {
@@ -590,24 +590,8 @@ public class QueryPhase implements SearchPhase {
                 continue;
             }
             assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
-
             int duplDocCount = docCount/2; // expected doc count of duplicate data
-            long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
-            long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
-            boolean hasDuplicateData = true;
-            while ((minValue < maxValue) && hasDuplicateData) {
-                long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
-                long countLeft = estimatePointCount(pointValues, minValue, midValue);
-                long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
-                if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
-                    maxValue = midValue;
-                } else if ((countRight > countLeft) && (countRight > duplDocCount)) {
-                    minValue = midValue + 1;
-                } else {
-                    hasDuplicateData = false;
-                }
-            }
-            if (hasDuplicateData) {
+            if (pointsHaveDuplicateData(pointValues, duplDocCount)) {
                 docsDupl += docCount;
             } else {
                 docsNoDupl += docCount;
@@ -616,6 +600,25 @@ public class QueryPhase implements SearchPhase {
         return (docsDupl > docsNoDupl);
     }
 
+    static boolean pointsHaveDuplicateData(PointValues pointValues, int duplDocCount) throws IOException {
+        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
+        boolean hasDuplicateData = true;
+        while ((minValue < maxValue) && hasDuplicateData) {
+            long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
+            long countLeft = estimatePointCount(pointValues, minValue, midValue);
+            long countRight = estimatePointCount(pointValues, midValue + 1, maxValue);
+            if ((countLeft >= countRight) && (countLeft > duplDocCount) ) {
+                maxValue = midValue;
+            } else if ((countRight > countLeft) && (countRight > duplDocCount)) {
+                minValue = midValue + 1;
+            } else {
+                hasDuplicateData = false;
+            }
+        }
+        return hasDuplicateData;
+    }
+
 
     private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
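
The extracted pointsHaveDuplicateData bisects the value range, keeping whichever half the estimated majority could lie in. Below is a minimal sketch of the same idea over a plain sorted long[], with exact counts instead of BKDReader.estimatePointCount (the names haveDuplicateData and countInRange are illustrative, not part of the change):

```java
// Hedged sketch of the range-bisection majority check used by pointsHaveDuplicateData,
// applied to a sorted array where counts are exact rather than estimated.
static boolean haveDuplicateData(long[] sortedValues, long duplDocCount) {
    long minValue = sortedValues[0];
    long maxValue = sortedValues[sortedValues.length - 1];
    while (minValue < maxValue) {
        // divide each bound by 2 first to avoid overflow, as in the original code
        long midValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2);
        long countLeft = countInRange(sortedValues, minValue, midValue);
        long countRight = countInRange(sortedValues, midValue + 1, maxValue);
        if (countLeft >= countRight && countLeft > duplDocCount) {
            maxValue = midValue;          // a majority value can only be in the left half
        } else if (countRight > countLeft && countRight > duplDocCount) {
            minValue = midValue + 1;      // a majority value can only be in the right half
        } else {
            return false;                 // neither half can hold more than duplDocCount points
        }
    }
    return true; // range collapsed to a single value shared by more than duplDocCount docs
}

static long countInRange(long[] sortedValues, long from, long to) {
    long count = 0;
    for (long v : sortedValues) {
        if (v >= from && v <= to) {
            count++;
        }
    }
    return count;
}
```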

+ 53 - 23
server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java

@@ -69,8 +69,13 @@ import org.apache.lucene.search.join.ScoreMode;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchShardTask;
 import org.elasticsearch.index.mapper.DateFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
@@ -94,7 +99,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 
-import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+import static org.elasticsearch.search.query.QueryPhase.pointsHaveDuplicateData;
 import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
@@ -701,31 +706,56 @@ public class QueryPhaseTests extends IndexShardTestCase {
         dir.close();
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/49703")
     public void testIndexHasDuplicateData() throws IOException {
-        int docsCount = 7000;
-        int duplIndex = docsCount * 7 / 10;
-        int duplIndex2 = docsCount * 3 / 10;
+        int docsCount = 5000;
+        int maxPointsInLeafNode = 40;
+        float duplicateRatio = 0.7f;
         long duplicateValue = randomLongBetween(-10000000L, 10000000L);
-        Directory dir = newDirectory();
-        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
-        for (int docId = 0; docId < docsCount; docId++) {
-            Document doc = new Document();
-            long rndValue = randomLongBetween(-10000000L, 10000000L);
-            long value = (docId < duplIndex) ? duplicateValue : rndValue;
-            long value2 = (docId < duplIndex2) ? duplicateValue : rndValue;
-            doc.add(new LongPoint("duplicateField", value));
-            doc.add(new LongPoint("notDuplicateField", value2));
-            writer.addDocument(doc);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < docsCount; docId++) {
+                long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                assertTrue(pointsHaveDuplicateData(r, r.getDocCount()/2));
+            }
+        }
+    }
+
+    public void testIndexHasNoDuplicateData() throws IOException {
+        int docsCount = 5000;
+        int maxPointsInLeafNode = 40;
+        float duplicateRatio = 0.3f;
+        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
+
+        try (Directory dir = newDirectory()) {
+            BKDWriter w = new BKDWriter(docsCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, docsCount);
+            byte[] longBytes = new byte[8];
+            for (int docId = 0; docId < docsCount; docId++) {
+                long value = randomFloat() < duplicateRatio ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+                LongPoint.encodeDimension(value, longBytes, 0);
+                w.add(longBytes, docId);
+            }
+            long indexFP;
+            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+                indexFP = w.finish(out);
+            }
+            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+                in.seek(indexFP);
+                BKDReader r = new BKDReader(in);
+                assertFalse(pointsHaveDuplicateData(r, r.getDocCount()/2));
+            }
         }
-        writer.close();
-        final IndexReader reader = DirectoryReader.open(dir);
-        boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
-        boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
-        reader.close();
-        dir.close();
-        assertTrue(hasDuplicateData);
-        assertFalse(hasDuplicateData2);
     }
 
     public void testMaxScoreQueryVisitor() {