@@ -0,0 +1,232 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.search.suggest.completion;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.document.TopSuggestDocs;
+import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
+import org.apache.lucene.util.PriorityQueue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ *
+ * Custom {@link TopSuggestDocsCollector} that returns top documents from the completion suggester.
+ * <p>
+ * TODO: this should be refactored when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.
+ * Unlike the parent class, this collector uses the surface form to tie-break suggestions with identical
+ * scores.
+ * <p>
+ * This collector groups suggestions coming from the same document but matching different contexts
+ * or surface forms together. When different contexts or surface forms match the same suggestion form,
+ * only the best one per document (sorted by weight) is kept.
+ * <p>
+ * This collector is also able to filter duplicate suggestions coming from different documents.
+ * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
+ * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
+ * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
+ * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts, this collector
+ * will not be able to return more than one suggestion even when N is greater than 1.
+ **/
+class TopSuggestGroupDocsCollector extends TopSuggestDocsCollector {
+    private final class SuggestScoreDocPriorityQueue extends PriorityQueue<TopSuggestDocs.SuggestScoreDoc> {
+        /**
+         * Creates a new priority queue of the specified size.
+         */
+        private SuggestScoreDocPriorityQueue(int size) {
+            super(size);
+        }
+
+        @Override
+        protected boolean lessThan(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) {
+            if (a.score == b.score) {
+                // tie break by completion key
+                int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                // prefer smaller doc id, in case of a tie
+                return cmp != 0 ? cmp > 0 : a.doc > b.doc;
+            }
+            return a.score < b.score;
+        }
+
+        /**
+         * Returns the top N results in descending order.
+         */
+        public TopSuggestDocs.SuggestScoreDoc[] getResults() {
+            int size = size();
+            TopSuggestDocs.SuggestScoreDoc[] res = new TopSuggestDocs.SuggestScoreDoc[size];
+            for (int i = size - 1; i >= 0; i--) {
+                res[i] = pop();
+            }
+            return res;
+        }
+    }
+
+
+    private final SuggestScoreDocPriorityQueue priorityQueue;
+    private final int num;
+
+    /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
+    private final List<TopSuggestDocs.SuggestScoreDoc> pendingResults;
+
+    /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
+    final CharArraySet seenSurfaceForms;
+
+    /** Document base offset for the current Leaf */
+    protected int docBase;
+
+    private Map<Integer, List<CharSequence>> docContexts = new HashMap<>();
+
+    /**
+     * Sole constructor
+     *
+     * Collects at most <code>num</code> completions
+     * with corresponding document and weight
+     */
+    TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) {
+        super(1, skipDuplicates);
+        if (num <= 0) {
+            throw new IllegalArgumentException("'num' must be > 0");
+        }
+        this.num = num;
+        this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
+        if (skipDuplicates) {
+            seenSurfaceForms = new CharArraySet(num, false);
+            pendingResults = new ArrayList<>();
+        } else {
+            seenSurfaceForms = null;
+            pendingResults = null;
+        }
+    }
+
+    /**
+     * Returns the contexts associated with the provided <code>doc</code>.
+     */
+    public List<CharSequence> getContexts(int doc) {
+        return docContexts.getOrDefault(doc, Collections.emptyList());
+    }
+
+    @Override
+    protected boolean doSkipDuplicates() {
+        return seenSurfaceForms != null;
+    }
+
+    @Override
+    public int getCountToCollect() {
+        return num;
+    }
+
+    @Override
+    protected void doSetNextReader(LeafReaderContext context) throws IOException {
+        docBase = context.docBase;
+        if (seenSurfaceForms != null) {
+            seenSurfaceForms.clear();
+            // NOTE: this also clears the priorityQueue:
+            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
+                pendingResults.add(hit);
+            }
+        }
+    }
+
+    @Override
+    public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
+        int globalDoc = docID + docBase;
+        boolean isDuplicate = docContexts.containsKey(globalDoc);
+        List<CharSequence> contexts = docContexts.computeIfAbsent(globalDoc, k -> new ArrayList<>());
+        if (context != null) {
+            contexts.add(context);
+        }
+        if (isDuplicate) {
+            return;
+        }
+        TopSuggestDocs.SuggestScoreDoc current = new TopSuggestDocs.SuggestScoreDoc(globalDoc, key, context, score);
+        if (current == priorityQueue.insertWithOverflow(current)) {
+            // if the current SuggestScoreDoc has overflowed the pq,
+            // we can assume all of the successive collections from
+            // this leaf will overflow as well
+            // TODO: reuse the overflow instance?
+            throw new CollectionTerminatedException();
+        }
+    }
+
+    @Override
+    public TopSuggestDocs get() throws IOException {
+
+        TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs;
+
+        if (seenSurfaceForms != null) {
+            // NOTE: this also clears the priorityQueue:
+            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
+                pendingResults.add(hit);
+            }
+
+            // Deduplicate all hits: we already dedup'd efficiently within each segment by
+            // truncating the FST top paths search, but across segments there may still be dups:
+            seenSurfaceForms.clear();
+
+            // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
+            // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:
+
+            Collections.sort(pendingResults,
+                (a, b) -> {
+                    // sort by higher score
+                    int cmp = Float.compare(b.score, a.score);
+                    if (cmp == 0) {
+                        // tie break by completion key
+                        cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                        if (cmp == 0) {
+                            // prefer smaller doc id, in case of a tie
+                            cmp = Integer.compare(a.doc, b.doc);
+                        }
+                    }
+                    return cmp;
+                });
+
+            List<TopSuggestDocs.SuggestScoreDoc> hits = new ArrayList<>();
+
+            for (TopSuggestDocs.SuggestScoreDoc hit : pendingResults) {
+                if (seenSurfaceForms.contains(hit.key) == false) {
+                    seenSurfaceForms.add(hit.key);
+                    hits.add(hit);
+                    if (hits.size() == num) {
+                        break;
+                    }
+                }
+            }
+            suggestScoreDocs = hits.toArray(new TopSuggestDocs.SuggestScoreDoc[0]);
+        } else {
+            suggestScoreDocs = priorityQueue.getResults();
+        }
+
+        if (suggestScoreDocs.length > 0) {
+            return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
+        } else {
+            return TopSuggestDocs.EMPTY;
+        }
+    }
+
+}
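
For context, here is a minimal sketch of how this collector might be driven. It is not part of the change: the field name "suggest", the analyzer choice, the prefix "sea", and the `printTopSuggestions` helper are assumptions for illustration. `SuggestIndexSearcher`, `CompletionQuery`, `PrefixCompletionQuery`, `Term`, `IndexReader` and `StandardAnalyzer` are standard Lucene classes; since the collector is package-private, a caller like this would have to live in `org.elasticsearch.search.suggest.completion`.

```java
// Illustrative sketch only. Assumes documents were indexed with a
// SuggestField/ContextSuggestField named "suggest", and assumes these imports:
// org.apache.lucene.search.suggest.document.{SuggestIndexSearcher, CompletionQuery, PrefixCompletionQuery},
// org.apache.lucene.index.{IndexReader, Term}, org.apache.lucene.analysis.standard.StandardAnalyzer.
static void printTopSuggestions(IndexReader reader) throws IOException {
    SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
    CompletionQuery query = new PrefixCompletionQuery(new StandardAnalyzer(), new Term("suggest", "sea"));

    // collect at most 5 suggestions, skipping duplicate surface forms
    TopSuggestGroupDocsCollector collector = new TopSuggestGroupDocsCollector(5, true);
    searcher.suggest(query, collector);

    TopSuggestDocs top = collector.get();
    for (TopSuggestDocs.SuggestScoreDoc hit : top.scoreLookupDocs()) {
        // hit.key is the surface form and hit.score its weight; contexts matched
        // for the same (global) doc id are grouped and exposed separately
        List<CharSequence> contexts = collector.getContexts(hit.doc);
        System.out.println(hit.key + " (" + hit.score + ") contexts=" + contexts);
    }
}
```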