Browse Source

Implement all queries on doc-values only keyword fields (#83404)

Adds doc-values-only search support for wildcard/regexp/prefix/fuzzy etc. queries on keyword fields.

Relates #81210 and #52728
Yannick Welsch 3 years ago
parent
commit
7f0595abe6

+ 5 - 0
docs/changelog/83404.yaml

@@ -0,0 +1,5 @@
+pr: 83404
+summary: Implement all queries on doc-values only keyword fields
+area: Mapping
+type: enhancement
+issues: []

+ 1 - 1
docs/reference/mapping/params/doc-values.asciidoc

@@ -19,7 +19,7 @@ with the __notable exception of `text` and `annotated_text` fields__.
 
 <<number,Numeric types>>, <<date,date types>>, the <<boolean,boolean type>>,
 <<ip,ip type>>, <<geo-point,geo_point type>> and the <<keyword,keyword type>>
-can also be queried using term or range-based queries
+can also be queried
 when they are not <<mapping-index,indexed>> but only have doc values enabled.
 Query performance on doc values is much slower than on index structures, but
 offers an interesting tradeoff between disk usage and query performance for

+ 1 - 2
docs/reference/mapping/types/keyword.asciidoc

@@ -82,8 +82,7 @@ The following parameters are accepted by `keyword` fields:
 
     Should the field be quickly searchable? Accepts `true` (default) and
      `false`. `keyword` fields that only have <<doc-values,`doc_values`>>
-     enabled can still be queried using term or range-based queries,
-     albeit slower.
+     enabled can still be queried, albeit slower.
 
 <<index-options,`index_options`>>::
 

+ 54 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/390_doc_values_search.yml

@@ -289,6 +289,60 @@ setup:
         body: { query: { range: { keyword: { gte: "key1" } } } }
   - length:   { hits.hits: 2  }
 
+---
+"Test fuzzy query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { fuzzy: { keyword: { value: "kay1", fuzziness: 1 } } } }
+  - length:   { hits.hits: 1  }
+
+---
+"Test prefix query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { prefix: { keyword: { value: "key" } } } }
+  - length:   { hits.hits: 2  }
+
+---
+"Test case insensitive term query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { term: { keyword: { value: "KeY1", case_insensitive: true } } } }
+  - length:   { hits.hits: 1  }
+
+---
+"Test wildcard query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { wildcard: { keyword: { value: "k*1" } } } }
+  - length:   { hits.hits: 1  }
+
+---
+"Test case insensitive wildcard query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { wildcard: { keyword: { value: "K*1", case_insensitive: true } } } }
+  - length:   { hits.hits: 1  }
+
+---
+"Test regexp query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { regexp: { keyword: { value: "k.*1" } } } }
+  - length:   { hits.hits: 1  }
+
 ---
 "Test match query on boolean field where only doc values are enabled":
 

+ 136 - 1
server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

@@ -31,8 +31,10 @@ import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
 import org.apache.lucene.util.automaton.MinimizationOperations;
 import org.apache.lucene.util.automaton.Operations;
 import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.search.AutomatonQueries;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.fielddata.FieldData;
@@ -44,9 +46,15 @@ import org.elasticsearch.script.Script;
 import org.elasticsearch.script.ScriptCompiler;
 import org.elasticsearch.script.StringFieldScript;
 import org.elasticsearch.script.field.KeywordDocValuesField;
+import org.elasticsearch.script.field.SortedSetDocValuesStringFieldScript;
 import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
 import org.elasticsearch.search.lookup.FieldValues;
 import org.elasticsearch.search.lookup.SearchLookup;
+import org.elasticsearch.search.runtime.StringScriptFieldFuzzyQuery;
+import org.elasticsearch.search.runtime.StringScriptFieldPrefixQuery;
+import org.elasticsearch.search.runtime.StringScriptFieldRegexpQuery;
+import org.elasticsearch.search.runtime.StringScriptFieldTermQuery;
+import org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery;
 import org.elasticsearch.xcontent.XContentParser;
 
 import java.io.IOException;
@@ -388,6 +396,68 @@ public final class KeywordFieldMapper extends FieldMapper {
             }
         }
 
+        @Override
+        public Query fuzzyQuery(
+            Object value,
+            Fuzziness fuzziness,
+            int prefixLength,
+            int maxExpansions,
+            boolean transpositions,
+            SearchExecutionContext context
+        ) {
+            failIfNotIndexedNorDocValuesFallback(context);
+            if (isIndexed()) {
+                return super.fuzzyQuery(value, fuzziness, prefixLength, maxExpansions, transpositions, context);
+            } else {
+                return StringScriptFieldFuzzyQuery.build(
+                    new Script(""),
+                    ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx),
+                    name(),
+                    indexedValueForSearch(value).utf8ToString(),
+                    fuzziness.asDistance(BytesRefs.toString(value)),
+                    prefixLength,
+                    transpositions
+                );
+            }
+        }
+
+        @Override
+        public Query prefixQuery(
+            String value,
+            MultiTermQuery.RewriteMethod method,
+            boolean caseInsensitive,
+            SearchExecutionContext context
+        ) {
+            failIfNotIndexedNorDocValuesFallback(context);
+            if (isIndexed()) {
+                return super.prefixQuery(value, method, caseInsensitive, context);
+            } else {
+                return new StringScriptFieldPrefixQuery(
+                    new Script(""),
+                    ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx),
+                    name(),
+                    indexedValueForSearch(value).utf8ToString(),
+                    caseInsensitive
+                );
+            }
+        }
+
+        @Override
+        public Query termQueryCaseInsensitive(Object value, SearchExecutionContext context) {
+            failIfNotIndexedNorDocValuesFallback(context);
+            if (isIndexed()) {
+                return super.termQueryCaseInsensitive(value, context);
+            } else {
+                return new StringScriptFieldTermQuery(
+                    new Script(""),
+                    ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx),
+                    name(),
+                    indexedValueForSearch(value).utf8ToString(),
+                    true
+                );
+            }
+        }
+
         @Override
         public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
             throws IOException {
@@ -521,7 +591,72 @@ public final class KeywordFieldMapper extends FieldMapper {
             boolean caseInsensitive,
             SearchExecutionContext context
         ) {
-            return super.wildcardQuery(value, method, caseInsensitive, true, context);
+            failIfNotIndexedNorDocValuesFallback(context);
+            if (isIndexed()) {
+                return super.wildcardQuery(value, method, caseInsensitive, true, context);
+            } else {
+                if (getTextSearchInfo().getSearchAnalyzer() != null) {
+                    value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer());
+                } else {
+                    value = indexedValueForSearch(value).utf8ToString();
+                }
+                return new StringScriptFieldWildcardQuery(
+                    new Script(""),
+                    ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx),
+                    name(),
+                    value,
+                    caseInsensitive
+                );
+            }
+        }
+
+        @Override
+        public Query normalizedWildcardQuery(String value, MultiTermQuery.RewriteMethod method, SearchExecutionContext context) {
+            failIfNotIndexedNorDocValuesFallback(context);
+            if (isIndexed()) {
+                return super.normalizedWildcardQuery(value, method, context);
+            } else {
+                if (getTextSearchInfo().getSearchAnalyzer() != null) {
+                    value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer());
+                } else {
+                    value = indexedValueForSearch(value).utf8ToString();
+                }
+                return new StringScriptFieldWildcardQuery(
+                    new Script(""),
+                    ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx),
+                    name(),
+                    value,
+                    false
+                );
+            }
+        }
+
+        @Override
+        public Query regexpQuery(
+            String value,
+            int syntaxFlags,
+            int matchFlags,
+            int maxDeterminizedStates,
+            MultiTermQuery.RewriteMethod method,
+            SearchExecutionContext context
+        ) {
+            failIfNotIndexedNorDocValuesFallback(context);
+            if (isIndexed()) {
+                return super.regexpQuery(value, syntaxFlags, matchFlags, maxDeterminizedStates, method, context);
+            } else {
+                if (matchFlags != 0) {
+                    throw new IllegalArgumentException("Match flags not yet implemented [" + matchFlags + "]");
+                }
+                return new StringScriptFieldRegexpQuery(
+                    new Script(""),
+                    ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx),
+                    name(),
+                    indexedValueForSearch(value).utf8ToString(),
+                    syntaxFlags,
+                    matchFlags,
+                    maxDeterminizedStates
+                );
+            }
         }
 
         @Override

+ 58 - 0
server/src/main/java/org/elasticsearch/script/field/SortedSetDocValuesStringFieldScript.java

@@ -0,0 +1,58 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.script.field;
+
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.script.StringFieldScript;
+import org.elasticsearch.search.lookup.SearchLookup;
+
+import java.io.IOException;
+import java.util.Map;
+
+public class SortedSetDocValuesStringFieldScript extends StringFieldScript {
+    private final SortedSetDocValues sortedSetDocValues;
+
+    boolean hasValue = false;
+
+    public SortedSetDocValuesStringFieldScript(String fieldName, SearchLookup searchLookup, LeafReaderContext ctx) {
+        super(fieldName, Map.of(), searchLookup, ctx);
+        try {
+            sortedSetDocValues = DocValues.getSortedSet(ctx.reader(), fieldName);
+        } catch (IOException e) {
+            throw new IllegalStateException("Cannot load doc values", e);
+        }
+    }
+
+    @Override
+    public void setDocument(int docID) {
+        try {
+            hasValue = sortedSetDocValues.advanceExact(docID);
+        } catch (IOException e) {
+            throw new IllegalStateException("Cannot load doc values", e);
+        }
+    }
+
+    @Override
+    public void execute() {
+        try {
+            if (hasValue) {
+                long ord;
+                while ((ord = sortedSetDocValues.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
+                    BytesRef bytesRef = sortedSetDocValues.lookupOrd(ord);
+                    emit(bytesRef.utf8ToString());
+                }
+            }
+        } catch (IOException e) {
+            throw new IllegalStateException("Cannot load doc values", e);
+        }
+    }
+}

+ 4 - 4
server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java

@@ -167,12 +167,12 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase {
         MappedFieldType ft = new KeywordFieldType("field");
         assertEquals(new RegexpQuery(new Term("field", "foo.*")), ft.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT));
 
-        MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
+        MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
         IllegalArgumentException e = expectThrows(
             IllegalArgumentException.class,
             () -> unsearchable.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT)
         );
-        assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
+        assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());
 
         ElasticsearchException ee = expectThrows(
             ElasticsearchException.class,
@@ -188,12 +188,12 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase {
             ft.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true, MOCK_CONTEXT)
         );
 
-        MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
+        MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
         IllegalArgumentException e = expectThrows(
             IllegalArgumentException.class,
             () -> unsearchable.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true, MOCK_CONTEXT)
         );
-        assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
+        assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());
 
         ElasticsearchException ee = expectThrows(
             ElasticsearchException.class,