Browse Source

Allow doc-values only search on keyword fields (#82846)

Allows searching on keyword fields that are not indexed (index: false) but have doc values enabled.

This enables searches on archive data, which has access to doc values but not index structures. When combined with
searchable snapshots, it allows downloading only data for a given (doc value) field to quickly filter down to a select set
of documents.

Relates #81210 and #52728
Yannick Welsch 3 years ago
parent
commit
fd7f69cea6

+ 2 - 2
docs/reference/mapping/params/doc-values.asciidoc

@@ -17,8 +17,8 @@ makes this data access pattern possible. They store the same values as the
 sorting and aggregations. Doc values are supported on almost all field types,
 with the __notable exception of `text` and `annotated_text` fields__.
 
-<<number,Numeric types>>, such as `long` and `double`, and <<date,Date types>>
-can also be queried
+<<number,Numeric types>>, <<date,date types>>, and the <<keyword, keyword type>>
+can also be queried using term or range-based queries
 when they are not <<mapping-index,indexed>> but only have doc values enabled.
 Query performance on doc values is much slower than on index structures, but
 offers an interesting tradeoff between disk usage and query performance for

+ 4 - 1
docs/reference/mapping/types/keyword.asciidoc

@@ -80,7 +80,10 @@ The following parameters are accepted by `keyword` fields:
 
 <<mapping-index,`index`>>::
 
-    Should the field be searchable? Accepts `true` (default) or `false`.
+    Should the field be quickly searchable? Accepts `true` (default) and
+     `false`. `keyword` fields that only have <<doc-values,`doc_values`>>
+     enabled can still be queried using term or range-based queries,
+     albeit more slowly.
 
 <<index-options,`index_options`>>::
 

+ 1 - 1
docs/reference/query-dsl.asciidoc

@@ -33,7 +33,7 @@ the stability of the cluster. Those queries can be categorised as follows:
 
 * Queries that need to do linear scans to identify matches:
 ** <<query-dsl-script-query,`script` queries>>
-** queries on <<number,numeric>> and <<date,date>> fields that are not indexed
+** queries on <<number,numeric>>, <<date,date>>, or <<keyword,keyword>> fields that are not indexed
    but have <<doc-values,doc values>> enabled
 
 * Queries that have a high up-front cost:

+ 15 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/field_caps/10_basic.yml

@@ -86,6 +86,9 @@ setup:
                   non_indexed_date:
                     type:     date
                     index:    false
+                  non_indexed_keyword:
+                    type:     keyword
+                    index:    false
                   geo:
                     type:     keyword
                   object:
@@ -225,6 +228,18 @@ setup:
 
   - match: {fields.non_indexed_date.date.searchable:                       true}
 
+---
+"Field caps for keyword field with only doc values":
+  - skip:
+      version: " - 8.0.99"
+      reason: "doc values search was added in 8.1.0"
+  - do:
+      field_caps:
+        index: 'test1,test2,test3'
+        fields: non_indexed_keyword
+
+  - match: {fields.non_indexed_keyword.keyword.searchable:                 true}
+
 ---
 "Get object and nested field caps":
 

+ 32 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/390_doc_values_search.yml

@@ -36,6 +36,9 @@ setup:
                 type: date
                 format: yyyy/MM/dd
                 index: false
+              keyword:
+                type: keyword
+                index: false
 
   - do:
       index:
@@ -50,6 +53,7 @@ setup:
           long: 1
           short: 1
           date: "2017/01/01"
+          keyword: "key1"
 
   - do:
       index:
@@ -64,6 +68,7 @@ setup:
           long: 2
           short: 2
           date: "2017/01/02"
+          keyword: "key2"
 
   - do:
       indices.refresh: {}
@@ -220,3 +225,30 @@ setup:
         index: test
         body: { query: { range: { date: { gte: "2017/01/01" } } } }
   - length:   { hits.hits: 2  }
+
+---
+"Test match query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { match: { keyword: { query: "key1" } } } }
+  - length:   { hits.hits: 1  }
+
+---
+"Test terms query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { terms: { keyword: [ "key1", "key2" ] } } }
+  - length:   { hits.hits: 2  }
+
+---
+"Test range query on keyword field where only doc values are enabled":
+
+  - do:
+      search:
+        index: test
+        body: { query: { range: { keyword: { gte: "key1" } } } }
+  - length:   { hits.hits: 2  }

+ 10 - 0
server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

@@ -338,6 +338,16 @@ public final class KeywordFieldMapper extends FieldMapper {
             this.isDimension = false;
         }
 
+        @Override
+        protected boolean allowDocValueBasedQueries() {
+            return true;
+        }
+
+        @Override
+        public boolean isSearchable() {
+            return isIndexed() || hasDocValues();
+        }
+
         @Override
         public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
             throws IOException {

+ 23 - 8
server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java

@@ -9,6 +9,7 @@
 package org.elasticsearch.index.mapper;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.AutomatonQuery;
 import org.apache.lucene.search.FuzzyQuery;
@@ -210,13 +211,27 @@ public abstract class StringFieldType extends TermBasedFieldType {
                     + "' is set to false."
             );
         }
-        failIfNotIndexed();
-        return new TermRangeQuery(
-            name(),
-            lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
-            upperTerm == null ? null : indexedValueForSearch(upperTerm),
-            includeLower,
-            includeUpper
-        );
+        if (allowDocValueBasedQueries()) {
+            failIfNotIndexedNorDocValuesFallback(context);
+        } else {
+            failIfNotIndexed();
+        }
+        if (isIndexed()) {
+            return new TermRangeQuery(
+                name(),
+                lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
+                upperTerm == null ? null : indexedValueForSearch(upperTerm),
+                includeLower,
+                includeUpper
+            );
+        } else {
+            return SortedSetDocValuesField.newSlowRangeQuery(
+                name(),
+                lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
+                upperTerm == null ? null : indexedValueForSearch(upperTerm),
+                includeLower,
+                includeUpper
+            );
+        }
     }
 }

+ 26 - 4
server/src/main/java/org/elasticsearch/index/mapper/TermBasedFieldType.java

@@ -8,7 +8,9 @@
 
 package org.elasticsearch.index.mapper;
 
+import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.sandbox.search.DocValuesTermsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermInSetQuery;
 import org.apache.lucene.search.TermQuery;
@@ -35,6 +37,10 @@ public abstract class TermBasedFieldType extends SimpleMappedFieldType {
         super(name, isIndexed, isStored, hasDocValues, textSearchInfo, meta);
     }
 
+    protected boolean allowDocValueBasedQueries() {
+        return false;
+    }
+
     /** Returns the indexed value used to construct search "values".
      *  This method is used for the default implementations of most
      *  query factory methods such as {@link #termQuery}. */
@@ -55,15 +61,31 @@ public abstract class TermBasedFieldType extends SimpleMappedFieldType {
 
     @Override
     public Query termQuery(Object value, SearchExecutionContext context) {
-        failIfNotIndexed();
-        return new TermQuery(new Term(name(), indexedValueForSearch(value)));
+        if (allowDocValueBasedQueries()) {
+            failIfNotIndexedNorDocValuesFallback(context);
+        } else {
+            failIfNotIndexed();
+        }
+        if (isIndexed()) {
+            return new TermQuery(new Term(name(), indexedValueForSearch(value)));
+        } else {
+            return SortedSetDocValuesField.newSlowExactQuery(name(), indexedValueForSearch(value));
+        }
     }
 
     @Override
     public Query termsQuery(Collection<?> values, SearchExecutionContext context) {
-        failIfNotIndexed();
+        if (allowDocValueBasedQueries()) {
+            failIfNotIndexedNorDocValuesFallback(context);
+        } else {
+            failIfNotIndexed();
+        }
         BytesRef[] bytesRefs = values.stream().map(this::indexedValueForSearch).toArray(BytesRef[]::new);
-        return new TermInSetQuery(name(), bytesRefs);
+        if (isIndexed()) {
+            return new TermInSetQuery(name(), bytesRefs);
+        } else {
+            return new DocValuesTermsQuery(name(), bytesRefs);
+        }
     }
 
 }

+ 32 - 14
server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java

@@ -17,7 +17,9 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.sandbox.search.DocValuesTermsQuery;
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.NormsFieldExistsQuery;
@@ -52,7 +54,7 @@ import java.util.Map;
 public class KeywordFieldTypeTests extends FieldTypeTestCase {
 
     public void testIsFieldWithinQuery() throws IOException {
-        KeywordFieldType ft = new KeywordFieldType("field");
+        KeywordFieldType ft = new KeywordFieldType("field", randomBoolean(), randomBoolean(), Map.of());
         // current impl ignores args and should always return INTERSECTS
         assertEquals(
             Relation.INTERSECTS,
@@ -64,18 +66,21 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase {
                 randomBoolean(),
                 null,
                 null,
-                null
+                MOCK_CONTEXT
             )
         );
     }
 
     public void testTermQuery() {
         MappedFieldType ft = new KeywordFieldType("field");
-        assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", null));
+        assertEquals(new TermQuery(new Term("field", "foo")), ft.termQuery("foo", MOCK_CONTEXT));
 
-        MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> unsearchable.termQuery("bar", null));
-        assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
+        MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of());
+        assertEquals(SortedSetDocValuesField.newSlowExactQuery("field", new BytesRef("foo")), ft2.termQuery("foo", MOCK_CONTEXT));
+
+        MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> unsearchable.termQuery("bar", MOCK_CONTEXT));
+        assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());
     }
 
     public void testTermQueryWithNormalizer() {
@@ -93,7 +98,7 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase {
             }
         };
         MappedFieldType ft = new KeywordFieldType("field", new NamedAnalyzer("my_normalizer", AnalyzerScope.INDEX, normalizer));
-        assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", null));
+        assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", MOCK_CONTEXT));
     }
 
     public void testTermsQuery() {
@@ -101,30 +106,37 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase {
         List<BytesRef> terms = new ArrayList<>();
         terms.add(new BytesRef("foo"));
         terms.add(new BytesRef("bar"));
-        assertEquals(new TermInSetQuery("field", terms), ft.termsQuery(Arrays.asList("foo", "bar"), null));
+        assertEquals(new TermInSetQuery("field", terms), ft.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT));
 
-        MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
+        MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of());
+        assertEquals(new DocValuesTermsQuery("field", terms), ft2.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT));
+
+        MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
         IllegalArgumentException e = expectThrows(
             IllegalArgumentException.class,
-            () -> unsearchable.termsQuery(Arrays.asList("foo", "bar"), null)
+            () -> unsearchable.termsQuery(Arrays.asList("foo", "bar"), MOCK_CONTEXT)
         );
-        assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
+        assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());
     }
 
     public void testExistsQuery() {
         {
             KeywordFieldType ft = new KeywordFieldType("field");
-            assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(null));
+            assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT));
+        }
+        {
+            KeywordFieldType ft = new KeywordFieldType("field", false, true, Map.of());
+            assertEquals(new DocValuesFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT));
         }
         {
             FieldType fieldType = new FieldType();
             fieldType.setOmitNorms(false);
             KeywordFieldType ft = new KeywordFieldType("field", fieldType);
-            assertEquals(new NormsFieldExistsQuery("field"), ft.existsQuery(null));
+            assertEquals(new NormsFieldExistsQuery("field"), ft.existsQuery(MOCK_CONTEXT));
         }
         {
             KeywordFieldType ft = new KeywordFieldType("field", true, false, Collections.emptyMap());
-            assertEquals(new TermQuery(new Term(FieldNamesFieldMapper.NAME, "field")), ft.existsQuery(null));
+            assertEquals(new TermQuery(new Term(FieldNamesFieldMapper.NAME, "field")), ft.existsQuery(MOCK_CONTEXT));
         }
     }
 
@@ -135,6 +147,12 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase {
             ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT)
         );
 
+        MappedFieldType ft2 = new KeywordFieldType("field", false, true, Map.of());
+        assertEquals(
+            SortedSetDocValuesField.newSlowRangeQuery("field", BytesRefs.toBytesRef("foo"), BytesRefs.toBytesRef("bar"), true, false),
+            ft2.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT)
+        );
+
         ElasticsearchException ee = expectThrows(
             ElasticsearchException.class,
             () -> ft.rangeQuery("foo", "bar", true, false, null, null, null, MOCK_CONTEXT_DISALLOW_EXPENSIVE)

+ 9 - 9
x-pack/plugin/sql/qa/server/src/main/java/org/elasticsearch/xpack/sql/qa/jdbc/SysColumnsTestCase.java

@@ -50,22 +50,22 @@ public class SysColumnsTestCase extends JdbcIntegrationTestCase {
 
     public void testAliasWithIncompatibleSearchableProperty() throws Exception {
         createIndexWithMapping("test1", builder -> {
-            builder.startObject("id").field("type", "keyword").endObject();
+            builder.startObject("id").field("type", "text").endObject();
             builder.startObject("value").field("type", "boolean").endObject();
         });
 
         createIndexWithMapping("test2", builder -> {
-            builder.startObject("id").field("type", "keyword").field("index", false).endObject();
+            builder.startObject("id").field("type", "text").field("index", false).endObject();
             builder.startObject("value").field("type", "boolean").endObject();
         });
 
         createIndexWithMapping("test3", builder -> {
-            builder.startObject("id").field("type", "keyword").field("index", false).endObject();
+            builder.startObject("id").field("type", "text").field("index", false).endObject();
             builder.startObject("value").field("type", "boolean").endObject();
         });
 
         createIndexWithMapping("test4", builder -> {
-            builder.startObject("id").field("type", "keyword").field("index", false).endObject();
+            builder.startObject("id").field("type", "text").field("index", false).endObject();
             builder.startObject("value").field("type", "boolean").endObject();
         });
 
@@ -79,16 +79,16 @@ public class SysColumnsTestCase extends JdbcIntegrationTestCase {
         assertResultsForQuery(
             "SYS COLUMNS",
             new String[][] {
-                { "test1", "id", "KEYWORD" },
+                { "test1", "id", "TEXT" },
                 { "test1", "value", "BOOLEAN" },
-                { "test2", "id", "KEYWORD" },
+                { "test2", "id", "TEXT" },
                 { "test2", "value", "BOOLEAN" },
-                { "test3", "id", "KEYWORD" },
+                { "test3", "id", "TEXT" },
                 { "test3", "value", "BOOLEAN" },
-                { "test4", "id", "KEYWORD" },
+                { "test4", "id", "TEXT" },
                 { "test4", "value", "BOOLEAN" },
                 { "test_alias", "value", "BOOLEAN" },
-                { "test_alias2", "id", "KEYWORD" },
+                { "test_alias2", "id", "TEXT" },
                 { "test_alias2", "value", "BOOLEAN" } }
         );
     }