Переглянути джерело

Add 'min_input_len' to completion suggester

Restrict the size of the input length to a reasonable size otherwise very
long strings can cause StackOverflowExceptions deep down in lucene land.
Yet, this is simply a saftly limit set to `50` UTF-16 codepoints by default.
This limit is only present at index time and not at query time. If prefix
completions > 50 UTF-16 codepoints are expected / desired this limit should be raised.
Critical string sizes are beyone the 1k UTF-16 Codepoints limit.

Closes #3596
Simon Willnauer 12 роки тому
батько
коміт
eb2fed85f1

+ 9 - 1
docs/reference/search/suggesters/completion-suggest.asciidoc

@@ -65,7 +65,7 @@ Mapping supports the following parameters:
 `payloads`:: 
     Enables the storing of payloads, defaults to `false`
 
-`preserve_separators`: 
+`preserve_separators`::
     Preserves the separators, defaults to `true`.
     If disabled, you could find a field starting with `Foo Fighters`, if you
     suggest for `foof`.
@@ -78,6 +78,14 @@ Mapping supports the following parameters:
     `The Beatles`, no need to change a simple analyzer, if you are able to
     enrich your data.
 
+`max_input_len`::
+    Limits the length of a single input, defaults to `50` UTF-16 code points.
+    This limit is only used at index time to reduce the total number of
+    characters per input string in order to prevent massive inputs from
+    bloating the underlying datastructure. The most usecases won't be influenced
+    by the default value since prefix completions hardly grow beyond prefixes longer
+    than a handful of characters.
+    
 ==== Indexing
 
 [source,js]

+ 76 - 16
src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java

@@ -31,16 +31,14 @@ import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
 import org.elasticsearch.index.fielddata.FieldDataType;
-import org.elasticsearch.index.mapper.Mapper;
-import org.elasticsearch.index.mapper.MapperException;
-import org.elasticsearch.index.mapper.MapperParsingException;
-import org.elasticsearch.index.mapper.ParseContext;
+import org.elasticsearch.index.mapper.*;
 import org.elasticsearch.index.similarity.SimilarityProvider;
 import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider;
 import org.elasticsearch.search.suggest.completion.CompletionPostingsFormatProvider;
 import org.elasticsearch.search.suggest.completion.CompletionTokenStream;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.util.List;
 import java.util.Map;
 
@@ -62,6 +60,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
         public static final boolean DEFAULT_PRESERVE_SEPARATORS = true;
         public static final boolean DEFAULT_POSITION_INCREMENTS = true;
         public static final boolean DEFAULT_HAS_PAYLOADS = false;
+        public static final int DEFAULT_MAX_INPUT_LENGTH = 50;
     }
 
     public static class Fields {
@@ -72,6 +71,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
         public static final String PRESERVE_POSITION_INCREMENTS = "preserve_position_increments";
         public static final String PAYLOADS = "payloads";
         public static final String TYPE = "type";
+        public static final String MAX_INPUT_LENGTH = "max_input_len";
     }
 
     public static class Builder extends AbstractFieldMapper.OpenBuilder<Builder, CompletionFieldMapper> {
@@ -81,6 +81,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
         private boolean preserveSeparators = Defaults.DEFAULT_PRESERVE_SEPARATORS;
         private boolean payloads = Defaults.DEFAULT_HAS_PAYLOADS;
         private boolean preservePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS;
+        private int maxInputLength = Defaults.DEFAULT_MAX_INPUT_LENGTH;
 
         public Builder(String name) {
             super(name, Defaults.FIELD_TYPE);
@@ -110,10 +111,19 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
             this.preservePositionIncrements = preservePositionIncrements;
             return this;
         }
+        
+        public Builder maxInputLength(int maxInputLength) {
+            if (maxInputLength <= 0) {
+                throw new ElasticSearchIllegalArgumentException(Fields.MAX_INPUT_LENGTH + " must be > 0 but was [" + maxInputLength + "]");
+            }
+            this.maxInputLength = maxInputLength;
+            return this;
+        }
 
         @Override
         public CompletionFieldMapper build(Mapper.BuilderContext context) {
-            return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads, preserveSeparators, preservePositionIncrements);
+            return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads,
+                    preserveSeparators, preservePositionIncrements, maxInputLength);
         }
     }
 
@@ -129,18 +139,23 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
                     continue;
                 }
                 if (fieldName.equals("analyzer")) {
-                    builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
-                    builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
+                    NamedAnalyzer analyzer = getNamedAnalyzer(parserContext, fieldNode.toString());
+                    builder.indexAnalyzer(analyzer);
+                    builder.searchAnalyzer(analyzer);
                 } else if (fieldName.equals(Fields.INDEX_ANALYZER) || fieldName.equals("indexAnalyzer")) {
-                    builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
+                    builder.indexAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString()));
                 } else if (fieldName.equals(Fields.SEARCH_ANALYZER) || fieldName.equals("searchAnalyzer")) {
-                    builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
+                    builder.searchAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString()));
                 } else if (fieldName.equals(Fields.PAYLOADS)) {
                     builder.payloads(Boolean.parseBoolean(fieldNode.toString()));
                 } else if (fieldName.equals(Fields.PRESERVE_SEPARATORS) || fieldName.equals("preserveSeparators")) {
                     builder.preserveSeparators(Boolean.parseBoolean(fieldNode.toString()));
                 } else if (fieldName.equals(Fields.PRESERVE_POSITION_INCREMENTS) || fieldName.equals("preservePositionIncrements")) {
                     builder.preservePositionIncrements(Boolean.parseBoolean(fieldNode.toString()));
+                } else if (fieldName.equals(Fields.MAX_INPUT_LENGTH) || fieldName.equals("maxInputLen")) {
+                    builder.maxInputLength(Integer.parseInt(fieldNode.toString()));
+                } else {
+                    throw new MapperParsingException("Unknown field [" + fieldName + "]");
                 }
             }
 
@@ -155,6 +170,14 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
             builder.postingsFormat(parserContext.postingFormatService().get("default"));
             return builder;
         }
+
+        private NamedAnalyzer getNamedAnalyzer(ParserContext parserContext, String name) {
+            NamedAnalyzer analyzer = parserContext.analysisService().analyzer(name);
+            if (analyzer == null) {
+                throw new ElasticSearchIllegalArgumentException("Can't find default or mapped analyzer with name [" + name +"]");
+            }
+            return analyzer;
+        }
     }
 
     private static final BytesRef EMPTY = new BytesRef();
@@ -164,15 +187,17 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
     private final boolean payloads;
     private final boolean preservePositionIncrements;
     private final boolean preserveSeparators;
+    private int maxInputLength;
 
     public CompletionFieldMapper(Names names, NamedAnalyzer indexAnalyzer, NamedAnalyzer searchAnalyzer, PostingsFormatProvider provider, SimilarityProvider similarity, boolean payloads,
-                                 boolean preserveSeparators, boolean preservePositionIncrements) {
+                                 boolean preserveSeparators, boolean preservePositionIncrements, int maxInputLength) {
         super(names, 1.0f, Defaults.FIELD_TYPE, indexAnalyzer, searchAnalyzer, provider, similarity, null);
         analyzingSuggestLookupProvider = new AnalyzingCompletionLookupProvider(preserveSeparators, false, preservePositionIncrements, payloads);
         this.completionPostingsFormatProvider = new CompletionPostingsFormatProvider("completion", provider, analyzingSuggestLookupProvider);
         this.preserveSeparators = preserveSeparators;
         this.payloads = payloads;
         this.preservePositionIncrements = preservePositionIncrements;
+        this.maxInputLength = maxInputLength;
     }
 
 
@@ -251,8 +276,20 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
     }
 
     public Field getCompletionField(String input, BytesRef payload) {
+        if (input.length() > maxInputLength) {
+            final int len = correctSubStringLen(input, Math.min(maxInputLength, input.length()));
+            input = input.substring(0, len);    
+        }
         return new SuggestField(names().fullName(), input, this.fieldType, payload, analyzingSuggestLookupProvider);
     }
+    
+    public static int correctSubStringLen(String input, int len) {
+        if (Character.isHighSurrogate(input.charAt(len-1))) {
+            assert input.length() >= len+1  && Character.isLowSurrogate(input.charAt(len));
+            return len + 1;
+        }
+        return len;
+    }
 
     public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException {
         return analyzingSuggestLookupProvider.buildPayload(
@@ -262,6 +299,12 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
     private static final class SuggestField extends Field {
         private final BytesRef payload;
         private final CompletionTokenStream.ToFiniteStrings toFiniteStrings;
+        
+        public SuggestField(String name, Reader value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
+            super(name, value, type);
+            this.payload = payload;
+            this.toFiniteStrings = toFiniteStrings;
+        }
 
         public SuggestField(String name, String value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
             super(name, value, type);
@@ -287,12 +330,11 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
             builder.field(Fields.INDEX_ANALYZER, indexAnalyzer.name())
                     .field(Fields.SEARCH_ANALYZER, searchAnalyzer.name());
         }
-        builder.field(Fields.PAYLOADS, this.payloads)
-                .field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators)
-                .field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements)
-                .endObject();
-
-        return builder;
+        builder.field(Fields.PAYLOADS, this.payloads);
+        builder.field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators);
+        builder.field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements);
+        builder.field(Fields.MAX_INPUT_LENGTH, this.maxInputLength);    
+        return builder.endObject();
     }
 
     @Override
@@ -328,4 +370,22 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
     public boolean isStoringPayloads() {
         return payloads;
     }
+    
+    @Override
+    public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException {
+        super.merge(mergeWith, mergeContext);
+        CompletionFieldMapper fieldMergeWith = (CompletionFieldMapper) mergeWith;
+        if (payloads != fieldMergeWith.payloads) {
+            mergeContext.addConflict("mapper [" + names.fullName() + "] has different payload values");
+        }
+        if (preservePositionIncrements != fieldMergeWith.preservePositionIncrements) { 
+            mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_position_increments' values");
+        }
+        if (preserveSeparators != fieldMergeWith.preserveSeparators) {    
+            mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_separators' values");
+        }
+        if (!mergeContext.mergeFlags().simulate()) {
+            this.maxInputLength = fieldMergeWith.maxInputLength;
+        }
+    }
 }

+ 11 - 0
src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java

@@ -28,6 +28,8 @@ import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
 import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
 import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequestBuilder;
 import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse;
+import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequestBuilder;
+import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse;
 import org.elasticsearch.action.count.CountResponse;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.search.ShardSearchFailure;
@@ -50,6 +52,15 @@ import static org.junit.Assert.fail;
 public class ElasticsearchAssertions {
 
 
+    public static void assertAcked(PutMappingRequestBuilder builder) {
+        assertAcked(builder.get());
+    }
+    
+    private static void assertAcked(PutMappingResponse response) {
+        assertThat("Put Mapping failed - not acked", response.isAcknowledged(), equalTo(true));
+
+    }
+
     public static void assertAcked(DeleteIndexRequestBuilder builder) {
         assertAcked(builder.get());
     }

+ 3 - 3
src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java

@@ -91,7 +91,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
         LookupFactory load = provider.load(input);
         PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
         NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
-        Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null));
+        Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null));
         List<LookupResult> result = lookup.lookup("ge", false, 10);
         assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters"));
         assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10"));
@@ -176,7 +176,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
 
         NamedAnalyzer namedAnalzyer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
         final CompletionFieldMapper mapper = new CompletionFieldMapper(new Names("foo"), namedAnalzyer, namedAnalzyer, provider, null, usePayloads,
-                preserveSeparators, preservePositionIncrements);
+                preserveSeparators, preservePositionIncrements, Integer.MAX_VALUE);
         Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights);
         Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
         field.setAccessible(true);
@@ -265,7 +265,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
         LookupFactory load = provider.load(input);
         PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
         NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
-        assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null)));
+        assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null)));
         dir.close();
     }
 

+ 58 - 0
src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java

@@ -32,11 +32,13 @@ import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.index.mapper.MapperException;
+import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
 import org.elasticsearch.search.suggest.Suggest;
 import org.elasticsearch.search.suggest.completion.CompletionStats;
 import org.elasticsearch.search.suggest.completion.CompletionSuggestion;
 import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder;
 import org.elasticsearch.search.suggest.completion.CompletionSuggestionFuzzyBuilder;
+import org.elasticsearch.test.hamcrest.ElasticsearchAssertions;
 import org.elasticsearch.test.integration.AbstractSharedClusterTest;
 import org.junit.Test;
 
@@ -717,4 +719,60 @@ public class CompletionSuggestSearchTests extends AbstractSharedClusterTest {
             }
         }
     }
+    
+    @Test
+    public void testMaxFieldLength() throws IOException {
+        client().admin().indices().prepareCreate(INDEX).get();
+        int iters = atLeast(10);
+        for (int i = 0; i < iters; i++) {
+            int len = between(3, 50);
+            String str = randomRealisticUnicodeOfCodepointLengthBetween(len+1, atLeast(len + 2));
+            ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
+                    .startObject(TYPE).startObject("properties")
+                    .startObject(FIELD)
+                    .field("type", "completion")
+                    .field("max_input_len", len)
+                    // upgrade mapping each time
+                    .field("analyzer", "keyword")
+                    .endObject()
+                    .endObject().endObject()
+                    .endObject()));
+            ensureYellow();
+            client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
+                    .startObject().startObject(FIELD)
+                    .startArray("input").value(str).endArray()
+                    .field("output", "foobar")
+                    .endObject().endObject()
+            ).setRefresh(true).get();
+            int prefixLen = CompletionFieldMapper.correctSubStringLen(str, between(1, len - 1));
+            assertSuggestions(str.substring(0, prefixLen), "foobar");
+            if (len + 1 < str.length()) {
+                assertSuggestions(str.substring(0, CompletionFieldMapper.correctSubStringLen(str,
+                        len + (Character.isHighSurrogate(str.charAt(len-1)) ? 2 : 1))));
+            }
+        }        
+    }
+    
+    @Test
+    // see #3596
+    public void testVeryLongInput()  throws IOException {
+        client().admin().indices().prepareCreate(INDEX).get();
+        ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
+                .startObject(TYPE).startObject("properties")
+                .startObject(FIELD)
+                .field("type", "completion")
+                .endObject()
+                .endObject().endObject()
+                .endObject()));
+        ensureYellow();
+        // can cause stack overflow without the default max_input_len
+        String longString = randomRealisticUnicodeOfLength(atLeast(5000));
+        client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
+                .startObject().startObject(FIELD)
+                .startArray("input").value(longString).endArray()
+                .field("output", "foobar")
+                .endObject().endObject()
+        ).setRefresh(true).get();
+        
+    }
 }

+ 3 - 1
src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java

@@ -19,7 +19,6 @@
 package org.elasticsearch.test.unit.index.mapper.completion;
 
 import org.elasticsearch.common.xcontent.XContentBuilder;
-import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.json.JsonXContent;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.FieldMapper;
@@ -64,6 +63,8 @@ public class CompletionFieldMapperTests {
                 .field("payloads", true)
                 .field("preserve_separators", false)
                 .field("preserve_position_increments", true)
+                .field("max_input_len", 14)
+
                 .endObject().endObject()
                 .endObject().endObject().string();
 
@@ -83,6 +84,7 @@ public class CompletionFieldMapperTests {
         assertThat(Boolean.valueOf(configMap.get("payloads").toString()), is(true));
         assertThat(Boolean.valueOf(configMap.get("preserve_separators").toString()), is(false));
         assertThat(Boolean.valueOf(configMap.get("preserve_position_increments").toString()), is(true));
+        assertThat(Integer.valueOf(configMap.get("max_input_len").toString()), is(14));
     }
 
     @Test