Browse Source

Mapping: Default position_offset_gap to 100

This is much more fiddly than you'd expect it to be because of the way
position_offset_gap is applied in StringFieldMapper. Instead of setting
the default to 100 its simpler to make sure that all the analyzers default
to 100 and that StringFieldMapper doesn't override the default unless the
user specifies something different. Unless the index was created before
2.1, in which case the old default of 0 has to take.

Also postition_offset_gaps less than 0 aren't allowed at all.

New tests test that:
1. the new default doesn't match phrases across values with reasonably low
slop (5)
2. the new default doest match phrases across values with reasonably high
slop (50)
3. you can override the value and phrases work as you'd expect
4. if you leave the value undefined in the mapping and define it on a
custom analyzer the the value from the custom analyzer shines through

Closes #7268
Nik Everett 10 years ago
parent
commit
4b9664beeb

+ 22 - 2
core/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java

@@ -29,6 +29,7 @@ import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.AbstractIndexComponent;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.mapper.core.StringFieldMapper;
 import org.elasticsearch.index.settings.IndexSettings;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 
@@ -215,19 +216,38 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
 
         Map<String, NamedAnalyzer> analyzers = newHashMap();
         for (AnalyzerProvider analyzerFactory : analyzerProviders.values()) {
+            /*
+             * Lucene defaults positionOffsetGap to 0 in all analyzers but
+             * Elasticsearch defaults them to 0 only before version 2.1
+             * and 100 afterwards so we override the positionOffsetGap if it
+             * doesn't match here.
+             */
+            int overridePositionOffsetGap = StringFieldMapper.Defaults.positionOffsetGap(Version.indexCreated(indexSettings));
             if (analyzerFactory instanceof CustomAnalyzerProvider) {
                 ((CustomAnalyzerProvider) analyzerFactory).build(this);
+                /*
+                 * Custom analyzers already default to the correct, version
+                 * dependent positionOffsetGap and the user is be able to
+                 * configure the positionOffsetGap directly on the analyzer so
+                 * we disable overriding the positionOffsetGap to preserve the
+                 * user's setting.
+                 */
+                overridePositionOffsetGap = Integer.MIN_VALUE;
             }
             Analyzer analyzerF = analyzerFactory.get();
             if (analyzerF == null) {
                 throw new IllegalArgumentException("analyzer [" + analyzerFactory.name() + "] created null analyzer");
             }
             NamedAnalyzer analyzer;
-            // if we got a named analyzer back, use it...
             if (analyzerF instanceof NamedAnalyzer) {
+                // if we got a named analyzer back, use it...
                 analyzer = (NamedAnalyzer) analyzerF;
+                if (overridePositionOffsetGap >= 0 && analyzer.getPositionIncrementGap(analyzer.name()) != overridePositionOffsetGap) {
+                    // unless the positionOffsetGap needs to be overridden
+                    analyzer = new NamedAnalyzer(analyzer, overridePositionOffsetGap);
+                }
             } else {
-                analyzer = new NamedAnalyzer(analyzerFactory.name(), analyzerFactory.scope(), analyzerF);
+                analyzer = new NamedAnalyzer(analyzerFactory.name(), analyzerFactory.scope(), analyzerF, overridePositionOffsetGap);
             }
             analyzers.put(analyzerFactory.name(), analyzer);
             analyzers.put(Strings.toCamelCase(analyzerFactory.name()), analyzer);

+ 4 - 1
core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java

@@ -19,10 +19,12 @@
 
 package org.elasticsearch.index.analysis;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.mapper.core.StringFieldMapper;
 import org.elasticsearch.index.settings.IndexSettings;
 
 import java.util.List;
@@ -77,7 +79,8 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
             tokenFilters.add(tokenFilter);
         }
 
-        int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", 0);
+        int positionOffsetGapDefault = StringFieldMapper.Defaults.positionOffsetGap(Version.indexCreated(indexSettings));
+        int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", positionOffsetGapDefault);
         int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
 
         this.customAnalyzer = new CustomAnalyzer(tokenizer,

+ 32 - 5
core/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java

@@ -24,6 +24,7 @@ import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -52,6 +53,7 @@ import static org.elasticsearch.index.mapper.core.TypeParsers.parseMultiField;
 public class StringFieldMapper extends FieldMapper implements AllFieldMapper.IncludeInAll {
 
     public static final String CONTENT_TYPE = "string";
+    private static final int POSITION_OFFSET_GAP_USE_ANALYZER = -1;
 
     public static class Defaults {
         public static final MappedFieldType FIELD_TYPE = new StringFieldType();
@@ -62,15 +64,36 @@ public class StringFieldMapper extends FieldMapper implements AllFieldMapper.Inc
 
         // NOTE, when adding defaults here, make sure you add them in the builder
         public static final String NULL_VALUE = null;
-        public static final int POSITION_OFFSET_GAP = 0;
+        /**
+         * Post 2.1 default for position_offset_gap. Set to 100 so that
+         * phrase queries of reasonably high slop will not match across field
+         * values.
+         */
+        public static final int POSITION_OFFSET_GAP = 100;
+        public static final int POSITION_OFFSET_GAP_PRE_2_1 = 0;
         public static final int IGNORE_ABOVE = -1;
+
+        /**
+         * The default position_offset_gap for a particular version of Elasticsearch.
+         */
+        public static int positionOffsetGap(Version version) {
+            if (version.before(Version.V_2_1_0)) {
+                return POSITION_OFFSET_GAP_PRE_2_1;
+            }
+            return POSITION_OFFSET_GAP;
+        }
     }
 
     public static class Builder extends FieldMapper.Builder<Builder, StringFieldMapper> {
 
         protected String nullValue = Defaults.NULL_VALUE;
 
-        protected int positionOffsetGap = Defaults.POSITION_OFFSET_GAP;
+        /**
+         * The distance between tokens from different values in the same field.
+         * POSITION_OFFSET_GAP_USE_ANALYZER means default to the analyzer's
+         * setting which in turn defaults to Defaults.POSITION_OFFSET_GAP.
+         */
+        protected int positionOffsetGap = POSITION_OFFSET_GAP_USE_ANALYZER;
 
         protected int ignoreAbove = Defaults.IGNORE_ABOVE;
 
@@ -102,7 +125,7 @@ public class StringFieldMapper extends FieldMapper implements AllFieldMapper.Inc
 
         @Override
         public StringFieldMapper build(BuilderContext context) {
-            if (positionOffsetGap > 0) {
+            if (positionOffsetGap != POSITION_OFFSET_GAP_USE_ANALYZER) {
                 fieldType.setIndexAnalyzer(new NamedAnalyzer(fieldType.indexAnalyzer(), positionOffsetGap));
                 fieldType.setSearchAnalyzer(new NamedAnalyzer(fieldType.searchAnalyzer(), positionOffsetGap));
                 fieldType.setSearchQuoteAnalyzer(new NamedAnalyzer(fieldType.searchQuoteAnalyzer(), positionOffsetGap));
@@ -154,7 +177,11 @@ public class StringFieldMapper extends FieldMapper implements AllFieldMapper.Inc
                     builder.searchQuotedAnalyzer(analyzer);
                     iterator.remove();
                 } else if (propName.equals("position_offset_gap")) {
-                    builder.positionOffsetGap(XContentMapValues.nodeIntegerValue(propNode, -1));
+                    int newPositionOffsetGap = XContentMapValues.nodeIntegerValue(propNode, -1);
+                    if (newPositionOffsetGap < 0) {
+                        throw new MapperParsingException("positions_offset_gap less than 0 aren't allowed.");
+                    }
+                    builder.positionOffsetGap(newPositionOffsetGap);
                     // we need to update to actual analyzers if they are not set in this case...
                     // so we can inject the position offset gap...
                     if (builder.fieldType().indexAnalyzer() == null) {
@@ -354,7 +381,7 @@ public class StringFieldMapper extends FieldMapper implements AllFieldMapper.Inc
             builder.field("include_in_all", false);
         }
 
-        if (includeDefaults || positionOffsetGap != Defaults.POSITION_OFFSET_GAP) {
+        if (includeDefaults || positionOffsetGap != POSITION_OFFSET_GAP_USE_ANALYZER) {
             builder.field("position_offset_gap", positionOffsetGap);
         }
         NamedAnalyzer searchQuoteAnalyzer = fieldType().searchQuoteAnalyzer();

+ 11 - 0
core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java

@@ -21,6 +21,7 @@ package org.elasticsearch.bwcompat;
 
 import com.google.common.base.Predicate;
 import com.google.common.util.concurrent.ListenableFuture;
+
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
@@ -40,6 +41,7 @@ import org.elasticsearch.common.util.MultiDataPathUpgrader;
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.env.NodeEnvironment;
 import org.elasticsearch.index.engine.EngineConfig;
+import org.elasticsearch.index.mapper.string.StringFieldMapperPositionOffsetGapTests;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.shard.MergePolicyConfig;
 import org.elasticsearch.indices.recovery.RecoverySettings;
@@ -330,6 +332,7 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
         assertNewReplicasWork(indexName);
         assertUpgradeWorks(indexName, isLatestLuceneVersion(version));
         assertDeleteByQueryWorked(indexName, version);
+        assertPositionOffsetGapDefaults(indexName, version);
         unloadIndex(indexName);
     }
 
@@ -442,6 +445,14 @@ public class OldIndexBackwardsCompatibilityIT extends ESIntegTestCase {
         assertEquals(0, searchReq.get().getHits().getTotalHits());
     }
 
+    void assertPositionOffsetGapDefaults(String indexName, Version version) throws Exception {
+        if (version.before(Version.V_2_1_0)) {
+            StringFieldMapperPositionOffsetGapTests.assertGapIsZero(client(), indexName, "doc");
+        } else {
+            StringFieldMapperPositionOffsetGapTests.assertGapIsOneHundred(client(), indexName, "doc");
+        }
+    }
+
     void assertUpgradeWorks(String indexName, boolean alreadyLatest) throws Exception {
         if (alreadyLatest == false) {
             UpgradeIT.assertNotUpgraded(client(), indexName);

+ 158 - 0
core/src/test/java/org/elasticsearch/index/mapper/string/StringFieldMapperPositionOffsetGapTests.java

@@ -0,0 +1,158 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.string;
+
+import com.google.common.collect.ImmutableList;
+
+import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.index.mapper.MapperParsingException;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+
+import java.io.IOException;
+
+import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
+import static org.hamcrest.Matchers.containsString;
+
+/**
+ * Tests that position_offset_gap is read from the mapper and applies as
+ * expected in queries.
+ */
+public class StringFieldMapperPositionOffsetGapTests extends ESSingleNodeTestCase {
+    /**
+     * The default position_offset_gap should be large enough that most
+     * "sensible" queries phrase slops won't match across values.
+     */
+    public void testDefault() throws IOException {
+        assertGapIsOneHundred(client(), "test", "test");
+    }
+
+    /**
+     * Asserts that the post-2.0 default is being applied.
+     */
+    public static void assertGapIsOneHundred(Client client, String indexName, String type) throws IOException {
+        testGap(client(), indexName, type, 100);
+
+        // No match across gap using default slop with default positionOffsetGap
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two")).get(), 0);
+
+        // Nor with small-ish values
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(5)).get(), 0);
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(50)).get(), 0);
+
+        // But huge-ish values still match
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(500)).get(), 1);
+    }
+
+    public void testZero() throws IOException {
+        setupGapInMapping(0);
+        assertGapIsZero(client(), "test", "test");
+    }
+
+    /**
+     * Asserts that the pre-2.0 default has been applied or explicitly
+     * configured.
+     */
+    public static void assertGapIsZero(Client client, String indexName, String type) throws IOException {
+        testGap(client, indexName, type, 0);
+        /*
+         * Phrases match across different values using default slop with pre-2.0 default
+         * position_offset_gap.
+         */
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two")).get(), 1);
+    }
+
+    public void testLargerThanDefault() throws IOException {
+        setupGapInMapping(10000);
+        testGap(client(), "test", "test", 10000);
+    }
+
+    public void testSmallerThanDefault() throws IOException {
+        setupGapInMapping(2);
+        testGap(client(), "test", "test", 2);
+    }
+
+    public void testNegativeIsError() throws IOException {
+        try {
+            setupGapInMapping(-1);
+            fail("Expected an error");
+        } catch (MapperParsingException e) {
+            assertThat(ExceptionsHelper.detailedMessage(e), containsString("positions_offset_gap less than 0 aren't allowed"));
+        }
+    }
+
+    /**
+     * Tests that the default actually defaults to the position_offset_gap
+     * configured in the analyzer. This behavior is very old and a little
+     * strange but not worth breaking some thought.
+     */
+    public void testDefaultDefaultsToAnalyzer() throws IOException {
+        XContentBuilder settings = XContentFactory.jsonBuilder().startObject().startObject("analysis").startObject("analyzer")
+                .startObject("gappy");
+        settings.field("type", "custom");
+        settings.field("tokenizer", "standard");
+        settings.field("position_offset_gap", 2);
+        setupAnalyzer(settings, "gappy");
+        testGap(client(), "test", "test", 2);
+    }
+
+    /**
+     * Build an index named "test" with a field named "string" with the provided
+     * positionOffsetGap that uses the standard analyzer.
+     */
+    private void setupGapInMapping(int positionOffsetGap) throws IOException {
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("properties").startObject("string");
+        mapping.field("type", "string");
+        mapping.field("position_offset_gap", positionOffsetGap);
+        client().admin().indices().prepareCreate("test").addMapping("test", mapping).get();
+    }
+
+    /**
+     * Build an index named "test" with the provided settings and and a field
+     * named "string" that uses the specified analyzer and default
+     * position_offset_gap.
+     */
+    private void setupAnalyzer(XContentBuilder settings, String analyzer) throws IOException {
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("properties").startObject("string");
+        mapping.field("type", "string");
+        mapping.field("analyzer", analyzer);
+        client().admin().indices().prepareCreate("test").addMapping("test", mapping).setSettings(settings).get();
+    }
+
+    private static void testGap(Client client, String indexName, String type, int positionOffsetGap) throws IOException {
+        client.prepareIndex(indexName, type, "position_gap_test").setSource("string", ImmutableList.of("one", "two three")).setRefresh(true).get();
+
+        // Baseline - phrase query finds matches in the same field value
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "two three")).get(), 1);
+
+        if (positionOffsetGap > 0) {
+            // No match across gaps when slop < position gap
+            assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap - 1)).get(),
+                    0);
+        }
+
+        // Match across gaps when slop >= position gap
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap)).get(), 1);
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap + 1)).get(), 1);
+    }
+}

+ 5 - 3
docs/reference/analysis/analyzers/custom-analyzer.asciidoc

@@ -20,8 +20,10 @@ filters.
 |`char_filter` |An optional list of logical / registered name of char
 filters.
 
-|`position_offset_gap` |An optional number of positions to increment 
-between each field value of a field using this analyzer.
+|`position_offset_gap` |An optional number of positions to increment
+between each field value of a field using this analyzer. Defaults to 100.
+100 was chosen because it prevents phrase queries with reasonably large
+slops (less than 100) from matching terms across field values.
 |=======================================================================
 
 Here is an example:
@@ -30,7 +32,7 @@ Here is an example:
 --------------------------------------------------
 index :
     analysis :
-        analyzer : 
+        analyzer :
             myAnalyzer2 :
                 type : custom
                 tokenizer : myTokenizer1

+ 5 - 2
docs/reference/mapping/types/string.asciidoc

@@ -145,6 +145,11 @@ Defaults depend on the <<mapping-index,`index`>> setting:
 
     The number of fake term positions which should be inserted between
     each element of an array of strings. Defaults to 0.
+    The number of fake term position which should be inserted between each
+    element of an array of strings. Defaults to the position_offset_gap
+    configured on the analyzer which defaults to 100. 100 was chosen because it
+    prevents phrase queries with reasonably large slops (less than 100) from
+    matching terms across field values.
 
 <<mapping-store,`store`>>::
 
@@ -166,5 +171,3 @@ Defaults depend on the <<mapping-index,`index`>> setting:
 
     Whether term vectors should be stored for an <<mapping-index,`analyzed`>>
     field. Defaults to `no`.
-
-

+ 1 - 1
docs/reference/migration/migrate_2_0.asciidoc

@@ -50,4 +50,4 @@ include::migrate_2_0/settings.asciidoc[]
 
 include::migrate_2_0/stats.asciidoc[]
 
-include::migrate_2_0/java.asciidoc[]
+include::migrate_2_0/java.asciidoc[]

+ 10 - 0
docs/reference/migration/migrate_2_1.asciidoc

@@ -25,3 +25,13 @@ GET /my_index/_search?scroll=2m
 Scroll requests sorted by `_doc` have been optimized to more efficiently resume
 from where the previous request stopped, so this will have the same performance
 characteristics as the former `scan` search type.
+
+=== Mapping changes
+
+==== position_offset_gap
+The default `position_offset_grap` is now 100. Indexes created in Elasticsearch
+2.1.0 will default to using 100 and indexes created before that will continue
+to use the old default of 0. This was done to prevent phrase queries from
+matching across different values of the same term unexpectedly. Specifically,
+100 was chosen to cause phrase queries with slops up to 99 to match only within
+a single value of a field.