1
0
Эх сурвалжийг харах

Configurable distance limit with the AUTO fuzziness. (#25731)

Make the distance thresholds configurable with the AUTO fuzziness.
Antonio Matarrese 8 жил өмнө
parent
commit
93cc2d0372

+ 67 - 6
core/src/main/java/org/elasticsearch/common/unit/Fuzziness.java

@@ -18,6 +18,8 @@
  */
 package org.elasticsearch.common.unit;
 
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -43,8 +45,12 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
     public static final Fuzziness TWO = new Fuzziness(2);
     public static final Fuzziness AUTO = new Fuzziness("AUTO");
     public static final ParseField FIELD = new ParseField(X_FIELD_NAME);
+    private static final int DEFAULT_LOW_DISTANCE = 3;
+    private static final int DEFAULT_HIGH_DISTANCE = 6;
 
     private final String fuzziness;
+    private int lowDistance = DEFAULT_LOW_DISTANCE;
+    private int highDistance = DEFAULT_HIGH_DISTANCE;
 
     private Fuzziness(int fuzziness) {
         if (fuzziness != 0 && fuzziness != 1 && fuzziness != 2) {
@@ -54,22 +60,48 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
     }
 
     private Fuzziness(String fuzziness) {
-        if (fuzziness == null) {
+        if (fuzziness == null || fuzziness.isEmpty()) { // an empty string is rejected with the same message as null
             throw new IllegalArgumentException("fuzziness can't be null!");
         }
         this.fuzziness = fuzziness.toUpperCase(Locale.ROOT); // stored upper-cased, independent of the default locale
     }
 
+    /**
+     * Creates a fuzziness that carries custom low/high length thresholds.
+     *
+     * @param fuzziness    the fuzziness name; validated and upper-cased by the single-argument constructor
+     * @param lowDistance  term length below which {@link #asDistance(String)} yields 0 edits for AUTO
+     * @param highDistance term length below which {@link #asDistance(String)} yields 1 edit for AUTO
+     * @throws IllegalArgumentException if either bound is negative or lowDistance is greater than highDistance
+     */
+    private Fuzziness(String fuzziness, int lowDistance, int highDistance) {
+        this(fuzziness);
+        if (lowDistance < 0 || highDistance < 0 || lowDistance > highDistance) {
+            // the message mirrors the check: zero is an accepted bound, so the constraint is ">= 0"
+            throw new IllegalArgumentException("fuzziness wrongly configured, must be: lowDistance >= 0, highDistance" +
+                " >= 0 and lowDistance <= highDistance");
+        }
+        this.lowDistance = lowDistance;
+        this.highDistance = highDistance;
+    }
+
     /**
      * Read from a stream.
      */
     public Fuzziness(StreamInput in) throws IOException {
         fuzziness = in.readString();
+        if (in.getVersion().onOrAfter(Version.V_6_1_0) && in.readBoolean()) { // the flag is only read from 6.1.0+ streams
+            lowDistance = in.readVInt();
+            highDistance = in.readVInt();
+        } // when skipped, both bounds keep their field-initializer defaults
     }
 
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         out.writeString(fuzziness);
+        if (out.getVersion().onOrAfter(Version.V_6_1_0)) {
+            // The low/high bounds are only written to nodes on or after 6.1.0, since older nodes do not know
+            // about them and receive just the fuzziness string. This is a best-effort to not fail queries in
+            // case the cluster is being upgraded and users start using features not available on all nodes.
+            if (isAutoWithCustomValues()) {
+                out.writeBoolean(true); // flag: custom bounds follow
+                out.writeVInt(lowDistance);
+                out.writeVInt(highDistance);
+            } else {
+                out.writeBoolean(false); // flag: no bounds follow
+            }
+        }
     }
 
     /**
@@ -88,10 +120,29 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
         String string = fuzziness.toString();
         if (AUTO.asString().equalsIgnoreCase(string)) {
             return AUTO;
+        } else if (string.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":")) {
+            return parseCustomAuto(string);
         }
         return new Fuzziness(string);
     }
 
+    private static Fuzziness parseCustomAuto( final String string) {
+        assert string.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":");
+        String[] fuzzinessLimit = string.substring(AUTO.asString().length() + 1).split(",");
+        if (fuzzinessLimit.length == 2) {
+            try {
+                int lowerLimit = Integer.parseInt(fuzzinessLimit[0]);
+                int highLimit = Integer.parseInt(fuzzinessLimit[1]);
+                return new Fuzziness("AUTO", lowerLimit, highLimit);
+            } catch (NumberFormatException e) {
+                throw new ElasticsearchParseException("failed to parse [{}] as a \"auto:int,int\"", e,
+                    string);
+            }
+        } else {
+            throw new ElasticsearchParseException("failed to find low and high distance values");
+        }
+    }
+
     public static Fuzziness parse(XContentParser parser) throws IOException {
         XContentParser.Token token = parser.currentToken();
         switch (token) {
@@ -100,6 +151,8 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
                 final String fuzziness = parser.text();
                 if (AUTO.asString().equalsIgnoreCase(fuzziness)) {
                     return AUTO;
+                } else if (fuzziness.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":")) {
+                    return parseCustomAuto(fuzziness);
                 }
                 try {
                     final int minimumSimilarity = Integer.parseInt(fuzziness);
@@ -135,19 +188,19 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
     public int asDistance(String text) {
         if (this.equals(AUTO)) { //AUTO
             final int len = termLen(text);
-            if (len <= 2) {
+            if (len < lowDistance) { // shorter than the low threshold: no edits allowed
                 return 0;
-            } else if (len > 5) {
-                return 2;
-            } else {
+            } else if (len < highDistance) { // at least low but shorter than the high threshold: one edit
                 return 1;
+            } else { // at or above the high threshold: two edits
+                return 2;
             }
         }
         return Math.min(2, (int) asFloat());
     }
 
     public float asFloat() {
-        if (this.equals(AUTO)) {
+        if (this.equals(AUTO) || isAutoWithCustomValues()) {
             return 1f;
         }
         return Float.parseFloat(fuzziness.toString());
@@ -158,9 +211,17 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
     }
 
     public String asString() {
+        if (isAutoWithCustomValues()) {
+            return fuzziness.toString() + ":" + lowDistance + "," + highDistance; // e.g. "AUTO:4,7"
+        }
         return fuzziness.toString();
     }
 
+    /**
+     * Tells whether this is an AUTO fuzziness whose low or high threshold differs from the default.
+     */
+    private boolean isAutoWithCustomValues() {
+        if (fuzziness.startsWith("AUTO") == false) {
+            return false;
+        }
+        return lowDistance != DEFAULT_LOW_DISTANCE || highDistance != DEFAULT_HIGH_DISTANCE;
+    }
+
     @Override
     public boolean equals(Object obj) {
         if (this == obj) {

+ 38 - 13
core/src/test/java/org/elasticsearch/common/unit/FuzzinessTests.java

@@ -18,6 +18,7 @@
  */
 package org.elasticsearch.common.unit;
 
+import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -49,8 +50,8 @@ public class FuzzinessTests extends ESTestCase {
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_NUMBER));
-                Fuzziness parse = Fuzziness.parse(parser);
-                assertThat(parse.asFloat(), equalTo(floatValue));
+                Fuzziness fuzziness = Fuzziness.parse(parser);
+                assertThat(fuzziness.asFloat(), equalTo(floatValue));
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
             }
             {
@@ -67,21 +68,21 @@ public class FuzzinessTests extends ESTestCase {
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
                 assertThat(parser.nextToken(), anyOf(equalTo(XContentParser.Token.VALUE_NUMBER), equalTo(XContentParser.Token.VALUE_STRING)));
-                Fuzziness parse = Fuzziness.parse(parser);
+                Fuzziness fuzziness = Fuzziness.parse(parser);
                 if (value.intValue() >= 1) {
-                    assertThat(parse.asDistance(), equalTo(Math.min(2, value.intValue())));
+                    assertThat(fuzziness.asDistance(), equalTo(Math.min(2, value.intValue())));
                 }
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
                 if (intValue.equals(value)) {
                     switch (intValue) {
                         case 1:
-                            assertThat(parse, sameInstance(Fuzziness.ONE));
+                            assertThat(fuzziness, sameInstance(Fuzziness.ONE));
                             break;
                         case 2:
-                            assertThat(parse, sameInstance(Fuzziness.TWO));
+                            assertThat(fuzziness, sameInstance(Fuzziness.TWO));
                             break;
                         case 0:
-                            assertThat(parse, sameInstance(Fuzziness.ZERO));
+                            assertThat(fuzziness, sameInstance(Fuzziness.ZERO));
                             break;
                         default:
                             break;
@@ -90,19 +91,26 @@ public class FuzzinessTests extends ESTestCase {
             }
             {
                 XContentBuilder json;
-                if (randomBoolean()) {
+                boolean isDefaultAutoFuzzinessTested = randomBoolean();
+                if (isDefaultAutoFuzzinessTested) {
                     json = Fuzziness.AUTO.toXContent(jsonBuilder().startObject(), null).endObject();
                 } else {
+                    String auto = randomBoolean() ? "AUTO" : "auto";
+                    if (randomBoolean()) {
+                        auto += ":" + randomIntBetween(1, 3) + "," + randomIntBetween(4, 10);
+                    }
                     json = jsonBuilder().startObject()
-                            .field(Fuzziness.X_FIELD_NAME, randomBoolean() ? "AUTO" : "auto")
-                            .endObject();
+                        .field(Fuzziness.X_FIELD_NAME, auto)
+                        .endObject();
                 }
                 XContentParser parser = createParser(json);
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_STRING));
-                Fuzziness parse = Fuzziness.parse(parser);
-                assertThat(parse, sameInstance(Fuzziness.AUTO));
+                Fuzziness fuzziness = Fuzziness.parse(parser);
+                if (isDefaultAutoFuzzinessTested) {
+                    assertThat(fuzziness, sameInstance(Fuzziness.AUTO));
+                }
                 assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
             }
         }
@@ -132,13 +140,30 @@ public class FuzzinessTests extends ESTestCase {
         assertEquals(fuzziness, deserializedFuzziness);
     }
 
-    public void testSerializationAuto() throws IOException {
+    public void testSerializationDefaultAuto() throws IOException {
         Fuzziness fuzziness = Fuzziness.AUTO; // the shared AUTO constant, i.e. no custom low/high thresholds
         Fuzziness deserializedFuzziness = doSerializeRoundtrip(fuzziness);
         assertEquals(fuzziness, deserializedFuzziness);
         assertEquals(fuzziness.asFloat(), deserializedFuzziness.asFloat(), 0f);
     }
 
+    /** Parses a custom {@code AUTO:4,7} fuzziness from JSON and checks that it survives a serialization round trip. */
+    public void testSerializationCustomAuto() throws IOException {
+        XContentBuilder source = jsonBuilder().startObject().field(Fuzziness.X_FIELD_NAME, "AUTO:4,7").endObject();
+        XContentParser parser = createParser(source);
+
+        // advance the parser onto the string value that holds the fuzziness
+        assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
+        assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
+        assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_STRING));
+        Fuzziness parsed = Fuzziness.parse(parser);
+
+        Fuzziness roundTripped = doSerializeRoundtrip(parsed);
+        assertEquals(parsed, roundTripped);
+        assertEquals(parsed.asString(), roundTripped.asString());
+    }
+
     private static Fuzziness doSerializeRoundtrip(Fuzziness in) throws IOException {
         BytesStreamOutput output = new BytesStreamOutput();
         in.writeTo(output);

+ 87 - 0
core/src/test/java/org/elasticsearch/index/query/FuzzyQueryBuilderTests.java

@@ -23,6 +23,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.Query;
+import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.ParsingException;
 import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.search.internal.SearchContext;
@@ -120,6 +121,92 @@ public class FuzzyQueryBuilderTests extends AbstractQueryTestCase<FuzzyQueryBuil
         assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
     }
 
+    public void testToQueryWithStringFieldDefinedFuzziness() throws IOException {
+        assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
+        String query = "{\n" +
+            "    \"fuzzy\":{\n" +
+            "        \"" + STRING_FIELD_NAME + "\":{\n" +
+            "            \"value\":\"sh\",\n" +
+            "            \"fuzziness\": \"AUTO:2,5\",\n" + // custom low/high thresholds instead of the defaults
+            "            \"prefix_length\":1,\n" +
+            "            \"boost\":2.0\n" +
+            "        }\n" +
+            "    }\n" +
+            "}";
+        Query parsedQuery = parseQuery(query).toQuery(createShardContext());
+        assertThat(parsedQuery, instanceOf(BoostQuery.class)); // the boost wraps the fuzzy query
+        BoostQuery boostQuery = (BoostQuery) parsedQuery;
+        assertThat(boostQuery.getBoost(), equalTo(2.0f));
+        assertThat(boostQuery.getQuery(), instanceOf(FuzzyQuery.class));
+        FuzzyQuery fuzzyQuery = (FuzzyQuery) boostQuery.getQuery();
+        assertThat(fuzzyQuery.getTerm(), equalTo(new Term(STRING_FIELD_NAME, "sh")));
+        assertThat(fuzzyQuery.getMaxEdits(), equalTo(1)); // the two-letter term "sh" with AUTO:2,5 is expected to allow one edit
+        assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
+    }
+
+    /**
+     * Checks that malformed custom AUTO fuzziness values are rejected: a missing high limit
+     * ({@code AUTO:2} and {@code AUTO:1,}), a negative low limit ({@code AUTO:-1,6}) and a missing
+     * low limit ({@code AUTO:,5}).
+     */
+    public void testToQueryWithStringFieldDefinedWrongFuzziness() throws IOException {
+        assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
+        String queryMissingFuzzinessUpLimit = "{\n" +
+            "    \"fuzzy\":{\n" +
+            "        \"" + STRING_FIELD_NAME + "\":{\n" +
+            "            \"value\":\"sh\",\n" +
+            "            \"fuzziness\": \"AUTO:2\",\n" +
+            "            \"prefix_length\":1,\n" +
+            "            \"boost\":2.0\n" +
+            "        }\n" +
+            "    }\n" +
+            "}";
+        ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class,
+            () -> parseQuery(queryMissingFuzzinessUpLimit).toQuery(createShardContext()));
+        String msg = "failed to find low and high distance values";
+        assertTrue(e.getMessage() + " didn't contain: " + msg + " but: " + e.getMessage(), e.getMessage().contains(msg));
+
+        String queryHavingNegativeFuzzinessLowLimit = "{\n" +
+            "    \"fuzzy\":{\n" +
+            "        \"" + STRING_FIELD_NAME + "\":{\n" +
+            "            \"value\":\"sh\",\n" +
+            "            \"fuzziness\": \"AUTO:-1,6\",\n" +
+            "            \"prefix_length\":1,\n" +
+            "            \"boost\":2.0\n" +
+            "        }\n" +
+            "    }\n" +
+            "}";
+        String msg2 = "fuzziness wrongly configured";
+        IllegalArgumentException e2 = expectThrows(IllegalArgumentException.class,
+            () -> parseQuery(queryHavingNegativeFuzzinessLowLimit).toQuery(createShardContext()));
+        // verify the exception raised by THIS query (e2 / msg2), not the one from the previous case
+        assertTrue(e2.getMessage() + " didn't contain: " + msg2 + " but: " + e2.getMessage(),
+            e2.getMessage().contains(msg2));
+
+        String queryMissingFuzzinessUpLimit2 = "{\n" +
+            "    \"fuzzy\":{\n" +
+            "        \"" + STRING_FIELD_NAME + "\":{\n" +
+            "            \"value\":\"sh\",\n" +
+            "            \"fuzziness\": \"AUTO:1,\",\n" +
+            "            \"prefix_length\":1,\n" +
+            "            \"boost\":2.0\n" +
+            "        }\n" +
+            "    }\n" +
+            "}";
+        e = expectThrows(ElasticsearchParseException.class,
+            () -> parseQuery(queryMissingFuzzinessUpLimit2).toQuery(createShardContext()));
+        assertTrue(e.getMessage() + " didn't contain: " + msg + " but: " + e.getMessage(), e.getMessage().contains(msg));
+
+        String queryMissingFuzzinessLowLimit = "{\n" +
+            "    \"fuzzy\":{\n" +
+            "        \"" + STRING_FIELD_NAME + "\":{\n" +
+            "            \"value\":\"sh\",\n" +
+            "            \"fuzziness\": \"AUTO:,5\",\n" +
+            "            \"prefix_length\":1,\n" +
+            "            \"boost\":2.0\n" +
+            "        }\n" +
+            "    }\n" +
+            "}";
+        e = expectThrows(ElasticsearchParseException.class,
+            () -> parseQuery(queryMissingFuzzinessLowLimit).toQuery(createShardContext()));
+        msg = "failed to parse [AUTO:,5] as a \"auto:int,int\"";
+        assertTrue(e.getMessage() + " didn't contain: " + msg + " but: " + e.getMessage(), e.getMessage().contains(msg));
+    }
+
     public void testToQueryWithNumericField() throws IOException {
         assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
         String query = "{\n" +

+ 3 - 1
docs/reference/api-conventions.asciidoc

@@ -577,7 +577,9 @@ the maximum allowed Levenshtein Edit Distance (or number of edits)
 `AUTO`::
 +
 --
-generates an edit distance based on the length of the term. For lengths:
+generates an edit distance based on the length of the term.
+Low and high distance arguments may optionally be provided as `AUTO:[low],[high]`. If not specified,
+the default values are 3 and 6, equivalent to `AUTO:3,6`, which gives the following for lengths:
 
 `0..2`:: must match exactly
 `3..5`:: one edit allowed