Browse Source

Allow fields with dots in sparse vector field mapper (#111981)

* Remove dot validation for sparse vector field mapper

* Update docs/changelog/111981.yaml

* Update changelog

* Fix test permissions

* PR feedback - yaml test

* PR feedback - remove non-dot values from sparse vector query in test to verify the dot is searched correctly

* Add additional test cases for field collissions

* Update docs/changelog/111981.yaml

* Update key for SparseVectorFieldMapper

* Fix test

* Update yaml test to include scores

* Update server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java

Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>

* Revert "Update server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java"

This reverts commit 58fc087535484698426d96d02dd45521fa43d519.

* PR feedback - escape dots

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>
Kathleen DeRusso 1 year ago
parent
commit
04d192921d

+ 6 - 0
docs/changelog/111981.yaml

@@ -0,0 +1,6 @@
+pr: 111981
+summary: Allow fields with dots in sparse vector field mapper
+area: Mapping
+type: enhancement
+issues:
+ - 109118

+ 2 - 6
server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java

@@ -177,15 +177,11 @@ public class SparseVectorFieldMapper extends FieldMapper {
             for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) {
                 if (token == Token.FIELD_NAME) {
                     feature = context.parser().currentName();
-                    if (feature.contains(".")) {
-                        throw new IllegalArgumentException(
-                            "[sparse_vector] fields do not support dots in feature names but found [" + feature + "]"
-                        );
-                    }
                 } else if (token == Token.VALUE_NULL) {
                     // ignore feature, this is consistent with numeric fields
                 } else if (token == Token.VALUE_NUMBER || token == Token.VALUE_STRING) {
-                    final String key = fullPath() + "." + feature;
+                    // Use a delimiter that won't collide with subfields & escape the dots in the feature name
+                    final String key = fullPath() + "\\." + feature.replace(".", "\\.");
                     float value = context.parser().floatValue(true);
 
                     // if we have an existing feature of the same name we'll select for the one with the max value

+ 21 - 7
server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java

@@ -111,12 +111,26 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
 
     public void testDotInFieldName() throws Exception {
         DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
-        DocumentParsingException ex = expectThrows(
-            DocumentParsingException.class,
-            () -> mapper.parse(source(b -> b.field("field", Map.of("politi.cs", 10, "sports", 20))))
-        );
-        assertThat(ex.getCause().getMessage(), containsString("do not support dots in feature names"));
-        assertThat(ex.getCause().getMessage(), containsString("politi.cs"));
+        ParsedDocument parsedDocument = mapper.parse(source(b -> b.field("field", Map.of("foo.bar", 10, "foobar", 20))));
+
+        List<IndexableField> fields = parsedDocument.rootDoc().getFields("field");
+        assertEquals(2, fields.size());
+        assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class));
+        FeatureField featureField1 = null;
+        FeatureField featureField2 = null;
+        for (IndexableField field : fields) {
+            if (field.stringValue().equals("foo.bar")) {
+                featureField1 = (FeatureField) field;
+            } else if (field.stringValue().equals("foobar")) {
+                featureField2 = (FeatureField) field;
+            } else {
+                throw new UnsupportedOperationException();
+            }
+        }
+
+        int freq1 = getFrequency(featureField1.tokenStream(null, null));
+        int freq2 = getFrequency(featureField2.tokenStream(null, null));
+        assertTrue(freq1 < freq2);
     }
 
     public void testHandlesMultiValuedFields() throws MapperParsingException, IOException {
@@ -156,7 +170,7 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
         }));
 
         // then validate that the generate document stored both values appropriately and we have only the max value stored
-        FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field.bar"));
+        FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field\\.bar"));
         assertEquals(20, barField.getFeatureValue(), 1);
 
         FeatureField storedBarField = ((FeatureField) doc1.rootDoc().getFields("foo.field").get(1));

+ 163 - 0
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml

@@ -313,3 +313,166 @@ setup:
               query: "octopus comforter smells"
 
   - match: { status: 400 }
+
+---
+"Search on a sparse_vector field with dots in the field names":
+
+  - requires:
+      cluster_features: [ "gte_v8.16.0" ]
+      reason: dots in field names allowed starting in in 8.16.0
+
+  - do:
+      indices.create:
+        index: index-with-sparse-vector2
+        body:
+          mappings:
+            properties:
+              ml.tokens:
+                type: sparse_vector
+
+  - match: { acknowledged: true }
+
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+      index:
+        index: index-with-sparse-vector2
+        id: "has-dots"
+        refresh: true
+        body:
+          ml:
+            tokens:
+              running: 2.4097164
+              good: 2.170997
+              run: 2.052153
+              race: 1.4575411
+              for: 1.1908325
+              5.0k: 2.489943
+
+  - match: { result: "created" }
+
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+      get:
+        index: index-with-sparse-vector2
+        id: "has-dots"
+
+  - match:
+      _source:
+        ml:
+          tokens:
+            running: 2.4097164
+            good: 2.170997
+            run: 2.052153
+            race: 1.4575411
+            for: 1.1908325
+            5.0k: 2.489943
+
+  - do:
+      search:
+        index: index-with-sparse-vector2
+        body:
+          query:
+            sparse_vector:
+              field: ml.tokens
+              query_vector:
+                5.0k: 2.489943
+
+  - match: { hits.total.value: 1 }
+
+---
+"Search on a nested sparse_vector field with dots in the field names and conflicting child fields":
+
+  - requires:
+      cluster_features: [ "gte_v8.16.0" ]
+      reason: dots in field names allowed starting in in 8.16.0
+
+  - do:
+      indices.create:
+        index: index-with-sparse-vector3
+        body:
+          mappings:
+            properties:
+              parent:
+                type: object
+                subobjects: false
+                properties:
+                  foo:
+                    type: sparse_vector
+                  foo.bar:
+                    type: sparse_vector
+
+  - match: { acknowledged: true }
+
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+        Content-Type: application/json
+      bulk:
+        index: index-with-sparse-vector3
+        refresh: true
+        body: |
+          {"index": { "_id": "parent-foo" }}
+          {"parent.foo": { "bar.baz": 1.0 }}
+          {"index": { "_id": "parent-foo-bar" }}
+          {"parent.foo.bar": { "baz": 2.0 }}
+          {"index": { "_id": "both-docs" }}
+          {"parent.foo": { "bar.baz": 3.0 }, "parent.foo.bar": { "baz": 4.0 }}
+
+
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+      get:
+        index: index-with-sparse-vector3
+        id: "parent-foo"
+
+  - match:
+      _source:
+        parent.foo:
+          bar.baz: 1.0
+
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+      get:
+        index: index-with-sparse-vector3
+        id: "parent-foo-bar"
+
+  - match:
+      _source:
+        parent.foo.bar:
+          baz: 2.0
+
+  - do:
+      search:
+        index: index-with-sparse-vector3
+        body:
+          query:
+            sparse_vector:
+              field: parent.foo
+              query_vector:
+                bar.baz: 1.0
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._id: "both-docs" }
+  - match: { hits.hits.0._score: 3.0 }
+  - match: { hits.hits.1._id: "parent-foo" }
+  - match: { hits.hits.1._score: 1.0 }
+
+  - do:
+      search:
+        index: index-with-sparse-vector3
+        body:
+          query:
+            sparse_vector:
+              field: parent.foo.bar
+              query_vector:
+                baz: 1.0
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._id: "both-docs" }
+  - match: { hits.hits.0._score: 4.0 }
+  - match: { hits.hits.1._id: "parent-foo-bar" }
+  - match: { hits.hits.1._score: 2.0 }