Parcourir la source

[9.1] Defer Semantic Text Failures on Pre-8.11 Indices (#135845) (#135869)

* Defer Semantic Text Failures on Pre-8.11 Indices (#135845)

(cherry picked from commit 66d924132c186333ec4f2232b170d0437974af58)

# Conflicts:
#	server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java
#	x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

* [CI] Auto commit changes from spotless

* Update comment

---------

Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
Mike Pellegrini il y a 3 jours
Parent
commit
90ee222e7e

+ 5 - 0
docs/changelog/135845.yaml

@@ -0,0 +1,5 @@
+pr: 135845
+summary: Fix for creating semantic_text fields on pre-8.11 indices crashing Elasticsearch
+area: Mapping
+type: bug
+issues: []

+ 1 - 2
server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapperTests.java

@@ -68,7 +68,6 @@ import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAUL
 import static org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase.randomNormalizedVector;
 import static org.elasticsearch.index.codec.vectors.IVFVectorsFormat.DYNAMIC_NPROBE;
 import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.DEFAULT_OVERSAMPLE;
-import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.INDEXED_BY_DEFAULT_INDEX_VERSION;
 import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.IVF_FORMAT;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
@@ -108,7 +107,7 @@ public class DenseVectorFieldMapperTests extends MapperTestCase {
         if (elementType != ElementType.FLOAT) {
             b.field("element_type", elementType.toString());
         }
-        if (indexVersion.onOrAfter(INDEXED_BY_DEFAULT_INDEX_VERSION) || indexed) {
+        if (indexVersion.onOrAfter(DenseVectorFieldMapper.INDEXED_BY_DEFAULT_INDEX_VERSION) || indexed) {
             // Serialize if it's new index version, or it was not the default for previous indices
             b.field("index", indexed);
         }

+ 40 - 14
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

@@ -125,7 +125,7 @@ import static org.elasticsearch.xpack.inference.services.elasticsearch.Elasticse
  */
 public class SemanticTextFieldMapper extends FieldMapper implements InferenceFieldMapper {
     private static final Logger logger = LogManager.getLogger(SemanticTextFieldMapper.class);
-    public static final String UNSUPPORTED_INDEX_MESSAGE = "[semantic_text] is available on indices created with 8.11 or higher.";
+
     public static final NodeFeature SEMANTIC_TEXT_IN_OBJECT_FIELD_FIX = new NodeFeature("semantic_text.in_object_field_fix");
     public static final NodeFeature SEMANTIC_TEXT_SINGLE_FIELD_UPDATE_FIX = new NodeFeature("semantic_text.single_field_update_fix");
     public static final NodeFeature SEMANTIC_TEXT_DELETE_FIX = new NodeFeature("semantic_text.delete_fix");
@@ -145,6 +145,12 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
     public static final String CONTENT_TYPE = "semantic_text";
     public static final String DEFAULT_ELSER_2_INFERENCE_ID = DEFAULT_ELSER_ID;
 
+    public static final String UNSUPPORTED_INDEX_MESSAGE = "["
+        + CONTENT_TYPE
+        + "] is available on indices created with 8.11 or higher. Please create a new index to use ["
+        + CONTENT_TYPE
+        + "]";
+
     public static final float DEFAULT_RESCORE_OVERSAMPLE = 3.0f;
 
     static final String INDEX_OPTIONS_FIELD = "index_options";
@@ -158,9 +164,6 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
 
     public static BiConsumer<String, MappingParserContext> validateParserContext(String type) {
         return (n, c) -> {
-            if (c.getIndexSettings().getIndexVersionCreated().before(NEW_SPARSE_VECTOR)) {
-                throw new UnsupportedOperationException(UNSUPPORTED_INDEX_MESSAGE);
-            }
             if (InferenceMetadataFieldsMapper.isEnabled(c.getIndexSettings().getSettings()) == false) {
                 notInMultiFields(type).accept(n, c);
             }
@@ -548,16 +551,33 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
         if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
             return null;
         }
+
+        SemanticTextField semanticTextField;
         boolean isWithinLeaf = context.path().isWithinLeafObject();
         try {
             context.path().setWithinLeafObject(true);
-            return SemanticTextField.parse(
+            semanticTextField = SemanticTextField.parse(
                 context.parser(),
                 new SemanticTextField.ParserContext(fieldType().useLegacyFormat, fullPath(), context.parser().contentType())
             );
         } finally {
             context.path().setWithinLeafObject(isWithinLeaf);
         }
+
+        IndexVersion indexCreatedVersion = context.indexSettings().getIndexVersionCreated();
+        if (semanticTextField != null
+            && semanticTextField.inference().modelSettings() != null
+            && indexCreatedVersion.before(NEW_SPARSE_VECTOR)) {
+            // Explicitly fail to parse semantic text fields that meet the following criteria:
+            // - Are in pre 8.11 indices
+            // - Have model settings, indicating that they have embeddings to be indexed
+            //
+            // We can't fail earlier than this because it causes pre 8.11 indices with semantic text fields to either be in red state or
+            // cause Elasticsearch to not launch.
+            throw new UnsupportedOperationException(UNSUPPORTED_INDEX_MESSAGE);
+        }
+
+        return semanticTextField;
     }
 
     void parseCreateFieldFromContext(DocumentParserContext context, SemanticTextField field, XContentLocation xContentLocation)
@@ -1158,17 +1178,23 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                     indexVersionCreated
                 );
 
-                SimilarityMeasure similarity = modelSettings.similarity();
-                if (similarity != null) {
-                    switch (similarity) {
-                        case COSINE -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.COSINE);
-                        case DOT_PRODUCT -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.DOT_PRODUCT);
-                        case L2_NORM -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.L2_NORM);
-                        default -> throw new IllegalArgumentException(
-                            "Unknown similarity measure in model_settings [" + similarity.name() + "]"
-                        );
+                // Skip setting similarity on pre 8.11 indices. It causes dense vector field creation to fail because similarity can only be
+                // set on indexed fields, which is not done by default prior to 8.11. The fact that the dense vector field is partially
+                // configured is moot because we will explicitly fail to index docs into this semantic text field anyways.
+                if (indexVersionCreated.onOrAfter(NEW_SPARSE_VECTOR)) {
+                    SimilarityMeasure similarity = modelSettings.similarity();
+                    if (similarity != null) {
+                        switch (similarity) {
+                            case COSINE -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.COSINE);
+                            case DOT_PRODUCT -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.DOT_PRODUCT);
+                            case L2_NORM -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.L2_NORM);
+                            default -> throw new IllegalArgumentException(
+                                "Unknown similarity measure in model_settings [" + similarity.name() + "]"
+                            );
+                        }
                     }
                 }
+
                 denseVectorMapperBuilder.dimensions(modelSettings.dimensions());
                 denseVectorMapperBuilder.elementType(modelSettings.elementType());
                 if (indexOptions != null) {

+ 93 - 51
x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java

@@ -403,57 +403,6 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         }
     }
 
-    @Override
-    protected IndexVersion boostNotAllowedIndexVersion() {
-        return IndexVersions.NEW_SPARSE_VECTOR;
-    }
-
-    public void testOldIndexSemanticTextDenseVectorRaisesError() throws IOException {
-        final String fieldName = "field";
-        final XContentBuilder fieldMapping = fieldMapping(b -> {
-            b.field("type", "semantic_text");
-            b.field(INFERENCE_ID_FIELD, "test_inference_id");
-            b.startObject("model_settings");
-            b.field("task_type", "text_embedding");
-            b.field("dimensions", 384);
-            b.field("similarity", "cosine");
-            b.field("element_type", "float");
-            b.endObject();
-        });
-        assertOldIndexUnsupported(fieldMapping);
-    }
-
-    public void testOldIndexSemanticTextMinimalMappingRaisesError() throws IOException {
-        final XContentBuilder fieldMapping = fieldMapping(this::minimalMapping);
-        assertOldIndexUnsupported(fieldMapping);
-    }
-
-    public void testOldIndexSemanticTextSparseVersionRaisesError() throws IOException {
-        final XContentBuilder fieldMapping = fieldMapping(b -> {
-            b.field("type", "semantic_text");
-            b.field("inference_id", "another_inference_id");
-            b.startObject("model_settings");
-            b.field("task_type", "sparse_embedding");
-            b.endObject();
-        });
-        assertOldIndexUnsupported(fieldMapping);
-    }
-
-    private void assertOldIndexUnsupported(XContentBuilder fieldMapping) {
-
-        MapperParsingException exception = assertThrows(
-            MapperParsingException.class,
-            () -> createMapperService(
-                fieldMapping,
-                true,
-                IndexVersions.V_8_0_0,
-                IndexVersionUtils.getPreviousVersion(IndexVersions.NEW_SPARSE_VECTOR)
-            )
-        );
-        assertTrue(exception.getMessage().contains(UNSUPPORTED_INDEX_MESSAGE));
-        assertTrue(exception.getRootCause() instanceof UnsupportedOperationException);
-    }
-
     public void testMultiFieldsSupport() throws IOException {
         if (useLegacyFormat) {
             Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> {
@@ -1134,6 +1083,99 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         assertThat(ex.getMessage(), containsString("[model_settings] must be set for field [field] when chunks are provided"));
     }
 
+    public void testPre811IndexSemanticTextDenseVectorRaisesError() throws IOException {
+        Model model = TestModel.createRandomInstance(TaskType.TEXT_EMBEDDING);
+        String fieldName = randomAlphaOfLength(8);
+
+        MapperService mapperService = createMapperService(
+            mapping(
+                b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", model.getInferenceEntityId()).endObject()
+            ),
+            true,
+            IndexVersions.V_8_0_0,
+            IndexVersionUtils.getPreviousVersion(IndexVersions.NEW_SPARSE_VECTOR)
+        );
+        assertSemanticTextField(mapperService, fieldName, false, null, null);
+
+        merge(
+            mapperService,
+            mapping(
+                b -> b.startObject(fieldName)
+                    .field("type", "semantic_text")
+                    .field("inference_id", model.getInferenceEntityId())
+                    .startObject("model_settings")
+                    .field("task_type", TaskType.TEXT_EMBEDDING.toString())
+                    .field("dimensions", model.getServiceSettings().dimensions())
+                    .field("similarity", model.getServiceSettings().similarity())
+                    .field("element_type", model.getServiceSettings().elementType())
+                    .endObject()
+                    .endObject()
+            )
+        );
+        assertSemanticTextField(mapperService, fieldName, true, null, null);
+
+        DocumentMapper documentMapper = mapperService.documentMapper();
+        DocumentParsingException e = assertThrows(
+            DocumentParsingException.class,
+            () -> documentMapper.parse(
+                source(
+                    b -> addSemanticTextInferenceResults(
+                        true,
+                        b,
+                        List.of(randomSemanticText(true, fieldName, model, null, List.of("foo", "bar"), XContentType.JSON))
+                    )
+                )
+            )
+        );
+        assertThat(e.getCause(), instanceOf(UnsupportedOperationException.class));
+        assertThat(e.getCause().getMessage(), equalTo(UNSUPPORTED_INDEX_MESSAGE));
+    }
+
+    public void testPre811IndexSemanticTextSparseVectorRaisesError() throws IOException {
+        Model model = TestModel.createRandomInstance(TaskType.SPARSE_EMBEDDING);
+        String fieldName = randomAlphaOfLength(8);
+
+        MapperService mapperService = createMapperService(
+            mapping(
+                b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", model.getInferenceEntityId()).endObject()
+            ),
+            true,
+            IndexVersions.V_8_0_0,
+            IndexVersionUtils.getPreviousVersion(IndexVersions.NEW_SPARSE_VECTOR)
+        );
+        assertSemanticTextField(mapperService, fieldName, false, null, null);
+
+        merge(
+            mapperService,
+            mapping(
+                b -> b.startObject(fieldName)
+                    .field("type", "semantic_text")
+                    .field("inference_id", model.getInferenceEntityId())
+                    .startObject("model_settings")
+                    .field("task_type", TaskType.SPARSE_EMBEDDING.toString())
+                    .endObject()
+                    .endObject()
+            )
+        );
+        assertSemanticTextField(mapperService, fieldName, true, null, null);
+
+        DocumentMapper documentMapper = mapperService.documentMapper();
+        DocumentParsingException e = assertThrows(
+            DocumentParsingException.class,
+            () -> documentMapper.parse(
+                source(
+                    b -> addSemanticTextInferenceResults(
+                        true,
+                        b,
+                        List.of(randomSemanticText(true, fieldName, model, null, List.of("foo", "bar"), XContentType.JSON))
+                    )
+                )
+            )
+        );
+        assertThat(e.getCause(), instanceOf(UnsupportedOperationException.class));
+        assertThat(e.getCause().getMessage(), equalTo(UNSUPPORTED_INDEX_MESSAGE));
+    }
+
     private MapperService mapperServiceForFieldWithModelSettings(String fieldName, String inferenceId, MinimalServiceSettings modelSettings)
         throws IOException {
         return mapperServiceForFieldWithModelSettings(fieldName, inferenceId, null, modelSettings);