Răsfoiți Sursa

[ML] Validate at least one feature is available for DF analytics (#55876)

We were previously checking at least one supported field existed
when the _explain API was called. However, in the case of analyses
with required fields (e.g. regression) we were not accounting that
the dependent variable is not a feature and thus if the source index
only contains the dependent variable field there are no features to
train a model on.

This commit adds a validation that at least one feature is available
for analysis. Note that we also move that validation away from
`ExtractedFieldsDetector` and the _explain API and straight into
the _start API. The reason for doing this is to allow the user to use
the _explain API in order to understand why they would be seeing an
error like this one.

For example, the user might be using an index that has fields but
they are of unsupported types. If they start the job and get
an error that there are no features, they will wonder why that is.
Calling the _explain API will show them that all their fields are
unsupported. If the _explain API was failing instead, there would
be no way for the user to understand why all those fields are
ignored.

Closes #55593
Dimitris Athanasiou 5 ani în urmă
părinte
comite
0666be5e8d

+ 2 - 1
x-pack/plugin/ml/qa/ml-with-security/build.gradle

@@ -187,7 +187,8 @@ integTest.runner {
     'ml/preview_datafeed/Test preview missing datafeed',
     'ml/revert_model_snapshot/Test revert model with invalid snapshotId',
     'ml/start_data_frame_analytics/Test start given missing source index',
-    'ml/start_data_frame_analytics/Test start given source index has no compatible fields',
+    'ml/start_data_frame_analytics/Test start outlier_detection given source index has no fields',
+    'ml/start_data_frame_analytics/Test start regression given source index only has dependent variable',
     'ml/start_data_frame_analytics/Test start with inconsistent body/param ids',
     'ml/start_data_frame_analytics/Test start given dest index is not empty',
     'ml/start_data_frame_analytics/Test start with compatible fields but no data',

+ 35 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportStartDataFrameAnalyticsAction.java

@@ -61,6 +61,7 @@ import org.elasticsearch.xpack.core.ml.action.StartDataFrameAnalyticsAction;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsTaskState;
+import org.elasticsearch.xpack.core.ml.dataframe.analyses.RequiredField;
 import org.elasticsearch.xpack.core.ml.job.messages.Messages;
 import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
@@ -83,7 +84,9 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Set;
 import java.util.function.Predicate;
+import java.util.stream.Collectors;
 
 import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN;
 import static org.elasticsearch.xpack.core.ClientHelper.executeAsyncWithOrigin;
@@ -244,7 +247,7 @@ public class TransportStartDataFrameAnalyticsAction
         ParentTaskAssigningClient parentTaskClient = new ParentTaskAssigningClient(client, task.getParentTaskId());
         // Step 7. Validate that there are analyzable data in the source index
         ActionListener<StartContext> validateMappingsMergeListener = ActionListener.wrap(
-            startContext -> validateSourceIndexHasRows(startContext, finalListener),
+            startContext -> validateSourceIndexHasAnalyzableData(startContext, finalListener),
             finalListener::onFailure
         );
 
@@ -319,6 +322,37 @@ public class TransportStartDataFrameAnalyticsAction
         configProvider.get(id, getConfigListener);
     }
 
+    private void validateSourceIndexHasAnalyzableData(StartContext startContext, ActionListener<StartContext> listener) {
+        ActionListener<Void> validateAtLeastOneAnalyzedFieldListener = ActionListener.wrap(
+            aVoid -> validateSourceIndexHasRows(startContext, listener),
+            listener::onFailure
+        );
+
+        validateSourceIndexHasAtLeastOneAnalyzedField(startContext, validateAtLeastOneAnalyzedFieldListener);
+    }
+
+    private void validateSourceIndexHasAtLeastOneAnalyzedField(StartContext startContext, ActionListener<Void> listener) {
+        Set<String> requiredFields = startContext.config.getAnalysis().getRequiredFields().stream()
+            .map(RequiredField::getName)
+            .collect(Collectors.toSet());
+
+        // We assume here that required fields are not features
+        long nonRequiredFieldsCount = startContext.extractedFields.getAllFields().stream()
+            .filter(extractedField -> requiredFields.contains(extractedField.getName()) == false)
+            .count();
+        if (nonRequiredFieldsCount == 0) {
+            StringBuilder msgBuilder = new StringBuilder("at least one field must be included in the analysis");
+            if (requiredFields.isEmpty() == false) {
+                msgBuilder.append(" (excluding fields ")
+                    .append(requiredFields)
+                    .append(")");
+            }
+            listener.onFailure(ExceptionsHelper.badRequestException(msgBuilder.toString()));
+        } else {
+            listener.onResponse(null);
+        }
+    }
+
     private void validateSourceIndexHasRows(StartContext startContext, ActionListener<StartContext> listener) {
         DataFrameDataExtractorFactory extractorFactory = DataFrameDataExtractorFactory.createForSourceIndices(client,
             "validate_source_index_has_rows-" + startContext.config.getId(),

+ 0 - 6
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java

@@ -95,12 +95,6 @@ public class ExtractedFieldsDetector {
         }
         includeAndExcludeFields(fields, fieldSelection);
 
-        if (fields.isEmpty()) {
-            throw ExceptionsHelper.badRequestException("No compatible fields could be detected in index {}. Supported types are {}.",
-                Arrays.toString(index),
-                getSupportedTypes());
-        }
-
         return fields;
     }
 

+ 23 - 12
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java

@@ -37,6 +37,7 @@ import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
@@ -91,10 +92,14 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
 
         ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
         SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
-        ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
+        Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
 
-        assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]." +
-            " Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
+        assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
+        assertThat(fieldExtraction.v2().size(), equalTo(1));
+        assertThat(fieldExtraction.v2().get(0).getName(), equalTo("some_keyword"));
+        assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
+        assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
+            "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
     }
 
     public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericTypes() {
@@ -103,10 +108,14 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
 
         ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
             SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
-        ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
+        Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
 
-        assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " +
-            "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
+        assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
+        assertThat(fieldExtraction.v2().size(), equalTo(1));
+        assertThat(fieldExtraction.v2().get(0).getName(), equalTo("indecisive_field"));
+        assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
+        assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
+            "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
     }
 
     public void testDetect_GivenOutlierDetectionAndMultipleFields() {
@@ -306,10 +315,10 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
 
         ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
             SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
-        ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
+        Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
 
-        assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " +
-            "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
+        assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
+        assertThat(fieldExtraction.v2().isEmpty(), is(true));
     }
 
     public void testDetect_GivenIncludedIgnoredField() {
@@ -410,9 +419,11 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
 
         ExtractedFieldsDetector extractedFieldsDetector = new ExtractedFieldsDetector(
             SOURCE_INDEX, buildOutlierDetectionConfig(), 100, fieldCapabilities, Collections.emptyMap());
-        ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
-        assertThat(e.getMessage(), equalTo("No compatible fields could be detected in index [source_index]. " +
-            "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
+        Tuple<ExtractedFields, List<FieldSelection>> fieldExtraction = extractedFieldsDetector.detect();
+
+        assertThat(fieldExtraction.v1().getAllFields().isEmpty(), is(true));
+        assertThat(fieldExtraction.v2().size(), equalTo(2));
+        assertThat(fieldExtraction.v2().stream().filter(FieldSelection::isIncluded).findAny().isPresent(), is(false));
     }
 
     public void testDetect_GivenInclusionsAndExclusions() {

+ 36 - 2
x-pack/plugin/src/test/resources/rest-api-spec/test/ml/start_data_frame_analytics.yml

@@ -36,7 +36,7 @@
         id: "missing_index"
 
 ---
-"Test start given source index has no compatible fields":
+"Test start outlier_detection given source index has no fields":
 
   - do:
       indices.create:
@@ -57,7 +57,41 @@
           }
 
   - do:
-      catch: /No compatible fields could be detected in index \[empty-index\]/
+      catch: /at least one field must be included in the analysis/
+      ml.start_data_frame_analytics:
+        id: "foo"
+
+---
+"Test start regression given source index only has dependent variable":
+
+  - do:
+      indices.create:
+        index: index-with-dep-var-only
+        body:
+          mappings:
+            properties:
+              my_dep_var: { type: "long" }
+
+  - do:
+      ml.put_data_frame_analytics:
+        id: "foo"
+        body: >
+          {
+            "source": {
+              "index": "index-with-dep-var-only"
+            },
+            "dest": {
+              "index": "results"
+            },
+            "analysis": {
+              "regression":{
+                "dependent_variable": "my_dep_var"
+              }
+            }
+          }
+
+  - do:
+      catch: /at least one field must be included in the analysis \(excluding fields \[my_dep_var\]\)/
       ml.start_data_frame_analytics:
         id: "foo"