Browse Source

[ML] Lift limit of max number of classes for classification to 100 (#89755)

Limit was previously set to `30`. After the improvements in elastic/ml-cpp#2395
we now raist the limit to `100`.
Dimitris Athanasiou 3 years ago
parent
commit
b5504ea701

+ 1 - 1
docs/reference/ml/df-analytics/apis/put-dfanalytics.asciidoc

@@ -105,7 +105,7 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=class-assignment-objective]
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=dependent-variable]
 +
 The data type of the field must be numeric (`integer`, `short`, `long`, `byte`),
-categorical (`ip` or `keyword`), or boolean. There must be no more than 30
+categorical (`ip` or `keyword`), or boolean. There must be no more than 100
 different values in this field.
 
 `downsample_factor`::::

+ 1 - 1
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/analyses/Classification.java

@@ -68,7 +68,7 @@ public class Classification implements DataFrameAnalysis {
     /**
      * The max number of classes classification supports
      */
-    public static final int MAX_DEPENDENT_VARIABLE_CARDINALITY = 30;
+    public static final int MAX_DEPENDENT_VARIABLE_CARDINALITY = 100;
 
     @SuppressWarnings("unchecked")
     private static ConstructingObjectParser<Classification, Void> createParser(boolean lenient) {

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/dataframe/analyses/ClassificationTests.java

@@ -411,7 +411,7 @@ public class ClassificationTests extends AbstractBWCSerializationTestCase<Classi
         assertThat(constraints.size(), equalTo(1));
         assertThat(constraints.get(0).getField(), equalTo(classification.getDependentVariable()));
         assertThat(constraints.get(0).getLowerBound(), equalTo(2L));
-        assertThat(constraints.get(0).getUpperBound(), equalTo(30L));
+        assertThat(constraints.get(0).getUpperBound(), equalTo(100L));
     }
 
     public void testGetResultMappings_DependentVariableMappingIsAbsent() {

+ 13 - 7
x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/ClassificationIT.java

@@ -14,7 +14,6 @@ import org.elasticsearch.action.bulk.BulkRequestBuilder;
 import org.elasticsearch.action.bulk.BulkResponse;
 import org.elasticsearch.action.delete.DeleteResponse;
 import org.elasticsearch.action.get.GetResponse;
-import org.elasticsearch.action.index.IndexAction;
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.support.WriteRequest;
@@ -642,17 +641,24 @@ public class ClassificationIT extends MlNativeDataFrameAnalyticsIntegTestCase {
 
         ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, () -> startAnalytics(jobId));
         assertThat(e.status().getStatus(), equalTo(400));
-        assertThat(e.getMessage(), equalTo("Field [keyword-field] must have at most [30] distinct values but there were at least [31]"));
+        assertThat(e.getMessage(), equalTo("Field [keyword-field] must have at most [100] distinct values but there were at least [101]"));
     }
 
     public void testDependentVariableCardinalityTooHighButWithQueryMakesItWithinRange() throws Exception {
         initialize("cardinality_too_high_with_query");
         indexData(sourceIndex, 6, 5, KEYWORD_FIELD);
-        // Index one more document with a class different than the two already used.
-        client().execute(
-            IndexAction.INSTANCE,
-            new IndexRequest(sourceIndex).source(KEYWORD_FIELD, "fox").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE)
-        ).actionGet();
+
+        // Index enough documents to have more classes than the allowed limit
+        BulkRequestBuilder bulkRequestBuilder = client().prepareBulk().setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
+        for (int i = 0; i < Classification.MAX_DEPENDENT_VARIABLE_CARDINALITY - 1; i++) {
+            IndexRequest indexRequest = new IndexRequest(sourceIndex).source(KEYWORD_FIELD, "fox-" + i);
+            bulkRequestBuilder.add(indexRequest);
+        }
+        BulkResponse bulkResponse = bulkRequestBuilder.get();
+        if (bulkResponse.hasFailures()) {
+            fail("Failed to index data: " + bulkResponse.buildFailureMessage());
+        }
+
         QueryBuilder query = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(KEYWORD_FIELD, KEYWORD_FIELD_VALUES));
 
         DataFrameAnalyticsConfig config = buildAnalytics(jobId, sourceIndex, destIndex, null, new Classification(KEYWORD_FIELD), query);

+ 2 - 2
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java

@@ -409,11 +409,11 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
             buildClassificationConfig("some_keyword"),
             100,
             fieldCapabilities,
-            Collections.singletonMap("some_keyword", 31L)
+            Collections.singletonMap("some_keyword", 101L)
         );
         ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
 
-        assertThat(e.getMessage(), equalTo("Field [some_keyword] must have at most [30] distinct values but there were at least [31]"));
+        assertThat(e.getMessage(), equalTo("Field [some_keyword] must have at most [100] distinct values but there were at least [101]"));
     }
 
     public void testDetect_GivenIgnoredField() {