Browse Source

[ML] Support the unsigned_long type in data frame analytics (#64066)

Adds support for the unsigned_long type to data frame analytics.

This type is handled in the same way as the long type. Values
sent to the ML native processes are converted to floats and
hence will lose accuracy when they fall outside the range in
which a float can represent every integer value exactly.

Relates #60050
David Roberts 5 years ago
parent
commit
55fe93ebe5

+ 4 - 3
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/dataframe/analyses/Types.java

@@ -28,12 +28,13 @@ public final class Types {
             .collect(Collectors.toUnmodifiableSet());
 
     private static final Set<String> NUMERICAL_TYPES =
-        Stream.concat(Stream.of(NumberType.values()).map(NumberType::typeName), Stream.of("scaled_float"))
+        Stream.concat(Stream.of(NumberType.values()).map(NumberType::typeName), Stream.of("scaled_float", "unsigned_long"))
             .collect(Collectors.toUnmodifiableSet());
 
     private static final Set<String> DISCRETE_NUMERICAL_TYPES =
-        Stream.of(NumberType.BYTE, NumberType.SHORT, NumberType.INTEGER, NumberType.LONG)
-            .map(NumberType::typeName)
+        Stream.concat(
+            Stream.of(NumberType.BYTE, NumberType.SHORT, NumberType.INTEGER, NumberType.LONG).map(NumberType::typeName),
+            Stream.of("unsigned_long"))
             .collect(Collectors.toUnmodifiableSet());
 
     private static final Set<String> BOOL_TYPES = Collections.singleton(BooleanFieldMapper.CONTENT_TYPE);

+ 1 - 1
x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/DataFrameAnalysisCustomFeatureIT.java

@@ -167,7 +167,7 @@ public class DataFrameAnalysisCustomFeatureIT extends MlNativeDataFrameAnalytics
             "          \"type\": \"double\"\n" +
             "        }," +
             "        \""+ DISCRETE_NUMERICAL_FIELD + "\": {\n" +
-            "          \"type\": \"integer\"\n" +
+            "          \"type\": \"unsigned_long\"\n" +
             "        }," +
             "        \""+ TEXT_FIELD + "\": {\n" +
             "          \"type\": \"text\"\n" +

+ 2 - 2
x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/ExplainDataFrameAnalyticsIT.java

@@ -52,7 +52,7 @@ public class ExplainDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsInteg
         client().admin().indices().prepareCreate(sourceIndex)
             .setMapping(
                 "numeric_1", "type=double",
-                "numeric_2", "type=float",
+                "numeric_2", "type=unsigned_long",
                 "categorical", "type=keyword",
                 "filtered_field", "type=keyword")
             .get();
@@ -64,7 +64,7 @@ public class ExplainDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsInteg
             IndexRequest indexRequest = new IndexRequest(sourceIndex);
             indexRequest.source(
                 "numeric_1", 1.0,
-                "numeric_2", 2.0,
+                "numeric_2", 2,
                 "categorical", i % 2 == 0 ? "class_1" : "class_2",
                 "filtered_field", i < 2 ? "bingo" : "rest"); // We tag bingo on the first two docs to ensure we have 2 classes
             bulkRequestBuilder.add(indexRequest);

+ 1 - 1
x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RegressionIT.java

@@ -698,7 +698,7 @@ public class RegressionIT extends MlNativeDataFrameAnalyticsIntegTestCase {
             "          \"type\": \"double\"\n" +
             "        }," +
             "        \"" + DISCRETE_NUMERICAL_FEATURE_FIELD + "\": {\n" +
-            "          \"type\": \"long\"\n" +
+            "          \"type\": \"unsigned_long\"\n" +
             "        }," +
             "        \"" + DEPENDENT_VARIABLE_FIELD + "\": {\n" +
             "          \"type\": \"double\"\n" +

+ 2 - 2
x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/RunDataFrameAnalyticsIT.java

@@ -71,7 +71,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
         String sourceIndex = "test-outlier-detection-with-few-docs";
 
         client().admin().indices().prepareCreate(sourceIndex)
-            .setMapping("numeric_1", "type=double", "numeric_2", "type=float", "categorical_1", "type=keyword")
+            .setMapping("numeric_1", "type=double", "numeric_2", "type=unsigned_long", "categorical_1", "type=keyword")
             .get();
 
         BulkRequestBuilder bulkRequestBuilder = client().prepareBulk();
@@ -83,7 +83,7 @@ public class RunDataFrameAnalyticsIT extends MlNativeDataFrameAnalyticsIntegTest
             // We insert one odd value out of 5 for one feature
             String docId = i == 0 ? "outlier" : "normal" + i;
             indexRequest.id(docId);
-            indexRequest.source("numeric_1", i == 0 ? 100.0 : 1.0, "numeric_2", 1.0, "categorical_1", "foo_" + i);
+            indexRequest.source("numeric_1", i == 0 ? 100.0 : 1.0, "numeric_2", 1, "categorical_1", "foo_" + i);
             bulkRequestBuilder.add(indexRequest);
         }
         BulkResponse bulkResponse = bulkRequestBuilder.get();

+ 11 - 9
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetectorTests.java

@@ -105,7 +105,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
         assertThat(fieldExtraction.v2().get(0).getName(), equalTo("some_keyword"));
         assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
         assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
-            "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
+            "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"));
     }
 
     public void testDetect_GivenOutlierDetectionAndFieldWithNumericAndNonNumericTypes() {
@@ -121,7 +121,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
         assertThat(fieldExtraction.v2().get(0).getName(), equalTo("indecisive_field"));
         assertThat(fieldExtraction.v2().get(0).isIncluded(), is(false));
         assertThat(fieldExtraction.v2().get(0).getReason(), equalTo("unsupported type; supported types are " +
-            "[boolean, byte, double, float, half_float, integer, long, scaled_float, short]"));
+            "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"));
     }
 
     public void testDetect_GivenOutlierDetectionAndMultipleFields() {
@@ -147,7 +147,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
             FieldSelection.included("some_boolean", Collections.singleton("boolean"), false, FieldSelection.FeatureType.NUMERICAL),
             FieldSelection.included("some_float", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
             FieldSelection.excluded("some_keyword", Collections.singleton("keyword"), "unsupported type; " +
-                "supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"),
+                "supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"),
             FieldSelection.included("some_long", Collections.singleton("long"), false, FieldSelection.FeatureType.NUMERICAL)
         );
     }
@@ -282,7 +282,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
         ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
 
         assertThat(e.getMessage(), equalTo("invalid types [keyword] for required field [foo]; " +
-            "expected types are [byte, double, float, half_float, integer, long, scaled_float, short]"));
+            "expected types are [byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"));
     }
 
     public void testDetect_GivenClassificationAndRequiredFieldHasInvalidType() {
@@ -298,7 +298,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
         ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
 
         assertThat(e.getMessage(), equalTo("invalid types [float] for required field [some_float]; " +
-            "expected types are [boolean, byte, integer, ip, keyword, long, short, text]"));
+            "expected types are [boolean, byte, integer, ip, keyword, long, short, text, unsigned_long]"));
     }
 
     public void testDetect_GivenClassificationAndDependentVariableHasInvalidCardinality() {
@@ -371,7 +371,8 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
 
         assertFieldSelectionContains(fieldExtraction.v2(),
             FieldSelection.excluded("categorical", Collections.singleton("keyword"),
-                "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"),
+                "unsupported type; supported types are " +
+                    "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"),
             FieldSelection.included("numeric", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL)
         );
     }
@@ -471,7 +472,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
         ElasticsearchStatusException e = expectThrows(ElasticsearchStatusException.class, extractedFieldsDetector::detect);
 
         assertThat(e.getMessage(), equalTo("field [your_keyword] has unsupported type [keyword]. " +
-            "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]."));
+            "Supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]."));
     }
 
     public void testDetect_GivenNotIncludedFieldHasUnsupportedType() {
@@ -492,7 +493,8 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
 
         assertFieldSelectionContains(fieldExtraction.v2(),
             FieldSelection.excluded("categorical", Collections.singleton("keyword"),
-                "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]"),
+                "unsupported type; supported types are " +
+                    "[boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]"),
             FieldSelection.included("numeric", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL)
         );
     }
@@ -517,7 +519,7 @@ public class ExtractedFieldsDetectorTests extends ESTestCase {
             FieldSelection.included("my_field1", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
             FieldSelection.included("your_field2", Collections.singleton("float"), false, FieldSelection.FeatureType.NUMERICAL),
             FieldSelection.excluded("your_keyword", Collections.singleton("keyword"), "unsupported type; supported types " +
-                "are [boolean, byte, double, float, half_float, integer, long, scaled_float, short]")
+                "are [boolean, byte, double, float, half_float, integer, long, scaled_float, short, unsigned_long]")
         );
     }
 

+ 2 - 2
x-pack/plugin/src/test/resources/rest-api-spec/test/ml/explain_data_frame_analytics.yml

@@ -225,7 +225,7 @@
   - match: { field_selection.2.is_included: false }
   - match: { field_selection.2.is_required: false }
   - is_false: field_selection.2.feature_type
-  - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" }
+  - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text, unsigned_long]" }
   - match: { field_selection.3.name: "field_4" }
   - match: { field_selection.3.mapping_types: ["text"] }
   - match: { field_selection.3.is_included: false }
@@ -299,7 +299,7 @@
   - match: { field_selection.2.is_included: false }
   - match: { field_selection.2.is_required: false }
   - is_false: field_selection.2.feature_type
-  - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text]" }
+  - match: { field_selection.2.reason: "unsupported type; supported types are [boolean, byte, double, float, half_float, integer, ip, keyword, long, scaled_float, short, text, unsigned_long]" }
   - match: { field_selection.3.name: "field_4" }
   - match: { field_selection.3.mapping_types: ["text"] }
   - match: { field_selection.3.is_included: false }