Browse code

[ML] add new `custom` field to trained model processors (#59542)

This commit adds the new configurable field `custom`.

`custom` indicates if the preprocessor was submitted by a user or automatically created by the analytics job.

Eventually, this field will be used in calculating feature importance. When `custom` is true, the feature importance for
the processed fields is calculated. When `false`, the current behavior is unchanged (we calculate the importance for the originating field/feature).

This also adds new required methods to the preprocessor interface. If users are to supply their own preprocessors 
in the analytics job configuration, we need to know the input and output field names.
Benjamin Trent 5 years ago
Parent
Commit
b551f75ec3
18 files changed, with 433 insertions and 185 deletions
  1. 22 4
      client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/FrequencyEncoding.java
  2. 23 6
      client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java
  3. 23 5
      client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/TargetMeanEncoding.java
  4. 4 1
      client/rest-high-level/src/test/java/org/elasticsearch/client/ml/inference/preprocessing/FrequencyEncodingTests.java
  5. 1 1
      client/rest-high-level/src/test/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncodingTests.java
  6. 2 1
      client/rest-high-level/src/test/java/org/elasticsearch/client/ml/inference/preprocessing/TargetMeanEncodingTests.java
  7. 12 0
      docs/reference/ml/df-analytics/apis/put-inference.asciidoc
  8. 146 137
      docs/reference/ml/ml-shared.asciidoc
  9. 15 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/CustomWordEmbedding.java
  10. 35 4
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/FrequencyEncoding.java
  11. 38 6
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/OneHotEncoding.java
  12. 19 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/PreProcessor.java
  13. 35 4
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/TargetMeanEncoding.java
  14. 17 2
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/FrequencyEncodingTests.java
  15. 15 2
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/OneHotEncodingTests.java
  16. 16 2
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/TargetMeanEncodingTests.java
  17. 4 4
      x-pack/plugin/ml/src/internalClusterTest/java/org/elasticsearch/xpack/ml/integration/ModelInferenceActionIT.java
  18. 6 6
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/loadingservice/LocalModelTests.java

+ 22 - 4
client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/FrequencyEncoding.java

@@ -40,18 +40,20 @@ public class FrequencyEncoding implements PreProcessor {
     public static final ParseField FIELD = new ParseField("field");
     public static final ParseField FEATURE_NAME = new ParseField("feature_name");
     public static final ParseField FREQUENCY_MAP = new ParseField("frequency_map");
+    public static final ParseField CUSTOM = new ParseField("custom");
 
     @SuppressWarnings("unchecked")
     public static final ConstructingObjectParser<FrequencyEncoding, Void> PARSER = new ConstructingObjectParser<>(
         NAME,
         true,
-        a -> new FrequencyEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2]));
+        a -> new FrequencyEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2], (Boolean)a[3]));
     static {
         PARSER.declareString(ConstructingObjectParser.constructorArg(), FIELD);
         PARSER.declareString(ConstructingObjectParser.constructorArg(), FEATURE_NAME);
         PARSER.declareObject(ConstructingObjectParser.constructorArg(),
             (p, c) -> p.map(HashMap::new, XContentParser::doubleValue),
             FREQUENCY_MAP);
+        PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), CUSTOM);
     }
 
     public static FrequencyEncoding fromXContent(XContentParser parser) {
@@ -61,11 +63,13 @@ public class FrequencyEncoding implements PreProcessor {
     private final String field;
     private final String featureName;
     private final Map<String, Double> frequencyMap;
+    private final Boolean custom;
 
-    public FrequencyEncoding(String field, String featureName, Map<String, Double> frequencyMap) {
+    FrequencyEncoding(String field, String featureName, Map<String, Double> frequencyMap, Boolean custom) {
         this.field = Objects.requireNonNull(field);
         this.featureName = Objects.requireNonNull(featureName);
         this.frequencyMap = Collections.unmodifiableMap(Objects.requireNonNull(frequencyMap));
+        this.custom = custom;
     }
 
     /**
@@ -94,12 +98,19 @@ public class FrequencyEncoding implements PreProcessor {
         return NAME;
     }
 
+    public Boolean getCustom() {
+        return custom;
+    }
+
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
         builder.startObject();
         builder.field(FIELD.getPreferredName(), field);
         builder.field(FEATURE_NAME.getPreferredName(), featureName);
         builder.field(FREQUENCY_MAP.getPreferredName(), frequencyMap);
+        if (custom != null) {
+            builder.field(CUSTOM.getPreferredName(), custom);
+        }
         builder.endObject();
         return builder;
     }
@@ -111,12 +122,13 @@ public class FrequencyEncoding implements PreProcessor {
         FrequencyEncoding that = (FrequencyEncoding) o;
         return Objects.equals(field, that.field)
             && Objects.equals(featureName, that.featureName)
+            && Objects.equals(custom, that.custom)
             && Objects.equals(frequencyMap, that.frequencyMap);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(field, featureName, frequencyMap);
+        return Objects.hash(field, featureName, frequencyMap, custom);
     }
 
     public Builder builder(String field) {
@@ -128,6 +140,7 @@ public class FrequencyEncoding implements PreProcessor {
         private String field;
         private String featureName;
         private Map<String, Double> frequencyMap = new HashMap<>();
+        private Boolean custom;
 
         public Builder(String field) {
             this.field = field;
@@ -153,8 +166,13 @@ public class FrequencyEncoding implements PreProcessor {
             return this;
         }
 
+        public Builder setCustom(boolean custom) {
+            this.custom = custom;
+            return this;
+        }
+
         public FrequencyEncoding build() {
-            return new FrequencyEncoding(field, featureName, frequencyMap);
+            return new FrequencyEncoding(field, featureName, frequencyMap, custom);
         }
     }
 

+ 23 - 6
client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncoding.java

@@ -38,15 +38,17 @@ public class OneHotEncoding implements PreProcessor {
     public static final String NAME = "one_hot_encoding";
     public static final ParseField FIELD = new ParseField("field");
     public static final ParseField HOT_MAP = new ParseField("hot_map");
+    public static final ParseField CUSTOM = new ParseField("custom");
 
     @SuppressWarnings("unchecked")
     public static final ConstructingObjectParser<OneHotEncoding, Void> PARSER = new ConstructingObjectParser<>(
         NAME,
         true,
-        a -> new OneHotEncoding((String)a[0], (Map<String, String>)a[1]));
+        a -> new OneHotEncoding((String)a[0], (Map<String, String>)a[1], (Boolean)a[2]));
     static {
         PARSER.declareString(ConstructingObjectParser.constructorArg(), FIELD);
         PARSER.declareObject(ConstructingObjectParser.constructorArg(), (p, c) -> p.mapStrings(), HOT_MAP);
+        PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), CUSTOM);
     }
 
     public static OneHotEncoding fromXContent(XContentParser parser) {
@@ -55,12 +57,13 @@ public class OneHotEncoding implements PreProcessor {
 
     private final String field;
     private final Map<String, String> hotMap;
+    private final Boolean custom;
 
-    public OneHotEncoding(String field, Map<String, String> hotMap) {
+    OneHotEncoding(String field, Map<String, String> hotMap, Boolean custom) {
         this.field = Objects.requireNonNull(field);
         this.hotMap = Collections.unmodifiableMap(Objects.requireNonNull(hotMap));
+        this.custom = custom;
     }
-
     /**
      * @return Field name on which to one hot encode
      */
@@ -80,11 +83,18 @@ public class OneHotEncoding implements PreProcessor {
         return NAME;
     }
 
+    public Boolean getCustom() {
+        return custom;
+    }
+
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
         builder.startObject();
         builder.field(FIELD.getPreferredName(), field);
         builder.field(HOT_MAP.getPreferredName(), hotMap);
+        if (custom != null) {
+            builder.field(CUSTOM.getPreferredName(), custom);
+        }
         builder.endObject();
         return builder;
     }
@@ -95,12 +105,13 @@ public class OneHotEncoding implements PreProcessor {
         if (o == null || getClass() != o.getClass()) return false;
         OneHotEncoding that = (OneHotEncoding) o;
         return Objects.equals(field, that.field)
-            && Objects.equals(hotMap, that.hotMap);
+            && Objects.equals(hotMap, that.hotMap)
+            && Objects.equals(custom, that.custom);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(field, hotMap);
+        return Objects.hash(field, hotMap, custom);
     }
 
     public Builder builder(String field) {
@@ -111,6 +122,7 @@ public class OneHotEncoding implements PreProcessor {
 
         private String field;
         private Map<String, String> hotMap = new HashMap<>();
+        private Boolean custom;
 
         public Builder(String field) {
             this.field = field;
@@ -131,8 +143,13 @@ public class OneHotEncoding implements PreProcessor {
             return this;
         }
 
+        public Builder setCustom(boolean custom) {
+            this.custom = custom;
+            return this;
+        }
+
         public OneHotEncoding build() {
-            return new OneHotEncoding(field, hotMap);
+            return new OneHotEncoding(field, hotMap, custom);
         }
     }
 }

+ 23 - 5
client/rest-high-level/src/main/java/org/elasticsearch/client/ml/inference/preprocessing/TargetMeanEncoding.java

@@ -41,12 +41,13 @@ public class TargetMeanEncoding implements PreProcessor {
     public static final ParseField FEATURE_NAME = new ParseField("feature_name");
     public static final ParseField TARGET_MAP = new ParseField("target_map");
     public static final ParseField DEFAULT_VALUE = new ParseField("default_value");
+    public static final ParseField CUSTOM = new ParseField("custom");
 
     @SuppressWarnings("unchecked")
     public static final ConstructingObjectParser<TargetMeanEncoding, Void> PARSER = new ConstructingObjectParser<>(
         NAME,
         true,
-        a -> new TargetMeanEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2], (Double)a[3]));
+        a -> new TargetMeanEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2], (Double)a[3], (Boolean)a[4]));
     static {
         PARSER.declareString(ConstructingObjectParser.constructorArg(), FIELD);
         PARSER.declareString(ConstructingObjectParser.constructorArg(), FEATURE_NAME);
@@ -54,6 +55,7 @@ public class TargetMeanEncoding implements PreProcessor {
             (p, c) -> p.map(HashMap::new, XContentParser::doubleValue),
             TARGET_MAP);
         PARSER.declareDouble(ConstructingObjectParser.constructorArg(), DEFAULT_VALUE);
+        PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), CUSTOM);
     }
 
     public static TargetMeanEncoding fromXContent(XContentParser parser) {
@@ -64,12 +66,14 @@ public class TargetMeanEncoding implements PreProcessor {
     private final String featureName;
     private final Map<String, Double> meanMap;
     private final double defaultValue;
+    private final Boolean custom;
 
-    public TargetMeanEncoding(String field, String featureName, Map<String, Double> meanMap, Double defaultValue) {
+    TargetMeanEncoding(String field, String featureName, Map<String, Double> meanMap, Double defaultValue, Boolean custom) {
         this.field = Objects.requireNonNull(field);
         this.featureName = Objects.requireNonNull(featureName);
         this.meanMap = Collections.unmodifiableMap(Objects.requireNonNull(meanMap));
         this.defaultValue = Objects.requireNonNull(defaultValue);
+        this.custom = custom;
     }
 
     /**
@@ -100,6 +104,10 @@ public class TargetMeanEncoding implements PreProcessor {
         return featureName;
     }
 
+    public Boolean getCustom() {
+        return custom;
+    }
+
     @Override
     public String getName() {
         return NAME;
@@ -112,6 +120,9 @@ public class TargetMeanEncoding implements PreProcessor {
         builder.field(FEATURE_NAME.getPreferredName(), featureName);
         builder.field(TARGET_MAP.getPreferredName(), meanMap);
         builder.field(DEFAULT_VALUE.getPreferredName(), defaultValue);
+        if (custom != null) {
+            builder.field(CUSTOM.getPreferredName(), custom);
+        }
         builder.endObject();
         return builder;
     }
@@ -124,12 +135,13 @@ public class TargetMeanEncoding implements PreProcessor {
         return Objects.equals(field, that.field)
             && Objects.equals(featureName, that.featureName)
             && Objects.equals(meanMap, that.meanMap)
-            && Objects.equals(defaultValue, that.defaultValue);
+            && Objects.equals(defaultValue, that.defaultValue)
+            && Objects.equals(custom, that.custom);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(field, featureName, meanMap, defaultValue);
+        return Objects.hash(field, featureName, meanMap, defaultValue, custom);
     }
 
     public Builder builder(String field) {
@@ -142,6 +154,7 @@ public class TargetMeanEncoding implements PreProcessor {
         private String featureName;
         private Map<String, Double> meanMap = new HashMap<>();
         private double defaultValue;
+        private Boolean custom;
 
         public Builder(String field) {
             this.field = field;
@@ -176,8 +189,13 @@ public class TargetMeanEncoding implements PreProcessor {
             return this;
         }
 
+        public Builder setCustom(boolean custom) {
+            this.custom = custom;
+            return this;
+        }
+
         public TargetMeanEncoding build() {
-            return new TargetMeanEncoding(field, featureName, meanMap, defaultValue);
+            return new TargetMeanEncoding(field, featureName, meanMap, defaultValue, custom);
         }
     }
 }

+ 4 - 1
client/rest-high-level/src/test/java/org/elasticsearch/client/ml/inference/preprocessing/FrequencyEncodingTests.java

@@ -55,6 +55,9 @@ public class FrequencyEncodingTests extends AbstractXContentTestCase<FrequencyEn
         for (int i = 0; i < valuesSize; i++) {
             valueMap.put(randomAlphaOfLength(10), randomDoubleBetween(0.0, 1.0, false));
         }
-        return new FrequencyEncoding(randomAlphaOfLength(10), randomAlphaOfLength(10), valueMap);
+        return new FrequencyEncoding(randomAlphaOfLength(10),
+            randomAlphaOfLength(10),
+            valueMap,
+            randomBoolean() ? null : randomBoolean());
     }
 }

+ 1 - 1
client/rest-high-level/src/test/java/org/elasticsearch/client/ml/inference/preprocessing/OneHotEncodingTests.java

@@ -55,7 +55,7 @@ public class OneHotEncodingTests extends AbstractXContentTestCase<OneHotEncoding
         for (int i = 0; i < valuesSize; i++) {
             valueMap.put(randomAlphaOfLength(10), randomAlphaOfLength(10));
         }
-        return new OneHotEncoding(randomAlphaOfLength(10), valueMap);
+        return new OneHotEncoding(randomAlphaOfLength(10), valueMap, randomBoolean() ? null : randomBoolean());
     }
 
 }

+ 2 - 1
client/rest-high-level/src/test/java/org/elasticsearch/client/ml/inference/preprocessing/TargetMeanEncodingTests.java

@@ -58,7 +58,8 @@ public class TargetMeanEncodingTests extends AbstractXContentTestCase<TargetMean
         return new TargetMeanEncoding(randomAlphaOfLength(10),
             randomAlphaOfLength(10),
             valueMap,
-            randomDoubleBetween(0.0, 1.0, false));
+            randomDoubleBetween(0.0, 1.0, false),
+            randomBoolean() ? null : randomBoolean());
     }
 
 }

+ 12 - 0
docs/reference/ml/df-analytics/apis/put-inference.asciidoc

@@ -94,6 +94,10 @@ The field name to encode.
 `frequency_map`::
 (Required, object map of string:double)
 Object that maps the field value to the frequency encoded value.
+
+`custom`::
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=custom-preprocessor]
+
 ======
 //End frequency encoding
 
@@ -112,6 +116,10 @@ The field name to encode.
 `hot_map`::
 (Required, object map of strings)
 String map of "field_value: one_hot_column_name".
+
+`custom`::
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=custom-preprocessor]
+
 ======
 //End one hot encoding
 
@@ -138,6 +146,10 @@ The field name to encode.
 `target_map`:::
 (Required, object map of string:double)
 Object that maps the field value to the target mean value.
+
+`custom`:::
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=custom-preprocessor]
+
 ======
 //End target mean encoding
 =====

+ 146 - 137
docs/reference/ml/ml-shared.asciidoc

@@ -6,11 +6,11 @@ see
 end::aggregations[]
 
 tag::allow-lazy-open[]
-Advanced configuration option. Specifies whether this job can open when there is 
+Advanced configuration option. Specifies whether this job can open when there is
 insufficient {ml} node capacity for it to be immediately assigned to a node. The
-default value is `false`; if a {ml} node with capacity to run the job cannot 
-immediately be found, the <<ml-open-job,open {anomaly-jobs} API>> returns an 
-error. However, this is also subject to the cluster-wide 
+default value is `false`; if a {ml} node with capacity to run the job cannot
+immediately be found, the <<ml-open-job,open {anomaly-jobs} API>> returns an
+error. However, this is also subject to the cluster-wide
 `xpack.ml.max_lazy_ml_nodes` setting; see <<advanced-ml-settings>>. If this
 option is set to `true`, the <<ml-open-job,open {anomaly-jobs} API>> does not
 return an error and the job waits in the `opening` state until sufficient {ml}
@@ -23,7 +23,7 @@ Specifies what to do when the request:
 --
 * Contains wildcard expressions and there are no {dfeeds} that match.
 * Contains the `_all` string or no identifiers and there are no matches.
-* Contains wildcard expressions and there are only partial matches. 
+* Contains wildcard expressions and there are only partial matches.
 
 The default value is `true`, which returns an empty `datafeeds` array when
 there are no matches and the subset of results when there are partial matches.
@@ -40,8 +40,8 @@ Specifies what to do when the request:
 * Contains the `_all` string or no identifiers and there are no matches.
 * Contains wildcard expressions and there are only partial matches.
 
-The default value is `true`, which returns an empty `jobs` array 
-when there are no matches and the subset of results when there are partial 
+The default value is `true`, which returns an empty `jobs` array
+when there are no matches and the subset of results when there are partial
 matches. If this parameter is `false`, the request returns a `404` status code
 when there are no matches or only partial matches.
 --
@@ -53,17 +53,17 @@ tag::allow-no-match[]
 --
 * Contains wildcard expressions and there are no {dfanalytics-jobs} that match.
 * Contains the `_all` string or no identifiers and there are no matches.
-* Contains wildcard expressions and there are only partial matches. 
+* Contains wildcard expressions and there are only partial matches.
 
-The default value is `true`, which returns an empty `data_frame_analytics` array 
-when there are no matches and the subset of results when there are partial 
-matches. If this parameter is `false`, the request returns a `404` status code 
+The default value is `true`, which returns an empty `data_frame_analytics` array
+when there are no matches and the subset of results when there are partial
+matches. If this parameter is `false`, the request returns a `404` status code
 when there are no matches or only partial matches.
 --
 end::allow-no-match[]
 
 tag::analysis[]
-Defines the type of {dfanalytics} you want to perform on your source index. For 
+Defines the type of {dfanalytics} you want to perform on your source index. For
 example: `outlier_detection`. See <<ml-dfa-analysis-objects>>.
 end::analysis[]
 
@@ -85,7 +85,7 @@ of a node to run the job.
 end::assignment-explanation-anomaly-jobs[]
 
 tag::assignment-explanation-datafeeds[]
-For started {dfeeds} only, contains messages relating to the selection of a 
+For started {dfeeds} only, contains messages relating to the selection of a
 node.
 end::assignment-explanation-datafeeds[]
 
@@ -94,7 +94,7 @@ Contains messages relating to the selection of a node.
 end::assignment-explanation-dfanalytics[]
 
 tag::background-persist-interval[]
-Advanced configuration option. The time between each periodic persistence of the 
+Advanced configuration option. The time between each periodic persistence of the
 model. The default value is a randomized value between 3 to 4 hours, which
 avoids all jobs persisting at exactly the same time. The smallest allowed value
 is 1 hour.
@@ -125,7 +125,7 @@ The size of the interval that the analysis is aggregated into, typically between
 `5m` and `1h`. The default value is `5m`. If the {anomaly-job} uses a {dfeed}
 with {ml-docs}/ml-configuring-aggregation.html[aggregations], this value must be
 divisible by the interval of the date histogram aggregation. For more
-information, see {ml-docs}/ml-buckets.html[Buckets]. 
+information, see {ml-docs}/ml-buckets.html[Buckets].
 end::bucket-span[]
 
 tag::bucket-span-results[]
@@ -155,8 +155,8 @@ Sum of all bucket processing times, in milliseconds.
 end::bucket-time-total[]
 
 tag::by-field-name[]
-The field used to split the data. In particular, this property is used for 
-analyzing the splits with respect to their own history. It is used for finding 
+The field used to split the data. In particular, this property is used for
+analyzing the splits with respect to their own history. It is used for finding
 unusual values in the context of the split.
 end::by-field-name[]
 
@@ -207,7 +207,7 @@ categorization. For more information, see
 end::categorization-examples-limit[]
 
 tag::categorization-field-name[]
-If this property is specified, the values of the specified field will be 
+If this property is specified, the values of the specified field will be
 categorized. The resulting categories must be used in a detector by setting
 `by_field_name`, `over_field_name`, or `partition_field_name` to the keyword
 `mlcategory`. For more information, see
@@ -218,14 +218,14 @@ tag::categorization-filters[]
 If `categorization_field_name` is specified, you can also define optional
 filters. This property expects an array of regular expressions. The expressions
 are used to filter out matching sequences from the categorization field values.
-You can use this functionality to fine tune the categorization by excluding 
-sequences from consideration when categories are defined. For example, you can 
-exclude SQL statements that appear in your log files. For more information, see 
+You can use this functionality to fine tune the categorization by excluding
+sequences from consideration when categories are defined. For example, you can
+exclude SQL statements that appear in your log files. For more information, see
 {ml-docs}/ml-configuring-categories.html[Categorizing log messages]. This
 property cannot be used at the same time as `categorization_analyzer`. If you
-only want to define simple regular expression filters that are applied prior to 
-tokenization, setting this property is the easiest method. If you also want to 
-customize the tokenizer or post-tokenization filtering, use the 
+only want to define simple regular expression filters that are applied prior to
+tokenization, setting this property is the easiest method. If you also want to
+customize the tokenizer or post-tokenization filtering, use the
 `categorization_analyzer` property instead and include the filters as
 `pattern_replace` character filters. The effect is exactly the same.
 end::categorization-filters[]
@@ -251,7 +251,7 @@ end::categorized-doc-count[]
 tag::char-filter[]
 One or more <<analysis-charfilters,character filters>>. In addition to the
 built-in character filters, other plugins can provide more character filters.
-This property is optional. If it is not specified, no character filters are 
+This property is optional. If it is not specified, no character filters are
 applied prior to categorization. If you are customizing some other aspect of the
 analyzer and you need to achieve the equivalent of `categorization_filters`
 (which are not permitted when some other aspect of the analyzer is customized),
@@ -260,9 +260,9 @@ add them here as
 end::char-filter[]
 
 tag::chunking-config[]
-{dfeeds-cap} might be required to search over long time periods, for several 
-months or years. This search is split into time chunks in order to ensure the 
-load on {es} is managed. Chunking configuration controls how the size of these 
+{dfeeds-cap} might be required to search over long time periods, for several
+months or years. This search is split into time chunks in order to ensure the
+load on {es} is managed. Chunking configuration controls how the size of these
 time chunks are calculated and is an advanced configuration option.
 +
 .Properties of `chunking_config`
@@ -291,10 +291,19 @@ Specifies whether the feature influence calculation is enabled. Defaults to
 `true`.
 end::compute-feature-influence[]
 
+tag::custom-preprocessor[]
+(Optional, boolean)
+Boolean value indicating whether the preprocessor was provided by a user
+(`true`) or created automatically by the analytics job (`false`). This
+adjusts the feature importance calculation.
+When `true`, the feature importance calculation returns importance for the
+processed feature. When `false`, the total importance of the original field
+is returned. Default is `false`.
+end::custom-preprocessor[]
+
 tag::custom-rules[]
 An array of custom rule objects, which enable you to customize the way detectors
 operate. For example, a rule may dictate to the detector conditions under which
-results should be skipped. For more examples, see 
+results should be skipped. For more examples, see
 {ml-docs}/ml-configuring-detector-custom-rules.html[Customizing detectors with custom rules].
 end::custom-rules[]
 
@@ -334,7 +343,7 @@ end::custom-rules-scope-filter-type[]
 tag::custom-rules-conditions[]
 An optional array of numeric conditions when the rule applies. A rule must
 either have a non-empty scope or at least one condition. Multiple conditions are
-combined together with a logical `AND`. A condition has the following 
+combined together with a logical `AND`. A condition has the following
 properties:
 end::custom-rules-conditions[]
 
@@ -347,7 +356,7 @@ end::custom-rules-conditions-applies-to[]
 
 tag::custom-rules-conditions-operator[]
 Specifies the condition operator. The available options are `gt` (greater than),
-`gte` (greater than or equals), `lt` (less than) and `lte` (less than or 
+`gte` (greater than or equals), `lt` (less than) and `lte` (less than or
 equals).
 end::custom-rules-conditions-operator[]
 
@@ -367,7 +376,7 @@ snapshots for this job. It specifies a period of time (in days) after which only
 the first snapshot per day is retained. This period is relative to the timestamp
 of the most recent snapshot for this job. Valid values range from `0` to
 `model_snapshot_retention_days`. For new jobs, the default value is `1`. For
-jobs created before version 7.8.0, the default value matches 
+jobs created before version 7.8.0, the default value matches
 `model_snapshot_retention_days`. For more information, refer to
 {ml-docs}/ml-model-snapshots.html[Model snapshots].
 end::daily-model-snapshot-retention-after-days[]
@@ -375,8 +384,8 @@ end::daily-model-snapshot-retention-after-days[]
 tag::data-description[]
 The data description defines the format of the input data when you send data to
 the job by using the <<ml-post-data,post data>> API. Note that when configure
-a {dfeed}, these properties are automatically set. When data is received via 
-the <<ml-post-data,post data>> API, it is not stored in {es}. Only the results 
+a {dfeed}, these properties are automatically set. When data is received via
+the <<ml-post-data,post data>> API, it is not stored in {es}. Only the results
 for {anomaly-detect} are retained.
 +
 .Properties of `data_description`
@@ -419,10 +428,10 @@ Specifies whether the {dfeed} checks for missing data and the size of the
 window. For example: `{"enabled": true, "check_window": "1h"}`.
 +
 The {dfeed} can optionally search over indices that have already been read in
-an effort to determine whether any data has subsequently been added to the 
-index. If missing data is found, it is a good indication that the `query_delay` 
-option is set too low and the data is being indexed after the {dfeed} has passed 
-that moment in time. See 
+an effort to determine whether any data has subsequently been added to the
+index. If missing data is found, it is a good indication that the `query_delay`
+option is set too low and the data is being indexed after the {dfeed} has passed
+that moment in time. See
 {ml-docs}/ml-delayed-data-detection.html[Working with delayed data].
 +
 This check runs only on real-time {dfeeds}.
@@ -445,9 +454,9 @@ end::delayed-data-check-config[]
 
 tag::dependent-variable[]
 Defines which field of the document is to be predicted.
-This parameter is supplied by field name and must match one of the fields in 
-the index being used to train. If this field is missing from a document, then 
-that document will not be used for training, but a prediction with the trained 
+This parameter is supplied by field name and must match one of the fields in
+the index being used to train. If this field is missing from a document, then
+that document will not be used for training, but a prediction with the trained
 model will be generated for it. It is also known as continuous target variable.
 end::dependent-variable[]
 
@@ -469,7 +478,7 @@ The destination configuration, consisting of `index` and optionally
 `index`:::
 (Required, string) Defines the _destination index_ to store the results of the
 {dfanalytics-job}.
-  
+
 `results_field`:::
 (Optional, string) Defines the name of the field in which to store the results
 of the analysis. Defaults to `ml`.
@@ -481,7 +490,7 @@ A description of the detector. For example, `Low event rate`.
 end::detector-description[]
 
 tag::detector-field-name[]
-The field that the detector uses in the function. If you use an event rate 
+The field that the detector uses in the function. If you use an event rate
 function such as `count` or `rare`, do not specify this field.
 +
 --
@@ -491,7 +500,7 @@ NOTE: The `field_name` cannot contain double quotes or backslashes.
 end::detector-field-name[]
 
 tag::detector-index[]
-A unique identifier for the detector. This identifier is based on the order of 
+A unique identifier for the detector. This identifier is based on the order of
 the detectors in the `analysis_config`, starting at zero.
 end::detector-index[]
 
@@ -509,7 +518,7 @@ to the forest. For example, a rate of `1.05` increases `eta` by 5%.
 end::dfas-eta-growth[]
 
 tag::dfas-feature-bag-fraction[]
-The fraction of features that is used when selecting a random bag for each 
+The fraction of features that is used when selecting a random bag for each
 candidate split.
 end::dfas-feature-bag-fraction[]
 
@@ -518,16 +527,16 @@ The number of iterations on the analysis.
 end::dfas-iteration[]
 
 tag::dfas-max-attempts[]
-If the algorithm fails to determine a non-trivial tree (more than a single 
-leaf), this parameter determines how many of such consecutive failures are 
-tolerated. Once the number of attempts exceeds the threshold, the forest 
+If the algorithm fails to determine a non-trivial tree (more than a single
+leaf), this parameter determines how many of such consecutive failures are
+tolerated. Once the number of attempts exceeds the threshold, the forest
 training stops.
 end::dfas-max-attempts[]
 
 tag::dfas-max-optimization-rounds[]
-A multiplier responsible for determining the maximum number of 
-hyperparameter optimization steps in the Bayesian optimization procedure. 
-The maximum number of steps is determined based on the number of undefined 
+A multiplier responsible for determining the maximum number of
+hyperparameter optimization steps in the Bayesian optimization procedure.
+The maximum number of steps is determined based on the number of undefined
 hyperparameters times the maximum optimization rounds per hyperparameter.
 end::dfas-max-optimization-rounds[]
 
@@ -536,17 +545,17 @@ The maximum number of folds for the cross-validation procedure.
 end::dfas-num-folds[]
 
 tag::dfas-num-splits[]
-Determines the maximum number of splits for every feature that can occur in a 
+Determines the maximum number of splits for every feature that can occur in a
 decision tree when the tree is trained.
 end::dfas-num-splits[]
 
 tag::dfas-soft-limit[]
-Tree depth limit is used for calculating the tree depth penalty. This is a soft 
+Tree depth limit is used for calculating the tree depth penalty. This is a soft
 limit; it can be exceeded.
 end::dfas-soft-limit[]
 
 tag::dfas-soft-tolerance[]
-Tree depth tolerance is used for calculating the tree depth penalty. This is a 
+Tree depth tolerance is used for calculating the tree depth penalty. This is a
 soft limit; it can be exceeded.
 end::dfas-soft-tolerance[]
 
@@ -571,7 +580,7 @@ An object containing information about validation loss.
 end::dfas-validation-loss[]
 
 tag::dfas-validation-loss-fold[]
-Validation loss values for every added decision tree during the forest growing 
+Validation loss values for every added decision tree during the forest growing
 procedure.
 end::dfas-validation-loss-fold[]
 
@@ -600,15 +609,15 @@ By default, this value is calculated during hyperparameter optimization.
 end::eta[]
 
 tag::exclude-frequent[]
-Contains one of the following values: `all`, `none`, `by`, or `over`. If set, 
+Contains one of the following values: `all`, `none`, `by`, or `over`. If set,
 frequent entities are excluded from influencing the anomaly results. Entities
-can be considered frequent over time or frequent in a population. If you are 
-working with both over and by fields, then you can set `exclude_frequent` to 
+can be considered frequent over time or frequent in a population. If you are
+working with both over and by fields, then you can set `exclude_frequent` to
 `all` for both fields, or to `by` or `over` for those specific fields.
 end::exclude-frequent[]
 
 tag::exclude-interim-results[]
-If `true`, the output excludes interim results. By default, interim results are 
+If `true`, the output excludes interim results. By default, interim results are
 included.
 end::exclude-interim-results[]
 
@@ -626,12 +635,12 @@ value is calculated during hyperparameter optimization.
 end::feature-bag-fraction[]
 
 tag::feature-influence-threshold[]
-The minimum {olscore} that a document needs to have in order to calculate its 
+The minimum {olscore} that a document needs to have in order to calculate its
 {fiscore}. Value range: 0-1 (`0.1` by default).
 end::feature-influence-threshold[]
 
 tag::filter[]
-One or more <<analysis-tokenfilters,token filters>>. In addition to the built-in 
+One or more <<analysis-tokenfilters,token filters>>. In addition to the built-in
 token filters, other plugins can provide more token filters. This property is
 optional. If it is not specified, no token filters are applied prior to
 categorization.
@@ -665,15 +674,15 @@ Skips the specified number of {dfanalytics-jobs}. The default value is `0`.
 end::from[]
 
 tag::function[]
-The analysis function that is used. For example, `count`, `rare`, `mean`, `min`, 
+The analysis function that is used. For example, `count`, `rare`, `mean`, `min`,
 `max`, and `sum`. For more information, see
 {ml-docs}/ml-functions.html[Function reference].
 end::function[]
 
 tag::gamma[]
 Advanced configuration option. Regularization parameter to prevent overfitting
-on the training data set. Multiplies a linear penalty associated with the size of 
-individual trees in the forest. The higher the value the more training will 
+on the training data set. Multiplies a linear penalty associated with the size of
+individual trees in the forest. The higher the value the more training will
 prefer smaller trees. The smaller this parameter the larger individual trees
 will be and the longer training will take. By default, this value is calculated
 during hyperparameter optimization.
@@ -717,7 +726,7 @@ end::inference-config-classification-num-top-classes[]
 
 tag::inference-config-classification-num-top-feature-importance-values[]
 Specifies the maximum number of
-{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document. By 
+{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document. By
 default, it is zero and no {feat-imp} calculation occurs.
 end::inference-config-classification-num-top-feature-importance-values[]
 
@@ -734,7 +743,7 @@ end::inference-config-classification-prediction-field-type[]
 
 tag::inference-config-regression-num-top-feature-importance-values[]
 Specifies the maximum number of
-{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document. 
+{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document.
 By default, it is zero and no {feat-imp} calculation occurs.
 end::inference-config-regression-num-top-feature-importance-values[]
 
@@ -750,11 +759,11 @@ used to train the model, which defaults to `<dependent_variable>_prediction`.
 end::inference-config-results-field-processor[]
 
 tag::influencers[]
-A comma separated list of influencer field names. Typically these can be the by, 
-over, or partition fields that are used in the detector configuration. You might 
-also want to use a field name that is not specifically named in a detector, but 
-is available as part of the input data. When you use multiple detectors, the use 
-of influencers is recommended as it aggregates results for each influencer 
+A comma-separated list of influencer field names. Typically these can be the by,
+over, or partition fields that are used in the detector configuration. You might
+also want to use a field name that is not specifically named in a detector, but
+is available as part of the input data. When you use multiple detectors, the use
+of influencers is recommended as it aggregates results for each influencer
 entity.
 end::influencers[]
 
@@ -792,8 +801,8 @@ Identifier for the {dfanalytics-job}.
 end::job-id-data-frame-analytics[]
 
 tag::job-id-anomaly-detection-default[]
-Identifier for the {anomaly-job}. It can be a job identifier, a group name, or a 
-wildcard expression. If you do not specify one of these options, the API returns 
+Identifier for the {anomaly-job}. It can be a job identifier, a group name, or a
+wildcard expression. If you do not specify one of these options, the API returns
 information for all {anomaly-jobs}.
 end::job-id-anomaly-detection-default[]
 
@@ -808,7 +817,7 @@ identifier, a group name, or a comma-separated list of jobs or groups.
 end::job-id-anomaly-detection-list[]
 
 tag::job-id-anomaly-detection-wildcard[]
-Identifier for the {anomaly-job}. It can be a job identifier, a group name, or a 
+Identifier for the {anomaly-job}. It can be a job identifier, a group name, or a
 wildcard expression.
 end::job-id-anomaly-detection-wildcard[]
 
@@ -818,13 +827,13 @@ comma-separated list of jobs or groups, or a wildcard expression.
 end::job-id-anomaly-detection-wildcard-list[]
 
 tag::job-id-anomaly-detection-define[]
-Identifier for the {anomaly-job}. This identifier can contain lowercase 
-alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start 
+Identifier for the {anomaly-job}. This identifier can contain lowercase
+alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
 and end with alphanumeric characters.
 end::job-id-anomaly-detection-define[]
 
 tag::job-id-data-frame-analytics-define[]
-Identifier for the {dfanalytics-job}. This identifier can contain lowercase 
+Identifier for the {dfanalytics-job}. This identifier can contain lowercase
 alphanumeric characters (a-z and 0-9), hyphens, and underscores. It must start
 and end with alphanumeric characters.
 end::job-id-data-frame-analytics-define[]
@@ -837,9 +846,9 @@ tag::lambda[]
 Advanced configuration option. Regularization parameter to prevent overfitting
 on the training data set. Multiplies an L2 regularization term which applies to
 leaf weights of the individual trees in the forest. The higher the value the
-more training will attempt to keep leaf weights small. This makes the prediction  
-function smoother at the expense of potentially not being able to capture 
-relevant relationships between the features and the {depvar}. The smaller this 
+more training will attempt to keep leaf weights small. This makes the prediction
+function smoother at the expense of potentially not being able to capture
+relevant relationships between the features and the {depvar}. The smaller this
 parameter the larger individual trees will be and the longer training will take.
 By default, this value is calculated during hyperparameter optimization.
 end::lambda[]
@@ -849,9 +858,9 @@ The timestamp at which data was last analyzed, according to server time.
 end::last-data-time[]
 
 tag::latency[]
-The size of the window in which to expect data that is out of time order. The 
-default value is 0 (no latency). If you specify a non-zero value, it must be 
-greater than or equal to one second. For more information about time units, see 
+The size of the window in which to expect data that is out of time order. The
+default value is 0 (no latency). If you specify a non-zero value, it must be
+greater than or equal to one second. For more information about time units, see
 <<time-units>>.
 +
 --
@@ -877,9 +886,9 @@ tag::max-empty-searches[]
 If a real-time {dfeed} has never seen any data (including during any initial
 training period) then it will automatically stop itself and close its associated
 job after this many real-time searches that return no documents. In other words,
-it will stop after `frequency` times `max_empty_searches` of real-time 
-operation. If not set then a {dfeed} with no end time that sees no data will 
-remain started until it is explicitly stopped. By default this setting is not 
+it will stop after `frequency` times `max_empty_searches` of real-time
+operation. If not set then a {dfeed} with no end time that sees no data will
+remain started until it is explicitly stopped. By default this setting is not
 set.
 end::max-empty-searches[]
 
@@ -894,7 +903,7 @@ The method that {oldetection} uses. Available methods are `lof`, `ldof`,
 `distance_kth_nn`, `distance_knn`, and `ensemble`. The default value is
 `ensemble`, which means that {oldetection} uses an ensemble of different methods
 and normalizes and combines their individual {olscores} to obtain the overall
-{olscore}. 
+{olscore}.
 end::method[]
 
 tag::missing-field-count[]
@@ -911,7 +920,7 @@ necessarily a cause for concern.
 end::missing-field-count[]
 
 tag::mode[]
-There are three available modes: 
+There are three available modes:
 +
 --
 * `auto`: The chunk size is dynamically calculated. This is the default and
@@ -937,9 +946,9 @@ The unique identifier of the trained {infer} model.
 end::model-id[]
 
 tag::model-memory-limit[]
-The approximate maximum amount of memory resources that are required for 
+The approximate maximum amount of memory resources that are required for
 analytical processing. Once this limit is approached, data pruning becomes
-more aggressive. Upon exceeding this limit, new entities are not modeled. The 
+more aggressive. Upon exceeding this limit, new entities are not modeled. The
 default value for jobs created in version 6.1 and later is `1024mb`.
 This value will need to be increased for jobs that are expected to analyze high
 cardinality fields, but the default is set to a relatively small size to ensure
@@ -981,15 +990,15 @@ This advanced configuration option stores model information along with the
 results. It provides a more detailed view into {anomaly-detect}.
 +
 --
-WARNING: If you enable model plot it can add considerable overhead to the 
+WARNING: If you enable model plot it can add considerable overhead to the
 performance of the system; it is not feasible for jobs with many entities.
 
-Model plot provides a simplified and indicative view of the model and its 
-bounds. It does not display complex features such as multivariate correlations 
-or multimodal data. As such, anomalies may occasionally be reported which cannot 
+Model plot provides a simplified and indicative view of the model and its
+bounds. It does not display complex features such as multivariate correlations
+or multimodal data. As such, anomalies may occasionally be reported which cannot
 be seen in the model plot.
 
-Model plot config can be configured when the job is created or updated later. It 
+Model plot config can be configured when the job is created or updated later. It
 must be disabled if performance issues are experienced.
 --
 end::model-plot-config[]
@@ -1012,7 +1021,7 @@ Only the specified `terms` can be viewed when using the Single Metric Viewer.
 end::model-plot-config-terms[]
 
 tag::model-snapshot-id[]
-A numerical character string that uniquely identifies the model snapshot. For 
+A numerical character string that uniquely identifies the model snapshot. For
 example, `1575402236000`.
 end::model-snapshot-id[]
 
@@ -1030,12 +1039,12 @@ The timestamp of the last record when the model stats were gathered.
 end::model-timestamp[]
 
 tag::multivariate-by-fields[]
-This functionality is reserved for internal use. It is not supported for use in 
-customer environments and is not subject to the support SLA of official GA 
+This functionality is reserved for internal use. It is not supported for use in
+customer environments and is not subject to the support SLA of official GA
 features.
 +
 --
-If set to `true`, the analysis will automatically find correlations between 
+If set to `true`, the analysis will automatically find correlations between
 metrics for a given `by` field value and report anomalies when those
 correlations cease to hold. For example, suppose CPU and memory usage on host A
 is usually highly correlated with the same metrics on host B. Perhaps this
@@ -1102,20 +1111,20 @@ ascending chronological order.
 end::out-of-order-timestamp-count[]
 
 tag::outlier-fraction[]
-The proportion of the data set that is assumed to be outlying prior to 
-{oldetection}. For example, 0.05 means it is assumed that 5% of values are real 
+The proportion of the data set that is assumed to be outlying prior to
+{oldetection}. For example, 0.05 means it is assumed that 5% of values are real
 outliers and 95% are inliers.
 end::outlier-fraction[]
 
 tag::over-field-name[]
-The field used to split the data. In particular, this property is used for 
-analyzing the splits with respect to the history of all splits. It is used for 
+The field used to split the data. In particular, this property is used for
+analyzing the splits with respect to the history of all splits. It is used for
 finding unusual values in the population of all splits. For more information,
 see {ml-docs}/ml-configuring-pop.html[Performing population analysis].
 end::over-field-name[]
 
 tag::partition-field-name[]
-The field used to segment the analysis. When you use this property, you have 
+The field used to segment the analysis. When you use this property, you have
 completely independent baselines for each value of this field.
 end::partition-field-name[]
 
@@ -1143,7 +1152,7 @@ forever in the partitions where it works badly.
 end::per-partition-categorization-stop-on-warn[]
 
 tag::prediction-field-name[]
-Defines the name of the prediction field in the results. 
+Defines the name of the prediction field in the results.
 Defaults to `<dependent_variable>_prediction`.
 end::prediction-field-name[]
 
@@ -1162,10 +1171,10 @@ number of {es} documents.
 end::processed-record-count[]
 
 tag::randomize-seed[]
-Defines the seed to the random generator that is used to pick which documents 
-will be used for training. By default it is randomly generated. Set it to a 
-specific value to ensure the same documents are used for training assuming other 
-related parameters (for example, `source`, `analyzed_fields`, etc.) are the 
+Defines the seed to the random generator that is used to pick which documents
+will be used for training. By default it is randomly generated. Set it to a
+specific value to ensure the same documents are used for training assuming other
+related parameters (for example, `source`, `analyzed_fields`, etc.) are the
 same.
 end::randomize-seed[]
 
@@ -1197,14 +1206,14 @@ end::renormalization-window-days[]
 
 tag::results-index-name[]
 A text string that affects the name of the {ml} results index. The default value
-is `shared`, which generates an index named `.ml-anomalies-shared`. 
+is `shared`, which generates an index named `.ml-anomalies-shared`.
 end::results-index-name[]
 
 tag::results-retention-days[]
-Advanced configuration option. The period of time (in days) that results are 
-retained. Age is calculated relative to the timestamp of the latest bucket 
-result. If this property has a non-null value, once per day at 00:30 (server 
-time), results that are the specified number of days older than the latest 
+Advanced configuration option. The period of time (in days) that results are
+retained. Age is calculated relative to the timestamp of the latest bucket
+result. If this property has a non-null value, once per day at 00:30 (server
+time), results that are the specified number of days older than the latest
 bucket result are deleted from {es}. The default value is null, which means all
 results are retained.
 end::results-retention-days[]
@@ -1244,7 +1253,7 @@ The total time the {dfeed} spent searching, in milliseconds.
 end::search-time[]
 
 tag::size[]
-Specifies the maximum number of {dfanalytics-jobs} to obtain. The default value 
+Specifies the maximum number of {dfanalytics-jobs} to obtain. The default value
 is `100`.
 end::size[]
 
@@ -1259,9 +1268,9 @@ a longer `bucket_span`.
 end::sparse-bucket-count[]
 
 tag::standardization-enabled[]
-If `true`, the following operation is performed on the columns before computing 
-outlier scores: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For more 
-information about this concept, see 
+If `true`, the following operation is performed on the columns before computing
+outlier scores: (x_i - mean(x_i)) / sd(x_i). Defaults to `true`. For more
+information about this concept, see
 {wikipedia}/Feature_scaling#Standardization_(Z-score_Normalization)[Wikipedia].
 end::standardization-enabled[]
 
@@ -1274,10 +1283,10 @@ job must be opened before it can accept further data.
 * `closing`: The job close action is in progress and has not yet completed. A
 closing job cannot accept further data.
 * `failed`: The job did not finish successfully due to an error. This situation
-can occur due to invalid input data, a fatal error occurring during the 
-analysis, or an external interaction such as the process being killed by the 
-Linux out of memory (OOM) killer. If the job had irrevocably failed, it must be 
-force closed and then deleted. If the {dfeed} can be corrected, the job can be 
+can occur due to invalid input data, a fatal error occurring during the
+analysis, or an external interaction such as the process being killed by the
+Linux out of memory (OOM) killer. If the job has irrevocably failed, it must be
+force closed and then deleted. If the {dfeed} can be corrected, the job can be
 closed and then re-opened.
 * `opened`: The job is available to receive and process data.
 * `opening`: The job open action is in progress and has not yet completed.
@@ -1299,8 +1308,8 @@ end::state-datafeed[]
 
 tag::summary-count-field-name[]
 If this property is specified, the data that is fed to the job is expected to be
-pre-summarized. This property value is the name of the field that contains the 
-count of raw data points that have been summarized. The same 
+pre-summarized. This property value is the name of the field that contains the
+count of raw data points that have been summarized. The same
 `summary_count_field_name` applies to all detectors in the job.
 +
 --
@@ -1317,26 +1326,26 @@ returned.
 end::tags[]
 
 tag::timeout-start[]
-Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults 
+Controls the amount of time to wait until the {dfanalytics-job} starts. Defaults
 to 20 seconds.
 end::timeout-start[]
 
 tag::timeout-stop[]
-Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults 
+Controls the amount of time to wait until the {dfanalytics-job} stops. Defaults
 to 20 seconds.
 end::timeout-stop[]
 
 tag::time-format[]
 The time format, which can be `epoch`, `epoch_ms`, or a custom pattern. The
-default value is `epoch`, which refers to UNIX or Epoch time (the number of 
-seconds since 1 Jan 1970). The value `epoch_ms` indicates that time is measured 
-in milliseconds since the epoch. The `epoch` and `epoch_ms` time formats accept 
+default value is `epoch`, which refers to UNIX or Epoch time (the number of
+seconds since 1 Jan 1970). The value `epoch_ms` indicates that time is measured
+in milliseconds since the epoch. The `epoch` and `epoch_ms` time formats accept
 either integer or real values. +
 +
 NOTE: Custom patterns must conform to the Java `DateTimeFormatter` class.
 When you use date-time formatting patterns, it is recommended that you provide
 the full date, time and time zone. For example: `yyyy-MM-dd'T'HH:mm:ssX`.
-If the pattern that you specify is not sufficient to produce a complete 
+If the pattern that you specify is not sufficient to produce a complete
 timestamp, job creation fails.
 end::time-format[]
 
@@ -1350,11 +1359,11 @@ The start time of the bucket for which these results were calculated.
 end::timestamp-results[]
 
 tag::tokenizer[]
-The name or definition of the <<analysis-tokenizers,tokenizer>> to use after 
-character filters are applied. This property is compulsory if 
+The name or definition of the <<analysis-tokenizers,tokenizer>> to use after
+character filters are applied. This property is compulsory if
 `categorization_analyzer` is specified as an object. Machine learning provides a
 tokenizer called `ml_classic` that tokenizes in the same way as the
-non-customizable tokenizer in older versions of the product. If you want to use 
+non-customizable tokenizer in older versions of the product. If you want to use
 that tokenizer but change the character or token filters, specify
 `"tokenizer": "ml_classic"` in your `categorization_analyzer`.
 end::tokenizer[]
@@ -1379,14 +1388,14 @@ value is cumulative for all detectors in the job.
 end::total-partition-field-count[]
 
 tag::training-percent[]
-Defines what percentage of the eligible documents that will 
-be used for training. Documents that are ignored by the analysis (for example 
-those that contain arrays with more than one value) won’t be included in the 
+Defines what percentage of the eligible documents will
+be used for training. Documents that are ignored by the analysis (for example
+those that contain arrays with more than one value) won’t be included in the
 calculation for used percentage. Defaults to `100`.
 end::training-percent[]
 
 tag::use-null[]
-Defines whether a new series is used as the null series when there is no value 
+Defines whether a new series is used as the null series when there is no value
 for the by or partition fields. The default value is `false`.
 end::use-null[]
 

+ 15 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/CustomWordEmbedding.java

@@ -220,6 +220,16 @@ public class CustomWordEmbedding implements LenientlyParsedPreProcessor, Strictl
         return data[row * colDim + col];
     }
 
+    @Override
+    public List<String> inputFields() {
+        return Collections.singletonList(fieldName);
+    }
+
+    @Override
+    public List<String> outputFields() {
+        return Collections.singletonList(destField);
+    }
+
     @Override
     public void process(Map<String, Object> fields) {
         Object field = fields.get(fieldName);
@@ -241,6 +251,11 @@ public class CustomWordEmbedding implements LenientlyParsedPreProcessor, Strictl
         return Collections.singletonMap(destField, fieldName);
     }
 
+    @Override
+    public boolean isCustom() {
+        return false;
+    }
+
     @Override
     public long ramBytesUsed() {
         long size = SHALLOW_SIZE;

+ 35 - 4
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/FrequencyEncoding.java

@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.core.ml.inference.preprocessing;
 
 import org.apache.lucene.util.RamUsageEstimator;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -18,6 +19,7 @@ import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 
@@ -33,6 +35,7 @@ public class FrequencyEncoding implements LenientlyParsedPreProcessor, StrictlyP
     public static final ParseField FIELD = new ParseField("field");
     public static final ParseField FEATURE_NAME = new ParseField("feature_name");
     public static final ParseField FREQUENCY_MAP = new ParseField("frequency_map");
+    public static final ParseField CUSTOM = new ParseField("custom");
 
     public static final ConstructingObjectParser<FrequencyEncoding, Void> STRICT_PARSER = createParser(false);
     public static final ConstructingObjectParser<FrequencyEncoding, Void> LENIENT_PARSER = createParser(true);
@@ -42,12 +45,13 @@ public class FrequencyEncoding implements LenientlyParsedPreProcessor, StrictlyP
         ConstructingObjectParser<FrequencyEncoding, Void> parser = new ConstructingObjectParser<>(
             NAME.getPreferredName(),
             lenient,
-            a -> new FrequencyEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2]));
+            a -> new FrequencyEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2], (Boolean)a[3]));
         parser.declareString(ConstructingObjectParser.constructorArg(), FIELD);
         parser.declareString(ConstructingObjectParser.constructorArg(), FEATURE_NAME);
         parser.declareObject(ConstructingObjectParser.constructorArg(),
             (p, c) -> p.map(HashMap::new, XContentParser::doubleValue),
             FREQUENCY_MAP);
+        parser.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), CUSTOM);
         return parser;
     }
 
@@ -62,17 +66,24 @@ public class FrequencyEncoding implements LenientlyParsedPreProcessor, StrictlyP
     private final String field;
     private final String featureName;
     private final Map<String, Double> frequencyMap;
+    private final boolean custom;
 
-    public FrequencyEncoding(String field, String featureName, Map<String, Double> frequencyMap) {
+    public FrequencyEncoding(String field, String featureName, Map<String, Double> frequencyMap, Boolean custom) {
         this.field = ExceptionsHelper.requireNonNull(field, FIELD);
         this.featureName = ExceptionsHelper.requireNonNull(featureName, FEATURE_NAME);
         this.frequencyMap = Collections.unmodifiableMap(ExceptionsHelper.requireNonNull(frequencyMap, FREQUENCY_MAP));
+        this.custom = custom == null ? false : custom;
     }
 
     public FrequencyEncoding(StreamInput in) throws IOException {
         this.field = in.readString();
         this.featureName = in.readString();
         this.frequencyMap = Collections.unmodifiableMap(in.readMap(StreamInput::readString, StreamInput::readDouble));
+        if (in.getVersion().onOrAfter(Version.V_8_0_0)) {
+            this.custom = in.readBoolean();
+        } else {
+            this.custom = false;
+        }
     }
 
     /**
@@ -101,11 +112,26 @@ public class FrequencyEncoding implements LenientlyParsedPreProcessor, StrictlyP
         return Collections.singletonMap(featureName, field);
     }
 
+    @Override
+    public boolean isCustom() {
+        return custom;
+    }
+
     @Override
     public String getName() {
         return NAME.getPreferredName();
     }
 
+    @Override
+    public List<String> inputFields() {
+        return Collections.singletonList(field);
+    }
+
+    @Override
+    public List<String> outputFields() {
+        return Collections.singletonList(featureName);
+    }
+
     @Override
     public void process(Map<String, Object> fields) {
         Object value = fields.get(field);
@@ -125,6 +151,9 @@ public class FrequencyEncoding implements LenientlyParsedPreProcessor, StrictlyP
         out.writeString(field);
         out.writeString(featureName);
         out.writeMap(frequencyMap, StreamOutput::writeString, StreamOutput::writeDouble);
+        if (out.getVersion().onOrAfter(Version.V_8_0_0)) {
+            out.writeBoolean(custom);
+        }
     }
 
     @Override
@@ -133,6 +162,7 @@ public class FrequencyEncoding implements LenientlyParsedPreProcessor, StrictlyP
         builder.field(FIELD.getPreferredName(), field);
         builder.field(FEATURE_NAME.getPreferredName(), featureName);
         builder.field(FREQUENCY_MAP.getPreferredName(), frequencyMap);
+        builder.field(CUSTOM.getPreferredName(), custom);
         builder.endObject();
         return builder;
     }
@@ -144,12 +174,13 @@ public class FrequencyEncoding implements LenientlyParsedPreProcessor, StrictlyP
         FrequencyEncoding that = (FrequencyEncoding) o;
         return Objects.equals(field, that.field)
             && Objects.equals(featureName, that.featureName)
-            && Objects.equals(frequencyMap, that.frequencyMap);
+            && Objects.equals(frequencyMap, that.frequencyMap)
+            && Objects.equals(custom, that.custom);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(field, featureName, frequencyMap);
+        return Objects.hash(field, featureName, frequencyMap, custom);
     }
 
     @Override

+ 38 - 6
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/OneHotEncoding.java

@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.core.ml.inference.preprocessing;
 
 import org.apache.lucene.util.RamUsageEstimator;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -16,10 +17,12 @@ import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Collections;
-import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.function.Function;
 import java.util.stream.Collectors;
 
 /**
@@ -31,6 +34,7 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
     public static final ParseField NAME = new ParseField("one_hot_encoding");
     public static final ParseField FIELD = new ParseField("field");
     public static final ParseField HOT_MAP = new ParseField("hot_map");
+    public static final ParseField CUSTOM = new ParseField("custom");
 
     public static final ConstructingObjectParser<OneHotEncoding, Void> STRICT_PARSER = createParser(false);
     public static final ConstructingObjectParser<OneHotEncoding, Void> LENIENT_PARSER = createParser(true);
@@ -40,9 +44,10 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
         ConstructingObjectParser<OneHotEncoding, Void> parser = new ConstructingObjectParser<>(
             NAME.getPreferredName(),
             lenient,
-            a -> new OneHotEncoding((String)a[0], (Map<String, String>)a[1]));
+            a -> new OneHotEncoding((String)a[0], (Map<String, String>)a[1], (Boolean)a[2]));
         parser.declareString(ConstructingObjectParser.constructorArg(), FIELD);
         parser.declareObject(ConstructingObjectParser.constructorArg(), (p, c) -> p.mapStrings(), HOT_MAP);
+        parser.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), CUSTOM);
         return parser;
     }
 
@@ -56,15 +61,22 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
 
     private final String field;
     private final Map<String, String> hotMap;
+    private final boolean custom;
 
-    public OneHotEncoding(String field, Map<String, String> hotMap) {
+    public OneHotEncoding(String field, Map<String, String> hotMap, Boolean custom) {
         this.field = ExceptionsHelper.requireNonNull(field, FIELD);
         this.hotMap = Collections.unmodifiableMap(ExceptionsHelper.requireNonNull(hotMap, HOT_MAP));
+        this.custom = custom == null ? false : custom;
     }
 
     public OneHotEncoding(StreamInput in) throws IOException {
         this.field = in.readString();
         this.hotMap = Collections.unmodifiableMap(in.readMap(StreamInput::readString, StreamInput::readString));
+        if (in.getVersion().onOrAfter(Version.V_8_0_0)) {
+            this.custom = in.readBoolean();
+        } else {
+            this.custom = false;
+        }
     }
 
     /**
@@ -83,7 +95,12 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
 
     @Override
     public Map<String, String> reverseLookup() {
-        return hotMap.entrySet().stream().collect(Collectors.toMap(HashMap.Entry::getValue, (entry) -> field));
+        return hotMap.values().stream().collect(Collectors.toMap(Function.identity(), (value) -> field));
+    }
+
+    @Override
+    public boolean isCustom() {
+        return custom;
     }
 
     @Override
@@ -91,6 +108,16 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
         return NAME.getPreferredName();
     }
 
+    @Override
+    public List<String> inputFields() {
+        return Collections.singletonList(field);
+    }
+
+    @Override
+    public List<String> outputFields() {
+        return new ArrayList<>(hotMap.values());
+    }
+
     @Override
     public void process(Map<String, Object> fields) {
         Object value = fields.get(field);
@@ -112,6 +139,9 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
     public void writeTo(StreamOutput out) throws IOException {
         out.writeString(field);
         out.writeMap(hotMap, StreamOutput::writeString, StreamOutput::writeString);
+        if (out.getVersion().onOrAfter(Version.V_8_0_0)) {
+            out.writeBoolean(custom);
+        }
     }
 
     @Override
@@ -119,6 +149,7 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
         builder.startObject();
         builder.field(FIELD.getPreferredName(), field);
         builder.field(HOT_MAP.getPreferredName(), hotMap);
+        builder.field(CUSTOM.getPreferredName(), custom);
         builder.endObject();
         return builder;
     }
@@ -129,12 +160,13 @@ public class OneHotEncoding implements LenientlyParsedPreProcessor, StrictlyPars
         if (o == null || getClass() != o.getClass()) return false;
         OneHotEncoding that = (OneHotEncoding) o;
         return Objects.equals(field, that.field)
-            && Objects.equals(hotMap, that.hotMap);
+            && Objects.equals(hotMap, that.hotMap)
+            && Objects.equals(custom, that.custom);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(field, hotMap);
+        return Objects.hash(field, hotMap, custom);
     }
 
     @Override

+ 19 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/PreProcessor.java

@@ -9,6 +9,7 @@ import org.apache.lucene.util.Accountable;
 import org.elasticsearch.common.io.stream.NamedWriteable;
 import org.elasticsearch.xpack.core.ml.utils.NamedXContentObject;
 
+import java.util.List;
 import java.util.Map;
 
 /**
@@ -17,6 +18,16 @@ import java.util.Map;
  */
 public interface PreProcessor extends NamedXContentObject, NamedWriteable, Accountable {
 
+    /**
+     * @return The expected input fields
+     */
+    List<String> inputFields();
+
+    /**
+     * @return The resulting output fields
+     */
+    List<String> outputFields();
+
     /**
      * Process the given fields and their values and return the modified map.
      *
@@ -29,4 +40,12 @@ public interface PreProcessor extends NamedXContentObject, NamedWriteable, Accou
      * @return Reverse lookup map to match resulting features to their original feature name
      */
     Map<String, String> reverseLookup();
+
+    /**
+     * @return Is the pre-processor a custom one provided by the user, or automatically created?
+     *         This changes how feature importance is calculated, as fields generated by custom processors get individual feature
+     *         importance calculations.
+     */
+    boolean isCustom();
+
 }

+ 35 - 4
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/TargetMeanEncoding.java

@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.core.ml.inference.preprocessing;
 
 import org.apache.lucene.util.RamUsageEstimator;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -18,6 +19,7 @@ import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 
@@ -33,6 +35,7 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
     public static final ParseField FEATURE_NAME = new ParseField("feature_name");
     public static final ParseField TARGET_MAP = new ParseField("target_map");
     public static final ParseField DEFAULT_VALUE = new ParseField("default_value");
+    public static final ParseField CUSTOM = new ParseField("custom");
 
     public static final ConstructingObjectParser<TargetMeanEncoding, Void> STRICT_PARSER = createParser(false);
     public static final ConstructingObjectParser<TargetMeanEncoding, Void> LENIENT_PARSER = createParser(true);
@@ -42,13 +45,14 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
         ConstructingObjectParser<TargetMeanEncoding, Void> parser = new ConstructingObjectParser<>(
             NAME.getPreferredName(),
             lenient,
-            a -> new TargetMeanEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2], (Double)a[3]));
+            a -> new TargetMeanEncoding((String)a[0], (String)a[1], (Map<String, Double>)a[2], (Double)a[3], (Boolean)a[4]));
         parser.declareString(ConstructingObjectParser.constructorArg(), FIELD);
         parser.declareString(ConstructingObjectParser.constructorArg(), FEATURE_NAME);
         parser.declareObject(ConstructingObjectParser.constructorArg(),
             (p, c) -> p.map(HashMap::new, XContentParser::doubleValue),
             TARGET_MAP);
         parser.declareDouble(ConstructingObjectParser.constructorArg(), DEFAULT_VALUE);
+        parser.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), CUSTOM);
         return parser;
     }
 
@@ -64,12 +68,14 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
     private final String featureName;
     private final Map<String, Double> meanMap;
     private final double defaultValue;
+    private final boolean custom;
 
-    public TargetMeanEncoding(String field, String featureName, Map<String, Double> meanMap, Double defaultValue) {
+    public TargetMeanEncoding(String field, String featureName, Map<String, Double> meanMap, Double defaultValue, Boolean custom) {
         this.field = ExceptionsHelper.requireNonNull(field, FIELD);
         this.featureName = ExceptionsHelper.requireNonNull(featureName, FEATURE_NAME);
         this.meanMap = Collections.unmodifiableMap(ExceptionsHelper.requireNonNull(meanMap, TARGET_MAP));
         this.defaultValue = ExceptionsHelper.requireNonNull(defaultValue, DEFAULT_VALUE);
+        this.custom = custom == null ? false : custom;
     }
 
     public TargetMeanEncoding(StreamInput in) throws IOException {
@@ -77,6 +83,11 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
         this.featureName = in.readString();
         this.meanMap = Collections.unmodifiableMap(in.readMap(StreamInput::readString, StreamInput::readDouble));
         this.defaultValue = in.readDouble();
+        if (in.getVersion().onOrAfter(Version.V_8_0_0)) {
+            this.custom = in.readBoolean();
+        } else {
+            this.custom = false;
+        }
     }
 
     /**
@@ -112,11 +123,26 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
         return Collections.singletonMap(featureName, field);
     }
 
+    @Override
+    public boolean isCustom() {
+        return custom;
+    }
+
     @Override
     public String getName() {
         return NAME.getPreferredName();
     }
 
+    @Override
+    public List<String> inputFields() {
+        return Collections.singletonList(field);
+    }
+
+    @Override
+    public List<String> outputFields() {
+        return Collections.singletonList(featureName);
+    }
+
     @Override
     public void process(Map<String, Object> fields) {
         Object value = fields.get(field);
@@ -137,6 +163,9 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
         out.writeString(featureName);
         out.writeMap(meanMap, StreamOutput::writeString, StreamOutput::writeDouble);
         out.writeDouble(defaultValue);
+        if (out.getVersion().onOrAfter(Version.V_8_0_0)) {
+            out.writeBoolean(custom);
+        }
     }
 
     @Override
@@ -146,6 +175,7 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
         builder.field(FEATURE_NAME.getPreferredName(), featureName);
         builder.field(TARGET_MAP.getPreferredName(), meanMap);
         builder.field(DEFAULT_VALUE.getPreferredName(), defaultValue);
+        builder.field(CUSTOM.getPreferredName(), custom);
         builder.endObject();
         return builder;
     }
@@ -158,12 +188,13 @@ public class TargetMeanEncoding implements LenientlyParsedPreProcessor, Strictly
         return Objects.equals(field, that.field)
             && Objects.equals(featureName, that.featureName)
             && Objects.equals(meanMap, that.meanMap)
-            && Objects.equals(defaultValue, that.defaultValue);
+            && Objects.equals(defaultValue, that.defaultValue)
+            && Objects.equals(custom, that.custom);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(field, featureName, meanMap, defaultValue);
+        return Objects.hash(field, featureName, meanMap, defaultValue, custom);
     }
 
     @Override

+ 17 - 2
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/FrequencyEncodingTests.java

@@ -17,6 +17,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 
+import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.equalTo;
 
 public class FrequencyEncodingTests extends PreProcessingTests<FrequencyEncoding> {
@@ -37,7 +38,10 @@ public class FrequencyEncodingTests extends PreProcessingTests<FrequencyEncoding
         for (int i = 0; i < valuesSize; i++) {
             valueMap.put(randomAlphaOfLength(10), randomDoubleBetween(0.0, 1.0, false));
         }
-        return new FrequencyEncoding(randomAlphaOfLength(10), randomAlphaOfLength(10), valueMap);
+        return new FrequencyEncoding(randomAlphaOfLength(10),
+            randomAlphaOfLength(10),
+            valueMap,
+            randomBoolean() ? null : randomBoolean());
     }
 
     @Override
@@ -51,7 +55,7 @@ public class FrequencyEncodingTests extends PreProcessingTests<FrequencyEncoding
         Map<String, Double> valueMap = values.stream().collect(Collectors.toMap(Object::toString,
             v -> randomDoubleBetween(0.0, 1.0, false)));
         String encodedFeatureName = "encoded";
-        FrequencyEncoding encoding = new FrequencyEncoding(field, encodedFeatureName, valueMap);
+        FrequencyEncoding encoding = new FrequencyEncoding(field, encodedFeatureName, valueMap, false);
         Object fieldValue = randomFrom(values);
         Map<String, Matcher<? super Object>> matchers = Collections.singletonMap(encodedFeatureName,
             equalTo(valueMap.get(fieldValue.toString())));
@@ -65,4 +69,15 @@ public class FrequencyEncodingTests extends PreProcessingTests<FrequencyEncoding
         testProcess(encoding, fieldValues, matchers);
     }
 
+    public void testInputOutputFields() {
+        String field = randomAlphaOfLength(10);
+        List<Object> values = Arrays.asList("foo", "bar", "foobar", "baz", "farequote", 1.5);
+        Map<String, Double> valueMap = values.stream().collect(Collectors.toMap(Object::toString,
+            v -> randomDoubleBetween(0.0, 1.0, false)));
+        String encodedFeatureName = randomAlphaOfLength(10);
+        FrequencyEncoding encoding = new FrequencyEncoding(field, encodedFeatureName, valueMap, false);
+        assertThat(encoding.inputFields(), containsInAnyOrder(field));
+        assertThat(encoding.outputFields(), containsInAnyOrder(encodedFeatureName));
+    }
+
 }

+ 15 - 2
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/OneHotEncodingTests.java

@@ -17,6 +17,7 @@ import java.util.Map;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 
+import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.equalTo;
 
 public class OneHotEncodingTests extends PreProcessingTests<OneHotEncoding> {
@@ -37,7 +38,9 @@ public class OneHotEncodingTests extends PreProcessingTests<OneHotEncoding> {
         for (int i = 0; i < valuesSize; i++) {
             valueMap.put(randomAlphaOfLength(10), randomAlphaOfLength(10));
         }
-        return new OneHotEncoding(randomAlphaOfLength(10), valueMap);
+        return new OneHotEncoding(randomAlphaOfLength(10),
+            valueMap,
+            randomBoolean() ? randomBoolean() : null);
     }
 
     @Override
@@ -49,7 +52,7 @@ public class OneHotEncodingTests extends PreProcessingTests<OneHotEncoding> {
         String field = "categorical";
         List<Object> values = Arrays.asList("foo", "bar", "foobar", "baz", "farequote", 1.0);
         Map<String, String> valueMap = values.stream().collect(Collectors.toMap(Object::toString, v -> "Column_" + v.toString()));
-        OneHotEncoding encoding = new OneHotEncoding(field, valueMap);
+        OneHotEncoding encoding = new OneHotEncoding(field, valueMap, false);
         Object fieldValue = randomFrom(values);
         Map<String, Object> fieldValues = randomFieldValues(field, fieldValue);
 
@@ -67,4 +70,14 @@ public class OneHotEncodingTests extends PreProcessingTests<OneHotEncoding> {
         testProcess(encoding, fieldValues, matchers);
     }
 
+    public void testInputOutputFields() {
+        String field = randomAlphaOfLength(10);
+        List<Object> values = Arrays.asList("foo", "bar", "foobar", "baz", "farequote", 1.0);
+        Map<String, String> valueMap = values.stream().collect(Collectors.toMap(Object::toString, v -> "Column_" + v.toString()));
+        OneHotEncoding encoding = new OneHotEncoding(field, valueMap, false);
+        assertThat(encoding.inputFields(), containsInAnyOrder(field));
+        assertThat(encoding.outputFields(),
+            containsInAnyOrder(values.stream().map(v -> "Column_" + v.toString()).toArray(String[]::new)));
+    }
+
 }

+ 16 - 2
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/preprocessing/TargetMeanEncodingTests.java

@@ -17,6 +17,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 
+import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.equalTo;
 
 public class TargetMeanEncodingTests extends PreProcessingTests<TargetMeanEncoding> {
@@ -40,7 +41,8 @@ public class TargetMeanEncodingTests extends PreProcessingTests<TargetMeanEncodi
         return new TargetMeanEncoding(randomAlphaOfLength(10),
             randomAlphaOfLength(10),
             valueMap,
-            randomDoubleBetween(0.0, 1.0, false));
+            randomDoubleBetween(0.0, 1.0, false),
+            randomBoolean() ? randomBoolean() : null);
     }
 
     @Override
@@ -55,7 +57,7 @@ public class TargetMeanEncodingTests extends PreProcessingTests<TargetMeanEncodi
             v -> randomDoubleBetween(0.0, 1.0, false)));
         String encodedFeatureName = "encoded";
         Double defaultvalue = randomDouble();
-        TargetMeanEncoding encoding = new TargetMeanEncoding(field, encodedFeatureName, valueMap, defaultvalue);
+        TargetMeanEncoding encoding = new TargetMeanEncoding(field, encodedFeatureName, valueMap, defaultvalue, false);
         Object fieldValue = randomFrom(values);
         Map<String, Matcher<? super Object>> matchers = Collections.singletonMap(encodedFeatureName,
             equalTo(valueMap.get(fieldValue.toString())));
@@ -68,4 +70,16 @@ public class TargetMeanEncodingTests extends PreProcessingTests<TargetMeanEncodi
         testProcess(encoding, fieldValues, matchers);
     }
 
+    public void testInputOutputFields() {
+        String field = randomAlphaOfLength(10);
+        String encodedFeatureName = randomAlphaOfLength(10);
+        Double defaultvalue = randomDouble();
+        List<Object> values = Arrays.asList("foo", "bar", "foobar", "baz", "farequote", 1.0);
+        Map<String, Double> valueMap = values.stream().collect(Collectors.toMap(Object::toString,
+            v -> randomDoubleBetween(0.0, 1.0, false)));
+        TargetMeanEncoding encoding = new TargetMeanEncoding(field, encodedFeatureName, valueMap, defaultvalue, false);
+        assertThat(encoding.inputFields(), containsInAnyOrder(field));
+        assertThat(encoding.outputFields(), containsInAnyOrder(encodedFeatureName));
+    }
+
 }

+ 4 - 4
x-pack/plugin/ml/src/internalClusterTest/java/org/elasticsearch/xpack/ml/integration/ModelInferenceActionIT.java

@@ -74,7 +74,7 @@ public class ModelInferenceActionIT extends MlSingleNodeTestCase {
         TrainedModelConfig config1 = buildTrainedModelConfigBuilder(modelId2)
             .setInput(new TrainedModelInput(Arrays.asList("field.foo", "field.bar", "other.categorical")))
             .setParsedDefinition(new TrainedModelDefinition.Builder()
-                .setPreProcessors(Arrays.asList(new OneHotEncoding("other.categorical", oneHotEncoding)))
+                .setPreProcessors(Arrays.asList(new OneHotEncoding("other.categorical", oneHotEncoding, false)))
                 .setTrainedModel(buildClassification(true)))
             .setVersion(Version.CURRENT)
             .setLicenseLevel(License.OperationMode.PLATINUM.description())
@@ -85,7 +85,7 @@ public class ModelInferenceActionIT extends MlSingleNodeTestCase {
         TrainedModelConfig config2 = buildTrainedModelConfigBuilder(modelId1)
             .setInput(new TrainedModelInput(Arrays.asList("field.foo", "field.bar", "other.categorical")))
             .setParsedDefinition(new TrainedModelDefinition.Builder()
-                .setPreProcessors(Arrays.asList(new OneHotEncoding("other.categorical", oneHotEncoding)))
+                .setPreProcessors(Arrays.asList(new OneHotEncoding("other.categorical", oneHotEncoding, false)))
                 .setTrainedModel(buildRegression()))
             .setVersion(Version.CURRENT)
             .setEstimatedOperations(0)
@@ -203,7 +203,7 @@ public class ModelInferenceActionIT extends MlSingleNodeTestCase {
         TrainedModelConfig config = buildTrainedModelConfigBuilder(modelId)
             .setInput(new TrainedModelInput(Arrays.asList("field.foo", "field.bar", "other.categorical")))
             .setParsedDefinition(new TrainedModelDefinition.Builder()
-                .setPreProcessors(Arrays.asList(new OneHotEncoding("other.categorical", oneHotEncoding)))
+                .setPreProcessors(Arrays.asList(new OneHotEncoding("other.categorical", oneHotEncoding, false)))
                 .setTrainedModel(buildMultiClassClassification()))
             .setVersion(Version.CURRENT)
             .setLicenseLevel(License.OperationMode.PLATINUM.description())
@@ -320,7 +320,7 @@ public class ModelInferenceActionIT extends MlSingleNodeTestCase {
         TrainedModelConfig config = buildTrainedModelConfigBuilder(modelId)
             .setInput(new TrainedModelInput(Arrays.asList("field1", "field2")))
             .setParsedDefinition(new TrainedModelDefinition.Builder()
-                .setPreProcessors(Arrays.asList(new OneHotEncoding("categorical", oneHotEncoding)))
+                .setPreProcessors(Arrays.asList(new OneHotEncoding("categorical", oneHotEncoding, false)))
                 .setTrainedModel(buildRegression()))
             .setVersion(Version.CURRENT)
             .setEstimatedOperations(0)

+ 6 - 6
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/loadingservice/LocalModelTests.java

@@ -67,7 +67,7 @@ public class LocalModelTests extends ESTestCase {
         String modelId = "classification_model";
         List<String> inputFields = Arrays.asList("field.foo", "field.bar", "categorical");
         InferenceDefinition definition = InferenceDefinition.builder()
-            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap())))
+            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap(), false)))
             .setTrainedModel(buildClassificationInference(false))
             .build();
 
@@ -99,7 +99,7 @@ public class LocalModelTests extends ESTestCase {
 
         // Test with labels
         definition = InferenceDefinition.builder()
-            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap())))
+            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap(), false)))
             .setTrainedModel(buildClassificationInference(true))
             .build();
         model = new LocalModel(modelId,
@@ -142,7 +142,7 @@ public class LocalModelTests extends ESTestCase {
         String modelId = "classification_model";
         List<String> inputFields = Arrays.asList("field.foo.keyword", "field.bar", "categorical");
         InferenceDefinition definition = InferenceDefinition.builder()
-            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap())))
+            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap(), false)))
             .setTrainedModel(buildClassificationInference(true))
             .build();
 
@@ -200,7 +200,7 @@ public class LocalModelTests extends ESTestCase {
         doAnswer((args) -> null).when(modelStatsService).queueStats(any(InferenceStats.class), anyBoolean());
         List<String> inputFields = Arrays.asList("foo", "bar", "categorical");
         InferenceDefinition trainedModelDefinition = InferenceDefinition.builder()
-            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap())))
+            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap(), false)))
             .setTrainedModel(buildRegressionInference())
             .build();
         LocalModel model = new LocalModel("regression_model",
@@ -228,7 +228,7 @@ public class LocalModelTests extends ESTestCase {
         doAnswer((args) -> null).when(modelStatsService).queueStats(any(InferenceStats.class), anyBoolean());
         List<String> inputFields = Arrays.asList("foo", "bar", "categorical");
         InferenceDefinition trainedModelDefinition = InferenceDefinition.builder()
-            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap())))
+            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap(), false)))
             .setTrainedModel(buildRegressionInference())
             .build();
         LocalModel model = new LocalModel(
@@ -260,7 +260,7 @@ public class LocalModelTests extends ESTestCase {
         String modelId = "classification_model";
         List<String> inputFields = Arrays.asList("field.foo", "field.bar", "categorical");
         InferenceDefinition definition = InferenceDefinition.builder()
-            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap())))
+            .setPreProcessors(Collections.singletonList(new OneHotEncoding("categorical", oneHotMap(), false)))
             .setTrainedModel(buildClassificationInference(false))
             .build();