
[ML] Consistently apply the default truncation option for the BERT tokenizer (#80339)

The default is Truncate.FIRST
David Kyle, 3 years ago
commit 0635f2758f

+ 14 - 14
docs/reference/ml/ml-shared.asciidoc

@@ -897,7 +897,7 @@ end::inference-config-classification-num-top-classes[]
 
 tag::inference-config-classification-num-top-feature-importance-values[]
 Specifies the maximum number of
-{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document. Defaults 
+{ml-docs}/ml-feature-importance.html[{feat-imp}] values per document. Defaults
 to 0 which means no {feat-imp} calculation occurs.
 end::inference-config-classification-num-top-feature-importance-values[]
 
@@ -908,7 +908,7 @@ end::inference-config-classification-top-classes-results-field[]
 
 tag::inference-config-classification-prediction-field-type[]
 Specifies the type of the predicted field to write.
-Valid values are: `string`, `number`, `boolean`. When `boolean` is provided 
+Valid values are: `string`, `number`, `boolean`. When `boolean` is provided
 `1.0` is transformed to `true` and `0.0` to `false`.
 end::inference-config-classification-prediction-field-type[]
 
@@ -921,13 +921,13 @@ BERT-style tokenization is to be performed with the enclosed settings.
 end::inference-config-nlp-tokenization-bert[]
 
 tag::inference-config-nlp-tokenization-bert-do-lower-case[]
-Specifies if the tokenization lower-cases the text sequence when building the 
+Specifies if the tokenization lower-cases the text sequence when building the
 tokens.
 end::inference-config-nlp-tokenization-bert-do-lower-case[]
 
 tag::inference-config-nlp-tokenization-bert-truncate[]
 Indicates how tokens are truncated when they exceed `max_sequence_length`.
-The default value is `none`.
+The default value is `first`.
 +
 --
 * `none`: No truncation occurs; the inference request receives an error.
@@ -936,7 +936,7 @@ The default value is `none`.
 					 that sequence is truncated.
 --
 
-NOTE: For `zero_shot_classification`, the hypothesis sequence is always the second 
+NOTE: For `zero_shot_classification`, the hypothesis sequence is always the second
 sequence. Therefore, do not use `second` in this case.
 
 end::inference-config-nlp-tokenization-bert-truncate[]
@@ -951,21 +951,21 @@ Tokenize with special tokens. The tokens typically included in BERT-style tokeni
 end::inference-config-nlp-tokenization-bert-with-special-tokens[]
 
 tag::inference-config-nlp-tokenization-bert-max-sequence-length[]
-Specifies the maximum number of tokens allowed to be output by the tokenizer. 
+Specifies the maximum number of tokens allowed to be output by the tokenizer.
 The default for BERT-style tokenization is `512`.
 end::inference-config-nlp-tokenization-bert-max-sequence-length[]
 
 tag::inference-config-nlp-vocabulary[]
-The configuration for retrieving the vocabulary of the model. The vocabulary is 
-then used at inference time. This information is usually provided automatically 
+The configuration for retrieving the vocabulary of the model. The vocabulary is
+then used at inference time. This information is usually provided automatically
 by storing vocabulary in a known, internally managed index.
 end::inference-config-nlp-vocabulary[]
 
 tag::inference-config-nlp-fill-mask[]
-Configuration for a fill_mask natural language processing (NLP) task. The 
-fill_mask task works with models optimized for a fill mask action. For example, 
-for BERT models, the following text may be provided: "The capital of France is 
-[MASK].". The response indicates the value most likely to replace `[MASK]`. In 
+Configuration for a fill_mask natural language processing (NLP) task. The
+fill_mask task works with models optimized for a fill mask action. For example,
+for BERT models, the following text may be provided: "The capital of France is
+[MASK].". The response indicates the value most likely to replace `[MASK]`. In
 this instance, the most probable token is `paris`.
 end::inference-config-nlp-fill-mask[]
 
@@ -993,7 +993,7 @@ end::inference-config-text-classification[]
 tag::inference-config-text-embedding[]
 Text embedding takes an input sequence and transforms it into a vector of
 numbers. These embeddings capture not simply tokens, but semantic meanings and
-context. These embeddings can be used in a <<dense-vector,dense vector>> field 
+context. These embeddings can be used in a <<dense-vector,dense vector>> field
 for powerful insights.
 end::inference-config-text-embedding[]
 
@@ -1021,7 +1021,7 @@ it is possible to adjust the labels to classify. This makes this type of model
 and task exceptionally flexible.
 +
 --
-If consistently classifying the same labels, it may be better to use a 
+If consistently classifying the same labels, it may be better to use a
 fine-tuned text classification model.
 --
 end::inference-config-zero-shot-classification[]
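
The `truncate` option documented above maps to the `Truncate` argument of the tokenization classes changed in the Java file below. As a minimal sketch of pinning the old behaviour explicitly (assuming the x-pack ML core classes are on the classpath, that `Truncate` is the nested `Tokenization.Truncate` enum, and that the constructor arguments run `doLowerCase`, `withSpecialTokens`, `maxSequenceLength`, `truncate`, as the `createDefault()` call in the Java hunk implies):

import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;

public class ExplicitTruncateSketch {
    public static void main(String[] args) {
        // Callers that relied on the old default (`none`) can keep it by
        // passing Truncate.NONE explicitly instead of omitting the option
        // and inheriting the new `first` default.
        BertTokenization noTruncation = new BertTokenization(
            null,                       // doLowerCase: keep the class default
            null,                       // withSpecialTokens: keep the class default
            null,                       // maxSequenceLength: keep the class default (512)
            Tokenization.Truncate.NONE  // opt out of truncation explicitly
        );
        // Per the docs above, requests built from this config will error
        // rather than truncate when input exceeds max_sequence_length.
        System.out.println("configured: " + noTruncation);
    }
}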

+ 2 - 2
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/Tokenization.java

@@ -47,7 +47,7 @@ public abstract class Tokenization implements NamedXContentObject, NamedWriteabl
     private static final int DEFAULT_MAX_SEQUENCE_LENGTH = 512;
     private static final boolean DEFAULT_DO_LOWER_CASE = false;
     private static final boolean DEFAULT_WITH_SPECIAL_TOKENS = true;
-    private static final Truncate DEFAULT_TRUNCATION = Truncate.NONE;
+    private static final Truncate DEFAULT_TRUNCATION = Truncate.FIRST;
 
     static <T extends Tokenization> void declareCommonFields(ConstructingObjectParser<T, ?> parser) {
         parser.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), DO_LOWER_CASE);
@@ -57,7 +57,7 @@ public abstract class Tokenization implements NamedXContentObject, NamedWriteabl
     }
 
     public static BertTokenization createDefault() {
-        return new BertTokenization(null, null, null, Truncate.FIRST);
+        return new BertTokenization(null, null, null, Tokenization.DEFAULT_TRUNCATION);
     }
 
     protected final boolean doLowerCase;
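
With the constant change above, the parser-side default and `createDefault()` now agree. A minimal sketch of checking that, under the same classpath assumption and additionally assuming `Tokenization` exposes a `getTruncate()` accessor (not shown in this hunk):

import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;

public class DefaultTruncateCheck {
    public static void main(String[] args) {
        // Before this commit the parser fell back to Truncate.NONE while
        // createDefault() hard-coded Truncate.FIRST; after it, both paths
        // resolve to the shared DEFAULT_TRUNCATION constant (FIRST).
        BertTokenization tokenization = Tokenization.createDefault();
        System.out.println("default truncate: " + tokenization.getTruncate());
    }
}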