
[ML] adds new mpnet tokenization for nlp models (#82234)

This commit adds support for MPNet-based models.

MPNet models differ from BERT-style models in that:

 - The special tokens are different.
 - Input to the model doesn't require token positions.

To configure an MPNet tokenizer for your PyTorch MPNet-based model:

```
"tokenization": {
  "mpnet": {...}
}
```
The options provided to `mpnet` are the same as those for the previously supported `bert` configuration.
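For instance, a fully specified `mpnet` block could look like the following (illustrative values; the option names are the ones documented in this change):

```
"tokenization": {
  "mpnet": {
    "do_lower_case": false,
    "with_special_tokens": true,
    "max_sequence_length": 512,
    "truncate": "first"
  }
}
```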
Benjamin Trent, 3 years ago
commit 9dc8aea1cb
27 changed files with 1285 additions and 135 deletions
  1. +20 -0  docs/reference/ml/ml-shared.asciidoc
  2. +138 -0  docs/reference/ml/trained-models/apis/get-trained-models.asciidoc
  3. +138 -0  docs/reference/ml/trained-models/apis/put-trained-models.asciidoc
  4. +26 -0  x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/MlInferenceNamedXContentProvider.java
  5. +4 -4  x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/BertTokenizationUpdate.java
  6. +88 -0  x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/MPNetTokenization.java
  7. +111 -0  x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/MPNetTokenizationUpdate.java
  8. +1 -1  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/FillMaskConfigTests.java
  9. +32 -0  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/InferenceConfigTestScaffolding.java
  10. +55 -0  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/MPNetTokenizationTests.java
  11. +1 -1  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/NerConfigTests.java
  12. +8 -7  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/NerConfigUpdateTests.java
  13. +1 -1  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/PassThroughConfigTests.java
  14. +6 -9  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/PassThroughConfigUpdateTests.java
  15. +1 -1  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextClassificationConfigTests.java
  16. +6 -9  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextClassificationConfigUpdateTests.java
  17. +1 -1  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextEmbeddingConfigTests.java
  18. +6 -9  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextEmbeddingConfigUpdateTests.java
  19. +1 -1  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/ZeroShotClassificationConfigTests.java
  20. +6 -9  x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/ZeroShotClassificationConfigUpdateTests.java
  21. +5 -5  x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/BertRequestBuilder.java
  22. +66 -0  x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/MPNetRequestBuilder.java
  23. +171 -77  x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BertTokenizer.java
  24. +186 -0  x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/MPNetTokenizer.java
  25. +7 -0  x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java
  26. +105 -0  x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/MPNetRequestBuilderTests.java
  27. +95 -0  x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/MPNetTokenizerTests.java

+ 20 - 0
docs/reference/ml/ml-shared.asciidoc

@@ -929,6 +929,13 @@ end::inference-config-classification-prediction-field-type[]
 
 tag::inference-config-nlp-tokenization[]
 Indicates the tokenization to perform and the desired settings.
+The default tokenization configuration is `bert`. Valid tokenization
+values are
++
+--
+* `bert`: Use for BERT-style models
+* `mpnet`: Use for MPNet-style models
+--
 end::inference-config-nlp-tokenization[]
 
 tag::inference-config-nlp-tokenization-bert[]
@@ -970,6 +977,19 @@ Specifies the maximum number of tokens allowed to be output by the tokenizer.
 The default for BERT-style tokenization is `512`.
 end::inference-config-nlp-tokenization-bert-max-sequence-length[]
 
+tag::inference-config-nlp-tokenization-mpnet[]
+MPNet-style tokenization is to be performed with the enclosed settings.
+end::inference-config-nlp-tokenization-mpnet[]
+
+tag::inference-config-nlp-tokenization-mpnet-with-special-tokens[]
+Tokenize with special tokens. The tokens typically included in MPNet-style tokenization are:
++
+--
+* `<s>`: The first token of the sequence being classified.
+* `</s>`: Indicates sequence separation.
+--
+end::inference-config-nlp-tokenization-mpnet-with-special-tokens[]
+
 tag::inference-config-nlp-vocabulary[]
 The configuration for retrieving the vocabulary of the model. The vocabulary is
 then used at inference time. This information is usually provided automatically

+ 138 - 0
docs/reference/ml/trained-models/apis/get-trained-models.asciidoc

@@ -202,6 +202,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 ========
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+========
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+========
 =======
 `vocabulary`::::
 (Optional, object)
@@ -260,6 +283,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 ========
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+========
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+========
 =======
 `vocabulary`::::
 (Optional, object)
@@ -311,6 +357,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 ========
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+========
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+========
 =======
 `vocabulary`::::
 (Optional, object)
@@ -385,6 +454,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 ========
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+========
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+========
 =======
 
 `vocabulary`::::
@@ -436,6 +528,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 ========
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+========
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+========
 =======
 `vocabulary`::::
 (Optional, object)
@@ -502,6 +617,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 ========
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+========
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+========
 =======
 `vocabulary`::::
 (Optional, object)

+ 138 - 0
docs/reference/ml/trained-models/apis/put-trained-models.asciidoc

@@ -458,6 +458,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 =======
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+=======
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+=======
 ======
 =====
 
@@ -504,6 +527,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 =======
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+=======
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+=======
 ======
 =====
 
@@ -544,6 +590,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 =======
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+=======
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+=======
 ======
 =====
 
@@ -607,6 +676,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 =======
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+=======
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+=======
 ======
 =====
 `text_embedding`:::
@@ -646,6 +738,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 =======
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+=======
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+=======
 ======
 =====
 `zero_shot_classification`:::
@@ -701,6 +816,29 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenizati
 (Optional, boolean)
 include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-with-special-tokens]
 =======
+`mpnet`::::
+(Optional, object)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet]
++
+.Properties of mpnet
+[%collapsible%open]
+=======
+`do_lower_case`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-do-lower-case]
+
+`max_sequence_length`::::
+(Optional, integer)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-max-sequence-length]
+
+`truncate`::::
+(Optional, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-bert-truncate]
+
+`with_special_tokens`::::
+(Optional, boolean)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=inference-config-nlp-tokenization-mpnet-with-special-tokens]
+=======
 ======
 =====
 ====

+ 26 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/MlInferenceNamedXContentProvider.java

@@ -41,6 +41,8 @@ import org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigUpd
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.LenientlyParsedInferenceConfig;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.LenientlyParsedTrainedModel;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.LenientlyParsedTrainedModelLocation;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenization;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenizationUpdate;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.NerConfig;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.NerConfigUpdate;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.PassThroughConfig;
@@ -435,6 +437,13 @@ public class MlInferenceNamedXContentProvider implements NamedXContentProvider {
                 (p, c) -> BertTokenization.fromXContent(p, (boolean) c)
             )
         );
+        namedXContent.add(
+            new NamedXContentRegistry.Entry(
+                Tokenization.class,
+                MPNetTokenization.NAME,
+                (p, c) -> MPNetTokenization.fromXContent(p, (boolean) c)
+            )
+        );
 
         namedXContent.add(
             new NamedXContentRegistry.Entry(
@@ -443,6 +452,13 @@ public class MlInferenceNamedXContentProvider implements NamedXContentProvider {
                 (p, c) -> BertTokenizationUpdate.fromXContent(p)
             )
         );
+        namedXContent.add(
+            new NamedXContentRegistry.Entry(
+                TokenizationUpdate.class,
+                MPNetTokenizationUpdate.NAME,
+                (p, c) -> MPNetTokenizationUpdate.fromXContent(p)
+            )
+        );
 
         return namedXContent;
     }
@@ -591,6 +607,9 @@ public class MlInferenceNamedXContentProvider implements NamedXContentProvider {
         namedWriteables.add(
             new NamedWriteableRegistry.Entry(Tokenization.class, BertTokenization.NAME.getPreferredName(), BertTokenization::new)
         );
+        namedWriteables.add(
+            new NamedWriteableRegistry.Entry(Tokenization.class, MPNetTokenization.NAME.getPreferredName(), MPNetTokenization::new)
+        );
 
         namedWriteables.add(
             new NamedWriteableRegistry.Entry(
@@ -599,6 +618,13 @@ public class MlInferenceNamedXContentProvider implements NamedXContentProvider {
                 BertTokenizationUpdate::new
             )
         );
+        namedWriteables.add(
+            new NamedWriteableRegistry.Entry(
+                TokenizationUpdate.class,
+                MPNetTokenizationUpdate.NAME.getPreferredName(),
+                MPNetTokenizationUpdate::new
+            )
+        );
 
         return namedWriteables;
     }

+ 4 - 4
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/BertTokenizationUpdate.java

@@ -48,10 +48,6 @@ public class BertTokenizationUpdate implements TokenizationUpdate {
 
     @Override
     public Tokenization apply(Tokenization originalConfig) {
-        if (isNoop()) {
-            return originalConfig;
-        }
-
         if (originalConfig instanceof BertTokenization == false) {
             throw ExceptionsHelper.badRequestException(
                 "Tokenization config of type [{}] can not be updated with a request of type [{}]",
@@ -60,6 +56,10 @@ public class BertTokenizationUpdate implements TokenizationUpdate {
             );
         }
 
+        if (isNoop()) {
+            return originalConfig;
+        }
+
         return new BertTokenization(
             originalConfig.doLowerCase(),
             originalConfig.withSpecialTokens(),

+ 88 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/MPNetTokenization.java

@@ -0,0 +1,88 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.core.ml.inference.trainedmodel;
+
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.xcontent.ConstructingObjectParser;
+import org.elasticsearch.xcontent.ParseField;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentParser;
+
+import java.io.IOException;
+
+public class MPNetTokenization extends Tokenization {
+
+    public static final ParseField NAME = new ParseField("mpnet");
+
+    public static ConstructingObjectParser<MPNetTokenization, Void> createParser(boolean ignoreUnknownFields) {
+        ConstructingObjectParser<MPNetTokenization, Void> parser = new ConstructingObjectParser<>(
+            "mpnet_tokenization",
+            ignoreUnknownFields,
+            a -> new MPNetTokenization(
+                (Boolean) a[0],
+                (Boolean) a[1],
+                (Integer) a[2],
+                a[3] == null ? null : Truncate.fromString((String) a[3])
+            )
+        );
+        Tokenization.declareCommonFields(parser);
+        return parser;
+    }
+
+    private static final ConstructingObjectParser<MPNetTokenization, Void> LENIENT_PARSER = createParser(true);
+    private static final ConstructingObjectParser<MPNetTokenization, Void> STRICT_PARSER = createParser(false);
+
+    public static MPNetTokenization fromXContent(XContentParser parser, boolean lenient) {
+        return lenient ? LENIENT_PARSER.apply(parser, null) : STRICT_PARSER.apply(parser, null);
+    }
+
+    public MPNetTokenization(
+        @Nullable Boolean doLowerCase,
+        @Nullable Boolean withSpecialTokens,
+        @Nullable Integer maxSequenceLength,
+        @Nullable Truncate truncate
+    ) {
+        super(doLowerCase, withSpecialTokens, maxSequenceLength, truncate);
+    }
+
+    public MPNetTokenization(StreamInput in) throws IOException {
+        super(in);
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+    }
+
+    XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
+        return builder;
+    }
+
+    @Override
+    public String getWriteableName() {
+        return NAME.getPreferredName();
+    }
+
+    @Override
+    public String getName() {
+        return NAME.getPreferredName();
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == null || getClass() != o.getClass()) return false;
+        return super.equals(o);
+    }
+
+    @Override
+    public int hashCode() {
+        return super.hashCode();
+    }
+}

+ 111 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/MPNetTokenizationUpdate.java

@@ -0,0 +1,111 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.core.ml.inference.trainedmodel;
+
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.xcontent.ConstructingObjectParser;
+import org.elasticsearch.xcontent.ParseField;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentParser;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+
+import java.io.IOException;
+import java.util.Objects;
+
+public class MPNetTokenizationUpdate implements TokenizationUpdate {
+
+    public static final ParseField NAME = MPNetTokenization.NAME;
+
+    public static ConstructingObjectParser<MPNetTokenizationUpdate, Void> PARSER = new ConstructingObjectParser<>(
+        "mpnet_tokenization_update",
+        a -> new MPNetTokenizationUpdate(a[0] == null ? null : Tokenization.Truncate.fromString((String) a[0]))
+    );
+
+    static {
+        PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), Tokenization.TRUNCATE);
+    }
+
+    public static MPNetTokenizationUpdate fromXContent(XContentParser parser) {
+        return PARSER.apply(parser, null);
+    }
+
+    private final Tokenization.Truncate truncate;
+
+    public MPNetTokenizationUpdate(@Nullable Tokenization.Truncate truncate) {
+        this.truncate = truncate;
+    }
+
+    public MPNetTokenizationUpdate(StreamInput in) throws IOException {
+        this.truncate = in.readOptionalEnum(Tokenization.Truncate.class);
+    }
+
+    @Override
+    public Tokenization apply(Tokenization originalConfig) {
+        if (originalConfig instanceof MPNetTokenization == false) {
+            throw ExceptionsHelper.badRequestException(
+                "Tokenization config of type [{}] can not be updated with a request of type [{}]",
+                originalConfig.getName(),
+                getName()
+            );
+        }
+
+        if (isNoop()) {
+            return originalConfig;
+        }
+
+        return new MPNetTokenization(
+            originalConfig.doLowerCase(),
+            originalConfig.withSpecialTokens(),
+            originalConfig.maxSequenceLength(),
+            this.truncate
+        );
+    }
+
+    @Override
+    public boolean isNoop() {
+        return truncate == null;
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject();
+        builder.field(Tokenization.TRUNCATE.getPreferredName(), truncate.toString());
+        builder.endObject();
+        return builder;
+    }
+
+    @Override
+    public String getWriteableName() {
+        return MPNetTokenization.NAME.getPreferredName();
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        out.writeOptionalEnum(truncate);
+    }
+
+    @Override
+    public String getName() {
+        return MPNetTokenization.NAME.getPreferredName();
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        MPNetTokenizationUpdate that = (MPNetTokenizationUpdate) o;
+        return truncate == that.truncate;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(truncate);
+    }
+}

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/FillMaskConfigTests.java

@@ -50,7 +50,7 @@ public class FillMaskConfigTests extends InferenceConfigItemTestCase<FillMaskCon
     public static FillMaskConfig createRandom() {
         return new FillMaskConfig(
             randomBoolean() ? null : VocabularyConfigTests.createRandom(),
-            randomBoolean() ? null : BertTokenizationTests.createRandom(),
+            randomBoolean() ? null : randomFrom(BertTokenizationTests.createRandom(), MPNetTokenizationTests.createRandom()),
             randomBoolean() ? null : randomInt(),
             randomBoolean() ? null : randomAlphaOfLength(5)
         );

+ 32 - 0
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/InferenceConfigTestScaffolding.java

@@ -0,0 +1,32 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.core.ml.inference.trainedmodel;
+
+public final class InferenceConfigTestScaffolding {
+
+    static Tokenization cloneWithNewTruncation(Tokenization tokenization, Tokenization.Truncate truncate) {
+        return tokenization instanceof MPNetTokenization
+            ? new MPNetTokenization(
+                tokenization.doLowerCase(),
+                tokenization.withSpecialTokens(),
+                tokenization.maxSequenceLength(),
+                truncate
+            )
+            : new BertTokenization(
+                tokenization.doLowerCase(),
+                tokenization.withSpecialTokens(),
+                tokenization.maxSequenceLength(),
+                truncate
+            );
+    }
+
+    static TokenizationUpdate createTokenizationUpdate(Tokenization tokenization, Tokenization.Truncate truncate) {
+        return tokenization instanceof MPNetTokenization ? new MPNetTokenizationUpdate(truncate) : new BertTokenizationUpdate(truncate);
+    }
+
+}

+ 55 - 0
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/MPNetTokenizationTests.java

@@ -0,0 +1,55 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.core.ml.inference.trainedmodel;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.xcontent.XContentParser;
+import org.elasticsearch.xpack.core.ml.AbstractBWCSerializationTestCase;
+import org.junit.Before;
+
+import java.io.IOException;
+
+public class MPNetTokenizationTests extends AbstractBWCSerializationTestCase<MPNetTokenization> {
+
+    private boolean lenient;
+
+    @Before
+    public void chooseStrictOrLenient() {
+        lenient = randomBoolean();
+    }
+
+    @Override
+    protected MPNetTokenization doParseInstance(XContentParser parser) throws IOException {
+        return MPNetTokenization.createParser(lenient).apply(parser, null);
+    }
+
+    @Override
+    protected Writeable.Reader<MPNetTokenization> instanceReader() {
+        return MPNetTokenization::new;
+    }
+
+    @Override
+    protected MPNetTokenization createTestInstance() {
+        return createRandom();
+    }
+
+    @Override
+    protected MPNetTokenization mutateInstanceForVersion(MPNetTokenization instance, Version version) {
+        return instance;
+    }
+
+    public static MPNetTokenization createRandom() {
+        return new MPNetTokenization(
+            randomBoolean() ? null : randomBoolean(),
+            randomBoolean() ? null : randomBoolean(),
+            randomBoolean() ? null : randomIntBetween(1, 1024),
+            randomBoolean() ? null : randomFrom(Tokenization.Truncate.values())
+        );
+    }
+}

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/NerConfigTests.java

@@ -50,7 +50,7 @@ public class NerConfigTests extends InferenceConfigItemTestCase<NerConfig> {
     public static NerConfig createRandom() {
         return new NerConfig(
             randomBoolean() ? null : VocabularyConfigTests.createRandom(),
-            randomBoolean() ? null : BertTokenizationTests.createRandom(),
+            randomBoolean() ? null : randomFrom(BertTokenizationTests.createRandom(), MPNetTokenizationTests.createRandom()),
             randomBoolean() ? null : randomList(5, () -> randomAlphaOfLength(10)),
             randomBoolean() ? null : randomAlphaOfLength(5)
         );

+ 8 - 7
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/NerConfigUpdateTests.java

@@ -21,6 +21,8 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.cloneWithNewTruncation;
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.createTokenizationUpdate;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.sameInstance;
 
@@ -65,12 +67,7 @@ public class NerConfigUpdateTests extends AbstractBWCSerializationTestCase<NerCo
         );
 
         Tokenization.Truncate truncate = randomFrom(Tokenization.Truncate.values());
-        Tokenization tokenization = new BertTokenization(
-            originalConfig.getTokenization().doLowerCase(),
-            originalConfig.getTokenization().withSpecialTokens(),
-            originalConfig.getTokenization().maxSequenceLength(),
-            truncate
-        );
+        Tokenization tokenization = cloneWithNewTruncation(originalConfig.getTokenization(), truncate);
         assertThat(
             new NerConfig(
                 originalConfig.getVocabularyConfig(),
@@ -78,7 +75,11 @@ public class NerConfigUpdateTests extends AbstractBWCSerializationTestCase<NerCo
                 originalConfig.getClassificationLabels(),
                 originalConfig.getResultsField()
             ),
-            equalTo(new NerConfigUpdate.Builder().setTokenizationUpdate(new BertTokenizationUpdate(truncate)).build().apply(originalConfig))
+            equalTo(
+                new NerConfigUpdate.Builder().setTokenizationUpdate(createTokenizationUpdate(originalConfig.getTokenization(), truncate))
+                    .build()
+                    .apply(originalConfig)
+            )
         );
     }
 

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/PassThroughConfigTests.java

@@ -50,7 +50,7 @@ public class PassThroughConfigTests extends InferenceConfigItemTestCase<PassThro
     public static PassThroughConfig createRandom() {
         return new PassThroughConfig(
             randomBoolean() ? null : VocabularyConfigTests.createRandom(),
-            randomBoolean() ? null : BertTokenizationTests.createRandom(),
+            randomBoolean() ? null : randomFrom(BertTokenizationTests.createRandom(), MPNetTokenizationTests.createRandom()),
             randomBoolean() ? null : randomAlphaOfLength(7)
         );
     }

+ 6 - 9
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/PassThroughConfigUpdateTests.java

@@ -21,6 +21,8 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.cloneWithNewTruncation;
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.createTokenizationUpdate;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.sameInstance;
 
@@ -63,18 +65,13 @@ public class PassThroughConfigUpdateTests extends AbstractBWCSerializationTestCa
         );
 
         Tokenization.Truncate truncate = randomFrom(Tokenization.Truncate.values());
-        Tokenization tokenization = new BertTokenization(
-            originalConfig.getTokenization().doLowerCase(),
-            originalConfig.getTokenization().withSpecialTokens(),
-            originalConfig.getTokenization().maxSequenceLength(),
-            truncate
-        );
+        Tokenization tokenization = cloneWithNewTruncation(originalConfig.getTokenization(), truncate);
         assertThat(
             new PassThroughConfig(originalConfig.getVocabularyConfig(), tokenization, originalConfig.getResultsField()),
             equalTo(
-                new PassThroughConfigUpdate.Builder().setTokenizationUpdate(new BertTokenizationUpdate(truncate))
-                    .build()
-                    .apply(originalConfig)
+                new PassThroughConfigUpdate.Builder().setTokenizationUpdate(
+                    createTokenizationUpdate(originalConfig.getTokenization(), truncate)
+                ).build().apply(originalConfig)
             )
         );
     }

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextClassificationConfigTests.java

@@ -69,7 +69,7 @@ public class TextClassificationConfigTests extends InferenceConfigItemTestCase<T
     public static TextClassificationConfig createRandom() {
         return new TextClassificationConfig(
             randomBoolean() ? null : VocabularyConfigTests.createRandom(),
-            randomBoolean() ? null : BertTokenizationTests.createRandom(),
+            randomBoolean() ? null : randomFrom(BertTokenizationTests.createRandom(), MPNetTokenizationTests.createRandom()),
             randomList(2, 5, () -> randomAlphaOfLength(10)),
             randomBoolean() ? null : randomBoolean() ? -1 : randomIntBetween(1, 10),
             randomBoolean() ? null : randomAlphaOfLength(6)

+ 6 - 9
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextClassificationConfigUpdateTests.java

@@ -23,6 +23,8 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.cloneWithNewTruncation;
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.createTokenizationUpdate;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 
@@ -121,18 +123,13 @@ public class TextClassificationConfigUpdateTests extends AbstractBWCSerializatio
         );
 
         Tokenization.Truncate truncate = randomFrom(Tokenization.Truncate.values());
-        Tokenization tokenization = new BertTokenization(
-            originalConfig.getTokenization().doLowerCase(),
-            originalConfig.getTokenization().withSpecialTokens(),
-            originalConfig.getTokenization().maxSequenceLength(),
-            truncate
-        );
+        Tokenization tokenization = cloneWithNewTruncation(originalConfig.getTokenization(), truncate);
         assertThat(
             new TextClassificationConfig.Builder(originalConfig).setTokenization(tokenization).build(),
             equalTo(
-                new TextClassificationConfigUpdate.Builder().setTokenizationUpdate(new BertTokenizationUpdate(truncate))
-                    .build()
-                    .apply(originalConfig)
+                new TextClassificationConfigUpdate.Builder().setTokenizationUpdate(
+                    createTokenizationUpdate(originalConfig.getTokenization(), truncate)
+                ).build().apply(originalConfig)
             )
         );
     }

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextEmbeddingConfigTests.java

@@ -50,7 +50,7 @@ public class TextEmbeddingConfigTests extends InferenceConfigItemTestCase<TextEm
     public static TextEmbeddingConfig createRandom() {
         return new TextEmbeddingConfig(
             randomBoolean() ? null : VocabularyConfigTests.createRandom(),
-            randomBoolean() ? null : BertTokenizationTests.createRandom(),
+            randomBoolean() ? null : randomFrom(BertTokenizationTests.createRandom(), MPNetTokenizationTests.createRandom()),
             randomBoolean() ? null : randomAlphaOfLength(7)
         );
     }

+ 6 - 9
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/TextEmbeddingConfigUpdateTests.java

@@ -21,6 +21,8 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.cloneWithNewTruncation;
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.createTokenizationUpdate;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.sameInstance;
 
@@ -63,18 +65,13 @@ public class TextEmbeddingConfigUpdateTests extends AbstractBWCSerializationTest
         );
 
         Tokenization.Truncate truncate = randomFrom(Tokenization.Truncate.values());
-        Tokenization tokenization = new BertTokenization(
-            originalConfig.getTokenization().doLowerCase(),
-            originalConfig.getTokenization().withSpecialTokens(),
-            originalConfig.getTokenization().maxSequenceLength(),
-            truncate
-        );
+        Tokenization tokenization = cloneWithNewTruncation(originalConfig.getTokenization(), truncate);
         assertThat(
             new TextEmbeddingConfig(originalConfig.getVocabularyConfig(), tokenization, originalConfig.getResultsField()),
             equalTo(
-                new TextEmbeddingConfigUpdate.Builder().setTokenizationUpdate(new BertTokenizationUpdate(truncate))
-                    .build()
-                    .apply(originalConfig)
+                new TextEmbeddingConfigUpdate.Builder().setTokenizationUpdate(
+                    createTokenizationUpdate(originalConfig.getTokenization(), truncate)
+                ).build().apply(originalConfig)
             )
         );
     }

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/ZeroShotClassificationConfigTests.java

@@ -52,7 +52,7 @@ public class ZeroShotClassificationConfigTests extends InferenceConfigItemTestCa
         return new ZeroShotClassificationConfig(
             randomFrom(List.of("entailment", "neutral", "contradiction"), List.of("contradiction", "neutral", "entailment")),
             randomBoolean() ? null : VocabularyConfigTests.createRandom(),
-            randomBoolean() ? null : BertTokenizationTests.createRandom(),
+            randomBoolean() ? null : randomFrom(BertTokenizationTests.createRandom(), MPNetTokenizationTests.createRandom()),
             randomAlphaOfLength(10),
             randomBoolean(),
             randomBoolean() ? null : randomList(1, 5, () -> randomAlphaOfLength(10)),

+ 6 - 9
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/trainedmodel/ZeroShotClassificationConfigUpdateTests.java

@@ -22,6 +22,8 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.cloneWithNewTruncation;
+import static org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigTestScaffolding.createTokenizationUpdate;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 
@@ -137,12 +139,7 @@ public class ZeroShotClassificationConfigUpdateTests extends InferenceConfigItem
         );
 
         Tokenization.Truncate truncate = randomFrom(Tokenization.Truncate.values());
-        Tokenization tokenization = new BertTokenization(
-            originalConfig.getTokenization().doLowerCase(),
-            originalConfig.getTokenization().withSpecialTokens(),
-            originalConfig.getTokenization().maxSequenceLength(),
-            truncate
-        );
+        Tokenization tokenization = cloneWithNewTruncation(originalConfig.getTokenization(), truncate);
         assertThat(
             new ZeroShotClassificationConfig(
                 originalConfig.getClassificationLabels(),
@@ -154,9 +151,9 @@ public class ZeroShotClassificationConfigUpdateTests extends InferenceConfigItem
                 originalConfig.getResultsField()
             ),
             equalTo(
-                new ZeroShotClassificationConfigUpdate.Builder().setTokenizationUpdate(new BertTokenizationUpdate(truncate))
-                    .build()
-                    .apply(originalConfig)
+                new ZeroShotClassificationConfigUpdate.Builder().setTokenizationUpdate(
+                    createTokenizationUpdate(originalConfig.getTokenization(), truncate)
+                ).build().apply(originalConfig)
             )
         );
     }

+ 5 - 5
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/BertRequestBuilder.java

@@ -11,7 +11,7 @@ import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentFactory;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
-import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizer;
+import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.NlpTokenizer;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.TokenizationResult;
 
 import java.io.IOException;
@@ -26,16 +26,16 @@ public class BertRequestBuilder implements NlpTask.RequestBuilder {
     static final String ARG2 = "arg_2";
     static final String ARG3 = "arg_3";
 
-    private final BertTokenizer tokenizer;
+    private final NlpTokenizer tokenizer;
 
-    public BertRequestBuilder(BertTokenizer tokenizer) {
+    public BertRequestBuilder(NlpTokenizer tokenizer) {
         this.tokenizer = tokenizer;
     }
 
     @Override
     public NlpTask.Request buildRequest(List<String> inputs, String requestId, Tokenization.Truncate truncate) throws IOException {
         if (tokenizer.getPadTokenId().isEmpty()) {
-            throw new IllegalStateException("The input tokenizer does not have a " + BertTokenizer.PAD_TOKEN + " token in its vocabulary");
+            throw new IllegalStateException("The input tokenizer does not have a " + tokenizer.getPadToken() + " token in its vocabulary");
         }
 
         TokenizationResult tokenization = tokenizer.buildTokenizationResult(
@@ -47,7 +47,7 @@ public class BertRequestBuilder implements NlpTask.RequestBuilder {
     @Override
     @Override
     public NlpTask.Request buildRequest(TokenizationResult tokenization, String requestId) throws IOException {
     public NlpTask.Request buildRequest(TokenizationResult tokenization, String requestId) throws IOException {
         if (tokenizer.getPadTokenId().isEmpty()) {
         if (tokenizer.getPadTokenId().isEmpty()) {
-            throw new IllegalStateException("The input tokenizer does not have a " + BertTokenizer.PAD_TOKEN + " token in its vocabulary");
+            throw new IllegalStateException("The input tokenizer does not have a " + tokenizer.getPadToken() + " token in its vocabulary");
         }
         }
         return new NlpTask.Request(tokenization, jsonRequest(tokenization, tokenizer.getPadTokenId().getAsInt(), requestId));
         return new NlpTask.Request(tokenization, jsonRequest(tokenization, tokenizer.getPadTokenId().getAsInt(), requestId));
     }
     }

+ 66 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/MPNetRequestBuilder.java

@@ -0,0 +1,66 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.inference.nlp;
+
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentFactory;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
+import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.NlpTokenizer;
+import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.TokenizationResult;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class MPNetRequestBuilder implements NlpTask.RequestBuilder {
+
+    static final String REQUEST_ID = "request_id";
+    static final String TOKENS = "tokens";
+    static final String ARG1 = "arg_1";
+
+    private final NlpTokenizer tokenizer;
+
+    public MPNetRequestBuilder(NlpTokenizer tokenizer) {
+        this.tokenizer = tokenizer;
+    }
+
+    @Override
+    public NlpTask.Request buildRequest(List<String> inputs, String requestId, Tokenization.Truncate truncate) throws IOException {
+        if (tokenizer.getPadTokenId().isEmpty()) {
+            throw new IllegalStateException("The input tokenizer does not have a " + tokenizer.getPadToken() + " token in its vocabulary");
+        }
+
+        TokenizationResult tokenization = tokenizer.buildTokenizationResult(
+            inputs.stream().map(s -> tokenizer.tokenize(s, truncate)).collect(Collectors.toList())
+        );
+        return buildRequest(tokenization, requestId);
+    }
+
+    @Override
+    public NlpTask.Request buildRequest(TokenizationResult tokenization, String requestId) throws IOException {
+        if (tokenizer.getPadTokenId().isEmpty()) {
+            throw new IllegalStateException("The input tokenizer does not have a " + tokenizer.getPadToken() + " token in its vocabulary");
+        }
+        return new NlpTask.Request(tokenization, jsonRequest(tokenization, tokenizer.getPadTokenId().getAsInt(), requestId));
+    }
+
+    static BytesReference jsonRequest(TokenizationResult tokenization, int padToken, String requestId) throws IOException {
+        XContentBuilder builder = XContentFactory.jsonBuilder();
+        builder.startObject();
+        builder.field(REQUEST_ID, requestId);
+
+        NlpTask.RequestBuilder.writePaddedTokens(TOKENS, tokenization, padToken, (tokens, i) -> tokens.getTokenIds()[i], builder);
+        NlpTask.RequestBuilder.writePaddedTokens(ARG1, tokenization, padToken, (tokens, i) -> 1, builder);
+        builder.endObject();
+
+        // BytesReference.bytes closes the builder
+        return BytesReference.bytes(builder);
+    }
+
+}
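
A minimal usage sketch of this builder, mirroring the MPNetRequestBuilderTests added below (the vocabulary list and the wrapper class name are illustrative only):

```
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenization;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
import org.elasticsearch.xpack.ml.inference.nlp.MPNetRequestBuilder;
import org.elasticsearch.xpack.ml.inference.nlp.NlpTask;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.MPNetTokenizer;

import java.io.IOException;
import java.util.List;

public class MPNetRequestExample {
    public static NlpTask.Request build(List<String> vocab) throws IOException {
        // MPNetTokenization args: doLowerCase, withSpecialTokens, maxSequenceLength, truncate
        MPNetTokenizer tokenizer = MPNetTokenizer.mpBuilder(vocab, new MPNetTokenization(null, null, 512, null)).build();
        MPNetRequestBuilder requestBuilder = new MPNetRequestBuilder(tokenizer);
        // The produced payload carries "request_id", the padded "tokens" ids and "arg_1" only;
        // BertRequestBuilder additionally writes arg_2 and arg_3.
        return requestBuilder.buildRequest(List.of("Elasticsearch fun"), "request1", Tokenization.Truncate.NONE);
    }
}
```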

+ 171 - 77
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BertTokenizer.java

@@ -20,6 +20,8 @@ import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.function.Function;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;

 /**
  * Performs basic tokenization and normalization of input text
@@ -41,7 +43,7 @@ public class BertTokenizer implements NlpTokenizer {

     public static final int DEFAULT_MAX_INPUT_CHARS_PER_WORD = 100;

-    private final Set<String> NEVER_SPLIT = Set.of(MASK_TOKEN);
+    private static final Set<String> NEVER_SPLIT = Set.of(MASK_TOKEN);

     private final WordPieceTokenizer wordPieceTokenizer;
     private final List<String> originalVocab;
@@ -50,10 +52,17 @@ public class BertTokenizer implements NlpTokenizer {
     private final boolean doLowerCase;
     private final boolean doTokenizeCjKChars;
     private final boolean doStripAccents;
-    private final boolean withSpecialTokens;
+    protected final boolean withSpecialTokens;
     private final Set<String> neverSplit;
     private final int maxSequenceLength;
     private final NlpTask.RequestBuilder requestBuilder;
+    private final String sepToken;
+    protected final int sepTokenId;
+    private final String clsToken;
+    private final int clsTokenId;
+    private final String padToken;
+    private final String maskToken;
+    private final String unknownToken;

     protected BertTokenizer(
         List<String> originalVocab,
@@ -63,37 +72,97 @@ public class BertTokenizer implements NlpTokenizer {
         boolean doStripAccents,
         boolean withSpecialTokens,
         int maxSequenceLength,
-        Function<BertTokenizer, NlpTask.RequestBuilder> requestBuilderFactory,
+        Function<NlpTokenizer, NlpTask.RequestBuilder> requestBuilderFactory,
         Set<String> neverSplit
     ) {
-        wordPieceTokenizer = new WordPieceTokenizer(vocab, UNKNOWN_TOKEN, DEFAULT_MAX_INPUT_CHARS_PER_WORD);
+        this(
+            originalVocab,
+            vocab,
+            doLowerCase,
+            doTokenizeCjKChars,
+            doStripAccents,
+            withSpecialTokens,
+            maxSequenceLength,
+            requestBuilderFactory,
+            Sets.union(neverSplit, NEVER_SPLIT),
+            SEPARATOR_TOKEN,
+            CLASS_TOKEN,
+            PAD_TOKEN,
+            MASK_TOKEN,
+            UNKNOWN_TOKEN
+        );
+    }
+
+    protected BertTokenizer(
+        List<String> originalVocab,
+        SortedMap<String, Integer> vocab,
+        boolean doLowerCase,
+        boolean doTokenizeCjKChars,
+        boolean doStripAccents,
+        boolean withSpecialTokens,
+        int maxSequenceLength,
+        Function<NlpTokenizer, NlpTask.RequestBuilder> requestBuilderFactory,
+        Set<String> neverSplit,
+        String sepToken,
+        String clsToken,
+        String padToken,
+        String maskToken,
+        String unknownToken
+    ) {
+        wordPieceTokenizer = new WordPieceTokenizer(vocab, unknownToken, DEFAULT_MAX_INPUT_CHARS_PER_WORD);
         this.originalVocab = originalVocab;
         this.vocab = vocab;
         this.doLowerCase = doLowerCase;
         this.doTokenizeCjKChars = doTokenizeCjKChars;
         this.doStripAccents = doStripAccents;
         this.withSpecialTokens = withSpecialTokens;
-        this.neverSplit = Sets.union(neverSplit, NEVER_SPLIT);
+        this.neverSplit = neverSplit;
         this.maxSequenceLength = maxSequenceLength;
         this.requestBuilder = requestBuilderFactory.apply(this);
-        if (vocab.containsKey(UNKNOWN_TOKEN) == false) {
-            throw ExceptionsHelper.conflictStatusException("stored vocabulary is missing required [{}] token", UNKNOWN_TOKEN);
+        if (vocab.containsKey(unknownToken) == false) {
+            throw ExceptionsHelper.conflictStatusException("stored vocabulary is missing required [{}] token", unknownToken);
         }
-        if (vocab.containsKey(PAD_TOKEN) == false) {
-            throw ExceptionsHelper.conflictStatusException("stored vocabulary is missing required [{}] token", PAD_TOKEN);
+        if (vocab.containsKey(padToken) == false) {
+            throw ExceptionsHelper.conflictStatusException("stored vocabulary is missing required [{}] token", padToken);
         }

         if (withSpecialTokens) {
-            Set<String> missingSpecialTokens = Sets.difference(Set.of(SEPARATOR_TOKEN, CLASS_TOKEN), vocab.keySet());
+            Set<String> missingSpecialTokens = Sets.difference(Set.of(sepToken, clsToken), vocab.keySet());
             if (missingSpecialTokens.isEmpty() == false) {
                 throw ExceptionsHelper.conflictStatusException("stored vocabulary is missing required {} token(s)", missingSpecialTokens);
             }
+            this.sepTokenId = vocab.get(sepToken);
+            this.clsTokenId = vocab.get(clsToken);
+        } else {
+            this.sepTokenId = -1;
+            this.clsTokenId = -1;
         }
+        this.sepToken = sepToken;
+        this.clsToken = clsToken;
+        this.padToken = padToken;
+        this.maskToken = maskToken;
+        this.unknownToken = unknownToken;
+    }
+
+    public String getSepToken() {
+        return sepToken;
+    }
+
+    public String getClsToken() {
+        return clsToken;
+    }
+
+    public String getPadToken() {
+        return padToken;
+    }
+
+    public String getUnknownToken() {
+        return unknownToken;
     }

     @Override
     public OptionalInt getPadTokenId() {
-        Integer pad = vocab.get(PAD_TOKEN);
+        Integer pad = vocab.get(this.padToken);
         if (pad != null) {
             return OptionalInt.of(pad);
         } else {
@@ -103,7 +172,7 @@ public class BertTokenizer implements NlpTokenizer {

     @Override
     public OptionalInt getMaskTokenId() {
-        Integer pad = vocab.get(MASK_TOKEN);
+        Integer pad = vocab.get(this.maskToken);
         if (pad != null) {
             return OptionalInt.of(pad);
         } else {
@@ -113,7 +182,7 @@ public class BertTokenizer implements NlpTokenizer {

     @Override
     public String getMaskToken() {
-        return MASK_TOKEN;
+        return maskToken;
     }

     @Override
@@ -150,6 +219,7 @@ public class BertTokenizer implements NlpTokenizer {
                 case SECOND:
                     isTruncated = true;
                     wordPieceTokenIds = wordPieceTokenIds.subList(0, withSpecialTokens ? maxSequenceLength - 2 : maxSequenceLength);
+                    tokenPositionMap = tokenPositionMap.subList(0, withSpecialTokens ? maxSequenceLength - 2 : maxSequenceLength);
                     break;
                 case NONE:
                     throw ExceptionsHelper.badRequestException(
@@ -158,31 +228,16 @@ public class BertTokenizer implements NlpTokenizer {
                         maxSequenceLength
                     );
             }
-            numTokens = maxSequenceLength;
-        }
-
-        int[] tokenIds = new int[numTokens];
-        int[] tokenMap = new int[numTokens];
-
-        if (withSpecialTokens) {
-            tokenIds[0] = vocab.get(CLASS_TOKEN);
-            tokenMap[0] = SPECIAL_TOKEN_POSITION;
-        }
-
-        int i = withSpecialTokens ? 1 : 0;
-        final int decrementHandler = withSpecialTokens ? 1 : 0;
-        for (var tokenId : wordPieceTokenIds) {
-            tokenIds[i] = tokenId;
-            tokenMap[i] = tokenPositionMap.get(i - decrementHandler);
-            i++;
-        }
-
-        if (withSpecialTokens) {
-            tokenIds[i] = vocab.get(SEPARATOR_TOKEN);
-            tokenMap[i] = SPECIAL_TOKEN_POSITION;
         }
-
-        return new TokenizationResult.Tokenization(seq, innerResult.tokens, isTruncated, tokenIds, tokenMap);
+        BertTokenizationBuilder bertTokenizationBuilder = bertTokenizationBuilder().addTokens(wordPieceTokenIds, tokenPositionMap)
+            .addEndTokensIfNecessary();
+        return new TokenizationResult.Tokenization(
+            seq,
+            innerResult.tokens,
+            isTruncated,
+            bertTokenizationBuilder.buildIds(),
+            bertTokenizationBuilder.buildMap()
+        );
     }

     @Override
@@ -196,39 +251,47 @@ public class BertTokenizer implements NlpTokenizer {
         if (withSpecialTokens == false) {
             throw new IllegalArgumentException("Unable to do sequence pair tokenization without special tokens");
         }
-        // [CLS] seq1 [SEP] seq2 [SEP]
-        int numTokens = wordPieceTokenIdsSeq1.size() + wordPieceTokenIdsSeq2.size() + 3;
+        int extraTokens = getNumExtraTokensForSeqPair();
+        int numTokens = wordPieceTokenIdsSeq1.size() + wordPieceTokenIdsSeq2.size() + extraTokens;

         boolean isTruncated = false;
         if (numTokens > maxSequenceLength) {
             switch (truncate) {
                 case FIRST:
                     isTruncated = true;
-                    if (wordPieceTokenIdsSeq2.size() > maxSequenceLength - 3) {
+                    if (wordPieceTokenIdsSeq2.size() > maxSequenceLength - extraTokens) {
                         throw ExceptionsHelper.badRequestException(
                             "Attempting truncation [{}] but input is too large for the second sequence. "
                                 + "The tokenized input length [{}] exceeds the maximum sequence length [{}], "
                                 + "when taking special tokens into account",
                             truncate.toString(),
                             wordPieceTokenIdsSeq2.size(),
-                            maxSequenceLength - 3
+                            maxSequenceLength - extraTokens
                         );
                     }
-                    wordPieceTokenIdsSeq1 = wordPieceTokenIdsSeq1.subList(0, maxSequenceLength - 3 - wordPieceTokenIdsSeq2.size());
+                    wordPieceTokenIdsSeq1 = wordPieceTokenIdsSeq1.subList(
+                        0,
+                        maxSequenceLength - extraTokens - wordPieceTokenIdsSeq2.size()
+                    );
+                    tokenPositionMapSeq1 = tokenPositionMapSeq1.subList(0, maxSequenceLength - extraTokens - wordPieceTokenIdsSeq2.size());
                     break;
                 case SECOND:
                     isTruncated = true;
-                    if (wordPieceTokenIdsSeq1.size() > maxSequenceLength - 3) {
+                    if (wordPieceTokenIdsSeq1.size() > maxSequenceLength - extraTokens) {
                         throw ExceptionsHelper.badRequestException(
                             "Attempting truncation [{}] but input is too large for the first sequence. "
                                 + "The tokenized input length [{}] exceeds the maximum sequence length [{}], "
                                 + "when taking special tokens into account",
                             truncate.toString(),
                             wordPieceTokenIdsSeq1.size(),
-                            maxSequenceLength - 3
+                            maxSequenceLength - extraTokens
                         );
                     }
-                    wordPieceTokenIdsSeq2 = wordPieceTokenIdsSeq2.subList(0, maxSequenceLength - 3 - wordPieceTokenIdsSeq1.size());
+                    wordPieceTokenIdsSeq2 = wordPieceTokenIdsSeq2.subList(
+                        0,
+                        maxSequenceLength - extraTokens - wordPieceTokenIdsSeq1.size()
+                    );
+                    tokenPositionMapSeq2 = tokenPositionMapSeq2.subList(0, maxSequenceLength - extraTokens - wordPieceTokenIdsSeq1.size());
                     break;
                 case NONE:
                     throw ExceptionsHelper.badRequestException(
@@ -237,38 +300,27 @@ public class BertTokenizer implements NlpTokenizer {
                         maxSequenceLength
                     );
             }
-            numTokens = maxSequenceLength;
-        }
-        int[] tokenIds = new int[numTokens];
-        int[] tokenMap = new int[numTokens];
-
-        tokenIds[0] = vocab.get(CLASS_TOKEN);
-        tokenMap[0] = SPECIAL_TOKEN_POSITION;
-
-        int i = 1;
-        for (var tokenId : wordPieceTokenIdsSeq1) {
-            tokenIds[i] = tokenId;
-            tokenMap[i] = tokenPositionMapSeq1.get(i - 1);
-            i++;
         }
-        tokenIds[i] = vocab.get(SEPARATOR_TOKEN);
-        tokenMap[i] = SPECIAL_TOKEN_POSITION;
-        ++i;
-
-        int j = 0;
-        for (var tokenId : wordPieceTokenIdsSeq2) {
-            tokenIds[i] = tokenId;
-            tokenMap[i] = tokenPositionMapSeq2.get(j);
-            i++;
-            j++;
-        }
-
-        tokenIds[i] = vocab.get(SEPARATOR_TOKEN);
-        tokenMap[i] = SPECIAL_TOKEN_POSITION;
-
+        BertTokenizationBuilder bertTokenizationBuilder = bertTokenizationBuilder().addTokens(wordPieceTokenIdsSeq1, tokenPositionMapSeq1)
+            .addTokens(wordPieceTokenIdsSeq2, tokenPositionMapSeq2)
+            .addEndTokensIfNecessary();
         List<DelimitedToken> tokens = new ArrayList<>(innerResultSeq1.tokens);
         tokens.addAll(innerResultSeq2.tokens);
-        return new TokenizationResult.Tokenization(seq1 + seq2, tokens, isTruncated, tokenIds, tokenMap);
+        return new TokenizationResult.Tokenization(
+            seq1 + seq2,
+            tokens,
+            isTruncated,
+            bertTokenizationBuilder.buildIds(),
+            bertTokenizationBuilder.buildMap()
+        );
+    }
+
+    protected BertTokenizationBuilder bertTokenizationBuilder() {
+        return new BertTokenizationBuilder();
+    }
+
+    protected int getNumExtraTokensForSeqPair() {
+        return 3;
     }

     private InnerTokenization innerTokenize(String seq) {
@@ -280,7 +332,7 @@ public class BertTokenizer implements NlpTokenizer {
         for (int sourceIndex = 0; sourceIndex < tokenSequences.size(); sourceIndex++) {
             String token = tokenSequences.get(sourceIndex).getToken();
             if (neverSplit.contains(token)) {
-                wordPieceTokens.add(vocab.getOrDefault(token, vocab.get(UNKNOWN_TOKEN)));
+                wordPieceTokens.add(vocab.getOrDefault(token, vocab.get(unknownToken)));
                 tokenPositionMap.add(sourceIndex);
             } else {
                 List<Integer> tokens = wordPieceTokenizer.tokenize(tokenSequences.get(sourceIndex));
@@ -319,6 +371,48 @@ public class BertTokenizer implements NlpTokenizer {
         return new Builder(vocab, tokenization);
     }

+    protected class BertTokenizationBuilder {
+        Stream.Builder<IntStream> tokenIds;
+        Stream.Builder<IntStream> tokenMap;
+        int numSeq;
+
+        BertTokenizationBuilder() {
+            tokenIds = Stream.builder();
+            tokenMap = Stream.builder();
+            if (withSpecialTokens) {
+                tokenIds.add(IntStream.of(clsTokenId));
+                tokenMap.add(IntStream.of(SPECIAL_TOKEN_POSITION));
+            }
+        }
+
+        BertTokenizationBuilder addTokens(List<Integer> wordPieceTokenIds, List<Integer> tokenPositionMap) {
+            if (numSeq > 0 && withSpecialTokens) {
+                tokenIds.add(IntStream.of(sepTokenId));
+                tokenMap.add(IntStream.of(SPECIAL_TOKEN_POSITION));
+            }
+            tokenIds.add(wordPieceTokenIds.stream().mapToInt(Integer::valueOf));
+            tokenMap.add(tokenPositionMap.stream().mapToInt(Integer::valueOf));
+            numSeq++;
+            return this;
+        }
+
+        BertTokenizationBuilder addEndTokensIfNecessary() {
+            if (withSpecialTokens) {
+                tokenIds.add(IntStream.of(sepTokenId));
+                tokenMap.add(IntStream.of(SPECIAL_TOKEN_POSITION));
+            }
+            return this;
+        }
+
+        int[] buildIds() {
+            return tokenIds.build().flatMapToInt(Function.identity()).toArray();
+        }
+
+        int[] buildMap() {
+            return tokenMap.build().flatMapToInt(Function.identity()).toArray();
+        }
+    }
+
     public static class Builder {

         protected final List<String> originalVocab;
@@ -329,7 +423,7 @@ public class BertTokenizer implements NlpTokenizer {
         protected int maxSequenceLength;
         protected Boolean doStripAccents = null;
         protected Set<String> neverSplit;
-        protected Function<BertTokenizer, NlpTask.RequestBuilder> requestBuilderFactory = BertRequestBuilder::new;
+        protected Function<NlpTokenizer, NlpTask.RequestBuilder> requestBuilderFactory = BertRequestBuilder::new;

         protected Builder(List<String> vocab, Tokenization tokenization) {
             this.originalVocab = vocab;
@@ -382,7 +476,7 @@ public class BertTokenizer implements NlpTokenizer {
             return this;
         }

-        public Builder setRequestBuilderFactory(Function<BertTokenizer, NlpTask.RequestBuilder> requestBuilderFactory) {
+        public Builder setRequestBuilderFactory(Function<NlpTokenizer, NlpTask.RequestBuilder> requestBuilderFactory) {
             this.requestBuilderFactory = requestBuilderFactory;
             return this;
         }
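
The new BertTokenizationBuilder replaces the hand-maintained array index bookkeeping with stream concatenation: each sequence contributes an IntStream of ids and source positions, and buildIds()/buildMap() flatten them once. A stripped-down sketch of that flattening pattern (the class and method names below are illustrative, not part of the commit):

```
import java.util.List;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;

class TokenConcatSketch {
    // Illustrative only: compose [CLS] + seq1 + [SEP] + seq2 + [SEP] as streams, then flatten once.
    static int[] concat(int clsId, int sepId, List<Integer> seq1, List<Integer> seq2) {
        Stream.Builder<IntStream> parts = Stream.builder();
        parts.add(IntStream.of(clsId));
        parts.add(seq1.stream().mapToInt(Integer::valueOf));
        parts.add(IntStream.of(sepId));
        parts.add(seq2.stream().mapToInt(Integer::valueOf));
        parts.add(IntStream.of(sepId));
        return parts.build().flatMapToInt(Function.identity()).toArray();
    }
}
```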

+ 186 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/MPNetTokenizer.java

@@ -0,0 +1,186 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;
+
+import org.elasticsearch.common.util.set.Sets;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
+import org.elasticsearch.xpack.ml.inference.nlp.MPNetRequestBuilder;
+import org.elasticsearch.xpack.ml.inference.nlp.NlpTask;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+
+/**
+ * Performs basic tokenization and normalization of input text
+ * then tokenizes with the WordPiece algorithm using the given
+ * vocabulary.
+ */
+public class MPNetTokenizer extends BertTokenizer {
+
+    public static final String UNKNOWN_TOKEN = "[UNK]";
+    public static final String SEPARATOR_TOKEN = "</s>";
+    public static final String PAD_TOKEN = "<pad>";
+    public static final String CLASS_TOKEN = "<s>";
+    public static final String MASK_TOKEN = "<mask>";
+    private static final Set<String> NEVER_SPLIT = Set.of(MASK_TOKEN);
+
+    protected MPNetTokenizer(
+        List<String> originalVocab,
+        SortedMap<String, Integer> vocab,
+        boolean doLowerCase,
+        boolean doTokenizeCjKChars,
+        boolean doStripAccents,
+        boolean withSpecialTokens,
+        int maxSequenceLength,
+        Function<NlpTokenizer, NlpTask.RequestBuilder> requestBuilderFactory,
+        Set<String> neverSplit
+    ) {
+        super(
+            originalVocab,
+            vocab,
+            doLowerCase,
+            doTokenizeCjKChars,
+            doStripAccents,
+            withSpecialTokens,
+            maxSequenceLength,
+            requestBuilderFactory,
+            Sets.union(neverSplit, NEVER_SPLIT),
+            SEPARATOR_TOKEN,
+            CLASS_TOKEN,
+            PAD_TOKEN,
+            MASK_TOKEN,
+            UNKNOWN_TOKEN
+        );
+    }
+
+    @Override
+    protected int getNumExtraTokensForSeqPair() {
+        return 4;
+    }
+
+    @Override
+    protected BertTokenizationBuilder bertTokenizationBuilder() {
+        return new MPNetTokenizationBuilder();
+    }
+
+    protected class MPNetTokenizationBuilder extends BertTokenizationBuilder {
+
+        @Override
+        BertTokenizationBuilder addTokens(List<Integer> wordPieceTokenIds, List<Integer> tokenPositionMap) {
+            if (numSeq > 0 && withSpecialTokens) {
+                tokenIds.add(IntStream.of(sepTokenId, sepTokenId));
+                tokenMap.add(IntStream.of(SPECIAL_TOKEN_POSITION, SPECIAL_TOKEN_POSITION));
+            }
+            tokenIds.add(wordPieceTokenIds.stream().mapToInt(Integer::valueOf));
+            tokenMap.add(tokenPositionMap.stream().mapToInt(Integer::valueOf));
+            numSeq++;
+            return this;
+        }
+
+    }
+
+    public static Builder mpBuilder(List<String> vocab, Tokenization tokenization) {
+        return new Builder(vocab, tokenization);
+    }
+
+    public static class Builder {
+
+        protected final List<String> originalVocab;
+        protected final SortedMap<String, Integer> vocab;
+        protected boolean doLowerCase = false;
+        protected boolean doTokenizeCjKChars = true;
+        protected boolean withSpecialTokens = true;
+        protected int maxSequenceLength;
+        protected Boolean doStripAccents = null;
+        protected Set<String> neverSplit;
+        protected Function<NlpTokenizer, NlpTask.RequestBuilder> requestBuilderFactory = MPNetRequestBuilder::new;
+
+        protected Builder(List<String> vocab, Tokenization tokenization) {
+            this.originalVocab = vocab;
+            this.vocab = buildSortedVocab(vocab);
+            this.doLowerCase = tokenization.doLowerCase();
+            this.withSpecialTokens = tokenization.withSpecialTokens();
+            this.maxSequenceLength = tokenization.maxSequenceLength();
+        }
+
+        private static SortedMap<String, Integer> buildSortedVocab(List<String> vocab) {
+            SortedMap<String, Integer> sortedVocab = new TreeMap<>();
+            for (int i = 0; i < vocab.size(); i++) {
+                sortedVocab.put(vocab.get(i), i);
+            }
+            return sortedVocab;
+        }
+
+        public Builder setDoLowerCase(boolean doLowerCase) {
+            this.doLowerCase = doLowerCase;
+            return this;
+        }
+
+        public Builder setDoTokenizeCjKChars(boolean doTokenizeCjKChars) {
+            this.doTokenizeCjKChars = doTokenizeCjKChars;
+            return this;
+        }
+
+        public Builder setDoStripAccents(Boolean doStripAccents) {
+            this.doStripAccents = doStripAccents;
+            return this;
+        }
+
+        public Builder setNeverSplit(Set<String> neverSplit) {
+            this.neverSplit = neverSplit;
+            return this;
+        }
+
+        public Builder setMaxSequenceLength(int maxSequenceLength) {
+            this.maxSequenceLength = maxSequenceLength;
+            return this;
+        }
+
+        /**
+         * Include CLS and SEP tokens
+         * @param withSpecialTokens if true include CLS and SEP tokens
+         * @return this
+         */
+        public Builder setWithSpecialTokens(boolean withSpecialTokens) {
+            this.withSpecialTokens = withSpecialTokens;
+            return this;
+        }
+
+        public Builder setRequestBuilderFactory(Function<NlpTokenizer, NlpTask.RequestBuilder> requestBuilderFactory) {
+            this.requestBuilderFactory = requestBuilderFactory;
+            return this;
+        }
+
+        public MPNetTokenizer build() {
+            // if not set strip accents defaults to the value of doLowerCase
+            if (doStripAccents == null) {
+                doStripAccents = doLowerCase;
+            }
+
+            if (neverSplit == null) {
+                neverSplit = Collections.emptySet();
+            }
+
+            return new MPNetTokenizer(
+                originalVocab,
+                vocab,
+                doLowerCase,
+                doTokenizeCjKChars,
+                doStripAccents,
+                withSpecialTokens,
+                maxSequenceLength,
+                requestBuilderFactory,
+                neverSplit
+            );
+        }
+    }
+}
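
In practice the MPNet overrides change the sequence-pair layout: pairs are rendered as <s> seqA </s> </s> seqB </s>, which is why getNumExtraTokensForSeqPair() returns 4 rather than BERT's 3. A short sketch mirroring testMultiSeqTokenization below (the vocabulary list and wrapper class name are illustrative only):

```
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenization;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.MPNetTokenizer;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.TokenizationResult;

import java.util.List;

class MPNetPairSketch {
    static TokenizationResult.Tokenization tokenizePair(List<String> vocab) {
        MPNetTokenizer tokenizer = MPNetTokenizer.mpBuilder(vocab, new MPNetTokenization(null, false, null, Tokenization.Truncate.NONE))
            .setWithSpecialTokens(true)
            .build();
        // The resulting token ids start with <s>, separate the two inputs with a doubled </s>,
        // and end with a single </s>.
        return tokenizer.tokenize("Elasticsearch is fun", "Godzilla my little red car", Tokenization.Truncate.NONE);
    }
}
```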

+ 7 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java

@@ -8,9 +8,11 @@
 package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import org.elasticsearch.xpack.ml.inference.nlp.BertRequestBuilder;
+import org.elasticsearch.xpack.ml.inference.nlp.MPNetRequestBuilder;
 import org.elasticsearch.xpack.ml.inference.nlp.NlpTask;
 import org.elasticsearch.xpack.ml.inference.nlp.Vocabulary;

@@ -32,6 +34,8 @@ public interface NlpTokenizer {

     OptionalInt getPadTokenId();

+    String getPadToken();
+
     OptionalInt getMaskTokenId();

     String getMaskToken();
@@ -42,6 +46,9 @@ public interface NlpTokenizer {
         if (params instanceof BertTokenization) {
             return BertTokenizer.builder(vocabulary.get(), params).setRequestBuilderFactory(BertRequestBuilder::new).build();
         }
+        if (params instanceof MPNetTokenization) {
+            return MPNetTokenizer.mpBuilder(vocabulary.get(), params).setRequestBuilderFactory(MPNetRequestBuilder::new).build();
+        }
         throw new IllegalArgumentException("unknown tokenization type [" + params.getName() + "]");
     }
 }
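
For callers, the effect of this branch is that the tokenizer (and, through the factory references, its request builder) is selected purely from the tokenization config type attached to the model. A sketch of the assembled dispatch, with an illustrative wrapper class and method name (the branch bodies mirror the factory shown above):

```
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenization;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
import org.elasticsearch.xpack.ml.inference.nlp.BertRequestBuilder;
import org.elasticsearch.xpack.ml.inference.nlp.MPNetRequestBuilder;
import org.elasticsearch.xpack.ml.inference.nlp.Vocabulary;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizer;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.MPNetTokenizer;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.NlpTokenizer;

class TokenizerSelectionSketch {
    // Sketch only: mirrors the dispatch added to NlpTokenizer above.
    static NlpTokenizer fromConfig(Vocabulary vocabulary, Tokenization params) {
        if (params instanceof MPNetTokenization) {
            return MPNetTokenizer.mpBuilder(vocabulary.get(), params).setRequestBuilderFactory(MPNetRequestBuilder::new).build();
        }
        if (params instanceof BertTokenization) {
            return BertTokenizer.builder(vocabulary.get(), params).setRequestBuilderFactory(BertRequestBuilder::new).build();
        }
        throw new IllegalArgumentException("unknown tokenization type [" + params.getName() + "]");
    }
}
```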

+ 105 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/MPNetRequestBuilderTests.java

@@ -0,0 +1,105 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.inference.nlp;
+
+import org.elasticsearch.ElasticsearchStatusException;
+import org.elasticsearch.common.xcontent.XContentHelper;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xcontent.XContentType;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenization;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
+import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.MPNetTokenizer;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.xpack.ml.inference.nlp.tokenizers.MPNetTokenizerTests.TEST_CASED_VOCAB;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.hasSize;
+
+public class MPNetRequestBuilderTests extends ESTestCase {
+
+    public void testBuildRequest() throws IOException {
+        MPNetTokenizer tokenizer = MPNetTokenizer.mpBuilder(TEST_CASED_VOCAB, new MPNetTokenization(null, null, 512, null)).build();
+
+        MPNetRequestBuilder requestBuilder = new MPNetRequestBuilder(tokenizer);
+        NlpTask.Request request = requestBuilder.buildRequest(List.of("Elasticsearch fun"), "request1", Tokenization.Truncate.NONE);
+        Map<String, Object> jsonDocAsMap = XContentHelper.convertToMap(request.processInput, true, XContentType.JSON).v2();
+
+        assertThat(jsonDocAsMap.keySet(), hasSize(3));
+        assertEquals("request1", jsonDocAsMap.get("request_id"));
+        assertEquals(Arrays.asList(12, 0, 1, 3, 13), firstListItemFromMap("tokens", jsonDocAsMap));
+        assertEquals(Arrays.asList(1, 1, 1, 1, 1), firstListItemFromMap("arg_1", jsonDocAsMap));
+    }
+
+    @SuppressWarnings("unchecked")
+    private List<Integer> firstListItemFromMap(String name, Map<String, Object> jsonDocAsMap) {
+        return nthListItemFromMap(name, 0, jsonDocAsMap);
+    }
+
+    @SuppressWarnings("unchecked")
+    public static List<Integer> nthListItemFromMap(String name, int n, Map<String, Object> jsonDocAsMap) {
+        return ((List<List<Integer>>) jsonDocAsMap.get(name)).get(n);
+    }
+
+    public void testInputTooLarge() throws IOException {
+        MPNetTokenizer tokenizer = MPNetTokenizer.mpBuilder(TEST_CASED_VOCAB, new MPNetTokenization(null, null, 5, null)).build();
+        {
+            MPNetRequestBuilder requestBuilder = new MPNetRequestBuilder(tokenizer);
+            ElasticsearchStatusException e = expectThrows(
+                ElasticsearchStatusException.class,
+                () -> requestBuilder.buildRequest(
+                    Collections.singletonList("Elasticsearch fun Elasticsearch fun Elasticsearch fun"),
+                    "request1",
+                    Tokenization.Truncate.NONE
+                )
+            );
+
+            assertThat(
+                e.getMessage(),
+                containsString("Input too large. The tokenized input length [11] exceeds the maximum sequence length [5]")
+            );
+        }
+        {
+            MPNetRequestBuilder requestBuilder = new MPNetRequestBuilder(tokenizer);
+            // input will become 3 tokens + the Class and Separator token = 5 which is
+            // our max sequence length
+            requestBuilder.buildRequest(Collections.singletonList("Elasticsearch fun"), "request1", Tokenization.Truncate.NONE);
+        }
+    }
+
+    @SuppressWarnings("unchecked")
+    public void testBatchWithPadding() throws IOException {
+        MPNetTokenizer tokenizer = MPNetTokenizer.mpBuilder(TEST_CASED_VOCAB, new MPNetTokenization(null, null, 512, null)).build();
+
+        MPNetRequestBuilder requestBuilder = new MPNetRequestBuilder(tokenizer);
+        NlpTask.Request request = requestBuilder.buildRequest(
+            List.of("Elasticsearch", "my little red car", "Godzilla day"),
+            "request1",
+            Tokenization.Truncate.NONE
+        );
+        Map<String, Object> jsonDocAsMap = XContentHelper.convertToMap(request.processInput, true, XContentType.JSON).v2();
+
+        assertThat(jsonDocAsMap.keySet(), hasSize(3));
+        assertThat((List<List<Integer>>) jsonDocAsMap.get("tokens"), hasSize(3));
+        assertThat((List<List<Integer>>) jsonDocAsMap.get("arg_1"), hasSize(3));
+
+        assertEquals("request1", jsonDocAsMap.get("request_id"));
+        assertEquals(Arrays.asList(12, 0, 1, 13, 19, 19), nthListItemFromMap("tokens", 0, jsonDocAsMap));
+        assertEquals(Arrays.asList(1, 1, 1, 1, 19, 19), nthListItemFromMap("arg_1", 0, jsonDocAsMap));
+
+        assertEquals(Arrays.asList(12, 4, 5, 6, 7, 13), nthListItemFromMap("tokens", 1, jsonDocAsMap));
+        assertEquals(Arrays.asList(1, 1, 1, 1, 1, 1), nthListItemFromMap("arg_1", 1, jsonDocAsMap));
+
+        assertEquals(Arrays.asList(12, 8, 9, 16, 13, 19), nthListItemFromMap("tokens", 2, jsonDocAsMap));
+        assertEquals(Arrays.asList(1, 1, 1, 1, 1, 19), nthListItemFromMap("arg_1", 2, jsonDocAsMap));
+    }
+}

+ 95 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/MPNetTokenizerTests.java

@@ -0,0 +1,95 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;
+
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.MPNetTokenization;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.contains;
+
+public class MPNetTokenizerTests extends ESTestCase {
+
+    public static final List<String> TEST_CASED_VOCAB = List.of(
+        "Elastic",
+        "##search",
+        "is",
+        "fun",
+        "my",
+        "little",
+        "red",
+        "car",
+        "God",
+        "##zilla",
+        ".",
+        ",",
+        MPNetTokenizer.CLASS_TOKEN,
+        MPNetTokenizer.SEPARATOR_TOKEN,
+        MPNetTokenizer.MASK_TOKEN,
+        MPNetTokenizer.UNKNOWN_TOKEN,
+        "day",
+        "Pancake",
+        "with",
+        MPNetTokenizer.PAD_TOKEN
+    );
+
+    private List<String> tokenStrings(List<DelimitedToken> tokens) {
+        return tokens.stream().map(DelimitedToken::getToken).collect(Collectors.toList());
+    }
+
+    public void testTokenize() {
+        BertTokenizer tokenizer = MPNetTokenizer.mpBuilder(
+            TEST_CASED_VOCAB,
+            new MPNetTokenization(null, false, null, Tokenization.Truncate.NONE)
+        ).build();
+
+        TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun", Tokenization.Truncate.NONE);
+        assertThat(tokenStrings(tokenization.getTokens()), contains("Elasticsearch", "fun"));
+        assertArrayEquals(new int[] { 0, 1, 3 }, tokenization.getTokenIds());
+        assertArrayEquals(new int[] { 0, 0, 1 }, tokenization.getTokenMap());
+    }
+
+    public void testMultiSeqTokenization() {
+        MPNetTokenizer tokenizer = MPNetTokenizer.mpBuilder(
+            TEST_CASED_VOCAB,
+            new MPNetTokenization(null, false, null, Tokenization.Truncate.NONE)
+        ).setDoLowerCase(false).setWithSpecialTokens(true).build();
+        TokenizationResult.Tokenization tokenization = tokenizer.tokenize(
+            "Elasticsearch is fun",
+            "Godzilla my little red car",
+            Tokenization.Truncate.NONE
+        );
+
+        var tokenStream = Arrays.stream(tokenization.getTokenIds()).mapToObj(TEST_CASED_VOCAB::get).collect(Collectors.toList());
+        assertThat(
+            tokenStream,
+            contains(
+                MPNetTokenizer.CLASS_TOKEN,
+                "Elastic",
+                "##search",
+                "is",
+                "fun",
+                MPNetTokenizer.SEPARATOR_TOKEN,
+                MPNetTokenizer.SEPARATOR_TOKEN,
+                "God",
+                "##zilla",
+                "my",
+                "little",
+                "red",
+                "car",
+                MPNetTokenizer.SEPARATOR_TOKEN
+            )
+        );
+        assertArrayEquals(new int[] { 12, 0, 1, 2, 3, 13, 13, 8, 9, 4, 5, 6, 7, 13 }, tokenization.getTokenIds());
+    }
+
+}