|
@@ -50,7 +50,7 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
new BertTokenization(null, false, null, Tokenization.Truncate.NONE)
|
|
|
).build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun", Tokenization.Truncate.NONE);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("Elastic", "##search", "fun"));
|
|
|
assertArrayEquals(new int[] { 0, 1, 3 }, tokenization.getTokenIds());
|
|
|
assertArrayEquals(new int[] { 0, 0, 1 }, tokenization.getTokenMap());
|
|
@@ -62,7 +62,7 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
|
|
|
ElasticsearchStatusException ex = expectThrows(
|
|
|
ElasticsearchStatusException.class,
|
|
|
- () -> tokenizer.tokenize("Elasticsearch fun with Pancake and Godzilla")
|
|
|
+ () -> tokenizer.tokenize("Elasticsearch fun with Pancake and Godzilla", Tokenization.Truncate.NONE)
|
|
|
);
|
|
|
assertThat(ex.getMessage(), equalTo("Input too large. The tokenized input length [8] exceeds the maximum sequence length [5]"));
|
|
|
|
|
@@ -72,28 +72,34 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
).build();
|
|
|
|
|
|
// Shouldn't throw
|
|
|
- tokenizer.tokenize("Elasticsearch fun with Pancake");
|
|
|
+ tokenizer.tokenize("Elasticsearch fun with Pancake", Tokenization.Truncate.NONE);
|
|
|
|
|
|
// Should throw as special chars add two tokens
|
|
|
- expectThrows(ElasticsearchStatusException.class, () -> specialCharTokenizer.tokenize("Elasticsearch fun with Pancake"));
|
|
|
+ expectThrows(
|
|
|
+ ElasticsearchStatusException.class,
|
|
|
+ () -> specialCharTokenizer.tokenize("Elasticsearch fun with Pancake", Tokenization.Truncate.NONE)
|
|
|
+ );
|
|
|
}
|
|
|
|
|
|
// Exercises tokenization of an input that yields more tokens than the tokenizer returns
// when it is built with Tokenization.Truncate.FIRST: both assertions below expect exactly
// five tokens back. The third BertTokenization argument (5) is presumably the maximum
// sequence length - TODO confirm against the BertTokenization constructor.
// The -/+ lines in this method are diff markers: the change passes the truncate mode
// explicitly as an extra argument to tokenize(...).
public void testTokenizeLargeInputTruncation() {
|
|
|
// Second BertTokenization argument is false here; the expected tokens asserted below
// contain no [CLS]/[SEP] entries.
BertTokenizer tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, new BertTokenization(null, false, 5, Tokenization.Truncate.FIRST))
|
|
|
.build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun with Pancake and Godzilla");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize(
|
|
|
+ "Elasticsearch fun with Pancake and Godzilla",
|
|
|
+ Tokenization.Truncate.FIRST
|
|
|
+ );
|
|
|
// Expect the leading five tokens of the text; "and" and "Godzilla" are not in the result.
assertThat(tokenization.getTokens(), arrayContaining("Elastic", "##search", "fun", "with", "Pancake"));
|
|
|
|
|
|
// Same input, but the second BertTokenization argument is now true: the five expected
// tokens include [CLS] and [SEP], leaving only three tokens taken from the text.
tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, new BertTokenization(null, true, 5, Tokenization.Truncate.FIRST)).build();
|
|
|
- tokenization = tokenizer.tokenize("Elasticsearch fun with Pancake and Godzilla");
|
|
|
+ tokenization = tokenizer.tokenize("Elasticsearch fun with Pancake and Godzilla", Tokenization.Truncate.FIRST);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("[CLS]", "Elastic", "##search", "fun", "[SEP]"));
|
|
|
}
|
|
|
|
|
|
public void testTokenizeAppendSpecialTokens() {
|
|
|
BertTokenizer tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, Tokenization.createDefault()).build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun", Tokenization.Truncate.NONE);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("[CLS]", "Elastic", "##search", "fun", "[SEP]"));
|
|
|
assertArrayEquals(new int[] { 12, 0, 1, 3, 13 }, tokenization.getTokenIds());
|
|
|
assertArrayEquals(new int[] { -1, 0, 0, 1, -1 }, tokenization.getTokenMap());
|
|
@@ -107,7 +113,10 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
.setWithSpecialTokens(false)
|
|
|
.build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch " + specialToken + " fun");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize(
|
|
|
+ "Elasticsearch " + specialToken + " fun",
|
|
|
+ Tokenization.Truncate.NONE
|
|
|
+ );
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("Elastic", "##search", specialToken, "fun"));
|
|
|
assertArrayEquals(new int[] { 0, 1, 15, 3 }, tokenization.getTokenIds());
|
|
|
assertArrayEquals(new int[] { 0, 0, 1, 2 }, tokenization.getTokenMap());
|
|
@@ -120,12 +129,12 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
Tokenization.createDefault()
|
|
|
).setDoLowerCase(false).setWithSpecialTokens(false).build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun", Tokenization.Truncate.NONE);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining(BertTokenizer.UNKNOWN_TOKEN, "fun"));
|
|
|
assertArrayEquals(new int[] { 3, 2 }, tokenization.getTokenIds());
|
|
|
assertArrayEquals(new int[] { 0, 1 }, tokenization.getTokenMap());
|
|
|
|
|
|
- tokenization = tokenizer.tokenize("elasticsearch fun");
|
|
|
+ tokenization = tokenizer.tokenize("elasticsearch fun", Tokenization.Truncate.NONE);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("elastic", "##search", "fun"));
|
|
|
}
|
|
|
|
|
@@ -135,7 +144,7 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
.setWithSpecialTokens(false)
|
|
|
.build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch fun", Tokenization.Truncate.NONE);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("elastic", "##search", "fun"));
|
|
|
}
|
|
|
}
|
|
@@ -143,12 +152,12 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
public void testPunctuation() {
|
|
|
BertTokenizer tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, Tokenization.createDefault()).setWithSpecialTokens(false).build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch, fun.");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch, fun.", Tokenization.Truncate.NONE);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("Elastic", "##search", ",", "fun", "."));
|
|
|
assertArrayEquals(new int[] { 0, 1, 11, 3, 10 }, tokenization.getTokenIds());
|
|
|
assertArrayEquals(new int[] { 0, 0, 1, 2, 3 }, tokenization.getTokenMap());
|
|
|
|
|
|
- tokenization = tokenizer.tokenize("Elasticsearch, fun [MASK].");
|
|
|
+ tokenization = tokenizer.tokenize("Elasticsearch, fun [MASK].", Tokenization.Truncate.NONE);
|
|
|
assertThat(tokenization.getTokens(), arrayContaining("Elastic", "##search", ",", "fun", "[MASK]", "."));
|
|
|
assertArrayEquals(new int[] { 0, 1, 11, 3, 14, 10 }, tokenization.getTokenIds());
|
|
|
assertArrayEquals(new int[] { 0, 0, 1, 2, 3, 4 }, tokenization.getTokenMap());
|
|
@@ -162,10 +171,10 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
|
|
|
TokenizationResult tr = tokenizer.buildTokenizationResult(
|
|
|
List.of(
|
|
|
- tokenizer.tokenize("Elasticsearch"),
|
|
|
- tokenizer.tokenize("my little red car"),
|
|
|
- tokenizer.tokenize("Godzilla day"),
|
|
|
- tokenizer.tokenize("Godzilla Pancake red car day")
|
|
|
+ tokenizer.tokenize("Elasticsearch", Tokenization.Truncate.NONE),
|
|
|
+ tokenizer.tokenize("my little red car", Tokenization.Truncate.NONE),
|
|
|
+ tokenizer.tokenize("Godzilla day", Tokenization.Truncate.NONE),
|
|
|
+ tokenizer.tokenize("Godzilla Pancake red car day", Tokenization.Truncate.NONE)
|
|
|
)
|
|
|
);
|
|
|
assertThat(tr.getTokenizations(), hasSize(4));
|
|
@@ -196,7 +205,11 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
.setDoLowerCase(false)
|
|
|
.setWithSpecialTokens(true)
|
|
|
.build();
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch is fun", "Godzilla my little red car");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize(
|
|
|
+ "Elasticsearch is fun",
|
|
|
+ "Godzilla my little red car",
|
|
|
+ Tokenization.Truncate.NONE
|
|
|
+ );
|
|
|
assertThat(
|
|
|
tokenization.getTokens(),
|
|
|
arrayContaining(
|
|
@@ -222,7 +235,11 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
BertTokenizer tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, new BertTokenization(null, true, 10, Tokenization.Truncate.FIRST))
|
|
|
.build();
|
|
|
|
|
|
- TokenizationResult.Tokenization tokenization = tokenizer.tokenize("Elasticsearch is fun", "Godzilla my little red car");
|
|
|
+ TokenizationResult.Tokenization tokenization = tokenizer.tokenize(
|
|
|
+ "Elasticsearch is fun",
|
|
|
+ "Godzilla my little red car",
|
|
|
+ Tokenization.Truncate.FIRST
|
|
|
+ );
|
|
|
assertThat(
|
|
|
tokenization.getTokens(),
|
|
|
arrayContaining(
|
|
@@ -243,12 +260,12 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
ElasticsearchStatusException.class,
|
|
|
() -> BertTokenizer.builder(TEST_CASED_VOCAB, new BertTokenization(null, true, 8, Tokenization.Truncate.NONE))
|
|
|
.build()
|
|
|
- .tokenize("Elasticsearch is fun", "Godzilla my little red car")
|
|
|
+ .tokenize("Elasticsearch is fun", "Godzilla my little red car", Tokenization.Truncate.NONE)
|
|
|
);
|
|
|
|
|
|
tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, new BertTokenization(null, true, 10, Tokenization.Truncate.SECOND)).build();
|
|
|
|
|
|
- tokenization = tokenizer.tokenize("Elasticsearch is fun", "Godzilla my little red car");
|
|
|
+ tokenization = tokenizer.tokenize("Elasticsearch is fun", "Godzilla my little red car", Tokenization.Truncate.SECOND);
|
|
|
assertThat(
|
|
|
tokenization.getTokens(),
|
|
|
arrayContaining(
|
|
@@ -272,6 +289,6 @@ public class BertTokenizerTests extends ESTestCase {
|
|
|
.setDoLowerCase(false)
|
|
|
.setWithSpecialTokens(false)
|
|
|
.build();
|
|
|
- expectThrows(Exception.class, () -> tokenizer.tokenize("foo", "foo"));
|
|
|
+ expectThrows(Exception.class, () -> tokenizer.tokenize("foo", "foo", Tokenization.Truncate.NONE));
|
|
|
}
|
|
|
}
|