|
@@ -80,14 +80,16 @@ public class NerProcessorTests extends ESTestCase {
|
|
|
}
|
|
|
|
|
|
public void testProcessResults_GivenNoTokens() {
|
|
|
- NerProcessor.NerResultProcessor processor = createProcessor(Collections.emptyList(), "");
|
|
|
- NerResults result = (NerResults) processor.processResult(new PyTorchResult("test", null, 0L, null));
|
|
|
+ NerProcessor.NerResultProcessor processor = new NerProcessor.NerResultProcessor(NerProcessor.IobTag.values());
|
|
|
+ BertTokenizer.TokenizationResult tokenization = tokenize(Collections.emptyList(), "");
|
|
|
+ NerResults result = (NerResults) processor.processResult(tokenization, new PyTorchResult("test", null, 0L, null));
|
|
|
assertThat(result.getEntityGroups(), is(empty()));
|
|
|
}
|
|
|
|
|
|
public void testProcessResults() {
|
|
|
- NerProcessor.NerResultProcessor processor =
|
|
|
- createProcessor(Arrays.asList("el", "##astic", "##search", "many", "use", "in", "london"), "Many use Elasticsearch in London");
|
|
|
+ NerProcessor.NerResultProcessor processor = new NerProcessor.NerResultProcessor(NerProcessor.IobTag.values());
|
|
|
+ BertTokenizer.TokenizationResult tokenization = tokenize(Arrays.asList("el", "##astic", "##search", "many", "use", "in", "london"),
|
|
|
+ "Many use Elasticsearch in London");
|
|
|
double[][] scores = {
|
|
|
{ 7, 0, 0, 0, 0, 0, 0, 0, 0}, // many
|
|
|
{ 7, 0, 0, 0, 0, 0, 0, 0, 0}, // use
|
|
@@ -97,7 +99,7 @@ public class NerProcessorTests extends ESTestCase {
|
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0}, // in
|
|
|
{ 0, 0, 0, 0, 0, 0, 0, 6, 0} // london
|
|
|
};
|
|
|
- NerResults result = (NerResults) processor.processResult(new PyTorchResult("1", scores, 1L, null));
|
|
|
+ NerResults result = (NerResults) processor.processResult(tokenization, new PyTorchResult("1", scores, 1L, null));
|
|
|
|
|
|
assertThat(result.getEntityGroups().size(), equalTo(2));
|
|
|
assertThat(result.getEntityGroups().get(0).getWord(), equalTo("elasticsearch"));
|
|
@@ -120,11 +122,9 @@ public class NerProcessorTests extends ESTestCase {
|
|
|
NerProcessor.IobTag.O
|
|
|
};
|
|
|
|
|
|
- NerProcessor.NerResultProcessor processor = createProcessor(
|
|
|
- Arrays.asList("el", "##astic", "##search", "many", "use", "in", "london"),
|
|
|
- "Elasticsearch in London",
|
|
|
- iobMap
|
|
|
- );
|
|
|
+ NerProcessor.NerResultProcessor processor = new NerProcessor.NerResultProcessor(iobMap);
|
|
|
+ BertTokenizer.TokenizationResult tokenization = tokenize(Arrays.asList("el", "##astic", "##search", "many", "use", "in", "london"),
|
|
|
+ "Elasticsearch in London");
|
|
|
|
|
|
double[][] scores = {
|
|
|
{ 0.01, 0.01, 0, 0.01, 0, 0, 7, 3, 0}, // el
|
|
@@ -133,7 +133,7 @@ public class NerProcessorTests extends ESTestCase {
|
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 5}, // in
|
|
|
{ 6, 0, 0, 0, 0, 0, 0, 0, 0} // london
|
|
|
};
|
|
|
- NerResults result = (NerResults) processor.processResult(new PyTorchResult("1", scores, 1L, null));
|
|
|
+ NerResults result = (NerResults) processor.processResult(tokenization, new PyTorchResult("1", scores, 1L, null));
|
|
|
|
|
|
assertThat(result.getEntityGroups().size(), equalTo(2));
|
|
|
assertThat(result.getEntityGroups().get(0).getWord(), equalTo("elasticsearch"));
|
|
@@ -210,21 +210,11 @@ public class NerProcessorTests extends ESTestCase {
|
|
|
assertThat(entityGroups.get(2).getLabel(), equalTo("organisation"));
|
|
|
}
|
|
|
|
|
|
- private static NerProcessor.NerResultProcessor createProcessor(List<String> vocab, String input){
|
|
|
+ private static BertTokenizer.TokenizationResult tokenize(List<String> vocab, String input) {
|
|
|
BertTokenizer tokenizer = BertTokenizer.builder(vocab)
|
|
|
.setDoLowerCase(true)
|
|
|
.setWithSpecialTokens(false)
|
|
|
.build();
|
|
|
- BertTokenizer.TokenizationResult tokenizationResult = tokenizer.tokenize(input);
|
|
|
- return new NerProcessor.NerResultProcessor(tokenizationResult, NerProcessor.IobTag.values());
|
|
|
- }
|
|
|
-
|
|
|
- private static NerProcessor.NerResultProcessor createProcessor(List<String> vocab, String input, NerProcessor.IobTag[] iobMap){
|
|
|
- BertTokenizer tokenizer = BertTokenizer.builder(vocab)
|
|
|
- .setDoLowerCase(true)
|
|
|
- .setWithSpecialTokens(false)
|
|
|
- .build();
|
|
|
- BertTokenizer.TokenizationResult tokenizationResult = tokenizer.tokenize(input);
|
|
|
- return new NerProcessor.NerResultProcessor(tokenizationResult, iobMap);
|
|
|
+ return tokenizer.tokenize(input);
|
|
|
}
|
|
|
}
|