Browse Source

[ML] add new default char filter `first_line_with_letters` for machine learning categorization (#77457)

The char filter replaces the previous default of `first_non_blank_line`.

`first_non_blank_line` worked well for picking the first line that contained any non-whitespace
characters at all, but log lines like the following were handled poorly:
```
--------------------------------------------------------------------------------

Alias 'foo' already exists and this prevents setting up ILM for logs

--------------------------------------------------------------------------------
```
When combined with the `ml_standard` tokenizer, the first line was used:
```
--------------------------------------------------------------------------------
```
This line contains no valid tokens for our standard tokenizer. Consequently, no tokens were found by the `ml_standard` tokenizer.


The new filter, `first_line_with_letters`, returns the first line containing any letter character (i.e. a character for which `Character#isLetter` returns true).

Given the previously poorly handled log, when combined with our `ml_standard` tokenizer, we get the following, more appropriate, tokens:

```
"tokens" : ["Alias", "foo", "already", "exists", "and", "this", "prevents", "setting", "up", "ILM", "for", "logs"]
```
Benjamin Trent 4 years ago
parent
commit
281ec58b8d

+ 2 - 2
docs/reference/ml/anomaly-detection/apis/get-ml-info.asciidoc

@@ -18,7 +18,7 @@ Returns defaults and limits used by machine learning.
 [[get-ml-info-prereqs]]
 == {api-prereq-title}
 
-Requires the `monitor_ml` cluster privilege. This privilege is included in the 
+Requires the `monitor_ml` cluster privilege. This privilege is included in the
 `machine_learning_user` built-in role.
 
 [[get-ml-info-desc]]
@@ -50,7 +50,7 @@ This is a possible response:
     "anomaly_detectors" : {
       "categorization_analyzer" : {
         "char_filter" : [
-          "first_non_blank_line"
+          "first_line_with_letters"
         ],
         "tokenizer" : "ml_standard",
         "filter" : [

+ 3 - 3
docs/reference/ml/anomaly-detection/ml-configuring-categories.asciidoc

@@ -158,7 +158,7 @@ POST _ml/anomaly_detectors/_validate
   "analysis_config" : {
     "categorization_analyzer" : {
       "char_filter" : [
-        "first_non_blank_line"
+        "first_line_with_letters"
       ],
       "tokenizer" : "ml_standard",
       "filter" : [
@@ -205,7 +205,7 @@ PUT _ml/anomaly_detectors/it_ops_new_logs3
     }],
     "categorization_analyzer":{
       "char_filter" : [
-        "first_non_blank_line" <1>
+        "first_line_with_letters" <1>
       ],
       "tokenizer": {
         "type" : "simple_pattern_split",
@@ -238,7 +238,7 @@ PUT _ml/anomaly_detectors/it_ops_new_logs3
 ----------------------------------
 // TEST[skip:needs-licence]
 
-<1> Only consider the first non-blank line of the message for categorization purposes.
+<1> Only consider the first line of the message with letters for categorization purposes.
 <2> Tokens basically consist of hyphens, digits, letters, underscores, dots and slashes.
 <3> By default, categorization ignores tokens that begin with a digit.
 <4> By default, categorization also ignores tokens that are hexadecimal numbers.

+ 2 - 0
x-pack/plugin/build.gradle

@@ -158,6 +158,8 @@ tasks.named("yamlRestTestV7CompatTest").configure {
       // still this is a cat api, and we don't support them with rest api compatibility. (the test would be very hard to transform too)
       'ml/trained_model_cat_apis/Test cat trained models',
       'service_accounts/10_basic/Test get service accounts', //#76449, will remove upon backport
+      // Mute for backport https://github.com/elastic/elasticsearch/pull/77457
+      'ml/jobs_crud/Test update job'
   ].join(',')
   dependsOn "copyExtraResources"
 }

+ 1 - 1
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/CategorizationAnalyzerConfig.java

@@ -173,7 +173,7 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
     public static CategorizationAnalyzerConfig buildStandardCategorizationAnalyzer(List<String> categorizationFilters) {
 
         return new CategorizationAnalyzerConfig.Builder()
-            .addCharFilter("first_non_blank_line")
+            .addCharFilter("first_line_with_letters")
             .addCategorizationFilters(categorizationFilters)
             .setTokenizer("ml_standard")
             .addDateWordsTokenFilter()

+ 5 - 4
x-pack/plugin/ml/qa/ml-with-security/build.gradle

@@ -20,10 +20,11 @@ tasks.named("yamlRestTest").configure {
     // Remove these tests because they don't call an ML endpoint and we don't want
     // to grant extra permissions to the users used in this test suite
     'ml/ml_classic_analyze/Test analyze API with an analyzer that does what we used to do in native code',
-    'ml/ml_standard_analyze/Test analyze API with the standard 7.14 ML analyzer',
-    'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
-    'ml/ml_standard_analyze/Test 7.14 analyzer with multiple multiline messages',
-    'ml/ml_standard_analyze/Test 7.14 analyzer with stop words in messages',
+    'ml/ml_standard_analyze/Test analyze API with the standard 7.16 ML analyzer',
+    'ml/ml_standard_analyze/Test 7.16 analyzer with blank lines',
+    'ml/ml_standard_analyze/Test 7.16 analyzer with multiple multiline messages',
+    'ml/ml_standard_analyze/Test 7.16 analyzer with stop words in messages',
+    'ml/ml_standard_analyze/Test 7.16 analyzer with stop words in messages and strange lines without letters',
     // Remove tests that are expected to throw an exception, because we cannot then
     // know whether to expect an authorization exception or a validation exception
     'ml/3rd_party_deployment/Test start deployment fails with missing model definition',

+ 7 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

@@ -30,6 +30,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.common.xcontent.ParseField;
 import org.elasticsearch.common.breaker.CircuitBreaker;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
@@ -302,6 +303,8 @@ import org.elasticsearch.xpack.ml.job.JobManager;
 import org.elasticsearch.xpack.ml.job.JobManagerHolder;
 import org.elasticsearch.xpack.ml.job.NodeLoadDetector;
 import org.elasticsearch.xpack.ml.job.UpdateJobProcessNotifier;
+import org.elasticsearch.xpack.ml.job.categorization.FirstLineWithLettersCharFilter;
+import org.elasticsearch.xpack.ml.job.categorization.FirstLineWithLettersCharFilterFactory;
 import org.elasticsearch.xpack.ml.job.categorization.FirstNonBlankLineCharFilter;
 import org.elasticsearch.xpack.ml.job.categorization.FirstNonBlankLineCharFilterFactory;
 import org.elasticsearch.xpack.ml.job.categorization.MlClassicTokenizer;
@@ -1216,7 +1219,10 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
     }
 
     public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
-        return Collections.singletonMap(FirstNonBlankLineCharFilter.NAME, FirstNonBlankLineCharFilterFactory::new);
+        return MapBuilder.<String, AnalysisProvider<CharFilterFactory>>newMapBuilder()
+            .put(FirstNonBlankLineCharFilter.NAME, FirstNonBlankLineCharFilterFactory::new)
+            .put(FirstLineWithLettersCharFilter.NAME, FirstLineWithLettersCharFilterFactory::new)
+            .map();
     }
 
     @Override

+ 104 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstLineWithLettersCharFilter.java

@@ -0,0 +1,104 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.job.categorization;
+
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * A character filter that keeps the first line with alpha letters in the input, and discards everything before and after it.
+ * Treats both <code>\n</code> and <code>\r\n</code> as line endings.
+ *
+ * If there is a line ending at the end of the first valid line, this is discarded.
+ *
+ * A line is considered valid if {@link Character#isLetter} returns
+ * <code>true</code> for any of its characters.
+ *
+ * It is possible to achieve the same effect with a <code>pattern_replace</code> filter, but since this filter
+ * needs to be run on every single message to be categorized it is worth having a more performant specialization.
+ */
+public class FirstLineWithLettersCharFilter extends BaseCharFilter {
+
+    public static final String NAME = "first_line_with_letters";
+
+    private Reader transformedInput;
+
+    FirstLineWithLettersCharFilter(Reader in) {
+        super(in);
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        // Buffer all input on the first call.
+        if (transformedInput == null) {
+            fill();
+        }
+
+        return transformedInput.read(cbuf, off, len);
+    }
+
+    @Override
+    public int read() throws IOException {
+        if (transformedInput == null) {
+            fill();
+        }
+
+        return transformedInput.read();
+    }
+
+    private void fill() throws IOException {
+        StringBuilder buffered = new StringBuilder();
+        char[] temp = new char[1024];
+        for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
+            buffered.append(temp, 0, cnt);
+        }
+        transformedInput = new StringReader(process(buffered).toString());
+    }
+
+    private CharSequence process(CharSequence input) {
+
+        boolean seenLetter = false;
+        int prevNewlineIndex = -1;
+        int endIndex = -1;
+
+        for (int index = 0; index < input.length(); ++index) {
+            if (input.charAt(index) == '\n') {
+                if (seenLetter) {
+                    // With Windows line endings chop the \r as well as the \n
+                    endIndex = (input.charAt(index - 1) == '\r') ? (index - 1) : index;
+                    break;
+                }
+                prevNewlineIndex = index;
+            } else {
+                seenLetter = seenLetter || Character.isLetter(input.charAt(index));
+            }
+        }
+
+        if (seenLetter == false) {
+            return "";
+        }
+
+        if (endIndex == -1) {
+            if (prevNewlineIndex == -1) {
+                // This is pretty likely, as most log messages _aren't_ multiline, so worth optimising
+                // for even though the return at the end of the method would be functionally identical
+                return input;
+            }
+            endIndex = input.length();
+        }
+
+        addOffCorrectMap(0, prevNewlineIndex + 1);
+        if (endIndex < input.length()) {
+            addOffCorrectMap(endIndex - prevNewlineIndex - 1, input.length() - endIndex + prevNewlineIndex + 1);
+        }
+        return input.subSequence(prevNewlineIndex + 1, endIndex);
+    }
+}

+ 27 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstLineWithLettersCharFilterFactory.java

@@ -0,0 +1,27 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.job.categorization;
+
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractCharFilterFactory;
+
+import java.io.Reader;
+
+public class FirstLineWithLettersCharFilterFactory extends AbstractCharFilterFactory {
+
+    public FirstLineWithLettersCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name);
+    }
+
+    @Override
+    public Reader create(Reader tokenStream) {
+        return new FirstLineWithLettersCharFilter(tokenStream);
+    }
+}

+ 134 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstLineWithLettersCharFilterTests.java

@@ -0,0 +1,134 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.job.categorization;
+
+import org.elasticsearch.test.ESTestCase;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import static org.hamcrest.Matchers.equalTo;
+
+public class FirstLineWithLettersCharFilterTests extends ESTestCase {
+
+    public void testEmpty() throws IOException {
+
+        String input = "";
+        FirstLineWithLettersCharFilter filter = new FirstLineWithLettersCharFilter(new StringReader(input));
+
+        assertThat(filter.read(), equalTo(-1));
+    }
+
+    public void testAllBlankOneLine() throws IOException {
+
+        String input = randomFrom("!@#$%^&*()1234567890{}[]\\;':,./<>?`~", "\t", " ", "");
+        if (randomBoolean()) {
+            input = " " + input;
+        }
+        if (randomBoolean()) {
+            input = input + " ";
+        }
+        FirstLineWithLettersCharFilter filter = new FirstLineWithLettersCharFilter(new StringReader(input));
+
+        assertThat(filter.read(), equalTo(-1));
+    }
+
+    public void testNonBlankNoNewlines() throws IOException {
+
+        String input = "the quick brown fox jumped over the lazy dog";
+        if (randomBoolean()) {
+            input = " " + input;
+        }
+        if (randomBoolean()) {
+            input = input + " ";
+        }
+        FirstLineWithLettersCharFilter filter = new FirstLineWithLettersCharFilter(new StringReader(input));
+
+        char[] output = new char[input.length()];
+        assertThat(filter.read(output, 0, output.length), equalTo(input.length()));
+        assertThat(filter.read(), equalTo(-1));
+        assertThat(new String(output), equalTo(input));
+    }
+
+    public void testNonBlankMultiline() throws IOException {
+
+        StringBuilder input = new StringBuilder();
+        String lineEnding = randomBoolean() ? "\n" : "\r\n";
+        for (int lineBeforeNum = randomIntBetween(2, 5); lineBeforeNum > 0; --lineBeforeNum) {
+            for (int charNum = randomIntBetween(0, 5); charNum > 0; --charNum) {
+                input.append(randomBoolean() ? " " : "\t");
+            }
+            input.append(lineEnding);
+        }
+        String lineToKeep = "the quick brown fox jumped over the lazy dog";
+        if (randomBoolean()) {
+            lineToKeep = " " + lineToKeep;
+        }
+        if (randomBoolean()) {
+            lineToKeep = lineToKeep + " ";
+        }
+        input.append(lineToKeep).append(lineEnding);
+        for (int lineAfterNum = randomIntBetween(2, 5); lineAfterNum > 0; --lineAfterNum) {
+            for (int charNum = randomIntBetween(0, 5); charNum > 0; --charNum) {
+                input.append(randomBoolean() ? " " : "more");
+            }
+            if (lineAfterNum > 1 || randomBoolean()) {
+                input.append(lineEnding);
+            }
+        }
+
+        FirstLineWithLettersCharFilter filter = new FirstLineWithLettersCharFilter(new StringReader(input.toString()));
+
+        char[] output = new char[lineToKeep.length()];
+        assertThat(filter.read(output, 0, output.length), equalTo(lineToKeep.length()));
+        assertThat(filter.read(), equalTo(-1));
+        assertThat(new String(output), equalTo(lineToKeep));
+    }
+
+    public void testNoLinesWithLetters() throws IOException {
+        StringBuilder input = new StringBuilder();
+        String lineEnding = randomBoolean() ? "\n" : "\r\n";
+        for (int lineNum = randomIntBetween(2, 5); lineNum > 0; --lineNum) {
+            for (int charNum = randomIntBetween(0, 5); charNum > 0; --charNum) {
+                input.append(randomFrom("!@#$%^&*()1234567890{}[]\\;':,./<>?`~", "\t", " ", ""));
+            }
+            if (lineNum > 1 || randomBoolean()) {
+                input.append(lineEnding);
+            }
+        }
+
+        FirstLineWithLettersCharFilter filter = new FirstLineWithLettersCharFilter(new StringReader(input.toString()));
+
+        assertThat(filter.read(), equalTo(-1));
+    }
+
+    public void testCorrect() throws IOException {
+
+        String input = "  --------------------------------------------------------------------------------\n" +
+            "\n" +
+            "Alias 'foo' already exists and this prevents setting up ILM for logs\n" +
+            "\n" +
+            "--------------------------------------------------------------------------------";
+        FirstLineWithLettersCharFilter filter = new FirstLineWithLettersCharFilter(new StringReader(input));
+
+        String expectedOutput = "Alias 'foo' already exists and this prevents setting up ILM for logs";
+
+        char[] output = new char[expectedOutput.length()];
+        assertThat(filter.read(output, 0, output.length), equalTo(expectedOutput.length()));
+        assertThat(filter.read(), equalTo(-1));
+        assertThat(new String(output), equalTo(expectedOutput));
+
+        int expectedOutputIndex = input.indexOf(expectedOutput);
+        for (int i = 0; i < expectedOutput.length(); ++i) {
+            assertThat(filter.correctOffset(i), equalTo(expectedOutputIndex + i));
+        }
+        // When the input gets chopped by a char filter immediately after a token, that token must be reported as
+        // ending at the very end of the original input, otherwise multi-message analysis will have incorrect offsets
+        assertThat(filter.correctOffset(expectedOutput.length()), equalTo(input.length()));
+    }
+}

+ 1 - 1
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/jobs_crud.yml

@@ -388,7 +388,7 @@
   - length: { analysis_config.categorization_analyzer.filter: 1 }
   - match: { analysis_config.categorization_analyzer.tokenizer: "ml_standard" }
   - length: { analysis_config.categorization_analyzer.char_filter: 3 }
-  - match: { analysis_config.categorization_analyzer.char_filter.0: "first_non_blank_line" }
+  - match: { analysis_config.categorization_analyzer.char_filter.0: "first_line_with_letters" }
   - match: { analysis_config.categorization_analyzer.char_filter.1.pattern: "cat1.*" }
   - match: { analysis_config.categorization_analyzer.char_filter.2.pattern: "cat2.*" }
   - match: { analysis_config.bucket_span: "5m" }

+ 56 - 8
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml

@@ -1,11 +1,11 @@
 ---
-"Test analyze API with the standard 7.14 ML analyzer":
+"Test analyze API with the standard 7.16 ML analyzer":
   - do:
       indices.analyze:
         body:  >
           {
             "char_filter" : [
-              "first_non_blank_line"
+              "first_line_with_letters"
             ],
             "tokenizer" : "ml_standard",
             "filter" : [
@@ -85,13 +85,13 @@
   - match: { tokens.15.position: 22 }
 
 ---
-"Test 7.14 analyzer with blank lines":
+"Test 7.16 analyzer with blank lines":
   - do:
       indices.analyze:
         body:  >
           {
             "char_filter" : [
-              "first_non_blank_line"
+              "first_line_with_letters"
             ],
             "tokenizer" : "ml_standard",
             "filter" : [
@@ -115,13 +115,13 @@
   - match: { tokens.1.position: 1 }
 
 ---
-"Test 7.14 analyzer with multiple multiline messages":
+"Test 7.16 analyzer with multiple multiline messages":
   - do:
       indices.analyze:
         body:  >
           {
             "char_filter" : [
-              "first_non_blank_line"
+              "first_line_with_letters"
             ],
             "tokenizer" : "ml_standard",
             "filter" : [
@@ -168,13 +168,13 @@
   - match: { tokens.6.position: 106 }
 
 ---
-"Test 7.14 analyzer with stop words in messages":
+"Test 7.16 analyzer with stop words in messages":
   - do:
       indices.analyze:
         body:  >
           {
             "char_filter" : [
-              "first_non_blank_line"
+              "first_line_with_letters"
             ],
             "tokenizer" : "ml_standard",
             "filter" : [
@@ -215,3 +215,51 @@
   - match: { tokens.5.start_offset: 92 }
   - match: { tokens.5.end_offset: 95 }
   - match: { tokens.5.position: 119 }
+---
+"Test 7.16 analyzer with stop words in messages and strange lines without letters":
+  - do:
+      indices.analyze:
+        body:  >
+          {
+            "char_filter" : [
+              "first_line_with_letters"
+            ],
+            "tokenizer" : "ml_standard",
+            "filter" : [
+              { "type" : "stop", "stopwords": [
+                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
+                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
+                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+                "GMT", "UTC"
+              ] }
+            ],
+            "text" : [
+              "\n-----\nMay 27, 2021 @ 19:51:15.288 UTC log message one\n-----\n",
+              "\n-----\nMay 27, 2021 @ 19:52:25.288 UTC log message two\n-----\n"
+            ]
+          }
+  - match: { tokens.0.token: "log" }
+  - match: { tokens.0.start_offset: 39 }
+  - match: { tokens.0.end_offset: 42 }
+  - match: { tokens.0.position: 7 }
+  - match: { tokens.1.token: "message" }
+  - match: { tokens.1.start_offset: 43 }
+  - match: { tokens.1.end_offset: 50 }
+  - match: { tokens.1.position: 8 }
+  - match: { tokens.2.token: "one" }
+  - match: { tokens.2.start_offset: 51 }
+  - match: { tokens.2.end_offset: 61 }
+  - match: { tokens.2.position: 9 }
+  - match: { tokens.3.token: "log" }
+  - match: { tokens.3.start_offset: 101 }
+  - match: { tokens.3.end_offset: 104 }
+  - match: { tokens.3.position: 117 }
+  - match: { tokens.4.token: "message" }
+  - match: { tokens.4.start_offset: 105 }
+  - match: { tokens.4.end_offset: 112 }
+  - match: { tokens.4.position: 118 }
+  - match: { tokens.5.token: "two" }
+  - match: { tokens.5.start_offset: 113 }
+  - match: { tokens.5.end_offset: 123 }
+  - match: { tokens.5.position: 119 }