
[ML] Fix end offset for first_non_blank_line char_filter (#73828)

When the input gets chopped by a char_filter immediately after
a token, that token must be reported as ending at the very end
of the original input, otherwise analysis will have incorrect
offsets when multiple field values are analyzed in the same
_analyze request.

The pattern_replace char_filter already behaves this way.  This PR
changes the new first_non_blank_line char_filter to match.

Fixes elastic/kibana#101255
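
To illustrate the idea, here is a minimal sketch of the two Lucene hooks the diff relies on (addOffCorrectMap / correctOffset). The real BaseCharFilter stores cumulative diffs in arrays; this sketch just uses a TreeMap with the same "largest mapped offset at or below the queried offset" semantics, and the class name is purely illustrative.

import java.util.Map;
import java.util.TreeMap;

// Minimal sketch of a BaseCharFilter-style offset-correction map: each entry
// says "from this output offset onwards, add this many characters to recover
// the original offset". Not the actual Lucene implementation.
public class OffsetCorrectionSketch {

    private final TreeMap<Integer, Integer> corrections = new TreeMap<>();

    void addOffCorrectMap(int outputOffset, int cumulativeDiff) {
        corrections.put(outputOffset, cumulativeDiff);
    }

    int correctOffset(int outputOffset) {
        Map.Entry<Integer, Integer> entry = corrections.floorEntry(outputOffset);
        return entry == null ? outputOffset : outputOffset + entry.getValue();
    }

    public static void main(String[] args) {
        String input = "   \nfirst line\nsecond line";   // length 26
        int prevNewlineIndex = 3;                        // newline before "first line"
        int endIndex = 14;                               // newline after "first line"

        OffsetCorrectionSketch filter = new OffsetCorrectionSketch();
        // Existing mapping: the dropped blank prefix shifts every offset right by 4.
        filter.addOffCorrectMap(0, prevNewlineIndex + 1);
        // Mapping added by this commit: the end of the chopped output maps to
        // the very end of the original input.
        filter.addOffCorrectMap(endIndex - prevNewlineIndex - 1,
                input.length() - endIndex + prevNewlineIndex + 1);

        System.out.println(filter.correctOffset(0));   // 4  -> original start of "first"
        System.out.println(filter.correctOffset(10));  // 26 -> input.length(), not 14
    }
}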
David Roberts 4 years ago
commit 334ad82c99

+ 2 - 0
x-pack/plugin/build.gradle

@@ -121,6 +121,8 @@ tasks.named("yamlRestCompatTest").configure {
     'ml/jobs_get_stats/Test get job stats after uploading data prompting the creation of some stats',
     'ml/jobs_get_stats/Test get job stats for closed job',
     'ml/jobs_get_stats/Test no exception on get job stats with missing index',
+    // TODO: remove the next one after backporting https://github.com/elastic/elasticsearch/pull/73828
+    'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
     'ml/post_data/Test POST data job api, flush, close and verify DataCounts doc',
     'ml/post_data/Test flush with skip_time',
     'ml/set_upgrade_mode/Setting upgrade mode to disabled from enabled',

+ 2 - 0
x-pack/plugin/ml/qa/ml-with-security/build.gradle

@@ -22,6 +22,8 @@ tasks.named("yamlRestTest").configure {
     'ml/ml_classic_analyze/Test analyze API with an analyzer that does what we used to do in native code',
     'ml/ml_standard_analyze/Test analyze API with the standard 7.14 ML analyzer',
     'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
+    'ml/ml_standard_analyze/Test 7.14 analyzer with multiple multiline messages',
+    'ml/ml_standard_analyze/Test 7.14 analyzer with stop words in messages',
     // Remove tests that are expected to throw an exception, because we cannot then
     // know whether to expect an authorization exception or a validation exception
     'ml/calendar_crud/Test get calendar given missing',

+ 3 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilter.java

@@ -93,6 +93,9 @@ public class FirstNonBlankLineCharFilter extends BaseCharFilter {
         }
 
         addOffCorrectMap(0, prevNewlineIndex + 1);
+        if (endIndex < input.length()) {
+            addOffCorrectMap(endIndex - prevNewlineIndex - 1, input.length() - endIndex + prevNewlineIndex + 1);
+        }
         return input.subSequence(prevNewlineIndex + 1, endIndex);
     }
 }
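
For the "   \nfirst line\nsecond line" input used as the first text value in the tests below, prevNewlineIndex is 3 and endIndex is 14, so the new call is addOffCorrectMap(10, 16): output offset 10, the end of the chopped "first line" output, now corrects to 10 + 16 = 26, i.e. input.length(). That is why the expected end_offset in the existing blank-lines YAML test changes from 14 to 26.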

+ 4 - 1
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilterTests.java

@@ -121,8 +121,11 @@ public class FirstNonBlankLineCharFilterTests extends ESTestCase {
         assertThat(new String(output), equalTo(expectedOutput));
 
         int expectedOutputIndex = input.indexOf(expectedOutput);
-        for (int i = 0; i <= expectedOutput.length(); ++i) {
+        for (int i = 0; i < expectedOutput.length(); ++i) {
             assertThat(filter.correctOffset(i), equalTo(expectedOutputIndex + i));
         }
+        // When the input gets chopped by a char filter immediately after a token, that token must be reported as
+        // ending at the very end of the original input, otherwise multi-message analysis will have incorrect offsets
+        assertThat(filter.correctOffset(expectedOutput.length()), equalTo(input.length()));
     }
 }

+ 103 - 1
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml

@@ -111,5 +111,107 @@
   - match: { tokens.0.position: 0 }
   - match: { tokens.1.token: "line" }
   - match: { tokens.1.start_offset: 10 }
-  - match: { tokens.1.end_offset: 14 }
+  - match: { tokens.1.end_offset: 26 }
   - match: { tokens.1.position: 1 }
+
+---
+"Test 7.14 analyzer with multiple multiline messages":
+  - do:
+      indices.analyze:
+        body:  >
+          {
+            "char_filter" : [
+              "first_non_blank_line"
+            ],
+            "tokenizer" : "ml_standard",
+            "filter" : [
+              { "type" : "stop", "stopwords": [
+                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
+                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
+                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+                "GMT", "UTC"
+              ] }
+            ],
+            "text" : [
+              "   \nfirst line\nsecond line",
+              "   \nfirst line of second message\nsecond line of second message"
+            ]
+          }
+  - match: { tokens.0.token: "first" }
+  - match: { tokens.0.start_offset: 4 }
+  - match: { tokens.0.end_offset: 9 }
+  - match: { tokens.0.position: 0 }
+  - match: { tokens.1.token: "line" }
+  - match: { tokens.1.start_offset: 10 }
+  - match: { tokens.1.end_offset: 26 }
+  - match: { tokens.1.position: 1 }
+  - match: { tokens.2.token: "first" }
+  - match: { tokens.2.start_offset: 31 }
+  - match: { tokens.2.end_offset: 36 }
+  - match: { tokens.2.position: 102 }
+  - match: { tokens.3.token: "line" }
+  - match: { tokens.3.start_offset: 37 }
+  - match: { tokens.3.end_offset: 41 }
+  - match: { tokens.3.position: 103 }
+  - match: { tokens.4.token: "of" }
+  - match: { tokens.4.start_offset: 42 }
+  - match: { tokens.4.end_offset: 44 }
+  - match: { tokens.4.position: 104 }
+  - match: { tokens.5.token: "second" }
+  - match: { tokens.5.start_offset: 45 }
+  - match: { tokens.5.end_offset: 51 }
+  - match: { tokens.5.position: 105 }
+  - match: { tokens.6.token: "message" }
+  - match: { tokens.6.start_offset: 52 }
+  - match: { tokens.6.end_offset: 89 }
+  - match: { tokens.6.position: 106 }
+
+---
+"Test 7.14 analyzer with stop words in messages":
+  - do:
+      indices.analyze:
+        body:  >
+          {
+            "char_filter" : [
+              "first_non_blank_line"
+            ],
+            "tokenizer" : "ml_standard",
+            "filter" : [
+              { "type" : "stop", "stopwords": [
+                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
+                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
+                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
+                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+                "GMT", "UTC"
+              ] }
+            ],
+            "text" : [
+              "May 27, 2021 @ 19:51:15.288 UTC log message one",
+              "May 27, 2021 @ 19:52:25.288 UTC log message two"
+            ]
+          }
+  - match: { tokens.0.token: "log" }
+  - match: { tokens.0.start_offset: 32 }
+  - match: { tokens.0.end_offset: 35 }
+  - match: { tokens.0.position: 7 }
+  - match: { tokens.1.token: "message" }
+  - match: { tokens.1.start_offset: 36 }
+  - match: { tokens.1.end_offset: 43 }
+  - match: { tokens.1.position: 8 }
+  - match: { tokens.2.token: "one" }
+  - match: { tokens.2.start_offset: 44 }
+  - match: { tokens.2.end_offset: 47 }
+  - match: { tokens.2.position: 9 }
+  - match: { tokens.3.token: "log" }
+  - match: { tokens.3.start_offset: 80 }
+  - match: { tokens.3.end_offset: 83 }
+  - match: { tokens.3.position: 117 }
+  - match: { tokens.4.token: "message" }
+  - match: { tokens.4.start_offset: 84 }
+  - match: { tokens.4.end_offset: 91 }
+  - match: { tokens.4.position: 118 }
+  - match: { tokens.5.token: "two" }
+  - match: { tokens.5.start_offset: 92 }
+  - match: { tokens.5.end_offset: 95 }
+  - match: { tokens.5.position: 119 }
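
The second message's absolute offsets follow from how _analyze concatenates multiple values. A rough check of the expected numbers, assuming Lucene's default offset gap of 1 between consecutive values (this class is purely illustrative and not part of the PR):

// Sanity check of the multi-message offsets asserted above, assuming an
// offset gap of 1 (Lucene's default) between the two text values.
public class MultiValueOffsetCheck {
    public static void main(String[] args) {
        String first = "   \nfirst line\nsecond line";                                       // length 26
        String second = "   \nfirst line of second message\nsecond line of second message"; // length 62

        int base = first.length() + 1;                       // 27: the second value's offsets start here
        System.out.println(base + second.indexOf("first"));  // 31 -> tokens.2.start_offset
        System.out.println(base + second.length());          // 89 -> tokens.6.end_offset
                                                              //      (chopped input reports the original end)
    }
}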