[ML] Add log structure finder functionality (#32788)

This change adds a library to ML that can be used to deduce a log
file's structure given only a sample of the log file.

Eventually this will be used to add an endpoint to ML to make the
functionality available to end users, but this will follow in a
separate change.

The functionality is split out into a library so that it can also be
used by a command line tool without requiring that tool to include
all of the server code.
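
For anyone wanting to experiment, a minimal caller might look like the
following sketch. Only the factory/finder API added below is from this
change; the helper method and its inputs are invented for illustration.

// Hypothetical caller, not part of this commit.
// Assumes java.util.List, java.util.ArrayList and java.io.IOException imports.
static LogStructure deduceCsvStructure(String sample) throws IOException {
    List<String> explanation = new ArrayList<>();
    LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
    if (factory.canCreateFromSample(explanation, sample)) {
        // The charset name and byte order marker flag would come from charset detection
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", false);
        return finder.getStructure();
    }
    return null;
}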
David Roberts · 7 years ago
commit 5ba04e23fc
42 changed files with 5744 additions and 0 deletions
  1. 36 0
      x-pack/plugin/ml/log-structure-finder/build.gradle
  2. 1 0
      x-pack/plugin/ml/log-structure-finder/licenses/icu4j-62.1.jar.sha1
  3. 33 0
      x-pack/plugin/ml/log-structure-finder/licenses/icu4j-LICENSE.txt
  4. 3 0
      x-pack/plugin/ml/log-structure-finder/licenses/icu4j-NOTICE.txt
  5. 1 0
      x-pack/plugin/ml/log-structure-finder/licenses/super-csv-2.4.0.jar.sha1
  6. 203 0
      x-pack/plugin/ml/log-structure-finder/licenses/super-csv-LICENSE.txt
  7. 0 0
      x-pack/plugin/ml/log-structure-finder/licenses/super-csv-NOTICE.txt
  8. 35 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactory.java
  9. 615 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreator.java
  10. 84 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinder.java
  11. 87 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactory.java
  12. 614 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructure.java
  13. 23 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinder.java
  14. 35 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderFactory.java
  15. 232 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java
  16. 238 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtils.java
  17. 38 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactory.java
  18. 37 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactory.java
  19. 486 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinder.java
  20. 201 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinder.java
  21. 39 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactory.java
  22. 427 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinder.java
  23. 35 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactory.java
  24. 172 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinder.java
  25. 122 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactory.java
  26. 38 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactoryTests.java
  27. 326 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreatorTests.java
  28. 46 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactoryTests.java
  29. 39 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderTests.java
  30. 72 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManagerTests.java
  31. 86 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTestCase.java
  32. 83 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTests.java
  33. 292 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtilsTests.java
  34. 23 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactoryTests.java
  35. 28 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactoryTests.java
  36. 293 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinderTests.java
  37. 19 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactoryTests.java
  38. 245 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderTests.java
  39. 242 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinderTests.java
  40. 33 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactoryTests.java
  41. 43 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactoryTests.java
  42. 39 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderTests.java

+ 36 - 0
x-pack/plugin/ml/log-structure-finder/build.gradle

@@ -0,0 +1,36 @@
+import org.elasticsearch.gradle.precommit.PrecommitTasks
+
+apply plugin: 'elasticsearch.build'
+
+archivesBaseName = 'x-pack-log-structure-finder'
+
+description = 'Common code for reverse engineering log structure'
+
+dependencies {
+    compile "org.elasticsearch:elasticsearch-core:${version}"
+    compile "org.elasticsearch:elasticsearch-x-content:${version}"
+    compile project(':libs:grok')
+    compile "com.ibm.icu:icu4j:${versions.icu4j}"
+    compile "net.sf.supercsv:super-csv:${versions.supercsv}"
+
+    testCompile "org.elasticsearch.test:framework:${version}"
+}
+
+configurations {
+    testArtifacts.extendsFrom testRuntime
+}
+task testJar(type: Jar) {
+    appendix 'test'
+    from sourceSets.test.output
+}
+artifacts {
+    // normal es plugins do not publish the jar but we need to since users need it for Transport Clients and extensions
+    archives jar
+    testArtifacts testJar
+}
+
+forbiddenApisMain {
+    // log-structure-finder does not depend on server, so cannot forbid server methods
+    signaturesURLs = [PrecommitTasks.getResource('/forbidden/jdk-signatures.txt')]
+}
+

+ 1 - 0
x-pack/plugin/ml/log-structure-finder/licenses/icu4j-62.1.jar.sha1

@@ -0,0 +1 @@
+7a4d00d5ec5febd252a6182e8b6e87a0a9821f81

+ 33 - 0
x-pack/plugin/ml/log-structure-finder/licenses/icu4j-LICENSE.txt

@@ -0,0 +1,33 @@
+ICU License - ICU 1.8.1 and later
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2012 International Business Machines Corporation and others
+
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do so,
+provided that the above copyright notice(s) and this permission notice appear
+in all copies of the Software and that both the above copyright notice(s) and
+this permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
+LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall not
+be used in advertising or otherwise to promote the sale, use or other
+dealings in this Software without prior written authorization of the
+copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the property of
+their respective owners.

+ 3 - 0
x-pack/plugin/ml/log-structure-finder/licenses/icu4j-NOTICE.txt

@@ -0,0 +1,3 @@
+ICU4J, (under lucene/analysis/icu) is licensed under an MIT style license
+(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
+International Business Machines Corporation and others

+ 1 - 0
x-pack/plugin/ml/log-structure-finder/licenses/super-csv-2.4.0.jar.sha1

@@ -0,0 +1 @@
+017f8708c929029dde48bc298deaf3c7ae2452d3

+ 203 - 0
x-pack/plugin/ml/log-structure-finder/licenses/super-csv-LICENSE.txt

@@ -0,0 +1,203 @@
+/*
+ *                                 Apache License
+ *                           Version 2.0, January 2004
+ *                        http://www.apache.org/licenses/
+ *
+ *   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+ *
+ *   1. Definitions.
+ *
+ *      "License" shall mean the terms and conditions for use, reproduction,
+ *      and distribution as defined by Sections 1 through 9 of this document.
+ *
+ *      "Licensor" shall mean the copyright owner or entity authorized by
+ *      the copyright owner that is granting the License.
+ *
+ *      "Legal Entity" shall mean the union of the acting entity and all
+ *      other entities that control, are controlled by, or are under common
+ *      control with that entity. For the purposes of this definition,
+ *      "control" means (i) the power, direct or indirect, to cause the
+ *      direction or management of such entity, whether by contract or
+ *      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ *      outstanding shares, or (iii) beneficial ownership of such entity.
+ *
+ *      "You" (or "Your") shall mean an individual or Legal Entity
+ *      exercising permissions granted by this License.
+ *
+ *      "Source" form shall mean the preferred form for making modifications,
+ *      including but not limited to software source code, documentation
+ *      source, and configuration files.
+ *
+ *      "Object" form shall mean any form resulting from mechanical
+ *      transformation or translation of a Source form, including but
+ *      not limited to compiled object code, generated documentation,
+ *      and conversions to other media types.
+ *
+ *      "Work" shall mean the work of authorship, whether in Source or
+ *      Object form, made available under the License, as indicated by a
+ *      copyright notice that is included in or attached to the work
+ *      (an example is provided in the Appendix below).
+ *
+ *      "Derivative Works" shall mean any work, whether in Source or Object
+ *      form, that is based on (or derived from) the Work and for which the
+ *      editorial revisions, annotations, elaborations, or other modifications
+ *      represent, as a whole, an original work of authorship. For the purposes
+ *      of this License, Derivative Works shall not include works that remain
+ *      separable from, or merely link (or bind by name) to the interfaces of,
+ *      the Work and Derivative Works thereof.
+ *
+ *      "Contribution" shall mean any work of authorship, including
+ *      the original version of the Work and any modifications or additions
+ *      to that Work or Derivative Works thereof, that is intentionally
+ *      submitted to Licensor for inclusion in the Work by the copyright owner
+ *      or by an individual or Legal Entity authorized to submit on behalf of
+ *      the copyright owner. For the purposes of this definition, "submitted"
+ *      means any form of electronic, verbal, or written communication sent
+ *      to the Licensor or its representatives, including but not limited to
+ *      communication on electronic mailing lists, source code control systems,
+ *      and issue tracking systems that are managed by, or on behalf of, the
+ *      Licensor for the purpose of discussing and improving the Work, but
+ *      excluding communication that is conspicuously marked or otherwise
+ *      designated in writing by the copyright owner as "Not a Contribution."
+ *
+ *      "Contributor" shall mean Licensor and any individual or Legal Entity
+ *      on behalf of whom a Contribution has been received by Licensor and
+ *      subsequently incorporated within the Work.
+ *
+ *   2. Grant of Copyright License. Subject to the terms and conditions of
+ *      this License, each Contributor hereby grants to You a perpetual,
+ *      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ *      copyright license to reproduce, prepare Derivative Works of,
+ *      publicly display, publicly perform, sublicense, and distribute the
+ *      Work and such Derivative Works in Source or Object form.
+ *
+ *   3. Grant of Patent License. Subject to the terms and conditions of
+ *      this License, each Contributor hereby grants to You a perpetual,
+ *      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ *      (except as stated in this section) patent license to make, have made,
+ *      use, offer to sell, sell, import, and otherwise transfer the Work,
+ *      where such license applies only to those patent claims licensable
+ *      by such Contributor that are necessarily infringed by their
+ *      Contribution(s) alone or by combination of their Contribution(s)
+ *      with the Work to which such Contribution(s) was submitted. If You
+ *      institute patent litigation against any entity (including a
+ *      cross-claim or counterclaim in a lawsuit) alleging that the Work
+ *      or a Contribution incorporated within the Work constitutes direct
+ *      or contributory patent infringement, then any patent licenses
+ *      granted to You under this License for that Work shall terminate
+ *      as of the date such litigation is filed.
+ *
+ *   4. Redistribution. You may reproduce and distribute copies of the
+ *      Work or Derivative Works thereof in any medium, with or without
+ *      modifications, and in Source or Object form, provided that You
+ *      meet the following conditions:
+ *
+ *      (a) You must give any other recipients of the Work or
+ *          Derivative Works a copy of this License; and
+ *
+ *      (b) You must cause any modified files to carry prominent notices
+ *          stating that You changed the files; and
+ *
+ *      (c) You must retain, in the Source form of any Derivative Works
+ *          that You distribute, all copyright, patent, trademark, and
+ *          attribution notices from the Source form of the Work,
+ *          excluding those notices that do not pertain to any part of
+ *          the Derivative Works; and
+ *
+ *      (d) If the Work includes a "NOTICE" text file as part of its
+ *          distribution, then any Derivative Works that You distribute must
+ *          include a readable copy of the attribution notices contained
+ *          within such NOTICE file, excluding those notices that do not
+ *          pertain to any part of the Derivative Works, in at least one
+ *          of the following places: within a NOTICE text file distributed
+ *          as part of the Derivative Works; within the Source form or
+ *          documentation, if provided along with the Derivative Works; or,
+ *          within a display generated by the Derivative Works, if and
+ *          wherever such third-party notices normally appear. The contents
+ *          of the NOTICE file are for informational purposes only and
+ *          do not modify the License. You may add Your own attribution
+ *          notices within Derivative Works that You distribute, alongside
+ *          or as an addendum to the NOTICE text from the Work, provided
+ *          that such additional attribution notices cannot be construed
+ *          as modifying the License.
+ *
+ *      You may add Your own copyright statement to Your modifications and
+ *      may provide additional or different license terms and conditions
+ *      for use, reproduction, or distribution of Your modifications, or
+ *      for any such Derivative Works as a whole, provided Your use,
+ *      reproduction, and distribution of the Work otherwise complies with
+ *      the conditions stated in this License.
+ *
+ *   5. Submission of Contributions. Unless You explicitly state otherwise,
+ *      any Contribution intentionally submitted for inclusion in the Work
+ *      by You to the Licensor shall be under the terms and conditions of
+ *      this License, without any additional terms or conditions.
+ *      Notwithstanding the above, nothing herein shall supersede or modify
+ *      the terms of any separate license agreement you may have executed
+ *      with Licensor regarding such Contributions.
+ *
+ *   6. Trademarks. This License does not grant permission to use the trade
+ *      names, trademarks, service marks, or product names of the Licensor,
+ *      except as required for reasonable and customary use in describing the
+ *      origin of the Work and reproducing the content of the NOTICE file.
+ *
+ *   7. Disclaimer of Warranty. Unless required by applicable law or
+ *      agreed to in writing, Licensor provides the Work (and each
+ *      Contributor provides its Contributions) on an "AS IS" BASIS,
+ *      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ *      implied, including, without limitation, any warranties or conditions
+ *      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ *      PARTICULAR PURPOSE. You are solely responsible for determining the
+ *      appropriateness of using or redistributing the Work and assume any
+ *      risks associated with Your exercise of permissions under this License.
+ *
+ *   8. Limitation of Liability. In no event and under no legal theory,
+ *      whether in tort (including negligence), contract, or otherwise,
+ *      unless required by applicable law (such as deliberate and grossly
+ *      negligent acts) or agreed to in writing, shall any Contributor be
+ *      liable to You for damages, including any direct, indirect, special,
+ *      incidental, or consequential damages of any character arising as a
+ *      result of this License or out of the use or inability to use the
+ *      Work (including but not limited to damages for loss of goodwill,
+ *      work stoppage, computer failure or malfunction, or any and all
+ *      other commercial damages or losses), even if such Contributor
+ *      has been advised of the possibility of such damages.
+ *
+ *   9. Accepting Warranty or Additional Liability. While redistributing
+ *      the Work or Derivative Works thereof, You may choose to offer,
+ *      and charge a fee for, acceptance of support, warranty, indemnity,
+ *      or other liability obligations and/or rights consistent with this
+ *      License. However, in accepting such obligations, You may act only
+ *      on Your own behalf and on Your sole responsibility, not on behalf
+ *      of any other Contributor, and only if You agree to indemnify,
+ *      defend, and hold each Contributor harmless for any liability
+ *      incurred by, or claims asserted against, such Contributor by reason
+ *      of your accepting any such warranty or additional liability.
+ *
+ *   END OF TERMS AND CONDITIONS
+ *
+ *   APPENDIX: How to apply the Apache License to your work.
+ *
+ *      To apply the Apache License to your work, attach the following
+ *      boilerplate notice, with the fields enclosed by brackets "[]"
+ *      replaced with your own identifying information. (Don't include
+ *      the brackets!)  The text should be enclosed in the appropriate
+ *      comment syntax for the file format. We also recommend that a
+ *      file or class name and description of purpose be included on the
+ *      same "printed page" as the copyright notice for easier
+ *      identification within third-party archives.
+ *
+ *   Copyright 2007 Kasper B. Graversen
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ */

+ 0 - 0
x-pack/plugin/ml/log-structure-finder/licenses/super-csv-NOTICE.txt


+ 35 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class CsvLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * Rules are:
+     * - The file must be valid CSV
+     * - It must contain at least two complete records
+     * - There must be at least two fields per record (otherwise files with no commas could be treated as CSV!)
+     * - Every CSV record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.EXCEL_PREFERENCE, "CSV");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            CsvPreference.EXCEL_PREFERENCE, false);
+    }
+}
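
Taken together these rules mean a sample needs at least two rows and at
least two columns before this factory claims it. A hedged sketch (sample
strings invented; the expected results follow from the documented rules,
not from running this code):

CsvLogStructureFinderFactory csvFactory = new CsvLogStructureFinderFactory();
List<String> explanation = new ArrayList<>();

// Three records of two fields each: eligible to be treated as CSV
boolean csv = csvFactory.canCreateFromSample(explanation, "a,b\n1,2\n3,4\n");  // expected true

// One field per record: rejected, otherwise any comma-free file would look like CSV
boolean notCsv = csvFactory.canCreateFromSample(explanation, "a\n1\n2\n");  // expected false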

+ 615 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreator.java

@@ -0,0 +1,615 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.grok.Grok;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Creates Grok patterns that will match all provided sample messages.
+ *
+ * The choice of field names is quite primitive.  The intention is that a human will edit these.
+ */
+public final class GrokPatternCreator {
+
+    private static final Map<Character, Boolean> PUNCTUATION_OR_SPACE_NEEDS_ESCAPING;
+    static {
+        HashMap<Character, Boolean> punctuationOrSpaceNeedsEscaping = new HashMap<>();
+        String punctuationAndSpaceCharacters = "\"'`‘’“”#@%=\\/|~:;,<>()[]{}«»^$*¿?¡!§¶ \t\n";
+        String punctuationThatNeedsEscaping = "\\|()[]{}^$*?";
+        punctuationAndSpaceCharacters.chars()
+            .forEach(c -> punctuationOrSpaceNeedsEscaping.put((char) c, punctuationThatNeedsEscaping.indexOf(c) >= 0));
+        PUNCTUATION_OR_SPACE_NEEDS_ESCAPING = Collections.unmodifiableMap(punctuationOrSpaceNeedsEscaping);
+    }
+
+    private static final String PREFACE = "preface";
+    private static final String VALUE = "value";
+    private static final String EPILOGUE = "epilogue";
+
+    /**
+     * Grok patterns that are designed to match the whole message, not just a part of it.
+     */
+    private static final List<FullMatchGrokPatternCandidate> FULL_MATCH_GROK_PATTERNS = Arrays.asList(
+        new FullMatchGrokPatternCandidate("BACULA_LOGLINE", "bts"),
+        new FullMatchGrokPatternCandidate("CATALINALOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("COMBINEDAPACHELOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("COMMONAPACHELOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("ELB_ACCESS_LOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("HAPROXYHTTP", "syslog_timestamp"),
+        new FullMatchGrokPatternCandidate("HAPROXYTCP", "syslog_timestamp"),
+        new FullMatchGrokPatternCandidate("HTTPD20_ERRORLOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("HTTPD24_ERRORLOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("NAGIOSLOGLINE", "nagios_epoch"),
+        new FullMatchGrokPatternCandidate("NETSCREENSESSIONLOG", "date"),
+        new FullMatchGrokPatternCandidate("RAILS3", "timestamp"),
+        new FullMatchGrokPatternCandidate("RUBY_LOGGER", "timestamp"),
+        new FullMatchGrokPatternCandidate("SHOREWALL", "timestamp"),
+        new FullMatchGrokPatternCandidate("TOMCATLOG", "timestamp")
+    );
+
+    /**
+     * The first match in this list will be chosen, so it needs to be ordered
+     * such that more generic patterns come after more specific patterns.
+     */
+    private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
+        new ValueOnlyGrokPatternCandidate("TOMCAT_DATESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC822", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC2822", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_OTHER", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_EVENTLOG", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("SYSLOGTIMESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("HTTPDATE", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("CATALINA_DATESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("CISCOTIMESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("LOGLEVEL", "keyword", "loglevel"),
+        new ValueOnlyGrokPatternCandidate("URI", "keyword", "uri"),
+        new ValueOnlyGrokPatternCandidate("UUID", "keyword", "uuid"),
+        new ValueOnlyGrokPatternCandidate("MAC", "keyword", "macaddress"),
+        // Can't use \b as the breaks, because slashes are not "word" characters
+        new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(?<!\\w)", "(?!\\w)"),
+        new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email"),
+        // TODO: would be nice to have IPORHOST here, but HOST matches almost all words
+        new ValueOnlyGrokPatternCandidate("IP", "ip", "ipaddress"),
+        new ValueOnlyGrokPatternCandidate("DATE", "date", "date"),
+        new ValueOnlyGrokPatternCandidate("TIME", "date", "time"),
+        // This already includes pre/post break conditions
+        new ValueOnlyGrokPatternCandidate("QUOTEDSTRING", "keyword", "field", "", ""),
+        // Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
+        // up numeric suffixes too eagerly
+        new ValueOnlyGrokPatternCandidate("INT", "long", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
+        new ValueOnlyGrokPatternCandidate("NUMBER", "double", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
+        new ValueOnlyGrokPatternCandidate("BASE16NUM", "keyword", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
+        // TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
+        // Fixing these problems with overly broad matches would require some extra intelligence
+        // to be added to remove inappropriate matches.  One idea would be to use a dictionary,
+        // but that doesn't necessarily help as "jay" could be a username but is also a dictionary
+        // word (plus there's the international headache with relying on dictionaries).  Similarly,
+        // hostnames could also be dictionary words - I've worked on machines called "hippo" and
+        // "scarf" in the past.  Another idea would be to look at the adjacent characters and
+        // apply some heuristic based on those.
+    );
+
+    /**
+     * It is expected that the explanation will be shared with other code.
+     * Both this class and other classes will update it.
+     */
+    private final List<String> explanation;
+    private final Collection<String> sampleMessages;
+
+    /**
+     * It is expected that the mappings will be shared with other code.
+     * Both this class and other classes will update it.
+     */
+    private final Map<String, Object> mappings;
+    private final Map<String, Integer> fieldNameCountStore = new HashMap<>();
+    private final StringBuilder overallGrokPatternBuilder = new StringBuilder();
+
+    /**
+     *
+     * @param explanation List of reasons for making decisions.  May already contain items when passed in,
+     *                    and the methods of this class may append new reasons.
+     * @param sampleMessages Sample messages that any Grok pattern found must match.
+     * @param mappings Will be updated with mappings appropriate for the returned pattern, if non-<code>null</code>.
+     */
+    public GrokPatternCreator(List<String> explanation, Collection<String> sampleMessages, Map<String, Object> mappings) {
+        this.explanation = explanation;
+        this.sampleMessages = Collections.unmodifiableCollection(sampleMessages);
+        this.mappings = mappings;
+    }
+
+    /**
+     * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety.
+     * @return A tuple of (time field name, Grok string), or <code>null</code> if no suitable Grok pattern was found.
+     */
+    public Tuple<String, String> findFullLineGrokPattern() {
+
+        for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) {
+            if (candidate.matchesAll(sampleMessages)) {
+                return candidate.processMatch(explanation, sampleMessages, mappings);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Build a Grok pattern that will match all of the sample messages in their entirety.
+     * @param seedPatternName A pattern that has already been determined to match some portion of every sample message.
+     * @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches.
+     * @return The built Grok pattern.
+     */
+    public String createGrokPatternFromExamples(String seedPatternName, String seedFieldName) {
+
+        overallGrokPatternBuilder.setLength(0);
+
+        GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName);
+
+        processCandidateAndSplit(seedCandidate, true, sampleMessages, false, 0, false, 0);
+
+        return overallGrokPatternBuilder.toString().replace("\t", "\\t").replace("\n", "\\n");
+    }
+
+    /**
+     * This exists purely so that unit tests that exercise implementation details can inspect the partial Grok pattern.
+     * It should not be used in production code.
+     */
+    StringBuilder getOverallGrokPatternBuilder() {
+        return overallGrokPatternBuilder;
+    }
+
+    /**
+     * Given a chosen Grok pattern and a collection of message snippets, split the snippets into the
+     * matched section and the pieces before and after it.  Recurse to find more matches in the pieces
+     * before and after and update the supplied string builder.
+     */
+    private void processCandidateAndSplit(GrokPatternCandidate chosenPattern, boolean isLast, Collection<String> snippets,
+                                          boolean ignoreKeyValueCandidateLeft, int ignoreValueOnlyCandidatesLeft,
+                                          boolean ignoreKeyValueCandidateRight, int ignoreValueOnlyCandidatesRight) {
+
+        Collection<String> prefaces = new ArrayList<>();
+        Collection<String> epilogues = new ArrayList<>();
+        String patternBuilderContent = chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings);
+        appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft);
+        overallGrokPatternBuilder.append(patternBuilderContent);
+        appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight);
+    }
+
+    /**
+     * Given a collection of message snippets, work out which (if any) of the Grok patterns we're allowed
+     * to use matches it best.  Then append the appropriate Grok language to represent that finding onto
+     * the supplied string builder.
+     */
+    void appendBestGrokMatchForStrings(boolean isLast, Collection<String> snippets,
+                                       boolean ignoreKeyValueCandidate, int ignoreValueOnlyCandidates) {
+
+        snippets = adjustForPunctuation(snippets);
+
+        GrokPatternCandidate bestCandidate = null;
+        if (snippets.isEmpty() == false) {
+            GrokPatternCandidate kvCandidate = new KeyValueGrokPatternCandidate(explanation);
+            if (ignoreKeyValueCandidate == false && kvCandidate.matchesAll(snippets)) {
+                bestCandidate = kvCandidate;
+            } else {
+                ignoreKeyValueCandidate = true;
+                for (GrokPatternCandidate candidate :
+                    ORDERED_CANDIDATE_GROK_PATTERNS.subList(ignoreValueOnlyCandidates, ORDERED_CANDIDATE_GROK_PATTERNS.size())) {
+                    if (candidate.matchesAll(snippets)) {
+                        bestCandidate = candidate;
+                        break;
+                    }
+                    ++ignoreValueOnlyCandidates;
+                }
+            }
+        }
+
+        if (bestCandidate == null) {
+            if (isLast) {
+                finalizeGrokPattern(snippets);
+            } else {
+                addIntermediateRegex(snippets);
+            }
+        } else {
+            processCandidateAndSplit(bestCandidate, isLast, snippets, true, ignoreValueOnlyCandidates + (ignoreKeyValueCandidate ? 1 : 0),
+                ignoreKeyValueCandidate, ignoreValueOnlyCandidates);
+        }
+    }
+
+    /**
+     * If the snippets supplied begin with more than 1 character of common punctuation or whitespace
+     * then add all but the last of these characters to the overall pattern and remove them from the
+     * snippets.
+     * @param snippets Input snippets - not modified.
+     * @return Output snippets, which will be a copy of the input snippets but with whatever characters
+     *         were added to <code>overallGrokPatternBuilder</code> removed from the beginning.
+     */
+    Collection<String> adjustForPunctuation(Collection<String> snippets) {
+
+        assert snippets.isEmpty() == false;
+
+        StringBuilder commonInitialPunctuation = new StringBuilder();
+
+        for (String snippet : snippets) {
+
+            if (commonInitialPunctuation.length() == 0) {
+                for (int index = 0; index < snippet.length(); ++index) {
+                    char ch = snippet.charAt(index);
+                    if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch) != null) {
+                        commonInitialPunctuation.append(ch);
+                    } else {
+                        break;
+                    }
+                }
+            } else {
+                if (commonInitialPunctuation.length() > snippet.length()) {
+                    commonInitialPunctuation.delete(snippet.length(), commonInitialPunctuation.length());
+                }
+                for (int index = 0; index < commonInitialPunctuation.length(); ++index) {
+                    char ch = snippet.charAt(index);
+                    if (ch != commonInitialPunctuation.charAt(index)) {
+                        commonInitialPunctuation.delete(index, commonInitialPunctuation.length());
+                        break;
+                    }
+                }
+            }
+
+            if (commonInitialPunctuation.length() <= 1) {
+                return snippets;
+            }
+        }
+
+        int numLiteralCharacters = commonInitialPunctuation.length() - 1;
+
+        for (int index = 0; index < numLiteralCharacters; ++index) {
+            char ch = commonInitialPunctuation.charAt(index);
+            if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.getOrDefault(ch, false)) {
+                overallGrokPatternBuilder.append('\\');
+            }
+            overallGrokPatternBuilder.append(ch);
+        }
+
+        return snippets.stream().map(snippet -> snippet.substring(numLiteralCharacters)).collect(Collectors.toList());
+    }
+
+    /**
+     * The first time a particular field name is passed, simply return it.
+     * The second time return it with "2" appended.
+     * The third time return it with "3" appended.
+     * Etc.
+     */
+    static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
+        Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v));
+        return (numberSeen > 1) ? fieldName + numberSeen : fieldName;
+    }
+
+    private void addIntermediateRegex(Collection<String> snippets) {
+        addIntermediateRegex(overallGrokPatternBuilder, snippets);
+    }
+
+    public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
+        if (snippets.isEmpty()) {
+            return;
+        }
+
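+        // Walk the characters of one snippet (the "driver") and keep, in order, only the punctuation
+        // and whitespace that every other snippet also contains; anything else becomes a lazy wildcard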
+        List<String> others = new ArrayList<>(snippets);
+        String driver = others.remove(others.size() - 1);
+
+        boolean wildcardRequiredIfNonMatchFound = true;
+        for (int i = 0; i < driver.length(); ++i) {
+            char ch = driver.charAt(i);
+            Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
+            if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
+                if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
+                    patternBuilder.append(".*?");
+                }
+                if (punctuationOrSpaceNeedsEscaping) {
+                    patternBuilder.append('\\');
+                }
+                patternBuilder.append(ch);
+                wildcardRequiredIfNonMatchFound = true;
+                others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
+            } else if (wildcardRequiredIfNonMatchFound) {
+                patternBuilder.append(".*?");
+                wildcardRequiredIfNonMatchFound = false;
+            }
+        }
+
+        if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
+            patternBuilder.append(".*?");
+        }
+    }
+
+    private void finalizeGrokPattern(Collection<String> snippets) {
+        if (snippets.stream().allMatch(String::isEmpty)) {
+            return;
+        }
+
+        List<String> others = new ArrayList<>(snippets);
+        String driver = others.remove(others.size() - 1);
+
+        for (int i = 0; i < driver.length(); ++i) {
+            char ch = driver.charAt(i);
+            int driverIndex = i;
+            Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
+            if (punctuationOrSpaceNeedsEscaping != null &&
+                others.stream().allMatch(other -> other.length() > driverIndex && other.charAt(driverIndex) == ch)) {
+                if (punctuationOrSpaceNeedsEscaping) {
+                    overallGrokPatternBuilder.append('\\');
+                }
+                overallGrokPatternBuilder.append(ch);
+                if (i == driver.length() - 1 && others.stream().allMatch(driver::equals)) {
+                    return;
+                }
+            } else {
+                break;
+            }
+        }
+
+        overallGrokPatternBuilder.append(".*");
+    }
+
+    interface GrokPatternCandidate {
+
+        /**
+         * @return Does this Grok pattern candidate match all the snippets?
+         */
+        boolean matchesAll(Collection<String> snippets);
+
+        /**
+         * After it has been determined that this Grok pattern candidate matches a collection of strings,
+         * return collections of the bits that come before (prefaces) and after (epilogues) the bit
+         * that matches.  Also update mappings with the most appropriate field name and type.
+         * @return The string that needs to be incorporated into the overall Grok pattern for the line.
+         */
+        String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                               Collection<String> epilogues, Map<String, Object> mappings);
+    }
+
+    /**
+     * A Grok pattern candidate that will match a single named Grok pattern.
+     */
+    static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate {
+
+        private final String grokPatternName;
+        private final String mappingType;
+        private final String fieldName;
+        private final Grok grok;
+
+        /**
+         * Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
+         * end with a non-"word" character (where "word" characters are letters, digits and underscores).
+         * For such patterns use one of the other constructors.
+         * <p>
+         * In cases where the Grok pattern defined by Logstash already includes conditions on what must
+         * come before and after the match, use one of the other constructors and specify an empty string
+         * for the pre and/or post breaks.
+         *
+         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
+         * @param mappingType     Data type for the field in Elasticsearch mappings.
+         * @param fieldName       Name of the field to extract from the match.
+         */
+        ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) {
+            this(grokPatternName, mappingType, fieldName, "\\b", "\\b");
+        }
+
+        /**
+         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
+         * @param mappingType     Data type for field in Elasticsearch mappings.
+         * @param fieldName       Name of the field to extract from the match.
+         * @param preBreak        Only consider the match if it's broken from the previous text by this.
+         * @param postBreak       Only consider the match if it's broken from the following text by this.
+         */
+        ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) {
+            this.grokPatternName = grokPatternName;
+            this.mappingType = mappingType;
+            this.fieldName = fieldName;
+            // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
+            grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + preBreak +
+                "%{" + grokPatternName + ":" + VALUE + "}" + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}");
+        }
+
+        @Override
+        public boolean matchesAll(Collection<String> snippets) {
+            return snippets.stream().allMatch(grok::match);
+        }
+
+        /**
+         * Given a collection of strings, and a Grok pattern that matches some part of them all,
+         * return collections of the bits that come before (prefaces) and after (epilogues) the
+         * bit that matches.
+         */
+        @Override
+        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                                      Collection<String> epilogues, Map<String, Object> mappings) {
+            String sampleValue = null;
+            for (String snippet : snippets) {
+                Map<String, Object> captures = grok.captures(snippet);
+                // If the pattern doesn't match then captures will be null
+                if (captures == null) {
+                    throw new IllegalStateException("[%{" + grokPatternName + "}] does not match snippet [" + snippet + "]");
+                }
+                prefaces.add(captures.getOrDefault(PREFACE, "").toString());
+                if (sampleValue == null) {
+                    sampleValue = captures.get(VALUE).toString();
+                }
+                epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
+            }
+            String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
+            if (mappings != null) {
+                Map<String, String> fullMappingType = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, mappingType);
+                if ("date".equals(mappingType)) {
+                    TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(sampleValue);
+                    if (timestampMatch != null) {
+                        fullMappingType = timestampMatch.getEsDateMappingTypeWithFormat();
+                    }
+                }
+                mappings.put(adjustedFieldName, fullMappingType);
+            }
+            return "%{" + grokPatternName + ":" + adjustedFieldName + "}";
+        }
+    }
+
+    /**
+     * Unlike {@link ValueOnlyGrokPatternCandidate}, an object of this class is neither immutable nor thread safe.
+     * When a given object matches a set of strings it chooses a field name.  Then that same field name is used when
+     * processing captures from the pattern.  Hence only a single thread may use any particular instance of this
+     * class.
+     */
+    static class KeyValueGrokPatternCandidate implements GrokPatternCandidate {
+
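+        // Matches a key=value pair where the key is a "word" and the value consists of word characters, dots and dashes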
+        private static final Pattern kvFinder = Pattern.compile("\\b(\\w+)=[\\w.-]+");
+        private final List<String> explanation;
+        private String fieldName;
+
+        KeyValueGrokPatternCandidate(List<String> explanation) {
+            this.explanation = explanation;
+        }
+
+        @Override
+        public boolean matchesAll(Collection<String> snippets) {
+            Set<String> candidateNames = new LinkedHashSet<>();
+            boolean isFirst = true;
+            for (String snippet : snippets) {
+                if (isFirst) {
+                    Matcher matcher = kvFinder.matcher(snippet);
+                    while (matcher.find()) {
+                        candidateNames.add(matcher.group(1));
+                    }
+                    isFirst = false;
+                } else {
+                    candidateNames.removeIf(candidateName ->
+                        Pattern.compile("\\b" + candidateName + "=[\\w.-]+").matcher(snippet).find() == false);
+                }
+                if (candidateNames.isEmpty()) {
+                    break;
+                }
+            }
+            return (fieldName = candidateNames.stream().findFirst().orElse(null)) != null;
+        }
+
+        @Override
+        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                                      Collection<String> epilogues, Map<String, Object> mappings) {
+            if (fieldName == null) {
+                throw new IllegalStateException("Cannot process KV matches until a field name has been determined");
+            }
+            Grok grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}\\b" +
+                fieldName + "=%{USER:" + VALUE + "}%{GREEDYDATA:" + EPILOGUE + "}");
+            Collection<String> values = new ArrayList<>();
+            for (String snippet : snippets) {
+                Map<String, Object> captures = grok.captures(snippet);
+                // If the pattern doesn't match then captures will be null
+                if (captures == null) {
+                    throw new IllegalStateException("[\\b" + fieldName + "=%{USER}] does not match snippet [" + snippet + "]");
+                }
+                prefaces.add(captures.getOrDefault(PREFACE, "").toString());
+                values.add(captures.getOrDefault(VALUE, "").toString());
+                epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
+            }
+            String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
+            if (mappings != null) {
+                mappings.put(adjustedFieldName, LogStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values));
+            }
+            return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}";
+        }
+    }
+
+    /**
+     * A Grok pattern candidate that matches a single named Grok pattern but will not update mappings.
+     */
+    static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate {
+
+        NoMappingGrokPatternCandidate(String grokPatternName, String fieldName) {
+            super(grokPatternName, null, fieldName);
+        }
+
+        @Override
+        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                                      Collection<String> epilogues, Map<String, Object> mappings) {
+            return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null);
+        }
+    }
+
+    /**
+     * Used to check whether a single Grok pattern matches every sample message in its entirety.
+     */
+    static class FullMatchGrokPatternCandidate {
+
+        private final String grokString;
+        private final String timeField;
+        private final Grok grok;
+
+        FullMatchGrokPatternCandidate(String grokPatternName, String timeField) {
+            grokString = "%{" + grokPatternName + "}";
+            this.timeField = timeField;
+            grok = new Grok(Grok.getBuiltinPatterns(), grokString);
+        }
+
+        public boolean matchesAll(Collection<String> sampleMessages) {
+            return sampleMessages.stream().allMatch(grok::match);
+        }
+
+        /**
+         * This must only be called if {@link #matchesAll} returns <code>true</code>.
+         * @return A tuple of (time field name, Grok string).
+         */
+        public Tuple<String, String> processMatch(List<String> explanation, Collection<String> sampleMessages,
+                                                  Map<String, Object> mappings) {
+
+            explanation.add("A full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate");
+
+            if (mappings != null) {
+                Map<String, Collection<String>> valuesPerField = new HashMap<>();
+
+                for (String sampleMessage : sampleMessages) {
+                    Map<String, Object> captures = grok.captures(sampleMessage);
+                    // If the pattern doesn't match then captures will be null
+                    if (captures == null) {
+                        throw new IllegalStateException("[" + grokString + "] does not match snippet [" + sampleMessage + "]");
+                    }
+                    for (Map.Entry<String, Object> capture : captures.entrySet()) {
+
+                        String fieldName = capture.getKey();
+                        String fieldValue = capture.getValue().toString();
+
+                        // Exclude the time field because that will be dropped and replaced with @timestamp
+                        if (fieldName.equals(timeField) == false) {
+                            valuesPerField.compute(fieldName, (k, v) -> {
+                                if (v == null) {
+                                    return new ArrayList<>(Collections.singletonList(fieldValue));
+                                } else {
+                                    v.add(fieldValue);
+                                    return v;
+                                }
+                            });
+                        }
+                    }
+                }
+
+                for (Map.Entry<String, Collection<String>> valuesForField : valuesPerField.entrySet()) {
+                    String fieldName = valuesForField.getKey();
+                    mappings.put(fieldName,
+                        LogStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
+                }
+            }
+
+            return new Tuple<>(timeField, grokString);
+        }
+    }
+}
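
To show how the pieces above fit together, a rough usage sketch (the two
sample messages are invented; the constructor and method signatures are the
ones defined in this file, and the java.util and Tuple imports are assumed):

List<String> explanation = new ArrayList<>();
List<String> sampleMessages = Arrays.asList(
    "[2018-08-10T13:00:00,000][INFO ][o.e.n.Node] starting",
    "[2018-08-10T13:00:01,000][WARN ][o.e.n.Node] stopping");
Map<String, Object> mappings = new HashMap<>();

GrokPatternCreator creator = new GrokPatternCreator(explanation, sampleMessages, mappings);

// First preference: one of the curated full-line patterns matches every message
Tuple<String, String> timeFieldAndGrokString = creator.findFullLineGrokPattern();

if (timeFieldAndGrokString == null) {
    // Otherwise grow a pattern outwards from a seed known to match part of every
    // message, e.g. a timestamp located earlier by TimestampFormatFinder
    String grokPattern = creator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp");
}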

+ 84 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinder.java

@@ -0,0 +1,84 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.xcontent.DeprecationHandler;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.stream.Collectors;
+
+import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
+
+/**
+ * Really ND-JSON: each line of the sample is expected to be a complete JSON document.
+ */
+public class JsonLogStructureFinder implements LogStructureFinder {
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static JsonLogStructureFinder makeJsonLogStructureFinder(List<String> explanation, String sample, String charsetName,
+                                                             Boolean hasByteOrderMarker) throws IOException {
+
+        List<Map<String, ?>> sampleRecords = new ArrayList<>();
+
+        List<String> sampleMessages = Arrays.asList(sample.split("\n"));
+        for (String sampleMessage : sampleMessages) {
+            try (XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY,
+                DeprecationHandler.THROW_UNSUPPORTED_OPERATION, sampleMessage)) {
+                // Close each parser after use, mirroring the try-with-resources in the factory
+                sampleRecords.add(parser.mapOrdered());
+            }
+        }
+
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.JSON)
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
+            .setNumLinesAnalyzed(sampleMessages.size())
+            .setNumMessagesAnalyzed(sampleRecords.size());
+
+        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
+        if (timeField != null) {
+            structureBuilder.setTimestampField(timeField.v1())
+                .setTimestampFormats(timeField.v2().dateFormats)
+                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
+        }
+
+        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        LogStructure structure = structureBuilder
+            .setMappings(mappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new JsonLogStructureFinder(sampleMessages, structure);
+    }
+
+    private JsonLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+}
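
A usage sketch for the finder above (package-private, so called from within the package; the sample lines and field names are invented; imports are elided):

    List<String> explanation = new ArrayList<>();
    String sample = "{\"time\":\"2018-08-14T12:00:00Z\",\"message\":\"started\"}\n" +
        "{\"time\":\"2018-08-14T12:00:01Z\",\"message\":\"stopped\"}\n";
    JsonLogStructureFinder finder =
        JsonLogStructureFinder.makeJsonLogStructureFinder(explanation, sample, "UTF-8", false);
    LogStructure structure = finder.getStructure();
    // structure.getFormat() == Format.JSON, and the mappings always include a "date"
    // mapping for @timestamp, added above regardless of whether a time field was found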

+ 87 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactory.java

@@ -0,0 +1,87 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.xcontent.DeprecationHandler;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
+import org.elasticsearch.common.xcontent.XContentParser;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.List;
+import java.util.Locale;
+
+import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
+
+public class JsonLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * This format matches if the sample consists of one or more JSON documents.
+     * If there is more than one, they must be newline-delimited.  The
+     * documents must be non-empty, to prevent lines containing "{}" from matching.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+
+        int completeDocCount = 0;
+
+        try {
+            String[] sampleLines = sample.split("\n");
+            for (String sampleLine : sampleLines) {
+                try (XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY,
+                    DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
+
+                    if (parser.map().isEmpty()) {
+                        explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
+                        return false;
+                    }
+                    ++completeDocCount;
+                    if (parser.nextToken() != null) {
+                        explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
+                            sampleLine + "]");
+                        return false;
+                    }
+                }
+            }
+        } catch (IOException | IllegalStateException e) {
+            explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
+            return false;
+        }
+
+        if (completeDocCount == 0) {
+            explanation.add("Not JSON because sample didn't contain a complete document");
+            return false;
+        }
+
+        explanation.add("Deciding sample is newline delimited JSON");
+        return true;
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return JsonLogStructureFinder.makeJsonLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    }
+
+    private static class ContextPrintingStringReader extends StringReader {
+
+        private final String str;
+
+        ContextPrintingStringReader(String str) {
+            super(str);
+            this.str = str;
+        }
+
+        @Override
+        public String toString() {
+            if (str.length() <= 80) {
+                return String.format(Locale.ROOT, "\"%s\"", str);
+            } else {
+                return String.format(Locale.ROOT, "\"%.77s...\"", str);
+            }
+        }
+    }
+}
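
The rejection rules above can be seen directly on small samples (a hypothetical sketch; imports elided):

    JsonLogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
    List<String> explanation = new ArrayList<>();
    factory.canCreateFromSample(explanation, "{\"a\":1}\n{\"a\":2}\n"); // true: newline-delimited JSON
    factory.canCreateFromSample(explanation, "{}\n");                   // false: empty object parsed
    factory.canCreateFromSample(explanation, "{\"a\":1} {\"a\":2}\n");  // false: two objects on one line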

+ 614 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructure.java

@@ -0,0 +1,614 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.xcontent.ObjectParser;
+import org.elasticsearch.common.xcontent.ToXContentObject;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+/**
+ * Stores the log file format determined by a {@link LogStructureFinder}.
+ */
+public class LogStructure implements ToXContentObject {
+
+    public enum Format {
+
+        JSON, XML, CSV, TSV, SEMI_COLON_SEPARATED_VALUES, PIPE_SEPARATED_VALUES, SEMI_STRUCTURED_TEXT;
+
+        public Character separator() {
+            switch (this) {
+                case JSON:
+                case XML:
+                    return null;
+                case CSV:
+                    return ',';
+                case TSV:
+                    return '\t';
+                case SEMI_COLON_SEPARATED_VALUES:
+                    return ';';
+                case PIPE_SEPARATED_VALUES:
+                    return '|';
+                case SEMI_STRUCTURED_TEXT:
+                    return null;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean supportsNesting() {
+            switch (this) {
+                case JSON:
+                case XML:
+                    return true;
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                case SEMI_STRUCTURED_TEXT:
+                    return false;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean isStructured() {
+            switch (this) {
+                case JSON:
+                case XML:
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    return true;
+                case SEMI_STRUCTURED_TEXT:
+                    return false;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean isSemiStructured() {
+            switch (this) {
+                case JSON:
+                case XML:
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    return false;
+                case SEMI_STRUCTURED_TEXT:
+                    return true;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean isSeparatedValues() {
+            switch (this) {
+                case JSON:
+                case XML:
+                    return false;
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    return true;
+                case SEMI_STRUCTURED_TEXT:
+                    return false;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public static Format fromSeparator(char separator) {
+            switch (separator) {
+                case ',':
+                    return CSV;
+                case '\t':
+                    return TSV;
+                case ';':
+                    return SEMI_COLON_SEPARATED_VALUES;
+                case '|':
+                    return PIPE_SEPARATED_VALUES;
+                default:
+                    throw new IllegalArgumentException("No known format has separator [" + separator + "]");
+            }
+        }
+
+        public static Format fromString(String name) {
+            return valueOf(name.trim().toUpperCase(Locale.ROOT));
+        }
+
+        @Override
+        public String toString() {
+            return name().toLowerCase(Locale.ROOT);
+        }
+    }
+
+    static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
+    static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
+    static final ParseField SAMPLE_START = new ParseField("sample_start");
+    static final ParseField CHARSET = new ParseField("charset");
+    static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker");
+    static final ParseField STRUCTURE = new ParseField("format");
+    static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
+    static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
+    static final ParseField INPUT_FIELDS = new ParseField("input_fields");
+    static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
+    static final ParseField SEPARATOR = new ParseField("separator");
+    static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
+    static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
+    static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field");
+    static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
+    static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
+    static final ParseField MAPPINGS = new ParseField("mappings");
+    static final ParseField EXPLANATION = new ParseField("explanation");
+
+    public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>("log_file_structure", false, Builder::new);
+
+    static {
+        PARSER.declareInt(Builder::setNumLinesAnalyzed, NUM_LINES_ANALYZED);
+        PARSER.declareInt(Builder::setNumMessagesAnalyzed, NUM_MESSAGES_ANALYZED);
+        PARSER.declareString(Builder::setSampleStart, SAMPLE_START);
+        PARSER.declareString(Builder::setCharset, CHARSET);
+        PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER);
+        PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE);
+        PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN);
+        PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN);
+        PARSER.declareStringArray(Builder::setInputFields, INPUT_FIELDS);
+        PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW);
+        PARSER.declareString((p, c) -> p.setSeparator(c.charAt(0)), SEPARATOR);
+        PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
+        PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN);
+        PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD);
+        PARSER.declareStringArray(Builder::setTimestampFormats, TIMESTAMP_FORMATS);
+        PARSER.declareBoolean(Builder::setNeedClientTimezone, NEED_CLIENT_TIMEZONE);
+        PARSER.declareObject(Builder::setMappings, (p, c) -> new TreeMap<>(p.map()), MAPPINGS);
+        PARSER.declareStringArray(Builder::setExplanation, EXPLANATION);
+    }
+
+    private final int numLinesAnalyzed;
+    private final int numMessagesAnalyzed;
+    private final String sampleStart;
+    private final String charset;
+    private final Boolean hasByteOrderMarker;
+    private final Format format;
+    private final String multilineStartPattern;
+    private final String excludeLinesPattern;
+    private final List<String> inputFields;
+    private final Boolean hasHeaderRow;
+    private final Character separator;
+    private final Boolean shouldTrimFields;
+    private final String grokPattern;
+    private final List<String> timestampFormats;
+    private final String timestampField;
+    private final boolean needClientTimezone;
+    private final SortedMap<String, Object> mappings;
+    private final List<String> explanation;
+
+    public LogStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
+                        Format format, String multilineStartPattern, String excludeLinesPattern, List<String> inputFields,
+                        Boolean hasHeaderRow, Character separator, Boolean shouldTrimFields, String grokPattern, String timestampField,
+                        List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
+                        List<String> explanation) {
+
+        this.numLinesAnalyzed = numLinesAnalyzed;
+        this.numMessagesAnalyzed = numMessagesAnalyzed;
+        this.sampleStart = Objects.requireNonNull(sampleStart);
+        this.charset = Objects.requireNonNull(charset);
+        this.hasByteOrderMarker = hasByteOrderMarker;
+        this.format = Objects.requireNonNull(format);
+        this.multilineStartPattern = multilineStartPattern;
+        this.excludeLinesPattern = excludeLinesPattern;
+        this.inputFields = (inputFields == null) ? null : Collections.unmodifiableList(new ArrayList<>(inputFields));
+        this.hasHeaderRow = hasHeaderRow;
+        this.separator = separator;
+        this.shouldTrimFields = shouldTrimFields;
+        this.grokPattern = grokPattern;
+        this.timestampField = timestampField;
+        this.timestampFormats = (timestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(timestampFormats));
+        this.needClientTimezone = needClientTimezone;
+        this.mappings = Collections.unmodifiableSortedMap(new TreeMap<>(mappings));
+        this.explanation = Collections.unmodifiableList(new ArrayList<>(explanation));
+    }
+
+    public int getNumLinesAnalyzed() {
+        return numLinesAnalyzed;
+    }
+
+    public int getNumMessagesAnalyzed() {
+        return numMessagesAnalyzed;
+    }
+
+    public String getSampleStart() {
+        return sampleStart;
+    }
+
+    public String getCharset() {
+        return charset;
+    }
+
+    public Boolean getHasByteOrderMarker() {
+        return hasByteOrderMarker;
+    }
+
+    public Format getFormat() {
+        return format;
+    }
+
+    public String getMultilineStartPattern() {
+        return multilineStartPattern;
+    }
+
+    public String getExcludeLinesPattern() {
+        return excludeLinesPattern;
+    }
+
+    public List<String> getInputFields() {
+        return inputFields;
+    }
+
+    public Boolean getHasHeaderRow() {
+        return hasHeaderRow;
+    }
+
+    public Character getSeparator() {
+        return separator;
+    }
+
+    public Boolean getShouldTrimFields() {
+        return shouldTrimFields;
+    }
+
+    public String getGrokPattern() {
+        return grokPattern;
+    }
+
+    public String getTimestampField() {
+        return timestampField;
+    }
+
+    public List<String> getTimestampFormats() {
+        return timestampFormats;
+    }
+
+    public boolean needClientTimezone() {
+        return needClientTimezone;
+    }
+
+    public SortedMap<String, Object> getMappings() {
+        return mappings;
+    }
+
+    public List<String> getExplanation() {
+        return explanation;
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+
+        builder.startObject();
+        builder.field(NUM_LINES_ANALYZED.getPreferredName(), numLinesAnalyzed);
+        builder.field(NUM_MESSAGES_ANALYZED.getPreferredName(), numMessagesAnalyzed);
+        builder.field(SAMPLE_START.getPreferredName(), sampleStart);
+        builder.field(CHARSET.getPreferredName(), charset);
+        if (hasByteOrderMarker != null) {
+            builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue());
+        }
+        builder.field(STRUCTURE.getPreferredName(), format);
+        if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) {
+            builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern);
+        }
+        if (excludeLinesPattern != null && excludeLinesPattern.isEmpty() == false) {
+            builder.field(EXCLUDE_LINES_PATTERN.getPreferredName(), excludeLinesPattern);
+        }
+        if (inputFields != null && inputFields.isEmpty() == false) {
+            builder.field(INPUT_FIELDS.getPreferredName(), inputFields);
+        }
+        if (hasHeaderRow != null) {
+            builder.field(HAS_HEADER_ROW.getPreferredName(), hasHeaderRow.booleanValue());
+        }
+        if (separator != null) {
+            builder.field(SEPARATOR.getPreferredName(), String.valueOf(separator));
+        }
+        if (shouldTrimFields != null) {
+            builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue());
+        }
+        if (grokPattern != null && grokPattern.isEmpty() == false) {
+            builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
+        }
+        if (timestampField != null && timestampField.isEmpty() == false) {
+            builder.field(TIMESTAMP_FIELD.getPreferredName(), timestampField);
+        }
+        if (timestampFormats != null && timestampFormats.isEmpty() == false) {
+            builder.field(TIMESTAMP_FORMATS.getPreferredName(), timestampFormats);
+        }
+        builder.field(NEED_CLIENT_TIMEZONE.getPreferredName(), needClientTimezone);
+        builder.field(MAPPINGS.getPreferredName(), mappings);
+        builder.field(EXPLANATION.getPreferredName(), explanation);
+        builder.endObject();
+
+        return builder;
+    }
+
+    @Override
+    public int hashCode() {
+
+        return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
+            multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern, timestampField,
+            timestampFormats, needClientTimezone, mappings, explanation);
+    }
+
+    @Override
+    public boolean equals(Object other) {
+
+        if (this == other) {
+            return true;
+        }
+
+        if (other == null || getClass() != other.getClass()) {
+            return false;
+        }
+
+        LogStructure that = (LogStructure) other;
+        return this.numLinesAnalyzed == that.numLinesAnalyzed &&
+            this.numMessagesAnalyzed == that.numMessagesAnalyzed &&
+            this.needClientTimezone == that.needClientTimezone &&
+            Objects.equals(this.sampleStart, that.sampleStart) &&
+            Objects.equals(this.charset, that.charset) &&
+            Objects.equals(this.hasByteOrderMarker, that.hasByteOrderMarker) &&
+            Objects.equals(this.format, that.format) &&
+            Objects.equals(this.multilineStartPattern, that.multilineStartPattern) &&
+            Objects.equals(this.excludeLinesPattern, that.excludeLinesPattern) &&
+            Objects.equals(this.inputFields, that.inputFields) &&
+            Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
+            Objects.equals(this.separator, that.separator) &&
+            Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
+            Objects.equals(this.grokPattern, that.grokPattern) &&
+            Objects.equals(this.timestampField, that.timestampField) &&
+            Objects.equals(this.timestampFormats, that.timestampFormats) &&
+            Objects.equals(this.mappings, that.mappings) &&
+            Objects.equals(this.explanation, that.explanation);
+    }
+
+    public static class Builder {
+
+        private int numLinesAnalyzed;
+        private int numMessagesAnalyzed;
+        private String sampleStart;
+        private String charset;
+        private Boolean hasByteOrderMarker;
+        private Format format;
+        private String multilineStartPattern;
+        private String excludeLinesPattern;
+        private List<String> inputFields;
+        private Boolean hasHeaderRow;
+        private Character separator;
+        private Boolean shouldTrimFields;
+        private String grokPattern;
+        private String timestampField;
+        private List<String> timestampFormats;
+        private boolean needClientTimezone;
+        private Map<String, Object> mappings;
+        private List<String> explanation;
+
+        public Builder() {
+            this(Format.SEMI_STRUCTURED_TEXT);
+        }
+
+        public Builder(Format format) {
+            setFormat(format);
+        }
+
+        public Builder setNumLinesAnalyzed(int numLinesAnalyzed) {
+            this.numLinesAnalyzed = numLinesAnalyzed;
+            return this;
+        }
+
+        public Builder setNumMessagesAnalyzed(int numMessagesAnalyzed) {
+            this.numMessagesAnalyzed = numMessagesAnalyzed;
+            return this;
+        }
+
+        public Builder setSampleStart(String sampleStart) {
+            this.sampleStart = Objects.requireNonNull(sampleStart);
+            return this;
+        }
+
+        public Builder setCharset(String charset) {
+            this.charset = Objects.requireNonNull(charset);
+            return this;
+        }
+
+        public Builder setHasByteOrderMarker(Boolean hasByteOrderMarker) {
+            this.hasByteOrderMarker = hasByteOrderMarker;
+            return this;
+        }
+
+        public Builder setFormat(Format format) {
+            this.format = Objects.requireNonNull(format);
+            this.separator = format.separator();
+            return this;
+        }
+
+        public Builder setMultilineStartPattern(String multilineStartPattern) {
+            this.multilineStartPattern = multilineStartPattern;
+            return this;
+        }
+
+        public Builder setExcludeLinesPattern(String excludeLinesPattern) {
+            this.excludeLinesPattern = excludeLinesPattern;
+            return this;
+        }
+
+        public Builder setInputFields(List<String> inputFields) {
+            this.inputFields = inputFields;
+            return this;
+        }
+
+        public Builder setHasHeaderRow(Boolean hasHeaderRow) {
+            this.hasHeaderRow = hasHeaderRow;
+            return this;
+        }
+
+        public Builder setShouldTrimFields(Boolean shouldTrimFields) {
+            this.shouldTrimFields = shouldTrimFields;
+            return this;
+        }
+
+        public Builder setSeparator(Character separator) {
+            this.separator = separator;
+            return this;
+        }
+
+        public Builder setGrokPattern(String grokPattern) {
+            this.grokPattern = grokPattern;
+            return this;
+        }
+
+        public Builder setTimestampField(String timestampField) {
+            this.timestampField = timestampField;
+            return this;
+        }
+
+        public Builder setTimestampFormats(List<String> timestampFormats) {
+            this.timestampFormats = timestampFormats;
+            return this;
+        }
+
+        public Builder setNeedClientTimezone(boolean needClientTimezone) {
+            this.needClientTimezone = needClientTimezone;
+            return this;
+        }
+
+        public Builder setMappings(Map<String, Object> mappings) {
+            this.mappings = Objects.requireNonNull(mappings);
+            return this;
+        }
+
+        public Builder setExplanation(List<String> explanation) {
+            this.explanation = Objects.requireNonNull(explanation);
+            return this;
+        }
+
+        @SuppressWarnings("fallthrough")
+        public LogStructure build() {
+
+            if (numLinesAnalyzed <= 0) {
+                throw new IllegalArgumentException("Number of lines analyzed must be positive.");
+            }
+
+            if (numMessagesAnalyzed <= 0) {
+                throw new IllegalArgumentException("Number of messages analyzed must be positive.");
+            }
+
+            if (numMessagesAnalyzed > numLinesAnalyzed) {
+                throw new IllegalArgumentException("Number of messages analyzed cannot be greater than number of lines analyzed.");
+            }
+
+            if (sampleStart == null || sampleStart.isEmpty()) {
+                throw new IllegalArgumentException("Sample start must be specified.");
+            }
+
+            if (charset == null || charset.isEmpty()) {
+                throw new IllegalArgumentException("A character set must be specified.");
+            }
+
+            if (charset.toUpperCase(Locale.ROOT).startsWith("UTF") == false && hasByteOrderMarker != null) {
+                throw new IllegalArgumentException("A byte order marker is only possible for UTF character sets.");
+            }
+
+            switch (format) {
+                case JSON:
+                    if (shouldTrimFields != null) {
+                        throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
+                    }
+                    // $FALL-THROUGH$
+                case XML:
+                    if (hasHeaderRow != null) {
+                        throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
+                    }
+                    if (separator != null) {
+                        throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
+                    }
+                    if (grokPattern != null) {
+                        throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
+                    }
+                    break;
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    if (inputFields == null || inputFields.isEmpty()) {
+                        throw new IllegalArgumentException("Input fields must be specified for [" + format + "] structures.");
+                    }
+                    if (hasHeaderRow == null) {
+                        throw new IllegalArgumentException("Has header row must be specified for [" + format + "] structures.");
+                    }
+                    Character expectedSeparator = format.separator();
+                    assert expectedSeparator != null;
+                    if (expectedSeparator.equals(separator) == false) {
+                        throw new IllegalArgumentException("Separator must be [" + expectedSeparator + "] for [" + format +
+                            "] structures.");
+                    }
+                    if (grokPattern != null) {
+                        throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
+                    }
+                    break;
+                case SEMI_STRUCTURED_TEXT:
+                    if (inputFields != null) {
+                        throw new IllegalArgumentException("Input fields may not be specified for [" + format + "] structures.");
+                    }
+                    if (hasHeaderRow != null) {
+                        throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
+                    }
+                    if (separator != null) {
+                        throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
+                    }
+                    if (shouldTrimFields != null) {
+                        throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
+                    }
+                    if (grokPattern == null || grokPattern.isEmpty()) {
+                        throw new IllegalArgumentException("Grok pattern must be specified for [" + format + "] structures.");
+                    }
+                    break;
+                default:
+                    throw new IllegalStateException("enum value [" + format + "] missing from switch.");
+            }
+
+            if ((timestampField == null) != (timestampFormats == null || timestampFormats.isEmpty())) {
+                throw new IllegalArgumentException("Timestamp field and timestamp formats must both be specified or neither be specified.");
+            }
+
+            if (needClientTimezone && timestampField == null) {
+                throw new IllegalArgumentException("Client timezone cannot be needed if there is no timestamp field.");
+            }
+
+            if (mappings == null || mappings.isEmpty()) {
+                throw new IllegalArgumentException("Mappings must be specified.");
+            }
+
+            if (explanation == null || explanation.isEmpty()) {
+                throw new IllegalArgumentException("Explanation must be specified.");
+            }
+
+            return new LogStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
+                multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern,
+                timestampField, timestampFormats, needClientTimezone, mappings, explanation);
+        }
+    }
+}
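
A sketch of a Builder round trip that satisfies the validation in build() (all values invented; imports elided):

    LogStructure csvStructure = new LogStructure.Builder(LogStructure.Format.CSV) // also sets separator ','
        .setNumLinesAnalyzed(20)
        .setNumMessagesAnalyzed(19)          // must not exceed the lines analyzed
        .setSampleStart("ts,message\n")
        .setCharset("UTF-8")
        .setInputFields(Arrays.asList("ts", "message"))
        .setHasHeaderRow(true)               // mandatory for separated values formats
        .setMappings(Collections.singletonMap("message",
            Collections.singletonMap("type", "keyword")))
        .setExplanation(Collections.singletonList("example"))
        .build();                            // would throw if, say, a grok pattern were also set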

+ 23 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinder.java

@@ -0,0 +1,23 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.List;
+
+public interface LogStructureFinder {
+
+    /**
+     * The (possibly multi-line) messages that the log sample was split into.
+     * @return A list of messages.
+     */
+    List<String> getSampleMessages();
+
+    /**
+     * Retrieve the structure of the log file used to instantiate the finder.
+     * @return The log file structure.
+     */
+    LogStructure getStructure();
+}

+ 35 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.List;
+
+public interface LogStructureFinderFactory {
+
+    /**
+     * Given a sample of a log file, decide whether this factory will be able
+     * to create an appropriate object to represent its ingestion configs.
+     * @param explanation List of reasons for making decisions.  May already contain items when
+     *                    passed, and this method may append new reasons to it.
+     * @param sample A sample from the log file to be ingested.
+     * @return <code>true</code> if this factory can create an appropriate log
+     *         file structure given the sample; otherwise <code>false</code>.
+     */
+    boolean canCreateFromSample(List<String> explanation, String sample);
+
+    /**
+     * Create an object representing the structure of a log file.
+     * @param explanation List of reasons for making decisions.  May already contain items when
+     *                    passed, and this method may append new reasons to it.
+     * @param sample A sample from the log file to be ingested.
+     * @param charsetName The name of the character set in which the sample was provided.
+     * @param hasByteOrderMarker Did the sample have a byte order marker?  <code>null</code> means "not relevant".
+     * @return A log file structure object suitable for ingesting the supplied sample.
+     * @throws Exception if something goes wrong during creation.
+     */
+    LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws Exception;
+}
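
A sketch of the two-phase contract a caller follows (hypothetical driver code; imports elided, and the enclosing method is assumed to declare throws Exception):

    String sample = "{\"a\":1}\n{\"a\":2}\n";
    List<String> explanation = new ArrayList<>();
    LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
    if (factory.canCreateFromSample(explanation, sample)) {
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", null);
        // the finder exposes both the deduced structure and the messages it was built from
        LogStructure structure = finder.getStructure();
        List<String> messages = finder.getSampleMessages();
    }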

+ 232 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java

@@ -0,0 +1,232 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+import org.elasticsearch.common.collect.Tuple;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Optional;
+import java.util.Set;
+
+/**
+ * Runs the high-level steps needed to create ingest configs for the specified log file.  In order:
+ * 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
+ * 2. Load a sample of the file, consisting of the first 1000 lines of the file
+ * 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text
+ * 4. Create an appropriate structure object and delegate writing configs to it
+ */
+public final class LogStructureFinderManager {
+
+    public static final int MIN_SAMPLE_LINE_COUNT = 2;
+
+    static final Set<String> FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+        "866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252",
+        "cp1253", "cp1254", "cp1255", "cp1256", "cp1257", "cp1258", "cp819", "cp866", "csbig5", "cseuckr", "cseucpkdfmtjapanese",
+        "csgb2312", "csibm866", "csiso2022jp", "csiso2022kr", "csiso58gb231280", "csiso88596e", "csiso88596i", "csiso88598e", "csiso88598i",
+        "csisolatin1", "csisolatin2", "csisolatin3", "csisolatin4", "csisolatin5", "csisolatin6", "csisolatin9", "csisolatinarabic",
+        "csisolatincyrillic", "csisolatingreek", "csisolatinhebrew", "cskoi8r", "csksc56011987", "csmacintosh", "csshiftjis", "cyrillic",
+        "dos-874", "ecma-114", "ecma-118", "elot_928", "euc-jp", "euc-kr", "gb18030", "gb2312", "gb_2312", "gb_2312-80", "gbk", "greek",
+        "greek8", "hebrew", "hz-gb-2312", "ibm819", "ibm866", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-jp", "iso-2022-kr", "iso-8859-1",
+        "iso-8859-10", "iso-8859-11", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "iso-8859-2", "iso-8859-3", "iso-8859-4",
+        "iso-8859-5", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-8859-7", "iso-8859-8", "iso-8859-8-e", "iso-8859-8-i",
+        "iso-8859-9", "iso-ir-100", "iso-ir-101", "iso-ir-109", "iso-ir-110", "iso-ir-126", "iso-ir-127", "iso-ir-138", "iso-ir-144",
+        "iso-ir-148", "iso-ir-149", "iso-ir-157", "iso-ir-58", "iso8859-1", "iso8859-10", "iso8859-11", "iso8859-13", "iso8859-14",
+        "iso8859-15", "iso8859-2", "iso8859-3", "iso8859-4", "iso8859-5", "iso8859-6", "iso8859-6e", "iso8859-6i", "iso8859-7", "iso8859-8",
+        "iso8859-8e", "iso8859-8i", "iso8859-9", "iso88591", "iso885910", "iso885911", "iso885913", "iso885914", "iso885915", "iso88592",
+        "iso88593", "iso88594", "iso88595", "iso88596", "iso88597", "iso88598", "iso88599", "iso_8859-1", "iso_8859-15", "iso_8859-1:1987",
+        "iso_8859-2", "iso_8859-2:1987", "iso_8859-3", "iso_8859-3:1988", "iso_8859-4", "iso_8859-4:1988", "iso_8859-5", "iso_8859-5:1988",
+        "iso_8859-6", "iso_8859-6:1987", "iso_8859-7", "iso_8859-7:1987", "iso_8859-8", "iso_8859-8:1988", "iso_8859-9", "iso_8859-9:1989",
+        "koi", "koi8", "koi8-r", "koi8-ru", "koi8-u", "koi8_r", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "l1",
+        "l2", "l3", "l4", "l5", "l6", "l9", "latin1", "latin2", "latin3", "latin4", "latin5", "latin6", "logical", "mac", "macintosh",
+        "ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "sun_eu_greek", "tis-620", "unicode-1-1-utf-8", "us-ascii", "utf-16",
+        "utf-16-bom", "utf-16be", "utf-16be-bom", "utf-16le", "utf-16le-bom", "utf-8", "utf8", "visual", "windows-1250", "windows-1251",
+        "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "windows-31j",
+        "windows-874", "windows-949", "x-cp1250", "x-cp1251", "x-cp1252", "x-cp1253", "x-cp1254", "x-cp1255", "x-cp1256", "x-cp1257",
+        "x-cp1258", "x-euc-jp", "x-gbk", "x-mac-cyrillic", "x-mac-roman", "x-mac-ukrainian", "x-sjis", "x-x-big5"
+    )));
+
+    /**
+     * These need to be ordered so that the more generic formats come after the more specific ones
+     */
+    private static final List<LogStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
+        new JsonLogStructureFinderFactory(),
+        new XmlLogStructureFinderFactory(),
+        // ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
+        new CsvLogStructureFinderFactory(),
+        new TsvLogStructureFinderFactory(),
+        new SemiColonSeparatedValuesLogStructureFinderFactory(),
+        new PipeSeparatedValuesLogStructureFinderFactory(),
+        new TextLogStructureFinderFactory()
+    ));
+
+    private static final int BUFFER_SIZE = 8192;
+
+    /**
+     * Given a stream of data from some log file, determine its structure.
+     * @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure?
+     *                             If the stream has fewer lines, an attempt will still be made, provided at
+     *                             least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read.
+     * @param fromFile A stream from which the sample will be read.
+     * @return A {@link LogStructureFinder} object from which the structure and messages can be queried.
+     * @throws Exception A variety of problems could occur at various stages of the structure finding process.
+     */
+    public LogStructureFinder findLogStructure(int idealSampleLineCount, InputStream fromFile) throws Exception {
+        return findLogStructure(new ArrayList<>(), idealSampleLineCount, fromFile);
+    }
+
+    public LogStructureFinder findLogStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
+        throws Exception {
+
+        CharsetMatch charsetMatch = findCharset(explanation, fromFile);
+        String charsetName = charsetMatch.getName();
+
+        Tuple<String, Boolean> sampleInfo = sampleFile(charsetMatch.getReader(), charsetName, MIN_SAMPLE_LINE_COUNT,
+            Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount));
+
+        return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2());
+    }
+
+    CharsetMatch findCharset(List<String> explanation, InputStream inputStream) throws Exception {
+
+        // We need an input stream that supports mark and reset, so wrap the argument
+        // in a BufferedInputStream if it doesn't already support this feature
+        if (inputStream.markSupported() == false) {
+            inputStream = new BufferedInputStream(inputStream, BUFFER_SIZE);
+        }
+
+        // This is from ICU4J
+        CharsetDetector charsetDetector = new CharsetDetector().setText(inputStream);
+        CharsetMatch[] charsetMatches = charsetDetector.detectAll();
+
+        // Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J
+        boolean pureAscii = true;
+        boolean containsZeroBytes = false;
+        inputStream.mark(BUFFER_SIZE);
+        byte[] workspace = new byte[BUFFER_SIZE];
+        int remainingLength = BUFFER_SIZE;
+        do {
+            int bytesRead = inputStream.read(workspace, 0, remainingLength);
+            if (bytesRead <= 0) {
+                break;
+            }
+            for (int i = 0; i < bytesRead && containsZeroBytes == false; ++i) {
+                if (workspace[i] == 0) {
+                    containsZeroBytes = true;
+                    pureAscii = false;
+                } else {
+                    pureAscii = pureAscii && workspace[i] > 0 && workspace[i] < 128;
+                }
+            }
+            remainingLength -= bytesRead;
+        } while (containsZeroBytes == false && remainingLength > 0);
+        inputStream.reset();
+
+        if (pureAscii) {
+            // If the input is pure ASCII then many single byte character sets will match.  We want to favour
+            // UTF-8 in this case, as it avoids putting a bold declaration of a dubious character set choice
+            // in the config files.
+            Optional<CharsetMatch> utf8CharsetMatch = Arrays.stream(charsetMatches)
+                .filter(charsetMatch -> StandardCharsets.UTF_8.name().equals(charsetMatch.getName())).findFirst();
+            if (utf8CharsetMatch.isPresent()) {
+                explanation.add("Using character encoding [" + StandardCharsets.UTF_8.name() +
+                    "], which matched the input with [" + utf8CharsetMatch.get().getConfidence() + "%] confidence - first [" +
+                    (BUFFER_SIZE / 1024) + "kB] of input was pure ASCII");
+                return utf8CharsetMatch.get();
+            }
+        }
+
+        // Input wasn't pure ASCII, so use the best matching character set that's supported by both Java and Go.
+        // Additionally, if the input contains zero bytes then avoid single byte character sets, as ICU4J will
+        // suggest these for binary files, and running the structure analysis on binary data would be
+        // both very slow and meaningless
+        for (CharsetMatch charsetMatch : charsetMatches) {
+            String name = charsetMatch.getName();
+            if (Charset.isSupported(name) && FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT))) {
+
+                // This extra test is to avoid trying to read binary files as text.  Running the log config
+                // deduction algorithms on binary files is very slow as the binary files generally appear to
+                // have very long lines.
+                boolean spaceEncodingContainsZeroByte = false;
+                byte[] spaceBytes = " ".getBytes(name);
+                for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) {
+                    spaceEncodingContainsZeroByte = (spaceBytes[i] == 0);
+                }
+                if (containsZeroBytes && spaceEncodingContainsZeroByte == false) {
+                    explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
+                        "%] confidence but was rejected as the input contains zero bytes and the [" + name + "] encoding does not");
+                } else {
+                    explanation.add("Using character encoding [" + name + "], which matched the input with [" +
+                        charsetMatch.getConfidence() + "%] confidence");
+                    return charsetMatch;
+                }
+            } else {
+                explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
+                    "%] confidence but was rejected as it is not supported by [" +
+                    (Charset.isSupported(name) ? "Filebeat" : "the JVM") + "]");
+            }
+        }
+
+        throw new IllegalArgumentException("Could not determine a usable character encoding for the input" +
+            (containsZeroBytes ? " - could it be binary data?" : ""));
+    }
+
+    LogStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws Exception {
+
+        for (LogStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) {
+            if (factory.canCreateFromSample(explanation, sample)) {
+                return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker);
+            }
+        }
+        throw new IllegalArgumentException("Input did not match any known formats");
+    }
+
+    private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException {
+
+        int lineCount = 0;
+        BufferedReader bufferedReader = new BufferedReader(reader);
+        StringBuilder sample = new StringBuilder();
+
+        // Don't include any byte-order-marker in the sample.  (The logic to skip it works for both
+        // UTF-8 and UTF-16 assuming the character set of the reader was correctly detected.)
+        Boolean hasByteOrderMarker = null;
+        if (charsetName.toUpperCase(Locale.ROOT).startsWith("UTF")) {
+            int maybeByteOrderMarker = reader.read();
+            hasByteOrderMarker = ((char) maybeByteOrderMarker == '\uFEFF');
+            if (maybeByteOrderMarker >= 0 && hasByteOrderMarker == false && (char) maybeByteOrderMarker != '\r') {
+                sample.appendCodePoint(maybeByteOrderMarker);
+                if ((char) maybeByteOrderMarker == '\n') {
+                    ++lineCount;
+                }
+            }
+        }
+
+        String line;
+        while ((line = bufferedReader.readLine()) != null && ++lineCount <= maxLines) {
+            sample.append(line).append('\n');
+        }
+
+        if (lineCount < minLines) {
+            throw new IllegalArgumentException("Input contained too few lines to sample");
+        }
+
+        return new Tuple<>(sample.toString(), hasByteOrderMarker);
+    }
+}
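
End-to-end, the manager is the only entry point a caller needs. A sketch (the file path is invented; imports elided, and the enclosing method is assumed to declare throws Exception):

    LogStructureFinderManager manager = new LogStructureFinderManager();
    try (InputStream in = Files.newInputStream(Paths.get("/var/log/example.log"))) {
        LogStructureFinder finder = manager.findLogStructure(1000, in);
        LogStructure structure = finder.getStructure();
        // e.g. json, csv or semi_structured_text, plus the charset and timestamp deductions
        System.out.println(structure.getFormat() + " in " + structure.getCharset());
    }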

+ 238 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtils.java

@@ -0,0 +1,238 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.grok.Grok;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+final class LogStructureUtils {
+
+    static final String DEFAULT_TIMESTAMP_FIELD = "@timestamp";
+    static final String MAPPING_TYPE_SETTING = "type";
+    static final String MAPPING_FORMAT_SETTING = "format";
+    static final String MAPPING_PROPERTIES_SETTING = "properties";
+
+    // NUMBER Grok pattern doesn't support scientific notation, so we extend it
+    private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$");
+    private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$");
+    private static final int KEYWORD_MAX_LEN = 256;
+    private static final int KEYWORD_MAX_SPACES = 5;
+
+    private LogStructureUtils() {
+    }
+
+    /**
+     * Given one or more sample records, find a timestamp field that is consistently present in them all.
+     * To be returned the timestamp field:
+     * - Must exist in every record
+     * - Must have the same timestamp format in every record
+     * If multiple fields meet these criteria then the one that occurred first in the first sample record
+     * is chosen.
+     * @param explanation List of reasons for choosing the overall log structure.  This list
+     *                    may be non-empty when the method is called, and this method may
+     *                    append to it.
+     * @param sampleRecords List of records derived from the provided log sample.
+     * @return A tuple of (field name, timestamp format) if one can be found, or <code>null</code> if
+     *         there is no consistent timestamp.
+     */
+    static Tuple<String, TimestampMatch> guessTimestampField(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+        if (sampleRecords.isEmpty()) {
+            return null;
+        }
+
+        // Accept the first match from the first sample that is compatible with all the other samples
+        for (Tuple<String, TimestampMatch> candidate : findCandidates(explanation, sampleRecords)) {
+
+            boolean allGood = true;
+            for (Map<String, ?> sampleRecord : sampleRecords.subList(1, sampleRecords.size())) {
+                Object fieldValue = sampleRecord.get(candidate.v1());
+                if (fieldValue == null) {
+                    explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
+                        "] doesn't have field");
+                    allGood = false;
+                    break;
+                }
+
+                TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString());
+                if (match == null || match.candidateIndex != candidate.v2().candidateIndex) {
+                    explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
+                        "] matches differently: [" + match + "]");
+                    allGood = false;
+                    break;
+                }
+            }
+
+            if (allGood) {
+                explanation.add("Guessing timestamp field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]");
+                return candidate;
+            }
+        }
+
+        return null;
+    }
+
+    private static List<Tuple<String, TimestampMatch>> findCandidates(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+
+        List<Tuple<String, TimestampMatch>> candidates = new ArrayList<>();
+
+        // Get candidate timestamps from the first sample record
+        for (Map.Entry<String, ?> entry : sampleRecords.get(0).entrySet()) {
+            Object value = entry.getValue();
+            if (value != null) {
+                TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString());
+                if (match != null) {
+                    Tuple<String, TimestampMatch> candidate = new Tuple<>(entry.getKey(), match);
+                    candidates.add(candidate);
+                    explanation.add("First sample timestamp match [" + candidate + "]");
+                }
+            }
+        }
+
+        return candidates;
+    }
+
+    /**
+     * Given the sampled records, guess appropriate Elasticsearch mappings.
+     * @param explanation List of reasons for choosing the overall log structure.  This list
+     *                    may be non-empty when the method is called, and this method may
+     *                    append to it.
+     * @param sampleRecords The sampled records.
+     * @return A map of field name to mapping settings.
+     */
+    static SortedMap<String, Object> guessMappings(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+
+        SortedMap<String, Object> mappings = new TreeMap<>();
+
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            for (String fieldName : sampleRecord.keySet()) {
+                mappings.computeIfAbsent(fieldName, key -> guessMapping(explanation, fieldName,
+                    sampleRecords.stream().flatMap(record -> {
+                            Object fieldValue = record.get(fieldName);
+                            return (fieldValue == null) ? Stream.empty() : Stream.of(fieldValue);
+                        }
+                    ).collect(Collectors.toList())));
+            }
+        }
+
+        return mappings;
+    }
+
+    static Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {
+
+        if (fieldValues == null || fieldValues.isEmpty()) {
+            // We can get here if all the records that contained a given field had a null value for it.
+            // In this case it's best not to make any statement about what the mapping type should be.
+            return null;
+        }
+
+        if (fieldValues.stream().anyMatch(value -> value instanceof Map)) {
+            if (fieldValues.stream().allMatch(value -> value instanceof Map)) {
+                return Collections.singletonMap(MAPPING_TYPE_SETTING, "object");
+            }
+            throw new IllegalArgumentException("Field [" + fieldName +
+                "] has both object and non-object values - this is not supported by Elasticsearch");
+        }
+
+        if (fieldValues.stream().anyMatch(value -> value instanceof List || value instanceof Object[])) {
+            // Elasticsearch fields can be either arrays or single values, but array values must all have the same type
+            return guessMapping(explanation, fieldName,
+                fieldValues.stream().flatMap(LogStructureUtils::flatten).collect(Collectors.toList()));
+        }
+
+        return guessScalarMapping(explanation, fieldName, fieldValues.stream().map(Object::toString).collect(Collectors.toList()));
+    }
+
+    private static Stream<Object> flatten(Object value) {
+        if (value instanceof List) {
+            @SuppressWarnings("unchecked")
+            List<Object> objectList = (List<Object>) value;
+            return objectList.stream();
+        } else if (value instanceof Object[]) {
+            return Arrays.stream((Object[]) value);
+        } else {
+            return Stream.of(value);
+        }
+    }
+
+    /**
+     * Given some sample values for a field, guess the most appropriate index mapping for the
+     * field.
+     * @param explanation List of reasons for choosing the overall log structure.  This list
+     *                    may be non-empty when the method is called, and this method may
+     *                    append to it.
+     * @param fieldName Name of the field for which mappings are to be guessed.
+     * @param fieldValues Values of the field for which mappings are to be guessed.  The guessed
+     *                    mapping will be compatible with all the provided values.  Must not be
+     *                    empty.
+     * @return The sub-section of the index mappings most appropriate for the field,
+     *         for example <code>{ "type" : "keyword" }</code>.
+     */
+    static Map<String, String> guessScalarMapping(List<String> explanation, String fieldName, Collection<String> fieldValues) {
+
+        assert fieldValues.isEmpty() == false;
+
+        if (fieldValues.stream().allMatch(value -> "true".equals(value) || "false".equals(value))) {
+            return Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean");
+        }
+
+        // This checks if a date mapping would be appropriate, and, if so, finds the correct format
+        Iterator<String> iter = fieldValues.iterator();
+        TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(iter.next());
+        while (timestampMatch != null && iter.hasNext()) {
+            // To be mapped as type "date" all the values must match the same timestamp
+            // format - it is not enough for every value to be a date if the formats differ
+            if (timestampMatch.equals(TimestampFormatFinder.findFirstFullMatch(iter.next(), timestampMatch.candidateIndex)) == false) {
+                timestampMatch = null;
+            }
+        }
+        if (timestampMatch != null) {
+            return timestampMatch.getEsDateMappingTypeWithFormat();
+        }
+
+        if (fieldValues.stream().allMatch(NUMBER_GROK::match)) {
+            try {
+                fieldValues.forEach(Long::parseLong);
+                return Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
+            } catch (NumberFormatException e) {
+                explanation.add("Rejecting type 'long' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
+            }
+            try {
+                fieldValues.forEach(Double::parseDouble);
+                return Collections.singletonMap(MAPPING_TYPE_SETTING, "double");
+            } catch (NumberFormatException e) {
+                explanation.add("Rejecting type 'double' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
+            }
+        } else if (fieldValues.stream().allMatch(IP_GROK::match)) {
+            return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
+        }
+
+        if (fieldValues.stream().anyMatch(LogStructureUtils::isMoreLikelyTextThanKeyword)) {
+            return Collections.singletonMap(MAPPING_TYPE_SETTING, "text");
+        }
+
+        return Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
+    }
+
+    /**
+     * The thinking is that the longer the field value and the more spaces it contains,
+     * the more likely it is that it should be indexed as text rather than keyword.
+     */
+    static boolean isMoreLikelyTextThanKeyword(String str) {
+        int length = str.length();
+        return length > KEYWORD_MAX_LEN || length - str.replaceAll("\\s", "").length() > KEYWORD_MAX_SPACES;
+    }
+}
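
To make the mapping heuristics above concrete, here is a minimal usage sketch (illustrative only: guessScalarMapping is package-private, so a real caller would sit in the same package, and the exact text/keyword outcome depends on the keyword length and space limits defined earlier in the class):

    List<String> explanation = new ArrayList<>();

    // Every value is "true" or "false", so boolean wins
    Map<String, String> flag = LogStructureUtils.guessScalarMapping(explanation, "flag", Arrays.asList("true", "false"));
    // => { "type" : "boolean" }

    // Every value parses as a long, so long is preferred over double and keyword
    Map<String, String> count = LogStructureUtils.guessScalarMapping(explanation, "count", Arrays.asList("42", "7"));
    // => { "type" : "long" }

    // A long value containing many spaces tips the heuristic towards text
    Map<String, String> msg = LogStructureUtils.guessScalarMapping(explanation, "message",
        Collections.singletonList("failed to connect to the remote host after several retries"));
    // => { "type" : "text" } once the space limit is exceeded, otherwise { "type" : "keyword" }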

+ 38 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactory.java

@@ -0,0 +1,38 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class PipeSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    private static final CsvPreference PIPE_PREFERENCE = new CsvPreference.Builder('"', '|', "\n").build();
+
+    /**
+     * Rules are:
+     * - The file must be valid pipe (<code>|</code>) separated values
+     * - It must contain at least two complete records
+     * - There must be at least five fields per record (otherwise files containing only
+     *   coincidental pipe characters, or none at all, could be treated as pipe separated)
+     * - Every pipe separated value record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 5, PIPE_PREFERENCE, "pipe separated values");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            PIPE_PREFERENCE, true);
+    }
+}
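
A rough caller sketch for the factory contract (illustrative; the real orchestration lives in LogStructureFinderManager, the sample text is made up, and IOException handling is elided):

    List<String> explanation = new ArrayList<>();
    LogStructureFinderFactory factory = new PipeSeparatedValuesLogStructureFinderFactory();
    String sample = "id|name|value|unit|time\n1|temp|23.4|C|2018-05-17T16:23:40Z\n2|temp|23.6|C|2018-05-17T16:24:40Z\n";
    if (factory.canCreateFromSample(explanation, sample)) {
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", Boolean.FALSE);
        LogStructure structure = finder.getStructure(); // delimiter, header, timestamp and mapping guesses
    }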

+ 37 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactory.java

@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class SemiColonSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * Rules are:
+     * - The file must be valid semi-colon separated values
+     * - It must contain at least two complete records
+     * - There must be at least four fields per record (otherwise files containing only
+     *   coincidental semi-colons, or none at all, could be treated as semi-colon separated)
+     * - Every semi-colon separated value record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 4,
+            CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, "semi-colon separated values");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, false);
+    }
+}
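
For comparison with the pipe factory above, this one reuses a built-in Super CSV preference rather than building its own. A hand-built equivalent would look roughly like this (illustrative; the built-in constant uses "\r\n" as its end-of-line symbols):

    CsvPreference semiColonPreference = new CsvPreference.Builder('"', ';', "\r\n").build();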

+ 486 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinder.java

@@ -0,0 +1,486 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+import org.supercsv.exception.SuperCsvException;
+import org.supercsv.io.CsvListReader;
+import org.supercsv.prefs.CsvPreference;
+import org.supercsv.util.Util;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.DoubleSummaryStatistics;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+import java.util.SortedMap;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+public class SeparatedValuesLogStructureFinder implements LogStructureFinder {
+
+    private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static SeparatedValuesLogStructureFinder makeSeparatedValuesLogStructureFinder(List<String> explanation, String sample,
+                                                                                   String charsetName, Boolean hasByteOrderMarker,
+                                                                                   CsvPreference csvPreference, boolean trimFields)
+        throws IOException {
+
+        Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
+        List<List<String>> rows = parsed.v1();
+        List<Integer> lineNumbers = parsed.v2();
+
+        Tuple<Boolean, String[]> headerInfo = findHeaderFromSample(explanation, rows);
+        boolean isHeaderInFile = headerInfo.v1();
+        String[] header = headerInfo.v2();
+        String[] headerWithNamedBlanks = new String[header.length];
+        for (int i = 0; i < header.length; ++i) {
+            String rawHeader = header[i].isEmpty() ? "column" + (i + 1) : header[i];
+            headerWithNamedBlanks[i] = trimFields ? rawHeader.trim() : rawHeader;
+        }
+
+        List<String> sampleLines = Arrays.asList(sample.split("\n"));
+        List<String> sampleMessages = new ArrayList<>();
+        List<Map<String, ?>> sampleRecords = new ArrayList<>();
+        int prevMessageEndLineNumber = isHeaderInFile ? lineNumbers.get(0) : -1;
+        for (int index = isHeaderInFile ? 1 : 0; index < rows.size(); ++index) {
+            List<String> row = rows.get(index);
+            int lineNumber = lineNumbers.get(index);
+            Map<String, String> sampleRecord = new LinkedHashMap<>();
+            Util.filterListToMap(sampleRecord, headerWithNamedBlanks,
+                trimFields ? row.stream().map(String::trim).collect(Collectors.toList()) : row);
+            sampleRecords.add(sampleRecord);
+            sampleMessages.add(
+                sampleLines.subList(prevMessageEndLineNumber + 1, lineNumbers.get(index)).stream().collect(Collectors.joining("\n")));
+            prevMessageEndLineNumber = lineNumber;
+        }
+
+        String preamble = Pattern.compile("\n").splitAsStream(sample).limit(lineNumbers.get(1)).collect(Collectors.joining("\n", "", "\n"));
+
+        char delimiter = (char) csvPreference.getDelimiterChar();
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.fromSeparator(delimiter))
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(preamble)
+            .setNumLinesAnalyzed(lineNumbers.get(lineNumbers.size() - 1))
+            .setNumMessagesAnalyzed(sampleRecords.size())
+            .setHasHeaderRow(isHeaderInFile)
+            .setInputFields(Arrays.stream(headerWithNamedBlanks).collect(Collectors.toList()));
+
+        if (trimFields) {
+            structureBuilder.setShouldTrimFields(true);
+        }
+
+        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
+        if (timeField != null) {
+            String timeLineRegex = null;
+            StringBuilder builder = new StringBuilder("^");
+            // We make the assumption that the timestamp will be on the first line of each record.  Therefore, if the
+            // timestamp is the last column then either our assumption is wrong (and the approach will completely
+            // break down) or else every record is on a single line and there's no point creating a multiline config.
+            // This is why the loop excludes the last column.
+            for (String column : Arrays.asList(header).subList(0, header.length - 1)) {
+                if (timeField.v1().equals(column)) {
+                    builder.append("\"?");
+                    String simpleTimePattern = timeField.v2().simplePattern.pattern();
+                    builder.append(simpleTimePattern.startsWith("\\b") ? simpleTimePattern.substring(2) : simpleTimePattern);
+                    timeLineRegex = builder.toString();
+                    break;
+                } else {
+                    builder.append(".*?");
+                    if (delimiter == '\t') {
+                        builder.append("\\t");
+                    } else {
+                        builder.append(delimiter);
+                    }
+                }
+            }
+
+            if (isHeaderInFile) {
+                structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
+                    .map(column -> "\"?" + column.replace("\"", "\"\"").replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + "\"?")
+                    .collect(Collectors.joining(",")));
+            }
+
+            structureBuilder.setTimestampField(timeField.v1())
+                .setTimestampFormats(timeField.v2().dateFormats)
+                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing())
+                .setMultilineStartPattern(timeLineRegex);
+        }
+
+        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        LogStructure structure = structureBuilder
+            .setMappings(mappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new SeparatedValuesLogStructureFinder(sampleMessages, structure);
+    }
+
+    private SeparatedValuesLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+
+    static Tuple<List<List<String>>, List<Integer>> readRows(String sample, CsvPreference csvPreference) throws IOException {
+
+        int fieldsInFirstRow = -1;
+
+        List<List<String>> rows = new ArrayList<>();
+        List<Integer> lineNumbers = new ArrayList<>();
+
+        try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {
+
+            try {
+                List<String> row;
+                while ((row = csvReader.read()) != null) {
+                    if (fieldsInFirstRow < 0) {
+                        fieldsInFirstRow = row.size();
+                    } else {
+                        // Tolerate extra columns if and only if they're empty
+                        while (row.size() > fieldsInFirstRow && row.get(row.size() - 1) == null) {
+                            row.remove(row.size() - 1);
+                        }
+                    }
+                    rows.add(row);
+                    lineNumbers.add(csvReader.getLineNumber());
+                }
+            } catch (SuperCsvException e) {
+                // Tolerate an incomplete last row
+                if (notUnexpectedEndOfFile(e)) {
+                    throw e;
+                }
+            }
+        }
+
+        assert rows.isEmpty() == false;
+        assert lineNumbers.size() == rows.size();
+
+        if (rows.get(0).size() != rows.get(rows.size() - 1).size()) {
+            rows.remove(rows.size() - 1);
+            lineNumbers.remove(lineNumbers.size() - 1);
+        }
+
+        // This should have been enforced by canCreateFromSample()
+        assert rows.size() > 1;
+
+        return new Tuple<>(rows, lineNumbers);
+    }
+
+    static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, List<List<String>> rows) {
+
+        assert rows.isEmpty() == false;
+
+        List<String> firstRow = rows.get(0);
+
+        boolean isHeaderInFile = true;
+        if (rowContainsDuplicateNonEmptyValues(firstRow)) {
+            isHeaderInFile = false;
+            explanation.add("First row contains duplicate values, so assuming it's not a header");
+        } else {
+            if (rows.size() < 3) {
+                explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
+            } else {
+                isHeaderInFile = isFirstRowUnusual(explanation, rows);
+            }
+        }
+
+        if (isHeaderInFile) {
+            // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us
+            return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new));
+        } else {
+            return new Tuple<>(false, IntStream.rangeClosed(1, firstRow.size()).mapToObj(num -> "column" + num).toArray(String[]::new));
+        }
+    }
+
+    static boolean rowContainsDuplicateNonEmptyValues(List<String> row) {
+
+        HashSet<String> values = new HashSet<>();
+
+        for (String value : row) {
+            if (value != null && value.isEmpty() == false && values.add(value) == false) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    private static boolean isFirstRowUnusual(List<String> explanation, List<List<String>> rows) {
+
+        assert rows.size() >= 3;
+
+        List<String> firstRow = rows.get(0);
+        String firstRowStr = firstRow.stream().map(field -> (field == null) ? "" : field).collect(Collectors.joining(""));
+        List<List<String>> otherRows = rows.subList(1, rows.size());
+        List<String> otherRowStrs = new ArrayList<>();
+        for (List<String> row : otherRows) {
+            otherRowStrs.add(row.stream().map(str -> (str == null) ? "" : str).collect(Collectors.joining("")));
+        }
+
+        // Check lengths
+
+        double firstRowLength = firstRowStr.length();
+        DoubleSummaryStatistics otherRowStats = otherRowStrs.stream().mapToDouble(otherRow -> (double) otherRow.length())
+            .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);
+
+        double otherLengthRange = otherRowStats.getMax() - otherRowStats.getMin();
+        if (firstRowLength < otherRowStats.getMin() - otherLengthRange / 10.0 ||
+            firstRowLength > otherRowStats.getMax() + otherLengthRange / 10.0) {
+            explanation.add("First row is unusual based on length test: [" + firstRowLength + "] and [" +
+                toNiceString(otherRowStats) + "]");
+            return true;
+        }
+
+        explanation.add("First row is not unusual based on length test: [" + firstRowLength + "] and [" +
+            toNiceString(otherRowStats) + "]");
+
+        // Check edit distances
+
+        DoubleSummaryStatistics firstRowStats = otherRows.stream().limit(MAX_LEVENSHTEIN_COMPARISONS)
+            .mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow))
+            .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);
+
+        otherRowStats = new DoubleSummaryStatistics();
+        int numComparisons = 0;
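+        // Choose a stride that spreads the sampled pairs through the whole set of rows;
+        // the loop conditions below cap the total work at MAX_LEVENSHTEIN_COMPARISONS
+        // comparisons, and seeding the Random from the first row keeps results reproducible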
+        int proportion = otherRowStrs.size() / MAX_LEVENSHTEIN_COMPARISONS;
+        int innerIncrement = 1 + proportion * proportion;
+        Random random = new Random(firstRow.hashCode());
+        for (int i = 0; numComparisons < MAX_LEVENSHTEIN_COMPARISONS && i < otherRowStrs.size(); ++i) {
+            for (int j = i + 1 + random.nextInt(innerIncrement); numComparisons < MAX_LEVENSHTEIN_COMPARISONS && j < otherRowStrs.size();
+                 j += innerIncrement) {
+                otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j)));
+                ++numComparisons;
+            }
+        }
+
+        if (firstRowStats.getAverage() > otherRowStats.getAverage() * 1.2) {
+            explanation.add("First row is unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
+                "] and [" + toNiceString(otherRowStats) + "]");
+            return true;
+        }
+
+        explanation.add("First row is not unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
+            "] and [" + toNiceString(otherRowStats) + "]");
+
+        return false;
+    }
+
+    private static String toNiceString(DoubleSummaryStatistics stats) {
+        return String.format(Locale.ROOT, "count=%d, min=%f, average=%f, max=%f", stats.getCount(), stats.getMin(), stats.getAverage(),
+            stats.getMax());
+    }
+
+    /**
+     * Sum of the Levenshtein distances between corresponding elements
+     * in the two supplied lists _excluding_ the biggest difference.
+     * The reason the biggest difference is excluded is that sometimes
+     * there's a "message" field that is much longer than any of the other
+     * fields, varies enormously between rows, and skews the comparison.
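+     * For example (illustrative), comparing ["foo", "2018-05-17", "a message"] with
+     * ["bar", "2018-05-18", "another message"] gives per-field distances [3, 1, 6];
+     * the largest (6) is excluded, so the result is 4.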
+     */
+    static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow) {
+
+        int largestSize = Math.max(firstRow.size(), secondRow.size());
+        if (largestSize <= 1) {
+            return 0;
+        }
+
+        int[] distances = new int[largestSize];
+
+        for (int index = 0; index < largestSize; ++index) {
+            distances[index] = levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "",
+                (index < secondRow.size()) ? secondRow.get(index) : "");
+        }
+
+        Arrays.sort(distances);
+
+        return IntStream.of(distances).limit(distances.length - 1).sum();
+    }
+
+    /**
+     * This method implements the simple algorithm for calculating Levenshtein distance.
+     */
+    static int levenshteinDistance(String first, String second) {
+
+        // There are some examples with pretty pictures of the matrix on Wikipedia here:
+        // http://en.wikipedia.org/wiki/Levenshtein_distance
+
+        int firstLen = (first == null) ? 0 : first.length();
+        int secondLen = (second == null) ? 0 : second.length();
+        if (firstLen == 0) {
+            return secondLen;
+        }
+        if (secondLen == 0) {
+            return firstLen;
+        }
+
+        int[] currentCol = new int[secondLen + 1];
+        int[] prevCol = new int[secondLen + 1];
+
+        // Populate the left column
+        for (int down = 0; down <= secondLen; ++down) {
+            currentCol[down] = down;
+        }
+
+        // Calculate the other entries in the matrix
+        for (int across = 1; across <= firstLen; ++across) {
+            int[] tmp = prevCol;
+            prevCol = currentCol;
+            // We could allocate a new array for currentCol here, but it's more efficient to reuse the one that's now redundant
+            currentCol = tmp;
+
+            currentCol[0] = across;
+
+            for (int down = 1; down <= secondLen; ++down) {
+
+                // Do the strings differ at the point we've reached?
+                if (first.charAt(across - 1) == second.charAt(down - 1)) {
+
+                    // No, they're the same => no extra cost
+                    currentCol[down] = prevCol[down - 1];
+                } else {
+                    // Yes, they differ, so there are 3 options:
+
+                    // 1) Deletion => cell to the left's value plus 1
+                    int option1 = prevCol[down];
+
+                    // 2) Insertion => cell above's value plus 1
+                    int option2 = currentCol[down - 1];
+
+                    // 3) Substitution => cell above left's value plus 1
+                    int option3 = prevCol[down - 1];
+
+                    // Take the cheapest option of the 3
+                    currentCol[down] = Math.min(Math.min(option1, option2), option3) + 1;
+                }
+            }
+        }
+
+        // Result is the value in the bottom right hand corner of the matrix
+        return currentCol[secondLen];
+    }
+
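+    // For example (illustrative), with standard CSV preferences the line
+    // 2018-05-17,a"b,c contains an unescaped quote in the middle of a field,
+    // whereas 2018-05-17,"a,b",c does not, because its quotes are adjacent to delimiters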
+    static boolean lineHasUnescapedQuote(String line, CsvPreference csvPreference) {
+        char quote = csvPreference.getQuoteChar();
+        String lineWithEscapedQuotesRemoved = line.replace(String.valueOf(quote) + quote, "");
+        for (int index = 1; index < lineWithEscapedQuotesRemoved.length() - 1; ++index) {
+            if (lineWithEscapedQuotesRemoved.charAt(index) == quote &&
+                lineWithEscapedQuotesRemoved.codePointAt(index - 1) != csvPreference.getDelimiterChar() &&
+                lineWithEscapedQuotesRemoved.codePointAt(index + 1) != csvPreference.getDelimiterChar()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static boolean canCreateFromSample(List<String> explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference,
+                                       String formatName) {
+
+        // Logstash's CSV parser won't tolerate fields where just part of the
+        // value is quoted, whereas SuperCSV will, hence this extra check
+        String[] sampleLines = sample.split("\n");
+        for (String sampleLine : sampleLines) {
+            if (lineHasUnescapedQuote(sampleLine, csvPreference)) {
+                explanation.add("Not " + formatName +
+                    " because a line has an unescaped quote that is not at the beginning or end of a field: [" + sampleLine + "]");
+                return false;
+            }
+        }
+
+        try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {
+
+            int fieldsInFirstRow = -1;
+            int fieldsInLastRow = -1;
+
+            int numberOfRows = 0;
+            try {
+                List<String> row;
+                while ((row = csvReader.read()) != null) {
+
+                    int fieldsInThisRow = row.size();
+                    ++numberOfRows;
+                    if (fieldsInFirstRow < 0) {
+                        fieldsInFirstRow = fieldsInThisRow;
+                        if (fieldsInFirstRow < minFieldsPerRow) {
+                            explanation.add("Not " + formatName + " because the first row has fewer than [" + minFieldsPerRow +
+                                "] fields: [" + fieldsInFirstRow + "]");
+                            return false;
+                        }
+                        fieldsInLastRow = fieldsInFirstRow;
+                        continue;
+                    }
+
+                    // Tolerate extra columns if and only if they're empty
+                    while (fieldsInThisRow > fieldsInFirstRow && row.get(fieldsInThisRow - 1) == null) {
+                        --fieldsInThisRow;
+                    }
+
+                    if (fieldsInLastRow != fieldsInFirstRow) {
+                        explanation.add("Not " + formatName + " because row [" + (numberOfRows - 1) +
+                            "] has a different number of fields to the first row: [" + fieldsInFirstRow + "] and [" +
+                            fieldsInLastRow + "]");
+                        return false;
+                    }
+
+                    fieldsInLastRow = fieldsInThisRow;
+                }
+
+                if (fieldsInLastRow > fieldsInFirstRow) {
+                    explanation.add("Not " + formatName + " because last row has more fields than first row: [" + fieldsInFirstRow +
+                        "] and [" + fieldsInLastRow + "]");
+                    return false;
+                }
+                if (fieldsInLastRow < fieldsInFirstRow) {
+                    --numberOfRows;
+                }
+            } catch (SuperCsvException e) {
+                // Tolerate an incomplete last row
+                if (notUnexpectedEndOfFile(e)) {
+                    explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
+                    return false;
+                }
+            }
+            if (numberOfRows <= 1) {
+                explanation.add("Not " + formatName + " because fewer than 2 complete records in sample: [" + numberOfRows + "]");
+                return false;
+            }
+            explanation.add("Deciding sample is " + formatName);
+            return true;
+
+        } catch (IOException e) {
+            explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
+            return false;
+        }
+    }
+
+    private static boolean notUnexpectedEndOfFile(SuperCsvException e) {
+        return e.getMessage().startsWith("unexpected end of file while reading quoted column") == false;
+    }
+}
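
A small sketch of how readRows tolerates a truncated final line (illustrative; readRows is package-private and throws IOException, which is elided here):

    Tuple<List<List<String>>, List<Integer>> parsed = SeparatedValuesLogStructureFinder.readRows(
        "a,b,c,d,e\n1,2,3,4,5\n6,7", CsvPreference.STANDARD_PREFERENCE);
    // parsed.v1() holds the two complete five-field rows; the trailing "6,7" stub is
    // dropped because its field count differs from that of the first row.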

+ 201 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinder.java

@@ -0,0 +1,201 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Pattern;
+
+public class TextLogStructureFinder implements LogStructureFinder {
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static TextLogStructureFinder makeTextLogStructureFinder(List<String> explanation, String sample, String charsetName,
+                                                             Boolean hasByteOrderMarker) {
+
+        String[] sampleLines = sample.split("\n");
+        Tuple<TimestampMatch, Set<String>> bestTimestamp = mostLikelyTimestamp(sampleLines);
+        if (bestTimestamp == null) {
+            // Is it appropriate to treat a file that is neither structured nor has
+            // a regular pattern of timestamps as a log file?  Probably not...
+            throw new IllegalArgumentException("Could not find a timestamp in the log sample provided");
+        }
+
+        explanation.add("Most likely timestamp format is [" + bestTimestamp.v1() + "]");
+
+        List<String> sampleMessages = new ArrayList<>();
+        StringBuilder preamble = new StringBuilder();
+        int linesConsumed = 0;
+        StringBuilder message = null;
+        int linesInMessage = 0;
+        String multiLineRegex = createMultiLineMessageStartRegex(bestTimestamp.v2(), bestTimestamp.v1().simplePattern.pattern());
+        Pattern multiLinePattern = Pattern.compile(multiLineRegex);
+        for (String sampleLine : sampleLines) {
+            if (multiLinePattern.matcher(sampleLine).find()) {
+                if (message != null) {
+                    sampleMessages.add(message.toString());
+                    linesConsumed += linesInMessage;
+                }
+                message = new StringBuilder(sampleLine);
+                linesInMessage = 1;
+            } else {
+                // If message is null here then the sample probably began with the incomplete ending of a previous message
+                if (message == null) {
+                    // We count lines before the first message as consumed (just like we would
+                    // for the CSV header or lines before the first XML document starts)
+                    ++linesConsumed;
+                } else {
+                    message.append('\n').append(sampleLine);
+                    ++linesInMessage;
+                }
+            }
+            if (sampleMessages.size() < 2) {
+                preamble.append(sampleLine).append('\n');
+            }
+        }
+        // Don't add the last message, as it might be partial and mess up subsequent pattern finding
+
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.SEMI_STRUCTURED_TEXT)
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(preamble.toString())
+            .setNumLinesAnalyzed(linesConsumed)
+            .setNumMessagesAnalyzed(sampleMessages.size())
+            .setMultilineStartPattern(multiLineRegex);
+
+        SortedMap<String, Object> mappings = new TreeMap<>();
+        mappings.put("message", Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"));
+        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove
+        String interimTimestampField;
+        String grokPattern;
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+        Tuple<String, String> timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern();
+        if (timestampFieldAndFullMatchGrokPattern != null) {
+            interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1();
+            grokPattern = timestampFieldAndFullMatchGrokPattern.v2();
+        } else {
+            interimTimestampField = "timestamp";
+            grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField);
+        }
+
+        LogStructure structure = structureBuilder
+            .setTimestampField(interimTimestampField)
+            .setTimestampFormats(bestTimestamp.v1().dateFormats)
+            .setNeedClientTimezone(bestTimestamp.v1().hasTimezoneDependentParsing())
+            .setGrokPattern(grokPattern)
+            .setMappings(mappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new TextLogStructureFinder(sampleMessages, structure);
+    }
+
+    private TextLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+
+    static Tuple<TimestampMatch, Set<String>> mostLikelyTimestamp(String[] sampleLines) {
+
+        Map<TimestampMatch, Tuple<Double, Set<String>>> timestampMatches = new LinkedHashMap<>();
+
+        int remainingLines = sampleLines.length;
+        double differenceBetweenTwoHighestWeights = 0.0;
+        for (String sampleLine : sampleLines) {
+            TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine);
+            if (match != null) {
+                TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern,
+                    match.grokPatternName, "");
+                timestampMatches.compute(pureMatch, (k, v) -> {
+                    if (v == null) {
+                        return new Tuple<>(weightForMatch(match.preface), new HashSet<>(Collections.singletonList(match.preface)));
+                    } else {
+                        v.v2().add(match.preface);
+                        return new Tuple<>(v.v1() + weightForMatch(match.preface), v.v2());
+                    }
+                });
+                differenceBetweenTwoHighestWeights = findDifferenceBetweenTwoHighestWeights(timestampMatches.values());
+            }
+            // The highest possible weight is 1, so if the difference between the two highest weights
+            // is greater than the number of lines remaining then the leader cannot possibly be overtaken
+            if (differenceBetweenTwoHighestWeights > --remainingLines) {
+                break;
+            }
+        }
+
+        double highestWeight = 0.0;
+        Tuple<TimestampMatch, Set<String>> highestWeightMatch = null;
+        for (Map.Entry<TimestampMatch, Tuple<Double, Set<String>>> entry : timestampMatches.entrySet()) {
+            double weight = entry.getValue().v1();
+            if (weight > highestWeight) {
+                highestWeight = weight;
+                highestWeightMatch = new Tuple<>(entry.getKey(), entry.getValue().v2());
+            }
+        }
+        return highestWeightMatch;
+    }
+
+    /**
+     * Used to weight a timestamp match according to how far along the line it is found.
+     * Timestamps at the very beginning of the line are given a weight of 1.  The weight
+     * progressively decreases the more text there is preceding the timestamp match, but
+     * is always greater than 0.
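+     * For example, an empty preface gives a weight of 1.0, while a preface of
+     * 15 characters gives roughly 0.47.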
+     * @return A weight in the range (0, 1].
+     */
+    private static double weightForMatch(String preface) {
+        return Math.pow(1.0 + preface.length() / 15.0, -1.1);
+    }
+
+    private static double findDifferenceBetweenTwoHighestWeights(Collection<Tuple<Double, Set<String>>> timestampMatches) {
+        double highestWeight = 0.0;
+        double secondHighestWeight = 0.0;
+        for (Tuple<Double, Set<String>> timestampMatch : timestampMatches) {
+            double weight = timestampMatch.v1();
+            if (weight > highestWeight) {
+                secondHighestWeight = highestWeight;
+                highestWeight = weight;
+            } else if (weight > secondHighestWeight) {
+                secondHighestWeight = weight;
+            }
+        }
+        return highestWeight - secondHighestWeight;
+    }
+
+    static String createMultiLineMessageStartRegex(Collection<String> prefaces, String timestampRegex) {
+
+        StringBuilder builder = new StringBuilder("^");
+        GrokPatternCreator.addIntermediateRegex(builder, prefaces);
+        builder.append(timestampRegex);
+        if (builder.substring(0, 3).equals("^\\b")) {
+            builder.delete(1, 3);
+        }
+        return builder.toString();
+    }
+}
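
The message-splitting loop above boils down to the following sketch (illustrative; the hard-coded start pattern stands in for the regex that createMultiLineMessageStartRegex derives from the sample):

    Pattern start = Pattern.compile("^\\d{4}-\\d{2}-\\d{2} ");
    String sample = "2018-05-17 16:23:40 first event\n    at SomeClass.method()\n2018-05-17 16:23:41 second event\n";
    List<String> messages = new ArrayList<>();
    StringBuilder current = null;
    for (String line : sample.split("\n")) {
        if (start.matcher(line).find()) {
            if (current != null) {
                messages.add(current.toString());
            }
            current = new StringBuilder(line);
        } else if (current != null) {
            current.append('\n').append(line);
        }
    }
    // As in makeTextLogStructureFinder, the final (possibly truncated) message is never
    // added, so messages ends up holding just the first event and its continuation line.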

+ 39 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactory.java

@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.List;
+import java.util.regex.Pattern;
+
+public class TextLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    // This works because, by default, dot doesn't match newlines
+    private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+.");
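+    // For example, "a\nb" and "a\n\nb" match (a non-newline character on either
+    // side of one or more newlines), but "a\n" and "\n\n" do not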
+
+    /**
+     * This format matches if the sample contains at least one newline and at least two
+     * non-blank lines.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        if (sample.indexOf('\n') < 0) {
+            explanation.add("Not text because sample contains no newlines");
+            return false;
+        }
+        if (TWO_NON_BLANK_LINES_PATTERN.matcher(sample).find() == false) {
+            explanation.add("Not text because sample contains fewer than two non-blank lines");
+            return false;
+        }
+
+        explanation.add("Deciding sample is text");
+        return true;
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker) {
+        return TextLogStructureFinder.makeTextLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    }
+}

+ 427 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinder.java

@@ -0,0 +1,427 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.grok.Grok;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Used to find the best timestamp format for one of the following situations:
+ * 1. Matching an entire field value
+ * 2. Matching a timestamp found somewhere within a message
+ */
+public final class TimestampFormatFinder {
+
+    private static final String PREFACE = "preface";
+    private static final String EPILOGUE = "epilogue";
+
+    private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([:.,])(\\d{3,9})");
+    private static final char DEFAULT_FRACTIONAL_SECOND_SEPARATOR = ',';
+
+    /**
+     * The timestamp patterns are complex and it can be slow to prove they do not
+     * match anywhere in a long message.  Many of the timestamps are similar and
+     * will never be found in a string if simpler sub-patterns do not exist in the
+     * string.  These sub-patterns can be used to quickly rule out multiple complex
+     * patterns.  These patterns do not need to represent quantities that are
+     * useful to know the value of, merely character sequences that can be used to
+     * prove that <em>several</em> more complex patterns cannot possibly match.
+     */
+    private static final List<Pattern> QUICK_RULE_OUT_PATTERNS = Arrays.asList(
+        // YYYY-MM-dd followed by a space
+        Pattern.compile("\\b\\d{4}-\\d{2}-\\d{2} "),
+        // The end of some number (likely year or day) followed by a space then HH:mm
+        Pattern.compile("\\d \\d{2}:\\d{2}\\b"),
+        // HH:mm:ss surrounded by spaces
+        Pattern.compile(" \\d{2}:\\d{2}:\\d{2} ")
+    );
+
+    /**
+     * The first match in this list will be chosen, so it needs to be ordered
+     * such that more generic patterns come after more specific patterns.
+     */
+    static final List<CandidateTimestampFormat> ORDERED_CANDIDATE_FORMATS = Arrays.asList(
+        // The TOMCAT_DATESTAMP format has to come before ISO8601 because it's basically ISO8601 but
+        // with a space before the timezone, and because the timezone is optional in ISO8601 a
+        // TOMCAT_DATESTAMP value would be recognised as ISO8601 with the timezone omitted if
+        // ISO8601 were checked first
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS Z", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b20\\d{2}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9} (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
+            "TOMCAT_DATESTAMP", Arrays.asList(0, 1)),
+        // The Elasticsearch ISO8601 parser requires a literal T between the date and time, so
+        // longhand formats are needed if there's a space instead
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b",
+            "TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b",
+            "TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)(?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[+-]%{HOUR}:%{MINUTE}\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b",
+            "TIMESTAMP_ISO8601"),
+        new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm:ss zzz",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ}\\b", "DATESTAMP_RFC822", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE} %{TZ}\\b", "DATESTAMP_RFC822", Collections.singletonList(1)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss ZZ",
+            "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}:%{MINUTE})\\b",
+            "DATESTAMP_RFC2822", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss Z",
+            "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
+            "DATESTAMP_RFC2822", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm ZZ", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", "DATESTAMP_RFC2822",
+            Collections.singletonList(1)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm Z", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "DATESTAMP_RFC2822",
+            Collections.singletonList(1)),
+        new CandidateTimestampFormat("EEE MMM dd HH:mm:ss zzz YYYY",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER",
+            Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE MMM dd HH:mm zzz YYYY",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE} %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", Collections.singletonList(1)),
+        new CandidateTimestampFormat("YYYYMMddHHmmss", "\\b\\d{14}\\b",
+            "\\b20\\d{2}%{MONTHNUM2}(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01]))(?:2[0123]|[01][0-9])%{MINUTE}(?:[0-5][0-9]|60)\\b",
+            "DATESTAMP_EVENTLOG"),
+        new CandidateTimestampFormat("EEE MMM dd HH:mm:ss YYYY",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM  d HH:mm:ss,SSS"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "SYSLOGTIMESTAMP",
+            Collections.singletonList(1)),
+        new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b",
+            "SYSLOGTIMESTAMP", Collections.singletonList(1)),
+        new CandidateTimestampFormat("dd/MMM/YYYY:HH:mm:ss Z", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE"),
+        new CandidateTimestampFormat("MMM dd, YYYY K:mm:ss a", "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b",
+            "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP"),
+        new CandidateTimestampFormat(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
+            "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", Collections.singletonList(1)),
+        new CandidateTimestampFormat("UNIX_MS", "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT"),
+        new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "\\b\\d{10}\\.(?:\\d{3}){1,3}\\b", "NUMBER"),
+        new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\b", "\\b\\d{10}\\b", "POSINT"),
+        new CandidateTimestampFormat("TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM")
+    );
+
+    private TimestampFormatFinder() {
+    }
+
+    /**
+     * Find the first timestamp format that matches part of the supplied value.
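+     * For example, given the line <code>[2018-05-17T16:23:40,422][INFO ] starting</code>,
+     * the ISO8601 candidate is the first to match, giving a preface of <code>[</code>
+     * and an epilogue of <code>][INFO ] starting</code>.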
+     * @param text The value that the returned timestamp format must exist within.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstMatch(String text) {
+        return findFirstMatch(text, 0);
+    }
+
+    /**
+     * Find the first timestamp format that matches part of the supplied value,
+     * excluding a specified number of candidate formats.
+     * @param text The value that the returned timestamp format must exist within.
+     * @param ignoreCandidates The number of candidate formats to exclude from the search.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) {
+        Boolean[] quickRuleOutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()];
+        int index = ignoreCandidates;
+        for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
+            boolean quicklyRuledOut = false;
+            for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) {
+                if (quickRuleOutMatches[quickRuleOutIndex] == null) {
+                    quickRuleOutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find();
+                }
+                if (quickRuleOutMatches[quickRuleOutIndex] == false) {
+                    quicklyRuledOut = true;
+                    break;
+                }
+            }
+            if (quicklyRuledOut == false) {
+                Map<String, Object> captures = candidate.strictSearchGrok.captures(text);
+                if (captures != null) {
+                    String preface = captures.getOrDefault(PREFACE, "").toString();
+                    String epilogue = captures.getOrDefault(EPILOGUE, "").toString();
+                    return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(),
+                        text.length() - epilogue.length()), epilogue);
+                }
+            }
+            ++index;
+        }
+        return null;
+    }
+
+    /**
+     * Find the best timestamp format for matching an entire field value.
+     * @param text The value that the returned timestamp format must match in its entirety.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstFullMatch(String text) {
+        return findFirstFullMatch(text, 0);
+    }
+
+    /**
+     * Find the best timestamp format for matching an entire field value,
+     * excluding a specified number of candidate formats.
+     * @param text The value that the returned timestamp format must match in its entirety.
+     * @param ignoreCandidates The number of candidate formats to exclude from the search.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) {
+        int index = ignoreCandidates;
+        for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
+            Map<String, Object> captures = candidate.strictFullMatchGrok.captures(text);
+            if (captures != null) {
+                return makeTimestampMatch(candidate, index, "", text, "");
+            }
+            ++index;
+        }
+        return null;
+    }
+
+    private static TimestampMatch makeTimestampMatch(CandidateTimestampFormat chosenTimestampFormat, int chosenIndex,
+                                                     String preface, String matchedDate, String epilogue) {
+        Tuple<Character, Integer> fractionalSecondsInterpretation = interpretFractionalSeconds(matchedDate);
+        List<String> dateFormats = chosenTimestampFormat.dateFormats;
+        Pattern simplePattern = chosenTimestampFormat.simplePattern;
+        char separator = fractionalSecondsInterpretation.v1();
+        if (separator != DEFAULT_FRACTIONAL_SECOND_SEPARATOR) {
+            dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator))
+                .collect(Collectors.toList());
+            if (dateFormats.stream().noneMatch(dateFormat -> dateFormat.startsWith("UNIX"))) {
+                String patternStr = simplePattern.pattern();
+                int separatorPos = patternStr.lastIndexOf(DEFAULT_FRACTIONAL_SECOND_SEPARATOR);
+                if (separatorPos >= 0) {
+                    StringBuilder newPatternStr = new StringBuilder(patternStr);
+                    newPatternStr.replace(separatorPos, separatorPos + 1, ((separator == '.') ? "\\" : "") + separator);
+                    simplePattern = Pattern.compile(newPatternStr.toString());
+                }
+            }
+        }
+        int numberOfDigitsInFractionalComponent = fractionalSecondsInterpretation.v2();
+        if (numberOfDigitsInFractionalComponent > 3) {
+            String fractionalSecondsFormat = "SSSSSSSSS".substring(0, numberOfDigitsInFractionalComponent);
+            dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace("SSS", fractionalSecondsFormat))
+                .collect(Collectors.toList());
+        }
+        return new TimestampMatch(chosenIndex, preface, dateFormats, simplePattern, chosenTimestampFormat.standardGrokPatternName,
+            epilogue);
+    }
+
+    /**
+     * Interpret the fractional seconds component of a date to determine two things:
+     * 1. The separator character - one of colon, comma and dot.
+     * 2. The number of digits in the fractional component.
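+     * For example, "2018-05-17T16:23:40.123456" yields a separator of '.' and 6 digits.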
+     * @param date The textual representation of the date for which fractional seconds are to be interpreted.
+     * @return A tuple of (fractional second separator character, number of digits in fractional component).
+     */
+    static Tuple<Character, Integer> interpretFractionalSeconds(String date) {
+
+        Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(date);
+        if (matcher.find()) {
+            return new Tuple<>(matcher.group(1).charAt(0), matcher.group(2).length());
+        }
+
+        return new Tuple<>(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, 0);
+    }
+
+    /**
+     * Represents a timestamp that has matched a field value or been found within a message.
+     */
+    public static final class TimestampMatch {
+
+        /**
+         * The index of the corresponding entry in the <code>ORDERED_CANDIDATE_FORMATS</code> list.
+         */
+        public final int candidateIndex;
+
+        /**
+         * Text that came before the timestamp in the matched field/message.
+         */
+        public final String preface;
+
+        /**
+         * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers.
+         */
+        public final List<String> dateFormats;
+
+        /**
+         * A simple regex that will work in many languages to detect whether the timestamp format
+         * exists in a particular line.
+         */
+        public final Pattern simplePattern;
+
+        /**
+         * Name of an out-of-the-box Grok pattern that will match the timestamp.
+         */
+        public final String grokPatternName;
+
+        /**
+         * Text that came after the timestamp in the matched field/message.
+         */
+        public final String epilogue;
+
+        TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue) {
+            this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
+        }
+
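+        // Note: the extra flag below is not used in construction; it presumably just documents
+        // sub-millisecond precision at the call site (assumption)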
+        TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue,
+                       boolean hasFractionalComponentSmallerThanMillisecond) {
+            this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
+        }
+
+        TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, String simpleRegex, String grokPatternName,
+                       String epilogue) {
+            this(candidateIndex, preface, dateFormats, Pattern.compile(simpleRegex), grokPatternName, epilogue);
+        }
+
+        TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, Pattern simplePattern, String grokPatternName,
+                       String epilogue) {
+            this.candidateIndex = candidateIndex;
+            this.preface = preface;
+            this.dateFormats = dateFormats;
+            this.simplePattern = simplePattern;
+            this.grokPatternName = grokPatternName;
+            this.epilogue = epilogue;
+        }
+
+        /**
+         * Does parsing the timestamp produce different results depending on the timezone of the parser?
+         * I.e., does the textual representation NOT define the timezone?
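+         * For example (illustrative), "YYYY-MM-dd HH:mm:ss" is timezone-dependent, whereas
+         * "YYYY-MM-dd HH:mm:ss,SSSZ" is not, as it includes the zone.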
+         */
+        public boolean hasTimezoneDependentParsing() {
+            return dateFormats.stream()
+                .anyMatch(dateFormat -> dateFormat.contains("HH") && dateFormat.toLowerCase(Locale.ROOT).indexOf('z') == -1);
+        }
+
+        /**
+         * Sometimes Elasticsearch mappings for dates need to include the format.
+         * This method returns appropriate mappings settings: at minimum "type"="date",
+         * and possibly also a "format" setting.
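+         * For example (illustrative), formats of ["UNIX_MS"] yield
+         * { "type": "date", "format": "epoch_millis" }, while ["ISO8601"] yields
+         * just { "type": "date" }.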
+         */
+        public Map<String, String> getEsDateMappingTypeWithFormat() {
+            if (dateFormats.contains("TAI64N")) {
+                // There's no format for TAI64N in the date formats used in mappings
+                return Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
+            }
+            Map<String, String> mapping = new LinkedHashMap<>();
+            mapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
+            String formats = dateFormats.stream().flatMap(format -> {
+                switch (format) {
+                    case "ISO8601":
+                        return Stream.empty();
+                    case "UNIX_MS":
+                        return Stream.of("epoch_millis");
+                    case "UNIX":
+                        return Stream.of("epoch_second");
+                    default:
+                        return Stream.of(format);
+                }
+            }).collect(Collectors.joining("||"));
+            if (formats.isEmpty() == false) {
+                mapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, formats);
+            }
+            return mapping;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(candidateIndex, preface, dateFormats, simplePattern.pattern(), grokPatternName, epilogue);
+        }
+
+        @Override
+        public boolean equals(Object other) {
+            if (this == other) {
+                return true;
+            }
+            if (other == null || getClass() != other.getClass()) {
+                return false;
+            }
+
+            TimestampMatch that = (TimestampMatch) other;
+            return this.candidateIndex == that.candidateIndex &&
+                Objects.equals(this.preface, that.preface) &&
+                Objects.equals(this.dateFormats, that.dateFormats) &&
+                Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) &&
+                Objects.equals(this.grokPatternName, that.grokPatternName) &&
+                Objects.equals(this.epilogue, that.epilogue);
+        }
+
+        @Override
+        public String toString() {
+            return "index = " + candidateIndex + (preface.isEmpty() ? "" : ", preface = '" + preface + "'") +
+                ", date formats = " + dateFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) +
+                ", simple pattern = '" + simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" +
+                (epilogue.isEmpty() ? "" : ", epilogue = '" + epilogue + "'");
+        }
+    }
+
+    static final class CandidateTimestampFormat {
+
+        final List<String> dateFormats;
+        final Pattern simplePattern;
+        final Grok strictSearchGrok;
+        final Grok strictFullMatchGrok;
+        final String standardGrokPatternName;
+        final List<Integer> quickRuleOutIndices;
+
+        CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
+            this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName);
+        }
+
+        CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
+                                 List<Integer> quickRuleOutIndices) {
+            this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName, quickRuleOutIndices);
+        }
+
+        CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
+            this(dateFormats, simpleRegex, strictGrokPattern, standardGrokPatternName, Collections.emptyList());
+        }
+
+        CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
+                                 List<Integer> quickRuleOutIndices) {
+            this.dateFormats = dateFormats;
+            this.simplePattern = Pattern.compile(simpleRegex, Pattern.MULTILINE);
+            // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
+            this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern +
+                "%{GREEDYDATA:" + EPILOGUE + "}");
+            this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern);
+            this.standardGrokPatternName = standardGrokPatternName;
+            assert quickRuleOutIndices.stream()
+                .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size());
+            this.quickRuleOutIndices = quickRuleOutIndices;
+        }
+    }
+}

+ 35 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class TsvLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * Rules are:
+     * - The file must be valid TSV
+     * - It must contain at least two complete records
+     * - There must be at least two fields per record (otherwise files with no tabs could be treated as TSV!)
+     * - Every TSV record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
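+     * For example (illustrative), a sample of two lines, each containing two tab-separated
+     * fields, satisfies these rules, whereas lines containing no tabs do not.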
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.TAB_PREFERENCE, "TSV");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            CsvPreference.TAB_PREFERENCE, false);
+    }
+}

+ 172 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinder.java

@@ -0,0 +1,172 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Pattern;
+
+public class XmlLogStructureFinder implements LogStructureFinder {
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static XmlLogStructureFinder makeXmlLogStructureFinder(List<String> explanation, String sample, String charsetName,
+                                                           Boolean hasByteOrderMarker)
+        throws IOException, ParserConfigurationException, SAXException {
+
+        String messagePrefix;
+        try (Scanner scanner = new Scanner(sample)) {
+            messagePrefix = scanner.next();
+        }
+
+        DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
+        docBuilderFactory.setNamespaceAware(false);
+        docBuilderFactory.setValidating(false);
+
+        List<String> sampleMessages = new ArrayList<>();
+        List<Map<String, ?>> sampleRecords = new ArrayList<>();
+
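+        // Split on the first token of the sample (e.g. "<log4j:event" - illustrative):
+        // chunk 0 is any preamble and each subsequent chunk is one XML document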
+        String[] sampleDocEnds = sample.split(Pattern.quote(messagePrefix));
+        StringBuilder preamble = new StringBuilder(sampleDocEnds[0]);
+        int linesConsumed = numNewlinesIn(sampleDocEnds[0]);
+        for (int i = 1; i < sampleDocEnds.length; ++i) {
+            String sampleDoc = messagePrefix + sampleDocEnds[i];
+            if (i < 3) {
+                preamble.append(sampleDoc);
+            }
+            DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
+            try (InputStream is = new ByteArrayInputStream(sampleDoc.getBytes(StandardCharsets.UTF_8))) {
+                sampleRecords.add(docToMap(docBuilder.parse(is)));
+                sampleMessages.add(sampleDoc);
+                linesConsumed += numNewlinesIn(sampleDoc);
+            } catch (SAXException e) {
+                // Tolerate an incomplete last record as long as we have one complete record
+                if (sampleRecords.isEmpty() || i < sampleDocEnds.length - 1) {
+                    throw e;
+                }
+            }
+        }
+
+        if (sample.endsWith("\n") == false) {
+            ++linesConsumed;
+        }
+
+        // If we get here, the XML parser should have confirmed this
+        assert messagePrefix.charAt(0) == '<';
+        String topLevelTag = messagePrefix.substring(1);
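+        // E.g. a prefix of "<log4j:event" gives a topLevelTag of "log4j:event" (illustrative),
+        // which also forms the multi-line start pattern below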
+
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.XML)
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(preamble.toString())
+            .setNumLinesAnalyzed(linesConsumed)
+            .setNumMessagesAnalyzed(sampleRecords.size())
+            .setMultilineStartPattern("^\\s*<" + topLevelTag);
+
+        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
+        if (timeField != null) {
+            structureBuilder.setTimestampField(timeField.v1())
+                .setTimestampFormats(timeField.v2().dateFormats)
+                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
+        }
+
+        SortedMap<String, Object> innerMappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
+        secondLevelProperties.put(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
+        secondLevelProperties.put(LogStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
+        SortedMap<String, Object> outerMappings = new TreeMap<>();
+        outerMappings.put(topLevelTag, secondLevelProperties);
+        outerMappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD,
+            Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        LogStructure structure = structureBuilder
+            .setMappings(outerMappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new XmlLogStructureFinder(sampleMessages, structure);
+    }
+
+    private XmlLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+
+    private static int numNewlinesIn(String str) {
+        return (int) str.chars().filter(c -> c == '\n').count();
+    }
+
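+    // Flattens a parsed document into a map, e.g. (illustrative)
+    // <event level="INFO"><message>hi</message></event> becomes
+    // { level=INFO, message=hi }; nested elements become nested maps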
+    private static Map<String, Object> docToMap(Document doc) {
+
+        Map<String, Object> docAsMap = new LinkedHashMap<>();
+
+        doc.getDocumentElement().normalize();
+        addNodeToMap(doc.getDocumentElement(), docAsMap);
+
+        return docAsMap;
+    }
+
+    private static void addNodeToMap(Node node, Map<String, Object> nodeAsMap) {
+
+        NamedNodeMap attributes = node.getAttributes();
+        for (int i = 0; i < attributes.getLength(); ++i) {
+            Node attribute = attributes.item(i);
+            nodeAsMap.put(attribute.getNodeName(), attribute.getNodeValue());
+        }
+
+        NodeList children = node.getChildNodes();
+        for (int i = 0; i < children.getLength(); ++i) {
+            Node child = children.item(i);
+            if (child.getNodeType() == Node.ELEMENT_NODE) {
+                if (child.getChildNodes().getLength() == 1) {
+                    Node grandChild = child.getChildNodes().item(0);
+                    String value = grandChild.getNodeValue().trim();
+                    if (value.isEmpty() == false) {
+                        nodeAsMap.put(child.getNodeName(), value);
+                    }
+                } else {
+                    Map<String, Object> childNodeAsMap = new LinkedHashMap<>();
+                    addNodeToMap(child, childNodeAsMap);
+                    if (childNodeAsMap.isEmpty() == false) {
+                        nodeAsMap.put(child.getNodeName(), childNodeAsMap);
+                    }
+                }
+            }
+        }
+    }
+}

+ 122 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactory.java

@@ -0,0 +1,122 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.stream.Location;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.List;
+
+public class XmlLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    private final XMLInputFactory xmlFactory;
+
+    public XmlLogStructureFinderFactory() {
+        xmlFactory = XMLInputFactory.newInstance();
+        xmlFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
+        xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
+    }
+
+    /**
+     * This format matches if the sample consists of one or more XML documents,
+     * all with the same root element name.  If there is more than one document,
+     * only whitespace is allowed in between them.  The last one does not
+     * necessarily have to be complete (as the sample could have truncated it).
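+     * For example (illustrative), "<a>1</a> <a>2</a>" matches, but
+     * "<a>1</a> <b>2</b>" does not because the root element names differ.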
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+
+        int completeDocCount = 0;
+        String commonRootElementName = null;
+        String remainder = sample.trim();
+        boolean mightBeAnotherDocument = !remainder.isEmpty();
+
+        // This processing is extremely complicated because it's necessary
+        // to create a new XML stream reader per document, but each one
+        // will read ahead so will potentially consume characters from the
+        // following document.  We must therefore also recreate the string
+        // reader for each document.
+        while (mightBeAnotherDocument) {
+
+            try (Reader reader = new StringReader(remainder)) {
+
+                XMLStreamReader xmlReader = xmlFactory.createXMLStreamReader(reader);
+                try {
+                    int nestingLevel = 0;
+                    while ((mightBeAnotherDocument = xmlReader.hasNext())) {
+                        switch (xmlReader.next()) {
+                            case XMLStreamReader.START_ELEMENT:
+                                if (nestingLevel++ == 0) {
+                                    String rootElementName = xmlReader.getLocalName();
+                                    if (commonRootElementName == null) {
+                                        commonRootElementName = rootElementName;
+                                    } else if (commonRootElementName.equals(rootElementName) == false) {
+                                        explanation.add("Not XML because different documents have different root " +
+                                            "element names: [" + commonRootElementName + "] and [" + rootElementName + "]");
+                                        return false;
+                                    }
+                                }
+                                break;
+                            case XMLStreamReader.END_ELEMENT:
+                                if (--nestingLevel < 0) {
+                                    explanation.add("Not XML because an end element occurs before a start element");
+                                    return false;
+                                }
+                                break;
+                        }
+                        if (nestingLevel == 0) {
+                            ++completeDocCount;
+                            // Find the position that's one character beyond the end of the end element.
+                            // The next document (if there is one) must start after this (possibly
+                            // preceded by whitespace).
+                            Location location = xmlReader.getLocation();
+                            int endPos = 0;
+                            // Line and column numbers start at 1, not 0
+                            for (int wholeLines = location.getLineNumber() - 1; wholeLines > 0; --wholeLines) {
+                                endPos = remainder.indexOf('\n', endPos) + 1;
+                                if (endPos == 0) {
+                                    explanation.add("Not XML because XML parser location is inconsistent: line [" +
+                                        location.getLineNumber() + "], column [" + location.getColumnNumber() + "] in [" + remainder + "]");
+                                    return false;
+                                }
+                            }
+                            endPos += location.getColumnNumber() - 1;
+                            remainder = remainder.substring(endPos).trim();
+                            mightBeAnotherDocument = !remainder.isEmpty();
+                            break;
+                        }
+                    }
+                } finally {
+                    xmlReader.close();
+                }
+            } catch (IOException | XMLStreamException e) {
+                explanation.add("Not XML because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
+                return false;
+            }
+        }
+
+        if (completeDocCount == 0) {
+            explanation.add("Not XML because sample didn't contain a complete document");
+            return false;
+        }
+
+        explanation.add("Deciding sample is XML");
+        return true;
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException, ParserConfigurationException, SAXException {
+        return XmlLogStructureFinder.makeXmlLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    }
+}

+ 38 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactoryTests.java

@@ -0,0 +1,38 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class CsvLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
+
+    // No need to check JSON or XML because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenCsv() {
+
+        assertTrue(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 326 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreatorTests.java

@@ -0,0 +1,326 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.GrokPatternCreator.ValueOnlyGrokPatternCandidate;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.hamcrest.Matchers.containsInAnyOrder;
+
+public class GrokPatternCreatorTests extends LogStructureTestCase {
+
+    public void testBuildFieldName() {
+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
+        assertEquals("field", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("field2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("field3", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("extra_timestamp", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
+        assertEquals("field4", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("uri", GrokPatternCreator.buildFieldName(fieldNameCountStore, "uri"));
+        assertEquals("extra_timestamp2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
+        assertEquals("field5", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+    }
+
+    public void testPopulatePrefacesAndEpiloguesGivenTimestamp() {
+
+        Collection<String> matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
+            "[2018-01-24T12:33:23] ERROR ",
+            "junk [2018-01-22T07:33:23] INFO ",
+            "[2018-01-21T03:33:23] DEBUG ");
+        ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp");
+
+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
+        Collection<String> prefaces = new ArrayList<>();
+        Collection<String> epilogues = new ArrayList<>();
+
+        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
+
+        assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "["));
+        assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG "));
+    }
+
+    public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() {
+
+        Collection<String> matchingStrings = Arrays.asList("before alice@acme.com after",
+            "abc bob@acme.com xyz",
+            "carol@acme.com");
+        ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email");
+
+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
+        Collection<String> prefaces = new ArrayList<>();
+        Collection<String> epilogues = new ArrayList<>();
+
+        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
+
+        assertThat(prefaces, containsInAnyOrder("before ", "abc ", ""));
+        assertThat(epilogues, containsInAnyOrder(" after", " xyz", ""));
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
+
+        Collection<String> snippets = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
+            "[2018-01-24T12:33:23] ERROR ",
+            "junk [2018-01-22T07:33:23] INFO ",
+            "[2018-01-21T03:33:23] DEBUG ");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ",
+            grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
+
+        Collection<String> snippets = Arrays.asList("(-2)",
+            "  (-3)",
+            " (4)",
+            " (-5) ");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() {
+
+        Collection<String> snippets = Arrays.asList("before-2 ",
+            "prior to-3",
+            "-4");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        // It seems sensible that we don't detect these suffixes as either base 10 or base 16 numbers
+        assertEquals(".*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
+
+        Collection<String> snippets = Arrays.asList(" abc",
+            "  123",
+            " -123",
+            "1f is hex");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
+
+        Collection<String> snippets = Arrays.asList("<host1.1.p2ps:",
+            "<host2.1.p2ps:");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        // We don't want the .1. in the middle to get detected as a hex number
+        assertEquals("<.*?:", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
+
+        Collection<String> snippets = Arrays.asList("before alice@acme.com after",
+            "abc bob@acme.com xyz",
+            "carol@acme.com");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?%{EMAILADDRESS:email}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenUris() {
+
+        Collection<String> snippets = Arrays.asList("main site https://www.elastic.co/ with trailing slash",
+            "https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section",
+            "download today from https://www.elastic.co/downloads");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?%{URI:uri}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenPaths() {
+
+        Collection<String> snippets = Arrays.asList("on Mac /Users/dave",
+            "on Windows C:\\Users\\dave",
+            "on Linux /home/dave");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*? .*? %{PATH:path}", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenKvPairs() {
+
+        Collection<String> snippets = Arrays.asList("foo=1 and bar=a",
+            "something foo=2 bar=b something else",
+            "foo=3 bar=c",
+            " foo=1 bar=a ");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?\\bfoo=%{USER:foo} .*?\\bbar=%{USER:bar}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testCreateGrokPatternFromExamplesGivenNamedLogs() {
+
+        Collection<String> sampleMessages = Arrays.asList(
+            "Sep  8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53",
+            "Sep  8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53",
+            "Sep  8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53",
+            "Sep  8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " +
+                "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}",
+            grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", "timestamp"));
+        assertEquals(5, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("field2"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field3"));
+    }
+
+    public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() {
+
+        Collection<String> sampleMessages = Arrays.asList(
+            "Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.",
+            "Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.",
+            "Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.",
+            "Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*",
+            grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp"));
+        assertEquals(1, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
+    }
+
+    public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() {
+
+        // Two timestamps: one local, one UTC
+        Collection<String> sampleMessages = Arrays.asList(
+            "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" +
+                "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*",
+            grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp"));
+        assertEquals(5, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"),
+            mappings.get("extra_timestamp"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
+    }
+
+    public void testFindFullLineGrokPatternGivenApacheCombinedLogs() {
+        Collection<String> sampleMessages = Arrays.asList(
+            "83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1\" 200 203023 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
+            "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1\" 200 7697 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
+            "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1\" 200 26185 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
+            "83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1\" 200 430406 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern());
+        assertEquals(10, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bytes"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("clientip"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double"), mappings.get("httpversion"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("ident"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("referrer"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("request"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("response"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("verb"));
+    }
+
+    public void testAdjustForPunctuationGivenCommonPrefix() {
+        Collection<String> snippets = Arrays.asList(
+            "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.212\",\"No-lookup\",\"192.168.33.132\"," +
+                "\"80\",\"46721\",\"/Common/Subnet_33\",\"TCP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
+                ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
+            "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.143.244\",\"No-lookup\",\"192.168.33.106\"," +
+                "\"55025\",\"162\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
+                ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
+            "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.3\",\"No-lookup\",\"224.0.0.102\"," +
+                "\"3222\",\"3222\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
+                ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\""
+        );
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
+
+        assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+        assertNotNull(adjustedSnippets);
+        assertThat(new ArrayList<>(adjustedSnippets),
+            containsInAnyOrder(snippets.stream().map(snippet -> snippet.substring(2)).toArray(String[]::new)));
+    }
+
+    public void testAdjustForPunctuationGivenNoCommonPrefix() {
+        Collection<String> snippets = Arrays.asList(
+            "|client (id:2) was removed from servergroup 'Normal'(id:7) by client 'User1'(id:2)",
+            "|servergroup 'GAME'(id:9) was added by 'User1'(id:2)",
+            "|permission 'i_group_auto_update_type'(id:146) with values (value:30, negated:0, skipchannel:0) " +
+                "was added by 'User1'(id:2) to servergroup 'GAME'(id:9)"
+        );
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
+
+        assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+        assertSame(snippets, adjustedSnippets);
+    }
+}

+ 46 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactoryTests.java

@@ -0,0 +1,46 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class JsonLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
+
+    public void testCanCreateFromSampleGivenJson() {
+
+        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenXml() {
+
+        assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenCsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 39 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderTests.java

@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.Collections;
+
+public class JsonLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenGoodJson() throws Exception {
+        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.JSON, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
+        assertNull(structure.getMultilineStartPattern());
+        assertNull(structure.getSeparator());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
+    }
+}

+ 72 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManagerTests.java

@@ -0,0 +1,72 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import com.ibm.icu.text.CharsetMatch;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import static org.hamcrest.Matchers.startsWith;
+import static org.hamcrest.core.IsInstanceOf.instanceOf;
+
+public class LogStructureFinderManagerTests extends LogStructureTestCase {
+
+    private LogStructureFinderManager structureFinderManager = new LogStructureFinderManager();
+
+    public void testFindCharsetGivenCharacterWidths() throws Exception {
+
+        for (Charset charset : Arrays.asList(StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE)) {
+            CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation,
+                new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset)));
+            assertEquals(charset.name(), charsetMatch.getName());
+        }
+    }
+
+    public void testFindCharsetGivenBinary() throws Exception {
+
+        // This input should never match a single byte character set.  ICU4J will sometimes decide
+        // that it matches a double byte character set, hence the two assertion branches.
+        int size = 1000;
+        byte[] binaryBytes = randomByteArrayOfLength(size);
+        for (int i = 0; i < 10; ++i) {
+            binaryBytes[randomIntBetween(0, size - 1)] = 0;
+        }
+
+        try {
+            CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, new ByteArrayInputStream(binaryBytes));
+            assertThat(charsetMatch.getName(), startsWith("UTF-16"));
+        } catch (IllegalArgumentException e) {
+            assertEquals("Could not determine a usable character encoding for the input - could it be binary data?", e.getMessage());
+        }
+    }
+
+    public void testMakeBestStructureGivenJson() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
+            "{ \"time\": \"2018-05-17T13:41:23\", \"message\": \"hello\" }", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(JsonLogStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenXml() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
+            "<log time=\"2018-05-17T13:41:23\"><message>hello</message></log>", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(XmlLogStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenCsv() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "time,message\n" +
+                "2018-05-17T13:41:23,hello\n", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(SeparatedValuesLogStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenText() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "[2018-05-17T13:41:23] hello\n" +
+                "[2018-05-17T13:41:24] hello again\n", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(TextLogStructureFinder.class));
+    }
+}

+ 86 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTestCase.java

@@ -0,0 +1,86 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.test.ESTestCase;
+import org.junit.After;
+import org.junit.Before;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.stream.Collectors;
+
+public abstract class LogStructureTestCase extends ESTestCase {
+
+    protected static final List<String> POSSIBLE_CHARSETS = Collections.unmodifiableList(Charset.availableCharsets().keySet().stream()
+        .filter(name -> LogStructureFinderManager.FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT)))
+        .collect(Collectors.toList()));
+
+    protected static final String CSV_SAMPLE = "time,id,value\n" +
+        "2018-05-17T16:23:40,key1,42.0\n" +
+        "2018-05-17T16:24:11,\"key with spaces\",42.0\n";
+
+    protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
+            "\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
+            "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
+        "{\"logger\":\"controller\",\"timestamp\":1478261151445," +
+            "\"level\":\"INFO\",\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 2\",\"class\":\"ml\"," +
+            "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n";
+
+    protected static final String PIPE_SEPARATED_VALUES_SAMPLE = "2018-01-06 16:56:14.295748|INFO    |VirtualServer |1  |" +
+            "listening on 0.0.0.0:9987, :::9987\n" +
+        "2018-01-06 17:19:44.465252|INFO    |VirtualServer |1  |client " +
+            "'User1'(id:2) changed default admin channelgroup to 'Guest'(id:8)\n" +
+        "2018-01-06 17:21:25.764368|INFO    |VirtualServer |1  |client " +
+            "'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel 'Default Channel'(id:1)";
+
+    protected static final String SEMI_COLON_SEPARATED_VALUES_SAMPLE = "\"pos_id\";\"trip_id\";\"latitude\";\"longitude\";\"altitude\";" +
+            "\"timestamp\"\n" +
+        "\"1\";\"3\";\"4703.7815\";\"1527.4713\";\"359.9\";\"2017-01-19 16:19:04.742113\"\n" +
+        "\"2\";\"3\";\"4703.7815\";\"1527.4714\";\"359.9\";\"2017-01-19 16:19:05.741890\"\n" +
+        "\"3\";\"3\";\"4703.7816\";\"1527.4716\";\"360.3\";\"2017-01-19 16:19:06.738842\"";
+
+    protected static final String TEXT_SAMPLE = "[2018-05-11T17:07:29,461][INFO ][o.e.n.Node               ] [node-0] initializing ...\n" +
+        "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment    ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
+            "net usable_space [223.4gb], net total_space [464.7gb], types [hfs]\n" +
+        "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment    ] [node-0] heap size [3.9gb], " +
+            "compressed ordinary object pointers [true]\n" +
+        "[2018-05-11T17:07:29,556][INFO ][o.e.n.Node               ] [node-0] node name [node-0], node ID [tJ9u8HcaTbWxRtnlfz1RQA]\n";
+
+    protected static final String TSV_SAMPLE = "time\tid\tvalue\n" +
+        "2018-05-17T16:23:40\tkey1\t42.0\n" +
+        "2018-05-17T16:24:11\t\"key with spaces\"\t42.0\n";
+
+    protected static final String XML_SAMPLE = "<log4j:event logger=\"autodetect\" timestamp=\"1526574809521\" level=\"ERROR\" " +
+            "thread=\"0x7fffc5a7c3c0\">\n" +
+        "<log4j:message><![CDATA[Neither a fieldname clause nor a field config file was specified]]></log4j:message>\n" +
+        "</log4j:event>\n" +
+        "\n" +
+        "<log4j:event logger=\"autodetect\" timestamp=\"1526574809522\" level=\"FATAL\" thread=\"0x7fffc5a7c3c0\">\n" +
+        "<log4j:message><![CDATA[Field config could not be interpreted]]></log4j:message>\n" +
+        "</log4j:event>\n" +
+        "\n";
+
+    protected List<String> explanation;
+
+    @Before
+    public void initExplanation() {
+        explanation = new ArrayList<>();
+    }
+
+    @After
+    public void printExplanation() {
+        Loggers.getLogger(getClass()).info("Explanation:\n" + String.join("\n", explanation));
+    }
+
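+    // A byte order marker is only meaningful for UTF charsets; for any other charset the
+    // concept does not apply, hence null rather than true or false.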
+    protected Boolean randomHasByteOrderMarker(String charset) {
+        return charset.toUpperCase(Locale.ROOT).startsWith("UTF") ? randomBoolean() : null;
+    }
+}

+ 83 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTests.java

@@ -0,0 +1,83 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.test.AbstractXContentTestCase;
+
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class LogStructureTests extends AbstractXContentTestCase<LogStructure> {
+
+    @Override
+    protected LogStructure createTestInstance() {
+
+        LogStructure.Format format = randomFrom(EnumSet.allOf(LogStructure.Format.class));
+
+        LogStructure.Builder builder = new LogStructure.Builder(format);
+
+        int numLinesAnalyzed = randomIntBetween(2, 10000);
+        builder.setNumLinesAnalyzed(numLinesAnalyzed);
+        int numMessagesAnalyzed = randomIntBetween(1, numLinesAnalyzed);
+        builder.setNumMessagesAnalyzed(numMessagesAnalyzed);
+        builder.setSampleStart(randomAlphaOfLength(1000));
+
+        String charset = randomFrom(Charset.availableCharsets().keySet());
+        builder.setCharset(charset);
+        if (charset.toUpperCase(Locale.ROOT).startsWith("UTF")) {
+            builder.setHasByteOrderMarker(randomBoolean());
+        }
+
+        if (numMessagesAnalyzed < numLinesAnalyzed) {
+            builder.setMultilineStartPattern(randomAlphaOfLength(100));
+        }
+        if (randomBoolean()) {
+            builder.setExcludeLinesPattern(randomAlphaOfLength(100));
+        }
+
+        if (format.isSeparatedValues() || (format.supportsNesting() && randomBoolean())) {
+            builder.setInputFields(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
+        }
+        if (format.isSeparatedValues()) {
+            builder.setHasHeaderRow(randomBoolean());
+            if (rarely()) {
+                builder.setSeparator(format.separator());
+            }
+        }
+        if (format.isSemiStructured()) {
+            builder.setGrokPattern(randomAlphaOfLength(100));
+        }
+
+        if (format.isSemiStructured() || randomBoolean()) {
+            builder.setTimestampField(randomAlphaOfLength(10));
+            builder.setTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
+            builder.setNeedClientTimezone(randomBoolean());
+        }
+
+        Map<String, Object> mappings = new TreeMap<>();
+        for (String field : generateRandomStringArray(5, 20, false, false)) {
+            mappings.put(field, Collections.singletonMap(randomAlphaOfLength(5), randomAlphaOfLength(10)));
+        }
+        builder.setMappings(mappings);
+
+        builder.setExplanation(Arrays.asList(generateRandomStringArray(10, 150, false, false)));
+
+        return builder.build();
+    }
+
+    @Override
+    protected LogStructure doParseInstance(XContentParser parser) {
+        return LogStructure.PARSER.apply(parser, null).build();
+    }
+
+    @Override
+    protected boolean supportsUnknownFields() {
+        return false;
+    }
+}

+ 292 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtilsTests.java

@@ -0,0 +1,292 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import static org.hamcrest.Matchers.contains;
+
+public class LogStructureUtilsTests extends LogStructureTestCase {
+
+    public void testMoreLikelyGivenText() {
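+        // Multi-word values, and any value longer than 256 characters, are more likely
+        // text than keyword.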
+        assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword("the quick brown fox jumped over the lazy dog"));
+        assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(257, 10000)));
+    }
+
+    public void testMoreLikelyGivenKeyword() {
+        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("1"));
+        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("DEBUG"));
+        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256)));
+    }
+
+    public void testSingleSampleSingleField() {
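+        // guessTimestampField returns a (field name, timestamp match) tuple, or null when no
+        // single field matches one timestamp format consistently across all the sample records.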
+        Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
+        assertNotNull(match);
+        assertEquals("field1", match.v1());
+        assertThat(match.v2().dateFormats, contains("ISO8601"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithSameSingleTimeField() {
+        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("field1", match.v1());
+        assertThat(match.v2().dateFormats, contains("ISO8601"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithOneSingleTimeFieldDifferentFormat() {
+        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSamplesWithDifferentSingleTimeField() {
+        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Map<String, String> sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSingleSampleManyFieldsOneTimeFormat() {
+        Map<String, Object> sample = new LinkedHashMap<>();
+        sample.put("foo", "not a time");
+        sample.put("time", "2018-05-24 17:28:31,735");
+        sample.put("bar", 42);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsSameSingleTimeFormat() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "2018-05-29 11:53:02,837");
+        sample2.put("bar", 17);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "May 29 2018 11:53:02");
+        sample2.put("bar", 17);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("red_herring", "May 29 2007 11:53:02");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("red_herring", "whatever");
+        sample2.put("time", "2018-05-29 11:53:02,837");
+        sample2.put("bar", 17);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() {
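+        // Two date formats are expected for the winning field because CISCOTIMESTAMP pads
+        // single-digit days with an extra space ("MMM  d") but not double-digit days ("MMM dd").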
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "May 24 2018 17:28:31");
+        sample1.put("red_herring", "2018-05-24 17:28:31,735");
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "May 29 2018 11:53:02");
+        sample2.put("red_herring", "17");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));
+        assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsInconsistentTimeFields() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time1", "May 24 2018 17:28:31");
+        sample1.put("bar", 17);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time2", "May 29 2018 11:53:02");
+        sample2.put("bar", 42);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() {
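+        // time2 is the only timestamp field that appears in both samples with a consistent
+        // format, so it should win despite the distracting time1 and time3 fields.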
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time1", "2018-05-09 17:28:31,735");
+        sample1.put("time2", "May  9 2018 17:28:31");
+        sample1.put("bar", 17);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time2", "May 10 2018 11:53:02");
+        sample2.put("time3", "Thu, May 10 2018 11:53:02");
+        sample2.put("bar", 42);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time2", match.v1());
+        assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));
+        assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
+    }
+
+    public void testGuessMappingGivenNothing() {
+        assertNull(LogStructureUtils.guessMapping(explanation, "foo", Collections.emptyList()));
+    }
+
+    public void testGuessMappingGivenKeyword() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
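+        // A mix of date-like and non-date values must fall back to keyword rather than date.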
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
+    }
+
+    public void testGuessMappingGivenText() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
+            Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
+    }
+
+    public void testGuessMappingGivenIp() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
+    }
+
+    public void testGuessMappingGivenDouble() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
+        // 12345678901234567890 is too long for long
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("1", "2", "12345678901234567890")));
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(3.14159265359, 0.0, 1e-308)));
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("-1e-1", "-1e308", "1e-308")));
+    }
+
+    public void testGuessMappingGivenLong() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
+    }
+
+    public void testGuessMappingGivenDate() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
+            Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
+    }
+
+    public void testGuessMappingGivenBoolean() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "boolean");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("false", "true")));
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(true, false)));
+    }
+
+    public void testGuessMappingGivenArray() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));
+
+        expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
+    }
+
+    public void testGuessMappingGivenObject() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
+            Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2"))));
+    }
+
+    public void testGuessMappingGivenObjectAndNonObject() {
+        RuntimeException e = expectThrows(RuntimeException.class, () -> LogStructureUtils.guessMapping(explanation,
+            "foo", Arrays.asList(Collections.singletonMap("name", "value1"), "value2")));
+
+        assertEquals("Field [foo] has both object and non-object values - this is not supported by Elasticsearch", e.getMessage());
+    }
+
+    public void testGuessMappings() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        sample1.put("nothing", null);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "2018-05-29 11:53:02,837");
+        sample2.put("bar", 17);
+        sample2.put("nothing", null);
+
+        Map<String, Object> mappings = LogStructureUtils.guessMappings(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(mappings);
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
+        Map<String, String> expectedTimeMapping = new HashMap<>();
+        expectedTimeMapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
+        expectedTimeMapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, "YYYY-MM-dd HH:mm:ss,SSS");
+        assertEquals(expectedTimeMapping, mappings.get("time"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
+        assertNull(mappings.get("nothing"));
+    }
+}

+ 23 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactoryTests.java

@@ -0,0 +1,23 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class PipeSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new PipeSeparatedValuesLogStructureFinderFactory();
+
+    // No need to check JSON, XML, CSV, TSV or semi-colon separated values because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertTrue(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 28 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactoryTests.java

@@ -0,0 +1,28 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class SemiColonSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new SemiColonSeparatedValuesLogStructureFinderFactory();
+
+    // No need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertTrue(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 293 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinderTests.java

@@ -0,0 +1,293 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinFieldwiseCompareRows;
+import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinDistance;
+import static org.hamcrest.Matchers.arrayContaining;
+
+public class SeparatedValuesLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenCompleteCsv() throws Exception {
+        String sample = "time,message\n" +
+            "2018-05-17T13:41:23,hello\n" +
+            "2018-05-17T13:41:32,hello again\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
+        assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("time", "message"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("time", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception {
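+        // The timestamp is not the first column here, so the multiline start pattern asserted
+        // below must skip the preceding field with a non-greedy prefix before anchoring on the
+        // timestamp.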
+        String sample = "message,time,count\n" +
+            "\"hello\n" +
+            "world\",2018-05-17T13:41:23,1\n" +
+            "\"hello again\n"; // note that this last record is truncated
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern());
+        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("message", "time", "count"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("time", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception {
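+        // The header row ends with two nameless columns, which should be given the synthetic
+        // names column18 and column19 in the input fields.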
+        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
+            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
+            "improvement_surcharge,total_amount,,\n" +
+            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
+            "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
+            "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?",
+            structure.getExcludeLinesPattern());
+        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
+            "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
+            "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("tpep_pickup_datetime", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception {
+        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
+            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
+            "improvement_surcharge,total_amount\n" +
+            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
+                "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
+                "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
+            structure.getExcludeLinesPattern());
+        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
+            "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
+            "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("tpep_pickup_datetime", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception {
+        String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
+            "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
+            "\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?pos_id\"?,\"?trip_id\"?,\"?latitude\"?,\"?longitude\"?,\"?altitude\"?,\"?timestamp\"?",
+            structure.getExcludeLinesPattern());
+        assertNull(structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getTimestampFormats());
+    }
+
+    public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException {
+        String withHeader = "time,airline,responsetime,sourcetype\n" +
+            "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
+            "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
+            "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
+            "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
+
+        Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
+            SeparatedValuesLogStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1());
+
+        assertTrue(header.v1());
+        assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype"));
+    }
+
+    public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException {
+        String withoutHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
+            "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
+            "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
+            "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
+
+        Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
+            SeparatedValuesLogStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1());
+
+        assertFalse(header.v1());
+        assertThat(header.v2(), arrayContaining("column1", "column2", "column3", "column4"));
+    }
+
+    public void testLevenshteinDistance() {
+
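+        // Classic edit distance. For reference, the standard dynamic-programming recurrence is
+        //   d(i, 0) = i;  d(0, j) = j
+        //   d(i, j) = min(d(i-1, j) + 1, d(i, j-1) + 1, d(i-1, j-1) + (a[i-1] == b[j-1] ? 0 : 1))
+        // so the distance to or from an empty string is simply the other string's length.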
+        assertEquals(0, levenshteinDistance("cat", "cat"));
+        assertEquals(3, levenshteinDistance("cat", "dog"));
+        assertEquals(5, levenshteinDistance("cat", "mouse"));
+        assertEquals(3, levenshteinDistance("cat", ""));
+
+        assertEquals(3, levenshteinDistance("dog", "cat"));
+        assertEquals(0, levenshteinDistance("dog", "dog"));
+        assertEquals(4, levenshteinDistance("dog", "mouse"));
+        assertEquals(3, levenshteinDistance("dog", ""));
+
+        assertEquals(5, levenshteinDistance("mouse", "cat"));
+        assertEquals(4, levenshteinDistance("mouse", "dog"));
+        assertEquals(0, levenshteinDistance("mouse", "mouse"));
+        assertEquals(5, levenshteinDistance("mouse", ""));
+
+        assertEquals(3, levenshteinDistance("", "cat"));
+        assertEquals(3, levenshteinDistance("", "dog"));
+        assertEquals(5, levenshteinDistance("", "mouse"));
+        assertEquals(0, levenshteinDistance("", ""));
+    }
+
+    public void testLevenshteinCompareRows() {
+
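+        // Judging by these expectations, the fieldwise comparison sums the per-field distances
+        // but discards the largest single contribution, so one wildly different field cannot
+        // dominate the score.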
+        assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog")));
+        assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat")));
+        assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat")));
+        assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat")));
+        assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat")));
+        assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse")));
+        assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog")));
+    }
+
+    public void testLineHasUnescapedQuote() {
+
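+        // A quote is escaped by doubling it within a quoted field; a quote in the middle of an
+        // otherwise unquoted token is unescaped. The same rules apply regardless of the column
+        // separator, as the tab-separated cases below show.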
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b,c\"", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\"\"\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"\"\"", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\",b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,\"\"b\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words,b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\",b,c", CsvPreference.EXCEL_PREFERENCE));
+
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\tc\"", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\"\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"\"\"", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\t\"\"b\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+    }
+
+    public void testRowContainsDuplicateNonEmptyValues() {
+
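+        // Repeated empty strings are tolerated, since trailing empty columns are common, but
+        // any repeated non-empty value counts as a duplicate.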
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c")));
+        assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a")));
+        assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", "")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", "")));
+    }
+}

+ 19 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactoryTests.java

@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class TextLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();
+
+    // No need to check JSON, XML, CSV, TSV, semi-colon separated values or pipe
+    // separated values because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 245 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderTests.java

@@ -0,0 +1,245 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.util.set.Sets;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.Collections;
+import java.util.Set;
+
+public class TextLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenElasticsearchLog() throws Exception {
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
+        assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertNull(structure.getSeparator());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
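+            // With no preface the line must begin with the timestamp itself, so the pattern's
+            // leading \b word boundary is superseded by the ^ anchor.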
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
+                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.emptySet(), simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenOneEmptyPreface() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
+                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton(""), simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenOneLogLevelPreface() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton("[ERROR] ["), simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenManyLogLevelPrefaces() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            Set<String> prefaces = Sets.newHashSet("[ERROR] [", "[DEBUG] [");
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenManyHostnamePrefaces() {
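+        // The varying part of the prefaces should collapse to a non-greedy wildcard while the
+        // common trailing delimiter (here the pipe) is kept as a literal.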
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            Set<String> prefaces = Sets.newHashSet("host-1.acme.com|", "my_host.elastic.co|");
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^.*?\\|" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenManyPrefacesIncludingEmpty() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            Set<String> prefaces = Sets.newHashSet("", "[non-standard] ");
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^.*?" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
+        }
+    }
+
+    public void testMostLikelyTimestampGivenAllSame() {
+        String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node               ] [node-0] initializing ...\n" +
+            "[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment    ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
+                "net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" +
+            "[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment    ] [node-0] heap size [494.9mb], " +
+                "compressed ordinary object pointers [true]\n" +
+            "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node               ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" +
+            "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node               ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " +
+                "build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " +
+                "JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" +
+            "[2018-06-27T11:59:22,205][INFO ][o.e.n.Node               ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " +
+                "-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " +
+                "-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " +
+                "-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " +
+                "-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " +
+                "-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " +
+                "-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " +
+                "-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " +
+                "-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " +
+                "-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " +
+                "-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " +
+                "-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" +
+            "[2018-06-27T11:59:22,205][WARN ][o.e.n.Node               ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " +
+                "Elasticsearch and is not suitable for production\n" +
+            "[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [aggs-matrix-stats]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [analysis-common]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [ingest-common]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [lang-expression]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [lang-mustache]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [lang-painless]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [mapper-extras]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [parent-join]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [percolator]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [rank-eval]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [reindex]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [repository-url]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [transport-netty4]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-core]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-deprecation]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-graph]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-logstash]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-ml]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-monitoring]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-rollup]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-security]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-sql]\n" +
+            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-upgrade]\n" +
+            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-watcher]\n" +
+            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] no plugins loaded\n";
+
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
+        assertNotNull(mostLikelyMatch);
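+        // The arguments to TimestampMatch here are the candidate index, preface, date format,
+        // simple pattern, grok pattern name and epilogue.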
+        assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
+            mostLikelyMatch.v1());
+    }
+
+    public void testMostLikelyTimestampGivenExceptionTrace() {
+        String sample = "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " +
+                "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" +
+            "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " +
+                "encoding is longer than the max length 32766), all of which were skipped.  Please correct the analyzer to not produce " +
+                "such terms.  The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " +
+                "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " +
+                "in length; got 49023\n" +
+            "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " +
+                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " +
+                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " +
+                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
+                "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
+                "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
+                "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
+                "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
+                "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
+                "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" +
+                "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" +
+                "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
+                ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
+                ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" +
+            "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" +
+            "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n";
+
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
+        assertNotNull(mostLikelyMatch);
+
+        // Even though many lines contain a timestamp near the end (in the Lucene version
+        // information), those timestamps occur so late in their lines that the weighting
+        // should favour the timestamp near the beginning of the first line
+        assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
+            mostLikelyMatch.v1());
+    }
+}

+ 242 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinderTests.java

@@ -0,0 +1,242 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+import org.joda.time.DateTime;
+import org.joda.time.DateTimeZone;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+import org.joda.time.format.ISODateTimeFormat;
+
+import java.util.Arrays;
+import java.util.Locale;
+
+public class TimestampFormatFinderTests extends LogStructureTestCase {
+
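+    // The TimestampMatch constructor arguments used throughout these tests appear to be:
+    // (candidate index, text before the match, Joda date format(s), simple regex,
+    // Grok pattern name, text after the match)
+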
+    public void testFindFirstMatchGivenNoMatch() {
+
+        assertNull(TimestampFormatFinder.findFirstMatch(""));
+        assertNull(TimestampFormatFinder.findFirstMatch("no timestamps in here"));
+        assertNull(TimestampFormatFinder.findFirstMatch(":::"));
+        assertNull(TimestampFormatFinder.findFirstMatch("/+"));
+    }
+
+    public void testFindFirstMatchGivenOnlyIso8601() {
+
+        TimestampMatch expected = new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
+            "");
+
+        checkAndValidateDateFormat(expected, "2018-05-15T16:14:56,374Z", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+0100", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+01:00", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T16:14:56Z", 1526400896000L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+0100", 1526400896000L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+01:00", 1526400896000L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56,374Z",
+            1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+0100",
+            1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(2, "", "YYYY-MM-dd HH:mm:ss,SSSZZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+01:00",
+            1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss,SSS",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374", 1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56Z", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+0100", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(5, "", "YYYY-MM-dd HH:mm:ssZZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+01:00", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56", 1526400896000L);
+    }
+
+    public void testFindFirstMatchGivenOnlyKnownDateFormat() {
+
+        // Note: some of the time formats give millisecond accuracy, some second accuracy, and some only minute accuracy
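+        // For example, "May 15 17:14:56.725" below resolves to the millisecond,
+        // "Tue May 15 2018 16:14:56 UTC" to the second, and "Tue May 15 2018 16:14 UTC"
+        // only to the minute, as reflected in the trailing digits of the expected epoch values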
+
+        checkAndValidateDateFormat(new TimestampMatch(0, "", "YYYY-MM-dd HH:mm:ss,SSS Z",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TOMCAT_DATESTAMP", ""), "2018-05-15 17:14:56,374 +0100",
+            1526400896374L);
+
+        checkAndValidateDateFormat(new TimestampMatch(8, "", "EEE MMM dd YYYY HH:mm:ss zzz",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
+            "Tue May 15 2018 16:14:56 UTC", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(9, "", "EEE MMM dd YYYY HH:mm zzz",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
+            "Tue May 15 2018 16:14 UTC", 1526400840000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(10, "", "EEE, dd MMM YYYY HH:mm:ss ZZ",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
+            "Tue, 15 May 2018 17:14:56 +01:00", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(11, "", "EEE, dd MMM YYYY HH:mm:ss Z",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
+            "Tue, 15 May 2018 17:14:56 +0100", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(12, "", "EEE, dd MMM YYYY HH:mm ZZ",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
+            "Tue, 15 May 2018 17:14 +01:00", 1526400840000L);
+        checkAndValidateDateFormat(new TimestampMatch(13, "", "EEE, dd MMM YYYY HH:mm Z",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14 +0100",
+            1526400840000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(14, "", "EEE MMM dd HH:mm:ss zzz YYYY",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
+            "Tue May 15 16:14:56 UTC 2018", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(15, "", "EEE MMM dd HH:mm zzz YYYY",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
+            "Tue May 15 16:14 UTC 2018", 1526400840000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(16, "", "YYYYMMddHHmmss", "\\b\\d{14}\\b", "DATESTAMP_EVENTLOG", ""),
+            "20180515171456", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(17, "", "EEE MMM dd HH:mm:ss YYYY",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "HTTPDERROR_DATE", ""),
+            "Tue May 15 17:14:56 2018", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(18, "", Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM  d HH:mm:ss.SSS"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56.725", 1526400896725L);
+        checkAndValidateDateFormat(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(20, "", "dd/MMM/YYYY:HH:mm:ss Z",
+                "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", ""), "15/May/2018:17:14:56 +0100", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
+                "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", ""), "May 15, 2018 5:14:56 PM",
+            1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(22, "", Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"),
+                "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "CISCOTIMESTAMP", ""), "May 15 2018 17:14:56",
+            1526400896000L);
+    }
+
+    public void testFindFirstMatchGivenOnlySystemDate() {
+
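+        // Judging by the patterns below: UNIX_MS matches 13-digit epoch milliseconds,
+        // UNIX matches 10-digit epoch seconds (optionally with a fraction), and TAI64N
+        // matches a 24-hex-digit TAI64N timestamp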
+        assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstMatch("1526400896374"));
+        assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstFullMatch("1526400896374"));
+
+        assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
+            TimestampFormatFinder.findFirstMatch("1526400896.736"));
+        assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
+            TimestampFormatFinder.findFirstFullMatch("1526400896.736"));
+        assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstMatch("1526400896"));
+        assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstFullMatch("1526400896"));
+
+        assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
+            TimestampFormatFinder.findFirstMatch("400000005afb159a164ac980"));
+        assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
+            TimestampFormatFinder.findFirstFullMatch("400000005afb159a164ac980"));
+    }
+
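+    /**
+     * Asserts that both findFirstMatch and findFirstFullMatch return {@code expected} for
+     * {@code text}, then parses {@code text} with each of the expected Joda formats in turn,
+     * requiring at least one to yield {@code expectedEpochMs}, and finally checks that the
+     * expected simple regex pattern matches {@code text}.
+     */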
+    private void checkAndValidateDateFormat(TimestampMatch expected, String text, long expectedEpochMs) {
+
+        assertEquals(expected, TimestampFormatFinder.findFirstMatch(text));
+        assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text));
+
+        // All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London
+        DateTimeZone zone = DateTimeZone.forID("Europe/London");
+        DateTime parsed;
+        for (int i = 0; i < expected.dateFormats.size(); ++i) {
+            try {
+                String dateFormat = expected.dateFormats.get(i);
+                switch (dateFormat) {
+                    case "ISO8601":
+                        parsed = ISODateTimeFormat.dateTimeParser().withZone(zone).withDefaultYear(2018).parseDateTime(text);
+                        break;
+                    default:
+                        DateTimeFormatter parser = DateTimeFormat.forPattern(dateFormat).withZone(zone).withLocale(Locale.UK);
+                        parsed = parser.withDefaultYear(2018).parseDateTime(text);
+                        break;
+                }
+                if (expectedEpochMs == parsed.getMillis()) {
+                    break;
+                }
+                // If the last format parses to the wrong time, fail with a detailed assertion
+                if (i == expected.dateFormats.size() - 1) {
+                    assertEquals(expectedEpochMs, parsed.getMillis());
+                }
+            } catch (RuntimeException e) {
+                // If the last format throws, rethrow so the test fails
+                if (i == expected.dateFormats.size() - 1) {
+                    throw e;
+                }
+            }
+        }
+        assertTrue(expected.simplePattern.matcher(text).find());
+    }
+
+    public void testFindFirstMatchGivenRealLogMessages() {
+
+        assertEquals(new TimestampMatch(7, "[", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
+                "][INFO ][o.e.e.NodeEnvironment    ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"),
+            TimestampFormatFinder.findFirstMatch("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment    ] [node-0] " +
+                "heap size [3.9gb], compressed ordinary object pointers [true]"));
+
+        assertEquals(new TimestampMatch(20, "192.168.62.101 - - [", "dd/MMM/YYYY:HH:mm:ss Z",
+                "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE",
+                "] \"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"),
+            TimestampFormatFinder.findFirstMatch("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " +
+                "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"));
+
+        assertEquals(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
+                "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP",
+                " org.apache.tomcat.util.http.Parameters processParameters"),
+            TimestampFormatFinder.findFirstMatch("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters"));
+
+        assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+                "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", " esxi1.acme.com Vpxa: " +
+                    "[3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"),
+            TimestampFormatFinder.findFirstMatch("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " +
+                "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"));
+
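+        // Note in particular the tab-separated sample below, which contains two ISO8601
+        // timestamps: the first one is matched and the second is left in the trailing text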
+        assertEquals(new TimestampMatch(7, "559550912540598297\t", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}",
+                "TIMESTAMP_ISO8601",
+                "\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"),
+            TimestampFormatFinder.findFirstMatch("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" +
+                "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"));
+
+        assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+                "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP",
+                " dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53"),
+            TimestampFormatFinder.findFirstMatch("Sep  8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " +
+                "'www.elastic.co/A/IN': 95.110.68.206#53"));
+
+        assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}",
+                "TIMESTAMP_ISO8601",
+                "|INFO    |VirtualServer |1  |client  'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " +
+                    "'User1'(id:2) in channel '3er Instanz'(id:2)"),
+            TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO    |VirtualServer |1  |client " +
+                " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)"));
+    }
+
+    public void testInterpretFractionalSeconds() {
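+        // Each result is (fractional-second separator, number of fractional digits);
+        // (',', 0) appears to be the default returned when there is no fractional component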
+        assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("Sep  8 11:55:35"));
+        assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("29/Jun/2016:12:11:31 +0000"));
+        assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368"));
+        assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438"));
+        assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764"));
+        assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764"));
+        assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368Z"));
+        assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438Z"));
+        assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764Z"));
+        assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764Z"));
+        assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368 Z"));
+        assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438 Z"));
+        assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764 Z"));
+        assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764 Z"));
+    }
+}

+ 33 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactoryTests.java

@@ -0,0 +1,33 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class TsvLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new TsvLogStructureFinderFactory();
+
+    // No need to check JSON, XML or CSV because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertTrue(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 43 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactoryTests.java

@@ -0,0 +1,43 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class XmlLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();
+
+    // No need to check JSON because it comes earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenXml() {
+
+        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenCsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 39 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderTests.java

@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.Collections;
+
+public class XmlLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenGoodXml() throws Exception {
+        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.XML, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
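+        // XML_SAMPLE is presumably log4j XML event output, hence the <log4j:event multiline
+        // start pattern and the epoch-millis timestamp format expected below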
+        assertEquals("^\\s*<log4j:event", structure.getMultilineStartPattern());
+        assertNull(structure.getSeparator());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
+    }
+}