[ML] Add log structure finder functionality (#32788)

This change adds a library to ML that can be used to deduce a log
file's structure given only a sample of the log file.

Eventually this will be used to add an endpoint to ML to make the
functionality available to end users, but this will follow in a
separate change.

The functionality is split out into a library so that it can also be
used by a command line tool without requiring that tool to include
all of the server code.
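
For anyone wanting to experiment, a minimal caller might look like the
following sketch. Only the factory/finder API added below is from this
change; the helper method and its inputs are invented for illustration.

// Hypothetical caller, not part of this commit.
// Assumes java.util.List, java.util.ArrayList and java.io.IOException imports.
static LogStructure deduceCsvStructure(String sample) throws IOException {
    List<String> explanation = new ArrayList<>();
    LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
    if (factory.canCreateFromSample(explanation, sample)) {
        // The charset name and byte order marker flag would come from charset detection
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", false);
        return finder.getStructure();
    }
    return null;
}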
David Roberts · 7 years ago
commit 5ba04e23fc
42 changed files with 5744 additions and 0 deletions
  1. 36 0
      x-pack/plugin/ml/log-structure-finder/build.gradle
  2. 1 0
      x-pack/plugin/ml/log-structure-finder/licenses/icu4j-62.1.jar.sha1
  3. 33 0
      x-pack/plugin/ml/log-structure-finder/licenses/icu4j-LICENSE.txt
  4. 3 0
      x-pack/plugin/ml/log-structure-finder/licenses/icu4j-NOTICE.txt
  5. 1 0
      x-pack/plugin/ml/log-structure-finder/licenses/super-csv-2.4.0.jar.sha1
  6. 203 0
      x-pack/plugin/ml/log-structure-finder/licenses/super-csv-LICENSE.txt
  7. 0 0
      x-pack/plugin/ml/log-structure-finder/licenses/super-csv-NOTICE.txt
  8. 35 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactory.java
  9. 615 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreator.java
  10. 84 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinder.java
  11. 87 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactory.java
  12. 614 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructure.java
  13. 23 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinder.java
  14. 35 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderFactory.java
  15. 232 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java
  16. 238 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtils.java
  17. 38 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactory.java
  18. 37 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactory.java
  19. 486 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinder.java
  20. 201 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinder.java
  21. 39 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactory.java
  22. 427 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinder.java
  23. 35 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactory.java
  24. 172 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinder.java
  25. 122 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactory.java
  26. 38 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactoryTests.java
  27. 326 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreatorTests.java
  28. 46 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactoryTests.java
  29. 39 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderTests.java
  30. 72 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManagerTests.java
  31. 86 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTestCase.java
  32. 83 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTests.java
  33. 292 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtilsTests.java
  34. 23 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactoryTests.java
  35. 28 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactoryTests.java
  36. 293 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinderTests.java
  37. 19 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactoryTests.java
  38. 245 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderTests.java
  39. 242 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinderTests.java
  40. 33 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactoryTests.java
  41. 43 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactoryTests.java
  42. 39 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderTests.java

+ 36 - 0
x-pack/plugin/ml/log-structure-finder/build.gradle

@@ -0,0 +1,36 @@
+import org.elasticsearch.gradle.precommit.PrecommitTasks
+
+apply plugin: 'elasticsearch.build'
+
+archivesBaseName = 'x-pack-log-structure-finder'
+
+description = 'Common code for reverse engineering log structure'
+
+dependencies {
+    compile "org.elasticsearch:elasticsearch-core:${version}"
+    compile "org.elasticsearch:elasticsearch-x-content:${version}"
+    compile project(':libs:grok')
+    compile "com.ibm.icu:icu4j:${versions.icu4j}"
+    compile "net.sf.supercsv:super-csv:${versions.supercsv}"
+
+    testCompile "org.elasticsearch.test:framework:${version}"
+}
+
+configurations {
+    testArtifacts.extendsFrom testRuntime
+}
+task testJar(type: Jar) {
+    appendix 'test'
+    from sourceSets.test.output
+}
+artifacts {
+    // normal es plugins do not publish the jar but we need to since users need it for Transport Clients and extensions
+    archives jar
+    testArtifacts testJar
+}
+
+forbiddenApisMain {
+    // log-structure-finder does not depend on server, so cannot forbid server methods
+    signaturesURLs = [PrecommitTasks.getResource('/forbidden/jdk-signatures.txt')]
+}
+

+ 1 - 0
x-pack/plugin/ml/log-structure-finder/licenses/icu4j-62.1.jar.sha1

@@ -0,0 +1 @@
+7a4d00d5ec5febd252a6182e8b6e87a0a9821f81

+ 33 - 0
x-pack/plugin/ml/log-structure-finder/licenses/icu4j-LICENSE.txt

@@ -0,0 +1,33 @@
+ICU License - ICU 1.8.1 and later
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2012 International Business Machines Corporation and others
+
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, and/or sell copies of the
+Software, and to permit persons to whom the Software is furnished to do so,
+provided that the above copyright notice(s) and this permission notice appear
+in all copies of the Software and that both the above copyright notice(s) and
+this permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
+LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall not
+be used in advertising or otherwise to promote the sale, use or other
+dealings in this Software without prior written authorization of the
+copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the property of
+their respective owners.

+ 3 - 0
x-pack/plugin/ml/log-structure-finder/licenses/icu4j-NOTICE.txt

@@ -0,0 +1,3 @@
+ICU4J, (under lucene/analysis/icu) is licensed under an MIT style license
+(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
+International Business Machines Corporation and others

+ 1 - 0
x-pack/plugin/ml/log-structure-finder/licenses/super-csv-2.4.0.jar.sha1

@@ -0,0 +1 @@
+017f8708c929029dde48bc298deaf3c7ae2452d3

+ 203 - 0
x-pack/plugin/ml/log-structure-finder/licenses/super-csv-LICENSE.txt

@@ -0,0 +1,203 @@
+/*
+ *                                 Apache License
+ *                           Version 2.0, January 2004
+ *                        http://www.apache.org/licenses/
+ *
+ *   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+ *
+ *   1. Definitions.
+ *
+ *      "License" shall mean the terms and conditions for use, reproduction,
+ *      and distribution as defined by Sections 1 through 9 of this document.
+ *
+ *      "Licensor" shall mean the copyright owner or entity authorized by
+ *      the copyright owner that is granting the License.
+ *
+ *      "Legal Entity" shall mean the union of the acting entity and all
+ *      other entities that control, are controlled by, or are under common
+ *      control with that entity. For the purposes of this definition,
+ *      "control" means (i) the power, direct or indirect, to cause the
+ *      direction or management of such entity, whether by contract or
+ *      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ *      outstanding shares, or (iii) beneficial ownership of such entity.
+ *
+ *      "You" (or "Your") shall mean an individual or Legal Entity
+ *      exercising permissions granted by this License.
+ *
+ *      "Source" form shall mean the preferred form for making modifications,
+ *      including but not limited to software source code, documentation
+ *      source, and configuration files.
+ *
+ *      "Object" form shall mean any form resulting from mechanical
+ *      transformation or translation of a Source form, including but
+ *      not limited to compiled object code, generated documentation,
+ *      and conversions to other media types.
+ *
+ *      "Work" shall mean the work of authorship, whether in Source or
+ *      Object form, made available under the License, as indicated by a
+ *      copyright notice that is included in or attached to the work
+ *      (an example is provided in the Appendix below).
+ *
+ *      "Derivative Works" shall mean any work, whether in Source or Object
+ *      form, that is based on (or derived from) the Work and for which the
+ *      editorial revisions, annotations, elaborations, or other modifications
+ *      represent, as a whole, an original work of authorship. For the purposes
+ *      of this License, Derivative Works shall not include works that remain
+ *      separable from, or merely link (or bind by name) to the interfaces of,
+ *      the Work and Derivative Works thereof.
+ *
+ *      "Contribution" shall mean any work of authorship, including
+ *      the original version of the Work and any modifications or additions
+ *      to that Work or Derivative Works thereof, that is intentionally
+ *      submitted to Licensor for inclusion in the Work by the copyright owner
+ *      or by an individual or Legal Entity authorized to submit on behalf of
+ *      the copyright owner. For the purposes of this definition, "submitted"
+ *      means any form of electronic, verbal, or written communication sent
+ *      to the Licensor or its representatives, including but not limited to
+ *      communication on electronic mailing lists, source code control systems,
+ *      and issue tracking systems that are managed by, or on behalf of, the
+ *      Licensor for the purpose of discussing and improving the Work, but
+ *      excluding communication that is conspicuously marked or otherwise
+ *      designated in writing by the copyright owner as "Not a Contribution."
+ *
+ *      "Contributor" shall mean Licensor and any individual or Legal Entity
+ *      on behalf of whom a Contribution has been received by Licensor and
+ *      subsequently incorporated within the Work.
+ *
+ *   2. Grant of Copyright License. Subject to the terms and conditions of
+ *      this License, each Contributor hereby grants to You a perpetual,
+ *      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ *      copyright license to reproduce, prepare Derivative Works of,
+ *      publicly display, publicly perform, sublicense, and distribute the
+ *      Work and such Derivative Works in Source or Object form.
+ *
+ *   3. Grant of Patent License. Subject to the terms and conditions of
+ *      this License, each Contributor hereby grants to You a perpetual,
+ *      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ *      (except as stated in this section) patent license to make, have made,
+ *      use, offer to sell, sell, import, and otherwise transfer the Work,
+ *      where such license applies only to those patent claims licensable
+ *      by such Contributor that are necessarily infringed by their
+ *      Contribution(s) alone or by combination of their Contribution(s)
+ *      with the Work to which such Contribution(s) was submitted. If You
+ *      institute patent litigation against any entity (including a
+ *      cross-claim or counterclaim in a lawsuit) alleging that the Work
+ *      or a Contribution incorporated within the Work constitutes direct
+ *      or contributory patent infringement, then any patent licenses
+ *      granted to You under this License for that Work shall terminate
+ *      as of the date such litigation is filed.
+ *
+ *   4. Redistribution. You may reproduce and distribute copies of the
+ *      Work or Derivative Works thereof in any medium, with or without
+ *      modifications, and in Source or Object form, provided that You
+ *      meet the following conditions:
+ *
+ *      (a) You must give any other recipients of the Work or
+ *          Derivative Works a copy of this License; and
+ *
+ *      (b) You must cause any modified files to carry prominent notices
+ *          stating that You changed the files; and
+ *
+ *      (c) You must retain, in the Source form of any Derivative Works
+ *          that You distribute, all copyright, patent, trademark, and
+ *          attribution notices from the Source form of the Work,
+ *          excluding those notices that do not pertain to any part of
+ *          the Derivative Works; and
+ *
+ *      (d) If the Work includes a "NOTICE" text file as part of its
+ *          distribution, then any Derivative Works that You distribute must
+ *          include a readable copy of the attribution notices contained
+ *          within such NOTICE file, excluding those notices that do not
+ *          pertain to any part of the Derivative Works, in at least one
+ *          of the following places: within a NOTICE text file distributed
+ *          as part of the Derivative Works; within the Source form or
+ *          documentation, if provided along with the Derivative Works; or,
+ *          within a display generated by the Derivative Works, if and
+ *          wherever such third-party notices normally appear. The contents
+ *          of the NOTICE file are for informational purposes only and
+ *          do not modify the License. You may add Your own attribution
+ *          notices within Derivative Works that You distribute, alongside
+ *          or as an addendum to the NOTICE text from the Work, provided
+ *          that such additional attribution notices cannot be construed
+ *          as modifying the License.
+ *
+ *      You may add Your own copyright statement to Your modifications and
+ *      may provide additional or different license terms and conditions
+ *      for use, reproduction, or distribution of Your modifications, or
+ *      for any such Derivative Works as a whole, provided Your use,
+ *      reproduction, and distribution of the Work otherwise complies with
+ *      the conditions stated in this License.
+ *
+ *   5. Submission of Contributions. Unless You explicitly state otherwise,
+ *      any Contribution intentionally submitted for inclusion in the Work
+ *      by You to the Licensor shall be under the terms and conditions of
+ *      this License, without any additional terms or conditions.
+ *      Notwithstanding the above, nothing herein shall supersede or modify
+ *      the terms of any separate license agreement you may have executed
+ *      with Licensor regarding such Contributions.
+ *
+ *   6. Trademarks. This License does not grant permission to use the trade
+ *      names, trademarks, service marks, or product names of the Licensor,
+ *      except as required for reasonable and customary use in describing the
+ *      origin of the Work and reproducing the content of the NOTICE file.
+ *
+ *   7. Disclaimer of Warranty. Unless required by applicable law or
+ *      agreed to in writing, Licensor provides the Work (and each
+ *      Contributor provides its Contributions) on an "AS IS" BASIS,
+ *      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ *      implied, including, without limitation, any warranties or conditions
+ *      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ *      PARTICULAR PURPOSE. You are solely responsible for determining the
+ *      appropriateness of using or redistributing the Work and assume any
+ *      risks associated with Your exercise of permissions under this License.
+ *
+ *   8. Limitation of Liability. In no event and under no legal theory,
+ *      whether in tort (including negligence), contract, or otherwise,
+ *      unless required by applicable law (such as deliberate and grossly
+ *      negligent acts) or agreed to in writing, shall any Contributor be
+ *      liable to You for damages, including any direct, indirect, special,
+ *      incidental, or consequential damages of any character arising as a
+ *      result of this License or out of the use or inability to use the
+ *      Work (including but not limited to damages for loss of goodwill,
+ *      work stoppage, computer failure or malfunction, or any and all
+ *      other commercial damages or losses), even if such Contributor
+ *      has been advised of the possibility of such damages.
+ *
+ *   9. Accepting Warranty or Additional Liability. While redistributing
+ *      the Work or Derivative Works thereof, You may choose to offer,
+ *      and charge a fee for, acceptance of support, warranty, indemnity,
+ *      or other liability obligations and/or rights consistent with this
+ *      License. However, in accepting such obligations, You may act only
+ *      on Your own behalf and on Your sole responsibility, not on behalf
+ *      of any other Contributor, and only if You agree to indemnify,
+ *      defend, and hold each Contributor harmless for any liability
+ *      incurred by, or claims asserted against, such Contributor by reason
+ *      of your accepting any such warranty or additional liability.
+ *
+ *   END OF TERMS AND CONDITIONS
+ *
+ *   APPENDIX: How to apply the Apache License to your work.
+ *
+ *      To apply the Apache License to your work, attach the following
+ *      boilerplate notice, with the fields enclosed by brackets "[]"
+ *      replaced with your own identifying information. (Don't include
+ *      the brackets!)  The text should be enclosed in the appropriate
+ *      comment syntax for the file format. We also recommend that a
+ *      file or class name and description of purpose be included on the
+ *      same "printed page" as the copyright notice for easier
+ *      identification within third-party archives.
+ *
+ *   Copyright 2007 Kasper B. Graversen
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ */

+ 0 - 0
x-pack/plugin/ml/log-structure-finder/licenses/super-csv-NOTICE.txt


+ 35 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class CsvLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * Rules are:
+     * - The file must be valid CSV
+     * - It must contain at least two complete records
+     * - There must be at least two fields per record (otherwise files with no commas could be treated as CSV!)
+     * - Every CSV record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.EXCEL_PREFERENCE, "CSV");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            CsvPreference.EXCEL_PREFERENCE, false);
+    }
+}
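
Taken together these rules mean a sample needs at least two rows and at
least two columns before this factory claims it. A hedged sketch (sample
strings invented; the expected results follow from the documented rules,
not from running this code):

CsvLogStructureFinderFactory csvFactory = new CsvLogStructureFinderFactory();
List<String> explanation = new ArrayList<>();

// Three records of two fields each: eligible to be treated as CSV
boolean csv = csvFactory.canCreateFromSample(explanation, "a,b\n1,2\n3,4\n");  // expected true

// One field per record: rejected, otherwise any comma-free file would look like CSV
boolean notCsv = csvFactory.canCreateFromSample(explanation, "a\n1\n2\n");  // expected false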

+ 615 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreator.java

@@ -0,0 +1,615 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.grok.Grok;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Creates Grok patterns that will match all provided sample messages.
+ *
+ * The choice of field names is quite primitive.  The intention is that a human will edit these.
+ */
+public final class GrokPatternCreator {
+
+    private static final Map<Character, Boolean> PUNCTUATION_OR_SPACE_NEEDS_ESCAPING;
+    static {
+        HashMap<Character, Boolean> punctuationOrSpaceNeedsEscaping = new HashMap<>();
+        String punctuationAndSpaceCharacters = "\"'`‘’“”#@%=\\/|~:;,<>()[]{}«»^$*¿?¡!§¶ \t\n";
+        String punctuationThatNeedsEscaping = "\\|()[]{}^$*?";
+        punctuationAndSpaceCharacters.chars()
+            .forEach(c -> punctuationOrSpaceNeedsEscaping.put((char) c, punctuationThatNeedsEscaping.indexOf(c) >= 0));
+        PUNCTUATION_OR_SPACE_NEEDS_ESCAPING = Collections.unmodifiableMap(punctuationOrSpaceNeedsEscaping);
+    }
+
+    private static final String PREFACE = "preface";
+    private static final String VALUE = "value";
+    private static final String EPILOGUE = "epilogue";
+
+    /**
+     * Grok patterns that are designed to match the whole message, not just a part of it.
+     */
+    private static final List<FullMatchGrokPatternCandidate> FULL_MATCH_GROK_PATTERNS = Arrays.asList(
+        new FullMatchGrokPatternCandidate("BACULA_LOGLINE", "bts"),
+        new FullMatchGrokPatternCandidate("CATALINALOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("COMBINEDAPACHELOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("COMMONAPACHELOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("ELB_ACCESS_LOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("HAPROXYHTTP", "syslog_timestamp"),
+        new FullMatchGrokPatternCandidate("HAPROXYTCP", "syslog_timestamp"),
+        new FullMatchGrokPatternCandidate("HTTPD20_ERRORLOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("HTTPD24_ERRORLOG", "timestamp"),
+        new FullMatchGrokPatternCandidate("NAGIOSLOGLINE", "nagios_epoch"),
+        new FullMatchGrokPatternCandidate("NETSCREENSESSIONLOG", "date"),
+        new FullMatchGrokPatternCandidate("RAILS3", "timestamp"),
+        new FullMatchGrokPatternCandidate("RUBY_LOGGER", "timestamp"),
+        new FullMatchGrokPatternCandidate("SHOREWALL", "timestamp"),
+        new FullMatchGrokPatternCandidate("TOMCATLOG", "timestamp")
+    );
+
+    /**
+     * The first match in this list will be chosen, so it needs to be ordered
+     * such that more generic patterns come after more specific patterns.
+     */
+    private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
+        new ValueOnlyGrokPatternCandidate("TOMCAT_DATESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC822", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC2822", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_OTHER", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("DATESTAMP_EVENTLOG", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("SYSLOGTIMESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("HTTPDATE", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("CATALINA_DATESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("CISCOTIMESTAMP", "date", "extra_timestamp"),
+        new ValueOnlyGrokPatternCandidate("LOGLEVEL", "keyword", "loglevel"),
+        new ValueOnlyGrokPatternCandidate("URI", "keyword", "uri"),
+        new ValueOnlyGrokPatternCandidate("UUID", "keyword", "uuid"),
+        new ValueOnlyGrokPatternCandidate("MAC", "keyword", "macaddress"),
+        // Can't use \b as the breaks, because slashes are not "word" characters
+        new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(?<!\\w)", "(?!\\w)"),
+        new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email"),
+        // TODO: would be nice to have IPORHOST here, but HOST matches almost all words
+        new ValueOnlyGrokPatternCandidate("IP", "ip", "ipaddress"),
+        new ValueOnlyGrokPatternCandidate("DATE", "date", "date"),
+        new ValueOnlyGrokPatternCandidate("TIME", "date", "time"),
+        // This already includes pre/post break conditions
+        new ValueOnlyGrokPatternCandidate("QUOTEDSTRING", "keyword", "field", "", ""),
+        // Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
+        // up numeric suffixes too eagerly
+        new ValueOnlyGrokPatternCandidate("INT", "long", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
+        new ValueOnlyGrokPatternCandidate("NUMBER", "double", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
+        new ValueOnlyGrokPatternCandidate("BASE16NUM", "keyword", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
+        // TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
+        // Fixing these problems with overly broad matches would require some extra intelligence
+        // to be added to remove inappropriate matches.  One idea would be to use a dictionary,
+        // but that doesn't necessarily help as "jay" could be a username but is also a dictionary
+        // word (plus there's the international headache with relying on dictionaries).  Similarly,
+        // hostnames could also be dictionary words - I've worked on machines called "hippo" and
+        // "scarf" in the past.  Another idea would be to look at the adjacent characters and
+        // apply some heuristic based on those.
+    );
+
+    /**
+     * It is expected that the explanation will be shared with other code.
+     * Both this class and other classes will update it.
+     */
+    private final List<String> explanation;
+    private final Collection<String> sampleMessages;
+
+    /**
+     * It is expected that the mappings will be shared with other code.
+     * Both this class and other classes will update it.
+     */
+    private final Map<String, Object> mappings;
+    private final Map<String, Integer> fieldNameCountStore = new HashMap<>();
+    private final StringBuilder overallGrokPatternBuilder = new StringBuilder();
+
+    /**
+     *
+     * @param explanation List of reasons for making decisions.  May already contain items when passed in,
+     *                    and the methods of this class may append new reasons.
+     * @param sampleMessages Sample messages that any Grok pattern found must match.
+     * @param mappings Will be updated with mappings appropriate for the returned pattern, if non-<code>null</code>.
+     */
+    public GrokPatternCreator(List<String> explanation, Collection<String> sampleMessages, Map<String, Object> mappings) {
+        this.explanation = explanation;
+        this.sampleMessages = Collections.unmodifiableCollection(sampleMessages);
+        this.mappings = mappings;
+    }
+
+    /**
+     * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety.
+     * @return A tuple of (time field name, Grok string), or <code>null</code> if no suitable Grok pattern was found.
+     */
+    public Tuple<String, String> findFullLineGrokPattern() {
+
+        for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) {
+            if (candidate.matchesAll(sampleMessages)) {
+                return candidate.processMatch(explanation, sampleMessages, mappings);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Build a Grok pattern that will match all of the sample messages in their entirety.
+     * @param seedPatternName A pattern that has already been determined to match some portion of every sample message.
+     * @param seedFieldName The field name to be used for the portion of every sample message that the seed pattern matches.
+     * @return The built Grok pattern.
+     */
+    public String createGrokPatternFromExamples(String seedPatternName, String seedFieldName) {
+
+        overallGrokPatternBuilder.setLength(0);
+
+        GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName);
+
+        processCandidateAndSplit(seedCandidate, true, sampleMessages, false, 0, false, 0);
+
+        return overallGrokPatternBuilder.toString().replace("\t", "\\t").replace("\n", "\\n");
+    }
+
+    /**
+     * This exists purely so that unit tests that exercise implementation details can inspect the partial Grok pattern.
+     * It should not be used in production code.
+     */
+    StringBuilder getOverallGrokPatternBuilder() {
+        return overallGrokPatternBuilder;
+    }
+
+    /**
+     * Given a chosen Grok pattern and a collection of message snippets, split the snippets into the
+     * matched section and the pieces before and after it.  Recurse to find more matches in the pieces
+     * before and after and update the supplied string builder.
+     */
+    private void processCandidateAndSplit(GrokPatternCandidate chosenPattern, boolean isLast, Collection<String> snippets,
+                                          boolean ignoreKeyValueCandidateLeft, int ignoreValueOnlyCandidatesLeft,
+                                          boolean ignoreKeyValueCandidateRight, int ignoreValueOnlyCandidatesRight) {
+
+        Collection<String> prefaces = new ArrayList<>();
+        Collection<String> epilogues = new ArrayList<>();
+        String patternBuilderContent = chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings);
+        appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft);
+        overallGrokPatternBuilder.append(patternBuilderContent);
+        appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight);
+    }
+
+    /**
+     * Given a collection of message snippets, work out which (if any) of the Grok patterns we're allowed
+     * to use matches it best.  Then append the appropriate Grok language to represent that finding onto
+     * the supplied string builder.
+     */
+    void appendBestGrokMatchForStrings(boolean isLast, Collection<String> snippets,
+                                       boolean ignoreKeyValueCandidate, int ignoreValueOnlyCandidates) {
+
+        snippets = adjustForPunctuation(snippets);
+
+        GrokPatternCandidate bestCandidate = null;
+        if (snippets.isEmpty() == false) {
+            GrokPatternCandidate kvCandidate = new KeyValueGrokPatternCandidate(explanation);
+            if (ignoreKeyValueCandidate == false && kvCandidate.matchesAll(snippets)) {
+                bestCandidate = kvCandidate;
+            } else {
+                ignoreKeyValueCandidate = true;
+                for (GrokPatternCandidate candidate :
+                    ORDERED_CANDIDATE_GROK_PATTERNS.subList(ignoreValueOnlyCandidates, ORDERED_CANDIDATE_GROK_PATTERNS.size())) {
+                    if (candidate.matchesAll(snippets)) {
+                        bestCandidate = candidate;
+                        break;
+                    }
+                    ++ignoreValueOnlyCandidates;
+                }
+            }
+        }
+
+        if (bestCandidate == null) {
+            if (isLast) {
+                finalizeGrokPattern(snippets);
+            } else {
+                addIntermediateRegex(snippets);
+            }
+        } else {
+            processCandidateAndSplit(bestCandidate, isLast, snippets, true, ignoreValueOnlyCandidates + (ignoreKeyValueCandidate ? 1 : 0),
+                ignoreKeyValueCandidate, ignoreValueOnlyCandidates);
+        }
+    }
+
+    /**
+     * If the snippets supplied begin with more than 1 character of common punctuation or whitespace
+     * then add all but the last of these characters to the overall pattern and remove them from the
+     * snippets.
+     * @param snippets Input snippets - not modified.
+     * @return Output snippets, which will be a copy of the input snippets but with whatever characters
+     *         were added to <code>overallGrokPatternBuilder</code> removed from the beginning.
+     */
+    Collection<String> adjustForPunctuation(Collection<String> snippets) {
+
+        assert snippets.isEmpty() == false;
+
+        StringBuilder commonInitialPunctuation = new StringBuilder();
+
+        for (String snippet : snippets) {
+
+            if (commonInitialPunctuation.length() == 0) {
+                for (int index = 0; index < snippet.length(); ++index) {
+                    char ch = snippet.charAt(index);
+                    if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch) != null) {
+                        commonInitialPunctuation.append(ch);
+                    } else {
+                        break;
+                    }
+                }
+            } else {
+                if (commonInitialPunctuation.length() > snippet.length()) {
+                    commonInitialPunctuation.delete(snippet.length(), commonInitialPunctuation.length());
+                }
+                for (int index = 0; index < commonInitialPunctuation.length(); ++index) {
+                    char ch = snippet.charAt(index);
+                    if (ch != commonInitialPunctuation.charAt(index)) {
+                        commonInitialPunctuation.delete(index, commonInitialPunctuation.length());
+                        break;
+                    }
+                }
+            }
+
+            if (commonInitialPunctuation.length() <= 1) {
+                return snippets;
+            }
+        }
+
+        int numLiteralCharacters = commonInitialPunctuation.length() - 1;
+
+        for (int index = 0; index < numLiteralCharacters; ++index) {
+            char ch = commonInitialPunctuation.charAt(index);
+            if (PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.getOrDefault(ch, false)) {
+                overallGrokPatternBuilder.append('\\');
+            }
+            overallGrokPatternBuilder.append(ch);
+        }
+
+        return snippets.stream().map(snippet -> snippet.substring(numLiteralCharacters)).collect(Collectors.toList());
+    }
+
+    /**
+     * The first time a particular field name is passed, simply return it.
+     * The second time return it with "2" appended.
+     * The third time return it with "3" appended.
+     * Etc.
+     */
+    static String buildFieldName(Map<String, Integer> fieldNameCountStore, String fieldName) {
+        Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v));
+        return (numberSeen > 1) ? fieldName + numberSeen : fieldName;
+    }
+
+    private void addIntermediateRegex(Collection<String> snippets) {
+        addIntermediateRegex(overallGrokPatternBuilder, snippets);
+    }
+
+    public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
+        if (snippets.isEmpty()) {
+            return;
+        }
+
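+        // Walk the characters of one snippet (the "driver") and keep, in order, only the punctuation
+        // and whitespace that every other snippet also contains; anything else becomes a lazy wildcard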
+        List<String> others = new ArrayList<>(snippets);
+        String driver = others.remove(others.size() - 1);
+
+        boolean wildcardRequiredIfNonMatchFound = true;
+        for (int i = 0; i < driver.length(); ++i) {
+            char ch = driver.charAt(i);
+            Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
+            if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
+                if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
+                    patternBuilder.append(".*?");
+                }
+                if (punctuationOrSpaceNeedsEscaping) {
+                    patternBuilder.append('\\');
+                }
+                patternBuilder.append(ch);
+                wildcardRequiredIfNonMatchFound = true;
+                others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
+            } else if (wildcardRequiredIfNonMatchFound) {
+                patternBuilder.append(".*?");
+                wildcardRequiredIfNonMatchFound = false;
+            }
+        }
+
+        if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
+            patternBuilder.append(".*?");
+        }
+    }
+
+    private void finalizeGrokPattern(Collection<String> snippets) {
+        if (snippets.stream().allMatch(String::isEmpty)) {
+            return;
+        }
+
+        List<String> others = new ArrayList<>(snippets);
+        String driver = others.remove(others.size() - 1);
+
+        for (int i = 0; i < driver.length(); ++i) {
+            char ch = driver.charAt(i);
+            int driverIndex = i;
+            Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
+            if (punctuationOrSpaceNeedsEscaping != null &&
+                others.stream().allMatch(other -> other.length() > driverIndex && other.charAt(driverIndex) == ch)) {
+                if (punctuationOrSpaceNeedsEscaping) {
+                    overallGrokPatternBuilder.append('\\');
+                }
+                overallGrokPatternBuilder.append(ch);
+                if (i == driver.length() - 1 && others.stream().allMatch(driver::equals)) {
+                    return;
+                }
+            } else {
+                break;
+            }
+        }
+
+        overallGrokPatternBuilder.append(".*");
+    }
+
+    interface GrokPatternCandidate {
+
+        /**
+         * @return Does this Grok pattern candidate match all the snippets?
+         */
+        boolean matchesAll(Collection<String> snippets);
+
+        /**
+         * After it has been determined that this Grok pattern candidate matches a collection of strings,
+         * return collections of the bits that come before (prefaces) and after (epilogues) the bit
+         * that matches.  Also update mappings with the most appropriate field name and type.
+         * @return The string that needs to be incorporated into the overall Grok pattern for the line.
+         */
+        String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                               Collection<String> epilogues, Map<String, Object> mappings);
+    }
+
+    /**
+     * A Grok pattern candidate that will match a single named Grok pattern.
+     */
+    static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate {
+
+        private final String grokPatternName;
+        private final String mappingType;
+        private final String fieldName;
+        private final Grok grok;
+
+        /**
+         * Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or
+         * end with a non-"word" character (where "word" characters are letters, digits and underscores).
+         * For such patterns use one of the other constructors.
+         * <p>
+         * In cases where the Grok pattern defined by Logstash already includes conditions on what must
+         * come before and after the match, use one of the other constructors and specify an empty string
+         * for the pre and/or post breaks.
+         *
+         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
+         * @param mappingType     Data type for the field in Elasticsearch mappings.
+         * @param fieldName       Name of the field to extract from the match.
+         */
+        ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) {
+            this(grokPatternName, mappingType, fieldName, "\\b", "\\b");
+        }
+
+        /**
+         * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash.
+         * @param mappingType     Data type for field in Elasticsearch mappings.
+         * @param fieldName       Name of the field to extract from the match.
+         * @param preBreak        Only consider the match if it's broken from the previous text by this.
+         * @param postBreak       Only consider the match if it's broken from the following text by this.
+         */
+        ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) {
+            this.grokPatternName = grokPatternName;
+            this.mappingType = mappingType;
+            this.fieldName = fieldName;
+            // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
+            grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + preBreak +
+                "%{" + grokPatternName + ":" + VALUE + "}" + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}");
+        }
+
+        @Override
+        public boolean matchesAll(Collection<String> snippets) {
+            return snippets.stream().allMatch(grok::match);
+        }
+
+        /**
+         * Given a collection of strings, and a Grok pattern that matches some part of them all,
+         * return collections of the bits that come before (prefaces) and after (epilogues) the
+         * bit that matches.
+         */
+        @Override
+        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                                      Collection<String> epilogues, Map<String, Object> mappings) {
+            String sampleValue = null;
+            for (String snippet : snippets) {
+                Map<String, Object> captures = grok.captures(snippet);
+                // If the pattern doesn't match then captures will be null
+                if (captures == null) {
+                    throw new IllegalStateException("[%{" + grokPatternName + "}] does not match snippet [" + snippet + "]");
+                }
+                prefaces.add(captures.getOrDefault(PREFACE, "").toString());
+                if (sampleValue == null) {
+                    sampleValue = captures.get(VALUE).toString();
+                }
+                epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
+            }
+            String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
+            if (mappings != null) {
+                Map<String, String> fullMappingType = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, mappingType);
+                if ("date".equals(mappingType)) {
+                    TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(sampleValue);
+                    if (timestampMatch != null) {
+                        fullMappingType = timestampMatch.getEsDateMappingTypeWithFormat();
+                    }
+                }
+                mappings.put(adjustedFieldName, fullMappingType);
+            }
+            return "%{" + grokPatternName + ":" + adjustedFieldName + "}";
+        }
+    }
+
+    /**
+     * Unlike {@link ValueOnlyGrokPatternCandidate}, an object of this class is neither immutable nor thread safe.
+     * When a given object matches a set of strings it chooses a field name.  Then that same field name is used when
+     * processing captures from the pattern.  Hence only a single thread may use any particular instance of this
+     * class.
+     */
+    static class KeyValueGrokPatternCandidate implements GrokPatternCandidate {
+
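+        // Matches a key=value pair where the key is a "word" and the value consists of word characters, dots and dashes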
+        private static final Pattern kvFinder = Pattern.compile("\\b(\\w+)=[\\w.-]+");
+        private final List<String> explanation;
+        private String fieldName;
+
+        KeyValueGrokPatternCandidate(List<String> explanation) {
+            this.explanation = explanation;
+        }
+
+        @Override
+        public boolean matchesAll(Collection<String> snippets) {
+            Set<String> candidateNames = new LinkedHashSet<>();
+            boolean isFirst = true;
+            for (String snippet : snippets) {
+                if (isFirst) {
+                    Matcher matcher = kvFinder.matcher(snippet);
+                    while (matcher.find()) {
+                        candidateNames.add(matcher.group(1));
+                    }
+                    isFirst = false;
+                } else {
+                    candidateNames.removeIf(candidateName ->
+                        Pattern.compile("\\b" + candidateName + "=[\\w.-]+").matcher(snippet).find() == false);
+                }
+                if (candidateNames.isEmpty()) {
+                    break;
+                }
+            }
+            return (fieldName = candidateNames.stream().findFirst().orElse(null)) != null;
+        }
+
+        @Override
+        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                                      Collection<String> epilogues, Map<String, Object> mappings) {
+            if (fieldName == null) {
+                throw new IllegalStateException("Cannot process KV matches until a field name has been determined");
+            }
+            Grok grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}\\b" +
+                fieldName + "=%{USER:" + VALUE + "}%{GREEDYDATA:" + EPILOGUE + "}");
+            Collection<String> values = new ArrayList<>();
+            for (String snippet : snippets) {
+                Map<String, Object> captures = grok.captures(snippet);
+                // If the pattern doesn't match then captures will be null
+                if (captures == null) {
+                    throw new IllegalStateException("[\\b" + fieldName + "=%{USER}] does not match snippet [" + snippet + "]");
+                }
+                prefaces.add(captures.getOrDefault(PREFACE, "").toString());
+                values.add(captures.getOrDefault(VALUE, "").toString());
+                epilogues.add(captures.getOrDefault(EPILOGUE, "").toString());
+            }
+            String adjustedFieldName = buildFieldName(fieldNameCountStore, fieldName);
+            if (mappings != null) {
+                mappings.put(adjustedFieldName, LogStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values));
+            }
+            return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}";
+        }
+    }
+
+    /**
+     * A Grok pattern candidate that matches a single named Grok pattern but will not update mappings.
+     */
+    static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate {
+
+        NoMappingGrokPatternCandidate(String grokPatternName, String fieldName) {
+            super(grokPatternName, null, fieldName);
+        }
+
+        @Override
+        public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
+                                      Collection<String> epilogues, Map<String, Object> mappings) {
+            return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null);
+        }
+    }
+
+    /**
+     * Used to check whether a single Grok pattern matches every sample message in its entirety.
+     */
+    static class FullMatchGrokPatternCandidate {
+
+        private final String grokString;
+        private final String timeField;
+        private final Grok grok;
+
+        FullMatchGrokPatternCandidate(String grokPatternName, String timeField) {
+            grokString = "%{" + grokPatternName + "}";
+            this.timeField = timeField;
+            grok = new Grok(Grok.getBuiltinPatterns(), grokString);
+        }
+
+        public boolean matchesAll(Collection<String> sampleMessages) {
+            return sampleMessages.stream().allMatch(grok::match);
+        }
+
+        /**
+         * This must only be called if {@link #matchesAll} returns <code>true</code>.
+         * @return A tuple of (time field name, Grok string).
+         */
+        public Tuple<String, String> processMatch(List<String> explanation, Collection<String> sampleMessages,
+                                                  Map<String, Object> mappings) {
+
+            explanation.add("A full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate");
+
+            if (mappings != null) {
+                Map<String, Collection<String>> valuesPerField = new HashMap<>();
+
+                for (String sampleMessage : sampleMessages) {
+                    Map<String, Object> captures = grok.captures(sampleMessage);
+                    // If the pattern doesn't match then captures will be null
+                    if (captures == null) {
+                        throw new IllegalStateException("[" + grokString + "] does not match snippet [" + sampleMessage + "]");
+                    }
+                    for (Map.Entry<String, Object> capture : captures.entrySet()) {
+
+                        String fieldName = capture.getKey();
+                        String fieldValue = capture.getValue().toString();
+
+                        // Exclude the time field because that will be dropped and replaced with @timestamp
+                        if (fieldName.equals(timeField) == false) {
+                            valuesPerField.compute(fieldName, (k, v) -> {
+                                if (v == null) {
+                                    return new ArrayList<>(Collections.singletonList(fieldValue));
+                                } else {
+                                    v.add(fieldValue);
+                                    return v;
+                                }
+                            });
+                        }
+                    }
+                }
+
+                for (Map.Entry<String, Collection<String>> valuesForField : valuesPerField.entrySet()) {
+                    String fieldName = valuesForField.getKey();
+                    mappings.put(fieldName,
+                        LogStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
+                }
+            }
+
+            return new Tuple<>(timeField, grokString);
+        }
+    }
+}
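
To show how the pieces above fit together, a rough usage sketch (the two
sample messages are invented; the constructor and method signatures are the
ones defined in this file, and the java.util and Tuple imports are assumed):

List<String> explanation = new ArrayList<>();
List<String> sampleMessages = Arrays.asList(
    "[2018-08-10T13:00:00,000][INFO ][o.e.n.Node] starting",
    "[2018-08-10T13:00:01,000][WARN ][o.e.n.Node] stopping");
Map<String, Object> mappings = new HashMap<>();

GrokPatternCreator creator = new GrokPatternCreator(explanation, sampleMessages, mappings);

// First preference: one of the curated full-line patterns matches every message
Tuple<String, String> timeFieldAndGrokString = creator.findFullLineGrokPattern();

if (timeFieldAndGrokString == null) {
    // Otherwise grow a pattern outwards from a seed known to match part of every
    // message, e.g. a timestamp located earlier by TimestampFormatFinder
    String grokPattern = creator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp");
}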

+ 84 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinder.java

@@ -0,0 +1,84 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.xcontent.DeprecationHandler;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.stream.Collectors;
+
+import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
+
+/**
+ * Really ND-JSON: each line of the sample is expected to be a complete JSON document.
+ */
+public class JsonLogStructureFinder implements LogStructureFinder {
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static JsonLogStructureFinder makeJsonLogStructureFinder(List<String> explanation, String sample, String charsetName,
+                                                             Boolean hasByteOrderMarker) throws IOException {
+
+        List<Map<String, ?>> sampleRecords = new ArrayList<>();
+
+        List<String> sampleMessages = Arrays.asList(sample.split("\n"));
+        for (String sampleMessage : sampleMessages) {
+            try (XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY,
+                DeprecationHandler.THROW_UNSUPPORTED_OPERATION, sampleMessage)) {
+                // Close each parser after use, mirroring the try-with-resources in the factory
+                sampleRecords.add(parser.mapOrdered());
+            }
+        }
+
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.JSON)
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n")))
+            .setNumLinesAnalyzed(sampleMessages.size())
+            .setNumMessagesAnalyzed(sampleRecords.size());
+
+        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
+        if (timeField != null) {
+            structureBuilder.setTimestampField(timeField.v1())
+                .setTimestampFormats(timeField.v2().dateFormats)
+                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
+        }
+
+        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        LogStructure structure = structureBuilder
+            .setMappings(mappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new JsonLogStructureFinder(sampleMessages, structure);
+    }
+
+    private JsonLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+}
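
A usage sketch for the finder above (package-private, so called from within the package; the sample lines and field names are invented; imports are elided):

    List<String> explanation = new ArrayList<>();
    String sample = "{\"time\":\"2018-08-14T12:00:00Z\",\"message\":\"started\"}\n" +
        "{\"time\":\"2018-08-14T12:00:01Z\",\"message\":\"stopped\"}\n";
    JsonLogStructureFinder finder =
        JsonLogStructureFinder.makeJsonLogStructureFinder(explanation, sample, "UTF-8", false);
    LogStructure structure = finder.getStructure();
    // structure.getFormat() == Format.JSON, and the mappings always include a "date"
    // mapping for @timestamp, added above regardless of whether a time field was found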

+ 87 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactory.java

@@ -0,0 +1,87 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.xcontent.DeprecationHandler;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
+import org.elasticsearch.common.xcontent.XContentParser;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.List;
+import java.util.Locale;
+
+import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
+
+public class JsonLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * This format matches if the sample consists of one or more JSON documents.
+     * If there is more than one, they must be newline-delimited.  The
+     * documents must be non-empty, to prevent lines containing "{}" from matching.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+
+        int completeDocCount = 0;
+
+        try {
+            String[] sampleLines = sample.split("\n");
+            for (String sampleLine : sampleLines) {
+                try (XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY,
+                    DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) {
+
+                    if (parser.map().isEmpty()) {
+                        explanation.add("Not JSON because an empty object was parsed: [" + sampleLine + "]");
+                        return false;
+                    }
+                    ++completeDocCount;
+                    if (parser.nextToken() != null) {
+                        explanation.add("Not newline delimited JSON because a line contained more than a single object: [" +
+                            sampleLine + "]");
+                        return false;
+                    }
+                }
+            }
+        } catch (IOException | IllegalStateException e) {
+            explanation.add("Not JSON because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
+            return false;
+        }
+
+        if (completeDocCount == 0) {
+            explanation.add("Not JSON because sample didn't contain a complete document");
+            return false;
+        }
+
+        explanation.add("Deciding sample is newline delimited JSON");
+        return true;
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return JsonLogStructureFinder.makeJsonLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    }
+
+    private static class ContextPrintingStringReader extends StringReader {
+
+        private final String str;
+
+        ContextPrintingStringReader(String str) {
+            super(str);
+            this.str = str;
+        }
+
+        @Override
+        public String toString() {
+            if (str.length() <= 80) {
+                return String.format(Locale.ROOT, "\"%s\"", str);
+            } else {
+                return String.format(Locale.ROOT, "\"%.77s...\"", str);
+            }
+        }
+    }
+}
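
The rejection rules above can be seen directly on small samples (a hypothetical sketch; imports elided):

    JsonLogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
    List<String> explanation = new ArrayList<>();
    factory.canCreateFromSample(explanation, "{\"a\":1}\n{\"a\":2}\n"); // true: newline-delimited JSON
    factory.canCreateFromSample(explanation, "{}\n");                   // false: empty object parsed
    factory.canCreateFromSample(explanation, "{\"a\":1} {\"a\":2}\n");  // false: two objects on one line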

+ 614 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructure.java

@@ -0,0 +1,614 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.xcontent.ObjectParser;
+import org.elasticsearch.common.xcontent.ToXContentObject;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+/**
+ * Stores the log file format determined by a {@link LogStructureFinder}.
+ */
+public class LogStructure implements ToXContentObject {
+
+    public enum Format {
+
+        JSON, XML, CSV, TSV, SEMI_COLON_SEPARATED_VALUES, PIPE_SEPARATED_VALUES, SEMI_STRUCTURED_TEXT;
+
+        public Character separator() {
+            switch (this) {
+                case JSON:
+                case XML:
+                    return null;
+                case CSV:
+                    return ',';
+                case TSV:
+                    return '\t';
+                case SEMI_COLON_SEPARATED_VALUES:
+                    return ';';
+                case PIPE_SEPARATED_VALUES:
+                    return '|';
+                case SEMI_STRUCTURED_TEXT:
+                    return null;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean supportsNesting() {
+            switch (this) {
+                case JSON:
+                case XML:
+                    return true;
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                case SEMI_STRUCTURED_TEXT:
+                    return false;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean isStructured() {
+            switch (this) {
+                case JSON:
+                case XML:
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    return true;
+                case SEMI_STRUCTURED_TEXT:
+                    return false;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean isSemiStructured() {
+            switch (this) {
+                case JSON:
+                case XML:
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    return false;
+                case SEMI_STRUCTURED_TEXT:
+                    return true;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public boolean isSeparatedValues() {
+            switch (this) {
+                case JSON:
+                case XML:
+                    return false;
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    return true;
+                case SEMI_STRUCTURED_TEXT:
+                    return false;
+                default:
+                    throw new IllegalStateException("enum value [" + this + "] missing from switch.");
+            }
+        }
+
+        public static Format fromSeparator(char separator) {
+            switch (separator) {
+                case ',':
+                    return CSV;
+                case '\t':
+                    return TSV;
+                case ';':
+                    return SEMI_COLON_SEPARATED_VALUES;
+                case '|':
+                    return PIPE_SEPARATED_VALUES;
+                default:
+                    throw new IllegalArgumentException("No known format has separator [" + separator + "]");
+            }
+        }
+
+        public static Format fromString(String name) {
+            return valueOf(name.trim().toUpperCase(Locale.ROOT));
+        }
+
+        @Override
+        public String toString() {
+            return name().toLowerCase(Locale.ROOT);
+        }
+    }
+
+    static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
+    static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
+    static final ParseField SAMPLE_START = new ParseField("sample_start");
+    static final ParseField CHARSET = new ParseField("charset");
+    static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker");
+    static final ParseField STRUCTURE = new ParseField("format");
+    static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
+    static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
+    static final ParseField INPUT_FIELDS = new ParseField("input_fields");
+    static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
+    static final ParseField SEPARATOR = new ParseField("separator");
+    static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
+    static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
+    static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field");
+    static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
+    static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
+    static final ParseField MAPPINGS = new ParseField("mappings");
+    static final ParseField EXPLANATION = new ParseField("explanation");
+
+    public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>("log_file_structure", false, Builder::new);
+
+    static {
+        PARSER.declareInt(Builder::setNumLinesAnalyzed, NUM_LINES_ANALYZED);
+        PARSER.declareInt(Builder::setNumMessagesAnalyzed, NUM_MESSAGES_ANALYZED);
+        PARSER.declareString(Builder::setSampleStart, SAMPLE_START);
+        PARSER.declareString(Builder::setCharset, CHARSET);
+        PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER);
+        PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE);
+        PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN);
+        PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN);
+        PARSER.declareStringArray(Builder::setInputFields, INPUT_FIELDS);
+        PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW);
+        PARSER.declareString((p, c) -> p.setSeparator(c.charAt(0)), SEPARATOR);
+        PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
+        PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN);
+        PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD);
+        PARSER.declareStringArray(Builder::setTimestampFormats, TIMESTAMP_FORMATS);
+        PARSER.declareBoolean(Builder::setNeedClientTimezone, NEED_CLIENT_TIMEZONE);
+        PARSER.declareObject(Builder::setMappings, (p, c) -> new TreeMap<>(p.map()), MAPPINGS);
+        PARSER.declareStringArray(Builder::setExplanation, EXPLANATION);
+    }
+
+    private final int numLinesAnalyzed;
+    private final int numMessagesAnalyzed;
+    private final String sampleStart;
+    private final String charset;
+    private final Boolean hasByteOrderMarker;
+    private final Format format;
+    private final String multilineStartPattern;
+    private final String excludeLinesPattern;
+    private final List<String> inputFields;
+    private final Boolean hasHeaderRow;
+    private final Character separator;
+    private final Boolean shouldTrimFields;
+    private final String grokPattern;
+    private final List<String> timestampFormats;
+    private final String timestampField;
+    private final boolean needClientTimezone;
+    private final SortedMap<String, Object> mappings;
+    private final List<String> explanation;
+
+    public LogStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
+                        Format format, String multilineStartPattern, String excludeLinesPattern, List<String> inputFields,
+                        Boolean hasHeaderRow, Character separator, Boolean shouldTrimFields, String grokPattern, String timestampField,
+                        List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
+                        List<String> explanation) {
+
+        this.numLinesAnalyzed = numLinesAnalyzed;
+        this.numMessagesAnalyzed = numMessagesAnalyzed;
+        this.sampleStart = Objects.requireNonNull(sampleStart);
+        this.charset = Objects.requireNonNull(charset);
+        this.hasByteOrderMarker = hasByteOrderMarker;
+        this.format = Objects.requireNonNull(format);
+        this.multilineStartPattern = multilineStartPattern;
+        this.excludeLinesPattern = excludeLinesPattern;
+        this.inputFields = (inputFields == null) ? null : Collections.unmodifiableList(new ArrayList<>(inputFields));
+        this.hasHeaderRow = hasHeaderRow;
+        this.separator = separator;
+        this.shouldTrimFields = shouldTrimFields;
+        this.grokPattern = grokPattern;
+        this.timestampField = timestampField;
+        this.timestampFormats = (timestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(timestampFormats));
+        this.needClientTimezone = needClientTimezone;
+        this.mappings = Collections.unmodifiableSortedMap(new TreeMap<>(mappings));
+        this.explanation = Collections.unmodifiableList(new ArrayList<>(explanation));
+    }
+
+    public int getNumLinesAnalyzed() {
+        return numLinesAnalyzed;
+    }
+
+    public int getNumMessagesAnalyzed() {
+        return numMessagesAnalyzed;
+    }
+
+    public String getSampleStart() {
+        return sampleStart;
+    }
+
+    public String getCharset() {
+        return charset;
+    }
+
+    public Boolean getHasByteOrderMarker() {
+        return hasByteOrderMarker;
+    }
+
+    public Format getFormat() {
+        return format;
+    }
+
+    public String getMultilineStartPattern() {
+        return multilineStartPattern;
+    }
+
+    public String getExcludeLinesPattern() {
+        return excludeLinesPattern;
+    }
+
+    public List<String> getInputFields() {
+        return inputFields;
+    }
+
+    public Boolean getHasHeaderRow() {
+        return hasHeaderRow;
+    }
+
+    public Character getSeparator() {
+        return separator;
+    }
+
+    public Boolean getShouldTrimFields() {
+        return shouldTrimFields;
+    }
+
+    public String getGrokPattern() {
+        return grokPattern;
+    }
+
+    public String getTimestampField() {
+        return timestampField;
+    }
+
+    public List<String> getTimestampFormats() {
+        return timestampFormats;
+    }
+
+    public boolean needClientTimezone() {
+        return needClientTimezone;
+    }
+
+    public SortedMap<String, Object> getMappings() {
+        return mappings;
+    }
+
+    public List<String> getExplanation() {
+        return explanation;
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+
+        builder.startObject();
+        builder.field(NUM_LINES_ANALYZED.getPreferredName(), numLinesAnalyzed);
+        builder.field(NUM_MESSAGES_ANALYZED.getPreferredName(), numMessagesAnalyzed);
+        builder.field(SAMPLE_START.getPreferredName(), sampleStart);
+        builder.field(CHARSET.getPreferredName(), charset);
+        if (hasByteOrderMarker != null) {
+            builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue());
+        }
+        builder.field(STRUCTURE.getPreferredName(), format);
+        if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) {
+            builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern);
+        }
+        if (excludeLinesPattern != null && excludeLinesPattern.isEmpty() == false) {
+            builder.field(EXCLUDE_LINES_PATTERN.getPreferredName(), excludeLinesPattern);
+        }
+        if (inputFields != null && inputFields.isEmpty() == false) {
+            builder.field(INPUT_FIELDS.getPreferredName(), inputFields);
+        }
+        if (hasHeaderRow != null) {
+            builder.field(HAS_HEADER_ROW.getPreferredName(), hasHeaderRow.booleanValue());
+        }
+        if (separator != null) {
+            builder.field(SEPARATOR.getPreferredName(), String.valueOf(separator));
+        }
+        if (shouldTrimFields != null) {
+            builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue());
+        }
+        if (grokPattern != null && grokPattern.isEmpty() == false) {
+            builder.field(GROK_PATTERN.getPreferredName(), grokPattern);
+        }
+        if (timestampField != null && timestampField.isEmpty() == false) {
+            builder.field(TIMESTAMP_FIELD.getPreferredName(), timestampField);
+        }
+        if (timestampFormats != null && timestampFormats.isEmpty() == false) {
+            builder.field(TIMESTAMP_FORMATS.getPreferredName(), timestampFormats);
+        }
+        builder.field(NEED_CLIENT_TIMEZONE.getPreferredName(), needClientTimezone);
+        builder.field(MAPPINGS.getPreferredName(), mappings);
+        builder.field(EXPLANATION.getPreferredName(), explanation);
+        builder.endObject();
+
+        return builder;
+    }
+
+    @Override
+    public int hashCode() {
+
+        return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
+            multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern, timestampField,
+            timestampFormats, needClientTimezone, mappings, explanation);
+    }
+
+    @Override
+    public boolean equals(Object other) {
+
+        if (this == other) {
+            return true;
+        }
+
+        if (other == null || getClass() != other.getClass()) {
+            return false;
+        }
+
+        LogStructure that = (LogStructure) other;
+        return this.numLinesAnalyzed == that.numLinesAnalyzed &&
+            this.numMessagesAnalyzed == that.numMessagesAnalyzed &&
+            this.needClientTimezone == that.needClientTimezone &&
+            Objects.equals(this.sampleStart, that.sampleStart) &&
+            Objects.equals(this.charset, that.charset) &&
+            Objects.equals(this.hasByteOrderMarker, that.hasByteOrderMarker) &&
+            Objects.equals(this.format, that.format) &&
+            Objects.equals(this.multilineStartPattern, that.multilineStartPattern) &&
+            Objects.equals(this.excludeLinesPattern, that.excludeLinesPattern) &&
+            Objects.equals(this.inputFields, that.inputFields) &&
+            Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
+            Objects.equals(this.separator, that.separator) &&
+            Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
+            Objects.equals(this.grokPattern, that.grokPattern) &&
+            Objects.equals(this.timestampField, that.timestampField) &&
+            Objects.equals(this.timestampFormats, that.timestampFormats) &&
+            Objects.equals(this.mappings, that.mappings) &&
+            Objects.equals(this.explanation, that.explanation);
+    }
+
+    public static class Builder {
+
+        private int numLinesAnalyzed;
+        private int numMessagesAnalyzed;
+        private String sampleStart;
+        private String charset;
+        private Boolean hasByteOrderMarker;
+        private Format format;
+        private String multilineStartPattern;
+        private String excludeLinesPattern;
+        private List<String> inputFields;
+        private Boolean hasHeaderRow;
+        private Character separator;
+        private Boolean shouldTrimFields;
+        private String grokPattern;
+        private String timestampField;
+        private List<String> timestampFormats;
+        private boolean needClientTimezone;
+        private Map<String, Object> mappings;
+        private List<String> explanation;
+
+        public Builder() {
+            this(Format.SEMI_STRUCTURED_TEXT);
+        }
+
+        public Builder(Format format) {
+            setFormat(format);
+        }
+
+        public Builder setNumLinesAnalyzed(int numLinesAnalyzed) {
+            this.numLinesAnalyzed = numLinesAnalyzed;
+            return this;
+        }
+
+        public Builder setNumMessagesAnalyzed(int numMessagesAnalyzed) {
+            this.numMessagesAnalyzed = numMessagesAnalyzed;
+            return this;
+        }
+
+        public Builder setSampleStart(String sampleStart) {
+            this.sampleStart = Objects.requireNonNull(sampleStart);
+            return this;
+        }
+
+        public Builder setCharset(String charset) {
+            this.charset = Objects.requireNonNull(charset);
+            return this;
+        }
+
+        public Builder setHasByteOrderMarker(Boolean hasByteOrderMarker) {
+            this.hasByteOrderMarker = hasByteOrderMarker;
+            return this;
+        }
+
+        public Builder setFormat(Format format) {
+            this.format = Objects.requireNonNull(format);
+            this.separator = format.separator();
+            return this;
+        }
+
+        public Builder setMultilineStartPattern(String multilineStartPattern) {
+            this.multilineStartPattern = multilineStartPattern;
+            return this;
+        }
+
+        public Builder setExcludeLinesPattern(String excludeLinesPattern) {
+            this.excludeLinesPattern = excludeLinesPattern;
+            return this;
+        }
+
+        public Builder setInputFields(List<String> inputFields) {
+            this.inputFields = inputFields;
+            return this;
+        }
+
+        public Builder setHasHeaderRow(Boolean hasHeaderRow) {
+            this.hasHeaderRow = hasHeaderRow;
+            return this;
+        }
+
+        public Builder setShouldTrimFields(Boolean shouldTrimFields) {
+            this.shouldTrimFields = shouldTrimFields;
+            return this;
+        }
+
+        public Builder setSeparator(Character separator) {
+            this.separator = separator;
+            return this;
+        }
+
+        public Builder setGrokPattern(String grokPattern) {
+            this.grokPattern = grokPattern;
+            return this;
+        }
+
+        public Builder setTimestampField(String timestampField) {
+            this.timestampField = timestampField;
+            return this;
+        }
+
+        public Builder setTimestampFormats(List<String> timestampFormats) {
+            this.timestampFormats = timestampFormats;
+            return this;
+        }
+
+        public Builder setNeedClientTimezone(boolean needClientTimezone) {
+            this.needClientTimezone = needClientTimezone;
+            return this;
+        }
+
+        public Builder setMappings(Map<String, Object> mappings) {
+            this.mappings = Objects.requireNonNull(mappings);
+            return this;
+        }
+
+        public Builder setExplanation(List<String> explanation) {
+            this.explanation = Objects.requireNonNull(explanation);
+            return this;
+        }
+
+        @SuppressWarnings("fallthrough")
+        public LogStructure build() {
+
+            if (numLinesAnalyzed <= 0) {
+                throw new IllegalArgumentException("Number of lines analyzed must be positive.");
+            }
+
+            if (numMessagesAnalyzed <= 0) {
+                throw new IllegalArgumentException("Number of messages analyzed must be positive.");
+            }
+
+            if (numMessagesAnalyzed > numLinesAnalyzed) {
+                throw new IllegalArgumentException("Number of messages analyzed cannot be greater than number of lines analyzed.");
+            }
+
+            if (sampleStart == null || sampleStart.isEmpty()) {
+                throw new IllegalArgumentException("Sample start must be specified.");
+            }
+
+            if (charset == null || charset.isEmpty()) {
+                throw new IllegalArgumentException("A character set must be specified.");
+            }
+
+            if (charset.toUpperCase(Locale.ROOT).startsWith("UTF") == false && hasByteOrderMarker != null) {
+                throw new IllegalArgumentException("A byte order marker is only possible for UTF character sets.");
+            }
+
+            switch (format) {
+                case JSON:
+                    if (shouldTrimFields != null) {
+                        throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
+                    }
+                    // $FALL-THROUGH$
+                case XML:
+                    if (hasHeaderRow != null) {
+                        throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
+                    }
+                    if (separator != null) {
+                        throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
+                    }
+                    if (grokPattern != null) {
+                        throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
+                    }
+                    break;
+                case CSV:
+                case TSV:
+                case SEMI_COLON_SEPARATED_VALUES:
+                case PIPE_SEPARATED_VALUES:
+                    if (inputFields == null || inputFields.isEmpty()) {
+                        throw new IllegalArgumentException("Input fields must be specified for [" + format + "] structures.");
+                    }
+                    if (hasHeaderRow == null) {
+                        throw new IllegalArgumentException("Has header row must be specified for [" + format + "] structures.");
+                    }
+                    Character expectedSeparator = format.separator();
+                    assert expectedSeparator != null;
+                    if (expectedSeparator.equals(separator) == false) {
+                        throw new IllegalArgumentException("Separator must be [" + expectedSeparator + "] for [" + format +
+                            "] structures.");
+                    }
+                    if (grokPattern != null) {
+                        throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
+                    }
+                    break;
+                case SEMI_STRUCTURED_TEXT:
+                    if (inputFields != null) {
+                        throw new IllegalArgumentException("Input fields may not be specified for [" + format + "] structures.");
+                    }
+                    if (hasHeaderRow != null) {
+                        throw new IllegalArgumentException("Has header row may not be specified for [" + format + "] structures.");
+                    }
+                    if (separator != null) {
+                        throw new IllegalArgumentException("Separator may not be specified for [" + format + "] structures.");
+                    }
+                    if (shouldTrimFields != null) {
+                        throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
+                    }
+                    if (grokPattern == null || grokPattern.isEmpty()) {
+                        throw new IllegalArgumentException("Grok pattern must be specified for [" + format + "] structures.");
+                    }
+                    break;
+                default:
+                    throw new IllegalStateException("enum value [" + format + "] missing from switch.");
+            }
+
+            if ((timestampField == null) != (timestampFormats == null || timestampFormats.isEmpty())) {
+                throw new IllegalArgumentException("Timestamp field and timestamp formats must both be specified or neither be specified.");
+            }
+
+            if (needClientTimezone && timestampField == null) {
+                throw new IllegalArgumentException("Client timezone cannot be needed if there is no timestamp field.");
+            }
+
+            if (mappings == null || mappings.isEmpty()) {
+                throw new IllegalArgumentException("Mappings must be specified.");
+            }
+
+            if (explanation == null || explanation.isEmpty()) {
+                throw new IllegalArgumentException("Explanation must be specified.");
+            }
+
+            return new LogStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
+                multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, separator, shouldTrimFields, grokPattern,
+                timestampField, timestampFormats, needClientTimezone, mappings, explanation);
+        }
+    }
+}
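
A sketch of a Builder round trip that satisfies the validation in build() (all values invented; imports elided):

    LogStructure csvStructure = new LogStructure.Builder(LogStructure.Format.CSV) // also sets separator ','
        .setNumLinesAnalyzed(20)
        .setNumMessagesAnalyzed(19)          // must not exceed the lines analyzed
        .setSampleStart("ts,message\n")
        .setCharset("UTF-8")
        .setInputFields(Arrays.asList("ts", "message"))
        .setHasHeaderRow(true)               // mandatory for separated values formats
        .setMappings(Collections.singletonMap("message",
            Collections.singletonMap("type", "keyword")))
        .setExplanation(Collections.singletonList("example"))
        .build();                            // would throw if, say, a grok pattern were also set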

+ 23 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinder.java

@@ -0,0 +1,23 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.List;
+
+public interface LogStructureFinder {
+
+    /**
+     * The (possibly multi-line) messages that the log sample was split into.
+     * @return A list of messages.
+     */
+    List<String> getSampleMessages();
+
+    /**
+     * Retrieve the structure of the log file used to instantiate the finder.
+     * @return The log file structure.
+     */
+    LogStructure getStructure();
+}

+ 35 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.List;
+
+public interface LogStructureFinderFactory {
+
+    /**
+     * Given a sample of a log file, decide whether this factory will be able
+     * to create an appropriate object to represent its ingestion configs.
+     * @param explanation List of reasons for making decisions.  May already contain items when
+     *                    passed, and this method may append new reasons to it.
+     * @param sample A sample from the log file to be ingested.
+     * @return <code>true</code> if this factory can create an appropriate log
+     *         file structure given the sample; otherwise <code>false</code>.
+     */
+    boolean canCreateFromSample(List<String> explanation, String sample);
+
+    /**
+     * Create an object representing the structure of a log file.
+     * @param explanation List of reasons for making decisions.  May already contain items when
+     *                    passed, and this method may append new reasons to it.
+     * @param sample A sample from the log file to be ingested.
+     * @param charsetName The name of the character set in which the sample was provided.
+     * @param hasByteOrderMarker Did the sample have a byte order marker?  <code>null</code> means "not relevant".
+     * @return A log file structure object suitable for ingesting the supplied sample.
+     * @throws Exception if something goes wrong during creation.
+     */
+    LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws Exception;
+}
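
A sketch of the two-phase contract a caller follows (hypothetical driver code; imports elided, and the enclosing method is assumed to declare throws Exception):

    String sample = "{\"a\":1}\n{\"a\":2}\n";
    List<String> explanation = new ArrayList<>();
    LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
    if (factory.canCreateFromSample(explanation, sample)) {
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", null);
        // the finder exposes both the deduced structure and the messages it was built from
        LogStructure structure = finder.getStructure();
        List<String> messages = finder.getSampleMessages();
    }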

+ 232 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java

@@ -0,0 +1,232 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+import org.elasticsearch.common.collect.Tuple;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Optional;
+import java.util.Set;
+
+/**
+ * Runs the high-level steps needed to create ingest configs for the specified log file.  In order:
+ * 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
+ * 2. Load a sample of the file, consisting of the first 1000 lines of the file
+ * 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text
+ * 4. Create an appropriate structure object and delegate writing configs to it
+ */
+public final class LogStructureFinderManager {
+
+    public static final int MIN_SAMPLE_LINE_COUNT = 2;
+
+    static final Set<String> FILEBEAT_SUPPORTED_ENCODINGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+        "866", "ansi_x3.4-1968", "arabic", "ascii", "asmo-708", "big5", "big5-hkscs", "chinese", "cn-big5", "cp1250", "cp1251", "cp1252",
+        "cp1253", "cp1254", "cp1255", "cp1256", "cp1257", "cp1258", "cp819", "cp866", "csbig5", "cseuckr", "cseucpkdfmtjapanese",
+        "csgb2312", "csibm866", "csiso2022jp", "csiso2022kr", "csiso58gb231280", "csiso88596e", "csiso88596i", "csiso88598e", "csiso88598i",
+        "csisolatin1", "csisolatin2", "csisolatin3", "csisolatin4", "csisolatin5", "csisolatin6", "csisolatin9", "csisolatinarabic",
+        "csisolatincyrillic", "csisolatingreek", "csisolatinhebrew", "cskoi8r", "csksc56011987", "csmacintosh", "csshiftjis", "cyrillic",
+        "dos-874", "ecma-114", "ecma-118", "elot_928", "euc-jp", "euc-kr", "gb18030", "gb2312", "gb_2312", "gb_2312-80", "gbk", "greek",
+        "greek8", "hebrew", "hz-gb-2312", "ibm819", "ibm866", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-jp", "iso-2022-kr", "iso-8859-1",
+        "iso-8859-10", "iso-8859-11", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "iso-8859-2", "iso-8859-3", "iso-8859-4",
+        "iso-8859-5", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-8859-7", "iso-8859-8", "iso-8859-8-e", "iso-8859-8-i",
+        "iso-8859-9", "iso-ir-100", "iso-ir-101", "iso-ir-109", "iso-ir-110", "iso-ir-126", "iso-ir-127", "iso-ir-138", "iso-ir-144",
+        "iso-ir-148", "iso-ir-149", "iso-ir-157", "iso-ir-58", "iso8859-1", "iso8859-10", "iso8859-11", "iso8859-13", "iso8859-14",
+        "iso8859-15", "iso8859-2", "iso8859-3", "iso8859-4", "iso8859-5", "iso8859-6", "iso8859-6e", "iso8859-6i", "iso8859-7", "iso8859-8",
+        "iso8859-8e", "iso8859-8i", "iso8859-9", "iso88591", "iso885910", "iso885911", "iso885913", "iso885914", "iso885915", "iso88592",
+        "iso88593", "iso88594", "iso88595", "iso88596", "iso88597", "iso88598", "iso88599", "iso_8859-1", "iso_8859-15", "iso_8859-1:1987",
+        "iso_8859-2", "iso_8859-2:1987", "iso_8859-3", "iso_8859-3:1988", "iso_8859-4", "iso_8859-4:1988", "iso_8859-5", "iso_8859-5:1988",
+        "iso_8859-6", "iso_8859-6:1987", "iso_8859-7", "iso_8859-7:1987", "iso_8859-8", "iso_8859-8:1988", "iso_8859-9", "iso_8859-9:1989",
+        "koi", "koi8", "koi8-r", "koi8-ru", "koi8-u", "koi8_r", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "l1",
+        "l2", "l3", "l4", "l5", "l6", "l9", "latin1", "latin2", "latin3", "latin4", "latin5", "latin6", "logical", "mac", "macintosh",
+        "ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "sun_eu_greek", "tis-620", "unicode-1-1-utf-8", "us-ascii", "utf-16",
+        "utf-16-bom", "utf-16be", "utf-16be-bom", "utf-16le", "utf-16le-bom", "utf-8", "utf8", "visual", "windows-1250", "windows-1251",
+        "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "windows-31j",
+        "windows-874", "windows-949", "x-cp1250", "x-cp1251", "x-cp1252", "x-cp1253", "x-cp1254", "x-cp1255", "x-cp1256", "x-cp1257",
+        "x-cp1258", "x-euc-jp", "x-gbk", "x-mac-cyrillic", "x-mac-roman", "x-mac-ukrainian", "x-sjis", "x-x-big5"
+    )));
+
+    /**
+     * These need to be ordered so that the more generic formats come after the more specific ones
+     */
+    private static final List<LogStructureFinderFactory> ORDERED_STRUCTURE_FACTORIES = Collections.unmodifiableList(Arrays.asList(
+        new JsonLogStructureFinderFactory(),
+        new XmlLogStructureFinderFactory(),
+        // ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
+        new CsvLogStructureFinderFactory(),
+        new TsvLogStructureFinderFactory(),
+        new SemiColonSeparatedValuesLogStructureFinderFactory(),
+        new PipeSeparatedValuesLogStructureFinderFactory(),
+        new TextLogStructureFinderFactory()
+    ));
+
+    private static final int BUFFER_SIZE = 8192;
+
+    /**
+     * Given a stream of data from some log file, determine its structure.
+     * @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure?
+     *                             If the stream has fewer lines, an attempt will still be made, provided at
+     *                             least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read.
+     * @param fromFile A stream from which the sample will be read.
+     * @return A {@link LogStructureFinder} object from which the structure and messages can be queried.
+     * @throws Exception A variety of problems could occur at various stages of the structure finding process.
+     */
+    public LogStructureFinder findLogStructure(int idealSampleLineCount, InputStream fromFile) throws Exception {
+        return findLogStructure(new ArrayList<>(), idealSampleLineCount, fromFile);
+    }
+
+    public LogStructureFinder findLogStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
+        throws Exception {
+
+        CharsetMatch charsetMatch = findCharset(explanation, fromFile);
+        String charsetName = charsetMatch.getName();
+
+        Tuple<String, Boolean> sampleInfo = sampleFile(charsetMatch.getReader(), charsetName, MIN_SAMPLE_LINE_COUNT,
+            Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount));
+
+        return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2());
+    }
+
+    CharsetMatch findCharset(List<String> explanation, InputStream inputStream) throws Exception {
+
+        // We need an input stream that supports mark and reset, so wrap the argument
+        // in a BufferedInputStream if it doesn't already support this feature
+        if (inputStream.markSupported() == false) {
+            inputStream = new BufferedInputStream(inputStream, BUFFER_SIZE);
+        }
+
+        // This is from ICU4J
+        CharsetDetector charsetDetector = new CharsetDetector().setText(inputStream);
+        CharsetMatch[] charsetMatches = charsetDetector.detectAll();
+
+        // Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J
+        boolean pureAscii = true;
+        boolean containsZeroBytes = false;
+        inputStream.mark(BUFFER_SIZE);
+        byte[] workspace = new byte[BUFFER_SIZE];
+        int remainingLength = BUFFER_SIZE;
+        do {
+            int bytesRead = inputStream.read(workspace, 0, remainingLength);
+            if (bytesRead <= 0) {
+                break;
+            }
+            for (int i = 0; i < bytesRead && containsZeroBytes == false; ++i) {
+                if (workspace[i] == 0) {
+                    containsZeroBytes = true;
+                    pureAscii = false;
+                } else {
+                    pureAscii = pureAscii && workspace[i] > 0 && workspace[i] < 128;
+                }
+            }
+            remainingLength -= bytesRead;
+        } while (containsZeroBytes == false && remainingLength > 0);
+        inputStream.reset();
+
+        if (pureAscii) {
+            // If the input is pure ASCII then many single byte character sets will match.  We want to favour
+            // UTF-8 in this case, as it avoids putting a bold declaration of a dubious character set choice
+            // in the config files.
+            Optional<CharsetMatch> utf8CharsetMatch = Arrays.stream(charsetMatches)
+                .filter(charsetMatch -> StandardCharsets.UTF_8.name().equals(charsetMatch.getName())).findFirst();
+            if (utf8CharsetMatch.isPresent()) {
+                explanation.add("Using character encoding [" + StandardCharsets.UTF_8.name() +
+                    "], which matched the input with [" + utf8CharsetMatch.get().getConfidence() + "%] confidence - first [" +
+                    (BUFFER_SIZE / 1024) + "kB] of input was pure ASCII");
+                return utf8CharsetMatch.get();
+            }
+        }
+
+        // Input wasn't pure ASCII, so use the best matching character set that's supported by both Java and Go.
+        // Additionally, if the input contains zero bytes then avoid single byte character sets, as ICU4J will
+        // suggest these for binary files, and running the structure analysis on binary data would be
+        // both very slow and meaningless
+        for (CharsetMatch charsetMatch : charsetMatches) {
+            String name = charsetMatch.getName();
+            if (Charset.isSupported(name) && FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT))) {
+
+                // This extra test is to avoid trying to read binary files as text.  Running the log config
+                // deduction algorithms on binary files is very slow as the binary files generally appear to
+                // have very long lines.
+                boolean spaceEncodingContainsZeroByte = false;
+                byte[] spaceBytes = " ".getBytes(name);
+                for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) {
+                    spaceEncodingContainsZeroByte = (spaceBytes[i] == 0);
+                }
+                if (containsZeroBytes && spaceEncodingContainsZeroByte == false) {
+                    explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
+                        "%] confidence but was rejected as the input contains zero bytes and the [" + name + "] encoding does not");
+                } else {
+                    explanation.add("Using character encoding [" + name + "], which matched the input with [" +
+                        charsetMatch.getConfidence() + "%] confidence");
+                    return charsetMatch;
+                }
+            } else {
+                explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +
+                    "%] confidence but was rejected as it is not supported by [" +
+                    (Charset.isSupported(name) ? "Filebeat" : "the JVM") + "]");
+            }
+        }
+
+        throw new IllegalArgumentException("Could not determine a usable character encoding for the input" +
+            (containsZeroBytes ? " - could it be binary data?" : ""));
+    }
+
+    LogStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws Exception {
+
+        for (LogStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) {
+            if (factory.canCreateFromSample(explanation, sample)) {
+                return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker);
+            }
+        }
+        throw new IllegalArgumentException("Input did not match any known formats");
+    }
+
+    private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException {
+
+        int lineCount = 0;
+        BufferedReader bufferedReader = new BufferedReader(reader);
+        StringBuilder sample = new StringBuilder();
+
+        // Don't include any byte-order-marker in the sample.  (The logic to skip it works for both
+        // UTF-8 and UTF-16 assuming the character set of the reader was correctly detected.)
+        Boolean hasByteOrderMarker = null;
+        if (charsetName.toUpperCase(Locale.ROOT).startsWith("UTF")) {
+            int maybeByteOrderMarker = reader.read();
+            hasByteOrderMarker = ((char) maybeByteOrderMarker == '\uFEFF');
+            if (maybeByteOrderMarker >= 0 && hasByteOrderMarker == false && (char) maybeByteOrderMarker != '\r') {
+                sample.appendCodePoint(maybeByteOrderMarker);
+                if ((char) maybeByteOrderMarker == '\n') {
+                    ++lineCount;
+                }
+            }
+        }
+
+        String line;
+        while ((line = bufferedReader.readLine()) != null && ++lineCount <= maxLines) {
+            sample.append(line).append('\n');
+        }
+
+        if (lineCount < minLines) {
+            throw new IllegalArgumentException("Input contained too few lines to sample");
+        }
+
+        return new Tuple<>(sample.toString(), hasByteOrderMarker);
+    }
+}
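
End-to-end, the manager is the only entry point a caller needs. A sketch (the file path is invented; imports elided, and the enclosing method is assumed to declare throws Exception):

    LogStructureFinderManager manager = new LogStructureFinderManager();
    try (InputStream in = Files.newInputStream(Paths.get("/var/log/example.log"))) {
        LogStructureFinder finder = manager.findLogStructure(1000, in);
        LogStructure structure = finder.getStructure();
        // e.g. json, csv or semi_structured_text, plus the charset and timestamp deductions
        System.out.println(structure.getFormat() + " in " + structure.getCharset());
    }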

+ 238 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtils.java

@@ -0,0 +1,238 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.grok.Grok;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+final class LogStructureUtils {
+
+    static final String DEFAULT_TIMESTAMP_FIELD = "@timestamp";
+    static final String MAPPING_TYPE_SETTING = "type";
+    static final String MAPPING_FORMAT_SETTING = "format";
+    static final String MAPPING_PROPERTIES_SETTING = "properties";
+
+    // NUMBER Grok pattern doesn't support scientific notation, so we extend it
+    private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$");
+    private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$");
+    private static final int KEYWORD_MAX_LEN = 256;
+    private static final int KEYWORD_MAX_SPACES = 5;
+
+    private LogStructureUtils() {
+    }
+
+    /**
+     * Given one or more sample records, find a timestamp field that is consistently present in them all.
+     * To be returned the timestamp field:
+     * - Must exist in every record
+     * - Must have the same timestamp format in every record
+     * If multiple fields meet these criteria then the one that occurred first in the first sample record
+     * is chosen.
+     * @param explanation List of reasons for choosing the overall log structure.  This list
+     *                    may be non-empty when the method is called, and this method may
+     *                    append to it.
+     * @param sampleRecords List of records derived from the provided log sample.
+     * @return A tuple of (field name, timestamp format) if one can be found, or <code>null</code> if
+     *         there is no consistent timestamp.
+     */
+    static Tuple<String, TimestampMatch> guessTimestampField(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+        if (sampleRecords.isEmpty()) {
+            return null;
+        }
+
+        // Accept the first match from the first sample that is compatible with all the other samples
+        for (Tuple<String, TimestampMatch> candidate : findCandidates(explanation, sampleRecords)) {
+
+            boolean allGood = true;
+            for (Map<String, ?> sampleRecord : sampleRecords.subList(1, sampleRecords.size())) {
+                Object fieldValue = sampleRecord.get(candidate.v1());
+                if (fieldValue == null) {
+                    explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
+                        "] doesn't have field");
+                    allGood = false;
+                    break;
+                }
+
+                TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString());
+                if (match == null || match.candidateIndex != candidate.v2().candidateIndex) {
+                    explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
+                        "] matches differently: [" + match + "]");
+                    allGood = false;
+                    break;
+                }
+            }
+
+            if (allGood) {
+                explanation.add("Guessing timestamp field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]");
+                return candidate;
+            }
+        }
+
+        return null;
+    }
+
+    private static List<Tuple<String, TimestampMatch>> findCandidates(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+
+        List<Tuple<String, TimestampMatch>> candidates = new ArrayList<>();
+
+        // Get candidate timestamps from the first sample record
+        for (Map.Entry<String, ?> entry : sampleRecords.get(0).entrySet()) {
+            Object value = entry.getValue();
+            if (value != null) {
+                TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString());
+                if (match != null) {
+                    Tuple<String, TimestampMatch> candidate = new Tuple<>(entry.getKey(), match);
+                    candidates.add(candidate);
+                    explanation.add("First sample timestamp match [" + candidate + "]");
+                }
+            }
+        }
+
+        return candidates;
+    }
+
+    /**
+     * Given the sampled records, guess appropriate Elasticsearch mappings.
+     * @param explanation List of reasons for choosing the overall log structure.  This list
+     *                    may be non-empty when the method is called, and this method may
+     *                    append to it.
+     * @param sampleRecords The sampled records.
+     * @return A map of field name to mapping settings.
+     */
+    static SortedMap<String, Object> guessMappings(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+
+        SortedMap<String, Object> mappings = new TreeMap<>();
+
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            for (String fieldName : sampleRecord.keySet()) {
+                mappings.computeIfAbsent(fieldName, key -> guessMapping(explanation, fieldName,
+                    sampleRecords.stream().flatMap(record -> {
+                            Object fieldValue = record.get(fieldName);
+                            return (fieldValue == null) ? Stream.empty() : Stream.of(fieldValue);
+                        }
+                    ).collect(Collectors.toList())));
+            }
+        }
+
+        return mappings;
+    }
+
+    static Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {
+
+        if (fieldValues == null || fieldValues.isEmpty()) {
+            // We can get here if all the records that contained a given field had a null value for it.
+            // In this case it's best not to make any statement about what the mapping type should be.
+            return null;
+        }
+
+        if (fieldValues.stream().anyMatch(value -> value instanceof Map)) {
+            if (fieldValues.stream().allMatch(value -> value instanceof Map)) {
+                return Collections.singletonMap(MAPPING_TYPE_SETTING, "object");
+            }
+            throw new IllegalArgumentException("Field [" + fieldName +
+                "] has both object and non-object values - this is not supported by Elasticsearch");
+        }
+
+        if (fieldValues.stream().anyMatch(value -> value instanceof List || value instanceof Object[])) {
+            // Elasticsearch fields can be either arrays or single values, but array values must all have the same type
+            return guessMapping(explanation, fieldName,
+                fieldValues.stream().flatMap(LogStructureUtils::flatten).collect(Collectors.toList()));
+        }
+
+        return guessScalarMapping(explanation, fieldName, fieldValues.stream().map(Object::toString).collect(Collectors.toList()));
+    }
+
+    private static Stream<Object> flatten(Object value) {
+        if (value instanceof List) {
+            @SuppressWarnings("unchecked")
+            List<Object> objectList = (List<Object>) value;
+            return objectList.stream();
+        } else if (value instanceof Object[]) {
+            return Arrays.stream((Object[]) value);
+        } else {
+            return Stream.of(value);
+        }
+    }
+
+    /**
+     * Given some sample values for a field, guess the most appropriate index mapping for the
+     * field.
+     * @param explanation List of reasons for choosing the overall log structure.  This list
+     *                    may be non-empty when the method is called, and this method may
+     *                    append to it.
+     * @param fieldName Name of the field for which mappings are to be guessed.
+     * @param fieldValues Values of the field for which mappings are to be guessed.  The guessed
+     *                    mapping will be compatible with all the provided values.  Must not be
+     *                    empty.
+     * @return The sub-section of the index mappings most appropriate for the field,
+     *         for example <code>{ "type" : "keyword" }</code>.
+     */
+    static Map<String, String> guessScalarMapping(List<String> explanation, String fieldName, Collection<String> fieldValues) {
+
+        assert fieldValues.isEmpty() == false;
+
+        if (fieldValues.stream().allMatch(value -> "true".equals(value) || "false".equals(value))) {
+            return Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean");
+        }
+
+        // This checks if a date mapping would be appropriate, and, if so, finds the correct format
+        Iterator<String> iter = fieldValues.iterator();
+        TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(iter.next());
+        while (timestampMatch != null && iter.hasNext()) {
+            // To be mapped as type "date" all the values must match the same timestamp
+            // format - it is not enough for every value to be a date if the formats differ
+            if (timestampMatch.equals(TimestampFormatFinder.findFirstFullMatch(iter.next(), timestampMatch.candidateIndex)) == false) {
+                timestampMatch = null;
+            }
+        }
+        if (timestampMatch != null) {
+            return timestampMatch.getEsDateMappingTypeWithFormat();
+        }
+
+        if (fieldValues.stream().allMatch(NUMBER_GROK::match)) {
+            try {
+                fieldValues.forEach(Long::parseLong);
+                return Collections.singletonMap(MAPPING_TYPE_SETTING, "long");
+            } catch (NumberFormatException e) {
+                explanation.add("Rejecting type 'long' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
+            }
+            try {
+                fieldValues.forEach(Double::parseDouble);
+                return Collections.singletonMap(MAPPING_TYPE_SETTING, "double");
+            } catch (NumberFormatException e) {
+                explanation.add("Rejecting type 'double' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]");
+            }
+        } else if (fieldValues.stream().allMatch(IP_GROK::match)) {
+            return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip");
+        }
+
+        if (fieldValues.stream().anyMatch(LogStructureUtils::isMoreLikelyTextThanKeyword)) {
+            return Collections.singletonMap(MAPPING_TYPE_SETTING, "text");
+        }
+
+        return Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
+    }
+
+    /**
+     * The thinking is that the longer the field value and the more spaces it contains,
+     * the more likely it is that it should be indexed as text rather than keyword.
+     */
+    static boolean isMoreLikelyTextThanKeyword(String str) {
+        int length = str.length();
+        return length > KEYWORD_MAX_LEN || length - str.replaceAll("\\s", "").length() > KEYWORD_MAX_SPACES;
+    }
+}
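
To make the mapping heuristics above concrete, here is a minimal usage sketch (illustrative only: guessScalarMapping is package-private, so a real caller would sit in the same package, and the exact text/keyword outcome depends on the keyword length and space limits defined earlier in the class):

    List<String> explanation = new ArrayList<>();

    // Every value is "true" or "false", so boolean wins
    Map<String, String> flag = LogStructureUtils.guessScalarMapping(explanation, "flag", Arrays.asList("true", "false"));
    // => { "type" : "boolean" }

    // Every value parses as a long, so long is preferred over double and keyword
    Map<String, String> count = LogStructureUtils.guessScalarMapping(explanation, "count", Arrays.asList("42", "7"));
    // => { "type" : "long" }

    // A long value containing many spaces tips the heuristic towards text
    Map<String, String> msg = LogStructureUtils.guessScalarMapping(explanation, "message",
        Collections.singletonList("failed to connect to the remote host after several retries"));
    // => { "type" : "text" } once the space limit is exceeded, otherwise { "type" : "keyword" }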

+ 38 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactory.java

@@ -0,0 +1,38 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class PipeSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    private static final CsvPreference PIPE_PREFERENCE = new CsvPreference.Builder('"', '|', "\n").build();
+
+    /**
+     * Rules are:
+     * - The file must be valid pipe (<code>|</code>) separated values
+     * - It must contain at least two complete records
+     * - There must be at least five fields per record (otherwise files containing only
+     *   coincidental pipe characters, or none at all, could be treated as pipe separated)
+     * - Every pipe separated value record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 5, PIPE_PREFERENCE, "pipe separated values");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            PIPE_PREFERENCE, true);
+    }
+}
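
A rough caller sketch for the factory contract (illustrative; the real orchestration lives in LogStructureFinderManager, the sample text is made up, and IOException handling is elided):

    List<String> explanation = new ArrayList<>();
    LogStructureFinderFactory factory = new PipeSeparatedValuesLogStructureFinderFactory();
    String sample = "id|name|value|unit|time\n1|temp|23.4|C|2018-05-17T16:23:40Z\n2|temp|23.6|C|2018-05-17T16:24:40Z\n";
    if (factory.canCreateFromSample(explanation, sample)) {
        LogStructureFinder finder = factory.createFromSample(explanation, sample, "UTF-8", Boolean.FALSE);
        LogStructure structure = finder.getStructure(); // delimiter, header, timestamp and mapping guesses
    }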

+ 37 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactory.java

@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class SemiColonSeparatedValuesLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * Rules are:
+     * - The file must be valid semi-colon separated values
+     * - It must contain at least two complete records
+     * - There must be at least four fields per record (otherwise files containing only
+     *   coincidental semi-colons, or none at all, could be treated as semi-colon separated)
+     * - Every semi-colon separated value record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 4,
+            CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, "semi-colon separated values");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE, false);
+    }
+}
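
For comparison with the pipe factory above, this one reuses a built-in Super CSV preference rather than building its own. A hand-built equivalent would look roughly like this (illustrative; the built-in constant uses "\r\n" as its end-of-line symbols):

    CsvPreference semiColonPreference = new CsvPreference.Builder('"', ';', "\r\n").build();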

+ 486 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinder.java

@@ -0,0 +1,486 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+import org.supercsv.exception.SuperCsvException;
+import org.supercsv.io.CsvListReader;
+import org.supercsv.prefs.CsvPreference;
+import org.supercsv.util.Util;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.DoubleSummaryStatistics;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+import java.util.SortedMap;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+public class SeparatedValuesLogStructureFinder implements LogStructureFinder {
+
+    private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static SeparatedValuesLogStructureFinder makeSeparatedValuesLogStructureFinder(List<String> explanation, String sample,
+                                                                                   String charsetName, Boolean hasByteOrderMarker,
+                                                                                   CsvPreference csvPreference, boolean trimFields)
+        throws IOException {
+
+        Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
+        List<List<String>> rows = parsed.v1();
+        List<Integer> lineNumbers = parsed.v2();
+
+        Tuple<Boolean, String[]> headerInfo = findHeaderFromSample(explanation, rows);
+        boolean isHeaderInFile = headerInfo.v1();
+        String[] header = headerInfo.v2();
+        String[] headerWithNamedBlanks = new String[header.length];
+        for (int i = 0; i < header.length; ++i) {
+            String rawHeader = header[i].isEmpty() ? "column" + (i + 1) : header[i];
+            headerWithNamedBlanks[i] = trimFields ? rawHeader.trim() : rawHeader;
+        }
+
+        List<String> sampleLines = Arrays.asList(sample.split("\n"));
+        List<String> sampleMessages = new ArrayList<>();
+        List<Map<String, ?>> sampleRecords = new ArrayList<>();
+        int prevMessageEndLineNumber = isHeaderInFile ? lineNumbers.get(0) : -1;
+        for (int index = isHeaderInFile ? 1 : 0; index < rows.size(); ++index) {
+            List<String> row = rows.get(index);
+            int lineNumber = lineNumbers.get(index);
+            Map<String, String> sampleRecord = new LinkedHashMap<>();
+            Util.filterListToMap(sampleRecord, headerWithNamedBlanks,
+                trimFields ? row.stream().map(String::trim).collect(Collectors.toList()) : row);
+            sampleRecords.add(sampleRecord);
+            sampleMessages.add(
+                sampleLines.subList(prevMessageEndLineNumber + 1, lineNumbers.get(index)).stream().collect(Collectors.joining("\n")));
+            prevMessageEndLineNumber = lineNumber;
+        }
+
+        String preamble = Pattern.compile("\n").splitAsStream(sample).limit(lineNumbers.get(1)).collect(Collectors.joining("\n", "", "\n"));
+
+        char delimiter = (char) csvPreference.getDelimiterChar();
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.fromSeparator(delimiter))
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(preamble)
+            .setNumLinesAnalyzed(lineNumbers.get(lineNumbers.size() - 1))
+            .setNumMessagesAnalyzed(sampleRecords.size())
+            .setHasHeaderRow(isHeaderInFile)
+            .setInputFields(Arrays.stream(headerWithNamedBlanks).collect(Collectors.toList()));
+
+        if (trimFields) {
+            structureBuilder.setShouldTrimFields(true);
+        }
+
+        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
+        if (timeField != null) {
+            String timeLineRegex = null;
+            StringBuilder builder = new StringBuilder("^");
+            // We make the assumption that the timestamp will be on the first line of each record.  Therefore, if the
+            // timestamp is the last column then either our assumption is wrong (and the approach will completely
+            // break down) or else every record is on a single line and there's no point creating a multiline config.
+            // This is why the loop excludes the last column.
+            for (String column : Arrays.asList(header).subList(0, header.length - 1)) {
+                if (timeField.v1().equals(column)) {
+                    builder.append("\"?");
+                    String simpleTimePattern = timeField.v2().simplePattern.pattern();
+                    builder.append(simpleTimePattern.startsWith("\\b") ? simpleTimePattern.substring(2) : simpleTimePattern);
+                    timeLineRegex = builder.toString();
+                    break;
+                } else {
+                    builder.append(".*?");
+                    if (delimiter == '\t') {
+                        builder.append("\\t");
+                    } else {
+                        builder.append(delimiter);
+                    }
+                }
+            }
+
+            if (isHeaderInFile) {
+                structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
+                    .map(column -> "\"?" + column.replace("\"", "\"\"").replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + "\"?")
+                    .collect(Collectors.joining(",")));
+            }
+
+            structureBuilder.setTimestampField(timeField.v1())
+                .setTimestampFormats(timeField.v2().dateFormats)
+                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing())
+                .setMultilineStartPattern(timeLineRegex);
+        }
+
+        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        LogStructure structure = structureBuilder
+            .setMappings(mappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new SeparatedValuesLogStructureFinder(sampleMessages, structure);
+    }
+
+    private SeparatedValuesLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+
+    static Tuple<List<List<String>>, List<Integer>> readRows(String sample, CsvPreference csvPreference) throws IOException {
+
+        int fieldsInFirstRow = -1;
+
+        List<List<String>> rows = new ArrayList<>();
+        List<Integer> lineNumbers = new ArrayList<>();
+
+        try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {
+
+            try {
+                List<String> row;
+                while ((row = csvReader.read()) != null) {
+                    if (fieldsInFirstRow < 0) {
+                        fieldsInFirstRow = row.size();
+                    } else {
+                        // Tolerate extra columns if and only if they're empty
+                        while (row.size() > fieldsInFirstRow && row.get(row.size() - 1) == null) {
+                            row.remove(row.size() - 1);
+                        }
+                    }
+                    rows.add(row);
+                    lineNumbers.add(csvReader.getLineNumber());
+                }
+            } catch (SuperCsvException e) {
+                // Tolerate an incomplete last row
+                if (notUnexpectedEndOfFile(e)) {
+                    throw e;
+                }
+            }
+        }
+
+        assert rows.isEmpty() == false;
+        assert lineNumbers.size() == rows.size();
+
+        if (rows.get(0).size() != rows.get(rows.size() - 1).size()) {
+            rows.remove(rows.size() - 1);
+            lineNumbers.remove(lineNumbers.size() - 1);
+        }
+
+        // This should have been enforced by canCreateFromSample()
+        assert rows.size() > 1;
+
+        return new Tuple<>(rows, lineNumbers);
+    }
+
+    static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, List<List<String>> rows) {
+
+        assert rows.isEmpty() == false;
+
+        List<String> firstRow = rows.get(0);
+
+        boolean isHeaderInFile = true;
+        if (rowContainsDuplicateNonEmptyValues(firstRow)) {
+            isHeaderInFile = false;
+            explanation.add("First row contains duplicate values, so assuming it's not a header");
+        } else {
+            if (rows.size() < 3) {
+                explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
+            } else {
+                isHeaderInFile = isFirstRowUnusual(explanation, rows);
+            }
+        }
+
+        if (isHeaderInFile) {
+            // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us
+            return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new));
+        } else {
+            return new Tuple<>(false, IntStream.rangeClosed(1, firstRow.size()).mapToObj(num -> "column" + num).toArray(String[]::new));
+        }
+    }
+
+    static boolean rowContainsDuplicateNonEmptyValues(List<String> row) {
+
+        HashSet<String> values = new HashSet<>();
+
+        for (String value : row) {
+            if (value != null && value.isEmpty() == false && values.add(value) == false) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    private static boolean isFirstRowUnusual(List<String> explanation, List<List<String>> rows) {
+
+        assert rows.size() >= 3;
+
+        List<String> firstRow = rows.get(0);
+        String firstRowStr = firstRow.stream().map(field -> (field == null) ? "" : field).collect(Collectors.joining(""));
+        List<List<String>> otherRows = rows.subList(1, rows.size());
+        List<String> otherRowStrs = new ArrayList<>();
+        for (List<String> row : otherRows) {
+            otherRowStrs.add(row.stream().map(str -> (str == null) ? "" : str).collect(Collectors.joining("")));
+        }
+
+        // Check lengths
+
+        double firstRowLength = firstRowStr.length();
+        DoubleSummaryStatistics otherRowStats = otherRowStrs.stream().mapToDouble(otherRow -> (double) otherRow.length())
+            .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);
+
+        double otherLengthRange = otherRowStats.getMax() - otherRowStats.getMin();
+        if (firstRowLength < otherRowStats.getMin() - otherLengthRange / 10.0 ||
+            firstRowLength > otherRowStats.getMax() + otherLengthRange / 10.0) {
+            explanation.add("First row is unusual based on length test: [" + firstRowLength + "] and [" +
+                toNiceString(otherRowStats) + "]");
+            return true;
+        }
+
+        explanation.add("First row is not unusual based on length test: [" + firstRowLength + "] and [" +
+            toNiceString(otherRowStats) + "]");
+
+        // Check edit distances
+
+        DoubleSummaryStatistics firstRowStats = otherRows.stream().limit(MAX_LEVENSHTEIN_COMPARISONS)
+            .mapToDouble(otherRow -> (double) levenshteinFieldwiseCompareRows(firstRow, otherRow))
+            .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine);
+
+        otherRowStats = new DoubleSummaryStatistics();
+        int numComparisons = 0;
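+        // Choose a stride that spreads the sampled pairs through the whole set of rows;
+        // the loop conditions below cap the total work at MAX_LEVENSHTEIN_COMPARISONS
+        // comparisons, and seeding the Random from the first row keeps results reproducible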
+        int proportion = otherRowStrs.size() / MAX_LEVENSHTEIN_COMPARISONS;
+        int innerIncrement = 1 + proportion * proportion;
+        Random random = new Random(firstRow.hashCode());
+        for (int i = 0; numComparisons < MAX_LEVENSHTEIN_COMPARISONS && i < otherRowStrs.size(); ++i) {
+            for (int j = i + 1 + random.nextInt(innerIncrement); numComparisons < MAX_LEVENSHTEIN_COMPARISONS && j < otherRowStrs.size();
+                 j += innerIncrement) {
+                otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j)));
+                ++numComparisons;
+            }
+        }
+
+        if (firstRowStats.getAverage() > otherRowStats.getAverage() * 1.2) {
+            explanation.add("First row is unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
+                "] and [" + toNiceString(otherRowStats) + "]");
+            return true;
+        }
+
+        explanation.add("First row is not unusual based on Levenshtein test [" + toNiceString(firstRowStats) +
+            "] and [" + toNiceString(otherRowStats) + "]");
+
+        return false;
+    }
+
+    private static String toNiceString(DoubleSummaryStatistics stats) {
+        return String.format(Locale.ROOT, "count=%d, min=%f, average=%f, max=%f", stats.getCount(), stats.getMin(), stats.getAverage(),
+            stats.getMax());
+    }
+
+    /**
+     * Sum of the Levenshtein distances between corresponding elements
+     * in the two supplied lists _excluding_ the biggest difference.
+     * The reason the biggest difference is excluded is that sometimes
+     * there's a "message" field that is much longer than any of the other
+     * fields, varies enormously between rows, and skews the comparison.
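+     * For example (illustrative), comparing ["foo", "2018-05-17", "a message"] with
+     * ["bar", "2018-05-18", "another message"] gives per-field distances [3, 1, 6];
+     * the largest (6) is excluded, so the result is 4.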
+     */
+    static int levenshteinFieldwiseCompareRows(List<String> firstRow, List<String> secondRow) {
+
+        int largestSize = Math.max(firstRow.size(), secondRow.size());
+        if (largestSize <= 1) {
+            return 0;
+        }
+
+        int[] distances = new int[largestSize];
+
+        for (int index = 0; index < largestSize; ++index) {
+            distances[index] = levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "",
+                (index < secondRow.size()) ? secondRow.get(index) : "");
+        }
+
+        Arrays.sort(distances);
+
+        return IntStream.of(distances).limit(distances.length - 1).sum();
+    }
+
+    /**
+     * This method implements the simple algorithm for calculating Levenshtein distance.
+     */
+    static int levenshteinDistance(String first, String second) {
+
+        // There are some examples with pretty pictures of the matrix on Wikipedia here:
+        // http://en.wikipedia.org/wiki/Levenshtein_distance
+
+        int firstLen = (first == null) ? 0 : first.length();
+        int secondLen = (second == null) ? 0 : second.length();
+        if (firstLen == 0) {
+            return secondLen;
+        }
+        if (secondLen == 0) {
+            return firstLen;
+        }
+
+        int[] currentCol = new int[secondLen + 1];
+        int[] prevCol = new int[secondLen + 1];
+
+        // Populate the left column
+        for (int down = 0; down <= secondLen; ++down) {
+            currentCol[down] = down;
+        }
+
+        // Calculate the other entries in the matrix
+        for (int across = 1; across <= firstLen; ++across) {
+            int[] tmp = prevCol;
+            prevCol = currentCol;
+            // We could allocate a new array for currentCol here, but it's more efficient to reuse the one that's now redundant
+            currentCol = tmp;
+
+            currentCol[0] = across;
+
+            for (int down = 1; down <= secondLen; ++down) {
+
+                // Do the strings differ at the point we've reached?
+                if (first.charAt(across - 1) == second.charAt(down - 1)) {
+
+                    // No, they're the same => no extra cost
+                    currentCol[down] = prevCol[down - 1];
+                } else {
+                    // Yes, they differ, so there are 3 options:
+
+                    // 1) Deletion => cell to the left's value plus 1
+                    int option1 = prevCol[down];
+
+                    // 2) Insertion => cell above's value plus 1
+                    int option2 = currentCol[down - 1];
+
+                    // 3) Substitution => cell above left's value plus 1
+                    int option3 = prevCol[down - 1];
+
+                    // Take the cheapest option of the 3
+                    currentCol[down] = Math.min(Math.min(option1, option2), option3) + 1;
+                }
+            }
+        }
+
+        // Result is the value in the bottom right hand corner of the matrix
+        return currentCol[secondLen];
+    }
+
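+    // For example (illustrative), with standard CSV preferences the line
+    // 2018-05-17,a"b,c contains an unescaped quote in the middle of a field,
+    // whereas 2018-05-17,"a,b",c does not, because its quotes are adjacent to delimiters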
+    static boolean lineHasUnescapedQuote(String line, CsvPreference csvPreference) {
+        char quote = csvPreference.getQuoteChar();
+        String lineWithEscapedQuotesRemoved = line.replace(String.valueOf(quote) + quote, "");
+        for (int index = 1; index < lineWithEscapedQuotesRemoved.length() - 1; ++index) {
+            if (lineWithEscapedQuotesRemoved.charAt(index) == quote &&
+                lineWithEscapedQuotesRemoved.codePointAt(index - 1) != csvPreference.getDelimiterChar() &&
+                lineWithEscapedQuotesRemoved.codePointAt(index + 1) != csvPreference.getDelimiterChar()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static boolean canCreateFromSample(List<String> explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference,
+                                       String formatName) {
+
+        // Logstash's CSV parser won't tolerate fields where just part of the
+        // value is quoted, whereas SuperCSV will, hence this extra check
+        String[] sampleLines = sample.split("\n");
+        for (String sampleLine : sampleLines) {
+            if (lineHasUnescapedQuote(sampleLine, csvPreference)) {
+                explanation.add("Not " + formatName +
+                    " because a line has an unescaped quote that is not at the beginning or end of a field: [" + sampleLine + "]");
+                return false;
+            }
+        }
+
+        try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {
+
+            int fieldsInFirstRow = -1;
+            int fieldsInLastRow = -1;
+
+            int numberOfRows = 0;
+            try {
+                List<String> row;
+                while ((row = csvReader.read()) != null) {
+
+                    int fieldsInThisRow = row.size();
+                    ++numberOfRows;
+                    if (fieldsInFirstRow < 0) {
+                        fieldsInFirstRow = fieldsInThisRow;
+                        if (fieldsInFirstRow < minFieldsPerRow) {
+                            explanation.add("Not " + formatName + " because the first row has fewer than [" + minFieldsPerRow +
+                                "] fields: [" + fieldsInFirstRow + "]");
+                            return false;
+                        }
+                        fieldsInLastRow = fieldsInFirstRow;
+                        continue;
+                    }
+
+                    // Tolerate extra columns if and only if they're empty
+                    while (fieldsInThisRow > fieldsInFirstRow && row.get(fieldsInThisRow - 1) == null) {
+                        --fieldsInThisRow;
+                    }
+
+                    if (fieldsInLastRow != fieldsInFirstRow) {
+                        explanation.add("Not " + formatName + " because row [" + (numberOfRows - 1) +
+                            "] has a different number of fields to the first row: [" + fieldsInFirstRow + "] and [" +
+                            fieldsInLastRow + "]");
+                        return false;
+                    }
+
+                    fieldsInLastRow = fieldsInThisRow;
+                }
+
+                if (fieldsInLastRow > fieldsInFirstRow) {
+                    explanation.add("Not " + formatName + " because last row has more fields than first row: [" + fieldsInFirstRow +
+                        "] and [" + fieldsInLastRow + "]");
+                    return false;
+                }
+                if (fieldsInLastRow < fieldsInFirstRow) {
+                    --numberOfRows;
+                }
+            } catch (SuperCsvException e) {
+                // Tolerate an incomplete last row
+                if (notUnexpectedEndOfFile(e)) {
+                    explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
+                    return false;
+                }
+            }
+            if (numberOfRows <= 1) {
+                explanation.add("Not " + formatName + " because fewer than 2 complete records in sample: [" + numberOfRows + "]");
+                return false;
+            }
+            explanation.add("Deciding sample is " + formatName);
+            return true;
+
+        } catch (IOException e) {
+            explanation.add("Not " + formatName + " because there was a parsing exception: [" + e.getMessage() + "]");
+            return false;
+        }
+    }
+
+    private static boolean notUnexpectedEndOfFile(SuperCsvException e) {
+        return e.getMessage().startsWith("unexpected end of file while reading quoted column") == false;
+    }
+}
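
A small sketch of how readRows tolerates a truncated final line (illustrative; readRows is package-private and throws IOException, which is elided here):

    Tuple<List<List<String>>, List<Integer>> parsed = SeparatedValuesLogStructureFinder.readRows(
        "a,b,c,d,e\n1,2,3,4,5\n6,7", CsvPreference.STANDARD_PREFERENCE);
    // parsed.v1() holds the two complete five-field rows; the trailing "6,7" stub is
    // dropped because its field count differs from that of the first row.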

+ 201 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinder.java

@@ -0,0 +1,201 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Pattern;
+
+public class TextLogStructureFinder implements LogStructureFinder {
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static TextLogStructureFinder makeTextLogStructureFinder(List<String> explanation, String sample, String charsetName,
+                                                             Boolean hasByteOrderMarker) {
+
+        String[] sampleLines = sample.split("\n");
+        Tuple<TimestampMatch, Set<String>> bestTimestamp = mostLikelyTimestamp(sampleLines);
+        if (bestTimestamp == null) {
+            // Is it appropriate to treat a file that is neither structured nor has
+            // a regular pattern of timestamps as a log file?  Probably not...
+            throw new IllegalArgumentException("Could not find a timestamp in the log sample provided");
+        }
+
+        explanation.add("Most likely timestamp format is [" + bestTimestamp.v1() + "]");
+
+        List<String> sampleMessages = new ArrayList<>();
+        StringBuilder preamble = new StringBuilder();
+        int linesConsumed = 0;
+        StringBuilder message = null;
+        int linesInMessage = 0;
+        String multiLineRegex = createMultiLineMessageStartRegex(bestTimestamp.v2(), bestTimestamp.v1().simplePattern.pattern());
+        Pattern multiLinePattern = Pattern.compile(multiLineRegex);
+        for (String sampleLine : sampleLines) {
+            if (multiLinePattern.matcher(sampleLine).find()) {
+                if (message != null) {
+                    sampleMessages.add(message.toString());
+                    linesConsumed += linesInMessage;
+                }
+                message = new StringBuilder(sampleLine);
+                linesInMessage = 1;
+            } else {
+                // If message is null here then the sample probably began with the incomplete ending of a previous message
+                if (message == null) {
+                    // We count lines before the first message as consumed (just like we would
+                    // for the CSV header or lines before the first XML document starts)
+                    ++linesConsumed;
+                } else {
+                    message.append('\n').append(sampleLine);
+                    ++linesInMessage;
+                }
+            }
+            if (sampleMessages.size() < 2) {
+                preamble.append(sampleLine).append('\n');
+            }
+        }
+        // Don't add the last message, as it might be partial and mess up subsequent pattern finding
+
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.SEMI_STRUCTURED_TEXT)
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(preamble.toString())
+            .setNumLinesAnalyzed(linesConsumed)
+            .setNumMessagesAnalyzed(sampleMessages.size())
+            .setMultilineStartPattern(multiLineRegex);
+
+        SortedMap<String, Object> mappings = new TreeMap<>();
+        mappings.put("message", Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"));
+        mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove
+        String interimTimestampField;
+        String grokPattern;
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+        Tuple<String, String> timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern();
+        if (timestampFieldAndFullMatchGrokPattern != null) {
+            interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1();
+            grokPattern = timestampFieldAndFullMatchGrokPattern.v2();
+        } else {
+            interimTimestampField = "timestamp";
+            grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField);
+        }
+
+        LogStructure structure = structureBuilder
+            .setTimestampField(interimTimestampField)
+            .setTimestampFormats(bestTimestamp.v1().dateFormats)
+            .setNeedClientTimezone(bestTimestamp.v1().hasTimezoneDependentParsing())
+            .setGrokPattern(grokPattern)
+            .setMappings(mappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new TextLogStructureFinder(sampleMessages, structure);
+    }
+
+    private TextLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+
+    static Tuple<TimestampMatch, Set<String>> mostLikelyTimestamp(String[] sampleLines) {
+
+        Map<TimestampMatch, Tuple<Double, Set<String>>> timestampMatches = new LinkedHashMap<>();
+
+        int remainingLines = sampleLines.length;
+        double differenceBetweenTwoHighestWeights = 0.0;
+        for (String sampleLine : sampleLines) {
+            TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine);
+            if (match != null) {
+                TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern,
+                    match.grokPatternName, "");
+                timestampMatches.compute(pureMatch, (k, v) -> {
+                    if (v == null) {
+                        return new Tuple<>(weightForMatch(match.preface), new HashSet<>(Collections.singletonList(match.preface)));
+                    } else {
+                        v.v2().add(match.preface);
+                        return new Tuple<>(v.v1() + weightForMatch(match.preface), v.v2());
+                    }
+                });
+                differenceBetweenTwoHighestWeights = findDifferenceBetweenTwoHighestWeights(timestampMatches.values());
+            }
+            // The highest possible weight is 1, so if the difference between the two highest weights
+            // is greater than the number of lines remaining then the leader cannot possibly be overtaken
+            if (differenceBetweenTwoHighestWeights > --remainingLines) {
+                break;
+            }
+        }
+
+        double highestWeight = 0.0;
+        Tuple<TimestampMatch, Set<String>> highestWeightMatch = null;
+        for (Map.Entry<TimestampMatch, Tuple<Double, Set<String>>> entry : timestampMatches.entrySet()) {
+            double weight = entry.getValue().v1();
+            if (weight > highestWeight) {
+                highestWeight = weight;
+                highestWeightMatch = new Tuple<>(entry.getKey(), entry.getValue().v2());
+            }
+        }
+        return highestWeightMatch;
+    }
+
+    /**
+     * Used to weight a timestamp match according to how far along the line it is found.
+     * Timestamps at the very beginning of the line are given a weight of 1.  The weight
+     * progressively decreases the more text there is preceding the timestamp match, but
+     * is always greater than 0.
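+     * For example, an empty preface gives a weight of 1.0, while a preface of
+     * 15 characters gives roughly 0.47.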
+     * @return A weight in the range (0, 1].
+     */
+    private static double weightForMatch(String preface) {
+        return Math.pow(1.0 + preface.length() / 15.0, -1.1);
+    }
+
+    private static double findDifferenceBetweenTwoHighestWeights(Collection<Tuple<Double, Set<String>>> timestampMatches) {
+        double highestWeight = 0.0;
+        double secondHighestWeight = 0.0;
+        for (Tuple<Double, Set<String>> timestampMatch : timestampMatches) {
+            double weight = timestampMatch.v1();
+            if (weight > highestWeight) {
+                secondHighestWeight = highestWeight;
+                highestWeight = weight;
+            } else if (weight > secondHighestWeight) {
+                secondHighestWeight = weight;
+            }
+        }
+        return highestWeight - secondHighestWeight;
+    }
+
+    static String createMultiLineMessageStartRegex(Collection<String> prefaces, String timestampRegex) {
+
+        StringBuilder builder = new StringBuilder("^");
+        GrokPatternCreator.addIntermediateRegex(builder, prefaces);
+        builder.append(timestampRegex);
+        if (builder.substring(0, 3).equals("^\\b")) {
+            builder.delete(1, 3);
+        }
+        return builder.toString();
+    }
+}
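
The message-splitting loop above boils down to the following sketch (illustrative; the hard-coded start pattern stands in for the regex that createMultiLineMessageStartRegex derives from the sample):

    Pattern start = Pattern.compile("^\\d{4}-\\d{2}-\\d{2} ");
    String sample = "2018-05-17 16:23:40 first event\n    at SomeClass.method()\n2018-05-17 16:23:41 second event\n";
    List<String> messages = new ArrayList<>();
    StringBuilder current = null;
    for (String line : sample.split("\n")) {
        if (start.matcher(line).find()) {
            if (current != null) {
                messages.add(current.toString());
            }
            current = new StringBuilder(line);
        } else if (current != null) {
            current.append('\n').append(line);
        }
    }
    // As in makeTextLogStructureFinder, the final (possibly truncated) message is never
    // added, so messages ends up holding just the first event and its continuation line.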

+ 39 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactory.java

@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.List;
+import java.util.regex.Pattern;
+
+public class TextLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    // This works because, by default, dot doesn't match newlines
+    private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+.");
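+    // For example, "a\nb" and "a\n\nb" match (a non-newline character on either
+    // side of one or more newlines), but "a\n" and "\n\n" do not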
+
+    /**
+     * This format matches if the sample contains at least one newline and at least two
+     * non-blank lines.
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        if (sample.indexOf('\n') < 0) {
+            explanation.add("Not text because sample contains no newlines");
+            return false;
+        }
+        if (TWO_NON_BLANK_LINES_PATTERN.matcher(sample).find() == false) {
+            explanation.add("Not text because sample contains fewer than two non-blank lines");
+            return false;
+        }
+
+        explanation.add("Deciding sample is text");
+        return true;
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker) {
+        return TextLogStructureFinder.makeTextLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    }
+}

+ 427 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinder.java

@@ -0,0 +1,427 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.grok.Grok;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Used to find the best timestamp format for one of the following situations:
+ * 1. Matching an entire field value
+ * 2. Matching a timestamp found somewhere within a message
+ */
+public final class TimestampFormatFinder {
+
+    private static final String PREFACE = "preface";
+    private static final String EPILOGUE = "epilogue";
+
+    private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([:.,])(\\d{3,9})");
+    private static final char DEFAULT_FRACTIONAL_SECOND_SEPARATOR = ',';
+
+    /**
+     * The timestamp patterns are complex and it can be slow to prove they do not
+     * match anywhere in a long message.  Many of the timestamps are similar and
+     * will never be found in a string if simpler sub-patterns do not exist in the
+     * string.  These sub-patterns can be used to quickly rule out multiple complex
+     * patterns.  These patterns do not need to represent quantities that are
+     * useful to know the value of, merely character sequences that can be used to
+     * prove that <em>several</em> more complex patterns cannot possibly match.
+     */
+    private static final List<Pattern> QUICK_RULE_OUT_PATTERNS = Arrays.asList(
+        // YYYY-MM-dd followed by a space
+        Pattern.compile("\\b\\d{4}-\\d{2}-\\d{2} "),
+        // The end of some number (likely year or day) followed by a space then HH:mm
+        Pattern.compile("\\d \\d{2}:\\d{2}\\b"),
+        // HH:mm:ss surrounded by spaces
+        Pattern.compile(" \\d{2}:\\d{2}:\\d{2} ")
+    );
+
+    /**
+     * The first match in this list will be chosen, so it needs to be ordered
+     * such that more generic patterns come after more specific patterns.
+     */
+    static final List<CandidateTimestampFormat> ORDERED_CANDIDATE_FORMATS = Arrays.asList(
+        // The TOMCAT_DATESTAMP format has to come before ISO8601 because it's basically ISO8601 but
+        // with a space before the timezone, and because the timezone is optional in ISO8601 a
+        // TOMCAT_DATESTAMP value would be recognised as ISO8601 with the timezone omitted if
+        // ISO8601 were checked first
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS Z", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b20\\d{2}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9} (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
+            "TOMCAT_DATESTAMP", Arrays.asList(0, 1)),
+        // The Elasticsearch ISO8601 parser requires a literal T between the date and time, so
+        // longhand formats are needed if there's a space instead
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b",
+            "TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b",
+            "TIMESTAMP_ISO8601", Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)(?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZZ", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[+-]%{HOUR}:%{MINUTE}\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+            "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)\\b", "TIMESTAMP_ISO8601",
+            Arrays.asList(0, 1)),
+        new CandidateTimestampFormat("ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b",
+            "TIMESTAMP_ISO8601"),
+        new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm:ss zzz",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ}\\b", "DATESTAMP_RFC822", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm zzz", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE} %{TZ}\\b", "DATESTAMP_RFC822", Collections.singletonList(1)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss ZZ",
+            "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}:%{MINUTE})\\b",
+            "DATESTAMP_RFC2822", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss Z",
+            "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}%{MINUTE})\\b",
+            "DATESTAMP_RFC2822", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm ZZ", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", "DATESTAMP_RFC2822",
+            Collections.singletonList(1)),
+        new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm Z", "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ",
+            "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "DATESTAMP_RFC2822",
+            Collections.singletonList(1)),
+        new CandidateTimestampFormat("EEE MMM dd HH:mm:ss zzz YYYY",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER",
+            Arrays.asList(1, 2)),
+        new CandidateTimestampFormat("EEE MMM dd HH:mm zzz YYYY",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE} %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", Collections.singletonList(1)),
+        new CandidateTimestampFormat("YYYYMMddHHmmss", "\\b\\d{14}\\b",
+            "\\b20\\d{2}%{MONTHNUM2}(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01]))(?:2[0123]|[01][0-9])%{MINUTE}(?:[0-5][0-9]|60)\\b",
+            "DATESTAMP_EVENTLOG"),
+        new CandidateTimestampFormat("EEE MMM dd HH:mm:ss YYYY",
+            "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b",
+            "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", Arrays.asList(1, 2)),
+        new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM  d HH:mm:ss,SSS"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2},\\d{3}",
+            "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "SYSLOGTIMESTAMP",
+            Collections.singletonList(1)),
+        new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b",
+            "SYSLOGTIMESTAMP", Collections.singletonList(1)),
+        new CandidateTimestampFormat("dd/MMM/YYYY:HH:mm:ss Z", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ",
+            "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE"),
+        new CandidateTimestampFormat("MMM dd, YYYY K:mm:ss a", "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b",
+            "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP"),
+        new CandidateTimestampFormat(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b",
+            "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", Collections.singletonList(1)),
+        new CandidateTimestampFormat("UNIX_MS", "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT"),
+        new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "\\b\\d{10}\\.(?:\\d{3}){1,3}\\b", "NUMBER"),
+        new CandidateTimestampFormat("UNIX", "\\b\\d{10}\\b", "\\b\\d{10}\\b", "POSINT"),
+        new CandidateTimestampFormat("TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM")
+    );
+
+    private TimestampFormatFinder() {
+    }
+
+    /**
+     * Find the first timestamp format that matches part of the supplied value.
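+     * For example, given the line <code>[2018-05-17T16:23:40,422][INFO ] starting</code>,
+     * the ISO8601 candidate is the first to match, giving a preface of <code>[</code>
+     * and an epilogue of <code>][INFO ] starting</code>.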
+     * @param text The value that the returned timestamp format must exist within.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstMatch(String text) {
+        return findFirstMatch(text, 0);
+    }
+
+    /**
+     * Find the first timestamp format that matches part of the supplied value,
+     * excluding a specified number of candidate formats.
+     * @param text The value that the returned timestamp format must exist within.
+     * @param ignoreCandidates The number of candidate formats to exclude from the search.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) {
+        Boolean[] quickRuleOutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()];
+        int index = ignoreCandidates;
+        for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
+            boolean quicklyRuledOut = false;
+            for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) {
+                if (quickRuleOutMatches[quickRuleOutIndex] == null) {
+                    quickRuleOutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find();
+                }
+                if (quickRuleOutMatches[quickRuleOutIndex] == false) {
+                    quicklyRuledOut = true;
+                    break;
+                }
+            }
+            if (quicklyRuledOut == false) {
+                Map<String, Object> captures = candidate.strictSearchGrok.captures(text);
+                if (captures != null) {
+                    String preface = captures.getOrDefault(PREFACE, "").toString();
+                    String epilogue = captures.getOrDefault(EPILOGUE, "").toString();
+                    return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(),
+                        text.length() - epilogue.length()), epilogue);
+                }
+            }
+            ++index;
+        }
+        return null;
+    }
+
+    /**
+     * Find the best timestamp format for matching an entire field value.
+     * @param text The value that the returned timestamp format must match in its entirety.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstFullMatch(String text) {
+        return findFirstFullMatch(text, 0);
+    }
+
+    /**
+     * Find the best timestamp format for matching an entire field value,
+     * excluding a specified number of candidate formats.
+     * @param text The value that the returned timestamp format must match in its entirety.
+     * @param ignoreCandidates The number of candidate formats to exclude from the search.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) {
+        int index = ignoreCandidates;
+        for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
+            Map<String, Object> captures = candidate.strictFullMatchGrok.captures(text);
+            if (captures != null) {
+                return makeTimestampMatch(candidate, index, "", text, "");
+            }
+            ++index;
+        }
+        return null;
+    }
+
+    private static TimestampMatch makeTimestampMatch(CandidateTimestampFormat chosenTimestampFormat, int chosenIndex,
+                                                     String preface, String matchedDate, String epilogue) {
+        Tuple<Character, Integer> fractionalSecondsInterpretation = interpretFractionalSeconds(matchedDate);
+        List<String> dateFormats = chosenTimestampFormat.dateFormats;
+        Pattern simplePattern = chosenTimestampFormat.simplePattern;
+        char separator = fractionalSecondsInterpretation.v1();
+        if (separator != DEFAULT_FRACTIONAL_SECOND_SEPARATOR) {
+            dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator))
+                .collect(Collectors.toList());
+            if (dateFormats.stream().noneMatch(dateFormat -> dateFormat.startsWith("UNIX"))) {
+                String patternStr = simplePattern.pattern();
+                int separatorPos = patternStr.lastIndexOf(DEFAULT_FRACTIONAL_SECOND_SEPARATOR);
+                if (separatorPos >= 0) {
+                    StringBuilder newPatternStr = new StringBuilder(patternStr);
+                    newPatternStr.replace(separatorPos, separatorPos + 1, ((separator == '.') ? "\\" : "") + separator);
+                    simplePattern = Pattern.compile(newPatternStr.toString());
+                }
+            }
+        }
+        int numberOfDigitsInFractionalComponent = fractionalSecondsInterpretation.v2();
+        if (numberOfDigitsInFractionalComponent > 3) {
+            String fractionalSecondsFormat = "SSSSSSSSS".substring(0, numberOfDigitsInFractionalComponent);
+            dateFormats = dateFormats.stream().map(dateFormat -> dateFormat.replace("SSS", fractionalSecondsFormat))
+                .collect(Collectors.toList());
+        }
+        return new TimestampMatch(chosenIndex, preface, dateFormats, simplePattern, chosenTimestampFormat.standardGrokPatternName,
+            epilogue);
+    }
+
+    /**
+     * Interpret the fractional seconds component of a date to determine two things:
+     * 1. The separator character - one of colon, comma and dot.
+     * 2. The number of digits in the fractional component.
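+     * For example, "2018-05-17T16:23:40.123456" yields a separator of '.' and 6 digits.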
+     * @param date The textual representation of the date for which fractional seconds are to be interpreted.
+     * @return A tuple of (fractional second separator character, number of digits in fractional component).
+     */
+    static Tuple<Character, Integer> interpretFractionalSeconds(String date) {
+
+        Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(date);
+        if (matcher.find()) {
+            return new Tuple<>(matcher.group(1).charAt(0), matcher.group(2).length());
+        }
+
+        return new Tuple<>(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, 0);
+    }
+
+    /**
+     * Represents a timestamp that has matched a field value or been found within a message.
+     */
+    public static final class TimestampMatch {
+
+        /**
+         * The index of the corresponding entry in the <code>ORDERED_CANDIDATE_FORMATS</code> list.
+         */
+        public final int candidateIndex;
+
+        /**
+         * Text that came before the timestamp in the matched field/message.
+         */
+        public final String preface;
+
+        /**
+         * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers.
+         */
+        public final List<String> dateFormats;
+
+        /**
+         * A simple regex that will work in many languages to detect whether the timestamp format
+         * exists in a particular line.
+         */
+        public final Pattern simplePattern;
+
+        /**
+         * Name of an out-of-the-box Grok pattern that will match the timestamp.
+         */
+        public final String grokPatternName;
+
+        /**
+         * Text that came after the timestamp in the matched field/message.
+         */
+        public final String epilogue;
+
+        TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue) {
+            this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
+        }
+
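+        // Note: the extra flag below is not used in construction; it presumably just documents
+        // sub-millisecond precision at the call site (assumption)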
+        TimestampMatch(int candidateIndex, String preface, String dateFormat, String simpleRegex, String grokPatternName, String epilogue,
+                       boolean hasFractionalComponentSmallerThanMillisecond) {
+            this(candidateIndex, preface, Collections.singletonList(dateFormat), simpleRegex, grokPatternName, epilogue);
+        }
+
+        TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, String simpleRegex, String grokPatternName,
+                       String epilogue) {
+            this(candidateIndex, preface, dateFormats, Pattern.compile(simpleRegex), grokPatternName, epilogue);
+        }
+
+        TimestampMatch(int candidateIndex, String preface, List<String> dateFormats, Pattern simplePattern, String grokPatternName,
+                       String epilogue) {
+            this.candidateIndex = candidateIndex;
+            this.preface = preface;
+            this.dateFormats = dateFormats;
+            this.simplePattern = simplePattern;
+            this.grokPatternName = grokPatternName;
+            this.epilogue = epilogue;
+        }
+
+        /**
+         * Does parsing the timestamp produce different results depending on the timezone of the parser?
+         * I.e., does the textual representation NOT define the timezone?
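+         * For example (illustrative), "YYYY-MM-dd HH:mm:ss" is timezone-dependent, whereas
+         * "YYYY-MM-dd HH:mm:ss,SSSZ" is not, as it includes the zone.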
+         */
+        public boolean hasTimezoneDependentParsing() {
+            return dateFormats.stream()
+                .anyMatch(dateFormat -> dateFormat.contains("HH") && dateFormat.toLowerCase(Locale.ROOT).indexOf('z') == -1);
+        }
+
+        /**
+         * Sometimes Elasticsearch mappings for dates need to include the format.
+         * This method returns appropriate mappings settings: at minimum "type"="date",
+         * and possibly also a "format" setting.
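+         * For example (illustrative), formats of ["UNIX_MS"] yield
+         * { "type": "date", "format": "epoch_millis" }, while ["ISO8601"] yields
+         * just { "type": "date" }.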
+         */
+        public Map<String, String> getEsDateMappingTypeWithFormat() {
+            if (dateFormats.contains("TAI64N")) {
+                // There's no format for TAI64N in the date formats used in mappings
+                return Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
+            }
+            Map<String, String> mapping = new LinkedHashMap<>();
+            mapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
+            String formats = dateFormats.stream().flatMap(format -> {
+                switch (format) {
+                    case "ISO8601":
+                        return Stream.empty();
+                    case "UNIX_MS":
+                        return Stream.of("epoch_millis");
+                    case "UNIX":
+                        return Stream.of("epoch_second");
+                    default:
+                        return Stream.of(format);
+                }
+            }).collect(Collectors.joining("||"));
+            if (formats.isEmpty() == false) {
+                mapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, formats);
+            }
+            return mapping;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(candidateIndex, preface, dateFormats, simplePattern.pattern(), grokPatternName, epilogue);
+        }
+
+        @Override
+        public boolean equals(Object other) {
+            if (this == other) {
+                return true;
+            }
+            if (other == null || getClass() != other.getClass()) {
+                return false;
+            }
+
+            TimestampMatch that = (TimestampMatch) other;
+            return this.candidateIndex == that.candidateIndex &&
+                Objects.equals(this.preface, that.preface) &&
+                Objects.equals(this.dateFormats, that.dateFormats) &&
+                Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) &&
+                Objects.equals(this.grokPatternName, that.grokPatternName) &&
+                Objects.equals(this.epilogue, that.epilogue);
+        }
+
+        @Override
+        public String toString() {
+            return "index = " + candidateIndex + (preface.isEmpty() ? "" : ", preface = '" + preface + "'") +
+                ", date formats = " + dateFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) +
+                ", simple pattern = '" + simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" +
+                (epilogue.isEmpty() ? "" : ", epilogue = '" + epilogue + "'");
+        }
+    }
+
+    static final class CandidateTimestampFormat {
+
+        final List<String> dateFormats;
+        final Pattern simplePattern;
+        final Grok strictSearchGrok;
+        final Grok strictFullMatchGrok;
+        final String standardGrokPatternName;
+        final List<Integer> quickRuleOutIndices;
+
+        CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
+            this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName);
+        }
+
+        CandidateTimestampFormat(String dateFormat, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
+                                 List<Integer> quickRuleOutIndices) {
+            this(Collections.singletonList(dateFormat), simpleRegex, strictGrokPattern, standardGrokPatternName, quickRuleOutIndices);
+        }
+
+        CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName) {
+            this(dateFormats, simpleRegex, strictGrokPattern, standardGrokPatternName, Collections.emptyList());
+        }
+
+        CandidateTimestampFormat(List<String> dateFormats, String simpleRegex, String strictGrokPattern, String standardGrokPatternName,
+                                 List<Integer> quickRuleOutIndices) {
+            this.dateFormats = dateFormats;
+            this.simplePattern = Pattern.compile(simpleRegex, Pattern.MULTILINE);
+            // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
+            this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern +
+                "%{GREEDYDATA:" + EPILOGUE + "}");
+            this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern);
+            this.standardGrokPatternName = standardGrokPatternName;
+            assert quickRuleOutIndices.stream()
+                .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size());
+            this.quickRuleOutIndices = quickRuleOutIndices;
+        }
+    }
+}

+ 35 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.List;
+
+public class TsvLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    /**
+     * Rules are:
+     * - The file must be valid TSV
+     * - It must contain at least two complete records
+     * - There must be at least two fields per record (otherwise files with no tabs could be treated as TSV!)
+     * - Every TSV record except the last must have the same number of fields
+     * The reason the last record is allowed to have fewer fields than the others is that
+     * it could have been truncated when the file was sampled.
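+     * For example (illustrative), a sample of two lines, each containing two tab-separated
+     * fields, satisfies these rules, whereas lines containing no tabs do not.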
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+        return SeparatedValuesLogStructureFinder.canCreateFromSample(explanation, sample, 2, CsvPreference.TAB_PREFERENCE, "TSV");
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException {
+        return SeparatedValuesLogStructureFinder.makeSeparatedValuesLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            CsvPreference.TAB_PREFERENCE, false);
+    }
+}

+ 172 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinder.java

@@ -0,0 +1,172 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Pattern;
+
+public class XmlLogStructureFinder implements LogStructureFinder {
+
+    private final List<String> sampleMessages;
+    private final LogStructure structure;
+
+    static XmlLogStructureFinder makeXmlLogStructureFinder(List<String> explanation, String sample, String charsetName,
+                                                           Boolean hasByteOrderMarker)
+        throws IOException, ParserConfigurationException, SAXException {
+
+        String messagePrefix;
+        try (Scanner scanner = new Scanner(sample)) {
+            messagePrefix = scanner.next();
+        }
+
+        DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
+        docBuilderFactory.setNamespaceAware(false);
+        docBuilderFactory.setValidating(false);
+
+        List<String> sampleMessages = new ArrayList<>();
+        List<Map<String, ?>> sampleRecords = new ArrayList<>();
+
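+        // Split on the first token of the sample (e.g. "<log4j:event" - illustrative):
+        // chunk 0 is any preamble and each subsequent chunk is one XML document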
+        String[] sampleDocEnds = sample.split(Pattern.quote(messagePrefix));
+        StringBuilder preamble = new StringBuilder(sampleDocEnds[0]);
+        int linesConsumed = numNewlinesIn(sampleDocEnds[0]);
+        for (int i = 1; i < sampleDocEnds.length; ++i) {
+            String sampleDoc = messagePrefix + sampleDocEnds[i];
+            if (i < 3) {
+                preamble.append(sampleDoc);
+            }
+            DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
+            try (InputStream is = new ByteArrayInputStream(sampleDoc.getBytes(StandardCharsets.UTF_8))) {
+                sampleRecords.add(docToMap(docBuilder.parse(is)));
+                sampleMessages.add(sampleDoc);
+                linesConsumed += numNewlinesIn(sampleDoc);
+            } catch (SAXException e) {
+                // Tolerate an incomplete last record as long as we have one complete record
+                if (sampleRecords.isEmpty() || i < sampleDocEnds.length - 1) {
+                    throw e;
+                }
+            }
+        }
+
+        if (sample.endsWith("\n") == false) {
+            ++linesConsumed;
+        }
+
+        // If we get here, the XML parser should have confirmed this
+        assert messagePrefix.charAt(0) == '<';
+        String topLevelTag = messagePrefix.substring(1);
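+        // E.g. a prefix of "<log4j:event" gives a topLevelTag of "log4j:event" (illustrative),
+        // which also forms the multi-line start pattern below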
+
+        LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.XML)
+            .setCharset(charsetName)
+            .setHasByteOrderMarker(hasByteOrderMarker)
+            .setSampleStart(preamble.toString())
+            .setNumLinesAnalyzed(linesConsumed)
+            .setNumMessagesAnalyzed(sampleRecords.size())
+            .setMultilineStartPattern("^\\s*<" + topLevelTag);
+
+        Tuple<String, TimestampMatch> timeField = LogStructureUtils.guessTimestampField(explanation, sampleRecords);
+        if (timeField != null) {
+            structureBuilder.setTimestampField(timeField.v1())
+                .setTimestampFormats(timeField.v2().dateFormats)
+                .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
+        }
+
+        SortedMap<String, Object> innerMappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
+        secondLevelProperties.put(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
+        secondLevelProperties.put(LogStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
+        SortedMap<String, Object> outerMappings = new TreeMap<>();
+        outerMappings.put(topLevelTag, secondLevelProperties);
+        outerMappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD,
+            Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
+
+        LogStructure structure = structureBuilder
+            .setMappings(outerMappings)
+            .setExplanation(explanation)
+            .build();
+
+        return new XmlLogStructureFinder(sampleMessages, structure);
+    }
+
+    private XmlLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
+        this.sampleMessages = Collections.unmodifiableList(sampleMessages);
+        this.structure = structure;
+    }
+
+    @Override
+    public List<String> getSampleMessages() {
+        return sampleMessages;
+    }
+
+    @Override
+    public LogStructure getStructure() {
+        return structure;
+    }
+
+    private static int numNewlinesIn(String str) {
+        return (int) str.chars().filter(c -> c == '\n').count();
+    }
+
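+    // Flattens a parsed document into a map, e.g. (illustrative)
+    // <event level="INFO"><message>hi</message></event> becomes
+    // { level=INFO, message=hi }; nested elements become nested maps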
+    private static Map<String, Object> docToMap(Document doc) {
+
+        Map<String, Object> docAsMap = new LinkedHashMap<>();
+
+        doc.getDocumentElement().normalize();
+        addNodeToMap(doc.getDocumentElement(), docAsMap);
+
+        return docAsMap;
+    }
+
+    private static void addNodeToMap(Node node, Map<String, Object> nodeAsMap) {
+
+        NamedNodeMap attributes = node.getAttributes();
+        for (int i = 0; i < attributes.getLength(); ++i) {
+            Node attribute = attributes.item(i);
+            nodeAsMap.put(attribute.getNodeName(), attribute.getNodeValue());
+        }
+
+        NodeList children = node.getChildNodes();
+        for (int i = 0; i < children.getLength(); ++i) {
+            Node child = children.item(i);
+            if (child.getNodeType() == Node.ELEMENT_NODE) {
+                if (child.getChildNodes().getLength() == 1) {
+                    Node grandChild = child.getChildNodes().item(0);
+                    String value = grandChild.getNodeValue().trim();
+                    if (value.isEmpty() == false) {
+                        nodeAsMap.put(child.getNodeName(), value);
+                    }
+                } else {
+                    Map<String, Object> childNodeAsMap = new LinkedHashMap<>();
+                    addNodeToMap(child, childNodeAsMap);
+                    if (childNodeAsMap.isEmpty() == false) {
+                        nodeAsMap.put(child.getNodeName(), childNodeAsMap);
+                    }
+                }
+            }
+        }
+    }
+}

+ 122 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactory.java

@@ -0,0 +1,122 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.stream.Location;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.List;
+
+public class XmlLogStructureFinderFactory implements LogStructureFinderFactory {
+
+    private final XMLInputFactory xmlFactory;
+
+    public XmlLogStructureFinderFactory() {
+        xmlFactory = XMLInputFactory.newInstance();
+        xmlFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
+        xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
+    }
+
+    /**
+     * This format matches if the sample consists of one or more XML documents,
+     * all with the same root element name.  If there is more than one document,
+     * only whitespace is allowed in between them.  The last one does not
+     * necessarily have to be complete (as the sample could have truncated it).
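+     * For example (illustrative), "<a>1</a> <a>2</a>" matches, but
+     * "<a>1</a> <b>2</b>" does not because the root element names differ.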
+     */
+    @Override
+    public boolean canCreateFromSample(List<String> explanation, String sample) {
+
+        int completeDocCount = 0;
+        String commonRootElementName = null;
+        String remainder = sample.trim();
+        boolean mightBeAnotherDocument = !remainder.isEmpty();
+
+        // This processing is extremely complicated because it's necessary
+        // to create a new XML stream reader per document, but each one
+        // will read ahead so will potentially consume characters from the
+        // following document.  We must therefore also recreate the string
+        // reader for each document.
+        while (mightBeAnotherDocument) {
+
+            try (Reader reader = new StringReader(remainder)) {
+
+                XMLStreamReader xmlReader = xmlFactory.createXMLStreamReader(reader);
+                try {
+                    int nestingLevel = 0;
+                    while ((mightBeAnotherDocument = xmlReader.hasNext())) {
+                        switch (xmlReader.next()) {
+                            case XMLStreamReader.START_ELEMENT:
+                                if (nestingLevel++ == 0) {
+                                    String rootElementName = xmlReader.getLocalName();
+                                    if (commonRootElementName == null) {
+                                        commonRootElementName = rootElementName;
+                                    } else if (commonRootElementName.equals(rootElementName) == false) {
+                                        explanation.add("Not XML because different documents have different root " +
+                                            "element names: [" + commonRootElementName + "] and [" + rootElementName + "]");
+                                        return false;
+                                    }
+                                }
+                                break;
+                            case XMLStreamReader.END_ELEMENT:
+                                if (--nestingLevel < 0) {
+                                    explanation.add("Not XML because an end element occurs before a start element");
+                                    return false;
+                                }
+                                break;
+                        }
+                        if (nestingLevel == 0) {
+                            ++completeDocCount;
+                            // Find the position that's one character beyond the end of the end element.
+                            // The next document (if there is one) must start after this (possibly
+                            // preceded by whitespace).
+                            Location location = xmlReader.getLocation();
+                            int endPos = 0;
+                            // Line and column numbers start at 1, not 0
+                            for (int wholeLines = location.getLineNumber() - 1; wholeLines > 0; --wholeLines) {
+                                endPos = remainder.indexOf('\n', endPos) + 1;
+                                if (endPos == 0) {
+                                    explanation.add("Not XML because XML parser location is inconsistent: line [" +
+                                        location.getLineNumber() + "], column [" + location.getColumnNumber() + "] in [" + remainder + "]");
+                                    return false;
+                                }
+                            }
+                            endPos += location.getColumnNumber() - 1;
+                            remainder = remainder.substring(endPos).trim();
+                            mightBeAnotherDocument = !remainder.isEmpty();
+                            break;
+                        }
+                    }
+                } finally {
+                    xmlReader.close();
+                }
+            } catch (IOException | XMLStreamException e) {
+                explanation.add("Not XML because there was a parsing exception: [" + e.getMessage().replaceAll("\\s?\r?\n\\s?", " ") + "]");
+                return false;
+            }
+        }
+
+        if (completeDocCount == 0) {
+            explanation.add("Not XML because sample didn't contain a complete document");
+            return false;
+        }
+
+        explanation.add("Deciding sample is XML");
+        return true;
+    }
+
+    @Override
+    public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+        throws IOException, ParserConfigurationException, SAXException {
+        return XmlLogStructureFinder.makeXmlLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    }
+}

+ 38 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactoryTests.java

@@ -0,0 +1,38 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class CsvLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
+
+    // No need to check JSON or XML because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenCsv() {
+
+        assertTrue(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 326 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreatorTests.java

@@ -0,0 +1,326 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.GrokPatternCreator.ValueOnlyGrokPatternCandidate;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.hamcrest.Matchers.containsInAnyOrder;
+
+public class GrokPatternCreatorTests extends LogStructureTestCase {
+
+    public void testBuildFieldName() {
+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
+        assertEquals("field", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("field2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("field3", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("extra_timestamp", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
+        assertEquals("field4", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+        assertEquals("uri", GrokPatternCreator.buildFieldName(fieldNameCountStore, "uri"));
+        assertEquals("extra_timestamp2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "extra_timestamp"));
+        assertEquals("field5", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field"));
+    }
+
+    public void testPopulatePrefacesAndEpiloguesGivenTimestamp() {
+
+        Collection<String> matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
+            "[2018-01-24T12:33:23] ERROR ",
+            "junk [2018-01-22T07:33:23] INFO ",
+            "[2018-01-21T03:33:23] DEBUG ");
+        ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp");
+
+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
+        Collection<String> prefaces = new ArrayList<>();
+        Collection<String> epilogues = new ArrayList<>();
+
+        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
+
+        assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "["));
+        assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG "));
+    }
+
+    public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() {
+
+        Collection<String> matchingStrings = Arrays.asList("before alice@acme.com after",
+            "abc bob@acme.com xyz",
+            "carol@acme.com");
+        ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email");
+
+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
+        Collection<String> prefaces = new ArrayList<>();
+        Collection<String> epilogues = new ArrayList<>();
+
+        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
+
+        assertThat(prefaces, containsInAnyOrder("before ", "abc ", ""));
+        assertThat(epilogues, containsInAnyOrder(" after", " xyz", ""));
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
+
+        Collection<String> snippets = Arrays.asList("[2018-01-25T15:33:23] DEBUG ",
+            "[2018-01-24T12:33:23] ERROR ",
+            "junk [2018-01-22T07:33:23] INFO ",
+            "[2018-01-21T03:33:23] DEBUG ");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ",
+            grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
+
+        Collection<String> snippets = Arrays.asList("(-2)",
+            "  (-3)",
+            " (4)",
+            " (-5) ");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() {
+
+        Collection<String> snippets = Arrays.asList("before-2 ",
+            "prior to-3",
+            "-4");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        // It seems sensible that we don't detect these suffixes as either base 10 or base 16 numbers
+        assertEquals(".*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenHexNumbers() {
+
+        Collection<String> snippets = Arrays.asList(" abc",
+            "  123",
+            " -123",
+            "1f is hex");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() {
+
+        Collection<String> snippets = Arrays.asList("<host1.1.p2ps:",
+            "<host2.1.p2ps:");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        // We don't want the .1. in the middle to get detected as a hex number
+        assertEquals("<.*?:", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenEmailAddresses() {
+
+        Collection<String> snippets = Arrays.asList("before alice@acme.com after",
+            "abc bob@acme.com xyz",
+            "carol@acme.com");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?%{EMAILADDRESS:email}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenUris() {
+
+        Collection<String> snippets = Arrays.asList("main site https://www.elastic.co/ with trailing slash",
+            "https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section",
+            "download today from https://www.elastic.co/downloads");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?%{URI:uri}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenPaths() {
+
+        Collection<String> snippets = Arrays.asList("on Mac /Users/dave",
+            "on Windows C:\\Users\\dave",
+            "on Linux /home/dave");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*? .*? %{PATH:path}", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testAppendBestGrokMatchForStringsGivenKvPairs() {
+
+        Collection<String> snippets = Arrays.asList("foo=1 and bar=a",
+            "something foo=2 bar=b something else",
+            "foo=3 bar=c",
+            " foo=1 bar=a ");
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
+
+        assertEquals(".*?\\bfoo=%{USER:foo} .*?\\bbar=%{USER:bar}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+    }
+
+    public void testCreateGrokPatternFromExamplesGivenNamedLogs() {
+
+        Collection<String> sampleMessages = Arrays.asList(
+            "Sep  8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53",
+            "Sep  8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53",
+            "Sep  8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53",
+            "Sep  8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " +
+                "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}",
+            grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", "timestamp"));
+        assertEquals(5, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("field2"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field3"));
+    }
+
+    public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() {
+
+        Collection<String> sampleMessages = Arrays.asList(
+            "Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.",
+            "Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.",
+            "Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.",
+            "Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " +
+                "Invalid chunk ignored.");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*",
+            grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp"));
+        assertEquals(1, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
+    }
+
+    public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() {
+
+        // Two timestamps: one local, one UTC
+        Collection<String> sampleMessages = Arrays.asList(
+            "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" +
+                "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*",
+            grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp"));
+        assertEquals(5, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"),
+            mappings.get("extra_timestamp"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
+    }
+
+    public void testFindFullLineGrokPatternGivenApacheCombinedLogs() {
+        Collection<String> sampleMessages = Arrays.asList(
+            "83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1\" 200 203023 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
+            "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1\" 200 7697 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
+            "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1\" 200 26185 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"",
+            "83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " +
+                "\"GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1\" 200 430406 " +
+                "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " +
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+
+        assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern());
+        assertEquals(10, mappings.size());
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bytes"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("clientip"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double"), mappings.get("httpversion"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("ident"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("referrer"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("request"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("response"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("verb"));
+    }
+
+    public void testAdjustForPunctuationGivenCommonPrefix() {
+        Collection<String> snippets = Arrays.asList(
+            "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.212\",\"No-lookup\",\"192.168.33.132\"," +
+                "\"80\",\"46721\",\"/Common/Subnet_33\",\"TCP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
+                ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
+            "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.143.244\",\"No-lookup\",\"192.168.33.106\"," +
+                "\"55025\",\"162\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
+                ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"",
+            "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.3\",\"No-lookup\",\"224.0.0.102\"," +
+                "\"3222\",\"3222\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" +
+                ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\""
+        );
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
+
+        assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+        assertNotNull(adjustedSnippets);
+        assertThat(new ArrayList<>(adjustedSnippets),
+            containsInAnyOrder(snippets.stream().map(snippet -> snippet.substring(2)).toArray(String[]::new)));
+    }
+
+    public void testAdjustForPunctuationGivenNoCommonPrefix() {
+        Collection<String> snippets = Arrays.asList(
+            "|client (id:2) was removed from servergroup 'Normal'(id:7) by client 'User1'(id:2)",
+            "|servergroup 'GAME'(id:9) was added by 'User1'(id:2)",
+            "|permission 'i_group_auto_update_type'(id:146) with values (value:30, negated:0, skipchannel:0) " +
+                "was added by 'User1'(id:2) to servergroup 'GAME'(id:9)"
+        );
+
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
+
+        assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString());
+        assertSame(snippets, adjustedSnippets);
+    }
+}

+ 46 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderFactoryTests.java

@@ -0,0 +1,46 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class JsonLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
+
+    public void testCanCreateFromSampleGivenJson() {
+
+        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenXml() {
+
+        assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenCsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 39 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinderTests.java

@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.Collections;
+
+public class JsonLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new JsonLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenGoodJson() throws Exception {
+        assertTrue(factory.canCreateFromSample(explanation, JSON_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.JSON, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
+        assertNull(structure.getMultilineStartPattern());
+        assertNull(structure.getSeparator());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
+    }
+}

+ 72 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManagerTests.java

@@ -0,0 +1,72 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import com.ibm.icu.text.CharsetMatch;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import static org.hamcrest.Matchers.startsWith;
+import static org.hamcrest.core.IsInstanceOf.instanceOf;
+
+public class LogStructureFinderManagerTests extends LogStructureTestCase {
+
+    private LogStructureFinderManager structureFinderManager = new LogStructureFinderManager();
+
+    public void testFindCharsetGivenCharacterWidths() throws Exception {
+
+        for (Charset charset : Arrays.asList(StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE)) {
+            CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation,
+                new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset)));
+            assertEquals(charset.name(), charsetMatch.getName());
+        }
+    }
+
+    public void testFindCharsetGivenBinary() throws Exception {
+
+        // This input should never match a single byte character set.  ICU4J will sometimes decide
+        // that it matches a double byte character set, hence the two assertion branches.
+        int size = 1000;
+        byte[] binaryBytes = randomByteArrayOfLength(size);
+        for (int i = 0; i < 10; ++i) {
+            binaryBytes[randomIntBetween(0, size - 1)] = 0;
+        }
+
+        try {
+            CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, new ByteArrayInputStream(binaryBytes));
+            assertThat(charsetMatch.getName(), startsWith("UTF-16"));
+        } catch (IllegalArgumentException e) {
+            assertEquals("Could not determine a usable character encoding for the input - could it be binary data?", e.getMessage());
+        }
+    }
+
+    public void testMakeBestStructureGivenJson() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
+            "{ \"time\": \"2018-05-17T13:41:23\", \"message\": \"hello\" }", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(JsonLogStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenXml() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
+            "<log time=\"2018-05-17T13:41:23\"><message>hello</message></log>", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(XmlLogStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenCsv() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "time,message\n" +
+                "2018-05-17T13:41:23,hello\n", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(SeparatedValuesLogStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenText() throws Exception {
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "[2018-05-17T13:41:23] hello\n" +
+                "[2018-05-17T13:41:24] hello again\n", StandardCharsets.UTF_8.name(), randomBoolean()),
+            instanceOf(TextLogStructureFinder.class));
+    }
+}

+ 86 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTestCase.java

@@ -0,0 +1,86 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.test.ESTestCase;
+import org.junit.After;
+import org.junit.Before;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.stream.Collectors;
+
+public abstract class LogStructureTestCase extends ESTestCase {
+
+    protected static final List<String> POSSIBLE_CHARSETS = Collections.unmodifiableList(Charset.availableCharsets().keySet().stream()
+        .filter(name -> LogStructureFinderManager.FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT)))
+        .collect(Collectors.toList()));
+
+    protected static final String CSV_SAMPLE = "time,id,value\n" +
+        "2018-05-17T16:23:40,key1,42.0\n" +
+        "2018-05-17T16:24:11,\"key with spaces\",42.0\n";
+
+    protected static final String JSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," +
+            "\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," +
+            "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" +
+        "{\"logger\":\"controller\",\"timestamp\":1478261151445," +
+            "\"level\":\"INFO\",\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 2\",\"class\":\"ml\"," +
+            "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n";
+
+    protected static final String PIPE_SEPARATED_VALUES_SAMPLE = "2018-01-06 16:56:14.295748|INFO    |VirtualServer |1  |" +
+            "listening on 0.0.0.0:9987, :::9987\n" +
+        "2018-01-06 17:19:44.465252|INFO    |VirtualServer |1  |client " +
+            "'User1'(id:2) changed default admin channelgroup to 'Guest'(id:8)\n" +
+        "2018-01-06 17:21:25.764368|INFO    |VirtualServer |1  |client " +
+            "'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel 'Default Channel'(id:1)";
+
+    protected static final String SEMI_COLON_SEPARATED_VALUES_SAMPLE = "\"pos_id\";\"trip_id\";\"latitude\";\"longitude\";\"altitude\";" +
+            "\"timestamp\"\n" +
+        "\"1\";\"3\";\"4703.7815\";\"1527.4713\";\"359.9\";\"2017-01-19 16:19:04.742113\"\n" +
+        "\"2\";\"3\";\"4703.7815\";\"1527.4714\";\"359.9\";\"2017-01-19 16:19:05.741890\"\n" +
+        "\"3\";\"3\";\"4703.7816\";\"1527.4716\";\"360.3\";\"2017-01-19 16:19:06.738842\"";
+
+    protected static final String TEXT_SAMPLE = "[2018-05-11T17:07:29,461][INFO ][o.e.n.Node               ] [node-0] initializing ...\n" +
+        "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment    ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
+            "net usable_space [223.4gb], net total_space [464.7gb], types [hfs]\n" +
+        "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment    ] [node-0] heap size [3.9gb], " +
+            "compressed ordinary object pointers [true]\n" +
+        "[2018-05-11T17:07:29,556][INFO ][o.e.n.Node               ] [node-0] node name [node-0], node ID [tJ9u8HcaTbWxRtnlfz1RQA]\n";
+
+    protected static final String TSV_SAMPLE = "time\tid\tvalue\n" +
+        "2018-05-17T16:23:40\tkey1\t42.0\n" +
+        "2018-05-17T16:24:11\t\"key with spaces\"\t42.0\n";
+
+    protected static final String XML_SAMPLE = "<log4j:event logger=\"autodetect\" timestamp=\"1526574809521\" level=\"ERROR\" " +
+            "thread=\"0x7fffc5a7c3c0\">\n" +
+        "<log4j:message><![CDATA[Neither a fieldname clause nor a field config file was specified]]></log4j:message>\n" +
+        "</log4j:event>\n" +
+        "\n" +
+        "<log4j:event logger=\"autodetect\" timestamp=\"1526574809522\" level=\"FATAL\" thread=\"0x7fffc5a7c3c0\">\n" +
+        "<log4j:message><![CDATA[Field config could not be interpreted]]></log4j:message>\n" +
+        "</log4j:event>\n" +
+        "\n";
+
+    protected List<String> explanation;
+
+    @Before
+    public void initExplanation() {
+        explanation = new ArrayList<>();
+    }
+
+    @After
+    public void printExplanation() {
+        Loggers.getLogger(getClass()).info("Explanation:\n" + String.join("\n", explanation));
+    }
+
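+    // A byte order marker is only meaningful for UTF charsets; for any other charset the
+    // concept does not apply, hence null rather than true or false.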
+    protected Boolean randomHasByteOrderMarker(String charset) {
+        return charset.toUpperCase(Locale.ROOT).startsWith("UTF") ? randomBoolean() : null;
+    }
+}

+ 83 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTests.java

@@ -0,0 +1,83 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.test.AbstractXContentTestCase;
+
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class LogStructureTests extends AbstractXContentTestCase<LogStructure> {
+
+    @Override
+    protected LogStructure createTestInstance() {
+
+        LogStructure.Format format = randomFrom(EnumSet.allOf(LogStructure.Format.class));
+
+        LogStructure.Builder builder = new LogStructure.Builder(format);
+
+        int numLinesAnalyzed = randomIntBetween(2, 10000);
+        builder.setNumLinesAnalyzed(numLinesAnalyzed);
+        int numMessagesAnalyzed = randomIntBetween(1, numLinesAnalyzed);
+        builder.setNumMessagesAnalyzed(numMessagesAnalyzed);
+        builder.setSampleStart(randomAlphaOfLength(1000));
+
+        String charset = randomFrom(Charset.availableCharsets().keySet());
+        builder.setCharset(charset);
+        if (charset.toUpperCase(Locale.ROOT).startsWith("UTF")) {
+            builder.setHasByteOrderMarker(randomBoolean());
+        }
+
+        if (numMessagesAnalyzed < numLinesAnalyzed) {
+            builder.setMultilineStartPattern(randomAlphaOfLength(100));
+        }
+        if (randomBoolean()) {
+            builder.setExcludeLinesPattern(randomAlphaOfLength(100));
+        }
+
+        if (format.isSeparatedValues() || (format.supportsNesting() && randomBoolean())) {
+            builder.setInputFields(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
+        }
+        if (format.isSeparatedValues()) {
+            builder.setHasHeaderRow(randomBoolean());
+            if (rarely()) {
+                builder.setSeparator(format.separator());
+            }
+        }
+        if (format.isSemiStructured()) {
+            builder.setGrokPattern(randomAlphaOfLength(100));
+        }
+
+        if (format.isSemiStructured() || randomBoolean()) {
+            builder.setTimestampField(randomAlphaOfLength(10));
+            builder.setTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
+            builder.setNeedClientTimezone(randomBoolean());
+        }
+
+        Map<String, Object> mappings = new TreeMap<>();
+        for (String field : generateRandomStringArray(5, 20, false, false)) {
+            mappings.put(field, Collections.singletonMap(randomAlphaOfLength(5), randomAlphaOfLength(10)));
+        }
+        builder.setMappings(mappings);
+
+        builder.setExplanation(Arrays.asList(generateRandomStringArray(10, 150, false, false)));
+
+        return builder.build();
+    }
+
+    @Override
+    protected LogStructure doParseInstance(XContentParser parser) {
+        return LogStructure.PARSER.apply(parser, null).build();
+    }
+
+    @Override
+    protected boolean supportsUnknownFields() {
+        return false;
+    }
+}

+ 292 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtilsTests.java

@@ -0,0 +1,292 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import static org.hamcrest.Matchers.contains;
+
+public class LogStructureUtilsTests extends LogStructureTestCase {
+
+    public void testMoreLikelyGivenText() {
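+        // Multi-word values, and any value longer than 256 characters, are more likely
+        // text than keyword.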
+        assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword("the quick brown fox jumped over the lazy dog"));
+        assertTrue(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(257, 10000)));
+    }
+
+    public void testMoreLikelyGivenKeyword() {
+        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("1"));
+        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword("DEBUG"));
+        assertFalse(LogStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256)));
+    }
+
+    public void testSingleSampleSingleField() {
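+        // guessTimestampField returns a (field name, timestamp match) tuple, or null when no
+        // single field matches one timestamp format consistently across all the sample records.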
+        Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
+        assertNotNull(match);
+        assertEquals("field1", match.v1());
+        assertThat(match.v2().dateFormats, contains("ISO8601"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithSameSingleTimeField() {
+        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("field1", match.v1());
+        assertThat(match.v2().dateFormats, contains("ISO8601"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithOneSingleTimeFieldDifferentFormat() {
+        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSamplesWithDifferentSingleTimeField() {
+        Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Map<String, String> sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSingleSampleManyFieldsOneTimeFormat() {
+        Map<String, Object> sample = new LinkedHashMap<>();
+        sample.put("foo", "not a time");
+        sample.put("time", "2018-05-24 17:28:31,735");
+        sample.put("bar", 42);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsSameSingleTimeFormat() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "2018-05-29 11:53:02,837");
+        sample2.put("bar", 17);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "May 29 2018 11:53:02");
+        sample2.put("bar", 17);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("red_herring", "May 29 2007 11:53:02");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("red_herring", "whatever");
+        sample2.put("time", "2018-05-29 11:53:02,837");
+        sample2.put("bar", 17);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() {
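+        // Two date formats are expected for the winning field because CISCOTIMESTAMP pads
+        // single-digit days with an extra space ("MMM  d") but not double-digit days ("MMM dd").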
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "May 24 2018 17:28:31");
+        sample1.put("red_herring", "2018-05-24 17:28:31,735");
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "May 29 2018 11:53:02");
+        sample2.put("red_herring", "17");
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time", match.v1());
+        assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));
+        assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
+    }
+
+    public void testSamplesWithManyFieldsInconsistentTimeFields() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time1", "May 24 2018 17:28:31");
+        sample1.put("bar", 17);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time2", "May 29 2018 11:53:02");
+        sample2.put("bar", 42);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNull(match);
+    }
+
+    public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() {
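+        // time2 is the only timestamp field that appears in both samples with a consistent
+        // format, so it should win despite the distracting time1 and time3 fields.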
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time1", "2018-05-09 17:28:31,735");
+        sample1.put("time2", "May  9 2018 17:28:31");
+        sample1.put("bar", 17);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time2", "May 10 2018 11:53:02");
+        sample2.put("time3", "Thu, May 10 2018 11:53:02");
+        sample2.put("bar", 42);
+        Tuple<String, TimestampMatch> match =
+            LogStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(match);
+        assertEquals("time2", match.v1());
+        assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));
+        assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
+    }
+
+    public void testGuessMappingGivenNothing() {
+        assertNull(LogStructureUtils.guessMapping(explanation, "foo", Collections.emptyList()));
+    }
+
+    public void testGuessMappingGivenKeyword() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
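+        // A mix of date-like and non-date values must fall back to keyword rather than date.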
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
+    }
+
+    public void testGuessMappingGivenText() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
+            Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
+    }
+
+    public void testGuessMappingGivenIp() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
+    }
+
+    public void testGuessMappingGivenDouble() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
+        // 12345678901234567890 is too long for long
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("1", "2", "12345678901234567890")));
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(3.14159265359, 0.0, 1e-308)));
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("-1e-1", "-1e308", "1e-308")));
+    }
+
+    public void testGuessMappingGivenLong() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
+    }
+
+    public void testGuessMappingGivenDate() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
+            Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
+    }
+
+    public void testGuessMappingGivenBoolean() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "boolean");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("false", "true")));
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(true, false)));
+    }
+
+    public void testGuessMappingGivenArray() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));
+
+        expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
+
+        assertEquals(expected,
+            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
+    }
+
+    public void testGuessMappingGivenObject() {
+        Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
+
+        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
+            Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2"))));
+    }
+
+    public void testGuessMappingGivenObjectAndNonObject() {
+        RuntimeException e = expectThrows(RuntimeException.class, () -> LogStructureUtils.guessMapping(explanation,
+            "foo", Arrays.asList(Collections.singletonMap("name", "value1"), "value2")));
+
+        assertEquals("Field [foo] has both object and non-object values - this is not supported by Elasticsearch", e.getMessage());
+    }
+
+    public void testGuessMappings() {
+        Map<String, Object> sample1 = new LinkedHashMap<>();
+        sample1.put("foo", "not a time");
+        sample1.put("time", "2018-05-24 17:28:31,735");
+        sample1.put("bar", 42);
+        sample1.put("nothing", null);
+        Map<String, Object> sample2 = new LinkedHashMap<>();
+        sample2.put("foo", "whatever");
+        sample2.put("time", "2018-05-29 11:53:02,837");
+        sample2.put("bar", 17);
+        sample2.put("nothing", null);
+
+        Map<String, Object> mappings = LogStructureUtils.guessMappings(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(mappings);
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
+        Map<String, String> expectedTimeMapping = new HashMap<>();
+        expectedTimeMapping.put(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
+        expectedTimeMapping.put(LogStructureUtils.MAPPING_FORMAT_SETTING, "YYYY-MM-dd HH:mm:ss,SSS");
+        assertEquals(expectedTimeMapping, mappings.get("time"));
+        assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
+        assertNull(mappings.get("nothing"));
+    }
+}

+ 23 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/PipeSeparatedValuesLogStructureFinderFactoryTests.java

@@ -0,0 +1,23 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class PipeSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new PipeSeparatedValuesLogStructureFinderFactory();
+
+    // No need to check JSON, XML, CSV, TSV or semi-colon separated values because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertTrue(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 28 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SemiColonSeparatedValuesLogStructureFinderFactoryTests.java

@@ -0,0 +1,28 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class SemiColonSeparatedValuesLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new SemiColonSeparatedValuesLogStructureFinderFactory();
+
+    // No need to check JSON, XML, CSV or TSV because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertTrue(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 293 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinderTests.java

@@ -0,0 +1,293 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinFieldwiseCompareRows;
+import static org.elasticsearch.xpack.ml.logstructurefinder.SeparatedValuesLogStructureFinder.levenshteinDistance;
+import static org.hamcrest.Matchers.arrayContaining;
+
+public class SeparatedValuesLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new CsvLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenCompleteCsv() throws Exception {
+        String sample = "time,message\n" +
+            "2018-05-17T13:41:23,hello\n" +
+            "2018-05-17T13:41:32,hello again\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
+        assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("time", "message"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("time", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception {
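+        // The timestamp is not the first column here, so the multiline start pattern asserted
+        // below must skip the preceding field with a non-greedy prefix before anchoring on the
+        // timestamp.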
+        String sample = "message,time,count\n" +
+            "\"hello\n" +
+            "world\",2018-05-17T13:41:23,1\n" +
+            "\"hello again\n"; // note that this last record is truncated
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern());
+        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("message", "time", "count"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("time", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception {
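+        // The header row ends with two nameless columns, which should be given the synthetic
+        // names column18 and column19 in the input fields.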
+        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
+            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
+            "improvement_surcharge,total_amount,,\n" +
+            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
+            "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
+            "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?",
+            structure.getExcludeLinesPattern());
+        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
+            "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
+            "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("tpep_pickup_datetime", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception {
+        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
+            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
+            "improvement_surcharge,total_amount\n" +
+            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
+                "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
+                "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
+            structure.getExcludeLinesPattern());
+        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
+            "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
+            "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("tpep_pickup_datetime", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception {
+        String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
+            "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
+            "\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n";
+        assertTrue(factory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.CSV, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?pos_id\"?,\"?trip_id\"?,\"?latitude\"?,\"?longitude\"?,\"?altitude\"?,\"?timestamp\"?",
+            structure.getExcludeLinesPattern());
+        assertNull(structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getSeparator());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getInputFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getTimestampFormats());
+    }
+
+    public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException {
+        String withHeader = "time,airline,responsetime,sourcetype\n" +
+            "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
+            "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
+            "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
+            "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
+
+        Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
+            SeparatedValuesLogStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1());
+
+        assertTrue(header.v1());
+        assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype"));
+    }
+
+    public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException {
+        String withoutHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +
+            "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" +
+            "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" +
+            "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
+
+        Tuple<Boolean, String[]> header = SeparatedValuesLogStructureFinder.findHeaderFromSample(explanation,
+            SeparatedValuesLogStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1());
+
+        assertFalse(header.v1());
+        assertThat(header.v2(), arrayContaining("column1", "column2", "column3", "column4"));
+    }
+
+    public void testLevenshteinDistance() {
+
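+        // Classic edit distance. For reference, the standard dynamic-programming recurrence is
+        //   d(i, 0) = i;  d(0, j) = j
+        //   d(i, j) = min(d(i-1, j) + 1, d(i, j-1) + 1, d(i-1, j-1) + (a[i-1] == b[j-1] ? 0 : 1))
+        // so the distance to or from an empty string is simply the other string's length.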
+        assertEquals(0, levenshteinDistance("cat", "cat"));
+        assertEquals(3, levenshteinDistance("cat", "dog"));
+        assertEquals(5, levenshteinDistance("cat", "mouse"));
+        assertEquals(3, levenshteinDistance("cat", ""));
+
+        assertEquals(3, levenshteinDistance("dog", "cat"));
+        assertEquals(0, levenshteinDistance("dog", "dog"));
+        assertEquals(4, levenshteinDistance("dog", "mouse"));
+        assertEquals(3, levenshteinDistance("dog", ""));
+
+        assertEquals(5, levenshteinDistance("mouse", "cat"));
+        assertEquals(4, levenshteinDistance("mouse", "dog"));
+        assertEquals(0, levenshteinDistance("mouse", "mouse"));
+        assertEquals(5, levenshteinDistance("mouse", ""));
+
+        assertEquals(3, levenshteinDistance("", "cat"));
+        assertEquals(3, levenshteinDistance("", "dog"));
+        assertEquals(5, levenshteinDistance("", "mouse"));
+        assertEquals(0, levenshteinDistance("", ""));
+    }
+
+    public void testLevenshteinCompareRows() {
+
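+        // Judging by these expectations, the fieldwise comparison sums the per-field distances
+        // but discards the largest single contribution, so one wildly different field cannot
+        // dominate the score.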
+        assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog")));
+        assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat")));
+        assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat")));
+        assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat")));
+        assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat")));
+        assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse")));
+        assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog")));
+    }
+
+    public void testLineHasUnescapedQuote() {
+
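+        // A quote is escaped by doubling it within a quoted field; a quote in the middle of an
+        // otherwise unquoted token is unescaped. The same rules apply regardless of the column
+        // separator, as the tab-separated cases below show.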
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,b,c\"", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,\"b\"\"\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a,b,\"c\"\"\"", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\",b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\",b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a,\"\"b\",c", CsvPreference.EXCEL_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words,b,c", CsvPreference.EXCEL_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\",b,c", CsvPreference.EXCEL_PREFERENCE));
+
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\tb\tc\"", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\t\"b\"\"\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("a\tb\t\"c\"\"\"", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"\"\"a\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\"\"\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertFalse(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("\"a\t\"\"b\"\tc", CsvPreference.TAB_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words\tb\tc", CsvPreference.TAB_PREFERENCE));
+        assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\"\tb\tc", CsvPreference.TAB_PREFERENCE));
+    }
+
+    public void testRowContainsDuplicateNonEmptyValues() {
+
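+        // Repeated empty strings are tolerated, since trailing empty columns are common, but
+        // any repeated non-empty value counts as a duplicate.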
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c")));
+        assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a")));
+        assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", "")));
+        assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", "")));
+    }
+}

+ 19 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderFactoryTests.java

@@ -0,0 +1,19 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class TextLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();
+
+    // No need to check JSON, XML, CSV, TSV, semi-colon separated values or pipe
+    // separated values because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 245 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinderTests.java

@@ -0,0 +1,245 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.util.set.Sets;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+
+import java.util.Collections;
+import java.util.Set;
+
+public class TextLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new TextLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenElasticsearchLog() throws Exception {
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
+        assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertNull(structure.getSeparator());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
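+            // With no preface the line must begin with the timestamp itself, so the pattern's
+            // leading \b word boundary is superseded by the ^ anchor.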
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
+                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.emptySet(), simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenOneEmptyPreface() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""),
+                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton(""), simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenOneLogLevelPreface() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(Collections.singleton("[ERROR] ["), simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenManyLogLevelPrefaces() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            Set<String> prefaces = Sets.newHashSet("[ERROR] [", "[DEBUG] [");
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^\\[.*?\\] \\[" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenManyHostnamePrefaces() {
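+        // The varying part of the prefaces should collapse to a non-greedy wildcard while the
+        // common trailing delimiter (here the pipe) is kept as a literal.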
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            Set<String> prefaces = Sets.newHashSet("host-1.acme.com|", "my_host.elastic.co|");
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^.*?\\|" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
+        }
+    }
+
+    public void testCreateMultiLineMessageStartRegexGivenManyPrefacesIncludingEmpty() {
+        for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
+            Set<String> prefaces = Sets.newHashSet("", "[non-standard] ");
+            String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
+            assertEquals("^.*?" + simpleDateRegex,
+                TextLogStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex));
+        }
+    }
+
+    public void testMostLikelyTimestampGivenAllSame() {
+        String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node               ] [node-0] initializing ...\n" +
+            "[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment    ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " +
+                "net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" +
+            "[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment    ] [node-0] heap size [494.9mb], " +
+                "compressed ordinary object pointers [true]\n" +
+            "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node               ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" +
+            "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node               ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " +
+                "build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " +
+                "JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" +
+            "[2018-06-27T11:59:22,205][INFO ][o.e.n.Node               ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " +
+                "-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " +
+                "-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " +
+                "-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " +
+                "-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " +
+                "-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " +
+                "-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " +
+                "-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " +
+                "-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " +
+                "-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " +
+                "-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " +
+                "-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" +
+            "[2018-06-27T11:59:22,205][WARN ][o.e.n.Node               ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " +
+                "Elasticsearch and is not suitable for production\n" +
+            "[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [aggs-matrix-stats]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [analysis-common]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [ingest-common]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [lang-expression]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [lang-mustache]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [lang-painless]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [mapper-extras]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [parent-join]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [percolator]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [rank-eval]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [reindex]\n" +
+            "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [repository-url]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [transport-netty4]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-core]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-deprecation]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-graph]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-logstash]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-ml]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-monitoring]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-rollup]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-security]\n" +
+            "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-sql]\n" +
+            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-upgrade]\n" +
+            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-watcher]\n" +
+            "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] no plugins loaded\n";
+
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
+        assertNotNull(mostLikelyMatch);
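+        // The arguments to TimestampMatch here are the candidate index, preface, date format,
+        // simple pattern, grok pattern name and epilogue.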
+        assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
+            mostLikelyMatch.v1());
+    }
+
+    public void testMostLikelyTimestampGivenExceptionTrace() {
+        String sample = "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " +
+                "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" +
+            "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " +
+                "encoding is longer than the max length 32766), all of which were skipped.  Please correct the analyzer to not produce " +
+                "such terms.  The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " +
+                "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " +
+                "in length; got 49023\n" +
+            "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " +
+                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " +
+                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " +
+                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " +
+                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
+                "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
+                "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
+                "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
+                "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
+                "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
+                "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" +
+                "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" +
+                "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
+                ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
+                ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
+                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+            "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" +
+            "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" +
+            "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n";
+
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogStructureFinder.mostLikelyTimestamp(sample.split("\n"));
+        assertNotNull(mostLikelyMatch);
+
+        // Even though many lines contain a timestamp near the end (in the Lucene version
+        // information), those timestamps occur so late in their lines that the weighting
+        // should favour the timestamp near the beginning of the first line
+        assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
+            mostLikelyMatch.v1());
+    }
+}

+ 242 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TimestampFormatFinderTests.java

@@ -0,0 +1,242 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.xpack.ml.logstructurefinder.TimestampFormatFinder.TimestampMatch;
+import org.joda.time.DateTime;
+import org.joda.time.DateTimeZone;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+import org.joda.time.format.ISODateTimeFormat;
+
+import java.util.Arrays;
+import java.util.Locale;
+
+public class TimestampFormatFinderTests extends LogStructureTestCase {
+
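+    // The TimestampMatch constructor arguments used throughout these tests appear to be:
+    // (candidate index, text before the match, Joda date format(s), simple regex,
+    // Grok pattern name, text after the match)
+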
+    public void testFindFirstMatchGivenNoMatch() {
+
+        assertNull(TimestampFormatFinder.findFirstMatch(""));
+        assertNull(TimestampFormatFinder.findFirstMatch("no timestamps in here"));
+        assertNull(TimestampFormatFinder.findFirstMatch(":::"));
+        assertNull(TimestampFormatFinder.findFirstMatch("/+"));
+    }
+
+    public void testFindFirstMatchGivenOnlyIso8601() {
+
+        TimestampMatch expected = new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
+            "");
+
+        checkAndValidateDateFormat(expected, "2018-05-15T16:14:56,374Z", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+0100", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374+01:00", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56,374", 1526400896374L);
+        checkAndValidateDateFormat(expected, "2018-05-15T16:14:56Z", 1526400896000L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+0100", 1526400896000L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56+01:00", 1526400896000L);
+        checkAndValidateDateFormat(expected, "2018-05-15T17:14:56", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56,374Z",
+            1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+0100",
+            1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(2, "", "YYYY-MM-dd HH:mm:ss,SSSZZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+01:00",
+            1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss,SSS",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374", 1526400896374L);
+        checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56Z", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+0100", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(5, "", "YYYY-MM-dd HH:mm:ssZZ",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+01:00", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56", 1526400896000L);
+    }
+
+    public void testFindFirstMatchGivenOnlyKnownDateFormat() {
+
+        // Note: some of the time formats give millisecond accuracy, some second accuracy, and some only minute accuracy
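+        // For example, "May 15 17:14:56.725" below resolves to the millisecond,
+        // "Tue May 15 2018 16:14:56 UTC" to the second, and "Tue May 15 2018 16:14 UTC"
+        // only to the minute, as reflected in the trailing digits of the expected epoch values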
+
+        checkAndValidateDateFormat(new TimestampMatch(0, "", "YYYY-MM-dd HH:mm:ss,SSS Z",
+                "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TOMCAT_DATESTAMP", ""), "2018-05-15 17:14:56,374 +0100",
+            1526400896374L);
+
+        checkAndValidateDateFormat(new TimestampMatch(8, "", "EEE MMM dd YYYY HH:mm:ss zzz",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
+            "Tue May 15 2018 16:14:56 UTC", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(9, "", "EEE MMM dd YYYY HH:mm zzz",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""),
+            "Tue May 15 2018 16:14 UTC", 1526400840000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(10, "", "EEE, dd MMM YYYY HH:mm:ss ZZ",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
+            "Tue, 15 May 2018 17:14:56 +01:00", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(11, "", "EEE, dd MMM YYYY HH:mm:ss Z",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
+            "Tue, 15 May 2018 17:14:56 +0100", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(12, "", "EEE, dd MMM YYYY HH:mm ZZ",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""),
+            "Tue, 15 May 2018 17:14 +01:00", 1526400840000L);
+        checkAndValidateDateFormat(new TimestampMatch(13, "", "EEE, dd MMM YYYY HH:mm Z",
+                "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14 +0100",
+            1526400840000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(14, "", "EEE MMM dd HH:mm:ss zzz YYYY",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
+            "Tue May 15 16:14:56 UTC 2018", 1526400896000L);
+        checkAndValidateDateFormat(new TimestampMatch(15, "", "EEE MMM dd HH:mm zzz YYYY",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""),
+            "Tue May 15 16:14 UTC 2018", 1526400840000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(16, "", "YYYYMMddHHmmss", "\\b\\d{14}\\b", "DATESTAMP_EVENTLOG", ""),
+            "20180515171456", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(17, "", "EEE MMM dd HH:mm:ss YYYY",
+                "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "HTTPDERROR_DATE", ""),
+            "Tue May 15 17:14:56 2018", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(18, "", Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM  d HH:mm:ss.SSS"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56.725", 1526400896725L);
+        checkAndValidateDateFormat(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+            "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(20, "", "dd/MMM/YYYY:HH:mm:ss Z",
+                "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", ""), "15/May/2018:17:14:56 +0100", 1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
+                "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", ""), "May 15, 2018 5:14:56 PM",
+            1526400896000L);
+
+        checkAndValidateDateFormat(new TimestampMatch(22, "", Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"),
+                "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "CISCOTIMESTAMP", ""), "May 15 2018 17:14:56",
+            1526400896000L);
+    }
+
+    public void testFindFirstMatchGivenOnlySystemDate() {
+
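+        // Judging by the patterns below: UNIX_MS matches 13-digit epoch milliseconds,
+        // UNIX matches 10-digit epoch seconds (optionally with a fraction), and TAI64N
+        // matches a 24-hex-digit TAI64N timestamp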
+        assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstMatch("1526400896374"));
+        assertEquals(new TimestampMatch(23, "", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstFullMatch("1526400896374"));
+
+        assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
+            TimestampFormatFinder.findFirstMatch("1526400896.736"));
+        assertEquals(new TimestampMatch(24, "", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""),
+            TimestampFormatFinder.findFirstFullMatch("1526400896.736"));
+        assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstMatch("1526400896"));
+        assertEquals(new TimestampMatch(25, "", "UNIX", "\\b\\d{10}\\b", "POSINT", ""),
+            TimestampFormatFinder.findFirstFullMatch("1526400896"));
+
+        assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
+            TimestampFormatFinder.findFirstMatch("400000005afb159a164ac980"));
+        assertEquals(new TimestampMatch(26, "", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""),
+            TimestampFormatFinder.findFirstFullMatch("400000005afb159a164ac980"));
+    }
+
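+    /**
+     * Asserts that both findFirstMatch and findFirstFullMatch return {@code expected} for
+     * {@code text}, then parses {@code text} with each of the expected Joda formats in turn,
+     * requiring at least one to yield {@code expectedEpochMs}, and finally checks that the
+     * expected simple regex pattern matches {@code text}.
+     */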
+    private void checkAndValidateDateFormat(TimestampMatch expected, String text, long expectedEpochMs) {
+
+        assertEquals(expected, TimestampFormatFinder.findFirstMatch(text));
+        assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text));
+
+        // All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London
+        DateTimeZone zone = DateTimeZone.forID("Europe/London");
+        DateTime parsed;
+        for (int i = 0; i < expected.dateFormats.size(); ++i) {
+            try {
+                String dateFormat = expected.dateFormats.get(i);
+                switch (dateFormat) {
+                    case "ISO8601":
+                        parsed = ISODateTimeFormat.dateTimeParser().withZone(zone).withDefaultYear(2018).parseDateTime(text);
+                        break;
+                    default:
+                        DateTimeFormatter parser = DateTimeFormat.forPattern(dateFormat).withZone(zone).withLocale(Locale.UK);
+                        parsed = parser.withDefaultYear(2018).parseDateTime(text);
+                        break;
+                }
+                if (expectedEpochMs == parsed.getMillis()) {
+                    break;
+                }
+                // If the last format parses to the wrong time, fail with a detailed assertion
+                if (i == expected.dateFormats.size() - 1) {
+                    assertEquals(expectedEpochMs, parsed.getMillis());
+                }
+            } catch (RuntimeException e) {
+                // If the last format throws, rethrow so the test fails
+                if (i == expected.dateFormats.size() - 1) {
+                    throw e;
+                }
+            }
+        }
+        assertTrue(expected.simplePattern.matcher(text).find());
+    }
+
+    public void testFindFirstMatchGivenRealLogMessages() {
+
+        assertEquals(new TimestampMatch(7, "[", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
+                "][INFO ][o.e.e.NodeEnvironment    ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"),
+            TimestampFormatFinder.findFirstMatch("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment    ] [node-0] " +
+                "heap size [3.9gb], compressed ordinary object pointers [true]"));
+
+        assertEquals(new TimestampMatch(20, "192.168.62.101 - - [", "dd/MMM/YYYY:HH:mm:ss Z",
+                "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE",
+                "] \"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"),
+            TimestampFormatFinder.findFirstMatch("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " +
+                "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"));
+
+        assertEquals(new TimestampMatch(21, "", "MMM dd, YYYY K:mm:ss a",
+                "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP",
+                " org.apache.tomcat.util.http.Parameters processParameters"),
+            TimestampFormatFinder.findFirstMatch("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters"));
+
+        assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+                "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", " esxi1.acme.com Vpxa: " +
+                    "[3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"),
+            TimestampFormatFinder.findFirstMatch("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " +
+                "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"));
+
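+        // Note in particular the tab-separated sample below, which contains two ISO8601
+        // timestamps: the first one is matched and the second is left in the trailing text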
+        assertEquals(new TimestampMatch(7, "559550912540598297\t", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}",
+                "TIMESTAMP_ISO8601",
+                "\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"),
+            TimestampFormatFinder.findFirstMatch("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" +
+                "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"));
+
+        assertEquals(new TimestampMatch(19, "", Arrays.asList("MMM dd HH:mm:ss", "MMM  d HH:mm:ss"),
+                "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP",
+                " dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53"),
+            TimestampFormatFinder.findFirstMatch("Sep  8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " +
+                "'www.elastic.co/A/IN': 95.110.68.206#53"));
+
+        assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}",
+                "TIMESTAMP_ISO8601",
+                "|INFO    |VirtualServer |1  |client  'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " +
+                    "'User1'(id:2) in channel '3er Instanz'(id:2)"),
+            TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO    |VirtualServer |1  |client " +
+                " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)"));
+    }
+
+    public void testInterpretFractionalSeconds() {
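+        // Each result is (fractional-second separator, number of fractional digits);
+        // (',', 0) appears to be the default returned when there is no fractional component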
+        assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("Sep  8 11:55:35"));
+        assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("29/Jun/2016:12:11:31 +0000"));
+        assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368"));
+        assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438"));
+        assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764"));
+        assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764"));
+        assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368Z"));
+        assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438Z"));
+        assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764Z"));
+        assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764Z"));
+        assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368 Z"));
+        assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438 Z"));
+        assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764 Z"));
+        assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764 Z"));
+    }
+}

+ 33 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/TsvLogStructureFinderFactoryTests.java

@@ -0,0 +1,33 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class TsvLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new TsvLogStructureFinderFactory();
+
+    // No need to check JSON, XML or CSV because they come earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertTrue(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 43 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderFactoryTests.java

@@ -0,0 +1,43 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+public class XmlLogStructureFinderFactoryTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();
+
+    // No need to check JSON because it comes earlier in the order we check formats
+
+    public void testCanCreateFromSampleGivenXml() {
+
+        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenCsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenTsv() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenSemiColonSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenPipeSeparatedValues() {
+
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_SEPARATED_VALUES_SAMPLE));
+    }
+
+    public void testCanCreateFromSampleGivenText() {
+
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+    }
+}

+ 39 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinderTests.java

@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.Collections;
+
+public class XmlLogStructureFinderTests extends LogStructureTestCase {
+
+    private LogStructureFinderFactory factory = new XmlLogStructureFinderFactory();
+
+    public void testCreateConfigsGivenGoodXml() throws Exception {
+        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        LogStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker);
+
+        LogStructure structure = structureFinder.getStructure();
+
+        assertEquals(LogStructure.Format.XML, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
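+        // XML_SAMPLE is presumably log4j XML event output, hence the <log4j:event multiline
+        // start pattern and the epoch-millis timestamp format expected below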
+        assertEquals("^\\s*<log4j:event", structure.getMultilineStartPattern());
+        assertNull(structure.getSeparator());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertNull(structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("UNIX_MS"), structure.getTimestampFormats());
+    }
+}