Browse Source

[ML] Allow a certain number of ill-formatted rows when delimited format is specified (#55735)

While it is good to not be lenient when attempting to guess the file format, it is frustrating to users when they KNOW it is CSV but there are a few ill-formatted rows in the file (via some entry error, etc.).

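This commit allows a small fraction of the sample rows to be considered "bad": up to 10% when the delimiter is explicitly specified, and up to 5% when only other delimited-format options (the format itself, the quote character, or field trimming) are specified. These rows are effectively ignored while guessing the format.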

This allowance for bad rows is only applied when the user has specified delimited formatting options, as the structure finder needs some guidance on what a "bad row" actually means.

related to https://github.com/elastic/elasticsearch/issues/38890
Benjamin Trent 5 years ago
parent
commit
fd554d95e4
15 changed files with 191 additions and 62 deletions
  1. 30 7
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java
  2. 9 2
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java
  3. 3 1
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java
  4. 6 2
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java
  5. 1 1
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java
  6. 1 1
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java
  7. 1 1
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java
  8. 14 14
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java
  9. 104 11
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java
  10. 1 1
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java
  11. 7 7
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java
  12. 1 1
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java
  13. 6 6
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java
  14. 6 6
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java
  15. 1 1
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java

+ 30 - 7
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java

@@ -5,6 +5,7 @@
  */
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
+import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats;
 import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
@@ -34,7 +35,6 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
     private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
     private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
     private static final int LONG_FIELD_THRESHOLD = 100;
-
     private final List<String> sampleMessages;
     private final FileStructure structure;
 
@@ -80,6 +80,11 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
         for (int index = isHeaderInFile ? 1 : 0; index < rows.size(); ++index) {
             List<String> row = rows.get(index);
             int lineNumber = lineNumbers.get(index);
+            // Indicates an ill-formatted row. We allow a certain number of these
+            if (row.size() != columnNames.length) {
+                prevMessageEndLineNumber = lineNumber;
+                continue;
+            }
             Map<String, String> sampleRecord = new LinkedHashMap<>();
             Util.filterListToMap(sampleRecord, columnNames,
                 trimFields ? row.stream().map(field -> (field == null) ? null : field.trim()).collect(Collectors.toList()) : row);
@@ -488,7 +493,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
     }
 
     static boolean canCreateFromSample(List<String> explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference,
-                                       String formatName) {
+                                       String formatName, double allowedFractionOfBadLines) {
 
         // Logstash's CSV parser won't tolerate fields where just part of the
         // value is quoted, whereas SuperCSV will, hence this extra check
@@ -501,11 +506,13 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
             }
         }
 
+        int numberOfLinesInSample = sampleLines.length;
         try (CsvListReader csvReader = new CsvListReader(new StringReader(sample), csvPreference)) {
 
             int fieldsInFirstRow = -1;
             int fieldsInLastRow = -1;
 
+            List<Integer> illFormattedRows = new ArrayList<>();
             int numberOfRows = 0;
             try {
                 List<String> row;
@@ -529,11 +536,27 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
                         --fieldsInThisRow;
                     }
 
-                    if (fieldsInLastRow != fieldsInFirstRow) {
-                        explanation.add("Not " + formatName + " because row [" + (numberOfRows - 1) +
-                            "] has a different number of fields to the first row: [" + fieldsInFirstRow + "] and [" +
-                            fieldsInLastRow + "]");
-                        return false;
+                    // TODO: might be good one day to gather a distribution of the most common field counts
+                    // But, this would require iterating (or at least sampling) all the lines.
+                    if (fieldsInThisRow != fieldsInFirstRow) {
+                        illFormattedRows.add(numberOfRows - 1);
+                        // This calculation is complicated by the possibility of multi-lined CSV columns
+                        // `getLineNumber` is a current count of lines, regardless of row count, so
+                        // this formula is just an approximation, but gets more accurate the further
+                        // through the sample you are.
+                        double totalNumberOfRows = (numberOfRows + numberOfLinesInSample - csvReader.getLineNumber());
+                        // We should only allow a certain percentage of ill-formatted rows,
+                        // as they may have downstream effects
+                        if (illFormattedRows.size() > Math.ceil(allowedFractionOfBadLines * totalNumberOfRows)) {
+                            explanation.add(new ParameterizedMessage(
+                                "Not {} because {} or more rows did not have the same number of fields as the first row ({}). Bad rows {}",
+                                formatName,
+                                illFormattedRows.size(),
+                                fieldsInFirstRow,
+                                illFormattedRows).getFormattedMessage());
+                            return false;
+                        }
+                        continue;
                     }
 
                     fieldsInLastRow = fieldsInThisRow;

+ 9 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java

@@ -14,6 +14,8 @@ import java.util.Locale;
 
 public class DelimitedFileStructureFinderFactory implements FileStructureFinderFactory {
 
+    static final double DELIMITER_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES = 0.10d;
+    static final double FORMAT_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES = 0.05d;
     private final CsvPreference csvPreference;
     private final int minFieldsPerRow;
     private final boolean trimFields;
@@ -44,7 +46,7 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF
      * it could have been truncated when the file was sampled.
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
         String formatName;
         switch ((char) csvPreference.getDelimiterChar()) {
             case ',':
@@ -57,7 +59,12 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF
                 formatName = Character.getName(csvPreference.getDelimiterChar()).toLowerCase(Locale.ROOT) + " delimited values";
                 break;
         }
-        return DelimitedFileStructureFinder.canCreateFromSample(explanation, sample, minFieldsPerRow, csvPreference, formatName);
+        return DelimitedFileStructureFinder.canCreateFromSample(explanation,
+            sample,
+            minFieldsPerRow,
+            csvPreference,
+            formatName,
+            allowedFractionOfBadLines);
     }
 
     @Override

+ 3 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java

@@ -25,10 +25,12 @@ public interface FileStructureFinderFactory {
      * @param explanation List of reasons for making decisions.  May contain items when passed and new reasons
      *                    can be appended by this method.
      * @param sample A sample from the file to be ingested.
+     * @param allowedFractionOfBadLines How many lines of the passed sample are allowed to be considered "bad".
+     *                                  Provided as a fraction from interval [0, 1]
      * @return <code>true</code> if this factory can create an appropriate
      *         file structure given the sample; otherwise <code>false</code>.
      */
-    boolean canCreateFromSample(List<String> explanation, String sample);
+    boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines);
 
     /**
      * Create an object representing the structure of a file.

+ 6 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java

@@ -11,6 +11,7 @@ import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchTimeoutException;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
@@ -474,14 +475,17 @@ public final class FileStructureFinderManager {
         Character quote = overrides.getQuote();
         Boolean shouldTrimFields = overrides.getShouldTrimFields();
         List<FileStructureFinderFactory> factories;
+        double allowedFractionOfBadLines = 0.0;
         if (delimiter != null) {
+            allowedFractionOfBadLines = DelimitedFileStructureFinderFactory.DELIMITER_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES;
 
             // If a precise delimiter is specified, we only need one structure finder
             // factory, and we'll tolerate as little as one column in the input
             factories = Collections.singletonList(new DelimitedFileStructureFinderFactory(delimiter, (quote == null) ? '"' : quote, 1,
                 (shouldTrimFields == null) ? (delimiter == '|') : shouldTrimFields));
 
-        } else if (quote != null || shouldTrimFields != null) {
+        } else if (quote != null || shouldTrimFields != null || FileStructure.Format.DELIMITED.equals(overrides.getFormat())) {
+            allowedFractionOfBadLines = DelimitedFileStructureFinderFactory.FORMAT_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES;
 
             // The delimiter is not specified, but some other aspect of delimited files is,
             // so clone our default delimited factories altering the overridden values
@@ -499,7 +503,7 @@ public final class FileStructureFinderManager {
 
         for (FileStructureFinderFactory factory : factories) {
             timeoutChecker.check("high level format detection");
-            if (factory.canCreateFromSample(explanation, sample)) {
+            if (factory.canCreateFromSample(explanation, sample, allowedFractionOfBadLines)) {
                 return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, lineMergeSizeLimit, overrides,
                     timeoutChecker);
             }

+ 1 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java

@@ -30,7 +30,7 @@ public class NdJsonFileStructureFinderFactory implements FileStructureFinderFact
      * documents must be non-empty, to prevent lines containing "{}" from matching.
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
 
         int completeDocCount = 0;
 

+ 1 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java

@@ -25,7 +25,7 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac
      * non-blank lines.
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
         if (sample.indexOf('\n') < 0) {
             explanation.add("Not text because sample contains no newlines");
             return false;

+ 1 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java

@@ -43,7 +43,7 @@ public class XmlFileStructureFinderFactory implements FileStructureFinderFactory
      * necessarily have to be complete (as the sample could have truncated it).
      */
     @Override
-    public boolean canCreateFromSample(List<String> explanation, String sample) {
+    public boolean canCreateFromSample(List<String> explanation, String sample, double allowedFractionOfBadLines) {
 
         int completeDocCount = 0;
         String commonRootElementName = null;

+ 14 - 14
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java

@@ -16,66 +16,66 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
 
     public void testCanCreateCsvFromSampleGivenCsv() {
 
-        assertTrue(csvFactory.canCreateFromSample(explanation, CSV_SAMPLE));
+        assertTrue(csvFactory.canCreateFromSample(explanation, CSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenTsv() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, TSV_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenSemiColonDelimited() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenPipeDelimited() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateCsvFromSampleGivenText() {
 
-        assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(csvFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 
     // TSV - no need to check NDJSON, XML or CSV because they come earlier in the order we check formats
 
     public void testCanCreateTsvFromSampleGivenTsv() {
 
-        assertTrue(tsvFactory.canCreateFromSample(explanation, TSV_SAMPLE));
+        assertTrue(tsvFactory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateTsvFromSampleGivenSemiColonDelimited() {
 
-        assertFalse(tsvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertFalse(tsvFactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateTsvFromSampleGivenPipeDelimited() {
 
-        assertFalse(tsvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(tsvFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateTsvFromSampleGivenText() {
 
-        assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(tsvFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 
     // Semi-colon delimited - no need to check NDJSON, XML, CSV or TSV because they come earlier in the order we check formats
 
     public void testCanCreateSemiColonDelimitedFromSampleGivenSemiColonDelimited() {
 
-        assertTrue(semiColonDelimitedfactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertTrue(semiColonDelimitedfactory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateSemiColonDelimitedFromSampleGivenPipeDelimited() {
 
-        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateSemiColonDelimitedFromSampleGivenText() {
 
-        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(semiColonDelimitedfactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 
     // Pipe delimited - no need to check NDJSON, XML, CSV, TSV or semi-colon delimited
@@ -83,11 +83,11 @@ public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestC
 
     public void testCanCreatePipeDelimitedFromSampleGivenPipeDelimited() {
 
-        assertTrue(pipeDelimitedFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertTrue(pipeDelimitedFactory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreatePipeDelimitedFromSampleGivenText() {
 
-        assertFalse(pipeDelimitedFactory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(pipeDelimitedFactory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 }

+ 104 - 11
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java

@@ -37,7 +37,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         String sample = "time,message\n" +
             "2018-05-17T13:41:23,hello\n" +
             "2018-05-17T13:41:32,hello again\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -65,6 +65,99 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
     }
 
+    public void testCreateConfigsGivenIncompleteCsv() throws Exception {
+        String sample = "time,message\n" +
+            "2018-05-17T13:41:23,hello\n" +
+            "badrow\n" + // REALLY bad row
+            "2018-05-17T13:41:25,hello\n" +
+            "2018-05-17T13:41:26,hello\n" +
+            "2018-05-17T13:41:27,hello\n" +
+            "2018-05-17T13:41:28,hello\n" +
+            "2018-05-17T13:41:29,hello\n" +
+            "2018-05-17T13:41:30,hello\n" +
+            "2018-05-17T13:41:31,hello\n" +
+            "2018-05-17T13:41:32,hello\n" +
+            "2018-05-17T13:41:35\n" + // Just missing the column
+            "2018-05-17T13:41:33,hello again\n";
+        assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05));
+        assertTrue("assertion failed. Explanation " + explanation,
+            csvFactory.canCreateFromSample(explanation, sample, 0.10));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
+
+
+        FileStructure structure = structureFinder.getStructure();
+
+        assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
+        assertEquals("time", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
+        assertEquals(Arrays.asList("time", "message"), structure.getColumnNames());
+        assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
+        assertEquals(structure.getNumMessagesAnalyzed(), 10);
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getMultilineStartPattern());
+        assertNull(structure.getShouldTrimFields());
+        assertNull(structure.getGrokPattern());
+    }
+
+    public void testCreateConfigsGivenIncompleteCsvWithMultiLinedRows() throws Exception {
+        String sample = "time,message\n" +
+            "2018-05-17T13:41:23,\"hello\nnew line\"\n" +
+            "\"badrow\n\n\n\n\"\n" + // REALLY bad row
+            "2018-05-17T13:41:25,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:26,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:27,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:28,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:29,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:30,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:31,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:32,\"hello\nnew line\"\n" +
+            "2018-05-17T13:41:35\n" + // Just missing the column
+            "2018-05-17T13:41:33,\"hello again\nnew line\"\n";
+        assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05));
+        assertTrue("assertion failed. Explanation " + explanation,
+            csvFactory.canCreateFromSample(explanation, sample, 0.10));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT,
+            FileStructureOverrides.builder().setQuote('"').build(),
+            NOOP_TIMEOUT_CHECKER);
+
+        FileStructure structure = structureFinder.getStructure();
+
+        assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
+        assertEquals("time", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
+        assertEquals(Arrays.asList("time", "message"), structure.getColumnNames());
+        assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
+        assertEquals(structure.getNumMessagesAnalyzed(), 10);
+        assertTrue(structure.getHasHeaderRow());
+        assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertNull(structure.getShouldTrimFields());
+        assertNull(structure.getGrokPattern());
+    }
+
     public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exception {
 
         FileStructureOverrides overrides = FileStructureOverrides.builder().setColumnNames(Arrays.asList("my_time", "my_message")).build();
@@ -72,7 +165,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         String sample = "time,message\n" +
             "2018-05-17T13:41:23,hello\n" +
             "2018-05-17T13:41:32,hello again\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -109,7 +202,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         String sample = "time,message\n" +
             "2018-05-17T13:41:23,hello\n" +
             "2018-05-17T13:41:32,hello again\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -142,7 +235,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "2018-05-17T13:41:23,\"hello\n" +
             "world\",1\n" +
             "2019-01-18T14:46:57,\"hello again\n"; // note that this last record is truncated
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -177,7 +270,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -222,7 +315,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -262,7 +355,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -309,7 +402,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
             "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -347,7 +440,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
             "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
             "\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -387,7 +480,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" +
             "25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" +
             "25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n";
-        assertTrue(tsvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(tsvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -421,7 +514,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         String sample = "time.iso8601,message\n" +
             "2018-05-17T13:41:23,hello\n" +
             "2018-05-17T13:41:32,hello again\n";
-        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);

+ 1 - 1
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java

@@ -14,7 +14,7 @@ public class NdJsonFileStructureFinderTests extends FileStructureTestCase {
     private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory();
 
     public void testCreateConfigsGivenGoodJson() throws Exception {
-        assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);

+ 7 - 7
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java

@@ -11,36 +11,36 @@ public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCa
 
     public void testCanCreateFromSampleGivenNdJson() {
 
-        assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, NDJSON_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenXml() {
 
-        assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, XML_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenCsv() {
 
-        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenTsv() {
 
-        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenSemiColonDelimited() {
 
-        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenPipeDelimited() {
 
-        assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenText() {
 
-        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 }

+ 1 - 1
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java

@@ -14,6 +14,6 @@ public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCas
 
     public void testCanCreateFromSampleGivenText() {
 
-        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 }

+ 6 - 6
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java

@@ -30,7 +30,7 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
             "continuation line 2.4\n" +
             "2019-05-16 16:56:14 line 3 abcdefghijklmnopqrstuvwxyz\n";
 
-        assertTrue(factory.canCreateFromSample(explanation, sample));
+        assertTrue(factory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -44,7 +44,7 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
     }
 
     public void testCreateConfigsGivenElasticsearchLog() throws Exception {
-        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -85,7 +85,7 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
 
         FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("M/d/yyyy h:mma").build();
 
-        assertTrue(factory.canCreateFromSample(explanation, sample));
+        assertTrue(factory.canCreateFromSample(explanation, sample, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -121,7 +121,7 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
 
         FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("my_time").build();
 
-        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -158,7 +158,7 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
         FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{TIMESTAMP_ISO8601:timestamp}\\]" +
             "\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}").build();
 
-        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
@@ -199,7 +199,7 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
         FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{LOGLEVEL:loglevel} *\\]" +
             "\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] \\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}").build();
 
-        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);

+ 6 - 6
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java

@@ -13,31 +13,31 @@ public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase {
 
     public void testCanCreateFromSampleGivenXml() {
 
-        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenCsv() {
 
-        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, CSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenTsv() {
 
-        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, TSV_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenSemiColonDelimited() {
 
-        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, SEMI_COLON_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenPipeDelimited() {
 
-        assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, PIPE_DELIMITED_SAMPLE, 0.0));
     }
 
     public void testCanCreateFromSampleGivenText() {
 
-        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+        assertFalse(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0));
     }
 }

+ 1 - 1
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java

@@ -14,7 +14,7 @@ public class XmlFileStructureFinderTests extends FileStructureTestCase {
     private FileStructureFinderFactory factory = new XmlFileStructureFinderFactory();
 
     public void testCreateConfigsGivenGoodXml() throws Exception {
-        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE));
+        assertTrue(factory.canCreateFromSample(explanation, XML_SAMPLE, 0.0));
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);