1
0
Эх сурвалжийг харах

[ML] Allow overrides for some file structure detection decisions (#33630)

This change modifies the file structure detection functionality
so that some of its decisions can be overridden with user-supplied
values.

The fields that can be overridden are:

- charset
- format
- has_header_row
- column_names
- delimiter
- quote
- should_trim_fields
- grok_pattern
- timestamp_field
- timestamp_format

If an override makes it impossible to find the file structure, the
endpoint will return an error.
David Roberts 7 жил өмнө
parent
commit
568ac10ca6
30 өөрчлөгдсөн 1668 нэмэгдсэн , 333 устгасан
  1. 215 2
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java
  2. 57 26
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java
  3. 92 2
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java
  4. 1 0
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java
  5. 3 2
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java
  6. 64 26
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java
  7. 17 5
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java
  8. 15 3
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java
  9. 69 16
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java
  10. 205 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java
  11. 45 14
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java
  12. 72 42
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java
  13. 7 3
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java
  14. 9 3
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java
  15. 26 15
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java
  16. 11 2
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java
  17. 68 20
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java
  18. 6 4
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java
  19. 9 2
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java
  20. 11 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java
  21. 4 4
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java
  22. 187 15
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java
  23. 50 12
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java
  24. 72 22
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java
  25. 58 3
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java
  26. 3 1
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java
  27. 193 86
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java
  28. 3 1
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java
  29. 42 1
      x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json
  30. 54 1
      x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml

+ 215 - 2
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java

@@ -22,6 +22,9 @@ import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 
 
 import java.io.IOException;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
 import java.util.Objects;
 import java.util.Objects;
 
 
 import static org.elasticsearch.action.ValidateActions.addValidationError;
 import static org.elasticsearch.action.ValidateActions.addValidationError;
@@ -109,8 +112,32 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
     public static class Request extends ActionRequest {
     public static class Request extends ActionRequest {
 
 
         public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
         public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
+        public static final ParseField CHARSET = FileStructure.CHARSET;
+        public static final ParseField FORMAT = FileStructure.FORMAT;
+        public static final ParseField COLUMN_NAMES = FileStructure.COLUMN_NAMES;
+        public static final ParseField HAS_HEADER_ROW = FileStructure.HAS_HEADER_ROW;
+        public static final ParseField DELIMITER = FileStructure.DELIMITER;
+        public static final ParseField QUOTE = FileStructure.QUOTE;
+        public static final ParseField SHOULD_TRIM_FIELDS = FileStructure.SHOULD_TRIM_FIELDS;
+        public static final ParseField GROK_PATTERN = FileStructure.GROK_PATTERN;
+        // This one is plural in FileStructure, but singular in FileStructureOverrides
+        public static final ParseField TIMESTAMP_FORMAT = new ParseField("timestamp_format");
+        public static final ParseField TIMESTAMP_FIELD = FileStructure.TIMESTAMP_FIELD;
+
+        private static final String ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE =
+            "[%s] may only be specified if [" + FORMAT.getPreferredName() + "] is [%s]";
 
 
         private Integer linesToSample;
         private Integer linesToSample;
+        private String charset;
+        private FileStructure.Format format;
+        private List<String> columnNames;
+        private Boolean hasHeaderRow;
+        private Character delimiter;
+        private Character quote;
+        private Boolean shouldTrimFields;
+        private String grokPattern;
+        private String timestampFormat;
+        private String timestampField;
         private BytesReference sample;
         private BytesReference sample;
 
 
         public Request() {
         public Request() {
@@ -124,6 +151,114 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
             this.linesToSample = linesToSample;
             this.linesToSample = linesToSample;
         }
         }
 
 
+        public String getCharset() {
+            return charset;
+        }
+
+        public void setCharset(String charset) {
+            this.charset = (charset == null || charset.isEmpty()) ? null : charset;
+        }
+
+        public FileStructure.Format getFormat() {
+            return format;
+        }
+
+        public void setFormat(FileStructure.Format format) {
+            this.format = format;
+        }
+
+        public void setFormat(String format) {
+            this.format = (format == null || format.isEmpty()) ? null : FileStructure.Format.fromString(format);
+        }
+
+        public List<String> getColumnNames() {
+            return columnNames;
+        }
+
+        public void setColumnNames(List<String> columnNames) {
+            this.columnNames = (columnNames == null || columnNames.isEmpty()) ? null : columnNames;
+        }
+
+        public void setColumnNames(String[] columnNames) {
+            this.columnNames = (columnNames == null || columnNames.length == 0) ? null : Arrays.asList(columnNames);
+        }
+
+        public Boolean getHasHeaderRow() {
+            return hasHeaderRow;
+        }
+
+        public void setHasHeaderRow(Boolean hasHeaderRow) {
+            this.hasHeaderRow = hasHeaderRow;
+        }
+
+        public Character getDelimiter() {
+            return delimiter;
+        }
+
+        public void setDelimiter(Character delimiter) {
+            this.delimiter = delimiter;
+        }
+
+        public void setDelimiter(String delimiter) {
+            if (delimiter == null || delimiter.isEmpty()) {
+                this.delimiter = null;
+            } else if (delimiter.length() == 1) {
+                this.delimiter = delimiter.charAt(0);
+            } else {
+                throw new IllegalArgumentException(DELIMITER.getPreferredName() + " must be a single character");
+            }
+        }
+
+        public Character getQuote() {
+            return quote;
+        }
+
+        public void setQuote(Character quote) {
+            this.quote = quote;
+        }
+
+        public void setQuote(String quote) {
+            if (quote == null || quote.isEmpty()) {
+                this.quote = null;
+            } else if (quote.length() == 1) {
+                this.quote = quote.charAt(0);
+            } else {
+                throw new IllegalArgumentException(QUOTE.getPreferredName() + " must be a single character");
+            }
+        }
+
+        public Boolean getShouldTrimFields() {
+            return shouldTrimFields;
+        }
+
+        public void setShouldTrimFields(Boolean shouldTrimFields) {
+            this.shouldTrimFields = shouldTrimFields;
+        }
+
+        public String getGrokPattern() {
+            return grokPattern;
+        }
+
+        public void setGrokPattern(String grokPattern) {
+            this.grokPattern = (grokPattern == null || grokPattern.isEmpty()) ? null : grokPattern;
+        }
+
+        public String getTimestampFormat() {
+            return timestampFormat;
+        }
+
+        public void setTimestampFormat(String timestampFormat) {
+            this.timestampFormat = (timestampFormat == null || timestampFormat.isEmpty()) ? null : timestampFormat;
+        }
+
+        public String getTimestampField() {
+            return timestampField;
+        }
+
+        public void setTimestampField(String timestampField) {
+            this.timestampField = (timestampField == null || timestampField.isEmpty()) ? null : timestampField;
+        }
+
         public BytesReference getSample() {
         public BytesReference getSample() {
             return sample;
             return sample;
         }
         }
@@ -132,12 +267,41 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
             this.sample = sample;
             this.sample = sample;
         }
         }
 
 
+        private static ActionRequestValidationException addIncompatibleArgError(ParseField arg, FileStructure.Format format,
+                                                                                ActionRequestValidationException validationException) {
+            return addValidationError(String.format(Locale.ROOT, ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE, arg.getPreferredName(), format),
+                validationException);
+        }
+
         @Override
         @Override
         public ActionRequestValidationException validate() {
         public ActionRequestValidationException validate() {
             ActionRequestValidationException validationException = null;
             ActionRequestValidationException validationException = null;
             if (linesToSample != null && linesToSample <= 0) {
             if (linesToSample != null && linesToSample <= 0) {
                 validationException =
                 validationException =
-                    addValidationError(LINES_TO_SAMPLE.getPreferredName() + " must be positive if specified", validationException);
+                    addValidationError("[" + LINES_TO_SAMPLE.getPreferredName() + "] must be positive if specified", validationException);
+            }
+            if (format != FileStructure.Format.DELIMITED) {
+                if (columnNames != null) {
+                    validationException = addIncompatibleArgError(COLUMN_NAMES, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (hasHeaderRow != null) {
+                    validationException = addIncompatibleArgError(HAS_HEADER_ROW, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (delimiter != null) {
+                    validationException = addIncompatibleArgError(DELIMITER, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (quote != null) {
+                    validationException = addIncompatibleArgError(QUOTE, FileStructure.Format.DELIMITED, validationException);
+                }
+                if (shouldTrimFields != null) {
+                    validationException = addIncompatibleArgError(SHOULD_TRIM_FIELDS, FileStructure.Format.DELIMITED, validationException);
+                }
+            }
+            if (format != FileStructure.Format.SEMI_STRUCTURED_TEXT) {
+                if (grokPattern != null) {
+                    validationException =
+                        addIncompatibleArgError(GROK_PATTERN, FileStructure.Format.SEMI_STRUCTURED_TEXT, validationException);
+                }
             }
             }
             if (sample == null || sample.length() == 0) {
             if (sample == null || sample.length() == 0) {
                 validationException = addValidationError("sample must be specified", validationException);
                 validationException = addValidationError("sample must be specified", validationException);
@@ -149,6 +313,16 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
         public void readFrom(StreamInput in) throws IOException {
         public void readFrom(StreamInput in) throws IOException {
             super.readFrom(in);
             super.readFrom(in);
             linesToSample = in.readOptionalVInt();
             linesToSample = in.readOptionalVInt();
+            charset = in.readOptionalString();
+            format = in.readBoolean() ? in.readEnum(FileStructure.Format.class) : null;
+            columnNames = in.readBoolean() ? in.readList(StreamInput::readString) : null;
+            hasHeaderRow = in.readOptionalBoolean();
+            delimiter = in.readBoolean() ? (char) in.readVInt() : null;
+            quote = in.readBoolean() ? (char) in.readVInt() : null;
+            shouldTrimFields = in.readOptionalBoolean();
+            grokPattern = in.readOptionalString();
+            timestampFormat = in.readOptionalString();
+            timestampField = in.readOptionalString();
             sample = in.readBytesReference();
             sample = in.readBytesReference();
         }
         }
 
 
@@ -156,12 +330,43 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
         public void writeTo(StreamOutput out) throws IOException {
         public void writeTo(StreamOutput out) throws IOException {
             super.writeTo(out);
             super.writeTo(out);
             out.writeOptionalVInt(linesToSample);
             out.writeOptionalVInt(linesToSample);
+            out.writeOptionalString(charset);
+            if (format == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeEnum(format);
+            }
+            if (columnNames == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeCollection(columnNames, StreamOutput::writeString);
+            }
+            out.writeOptionalBoolean(hasHeaderRow);
+            if (delimiter == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeVInt(delimiter);
+            }
+            if (quote == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeVInt(quote);
+            }
+            out.writeOptionalBoolean(shouldTrimFields);
+            out.writeOptionalString(grokPattern);
+            out.writeOptionalString(timestampFormat);
+            out.writeOptionalString(timestampField);
             out.writeBytesReference(sample);
             out.writeBytesReference(sample);
         }
         }
 
 
         @Override
         @Override
         public int hashCode() {
         public int hashCode() {
-            return Objects.hash(linesToSample, sample);
+            return Objects.hash(linesToSample, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat,
+                timestampField, sample);
         }
         }
 
 
         @Override
         @Override
@@ -177,6 +382,14 @@ public class FindFileStructureAction extends Action<FindFileStructureAction.Resp
 
 
             Request that = (Request) other;
             Request that = (Request) other;
             return Objects.equals(this.linesToSample, that.linesToSample) &&
             return Objects.equals(this.linesToSample, that.linesToSample) &&
+                Objects.equals(this.charset, that.charset) &&
+                Objects.equals(this.format, that.format) &&
+                Objects.equals(this.columnNames, that.columnNames) &&
+                Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
+                Objects.equals(this.delimiter, that.delimiter) &&
+                Objects.equals(this.grokPattern, that.grokPattern) &&
+                Objects.equals(this.timestampFormat, that.timestampFormat) &&
+                Objects.equals(this.timestampField, that.timestampField) &&
                 Objects.equals(this.sample, that.sample);
                 Objects.equals(this.sample, that.sample);
         }
         }
     }
     }

+ 57 - 26
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java

@@ -84,25 +84,26 @@ public class FileStructure implements ToXContentObject, Writeable {
 
 
     public static final String EXPLAIN = "explain";
     public static final String EXPLAIN = "explain";
 
 
-    static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
-    static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
-    static final ParseField SAMPLE_START = new ParseField("sample_start");
-    static final ParseField CHARSET = new ParseField("charset");
-    static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker");
-    static final ParseField STRUCTURE = new ParseField("format");
-    static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
-    static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
-    static final ParseField COLUMN_NAMES = new ParseField("column_names");
-    static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
-    static final ParseField DELIMITER = new ParseField("delimiter");
-    static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
-    static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
-    static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field");
-    static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
-    static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
-    static final ParseField MAPPINGS = new ParseField("mappings");
-    static final ParseField FIELD_STATS = new ParseField("field_stats");
-    static final ParseField EXPLANATION = new ParseField("explanation");
+    public static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed");
+    public static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed");
+    public static final ParseField SAMPLE_START = new ParseField("sample_start");
+    public static final ParseField CHARSET = new ParseField("charset");
+    public static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker");
+    public static final ParseField FORMAT = new ParseField("format");
+    public static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern");
+    public static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern");
+    public static final ParseField COLUMN_NAMES = new ParseField("column_names");
+    public static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row");
+    public static final ParseField DELIMITER = new ParseField("delimiter");
+    public static final ParseField QUOTE = new ParseField("quote");
+    public static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
+    public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
+    public static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field");
+    public static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
+    public static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
+    public static final ParseField MAPPINGS = new ParseField("mappings");
+    public static final ParseField FIELD_STATS = new ParseField("field_stats");
+    public static final ParseField EXPLANATION = new ParseField("explanation");
 
 
     public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>("file_structure", false, Builder::new);
     public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>("file_structure", false, Builder::new);
 
 
@@ -112,12 +113,13 @@ public class FileStructure implements ToXContentObject, Writeable {
         PARSER.declareString(Builder::setSampleStart, SAMPLE_START);
         PARSER.declareString(Builder::setSampleStart, SAMPLE_START);
         PARSER.declareString(Builder::setCharset, CHARSET);
         PARSER.declareString(Builder::setCharset, CHARSET);
         PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER);
         PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER);
-        PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE);
+        PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), FORMAT);
         PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN);
         PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN);
         PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN);
         PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN);
         PARSER.declareStringArray(Builder::setColumnNames, COLUMN_NAMES);
         PARSER.declareStringArray(Builder::setColumnNames, COLUMN_NAMES);
         PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW);
         PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW);
         PARSER.declareString((p, c) -> p.setDelimiter(c.charAt(0)), DELIMITER);
         PARSER.declareString((p, c) -> p.setDelimiter(c.charAt(0)), DELIMITER);
+        PARSER.declareString((p, c) -> p.setQuote(c.charAt(0)), QUOTE);
         PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
         PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
         PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN);
         PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN);
         PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD);
         PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD);
@@ -145,6 +147,7 @@ public class FileStructure implements ToXContentObject, Writeable {
     private final List<String> columnNames;
     private final List<String> columnNames;
     private final Boolean hasHeaderRow;
     private final Boolean hasHeaderRow;
     private final Character delimiter;
     private final Character delimiter;
+    private final Character quote;
     private final Boolean shouldTrimFields;
     private final Boolean shouldTrimFields;
     private final String grokPattern;
     private final String grokPattern;
     private final List<String> timestampFormats;
     private final List<String> timestampFormats;
@@ -156,8 +159,8 @@ public class FileStructure implements ToXContentObject, Writeable {
 
 
     public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
     public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
                          Format format, String multilineStartPattern, String excludeLinesPattern, List<String> columnNames,
                          Format format, String multilineStartPattern, String excludeLinesPattern, List<String> columnNames,
-                         Boolean hasHeaderRow, Character delimiter, Boolean shouldTrimFields, String grokPattern, String timestampField,
-                         List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
+                         Boolean hasHeaderRow, Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern,
+                         String timestampField, List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
                          Map<String, FieldStats> fieldStats, List<String> explanation) {
                          Map<String, FieldStats> fieldStats, List<String> explanation) {
 
 
         this.numLinesAnalyzed = numLinesAnalyzed;
         this.numLinesAnalyzed = numLinesAnalyzed;
@@ -171,6 +174,7 @@ public class FileStructure implements ToXContentObject, Writeable {
         this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames));
         this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames));
         this.hasHeaderRow = hasHeaderRow;
         this.hasHeaderRow = hasHeaderRow;
         this.delimiter = delimiter;
         this.delimiter = delimiter;
+        this.quote = quote;
         this.shouldTrimFields = shouldTrimFields;
         this.shouldTrimFields = shouldTrimFields;
         this.grokPattern = grokPattern;
         this.grokPattern = grokPattern;
         this.timestampField = timestampField;
         this.timestampField = timestampField;
@@ -193,6 +197,7 @@ public class FileStructure implements ToXContentObject, Writeable {
         columnNames = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
         columnNames = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
         hasHeaderRow = in.readOptionalBoolean();
         hasHeaderRow = in.readOptionalBoolean();
         delimiter = in.readBoolean() ? (char) in.readVInt() : null;
         delimiter = in.readBoolean() ? (char) in.readVInt() : null;
+        quote = in.readBoolean() ? (char) in.readVInt() : null;
         shouldTrimFields = in.readOptionalBoolean();
         shouldTrimFields = in.readOptionalBoolean();
         grokPattern = in.readOptionalString();
         grokPattern = in.readOptionalString();
         timestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
         timestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
@@ -226,6 +231,12 @@ public class FileStructure implements ToXContentObject, Writeable {
             out.writeBoolean(true);
             out.writeBoolean(true);
             out.writeVInt(delimiter);
             out.writeVInt(delimiter);
         }
         }
+        if (quote == null) {
+            out.writeBoolean(false);
+        } else {
+            out.writeBoolean(true);
+            out.writeVInt(quote);
+        }
         out.writeOptionalBoolean(shouldTrimFields);
         out.writeOptionalBoolean(shouldTrimFields);
         out.writeOptionalString(grokPattern);
         out.writeOptionalString(grokPattern);
         if (timestampFormats == null) {
         if (timestampFormats == null) {
@@ -285,6 +296,10 @@ public class FileStructure implements ToXContentObject, Writeable {
         return delimiter;
         return delimiter;
     }
     }
 
 
+    public Character getQuote() {
+        return quote;
+    }
+
     public Boolean getShouldTrimFields() {
     public Boolean getShouldTrimFields() {
         return shouldTrimFields;
         return shouldTrimFields;
     }
     }
@@ -328,7 +343,7 @@ public class FileStructure implements ToXContentObject, Writeable {
         if (hasByteOrderMarker != null) {
         if (hasByteOrderMarker != null) {
             builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue());
             builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue());
         }
         }
-        builder.field(STRUCTURE.getPreferredName(), format);
+        builder.field(FORMAT.getPreferredName(), format);
         if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) {
         if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) {
             builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern);
             builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern);
         }
         }
@@ -344,6 +359,9 @@ public class FileStructure implements ToXContentObject, Writeable {
         if (delimiter != null) {
         if (delimiter != null) {
             builder.field(DELIMITER.getPreferredName(), String.valueOf(delimiter));
             builder.field(DELIMITER.getPreferredName(), String.valueOf(delimiter));
         }
         }
+        if (quote != null) {
+            builder.field(QUOTE.getPreferredName(), String.valueOf(quote));
+        }
         if (shouldTrimFields != null) {
         if (shouldTrimFields != null) {
             builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue());
             builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue());
         }
         }
@@ -377,8 +395,8 @@ public class FileStructure implements ToXContentObject, Writeable {
     public int hashCode() {
     public int hashCode() {
 
 
         return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
         return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
-            multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField,
-            timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
+            multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern,
+            timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
     }
     }
 
 
     @Override
     @Override
@@ -405,6 +423,7 @@ public class FileStructure implements ToXContentObject, Writeable {
             Objects.equals(this.columnNames, that.columnNames) &&
             Objects.equals(this.columnNames, that.columnNames) &&
             Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
             Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
             Objects.equals(this.delimiter, that.delimiter) &&
             Objects.equals(this.delimiter, that.delimiter) &&
+            Objects.equals(this.quote, that.quote) &&
             Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
             Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
             Objects.equals(this.grokPattern, that.grokPattern) &&
             Objects.equals(this.grokPattern, that.grokPattern) &&
             Objects.equals(this.timestampField, that.timestampField) &&
             Objects.equals(this.timestampField, that.timestampField) &&
@@ -427,6 +446,7 @@ public class FileStructure implements ToXContentObject, Writeable {
         private List<String> columnNames;
         private List<String> columnNames;
         private Boolean hasHeaderRow;
         private Boolean hasHeaderRow;
         private Character delimiter;
         private Character delimiter;
+        private Character quote;
         private Boolean shouldTrimFields;
         private Boolean shouldTrimFields;
         private String grokPattern;
         private String grokPattern;
         private String timestampField;
         private String timestampField;
@@ -499,6 +519,11 @@ public class FileStructure implements ToXContentObject, Writeable {
             return this;
             return this;
         }
         }
 
 
+        public Builder setQuote(Character quote) {
+            this.quote = quote;
+            return this;
+        }
+
         public Builder setShouldTrimFields(Boolean shouldTrimFields) {
         public Builder setShouldTrimFields(Boolean shouldTrimFields) {
             this.shouldTrimFields = shouldTrimFields;
             this.shouldTrimFields = shouldTrimFields;
             return this;
             return this;
@@ -582,6 +607,9 @@ public class FileStructure implements ToXContentObject, Writeable {
                     if (delimiter != null) {
                     if (delimiter != null) {
                         throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures.");
                         throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures.");
                     }
                     }
+                    if (quote != null) {
+                        throw new IllegalArgumentException("Quote may not be specified for [" + format + "] structures.");
+                    }
                     if (grokPattern != null) {
                     if (grokPattern != null) {
                         throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
                         throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures.");
                     }
                     }
@@ -610,6 +638,9 @@ public class FileStructure implements ToXContentObject, Writeable {
                     if (delimiter != null) {
                     if (delimiter != null) {
                         throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures.");
                         throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures.");
                     }
                     }
+                    if (quote != null) {
+                        throw new IllegalArgumentException("Quote may not be specified for [" + format + "] structures.");
+                    }
                     if (shouldTrimFields != null) {
                     if (shouldTrimFields != null) {
                         throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
                         throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures.");
                     }
                     }
@@ -638,7 +669,7 @@ public class FileStructure implements ToXContentObject, Writeable {
             }
             }
 
 
             return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
             return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
-                multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern,
+                multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern,
                 timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
                 timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
         }
         }
     }
     }

+ 92 - 2
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java

@@ -8,6 +8,9 @@ package org.elasticsearch.xpack.core.ml.action;
 import org.elasticsearch.action.ActionRequestValidationException;
 import org.elasticsearch.action.ActionRequestValidationException;
 import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.test.AbstractStreamableTestCase;
 import org.elasticsearch.test.AbstractStreamableTestCase;
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
+
+import java.util.Arrays;
 
 
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.startsWith;
 import static org.hamcrest.Matchers.startsWith;
@@ -22,6 +25,44 @@ public class FindFileStructureActionRequestTests extends AbstractStreamableTestC
         if (randomBoolean()) {
         if (randomBoolean()) {
             request.setLinesToSample(randomIntBetween(10, 2000));
             request.setLinesToSample(randomIntBetween(10, 2000));
         }
         }
+
+        if (randomBoolean()) {
+            request.setCharset(randomAlphaOfLength(10));
+        }
+
+        if (randomBoolean()) {
+            FileStructure.Format format = randomFrom(FileStructure.Format.values());
+            request.setFormat(format);
+            if (format == FileStructure.Format.DELIMITED) {
+                if (randomBoolean()) {
+                    request.setColumnNames(generateRandomStringArray(10, 15, false, false));
+                }
+                if (randomBoolean()) {
+                    request.setHasHeaderRow(randomBoolean());
+                }
+                if (randomBoolean()) {
+                    request.setDelimiter(randomFrom(',', '\t', ';', '|'));
+                }
+                if (randomBoolean()) {
+                    request.setQuote(randomFrom('"', '\''));
+                }
+                if (randomBoolean()) {
+                    request.setShouldTrimFields(randomBoolean());
+                }
+            } else if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) {
+                if (randomBoolean()) {
+                    request.setGrokPattern(randomAlphaOfLength(80));
+                }
+            }
+        }
+
+        if (randomBoolean()) {
+            request.setTimestampFormat(randomAlphaOfLength(20));
+        }
+        if (randomBoolean()) {
+            request.setTimestampField(randomAlphaOfLength(15));
+        }
+
         request.setSample(new BytesArray(randomByteArrayOfLength(randomIntBetween(1000, 20000))));
         request.setSample(new BytesArray(randomByteArrayOfLength(randomIntBetween(1000, 20000))));
 
 
         return request;
         return request;
@@ -35,13 +76,62 @@ public class FindFileStructureActionRequestTests extends AbstractStreamableTestC
     public void testValidateLinesToSample() {
     public void testValidateLinesToSample() {
 
 
         FindFileStructureAction.Request request = new FindFileStructureAction.Request();
         FindFileStructureAction.Request request = new FindFileStructureAction.Request();
-        request.setLinesToSample(randomFrom(-1, 0));
+        request.setLinesToSample(randomIntBetween(-1, 0));
+        request.setSample(new BytesArray("foo\n"));
+
+        ActionRequestValidationException e = request.validate();
+        assertNotNull(e);
+        assertThat(e.getMessage(), startsWith("Validation Failed: "));
+        assertThat(e.getMessage(), containsString(" [lines_to_sample] must be positive if specified"));
+    }
+
+    public void testValidateNonDelimited() {
+
+        FindFileStructureAction.Request request = new FindFileStructureAction.Request();
+        String errorField;
+        switch (randomIntBetween(0, 4)) {
+            case 0:
+                errorField = "column_names";
+                request.setColumnNames(Arrays.asList("col1", "col2"));
+                break;
+            case 1:
+                errorField = "has_header_row";
+                request.setHasHeaderRow(randomBoolean());
+                break;
+            case 2:
+                errorField = "delimiter";
+                request.setDelimiter(randomFrom(',', '\t', ';', '|'));
+                break;
+            case 3:
+                errorField = "quote";
+                request.setQuote(randomFrom('"', '\''));
+                break;
+            case 4:
+                errorField = "should_trim_fields";
+                request.setShouldTrimFields(randomBoolean());
+                break;
+            default:
+                throw new IllegalStateException("unexpected switch value");
+        }
+        request.setSample(new BytesArray("foo\n"));
+
+        ActionRequestValidationException e = request.validate();
+        assertNotNull(e);
+        assertThat(e.getMessage(), startsWith("Validation Failed: "));
+        assertThat(e.getMessage(), containsString(" [" + errorField + "] may only be specified if [format] is [delimited]"));
+    }
+
+    public void testValidateNonSemiStructuredText() {
+
+        FindFileStructureAction.Request request = new FindFileStructureAction.Request();
+        request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED));
+        request.setGrokPattern(randomAlphaOfLength(80));
         request.setSample(new BytesArray("foo\n"));
         request.setSample(new BytesArray("foo\n"));
 
 
         ActionRequestValidationException e = request.validate();
         ActionRequestValidationException e = request.validate();
         assertNotNull(e);
         assertNotNull(e);
         assertThat(e.getMessage(), startsWith("Validation Failed: "));
         assertThat(e.getMessage(), startsWith("Validation Failed: "));
-        assertThat(e.getMessage(), containsString(" lines_to_sample must be positive if specified"));
+        assertThat(e.getMessage(), containsString(" [grok_pattern] may only be specified if [format] is [semi_structured_text]"));
     }
     }
 
 
     public void testValidateSample() {
     public void testValidateSample() {

+ 1 - 0
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java

@@ -54,6 +54,7 @@ public class FileStructureTests extends AbstractSerializingTestCase<FileStructur
             builder.setColumnNames(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
             builder.setColumnNames(Arrays.asList(generateRandomStringArray(10, 10, false, false)));
             builder.setHasHeaderRow(randomBoolean());
             builder.setHasHeaderRow(randomBoolean());
             builder.setDelimiter(randomFrom(',', '\t', ';', '|'));
             builder.setDelimiter(randomFrom(',', '\t', ';', '|'));
+            builder.setQuote(randomFrom('"', '\''));
         }
         }
 
 
         if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) {
         if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) {

+ 3 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java

@@ -17,6 +17,7 @@ import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
 import org.elasticsearch.xpack.ml.MachineLearning;
 import org.elasticsearch.xpack.ml.MachineLearning;
 import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinder;
 import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinder;
 import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager;
 import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager;
+import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides;
 
 
 public class TransportFindFileStructureAction
 public class TransportFindFileStructureAction
     extends HandledTransportAction<FindFileStructureAction.Request, FindFileStructureAction.Response> {
     extends HandledTransportAction<FindFileStructureAction.Request, FindFileStructureAction.Response> {
@@ -49,8 +50,8 @@ public class TransportFindFileStructureAction
 
 
         FileStructureFinderManager structureFinderManager = new FileStructureFinderManager();
         FileStructureFinderManager structureFinderManager = new FileStructureFinderManager();
 
 
-        FileStructureFinder fileStructureFinder =
-            structureFinderManager.findFileStructure(request.getLinesToSample(), request.getSample().streamInput());
+        FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure(request.getLinesToSample(),
+            request.getSample().streamInput(), new FileStructureOverrides(request));
 
 
         return new FindFileStructureAction.Response(fileStructureFinder.getStructure());
         return new FindFileStructureAction.Response(fileStructureFinder.getStructure());
     }
     }

+ 64 - 26
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java

@@ -33,6 +33,7 @@ import java.util.stream.IntStream;
 
 
 public class DelimitedFileStructureFinder implements FileStructureFinder {
 public class DelimitedFileStructureFinder implements FileStructureFinder {
 
 
+    private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
     private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
     private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
 
 
     private final List<String> sampleMessages;
     private final List<String> sampleMessages;
@@ -40,21 +41,35 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
 
 
     static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String> explanation, String sample, String charsetName,
     static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String> explanation, String sample, String charsetName,
                                                                          Boolean hasByteOrderMarker, CsvPreference csvPreference,
                                                                          Boolean hasByteOrderMarker, CsvPreference csvPreference,
-                                                                         boolean trimFields) throws IOException {
+                                                                         boolean trimFields, FileStructureOverrides overrides)
+        throws IOException {
 
 
         Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
         Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
         List<List<String>> rows = parsed.v1();
         List<List<String>> rows = parsed.v1();
         List<Integer> lineNumbers = parsed.v2();
         List<Integer> lineNumbers = parsed.v2();
 
 
-        Tuple<Boolean, String[]> headerInfo = findHeaderFromSample(explanation, rows);
+        // Even if the column names are overridden we need to know if there's a
+        // header in the file, as it affects which rows are considered records
+        Tuple<Boolean, String[]> headerInfo = findHeaderFromSample(explanation, rows, overrides);
         boolean isHeaderInFile = headerInfo.v1();
         boolean isHeaderInFile = headerInfo.v1();
         String[] header = headerInfo.v2();
         String[] header = headerInfo.v2();
-        // The column names are the header names but with blanks named column1, column2, etc.
-        String[] columnNames = new String[header.length];
-        for (int i = 0; i < header.length; ++i) {
-            assert header[i] != null;
-            String rawHeader = trimFields ? header[i].trim() : header[i];
-            columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader;
+
+        String[] columnNames;
+        List<String> overriddenColumnNames = overrides.getColumnNames();
+        if (overriddenColumnNames != null) {
+            if (overriddenColumnNames.size() != header.length) {
+                throw new IllegalArgumentException("[" + overriddenColumnNames.size() + "] column names were specified [" +
+                    String.join(",", overriddenColumnNames) + "] but there are [" + header.length + "] columns in the sample");
+            }
+            columnNames = overriddenColumnNames.toArray(new String[overriddenColumnNames.size()]);
+        } else {
+            // The column names are the header names but with blanks named column1, column2, etc.
+            columnNames = new String[header.length];
+            for (int i = 0; i < header.length; ++i) {
+                assert header[i] != null;
+                String rawHeader = trimFields ? header[i].trim() : header[i];
+                columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader;
+            }
         }
         }
 
 
         List<String> sampleLines = Arrays.asList(sample.split("\n"));
         List<String> sampleLines = Arrays.asList(sample.split("\n"));
@@ -84,13 +99,14 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
             .setNumMessagesAnalyzed(sampleRecords.size())
             .setNumMessagesAnalyzed(sampleRecords.size())
             .setHasHeaderRow(isHeaderInFile)
             .setHasHeaderRow(isHeaderInFile)
             .setDelimiter(delimiter)
             .setDelimiter(delimiter)
+            .setQuote(csvPreference.getQuoteChar())
             .setColumnNames(Arrays.stream(columnNames).collect(Collectors.toList()));
             .setColumnNames(Arrays.stream(columnNames).collect(Collectors.toList()));
 
 
         if (trimFields) {
         if (trimFields) {
             structureBuilder.setShouldTrimFields(true);
             structureBuilder.setShouldTrimFields(true);
         }
         }
 
 
-        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords);
+        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides);
         if (timeField != null) {
         if (timeField != null) {
             String timeLineRegex = null;
             String timeLineRegex = null;
             StringBuilder builder = new StringBuilder("^");
             StringBuilder builder = new StringBuilder("^");
@@ -98,7 +114,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
             // timestamp is the last column then either our assumption is wrong (and the approach will completely
             // timestamp is the last column then either our assumption is wrong (and the approach will completely
             // break down) or else every record is on a single line and there's no point creating a multiline config.
             // break down) or else every record is on a single line and there's no point creating a multiline config.
             // This is why the loop excludes the last column.
             // This is why the loop excludes the last column.
-            for (String column : Arrays.asList(header).subList(0, header.length - 1)) {
+            for (String column : Arrays.asList(columnNames).subList(0, columnNames.length - 1)) {
                 if (timeField.v1().equals(column)) {
                 if (timeField.v1().equals(column)) {
                     builder.append("\"?");
                     builder.append("\"?");
                     String simpleTimePattern = timeField.v2().simplePattern.pattern();
                     String simpleTimePattern = timeField.v2().simplePattern.pattern();
@@ -116,8 +132,11 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
             }
             }
 
 
             if (isHeaderInFile) {
             if (isHeaderInFile) {
+                String quote = String.valueOf(csvPreference.getQuoteChar());
+                String twoQuotes = quote + quote;
+                String optQuote = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + "?";
                 structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
                 structureBuilder.setExcludeLinesPattern("^" + Arrays.stream(header)
-                    .map(column -> "\"?" + column.replace("\"", "\"\"").replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + "\"?")
+                    .map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote)
                     .collect(Collectors.joining(",")));
                     .collect(Collectors.joining(",")));
             }
             }
 
 
@@ -131,7 +150,10 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
             FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
             FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
 
 
         SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
         SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
-        mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
+        if (timeField != null) {
+            mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD,
+                Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
+        }
 
 
         if (mappingsAndFieldStats.v2() != null) {
         if (mappingsAndFieldStats.v2() != null) {
             structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
             structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
@@ -205,45 +227,61 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
         return new Tuple<>(rows, lineNumbers);
         return new Tuple<>(rows, lineNumbers);
     }
     }
 
 
-    static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, List<List<String>> rows) {
+    static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, List<List<String>> rows,
+                                                         FileStructureOverrides overrides) {
 
 
         assert rows.isEmpty() == false;
         assert rows.isEmpty() == false;
 
 
+        List<String> overriddenColumnNames = overrides.getColumnNames();
         List<String> firstRow = rows.get(0);
         List<String> firstRow = rows.get(0);
 
 
         boolean isHeaderInFile = true;
         boolean isHeaderInFile = true;
-        if (rowContainsDuplicateNonEmptyValues(firstRow)) {
-            isHeaderInFile = false;
-            explanation.add("First row contains duplicate values, so assuming it's not a header");
+        if (overrides.getHasHeaderRow() != null) {
+            isHeaderInFile = overrides.getHasHeaderRow();
+            if (isHeaderInFile && overriddenColumnNames == null) {
+                String duplicateValue = findDuplicateNonEmptyValues(firstRow);
+                if (duplicateValue != null) {
+                    throw new IllegalArgumentException("Sample specified to contain a header row, " +
+                        "but the first row contains duplicate values: [" + duplicateValue + "]");
+                }
+            }
+            explanation.add("Sample specified to " + (isHeaderInFile ? "contain" : "not contain") + " a header row");
         } else {
         } else {
-            if (rows.size() < 3) {
-                explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
+            if (findDuplicateNonEmptyValues(firstRow) != null) {
+                isHeaderInFile = false;
+                explanation.add("First row contains duplicate values, so assuming it's not a header");
             } else {
             } else {
-                isHeaderInFile = isFirstRowUnusual(explanation, rows);
+                if (rows.size() < 3) {
+                    explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
+                } else {
+                    isHeaderInFile = isFirstRowUnusual(explanation, rows);
+                }
             }
             }
         }
         }
 
 
+        String[] header;
         if (isHeaderInFile) {
         if (isHeaderInFile) {
             // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us
             // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us
-            return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new));
+            header = firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new);
         } else {
         } else {
-            String[] dummyHeader = new String[firstRow.size()];
-            Arrays.fill(dummyHeader, "");
-            return new Tuple<>(false, dummyHeader);
+            header = new String[firstRow.size()];
+            Arrays.fill(header, "");
         }
         }
+
+        return new Tuple<>(isHeaderInFile, header);
     }
     }
 
 
-    static boolean rowContainsDuplicateNonEmptyValues(List<String> row) {
+    static String findDuplicateNonEmptyValues(List<String> row) {
 
 
         HashSet<String> values = new HashSet<>();
         HashSet<String> values = new HashSet<>();
 
 
         for (String value : row) {
         for (String value : row) {
             if (value != null && value.isEmpty() == false && values.add(value) == false) {
             if (value != null && value.isEmpty() == false && values.add(value) == false) {
-                return true;
+                return value;
             }
             }
         }
         }
 
 
-        return false;
+        return null;
     }
     }
 
 
     private static boolean isFirstRowUnusual(List<String> explanation, List<List<String>> rows) {
     private static boolean isFirstRowUnusual(List<String> explanation, List<List<String>> rows) {

+ 17 - 5
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java

@@ -5,6 +5,7 @@
  */
  */
 package org.elasticsearch.xpack.ml.filestructurefinder;
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
 
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 import org.supercsv.prefs.CsvPreference;
 import org.supercsv.prefs.CsvPreference;
 
 
 import java.io.IOException;
 import java.io.IOException;
@@ -17,12 +18,23 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF
     private final int minFieldsPerRow;
     private final int minFieldsPerRow;
     private final boolean trimFields;
     private final boolean trimFields;
 
 
-    DelimitedFileStructureFinderFactory(char delimiter, int minFieldsPerRow, boolean trimFields) {
-        csvPreference = new CsvPreference.Builder('"', delimiter, "\n").build();
+    DelimitedFileStructureFinderFactory(char delimiter, char quote, int minFieldsPerRow, boolean trimFields) {
+        csvPreference = new CsvPreference.Builder(quote, delimiter, "\n").build();
         this.minFieldsPerRow = minFieldsPerRow;
         this.minFieldsPerRow = minFieldsPerRow;
         this.trimFields = trimFields;
         this.trimFields = trimFields;
     }
     }
 
 
+    DelimitedFileStructureFinderFactory makeSimilar(Character quote, Boolean trimFields) {
+
+        return new DelimitedFileStructureFinderFactory((char) csvPreference.getDelimiterChar(),
+            (quote == null) ? csvPreference.getQuoteChar() : quote, minFieldsPerRow, (trimFields == null) ? this.trimFields : trimFields);
+    }
+
+    @Override
+    public boolean canFindFormat(FileStructure.Format format) {
+        return format == null || format == FileStructure.Format.DELIMITED;
+    }
+
     /**
     /**
      * Rules are:
      * Rules are:
      * - It must contain at least two complete records
      * - It must contain at least two complete records
@@ -49,9 +61,9 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF
     }
     }
 
 
     @Override
     @Override
-    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
-        throws IOException {
+    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
+                                                FileStructureOverrides overrides) throws IOException {
         return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
         return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
-            csvPreference, trimFields);
+            csvPreference, trimFields, overrides);
     }
     }
 }
 }

+ 15 - 3
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java

@@ -5,10 +5,20 @@
  */
  */
 package org.elasticsearch.xpack.ml.filestructurefinder;
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
 
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
+
 import java.util.List;
 import java.util.List;
 
 
 public interface FileStructureFinderFactory {
 public interface FileStructureFinderFactory {
 
 
+    /**
+     * Can this factory create a {@link FileStructureFinder} that can find the supplied format?
+     * @param format The format to query, or <code>null</code>.
+     * @return <code>true</code> if {@code format} is <code>null</code> or the factory
+     *         can produce a {@link FileStructureFinder} that can find {@code format}.
+     */
+    boolean canFindFormat(FileStructure.Format format);
+
     /**
     /**
      * Given a sample of a file, decide whether this factory will be able
      * Given a sample of a file, decide whether this factory will be able
      * to create an appropriate object to represent its ingestion configs.
      * to create an appropriate object to represent its ingestion configs.
@@ -27,9 +37,11 @@ public interface FileStructureFinderFactory {
      * @param sample A sample from the file to be ingested.
      * @param sample A sample from the file to be ingested.
      * @param charsetName The name of the character set in which the sample was provided.
      * @param charsetName The name of the character set in which the sample was provided.
      * @param hasByteOrderMarker Did the sample have a byte order marker?  <code>null</code> means "not relevant".
      * @param hasByteOrderMarker Did the sample have a byte order marker?  <code>null</code> means "not relevant".
-     * @return A file structure object suitable for ingesting the supplied sample.
+     * @param overrides Stores structure decisions that have been made by the end user, and should
+     *                  take precedence over anything the {@link FileStructureFinder} may decide.
+     * @return A {@link FileStructureFinder} object suitable for determining the structure of the supplied sample.
      * @throws Exception if something goes wrong during creation.
      * @throws Exception if something goes wrong during creation.
      */
      */
-    FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
-        throws Exception;
+    FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
+                                         FileStructureOverrides overrides) throws Exception;
 }
 }

+ 69 - 16
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java

@@ -13,6 +13,7 @@ import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.StandardCharsets;
@@ -24,6 +25,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Locale;
 import java.util.Optional;
 import java.util.Optional;
 import java.util.Set;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 
 /**
 /**
  * Runs the high-level steps needed to create ingest configs for the specified file.  In order:
  * Runs the high-level steps needed to create ingest configs for the specified file.  In order:
@@ -70,15 +72,19 @@ public final class FileStructureFinderManager {
         new JsonFileStructureFinderFactory(),
         new JsonFileStructureFinderFactory(),
         new XmlFileStructureFinderFactory(),
         new XmlFileStructureFinderFactory(),
         // ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
         // ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV
-        new DelimitedFileStructureFinderFactory(',', 2, false),
-        new DelimitedFileStructureFinderFactory('\t', 2, false),
-        new DelimitedFileStructureFinderFactory(';', 4, false),
-        new DelimitedFileStructureFinderFactory('|', 5, true),
+        new DelimitedFileStructureFinderFactory(',', '"', 2, false),
+        new DelimitedFileStructureFinderFactory('\t', '"', 2, false),
+        new DelimitedFileStructureFinderFactory(';', '"', 4, false),
+        new DelimitedFileStructureFinderFactory('|', '"', 5, true),
         new TextLogFileStructureFinderFactory()
         new TextLogFileStructureFinderFactory()
     ));
     ));
 
 
     private static final int BUFFER_SIZE = 8192;
     private static final int BUFFER_SIZE = 8192;
 
 
+    public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception {
+        return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES);
+    }
+
     /**
     /**
      * Given a stream of data from some file, determine its structure.
      * Given a stream of data from some file, determine its structure.
      * @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure?
      * @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure?
@@ -86,24 +92,42 @@ public final class FileStructureFinderManager {
      *                             least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read.  If <code>null</code>
      *                             least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read.  If <code>null</code>
      *                             the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used.
      *                             the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used.
      * @param fromFile A stream from which the sample will be read.
      * @param fromFile A stream from which the sample will be read.
+     * @param overrides Aspects of the file structure that are known in advance.  These take precedence over
+     *                  values determined by structure analysis.  An exception will be thrown if the file structure
+     *                  is incompatible with an overridden value.
      * @return A {@link FileStructureFinder} object from which the structure and messages can be queried.
      * @return A {@link FileStructureFinder} object from which the structure and messages can be queried.
      * @throws Exception A variety of problems could occur at various stages of the structure finding process.
      * @throws Exception A variety of problems could occur at various stages of the structure finding process.
      */
      */
-    public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception {
+    public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides)
+        throws Exception {
         return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount,
         return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount,
-            fromFile);
+            fromFile, overrides);
     }
     }
 
 
     public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
     public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
         throws Exception {
         throws Exception {
+        // Forward the caller's explanation list (not a fresh one) so the reasons appended during analysis are visible to the caller
+        return findFileStructure(explanation, idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES);
+    }
+
+    public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile,
+                                                 FileStructureOverrides overrides) throws Exception {
 
 
-        CharsetMatch charsetMatch = findCharset(explanation, fromFile);
-        String charsetName = charsetMatch.getName();
+        String charsetName = overrides.getCharset();
+        Reader sampleReader;
+        if (charsetName != null) {
+            // Creating the reader will throw if the specified character set does not exist
+            sampleReader = new InputStreamReader(fromFile, charsetName);
+            explanation.add("Using specified character encoding [" + charsetName + "]");
+        } else {
+            CharsetMatch charsetMatch = findCharset(explanation, fromFile);
+            charsetName = charsetMatch.getName();
+            sampleReader = charsetMatch.getReader();
+        }
 
 
-        Tuple<String, Boolean> sampleInfo = sampleFile(charsetMatch.getReader(), charsetName, MIN_SAMPLE_LINE_COUNT,
+        Tuple<String, Boolean> sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT,
             Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount));
             Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount));
 
 
-        return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2());
+        return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides);
     }
     }
 
 
     CharsetMatch findCharset(List<String> explanation, InputStream inputStream) throws Exception {
     CharsetMatch findCharset(List<String> explanation, InputStream inputStream) throws Exception {
@@ -195,15 +219,44 @@ public final class FileStructureFinderManager {
             (containsZeroBytes ? " - could it be binary data?" : ""));
             (containsZeroBytes ? " - could it be binary data?" : ""));
     }
     }
 
 
-    FileStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
-        throws Exception {
+    FileStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
+                                                FileStructureOverrides overrides) throws Exception {
 
 
-        for (FileStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) {
+        Character delimiter = overrides.getDelimiter();
+        Character quote = overrides.getQuote();
+        Boolean shouldTrimFields = overrides.getShouldTrimFields();
+        List<FileStructureFinderFactory> factories;
+        if (delimiter != null) {
+
+            // If a precise delimiter is specified, we only need one structure finder
+            // factory, and we'll tolerate as little as one column in the input
+            factories = Collections.singletonList(new DelimitedFileStructureFinderFactory(delimiter, (quote == null) ? '"' : quote, 1,
+                (shouldTrimFields == null) ? (delimiter == '|') : shouldTrimFields));
+
+        } else if (quote != null || shouldTrimFields != null) {
+
+            // The delimiter is not specified, but some other aspect of delimited files is,
+            // so clone our default delimited factories altering the overridden values
+            factories = ORDERED_STRUCTURE_FACTORIES.stream().filter(factory -> factory instanceof DelimitedFileStructureFinderFactory)
+                .map(factory -> ((DelimitedFileStructureFinderFactory) factory).makeSimilar(quote, shouldTrimFields))
+                .collect(Collectors.toList());
+
+        } else {
+
+            // We can use the default factories, but possibly filtered down to a specific format
+            factories = ORDERED_STRUCTURE_FACTORIES.stream()
+                .filter(factory -> factory.canFindFormat(overrides.getFormat())).collect(Collectors.toList());
+
+        }
+
+        for (FileStructureFinderFactory factory : factories) {
             if (factory.canCreateFromSample(explanation, sample)) {
             if (factory.canCreateFromSample(explanation, sample)) {
-                return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker);
+                return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides);
             }
             }
         }
         }
-        throw new IllegalArgumentException("Input did not match any known formats");
+
+        throw new IllegalArgumentException("Input did not match " +
+            ((overrides.getFormat() == null) ? "any known formats" : "the specified format [" + overrides.getFormat() + "]"));
     }
     }
 
 
     private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException {
     private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException {
@@ -233,7 +286,7 @@ public final class FileStructureFinderManager {
         }
         }
 
 
         if (lineCount < minLines) {
         if (lineCount < minLines) {
-            throw new IllegalArgumentException("Input contained too few lines to sample");
+            throw new IllegalArgumentException("Input contained too few lines [" + lineCount + "] to obtain a meaningful sample");
         }
         }
 
 
         return new Tuple<>(sample.toString(), hasByteOrderMarker);
         return new Tuple<>(sample.toString(), hasByteOrderMarker);

+ 205 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java

@@ -0,0 +1,205 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.filestructurefinder;
+
+import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction;
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * An immutable holder for the aspects of file structure detection that can be overridden
+ * by the end user.  Every field can be <code>null</code>, and this means that that
+ * aspect of the file structure detection is not overridden.
+ *
+ * There is no consistency checking in this class.  Consistency checking of the different
+ * fields is done in {@link FindFileStructureAction.Request}.
+ */
+public class FileStructureOverrides {
+
+    public static final FileStructureOverrides EMPTY_OVERRIDES = new Builder().build();
+
+    private final String charset;
+    private final FileStructure.Format format;
+    private final List<String> columnNames;
+    private final Boolean hasHeaderRow;
+    private final Character delimiter;
+    private final Character quote;
+    private final Boolean shouldTrimFields;
+    private final String grokPattern;
+    private final String timestampFormat;
+    private final String timestampField;
+
+    public FileStructureOverrides(FindFileStructureAction.Request request) {
+
+        this(request.getCharset(), request.getFormat(), request.getColumnNames(), request.getHasHeaderRow(), request.getDelimiter(),
+            request.getQuote(), request.getShouldTrimFields(), request.getGrokPattern(), request.getTimestampFormat(),
+            request.getTimestampField());
+    }
+
+    private FileStructureOverrides(String charset, FileStructure.Format format, List<String> columnNames, Boolean hasHeaderRow,
+                                   Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern,
+                                   String timestampFormat, String timestampField) {
+        this.charset = charset;
+        this.format = format;
+        this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames));
+        this.hasHeaderRow = hasHeaderRow;
+        this.delimiter = delimiter;
+        this.quote = quote;
+        this.shouldTrimFields = shouldTrimFields;
+        this.grokPattern = grokPattern;
+        this.timestampFormat = timestampFormat;
+        this.timestampField = timestampField;
+    }
+
+    public static Builder builder() {
+        return new Builder();
+    }
+
+    public String getCharset() {
+        return charset;
+    }
+
+    public FileStructure.Format getFormat() {
+        return format;
+    }
+
+    public List<String> getColumnNames() {
+        return columnNames;
+    }
+
+    public Boolean getHasHeaderRow() {
+        return hasHeaderRow;
+    }
+
+    public Character getDelimiter() {
+        return delimiter;
+    }
+
+    public Character getQuote() {
+        return quote;
+    }
+
+    public Boolean getShouldTrimFields() {
+        return shouldTrimFields;
+    }
+
+    public String getGrokPattern() {
+        return grokPattern;
+    }
+
+    public String getTimestampFormat() {
+        return timestampFormat;
+    }
+
+    public String getTimestampField() {
+        return timestampField;
+    }
+
+    @Override
+    public int hashCode() {
+
+        return Objects.hash(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, timestampFormat,
+            timestampField);
+    }
+
+    @Override
+    public boolean equals(Object other) {
+
+        if (this == other) {
+            return true;
+        }
+
+        if (other == null || getClass() != other.getClass()) {
+            return false;
+        }
+
+        FileStructureOverrides that = (FileStructureOverrides) other;
+        return Objects.equals(this.charset, that.charset) &&
+            Objects.equals(this.format, that.format) &&
+            Objects.equals(this.columnNames, that.columnNames) &&
+            Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
+            Objects.equals(this.delimiter, that.delimiter) &&
+            Objects.equals(this.quote, that.quote) &&
+            Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
+            Objects.equals(this.grokPattern, that.grokPattern) &&
+            Objects.equals(this.timestampFormat, that.timestampFormat) &&
+            Objects.equals(this.timestampField, that.timestampField);
+    }
+
+    public static class Builder {
+
+        private String charset;
+        private FileStructure.Format format;
+        private List<String> columnNames;
+        private Boolean hasHeaderRow;
+        private Character delimiter;
+        private Character quote;
+        private Boolean shouldTrimFields;
+        private String grokPattern;
+        private String timestampFormat;
+        private String timestampField;
+
+        public Builder setCharset(String charset) {
+            this.charset = charset;
+            return this;
+        }
+
+        public Builder setFormat(FileStructure.Format format) {
+            this.format = format;
+            return this;
+        }
+
+        public Builder setColumnNames(List<String> columnNames) {
+            this.columnNames = columnNames;
+            return this;
+        }
+
+        public Builder setHasHeaderRow(Boolean hasHeaderRow) {
+            this.hasHeaderRow = hasHeaderRow;
+            return this;
+        }
+
+        public Builder setDelimiter(Character delimiter) {
+            this.delimiter = delimiter;
+            return this;
+        }
+
+        public Builder setQuote(Character quote) {
+            this.quote = quote;
+            return this;
+        }
+
+        public Builder setShouldTrimFields(Boolean shouldTrimFields) {
+            this.shouldTrimFields = shouldTrimFields;
+            return this;
+        }
+
+        public Builder setGrokPattern(String grokPattern) {
+            this.grokPattern = grokPattern;
+            return this;
+        }
+
+        public Builder setTimestampFormat(String timestampFormat) {
+            this.timestampFormat = timestampFormat;
+            return this;
+        }
+
+        public Builder setTimestampField(String timestampField) {
+            this.timestampField = timestampField;
+            return this;
+        }
+
+        public FileStructureOverrides build() {
+
+            return new FileStructureOverrides(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern,
+                timestampFormat, timestampField);
+        }
+    }
+}

+ 45 - 14
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java

@@ -51,29 +51,41 @@ public final class FileStructureUtils {
      *                    may be non-empty when the method is called, and this method may
      *                    may be non-empty when the method is called, and this method may
      *                    append to it.
      *                    append to it.
      * @param sampleRecords List of records derived from the provided sample.
      * @param sampleRecords List of records derived from the provided sample.
+     * @param overrides Aspects of the file structure that are known in advance.  These take precedence over
+     *                  values determined by structure analysis.  An exception will be thrown if the file structure
+     *                  is incompatible with an overridden value.
      * @return A tuple of (field name, timestamp format) if one can be found, or <code>null</code> if
      * @return A tuple of (field name, timestamp format) if one can be found, or <code>null</code> if
      *         there is no consistent timestamp.
      *         there is no consistent timestamp.
      */
      */
-    static Tuple<String, TimestampMatch> guessTimestampField(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+    static Tuple<String, TimestampMatch> guessTimestampField(List<String> explanation, List<Map<String, ?>> sampleRecords,
+                                                             FileStructureOverrides overrides) {
         if (sampleRecords.isEmpty()) {
         if (sampleRecords.isEmpty()) {
             return null;
             return null;
         }
         }
 
 
         // Accept the first match from the first sample that is compatible with all the other samples
         // Accept the first match from the first sample that is compatible with all the other samples
-        for (Tuple<String, TimestampMatch> candidate : findCandidates(explanation, sampleRecords)) {
+        for (Tuple<String, TimestampMatch> candidate : findCandidates(explanation, sampleRecords, overrides)) {
 
 
             boolean allGood = true;
             boolean allGood = true;
             for (Map<String, ?> sampleRecord : sampleRecords.subList(1, sampleRecords.size())) {
             for (Map<String, ?> sampleRecord : sampleRecords.subList(1, sampleRecords.size())) {
                 Object fieldValue = sampleRecord.get(candidate.v1());
                 Object fieldValue = sampleRecord.get(candidate.v1());
                 if (fieldValue == null) {
                 if (fieldValue == null) {
+                    if (overrides.getTimestampField() != null) {
+                        throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() +
+                            "] is not present in record [" + sampleRecord + "]");
+                    }
                     explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
                     explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
                         "] doesn't have field");
                         "] doesn't have field");
                     allGood = false;
                     allGood = false;
                     break;
                     break;
                 }
                 }
 
 
-                TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString());
+                TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString(), overrides.getTimestampFormat());
                 if (match == null || match.candidateIndex != candidate.v2().candidateIndex) {
                 if (match == null || match.candidateIndex != candidate.v2().candidateIndex) {
+                    if (overrides.getTimestampFormat() != null) {
+                        throw new IllegalArgumentException("Specified timestamp format [" + overrides.getTimestampFormat() +
+                            "] does not match for record [" + sampleRecord + "]");
+                    }
                     explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
                     explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord +
                         "] matches differently: [" + match + "]");
                         "] matches differently: [" + match + "]");
                     allGood = false;
                     allGood = false;
@@ -82,7 +94,8 @@ public final class FileStructureUtils {
             }
             }
 
 
             if (allGood) {
             if (allGood) {
-                explanation.add("Guessing timestamp field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]");
+                explanation.add(((overrides.getTimestampField() == null) ? "Guessing timestamp" : "Timestamp") +
+                    " field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]");
                 return candidate;
                 return candidate;
             }
             }
         }
         }
@@ -90,23 +103,41 @@ public final class FileStructureUtils {
         return null;
         return null;
     }
     }
 
 
-    private static List<Tuple<String, TimestampMatch>> findCandidates(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+    private static List<Tuple<String, TimestampMatch>> findCandidates(List<String> explanation, List<Map<String, ?>> sampleRecords,
+                                                                      FileStructureOverrides overrides) {
+
+        assert sampleRecords.isEmpty() == false;
+        Map<String, ?> firstRecord = sampleRecords.get(0);
+
+        String onlyConsiderField = overrides.getTimestampField();
+        if (onlyConsiderField != null && firstRecord.get(onlyConsiderField) == null) {
+            throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() +
+                "] is not present in record [" + firstRecord + "]");
+        }
 
 
         List<Tuple<String, TimestampMatch>> candidates = new ArrayList<>();
         List<Tuple<String, TimestampMatch>> candidates = new ArrayList<>();
 
 
-        // Get candidate timestamps from the first sample record
-        for (Map.Entry<String, ?> entry : sampleRecords.get(0).entrySet()) {
-            Object value = entry.getValue();
-            if (value != null) {
-                TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString());
-                if (match != null) {
-                    Tuple<String, TimestampMatch> candidate = new Tuple<>(entry.getKey(), match);
-                    candidates.add(candidate);
-                    explanation.add("First sample timestamp match [" + candidate + "]");
+        // Get candidate timestamps from the possible field(s) of the first sample record
+        for (Map.Entry<String, ?> field : firstRecord.entrySet()) {
+            String fieldName = field.getKey();
+            if (onlyConsiderField == null || onlyConsiderField.equals(fieldName)) {
+                Object value = field.getValue();
+                if (value != null) {
+                    TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString(), overrides.getTimestampFormat());
+                    if (match != null) {
+                        Tuple<String, TimestampMatch> candidate = new Tuple<>(fieldName, match);
+                        candidates.add(candidate);
+                        explanation.add("First sample timestamp match [" + candidate + "]");
+                    }
                 }
                 }
             }
             }
         }
         }
 
 
+        if (candidates.isEmpty() && overrides.getTimestampFormat() != null) {
+            throw new IllegalArgumentException("Specified timestamp format [" + overrides.getTimestampFormat() +
+                "] does not match for record [" + firstRecord + "]");
+        }
+
         return candidates;
         return candidates;
     }
     }
 
 

+ 72 - 42
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java

@@ -48,21 +48,21 @@ public final class GrokPatternCreator {
      * Grok patterns that are designed to match the whole message, not just a part of it.
      * Grok patterns that are designed to match the whole message, not just a part of it.
      */
      */
     private static final List<FullMatchGrokPatternCandidate> FULL_MATCH_GROK_PATTERNS = Arrays.asList(
     private static final List<FullMatchGrokPatternCandidate> FULL_MATCH_GROK_PATTERNS = Arrays.asList(
-        new FullMatchGrokPatternCandidate("BACULA_LOGLINE", "bts"),
-        new FullMatchGrokPatternCandidate("CATALINALOG", "timestamp"),
-        new FullMatchGrokPatternCandidate("COMBINEDAPACHELOG", "timestamp"),
-        new FullMatchGrokPatternCandidate("COMMONAPACHELOG", "timestamp"),
-        new FullMatchGrokPatternCandidate("ELB_ACCESS_LOG", "timestamp"),
-        new FullMatchGrokPatternCandidate("HAPROXYHTTP", "syslog_timestamp"),
-        new FullMatchGrokPatternCandidate("HAPROXYTCP", "syslog_timestamp"),
-        new FullMatchGrokPatternCandidate("HTTPD20_ERRORLOG", "timestamp"),
-        new FullMatchGrokPatternCandidate("HTTPD24_ERRORLOG", "timestamp"),
-        new FullMatchGrokPatternCandidate("NAGIOSLOGLINE", "nagios_epoch"),
-        new FullMatchGrokPatternCandidate("NETSCREENSESSIONLOG", "date"),
-        new FullMatchGrokPatternCandidate("RAILS3", "timestamp"),
-        new FullMatchGrokPatternCandidate("RUBY_LOGGER", "timestamp"),
-        new FullMatchGrokPatternCandidate("SHOREWALL", "timestamp"),
-        new FullMatchGrokPatternCandidate("TOMCATLOG", "timestamp")
+        FullMatchGrokPatternCandidate.fromGrokPatternName("BACULA_LOGLINE", "bts"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("CATALINALOG", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("COMBINEDAPACHELOG", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("COMMONAPACHELOG", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("ELB_ACCESS_LOG", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("HAPROXYHTTP", "syslog_timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("HAPROXYTCP", "syslog_timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("HTTPD20_ERRORLOG", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("HTTPD24_ERRORLOG", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("NAGIOSLOGLINE", "nagios_epoch"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("NETSCREENSESSIONLOG", "date"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("RAILS3", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("RUBY_LOGGER", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("SHOREWALL", "timestamp"),
+        FullMatchGrokPatternCandidate.fromGrokPatternName("TOMCATLOG", "timestamp")
     );
     );
 
 
     /**
     /**
@@ -87,7 +87,7 @@ public final class GrokPatternCreator {
         // Can't use \b as the breaks, because slashes are not "word" characters
         // Can't use \b as the breaks, because slashes are not "word" characters
         new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(?<!\\w)", "(?!\\w)"),
         new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(?<!\\w)", "(?!\\w)"),
         new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email"),
         new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email"),
-        // TODO: would be nice to have IPORHOST here, but HOST matches almost all words
+        // TODO: would be nice to have IPORHOST here, but HOSTNAME matches almost all words
         new ValueOnlyGrokPatternCandidate("IP", "ip", "ipaddress"),
         new ValueOnlyGrokPatternCandidate("IP", "ip", "ipaddress"),
         new ValueOnlyGrokPatternCandidate("DATE", "date", "date"),
         new ValueOnlyGrokPatternCandidate("DATE", "date", "date"),
         new ValueOnlyGrokPatternCandidate("TIME", "date", "time"),
         new ValueOnlyGrokPatternCandidate("TIME", "date", "time"),
@@ -143,19 +143,39 @@ public final class GrokPatternCreator {
     /**
     /**
      * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety.
      * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety.
      * It will also update mappings and field stats if they are non-<code>null</code>.
      * It will also update mappings and field stats if they are non-<code>null</code>.
+     * @param timestampField If not <code>null</code> then the chosen Grok pattern must use this timestamp field.
      * @return A tuple of (time field name, Grok string), or <code>null</code> if no suitable Grok pattern was found.
      * @return A tuple of (time field name, Grok string), or <code>null</code> if no suitable Grok pattern was found.
      */
      */
-    public Tuple<String, String> findFullLineGrokPattern() {
+    public Tuple<String, String> findFullLineGrokPattern(String timestampField) {
 
 
         for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) {
         for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) {
-            if (candidate.matchesAll(sampleMessages)) {
-                return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats);
+            if (timestampField == null || timestampField.equals(candidate.getTimeField())) {
+                if (candidate.matchesAll(sampleMessages)) {
+                    return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats);
+                }
             }
             }
         }
         }
 
 
         return null;
         return null;
     }
     }
 
 
+    /**
+     * This method processes a user-supplied Grok pattern that will match all of the sample messages in their entirety.
+     * It will also update mappings and field stats if they are non-<code>null</code>.
+     * @param grokPattern The user supplied Grok pattern.
+     * @param timestampField The name of the timestamp field within the Grok pattern.
+     * @throws IllegalArgumentException If the supplied Grok pattern does not match the sample messages.
+     */
+    public void validateFullLineGrokPattern(String grokPattern, String timestampField) {
+
+        FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern(grokPattern, timestampField);
+        if (candidate.matchesAll(sampleMessages)) {
+            candidate.processMatch(explanation, sampleMessages, mappings, fieldStats);
+        } else {
+            throw new IllegalArgumentException("Supplied Grok pattern [" + grokPattern + "] does not match sample messages");
+        }
+    }
+
     /**
     /**
      * Build a Grok pattern that will match all of the sample messages in their entirety.
      * Build a Grok pattern that will match all of the sample messages in their entirety.
      * @param seedPatternName A pattern that has already been determined to match some portion of every sample message.
      * @param seedPatternName A pattern that has already been determined to match some portion of every sample message.
@@ -564,14 +584,26 @@ public final class GrokPatternCreator {
      */
      */
     static class FullMatchGrokPatternCandidate {
     static class FullMatchGrokPatternCandidate {
 
 
-        private final String grokString;
+        private final String grokPattern;
         private final String timeField;
         private final String timeField;
         private final Grok grok;
         private final Grok grok;
 
 
-        FullMatchGrokPatternCandidate(String grokPatternName, String timeField) {
-            grokString = "%{" + grokPatternName + "}";
+        static FullMatchGrokPatternCandidate fromGrokPatternName(String grokPatternName, String timeField) {
+            return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField);
+        }
+
+        static FullMatchGrokPatternCandidate fromGrokPattern(String grokPattern, String timeField) {
+            return new FullMatchGrokPatternCandidate(grokPattern, timeField);
+        }
+
+        private FullMatchGrokPatternCandidate(String grokPattern, String timeField) {
+            this.grokPattern = grokPattern;
             this.timeField = timeField;
             this.timeField = timeField;
-            grok = new Grok(Grok.getBuiltinPatterns(), grokString);
+            grok = new Grok(Grok.getBuiltinPatterns(), grokPattern);
+        }
+
+        public String getTimeField() {
+            return timeField;
         }
         }
 
 
         public boolean matchesAll(Collection<String> sampleMessages) {
         public boolean matchesAll(Collection<String> sampleMessages) {
@@ -585,7 +617,7 @@ public final class GrokPatternCreator {
         public Tuple<String, String> processMatch(List<String> explanation, Collection<String> sampleMessages,
         public Tuple<String, String> processMatch(List<String> explanation, Collection<String> sampleMessages,
                                                   Map<String, Object> mappings, Map<String, FieldStats> fieldStats) {
                                                   Map<String, Object> mappings, Map<String, FieldStats> fieldStats) {
 
 
-            explanation.add("A full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate");
+            explanation.add("A full message Grok pattern [" + grokPattern.substring(2, grokPattern.length() - 1) + "] looks appropriate");
 
 
             if (mappings != null || fieldStats != null) {
             if (mappings != null || fieldStats != null) {
                 Map<String, Collection<String>> valuesPerField = new HashMap<>();
                 Map<String, Collection<String>> valuesPerField = new HashMap<>();
@@ -594,41 +626,39 @@ public final class GrokPatternCreator {
                     Map<String, Object> captures = grok.captures(sampleMessage);
                     Map<String, Object> captures = grok.captures(sampleMessage);
                     // If the pattern doesn't match then captures will be null
                     // If the pattern doesn't match then captures will be null
                     if (captures == null) {
                     if (captures == null) {
-                        throw new IllegalStateException("[" + grokString + "] does not match snippet [" + sampleMessage + "]");
+                        throw new IllegalStateException("[" + grokPattern + "] does not match snippet [" + sampleMessage + "]");
                     }
                     }
                     for (Map.Entry<String, Object> capture : captures.entrySet()) {
                     for (Map.Entry<String, Object> capture : captures.entrySet()) {
 
 
                         String fieldName = capture.getKey();
                         String fieldName = capture.getKey();
                         String fieldValue = capture.getValue().toString();
                         String fieldValue = capture.getValue().toString();
-
-                        // Exclude the time field because that will be dropped and replaced with @timestamp
-                        if (fieldName.equals(timeField) == false) {
-                            valuesPerField.compute(fieldName, (k, v) -> {
-                                if (v == null) {
-                                    return new ArrayList<>(Collections.singletonList(fieldValue));
-                                } else {
-                                    v.add(fieldValue);
-                                    return v;
-                                }
-                            });
-                        }
+                        valuesPerField.compute(fieldName, (k, v) -> {
+                            if (v == null) {
+                                return new ArrayList<>(Collections.singletonList(fieldValue));
+                            } else {
+                                v.add(fieldValue);
+                                return v;
+                            }
+                        });
                     }
                     }
                 }
                 }
 
 
                 for (Map.Entry<String, Collection<String>> valuesForField : valuesPerField.entrySet()) {
                 for (Map.Entry<String, Collection<String>> valuesForField : valuesPerField.entrySet()) {
                     String fieldName = valuesForField.getKey();
                     String fieldName = valuesForField.getKey();
                     if (mappings != null) {
                     if (mappings != null) {
-                        mappings.put(fieldName,
-                            FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
+                        // Exclude the time field because that will be dropped and replaced with @timestamp
+                        if (fieldName.equals(timeField) == false) {
+                            mappings.put(fieldName,
+                                FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
+                        }
                     }
                     }
                     if (fieldStats != null) {
                     if (fieldStats != null) {
-                        fieldStats.put(fieldName,
-                            FileStructureUtils.calculateFieldStats(valuesForField.getValue()));
+                        fieldStats.put(fieldName, FileStructureUtils.calculateFieldStats(valuesForField.getValue()));
                     }
                     }
                 }
                 }
             }
             }
 
 
-            return new Tuple<>(timeField, grokString);
+            return new Tuple<>(timeField, grokPattern);
         }
         }
     }
     }
 }
 }

+ 7 - 3
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java

@@ -33,7 +33,8 @@ public class JsonFileStructureFinder implements FileStructureFinder {
     private final FileStructure structure;
     private final FileStructure structure;
 
 
     static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
     static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanation, String sample, String charsetName,
-                                                               Boolean hasByteOrderMarker) throws IOException {
+                                                               Boolean hasByteOrderMarker, FileStructureOverrides overrides)
+        throws IOException {
 
 
         List<Map<String, ?>> sampleRecords = new ArrayList<>();
         List<Map<String, ?>> sampleRecords = new ArrayList<>();
 
 
@@ -51,7 +52,7 @@ public class JsonFileStructureFinder implements FileStructureFinder {
             .setNumLinesAnalyzed(sampleMessages.size())
             .setNumLinesAnalyzed(sampleMessages.size())
             .setNumMessagesAnalyzed(sampleRecords.size());
             .setNumMessagesAnalyzed(sampleRecords.size());
 
 
-        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords);
+        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides);
         if (timeField != null) {
         if (timeField != null) {
             structureBuilder.setTimestampField(timeField.v1())
             structureBuilder.setTimestampField(timeField.v1())
                 .setTimestampFormats(timeField.v2().dateFormats)
                 .setTimestampFormats(timeField.v2().dateFormats)
@@ -62,7 +63,10 @@ public class JsonFileStructureFinder implements FileStructureFinder {
             FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
             FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
 
 
         SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
         SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
-        mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
+        if (timeField != null) {
+            mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD,
+                Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
+        }
 
 
         if (mappingsAndFieldStats.v2() != null) {
         if (mappingsAndFieldStats.v2() != null) {
             structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
             structureBuilder.setFieldStats(mappingsAndFieldStats.v2());

+ 9 - 3
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java

@@ -8,6 +8,7 @@ package org.elasticsearch.xpack.ml.filestructurefinder;
 import org.elasticsearch.common.xcontent.DeprecationHandler;
 import org.elasticsearch.common.xcontent.DeprecationHandler;
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 
 
 import java.io.IOException;
 import java.io.IOException;
 import java.io.StringReader;
 import java.io.StringReader;
@@ -18,6 +19,11 @@ import static org.elasticsearch.common.xcontent.json.JsonXContent.jsonXContent;
 
 
 public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
 public class JsonFileStructureFinderFactory implements FileStructureFinderFactory {
 
 
+    @Override
+    public boolean canFindFormat(FileStructure.Format format) {
+        return format == null || format == FileStructure.Format.JSON;
+    }
+
     /**
     /**
      * This format matches if the sample consists of one or more JSON documents.
      * This format matches if the sample consists of one or more JSON documents.
      * If there is more than one, they must be newline-delimited.  The
      * If there is more than one, they must be newline-delimited.  The
@@ -61,9 +67,9 @@ public class JsonFileStructureFinderFactory implements FileStructureFinderFactor
     }
     }
 
 
     @Override
     @Override
-    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
-        throws IOException {
-        return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
+                                                FileStructureOverrides overrides) throws IOException {
+        return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides);
     }
     }
 
 
     private static class ContextPrintingStringReader extends StringReader {
     private static class ContextPrintingStringReader extends StringReader {

+ 26 - 15
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java

@@ -28,17 +28,19 @@ public class TextLogFileStructureFinder implements FileStructureFinder {
     private final FileStructure structure;
     private final FileStructure structure;
 
 
     static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> explanation, String sample, String charsetName,
     static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> explanation, String sample, String charsetName,
-                                                                     Boolean hasByteOrderMarker) {
+                                                                     Boolean hasByteOrderMarker, FileStructureOverrides overrides) {
 
 
         String[] sampleLines = sample.split("\n");
         String[] sampleLines = sample.split("\n");
-        Tuple<TimestampMatch, Set<String>> bestTimestamp = mostLikelyTimestamp(sampleLines);
+        Tuple<TimestampMatch, Set<String>> bestTimestamp = mostLikelyTimestamp(sampleLines, overrides);
         if (bestTimestamp == null) {
         if (bestTimestamp == null) {
             // Is it appropriate to treat a file that is neither structured nor has
             // Is it appropriate to treat a file that is neither structured nor has
             // a regular pattern of timestamps as a log file?  Probably not...
             // a regular pattern of timestamps as a log file?  Probably not...
-            throw new IllegalArgumentException("Could not find a timestamp in the sample provided");
+            throw new IllegalArgumentException("Could not find " +
+                ((overrides.getTimestampFormat() == null) ? "a timestamp" : "the specified timestamp format") + " in the sample provided");
         }
         }
 
 
-        explanation.add("Most likely timestamp format is [" + bestTimestamp.v1() + "]");
+        explanation.add(((overrides.getTimestampFormat() == null) ? "Most likely timestamp" : "Timestamp") + " format is [" +
+            bestTimestamp.v1() + "]");
 
 
         List<String> sampleMessages = new ArrayList<>();
         List<String> sampleMessages = new ArrayList<>();
         StringBuilder preamble = new StringBuilder();
         StringBuilder preamble = new StringBuilder();
@@ -86,17 +88,26 @@ public class TextLogFileStructureFinder implements FileStructureFinder {
 
 
         SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
         SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
 
 
-        // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove
-        String interimTimestampField;
-        String grokPattern;
         GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats);
         GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats);
-        Tuple<String, String> timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern();
-        if (timestampFieldAndFullMatchGrokPattern != null) {
-            interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1();
-            grokPattern = timestampFieldAndFullMatchGrokPattern.v2();
+        // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove
+        String interimTimestampField = overrides.getTimestampField();
+        String grokPattern = overrides.getGrokPattern();
+        if (grokPattern != null) {
+            if (interimTimestampField == null) {
+                interimTimestampField = "timestamp";
+            }
+            grokPatternCreator.validateFullLineGrokPattern(grokPattern, interimTimestampField);
         } else {
         } else {
-            interimTimestampField = "timestamp";
-            grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField);
+            Tuple<String, String> timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern(interimTimestampField);
+            if (timestampFieldAndFullMatchGrokPattern != null) {
+                interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1();
+                grokPattern = timestampFieldAndFullMatchGrokPattern.v2();
+            } else {
+                if (interimTimestampField == null) {
+                    interimTimestampField = "timestamp";
+                }
+                grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField);
+            }
         }
         }
 
 
         FileStructure structure = structureBuilder
         FileStructure structure = structureBuilder
@@ -127,14 +138,14 @@ public class TextLogFileStructureFinder implements FileStructureFinder {
         return structure;
         return structure;
     }
     }
 
 
-    static Tuple<TimestampMatch, Set<String>> mostLikelyTimestamp(String[] sampleLines) {
+    static Tuple<TimestampMatch, Set<String>> mostLikelyTimestamp(String[] sampleLines, FileStructureOverrides overrides) {
 
 
         Map<TimestampMatch, Tuple<Double, Set<String>>> timestampMatches = new LinkedHashMap<>();
         Map<TimestampMatch, Tuple<Double, Set<String>>> timestampMatches = new LinkedHashMap<>();
 
 
         int remainingLines = sampleLines.length;
         int remainingLines = sampleLines.length;
         double differenceBetweenTwoHighestWeights = 0.0;
         double differenceBetweenTwoHighestWeights = 0.0;
         for (String sampleLine : sampleLines) {
         for (String sampleLine : sampleLines) {
-            TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine);
+            TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine, overrides.getTimestampFormat());
             if (match != null) {
             if (match != null) {
                 TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern,
                 TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern,
                     match.grokPatternName, "");
                     match.grokPatternName, "");

+ 11 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java

@@ -5,6 +5,8 @@
  */
  */
 package org.elasticsearch.xpack.ml.filestructurefinder;
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
 
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
+
 import java.util.List;
 import java.util.List;
 import java.util.regex.Pattern;
 import java.util.regex.Pattern;
 
 
@@ -13,6 +15,11 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac
     // This works because, by default, dot doesn't match newlines
     // This works because, by default, dot doesn't match newlines
     private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+.");
     private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+.");
 
 
+    @Override
+    public boolean canFindFormat(FileStructure.Format format) {
+        return format == null || format == FileStructure.Format.SEMI_STRUCTURED_TEXT;
+    }
+
     /**
     /**
      * This format matches if the sample contains at least one newline and at least two
      * This format matches if the sample contains at least one newline and at least two
      * non-blank lines.
      * non-blank lines.
@@ -33,7 +40,9 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac
     }
     }
 
 
     @Override
     @Override
-    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker) {
-        return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
+                                                FileStructureOverrides overrides) {
+        return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
+            overrides);
     }
     }
 }
 }

+ 68 - 20
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java

@@ -148,6 +148,16 @@ public final class TimestampFormatFinder {
         return findFirstMatch(text, 0);
         return findFirstMatch(text, 0);
     }
     }
 
 
+    /**
+     * Find the first timestamp format that matches part of the supplied value.
+     * @param text The value that the returned timestamp format must exist within.
+     * @param requiredFormat A date format that any returned match must support.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstMatch(String text, String requiredFormat) {
+        return findFirstMatch(text, 0, requiredFormat);
+    }
+
     /**
     /**
      * Find the first timestamp format that matches part of the supplied value,
      * Find the first timestamp format that matches part of the supplied value,
      * excluding a specified number of candidate formats.
      * excluding a specified number of candidate formats.
@@ -156,26 +166,40 @@ public final class TimestampFormatFinder {
      * @return The timestamp format, or <code>null</code> if none matches.
      * @return The timestamp format, or <code>null</code> if none matches.
      */
      */
     public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) {
     public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) {
+        return findFirstMatch(text, ignoreCandidates, null);
+    }
+
+    /**
+     * Find the first timestamp format that matches part of the supplied value,
+     * excluding a specified number of candidate formats.
+     * @param text             The value that the returned timestamp format must exist within.
+     * @param ignoreCandidates The number of candidate formats to exclude from the search.
+     * @param requiredFormat A date format that any returned match must support.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstMatch(String text, int ignoreCandidates, String requiredFormat) {
         Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()];
         Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()];
         int index = ignoreCandidates;
         int index = ignoreCandidates;
         for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
         for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
-            boolean quicklyRuledOut = false;
-            for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) {
-                if (quickRuleoutMatches[quickRuleOutIndex] == null) {
-                    quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find();
-                }
-                if (quickRuleoutMatches[quickRuleOutIndex] == false) {
-                    quicklyRuledOut = true;
-                    break;
+            if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) {
+                boolean quicklyRuledOut = false;
+                for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) {
+                    if (quickRuleoutMatches[quickRuleOutIndex] == null) {
+                        quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find();
+                    }
+                    if (quickRuleoutMatches[quickRuleOutIndex] == false) {
+                        quicklyRuledOut = true;
+                        break;
+                    }
                 }
                 }
-            }
-            if (quicklyRuledOut == false) {
-                Map<String, Object> captures = candidate.strictSearchGrok.captures(text);
-                if (captures != null) {
-                    String preface = captures.getOrDefault(PREFACE, "").toString();
-                    String epilogue = captures.getOrDefault(EPILOGUE, "").toString();
-                    return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(),
-                        text.length() - epilogue.length()), epilogue);
+                if (quicklyRuledOut == false) {
+                    Map<String, Object> captures = candidate.strictSearchGrok.captures(text);
+                    if (captures != null) {
+                        String preface = captures.getOrDefault(PREFACE, "").toString();
+                        String epilogue = captures.getOrDefault(EPILOGUE, "").toString();
+                        return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(),
+                            text.length() - epilogue.length()), epilogue);
+                    }
                 }
                 }
             }
             }
             ++index;
             ++index;
@@ -192,6 +216,16 @@ public final class TimestampFormatFinder {
         return findFirstFullMatch(text, 0);
         return findFirstFullMatch(text, 0);
     }
     }
 
 
+    /**
+     * Find the best timestamp format for matching an entire field value.
+     * @param text The value that the returned timestamp format must match in its entirety.
+     * @param requiredFormat A date format that any returned match must support.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstFullMatch(String text, String requiredFormat) {
+        return findFirstFullMatch(text, 0, requiredFormat);
+    }
+
     /**
     /**
      * Find the best timestamp format for matching an entire field value,
      * Find the best timestamp format for matching an entire field value,
      * excluding a specified number of candidate formats.
      * excluding a specified number of candidate formats.
@@ -200,11 +234,25 @@ public final class TimestampFormatFinder {
      * @return The timestamp format, or <code>null</code> if none matches.
      * @return The timestamp format, or <code>null</code> if none matches.
      */
      */
     public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) {
     public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) {
+        return findFirstFullMatch(text, ignoreCandidates, null);
+    }
+
+    /**
+     * Find the best timestamp format for matching an entire field value,
+     * excluding a specified number of candidate formats.
+     * @param text The value that the returned timestamp format must match in its entirety.
+     * @param ignoreCandidates The number of candidate formats to exclude from the search.
+     * @param requiredFormat A date format that any returned match must support.
+     * @return The timestamp format, or <code>null</code> if none matches.
+     */
+    public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates, String requiredFormat) {
         int index = ignoreCandidates;
         int index = ignoreCandidates;
         for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
         for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) {
-            Map<String, Object> captures = candidate.strictFullMatchGrok.captures(text);
-            if (captures != null) {
-                return makeTimestampMatch(candidate, index, "", text, "");
+            if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) {
+                Map<String, Object> captures = candidate.strictFullMatchGrok.captures(text);
+                if (captures != null) {
+                    return makeTimestampMatch(candidate, index, "", text, "");
+                }
             }
             }
             ++index;
             ++index;
         }
         }
@@ -417,7 +465,7 @@ public final class TimestampFormatFinder {
             // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
             // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java
             this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern +
             this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern +
                 "%{GREEDYDATA:" + EPILOGUE + "}");
                 "%{GREEDYDATA:" + EPILOGUE + "}");
-            this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern);
+            this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), "^" + strictGrokPattern + "$");
             this.standardGrokPatternName = standardGrokPatternName;
             this.standardGrokPatternName = standardGrokPatternName;
             assert quickRuleOutIndices.stream()
             assert quickRuleOutIndices.stream()
                 .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size());
                 .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size());

+ 6 - 4
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java

@@ -38,7 +38,7 @@ public class XmlFileStructureFinder implements FileStructureFinder {
     private final FileStructure structure;
     private final FileStructure structure;
 
 
     static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanation, String sample, String charsetName,
     static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanation, String sample, String charsetName,
-                                                             Boolean hasByteOrderMarker)
+                                                             Boolean hasByteOrderMarker, FileStructureOverrides overrides)
         throws IOException, ParserConfigurationException, SAXException {
         throws IOException, ParserConfigurationException, SAXException {
 
 
         String messagePrefix;
         String messagePrefix;
@@ -90,7 +90,7 @@ public class XmlFileStructureFinder implements FileStructureFinder {
             .setNumMessagesAnalyzed(sampleRecords.size())
             .setNumMessagesAnalyzed(sampleRecords.size())
             .setMultilineStartPattern("^\\s*<" + topLevelTag);
             .setMultilineStartPattern("^\\s*<" + topLevelTag);
 
 
-        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords);
+        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides);
         if (timeField != null) {
         if (timeField != null) {
             structureBuilder.setTimestampField(timeField.v1())
             structureBuilder.setTimestampField(timeField.v1())
                 .setTimestampFormats(timeField.v2().dateFormats)
                 .setTimestampFormats(timeField.v2().dateFormats)
@@ -110,8 +110,10 @@ public class XmlFileStructureFinder implements FileStructureFinder {
         secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
         secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
         SortedMap<String, Object> outerMappings = new TreeMap<>();
         SortedMap<String, Object> outerMappings = new TreeMap<>();
         outerMappings.put(topLevelTag, secondLevelProperties);
         outerMappings.put(topLevelTag, secondLevelProperties);
-        outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD,
-            Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
+        if (timeField != null) {
+            outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD,
+                Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"));
+        }
 
 
         FileStructure structure = structureBuilder
         FileStructure structure = structureBuilder
             .setMappings(outerMappings)
             .setMappings(outerMappings)

+ 9 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java

@@ -5,6 +5,7 @@
  */
  */
 package org.elasticsearch.xpack.ml.filestructurefinder;
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
 
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 import org.xml.sax.SAXException;
 import org.xml.sax.SAXException;
 
 
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.ParserConfigurationException;
@@ -27,6 +28,11 @@ public class XmlFileStructureFinderFactory implements FileStructureFinderFactory
         xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
         xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
     }
     }
 
 
+    @Override
+    public boolean canFindFormat(FileStructure.Format format) {
+        return format == null || format == FileStructure.Format.XML;
+    }
+
     /**
     /**
      * This format matches if the sample consists of one or more XML documents,
      * This format matches if the sample consists of one or more XML documents,
      * all with the same root element name.  If there is more than one document,
      * all with the same root element name.  If there is more than one document,
@@ -115,8 +121,9 @@ public class XmlFileStructureFinderFactory implements FileStructureFinderFactory
     }
     }
 
 
     @Override
     @Override
-    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
+    public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
+                                                FileStructureOverrides overrides)
         throws IOException, ParserConfigurationException, SAXException {
         throws IOException, ParserConfigurationException, SAXException {
-        return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker);
+        return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides);
     }
     }
 }
 }

+ 11 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java

@@ -39,6 +39,17 @@ public class RestFindFileStructureAction extends BaseRestHandler {
         FindFileStructureAction.Request request = new FindFileStructureAction.Request();
         FindFileStructureAction.Request request = new FindFileStructureAction.Request();
         request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(),
         request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(),
             FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT));
             FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT));
+        request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName()));
+        request.setFormat(restRequest.param(FindFileStructureAction.Request.FORMAT.getPreferredName()));
+        request.setColumnNames(restRequest.paramAsStringArray(FindFileStructureAction.Request.COLUMN_NAMES.getPreferredName(), null));
+        request.setHasHeaderRow(restRequest.paramAsBoolean(FindFileStructureAction.Request.HAS_HEADER_ROW.getPreferredName(), null));
+        request.setDelimiter(restRequest.param(FindFileStructureAction.Request.DELIMITER.getPreferredName()));
+        request.setQuote(restRequest.param(FindFileStructureAction.Request.QUOTE.getPreferredName()));
+        request.setShouldTrimFields(restRequest.paramAsBoolean(FindFileStructureAction.Request.SHOULD_TRIM_FIELDS.getPreferredName(),
+            null));
+        request.setGrokPattern(restRequest.param(FindFileStructureAction.Request.GROK_PATTERN.getPreferredName()));
+        request.setTimestampFormat(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FORMAT.getPreferredName()));
+        request.setTimestampField(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FIELD.getPreferredName()));
         if (restRequest.hasContent()) {
         if (restRequest.hasContent()) {
             request.setSample(restRequest.content());
             request.setSample(restRequest.content());
         } else {
         } else {

+ 4 - 4
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java

@@ -7,10 +7,10 @@ package org.elasticsearch.xpack.ml.filestructurefinder;
 
 
 public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestCase {
 public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestCase {
 
 
-    private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', 2, false);
-    private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', 2, false);
-    private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', 4, false);
-    private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', 5, true);
+    private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false);
+    private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 2, false);
+    private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false);
+    private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true);
 
 
     // CSV - no need to check JSON or XML because they come earlier in the order we check formats
     // CSV - no need to check JSON or XML because they come earlier in the order we check formats
 
 

+ 187 - 15
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java

@@ -19,7 +19,7 @@ import static org.hamcrest.Matchers.arrayContaining;
 
 
 public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 
 
-    private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', 2, false);
+    private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false);
 
 
     public void testCreateConfigsGivenCompleteCsv() throws Exception {
     public void testCreateConfigsGivenCompleteCsv() throws Exception {
         String sample = "time,message\n" +
         String sample = "time,message\n" +
@@ -29,7 +29,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -43,6 +44,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
         assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
         assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
         assertTrue(structure.getHasHeaderRow());
         assertTrue(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertEquals(Arrays.asList("time", "message"), structure.getColumnNames());
         assertEquals(Arrays.asList("time", "message"), structure.getColumnNames());
@@ -51,6 +53,76 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
         assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
     }
     }
 
 
+    public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exception {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setColumnNames(Arrays.asList("my_time", "my_message")).build();
+
+        String sample = "time,message\n" +
+            "2018-05-17T13:41:23,hello\n" +
+            "2018-05-17T13:41:32,hello again\n";
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides);
+
+        FileStructure structure = structureFinder.getStructure();
+
+        assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
+        assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("my_time", "my_message"), structure.getColumnNames());
+        assertNull(structure.getGrokPattern());
+        assertEquals("my_time", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Exception {
+
+        // It's obvious the first row really should be a header row, so by overriding
+        // detection with the wrong choice the results will be completely changed
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setHasHeaderRow(false).build();
+
+        String sample = "time,message\n" +
+            "2018-05-17T13:41:23,hello\n" +
+            "2018-05-17T13:41:32,hello again\n";
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides);
+
+        FileStructure structure = structureFinder.getStructure();
+
+        assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
+        assertNull(structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
+        assertFalse(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("column1", "column2"), structure.getColumnNames());
+        assertNull(structure.getGrokPattern());
+        assertNull(structure.getTimestampField());
+        assertNull(structure.getTimestampFormats());
+    }
+
     public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception {
     public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception {
         String sample = "message,time,count\n" +
         String sample = "message,time,count\n" +
             "\"hello\n" +
             "\"hello\n" +
@@ -60,7 +132,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -74,6 +147,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern());
         assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern());
         assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
         assertTrue(structure.getHasHeaderRow());
         assertTrue(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames());
         assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames());
@@ -93,7 +167,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -110,6 +185,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             structure.getExcludeLinesPattern());
             structure.getExcludeLinesPattern());
         assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
         assertTrue(structure.getHasHeaderRow());
         assertTrue(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
         assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
@@ -120,6 +196,50 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
         assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
     }
     }
 
 
+    public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() throws Exception {
+
+        // Default timestamp field is the first field from the start of each row that contains a
+        // consistent timestamp format, so if we want the second we need an override
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("tpep_dropoff_datetime").build();
+
+        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
+            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
+            "improvement_surcharge,total_amount,,\n" +
+            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides);
+
+        FileStructure structure = structureFinder.getStructure();
+
+        assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
+            "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
+            "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?",
+            structure.getExcludeLinesPattern());
+        assertEquals("^.*?,.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
+            "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax",
+            "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames());
+        assertNull(structure.getGrokPattern());
+        assertEquals("tpep_dropoff_datetime", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
+    }
+
     public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception {
     public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception {
         String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
         String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
             "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
             "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
@@ -131,7 +251,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -148,6 +269,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             structure.getExcludeLinesPattern());
             structure.getExcludeLinesPattern());
         assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
         assertTrue(structure.getHasHeaderRow());
         assertTrue(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
         assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance",
@@ -158,6 +280,53 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
         assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
     }
     }
 
 
+    public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNamesOverride() throws Exception {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder()
+            .setColumnNames(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count",
+                "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type",
+                "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", "my_improvement_surcharge",
+                "my_total_amount")).build();
+
+        String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," +
+            "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," +
+            "improvement_surcharge,total_amount\n" +
+            "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
+            "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides);
+
+        FileStructure structure = structureFinder.getStructure();
+
+        assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," +
+                "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," +
+                "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
+            structure.getExcludeLinesPattern());
+        assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count",
+            "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type",
+            "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", "my_improvement_surcharge", "my_total_amount"),
+            structure.getColumnNames());
+        assertNull(structure.getGrokPattern());
+        assertEquals("my_tpep_pickup_datetime", structure.getTimestampField());
+        assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats());
+    }
+
     public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception {
     public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception {
         String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
         String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
             "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
             "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
@@ -166,7 +335,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -181,6 +351,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             structure.getExcludeLinesPattern());
             structure.getExcludeLinesPattern());
         assertNull(structure.getMultilineStartPattern());
         assertNull(structure.getMultilineStartPattern());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
         assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
         assertTrue(structure.getHasHeaderRow());
         assertTrue(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getColumnNames());
         assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getColumnNames());
@@ -197,7 +368,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
             "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
 
 
         Tuple<Boolean, String[]> header = DelimitedFileStructureFinder.findHeaderFromSample(explanation,
         Tuple<Boolean, String[]> header = DelimitedFileStructureFinder.findHeaderFromSample(explanation,
-            DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1());
+            DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1(), FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         assertTrue(header.v1());
         assertTrue(header.v1());
         assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype"));
         assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype"));
@@ -210,7 +381,8 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
             "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
             "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n";
 
 
         Tuple<Boolean, String[]> header = DelimitedFileStructureFinder.findHeaderFromSample(explanation,
         Tuple<Boolean, String[]> header = DelimitedFileStructureFinder.findHeaderFromSample(explanation,
-            DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1());
+            DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1(),
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         assertFalse(header.v1());
         assertFalse(header.v1());
         assertThat(header.v2(), arrayContaining("", "", "", ""));
         assertThat(header.v2(), arrayContaining("", "", "", ""));
@@ -283,12 +455,12 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
 
 
     public void testRowContainsDuplicateNonEmptyValues() {
     public void testRowContainsDuplicateNonEmptyValues() {
 
 
-        assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a")));
-        assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("")));
-        assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c")));
-        assertTrue(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a")));
-        assertTrue(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b")));
-        assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", "")));
-        assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", "")));
+        assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Collections.singletonList("a")));
+        assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Collections.singletonList("")));
+        assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "c")));
+        assertEquals("a", DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "a")));
+        assertEquals("b", DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "b")));
+        assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "", "")));
+        assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("", "a", "")));
     }
     }
 }
 }

+ 50 - 12
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java

@@ -6,12 +6,14 @@
 package org.elasticsearch.xpack.ml.filestructurefinder;
 package org.elasticsearch.xpack.ml.filestructurefinder;
 
 
 import com.ibm.icu.text.CharsetMatch;
 import com.ibm.icu.text.CharsetMatch;
+import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
 
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayInputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Arrays;
 
 
+import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES;
 import static org.hamcrest.Matchers.startsWith;
 import static org.hamcrest.Matchers.startsWith;
 import static org.hamcrest.core.IsInstanceOf.instanceOf;
 import static org.hamcrest.core.IsInstanceOf.instanceOf;
 
 
@@ -47,26 +49,62 @@ public class FileStructureFinderManagerTests extends FileStructureTestCase {
     }
     }
 
 
     public void testMakeBestStructureGivenJson() throws Exception {
     public void testMakeBestStructureGivenJson() throws Exception {
-        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
-            "{ \"time\": \"2018-05-17T13:41:23\", \"message\": \"hello\" }", StandardCharsets.UTF_8.name(), randomBoolean()),
-            instanceOf(JsonFileStructureFinder.class));
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+            EMPTY_OVERRIDES), instanceOf(JsonFileStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception {
+
+        // Need to change the quote character from the default of double quotes
+        // otherwise the quotes in the JSON will stop it parsing as CSV
+        FileStructureOverrides overrides = FileStructureOverrides.builder()
+            .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build();
+
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+            overrides), instanceOf(DelimitedFileStructureFinder.class));
     }
     }
 
 
     public void testMakeBestStructureGivenXml() throws Exception {
     public void testMakeBestStructureGivenXml() throws Exception {
-        assertThat(structureFinderManager.makeBestStructureFinder(explanation,
-            "<log time=\"2018-05-17T13:41:23\"><message>hello</message></log>", StandardCharsets.UTF_8.name(), randomBoolean()),
-            instanceOf(XmlFileStructureFinder.class));
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+            EMPTY_OVERRIDES), instanceOf(XmlFileStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build();
+
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+            overrides), instanceOf(TextLogFileStructureFinder.class));
     }
     }
 
 
     public void testMakeBestStructureGivenCsv() throws Exception {
     public void testMakeBestStructureGivenCsv() throws Exception {
-        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "time,message\n" +
-                "2018-05-17T13:41:23,hello\n", StandardCharsets.UTF_8.name(), randomBoolean()),
-            instanceOf(DelimitedFileStructureFinder.class));
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+            EMPTY_OVERRIDES), instanceOf(DelimitedFileStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenCsvAndJsonOverride() {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build();
+
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+                overrides));
+
+        assertEquals("Input did not match the specified format [json]", e.getMessage());
     }
     }
 
 
     public void testMakeBestStructureGivenText() throws Exception {
     public void testMakeBestStructureGivenText() throws Exception {
-        assertThat(structureFinderManager.makeBestStructureFinder(explanation, "[2018-05-17T13:41:23] hello\n" +
-                "[2018-05-17T13:41:24] hello again\n", StandardCharsets.UTF_8.name(), randomBoolean()),
-            instanceOf(TextLogFileStructureFinder.class));
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+            EMPTY_OVERRIDES), instanceOf(TextLogFileStructureFinder.class));
+    }
+
+    public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception {
+
+        // Every line of the text sample has two colons, so colon delimited is possible, just very weird
+        FileStructureOverrides overrides = FileStructureOverrides.builder()
+            .setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build();
+
+        assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(),
+            overrides), instanceOf(DelimitedFileStructureFinder.class));
     }
     }
 }
 }

+ 72 - 22
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java

@@ -17,6 +17,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map;
 import java.util.SortedMap;
 import java.util.SortedMap;
 
 
+import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES;
 import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.contains;
 
 
 public class FileStructureUtilsTests extends FileStructureTestCase {
 public class FileStructureUtilsTests extends FileStructureTestCase {
@@ -32,57 +33,106 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
         assertFalse(FileStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256)));
         assertFalse(FileStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256)));
     }
     }
 
 
-    public void testSingleSampleSingleField() {
+    public void testGuessTimestampGivenSingleSampleSingleField() {
         Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
+            FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES);
         assertNotNull(match);
         assertNotNull(match);
         assertEquals("field1", match.v1());
         assertEquals("field1", match.v1());
         assertThat(match.v2().dateFormats, contains("ISO8601"));
         assertThat(match.v2().dateFormats, contains("ISO8601"));
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
     }
     }
 
 
-    public void testSamplesWithSameSingleTimeField() {
+    public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeFieldOverride() {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field1").build();
+
+        Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Tuple<String, TimestampMatch> match =
+            FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides);
+        assertNotNull(match);
+        assertEquals("field1", match.v1());
+        assertThat(match.v2().dateFormats, contains("ISO8601"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFieldOverride() {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field2").build();
+
+        Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides));
+
+        assertEquals("Specified timestamp field [field2] is not present in record [{field1=2018-05-24T17:28:31,735}]", e.getMessage());
+    }
+
+    public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeFormatOverride() {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("ISO8601").build();
+
+        Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        Tuple<String, TimestampMatch> match =
+            FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides);
+        assertNotNull(match);
+        assertEquals("field1", match.v1());
+        assertThat(match.v2().dateFormats, contains("ISO8601"));
+        assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
+    }
+
+    public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFormatOverride() {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("EEE MMM dd HH:mm:ss YYYY").build();
+
+        Map<String, String> sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides));
+
+        assertEquals("Specified timestamp format [EEE MMM dd HH:mm:ss YYYY] does not match for record [{field1=2018-05-24T17:28:31,735}]",
+            e.getMessage());
+    }
+
+    public void testGuessTimestampGivenSamplesWithSameSingleTimeField() {
         Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406");
         Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406");
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNotNull(match);
         assertNotNull(match);
         assertEquals("field1", match.v1());
         assertEquals("field1", match.v1());
         assertThat(match.v2().dateFormats, contains("ISO8601"));
         assertThat(match.v2().dateFormats, contains("ISO8601"));
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
     }
     }
 
 
-    public void testSamplesWithOneSingleTimeFieldDifferentFormat() {
+    public void testGuessTimestampGivenSamplesWithOneSingleTimeFieldDifferentFormat() {
         Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406");
         Map<String, String> sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406");
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNull(match);
         assertNull(match);
     }
     }
 
 
-    public void testSamplesWithDifferentSingleTimeField() {
+    public void testGuessTimestampGivenSamplesWithDifferentSingleTimeField() {
         Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Map<String, String> sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735");
         Map<String, String> sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406");
         Map<String, String> sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406");
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNull(match);
         assertNull(match);
     }
     }
 
 
-    public void testSingleSampleManyFieldsOneTimeFormat() {
+    public void testGuessTimestampGivenSingleSampleManyFieldsOneTimeFormat() {
         Map<String, Object> sample = new LinkedHashMap<>();
         Map<String, Object> sample = new LinkedHashMap<>();
         sample.put("foo", "not a time");
         sample.put("foo", "not a time");
         sample.put("time", "2018-05-24 17:28:31,735");
         sample.put("time", "2018-05-24 17:28:31,735");
         sample.put("bar", 42);
         sample.put("bar", 42);
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample));
+            FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES);
         assertNotNull(match);
         assertNotNull(match);
         assertEquals("time", match.v1());
         assertEquals("time", match.v1());
         assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
         assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
     }
     }
 
 
-    public void testSamplesWithManyFieldsSameSingleTimeFormat() {
+    public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() {
         Map<String, Object> sample1 = new LinkedHashMap<>();
         Map<String, Object> sample1 = new LinkedHashMap<>();
         sample1.put("foo", "not a time");
         sample1.put("foo", "not a time");
         sample1.put("time", "2018-05-24 17:28:31,735");
         sample1.put("time", "2018-05-24 17:28:31,735");
@@ -92,14 +142,14 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
         sample2.put("time", "2018-05-29 11:53:02,837");
         sample2.put("time", "2018-05-29 11:53:02,837");
         sample2.put("bar", 17);
         sample2.put("bar", 17);
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNotNull(match);
         assertNotNull(match);
         assertEquals("time", match.v1());
         assertEquals("time", match.v1());
         assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
         assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
     }
     }
 
 
-    public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() {
+    public void testGuessTimestampGivenSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() {
         Map<String, Object> sample1 = new LinkedHashMap<>();
         Map<String, Object> sample1 = new LinkedHashMap<>();
         sample1.put("foo", "not a time");
         sample1.put("foo", "not a time");
         sample1.put("time", "2018-05-24 17:28:31,735");
         sample1.put("time", "2018-05-24 17:28:31,735");
@@ -109,11 +159,11 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
         sample2.put("time", "May 29 2018 11:53:02");
         sample2.put("time", "May 29 2018 11:53:02");
         sample2.put("bar", 17);
         sample2.put("bar", 17);
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNull(match);
         assertNull(match);
     }
     }
 
 
-    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() {
+    public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() {
         Map<String, Object> sample1 = new LinkedHashMap<>();
         Map<String, Object> sample1 = new LinkedHashMap<>();
         sample1.put("red_herring", "May 29 2007 11:53:02");
         sample1.put("red_herring", "May 29 2007 11:53:02");
         sample1.put("time", "2018-05-24 17:28:31,735");
         sample1.put("time", "2018-05-24 17:28:31,735");
@@ -123,14 +173,14 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
         sample2.put("time", "2018-05-29 11:53:02,837");
         sample2.put("time", "2018-05-29 11:53:02,837");
         sample2.put("bar", 17);
         sample2.put("bar", 17);
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNotNull(match);
         assertNotNull(match);
         assertEquals("time", match.v1());
         assertEquals("time", match.v1());
         assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
         assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS"));
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
         assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName);
     }
     }
 
 
-    public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() {
+    public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() {
         Map<String, Object> sample1 = new LinkedHashMap<>();
         Map<String, Object> sample1 = new LinkedHashMap<>();
         sample1.put("foo", "not a time");
         sample1.put("foo", "not a time");
         sample1.put("time", "May 24 2018 17:28:31");
         sample1.put("time", "May 24 2018 17:28:31");
@@ -140,14 +190,14 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
         sample2.put("time", "May 29 2018 11:53:02");
         sample2.put("time", "May 29 2018 11:53:02");
         sample2.put("red_herring", "17");
         sample2.put("red_herring", "17");
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNotNull(match);
         assertNotNull(match);
         assertEquals("time", match.v1());
         assertEquals("time", match.v1());
         assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));
         assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));
         assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
         assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName);
     }
     }
 
 
-    public void testSamplesWithManyFieldsInconsistentTimeFields() {
+    public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentTimeFields() {
         Map<String, Object> sample1 = new LinkedHashMap<>();
         Map<String, Object> sample1 = new LinkedHashMap<>();
         sample1.put("foo", "not a time");
         sample1.put("foo", "not a time");
         sample1.put("time1", "May 24 2018 17:28:31");
         sample1.put("time1", "May 24 2018 17:28:31");
@@ -157,11 +207,11 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
         sample2.put("time2", "May 29 2018 11:53:02");
         sample2.put("time2", "May 29 2018 11:53:02");
         sample2.put("bar", 42);
         sample2.put("bar", 42);
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNull(match);
         assertNull(match);
     }
     }
 
 
-    public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() {
+    public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsistentTimeFields() {
         Map<String, Object> sample1 = new LinkedHashMap<>();
         Map<String, Object> sample1 = new LinkedHashMap<>();
         sample1.put("foo", "not a time");
         sample1.put("foo", "not a time");
         sample1.put("time1", "2018-05-09 17:28:31,735");
         sample1.put("time1", "2018-05-09 17:28:31,735");
@@ -173,7 +223,7 @@ public class FileStructureUtilsTests extends FileStructureTestCase {
         sample2.put("time3", "Thu, May 10 2018 11:53:02");
         sample2.put("time3", "Thu, May 10 2018 11:53:02");
         sample2.put("bar", 42);
         sample2.put("bar", 42);
         Tuple<String, TimestampMatch> match =
         Tuple<String, TimestampMatch> match =
-            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2));
+            FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES);
         assertNotNull(match);
         assertNotNull(match);
         assertEquals("time2", match.v1());
         assertEquals("time2", match.v1());
         assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));
         assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM  d YYYY HH:mm:ss"));

+ 58 - 3
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java

@@ -244,8 +244,7 @@ public class GrokPatternCreatorTests extends FileStructureTestCase {
             grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp"));
             grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp"));
         assertEquals(5, mappings.size());
         assertEquals(5, mappings.size());
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field"));
-        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"),
-            mappings.get("extra_timestamp"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("extra_timestamp"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel"));
@@ -273,7 +272,8 @@ public class GrokPatternCreatorTests extends FileStructureTestCase {
         Map<String, Object> mappings = new HashMap<>();
         Map<String, Object> mappings = new HashMap<>();
         GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null);
         GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null);
 
 
-        assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern());
+        assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"),
+            grokPatternCreator.findFullLineGrokPattern(randomBoolean() ? "timestamp" : null));
         assertEquals(10, mappings.size());
         assertEquals(10, mappings.size());
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth"));
         assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth"));
@@ -323,4 +323,59 @@ public class GrokPatternCreatorTests extends FileStructureTestCase {
         assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString());
         assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString());
         assertSame(snippets, adjustedSnippets);
         assertSame(snippets, adjustedSnippets);
     }
     }
+
+    /**
+     * Checks that a user-supplied full-line Grok pattern is accepted when it matches every sample
+     * message, and that a mapping is recorded for each captured field other than the nominated
+     * timestamp field.
+     */
+    public void testValidateFullLineGrokPatternGivenValid() {
+
+        String timestampField = "utc_timestamp";
+        String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" +
+            "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" +
+            "%{GREEDYDATA:message}";
+
+        // Two timestamps: one local, one UTC
+        Collection<String> sampleMessages = Arrays.asList(
+            "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp",
+            "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" +
+                "Info\tsshd\tsubsystem request for sftp");
+
+        Map<String, Object> mappings = new HashMap<>();
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null);
+
+        grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField);
+        // The pattern captures 10 fields; the 9 expected mappings are all of them except the timestamp field.
+        // Every one of the 9 is checked individually so that a wrong mapping for any field is caught.
+        assertEquals(9, mappings.size());
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("serial_no"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("local_timestamp"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("user_id"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("host"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("client_ip"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("method"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("severity"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("program"));
+        assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("message"));
+    }
+
+    /**
+     * Checks that validating a user-supplied full-line Grok pattern against sample messages it
+     * cannot match fails with an {@link IllegalArgumentException} that quotes the supplied pattern.
+     */
+    public void testValidateFullLineGrokPatternGivenInvalid() {
+
+        // A tab-separated pattern...
+        String suppliedPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" +
+            "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" +
+            "%{GREEDYDATA:message}";
+
+        // ...validated against space-separated messages that contain no tabs at all
+        Collection<String> unrelatedMessages = Arrays.asList(
+            "Sep  8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53",
+            "Sep  8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53",
+            "Sep  8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53",
+            "Sep  8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
+
+        GrokPatternCreator creator = new GrokPatternCreator(explanation, unrelatedMessages, new HashMap<>(), null);
+
+        IllegalArgumentException thrown = expectThrows(IllegalArgumentException.class,
+            () -> creator.validateFullLineGrokPattern(suppliedPattern, "utc_timestamp"));
+
+        String expectedMessage = "Supplied Grok pattern [" + suppliedPattern + "] does not match sample messages";
+        assertEquals(expectedMessage, thrown.getMessage());
+    }
 }
 }

+ 3 - 1
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java

@@ -18,7 +18,8 @@ public class JsonFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -32,6 +33,7 @@ public class JsonFileStructureFinderTests extends FileStructureTestCase {
         assertNull(structure.getExcludeLinesPattern());
         assertNull(structure.getExcludeLinesPattern());
         assertNull(structure.getMultilineStartPattern());
         assertNull(structure.getMultilineStartPattern());
         assertNull(structure.getDelimiter());
         assertNull(structure.getDelimiter());
+        assertNull(structure.getQuote());
         assertNull(structure.getHasHeaderRow());
         assertNull(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getGrokPattern());
         assertNull(structure.getGrokPattern());

+ 193 - 86
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java

@@ -15,6 +15,90 @@ import java.util.Set;
 
 
 public class TextLogFileStructureFinderTests extends FileStructureTestCase {
 public class TextLogFileStructureFinderTests extends FileStructureTestCase {
 
 
+    private static final String EXCEPTION_TRACE_SAMPLE = // one log entry plus its stack trace (Lucene frames embed build timestamps)
+        "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " +
+            "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" +
+        "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " +
+            "encoding is longer than the max length 32766), all of which were skipped.  Please correct the analyzer to not produce " +
+            "such terms.  The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " +
+            "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " +
+            "in length; got 49023\n" +
+        "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " +
+            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+        "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " +
+            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+        "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " +
+            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+        "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " +
+            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+        "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " +
+            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+        "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " +
+            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+        "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " +
+            "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
+        "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " +
+            "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " +
+            "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " +
+            "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
+            "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
+            "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
+            "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
+            "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
+            "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
+            "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" +
+            "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" +
+            "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
+            ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
+            ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
+            "[elasticsearch-6.2.1.jar:6.2.1]\n" +
+        "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" +
+        "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" +
+        "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n";
+
     private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();
     private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory();
 
 
     public void testCreateConfigsGivenElasticsearchLog() throws Exception {
     public void testCreateConfigsGivenElasticsearchLog() throws Exception {
@@ -22,7 +106,8 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -36,6 +121,7 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
         assertNull(structure.getExcludeLinesPattern());
         assertNull(structure.getExcludeLinesPattern());
         assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
         assertNull(structure.getDelimiter());
         assertNull(structure.getDelimiter());
+        assertNull(structure.getQuote());
         assertNull(structure.getHasHeaderRow());
         assertNull(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern());
         assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern());
@@ -43,6 +129,85 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
         assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
         assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
     }
     }
 
 
+    /**
+     * Checks that overriding only the timestamp field name leaves the rest of the detected structure
+     * as asserted below, while the overridden name is used both as the reported timestamp field and
+     * as the capture name inside the generated Grok pattern.
+     */
+    public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() throws Exception {
+
+        String overriddenTimestampField = "my_time";
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField(overriddenTimestampField).build();
+
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructure structure =
+            factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides).getStructure();
+
+        assertEquals(FileStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker != null) {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        } else {
+            assertNull(structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
+        assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+
+        // None of the delimited-format settings are expected for semi-structured text
+        assertNull(structure.getDelimiter());
+        assertNull(structure.getQuote());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+
+        // The overridden field name shows up in the Grok pattern as well as in the timestamp field
+        assertEquals("\\[%{TIMESTAMP_ISO8601:my_time}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern());
+        assertEquals(overriddenTimestampField, structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    /**
+     * Checks that a user-supplied Grok pattern that matches the sample is reported back unchanged as
+     * the structure's Grok pattern, with the timestamp field and format asserted below.
+     */
+    public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throws Exception {
+
+        String suppliedGrokPattern = "\\[%{TIMESTAMP_ISO8601:timestamp}\\]" +
+            "\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}";
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern(suppliedGrokPattern).build();
+
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructure structure =
+            factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides).getStructure();
+
+        assertEquals(FileStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker != null) {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        } else {
+            assertNull(structure.getHasByteOrderMarker());
+        }
+        assertNull(structure.getExcludeLinesPattern());
+        assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+
+        // None of the delimited-format settings are expected for semi-structured text
+        assertNull(structure.getDelimiter());
+        assertNull(structure.getQuote());
+        assertNull(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+
+        // The supplied pattern must come back exactly as given
+        assertEquals(suppliedGrokPattern, structure.getGrokPattern());
+        assertEquals("timestamp", structure.getTimestampField());
+        assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats());
+    }
+
+    /**
+     * Checks that a user-supplied Grok pattern that cannot match the sample makes structure creation
+     * fail with an {@link IllegalArgumentException} that quotes the supplied pattern.
+     */
+    public void testCreateConfigsGivenElasticsearchLogAndImpossibleGrokPatternOverride() {
+
+        // This Grok pattern cannot be matched against the messages in the sample because the fields are in the wrong order
+        String impossibleGrokPattern = "\\[%{LOGLEVEL:loglevel} *\\]" +
+            "\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] \\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}";
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern(impossibleGrokPattern).build();
+
+        assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        IllegalArgumentException thrown = expectThrows(IllegalArgumentException.class,
+            () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides));
+
+        String expectedMessage = "Supplied Grok pattern [" + impossibleGrokPattern + "] does not match sample messages";
+        assertEquals(expectedMessage, thrown.getMessage());
+    }
+
     public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() {
     public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() {
         for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
         for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) {
             String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
             String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern();
@@ -144,97 +309,17 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
             "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-watcher]\n" +
             "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] loaded module [x-pack-watcher]\n" +
             "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] no plugins loaded\n";
             "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService     ] [node-0] no plugins loaded\n";
 
 
-        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"));
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch =
+            TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES);
         assertNotNull(mostLikelyMatch);
         assertNotNull(mostLikelyMatch);
         assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
         assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
             mostLikelyMatch.v1());
             mostLikelyMatch.v1());
     }
     }
 
 
     public void testMostLikelyTimestampGivenExceptionTrace() {
     public void testMostLikelyTimestampGivenExceptionTrace() {
-        String sample = "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " +
-                "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" +
-            "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " +
-                "encoding is longer than the max length 32766), all of which were skipped.  Please correct the analyzer to not produce " +
-                "such terms.  The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " +
-                "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " +
-                "in length; got 49023\n" +
-            "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " +
-                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
-            "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " +
-                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
-            "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " +
-                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
-            "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " +
-                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
-            "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " +
-                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
-            "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " +
-                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
-            "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " +
-                "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" +
-            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " +
-                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " +
-                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " +
-                "~[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
-                "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" +
-                "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
-                "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" +
-                "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
-                "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" +
-                "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" +
-                "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" +
-                "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
-                ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" +
-                ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " +
-                "[elasticsearch-6.2.1.jar:6.2.1]\n" +
-            "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" +
-            "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" +
-            "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n";
-
-        Tuple<TimestampMatch, Set<String>> mostLikelyMatch = TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"));
+
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch =
+            TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES);
         assertNotNull(mostLikelyMatch);
         assertNotNull(mostLikelyMatch);
 
 
         // Even though many lines have a timestamp near the end (in the Lucene version information),
         // Even though many lines have a timestamp near the end (in the Lucene version information),
@@ -243,4 +328,26 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase {
         assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
         assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""),
             mostLikelyMatch.v1());
             mostLikelyMatch.v1());
     }
     }
+
+    public void testMostLikelyTimestampGivenExceptionTraceAndTimestampFormatOverride() {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("YYYY-MM-dd HH:mm:ss").build();
+
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch =
+            TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides);
+        assertNotNull(mostLikelyMatch);
+
+        // The override should force the seemingly inferior choice of timestamp
+        assertEquals(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601",
+                ""), mostLikelyMatch.v1());
+    }
+
+    public void testMostLikelyTimestampGivenExceptionTraceAndImpossibleTimestampFormatOverride() {
+
+        FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("MMM dd HH:mm:ss").build();
+
+        Tuple<TimestampMatch, Set<String>> mostLikelyMatch =
+            TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides);
+        assertNull(mostLikelyMatch);
+    }
 }
 }

+ 3 - 1
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java

@@ -18,7 +18,8 @@ public class XmlFileStructureFinderTests extends FileStructureTestCase {
 
 
         String charset = randomFrom(POSSIBLE_CHARSETS);
         String charset = randomFrom(POSSIBLE_CHARSETS);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
         Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
-        FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker);
+        FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker,
+            FileStructureOverrides.EMPTY_OVERRIDES);
 
 
         FileStructure structure = structureFinder.getStructure();
         FileStructure structure = structureFinder.getStructure();
 
 
@@ -32,6 +33,7 @@ public class XmlFileStructureFinderTests extends FileStructureTestCase {
         assertNull(structure.getExcludeLinesPattern());
         assertNull(structure.getExcludeLinesPattern());
         assertEquals("^\\s*<log4j:event", structure.getMultilineStartPattern());
         assertEquals("^\\s*<log4j:event", structure.getMultilineStartPattern());
         assertNull(structure.getDelimiter());
         assertNull(structure.getDelimiter());
+        assertNull(structure.getQuote());
         assertNull(structure.getHasHeaderRow());
         assertNull(structure.getHasHeaderRow());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getShouldTrimFields());
         assertNull(structure.getGrokPattern());
         assertNull(structure.getGrokPattern());

+ 42 - 1
x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json

@@ -10,9 +10,50 @@
           "type": "int",
           "type": "int",
           "description": "Optional parameter to specify how many lines of the file to include in the analysis"
           "description": "Optional parameter to specify how many lines of the file to include in the analysis"
         },
         },
+        "charset": {
+          "type": "string",
+          "description": "Optional parameter to specify the character set of the file"
+        },
+        "format": {
+          "type": "enum",
+          "options": [ "json", "xml", "delimited", "semi_structured_text" ],
+          "description": "Optional parameter to specify the high level file format"
+        },
+        "has_header_row": {
+          "type": "boolean",
+          "description": "Optional parameter to specify whether a delimited file includes the column names in its first row"
+        },
+        "column_names": {
+          "type": "list",
+          "description": "Optional parameter containing a comma separated list of the column names for a delimited file"
+        },
+        "delimiter": {
+          "type": "string",
+          "description": "Optional parameter to specify the delimiter character for a delimited file - must be a single character"
+        },
+        "quote": {
+          "type": "string",
+          "description": "Optional parameter to specify the quote character for a delimited file - must be a single character"
+        },
+        "should_trim_fields": {
+          "type": "boolean",
+          "description": "Optional parameter to specify whether the values between delimiters in a delimited file should have whitespace trimmed from them"
+        },
+        "grok_pattern": {
+          "type": "string",
+          "description": "Optional parameter to specify the Grok pattern that should be used to extract fields from messages in a semi-structured text file"
+        },
+        "timestamp_field": {
+          "type": "string",
+          "description": "Optional parameter to specify the timestamp field in the file"
+        },
+        "timestamp_format": {
+          "type": "string",
+          "description": "Optional parameter to specify the timestamp format in the file"
+        },
         "explain": {
         "explain": {
           "type": "boolean",
           "type": "boolean",
-          "description": "Optional parameter to include an commentary on how the structure was derived"
+          "description": "Optional parameter to include a commentary on how the structure was derived"
         }
         }
       }
       }
     },
     },

+ 54 - 1
x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml

@@ -1,11 +1,12 @@
 ---
 ---
-"Test JSON file structure analysis":
+"Test JSON file structure analysis without overrides":
   - do:
   - do:
       headers:
       headers:
         # This is to stop the usual content type randomization, which
         # This is to stop the usual content type randomization, which
         # would obviously ruin the results for this particular test
         # would obviously ruin the results for this particular test
         Content-Type: "application/json"
         Content-Type: "application/json"
       xpack.ml.find_file_structure:
       xpack.ml.find_file_structure:
+        lines_to_sample: 3
         body:
         body:
           - airline: AAL
           - airline: AAL
             responsetime: 132.2046
             responsetime: 132.2046
@@ -42,3 +43,55 @@
   - match: { field_stats.time.count: 3 }
   - match: { field_stats.time.count: 3 }
   - match: { field_stats.time.cardinality: 3 }
   - match: { field_stats.time.cardinality: 3 }
   - match: { field_stats.time.cardinality: 3 }
   - match: { field_stats.time.cardinality: 3 }
+  - is_false: explanation
+
+---
+"Test JSON file structure analysis with overrides":
+  - do:
+      headers:
+        # This is to stop the usual content type randomization, which
+        # would obviously ruin the results for this particular test
+        Content-Type: "application/json"
+      xpack.ml.find_file_structure:
+        charset: UTF-8
+        format: json
+        timestamp_field: time
+        timestamp_format: UNIX
+        explain: true
+        body:
+          - airline: AAL
+            responsetime: 132.2046
+            sourcetype: file-structure-test
+            time: 1403481600
+          - airline: JZA
+            responsetime: 990.4628
+            sourcetype: file-structure-test
+            time: 1403481700
+          - airline: AAL
+            responsetime: 134.2046
+            sourcetype: file-structure-test
+            time: 1403481800
+
+  - match: { num_lines_analyzed: 3 }
+  - match: { num_messages_analyzed: 3 }
+  - match: { charset: "UTF-8" }
+  - match: { has_byte_order_marker: false }
+  - match: { format: json }
+  - match: { timestamp_field: time }
+  - match: { timestamp_formats.0: UNIX }
+  - match: { need_client_timezone: false }
+  - match: { mappings.airline.type: keyword }
+  - match: { mappings.responsetime.type: double }
+  - match: { mappings.sourcetype.type: keyword }
+  - match: { mappings.time.type: date }
+  - match: { mappings.time.format: epoch_second }
+  - match: { field_stats.airline.count: 3 }
+  - match: { field_stats.airline.cardinality: 2 }
+  - match: { field_stats.responsetime.count: 3 }
+  - match: { field_stats.responsetime.cardinality: 3 }
+  - match: { field_stats.sourcetype.count: 3 }
+  - match: { field_stats.sourcetype.cardinality: 1 }
+  - match: { field_stats.time.count: 3 }
+  - match: { field_stats.time.cardinality: 3 }
+  - match: { field_stats.time.cardinality: 3 }
+  - match: { explanation.0: "Using specified character encoding [UTF-8]" }