瀏覽代碼

[ML] Add field stats to log structure finder (#33351)

The log structure endpoint will return field stats in addition to
pure structure information, so that its response can be used to
drive pre-import data visualizer functionality.

The statistics for every field are count, cardinality
(distinct count) and top hits (most common values).  Extra
statistics are calculated if the field is numeric: min, max,
mean and median.
David Roberts 7 年之前
父節點
當前提交
a296829205
共有 14 個文件被更改,包括 815 次插入和 85 次删除
  1. 8 1
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/DelimitedLogStructureFinder.java
  2. 147 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStats.java
  3. 182 0
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsCalculator.java
  4. 29 13
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreator.java
  5. 8 1
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinder.java
  6. 32 3
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructure.java
  7. 43 15
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtils.java
  8. 4 1
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinder.java
  9. 8 1
      x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinder.java
  10. 218 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsCalculatorTests.java
  11. 61 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsTests.java
  12. 17 17
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreatorTests.java
  13. 8 0
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTests.java
  14. 50 33
      x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtilsTests.java

+ 8 - 1
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/DelimitedLogStructureFinder.java

@@ -123,9 +123,16 @@ public class DelimitedLogStructureFinder implements LogStructureFinder {
                 .setMultilineStartPattern(timeLineRegex);
         }
 
-        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
+            LogStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
+
+        SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
         mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
 
+        if (mappingsAndFieldStats.v2() != null) {
+            structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
+        }
+
         LogStructure structure = structureBuilder
             .setMappings(mappings)
             .setExplanation(explanation)

+ 147 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStats.java

@@ -0,0 +1,147 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.ToXContentObject;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+public class FieldStats implements ToXContentObject {
+
+    static final ParseField COUNT = new ParseField("count");
+    static final ParseField CARDINALITY = new ParseField("cardinality");
+    static final ParseField MIN_VALUE = new ParseField("min_value");
+    static final ParseField MAX_VALUE = new ParseField("max_value");
+    static final ParseField MEAN_VALUE = new ParseField("mean_value");
+    static final ParseField MEDIAN_VALUE = new ParseField("median_value");
+    static final ParseField TOP_HITS = new ParseField("top_hits");
+
+    @SuppressWarnings("unchecked")
+    public static final ConstructingObjectParser<FieldStats, Void> PARSER = new ConstructingObjectParser<>("field_stats", false,
+        a -> new FieldStats((long) a[0], (int) a[1], (Double) a[2], (Double) a[3], (Double) a[4], (Double) a[5],
+            (List<Map<String, Object>>) a[6]));
+
+    static {
+        PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT);
+        PARSER.declareInt(ConstructingObjectParser.constructorArg(), CARDINALITY);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MIN_VALUE);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE);
+        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE);
+        PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS);
+    }
+
+    private final long count;
+    private final int cardinality;
+    private final Double minValue;
+    private final Double maxValue;
+    private final Double meanValue;
+    private final Double medianValue;
+    private final List<Map<String, Object>> topHits;
+
+    FieldStats(long count, int cardinality, List<Map<String, Object>> topHits) {
+        this(count, cardinality, null, null, null, null, topHits);
+    }
+
+    FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue,
+               List<Map<String, Object>> topHits) {
+        this.count = count;
+        this.cardinality = cardinality;
+        this.minValue = minValue;
+        this.maxValue = maxValue;
+        this.meanValue = meanValue;
+        this.medianValue = medianValue;
+        this.topHits = (topHits == null) ? Collections.emptyList() : Collections.unmodifiableList(topHits);
+    }
+
+    public long getCount() {
+        return count;
+    }
+
+    public int getCardinality() {
+        return cardinality;
+    }
+
+    public Double getMinValue() {
+        return minValue;
+    }
+
+    public Double getMaxValue() {
+        return maxValue;
+    }
+
+    public Double getMeanValue() {
+        return meanValue;
+    }
+
+    public Double getMedianValue() {
+        return medianValue;
+    }
+
+    public List<Map<String, Object>> getTopHits() {
+        return topHits;
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+
+        builder.startObject();
+        builder.field(COUNT.getPreferredName(), count);
+        builder.field(CARDINALITY.getPreferredName(), cardinality);
+        if (minValue != null) {
+            builder.field(MIN_VALUE.getPreferredName(), minValue);
+        }
+        if (maxValue != null) {
+            builder.field(MAX_VALUE.getPreferredName(), maxValue);
+        }
+        if (meanValue != null) {
+            builder.field(MEAN_VALUE.getPreferredName(), meanValue);
+        }
+        if (medianValue != null) {
+            builder.field(MEDIAN_VALUE.getPreferredName(), medianValue);
+        }
+        if (topHits.isEmpty() == false) {
+            builder.field(TOP_HITS.getPreferredName(), topHits);
+        }
+        builder.endObject();
+
+        return builder;
+    }
+
+    @Override
+    public int hashCode() {
+
+        return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits);
+    }
+
+    @Override
+    public boolean equals(Object other) {
+
+        if (this == other) {
+            return true;
+        }
+
+        if (other == null || getClass() != other.getClass()) {
+            return false;
+        }
+
+        FieldStats that = (FieldStats) other;
+        return this.count == that.count &&
+            this.cardinality == that.cardinality &&
+            Objects.equals(this.minValue, that.minValue) &&
+            Objects.equals(this.maxValue, that.maxValue) &&
+            Objects.equals(this.meanValue, that.meanValue) &&
+            Objects.equals(this.medianValue, that.medianValue) &&
+            Objects.equals(this.topHits, that.topHits);
+    }
+}

+ 182 - 0
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsCalculator.java

@@ -0,0 +1,182 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+/**
+ * Calculate statistics for a set of scalar field values.
+ * Count, cardinality (distinct count) and top hits (most common values) are always calculated.
+ * Extra statistics are calculated if the field is numeric: min, max, mean and median.
+ */
+public class FieldStatsCalculator {
+
+    private long count;
+    private SortedMap<String, Integer> countsByStringValue = new TreeMap<>();
+    private SortedMap<Double, Integer> countsByNumericValue = new TreeMap<>();
+
+    /**
+     * Add a collection of values to the calculator.
+     * The values to be added can be combined by the caller and added in a
+     * single call to this method or added in multiple calls to this method.
+     * @param fieldValues Zero or more values to add.  May not be <code>null</code>.
+     */
+    public void accept(Collection<String> fieldValues) {
+
+        count += fieldValues.size();
+
+        for (String fieldValue : fieldValues) {
+
+            countsByStringValue.compute(fieldValue, (k, v) -> (v == null) ? 1 : (1 + v));
+
+            if (countsByNumericValue != null) {
+
+                try {
+                    countsByNumericValue.compute(Double.valueOf(fieldValue), (k, v) -> (v == null) ? 1 : (1 + v));
+                } catch (NumberFormatException e) {
+                    countsByNumericValue = null;
+                }
+            }
+        }
+    }
+
+    /**
+     * Calculate field statistics based on the previously accepted values.
+     * @param numTopHits The maximum number of entries to include in the top hits.
+     * @return The calculated field statistics.
+     */
+    public FieldStats calculate(int numTopHits) {
+
+        if (countsByNumericValue != null && countsByNumericValue.isEmpty() == false) {
+            return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(),
+                calculateMean(), calculateMedian(), findNumericTopHits(numTopHits));
+        } else {
+            return new FieldStats(count, countsByStringValue.size(), findStringTopHits(numTopHits));
+        }
+    }
+
+    Double calculateMean() {
+
+        assert countsByNumericValue != null;
+
+        if (countsByNumericValue.isEmpty()) {
+            return null;
+        }
+
+        double runningCount = 0.0;
+        double runningMean = Double.NaN;
+
+        for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
+
+            double entryCount = (double) entry.getValue();
+            double newRunningCount = runningCount + entryCount;
+
+            // Updating a running mean like this is more numerically stable than using (sum / count)
+            if (runningCount > 0.0) {
+                runningMean = runningMean * (runningCount / newRunningCount) + entry.getKey() * (entryCount / newRunningCount);
+            } else if (entryCount > 0.0) {
+                runningMean = entry.getKey();
+            }
+
+            runningCount = newRunningCount;
+        }
+
+        return runningMean;
+    }
+
+    Double calculateMedian() {
+
+        assert countsByNumericValue != null;
+
+        if (count % 2 == 1) {
+
+            // Simple case - median is middle value
+            long targetCount = count / 2 + 1;
+            long currentUpperBound = 0;
+
+            for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
+
+                currentUpperBound += entry.getValue();
+
+                if (currentUpperBound >= targetCount) {
+                    return entry.getKey();
+                }
+            }
+
+        } else {
+
+            // More complicated case - median is average of two middle values
+            long target1Count = count / 2;
+            long target2Count = target1Count + 1;
+            double target1Value = Double.NaN;
+            long prevUpperBound = -1;
+            long currentUpperBound = 0;
+
+            for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {
+
+                currentUpperBound += entry.getValue();
+
+                if (currentUpperBound >= target2Count) {
+
+                    if (prevUpperBound < target1Count) {
+                        // Both target values are the same
+                        return entry.getKey();
+                    } else {
+                        return (target1Value + entry.getKey()) / 2.0;
+                    }
+                }
+
+                if (currentUpperBound >= target1Count) {
+                    target1Value = entry.getKey();
+                }
+
+                prevUpperBound = currentUpperBound;
+            }
+        }
+
+        return null;
+    }
+
+    List<Map<String, Object>> findNumericTopHits(int numTopHits) {
+        assert countsByNumericValue != null;
+        return findTopHits(numTopHits, countsByNumericValue, Comparator.comparing(Map.Entry<Double, Integer>::getKey));
+    }
+
+    List<Map<String, Object>> findStringTopHits(int numTopHits) {
+        return findTopHits(numTopHits, countsByStringValue, Comparator.comparing(Map.Entry<String, Integer>::getKey));
+    }
+
+    /**
+     * Order by descending count, with a secondary sort to ensure reproducibility of results.
+     */
+    private static <T> List<Map<String, Object>> findTopHits(int numTopHits, Map<T, Integer> countsByValue,
+                                                             Comparator<Map.Entry<T, Integer>> secondarySort) {
+
+        List<Map.Entry<T, Integer>> sortedByCount = countsByValue.entrySet().stream()
+            .sorted(Comparator.comparing(Map.Entry<T, Integer>::getValue, Comparator.reverseOrder()).thenComparing(secondarySort))
+            .limit(numTopHits).collect(Collectors.toList());
+
+        List<Map<String, Object>> topHits = new ArrayList<>(sortedByCount.size());
+
+        for (Map.Entry<T, Integer> entry : sortedByCount) {
+
+            Map<String, Object> topHit = new LinkedHashMap<>(3);
+            topHit.put("value", entry.getKey());
+            topHit.put("count", entry.getValue());
+            topHits.add(topHit);
+        }
+
+        return topHits;
+    }
+}

+ 29 - 13
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreator.java

@@ -119,6 +119,7 @@ public final class GrokPatternCreator {
      * Both this class and other classes will update it.
      */
     private final Map<String, Object> mappings;
+    private final Map<String, FieldStats> fieldStats;
     private final Map<String, Integer> fieldNameCountStore = new HashMap<>();
     private final StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
@@ -128,22 +129,26 @@ public final class GrokPatternCreator {
      *                    can be appended by the methods of this class.
      * @param sampleMessages Sample messages that any Grok pattern found must match.
      * @param mappings Will be updated with mappings appropriate for the returned pattern, if non-<code>null</code>.
+     * @param fieldStats Will be updated with field stats for the fields in the returned pattern, if non-<code>null</code>.
      */
-    public GrokPatternCreator(List<String> explanation, Collection<String> sampleMessages, Map<String, Object> mappings) {
+    public GrokPatternCreator(List<String> explanation, Collection<String> sampleMessages, Map<String, Object> mappings,
+                              Map<String, FieldStats> fieldStats) {
         this.explanation = explanation;
         this.sampleMessages = Collections.unmodifiableCollection(sampleMessages);
         this.mappings = mappings;
+        this.fieldStats = fieldStats;
     }
 
     /**
      * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety.
+     * It will also update mappings and field stats if they are non-<code>null</code>.
      * @return A tuple of (time field name, Grok string), or <code>null</code> if no suitable Grok pattern was found.
      */
     public Tuple<String, String> findFullLineGrokPattern() {
 
         for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) {
             if (candidate.matchesAll(sampleMessages)) {
-                return candidate.processMatch(explanation, sampleMessages, mappings);
+                return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats);
             }
         }
 
@@ -186,7 +191,8 @@ public final class GrokPatternCreator {
 
         Collection<String> prefaces = new ArrayList<>();
         Collection<String> epilogues = new ArrayList<>();
-        String patternBuilderContent = chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings);
+        String patternBuilderContent =
+            chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings, fieldStats);
         appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft);
         overallGrokPatternBuilder.append(patternBuilderContent);
         appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight);
@@ -375,11 +381,12 @@ public final class GrokPatternCreator {
         /**
          * After it has been determined that this Grok pattern candidate matches a collection of strings,
          * return collections of the bits that come before (prefaces) and after (epilogues) the bit
-         * that matches.  Also update mappings with the most appropriate field name and type.
+         * that matches.  Also update mappings with the most appropriate field name and type, and
+         * calculate field stats.
          * @return The string that needs to be incorporated into the overall Grok pattern for the line.
          */
         String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
-                               Collection<String> epilogues, Map<String, Object> mappings);
+                               Collection<String> epilogues, Map<String, Object> mappings, Map<String, FieldStats> fieldStats);
     }
 
     /**
@@ -436,7 +443,7 @@ public final class GrokPatternCreator {
          */
         @Override
         public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
-                                      Collection<String> epilogues, Map<String, Object> mappings) {
+                                      Collection<String> epilogues, Map<String, Object> mappings, Map<String, FieldStats> fieldStats) {
             String sampleValue = null;
             for (String snippet : snippets) {
                 Map<String, Object> captures = grok.captures(snippet);
@@ -505,7 +512,7 @@ public final class GrokPatternCreator {
 
         @Override
         public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
-                                      Collection<String> epilogues, Map<String, Object> mappings) {
+                                      Collection<String> epilogues, Map<String, Object> mappings, Map<String, FieldStats> fieldStats) {
             if (fieldName == null) {
                 throw new IllegalStateException("Cannot process KV matches until a field name has been determined");
             }
@@ -526,6 +533,9 @@ public final class GrokPatternCreator {
             if (mappings != null) {
                 mappings.put(adjustedFieldName, LogStructureUtils.guessScalarMapping(explanation, adjustedFieldName, values));
             }
+            if (fieldStats != null) {
+                fieldStats.put(adjustedFieldName, LogStructureUtils.calculateFieldStats(values));
+            }
             return "\\b" + fieldName + "=%{USER:" + adjustedFieldName + "}";
         }
     }
@@ -541,8 +551,8 @@ public final class GrokPatternCreator {
 
         @Override
         public String processCaptures(Map<String, Integer> fieldNameCountStore, Collection<String> snippets, Collection<String> prefaces,
-                                      Collection<String> epilogues, Map<String, Object> mappings) {
-            return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null);
+                                      Collection<String> epilogues, Map<String, Object> mappings, Map<String, FieldStats> fieldStats) {
+            return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null, fieldStats);
         }
     }
 
@@ -570,11 +580,11 @@ public final class GrokPatternCreator {
          * @return A tuple of (time field name, Grok string).
          */
         public Tuple<String, String> processMatch(List<String> explanation, Collection<String> sampleMessages,
-                                                  Map<String, Object> mappings) {
+                                                  Map<String, Object> mappings, Map<String, FieldStats> fieldStats) {
 
             explanation.add("A full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate");
 
-            if (mappings != null) {
+            if (mappings != null || fieldStats != null) {
                 Map<String, Collection<String>> valuesPerField = new HashMap<>();
 
                 for (String sampleMessage : sampleMessages) {
@@ -604,8 +614,14 @@ public final class GrokPatternCreator {
 
                 for (Map.Entry<String, Collection<String>> valuesForField : valuesPerField.entrySet()) {
                     String fieldName = valuesForField.getKey();
-                    mappings.put(fieldName,
-                        LogStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
+                    if (mappings != null) {
+                        mappings.put(fieldName,
+                            LogStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue()));
+                    }
+                    if (fieldStats != null) {
+                        fieldStats.put(fieldName,
+                            LogStructureUtils.calculateFieldStats(valuesForField.getValue()));
+                    }
                 }
             }
 

+ 8 - 1
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/JsonLogStructureFinder.java

@@ -56,9 +56,16 @@ public class JsonLogStructureFinder implements LogStructureFinder {
                 .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
         }
 
-        SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
+            LogStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
+
+        SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
         mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
 
+        if (mappingsAndFieldStats.v2() != null) {
+            structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
+        }
+
         LogStructure structure = structureBuilder
             .setMappings(mappings)
             .setExplanation(explanation)

+ 32 - 3
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructure.java

@@ -9,6 +9,7 @@ import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.xcontent.ObjectParser;
 import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -95,6 +96,7 @@ public class LogStructure implements ToXContentObject {
     static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
     static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
     static final ParseField MAPPINGS = new ParseField("mappings");
+    static final ParseField FIELD_STATS = new ParseField("field_stats");
     static final ParseField EXPLANATION = new ParseField("explanation");
 
     public static final ObjectParser<Builder, Void> PARSER = new ObjectParser<>("log_file_structure", false, Builder::new);
@@ -117,6 +119,13 @@ public class LogStructure implements ToXContentObject {
         PARSER.declareStringArray(Builder::setTimestampFormats, TIMESTAMP_FORMATS);
         PARSER.declareBoolean(Builder::setNeedClientTimezone, NEED_CLIENT_TIMEZONE);
         PARSER.declareObject(Builder::setMappings, (p, c) -> new TreeMap<>(p.map()), MAPPINGS);
+        PARSER.declareObject(Builder::setFieldStats, (p, c) -> {
+            Map<String, FieldStats> fieldStats = new TreeMap<>();
+            while (p.nextToken() == XContentParser.Token.FIELD_NAME) {
+                fieldStats.put(p.currentName(), FieldStats.PARSER.apply(p, c));
+            }
+            return fieldStats;
+        }, FIELD_STATS);
         PARSER.declareStringArray(Builder::setExplanation, EXPLANATION);
     }
 
@@ -137,13 +146,14 @@ public class LogStructure implements ToXContentObject {
     private final String timestampField;
     private final boolean needClientTimezone;
     private final SortedMap<String, Object> mappings;
+    private final SortedMap<String, FieldStats> fieldStats;
     private final List<String> explanation;
 
     public LogStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
                         Format format, String multilineStartPattern, String excludeLinesPattern, List<String> inputFields,
                         Boolean hasHeaderRow, Character delimiter, Boolean shouldTrimFields, String grokPattern, String timestampField,
                         List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
-                        List<String> explanation) {
+                        Map<String, FieldStats> fieldStats, List<String> explanation) {
 
         this.numLinesAnalyzed = numLinesAnalyzed;
         this.numMessagesAnalyzed = numMessagesAnalyzed;
@@ -162,6 +172,7 @@ public class LogStructure implements ToXContentObject {
         this.timestampFormats = (timestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(timestampFormats));
         this.needClientTimezone = needClientTimezone;
         this.mappings = Collections.unmodifiableSortedMap(new TreeMap<>(mappings));
+        this.fieldStats = Collections.unmodifiableSortedMap(new TreeMap<>(fieldStats));
         this.explanation = Collections.unmodifiableList(new ArrayList<>(explanation));
     }
 
@@ -233,6 +244,10 @@ public class LogStructure implements ToXContentObject {
         return mappings;
     }
 
+    public SortedMap<String, FieldStats> getFieldStats() {
+        return fieldStats;
+    }
+
     public List<String> getExplanation() {
         return explanation;
     }
@@ -278,6 +293,13 @@ public class LogStructure implements ToXContentObject {
         }
         builder.field(NEED_CLIENT_TIMEZONE.getPreferredName(), needClientTimezone);
         builder.field(MAPPINGS.getPreferredName(), mappings);
+        if (fieldStats.isEmpty() == false) {
+            builder.startObject(FIELD_STATS.getPreferredName());
+            for (Map.Entry<String, FieldStats> entry : fieldStats.entrySet()) {
+                builder.field(entry.getKey(), entry.getValue());
+            }
+            builder.endObject();
+        }
         builder.field(EXPLANATION.getPreferredName(), explanation);
         builder.endObject();
 
@@ -289,7 +311,7 @@ public class LogStructure implements ToXContentObject {
 
         return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
             multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField,
-            timestampFormats, needClientTimezone, mappings, explanation);
+            timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
     }
 
     @Override
@@ -321,6 +343,7 @@ public class LogStructure implements ToXContentObject {
             Objects.equals(this.timestampField, that.timestampField) &&
             Objects.equals(this.timestampFormats, that.timestampFormats) &&
             Objects.equals(this.mappings, that.mappings) &&
+            Objects.equals(this.fieldStats, that.fieldStats) &&
             Objects.equals(this.explanation, that.explanation);
     }
 
@@ -343,6 +366,7 @@ public class LogStructure implements ToXContentObject {
         private List<String> timestampFormats;
         private boolean needClientTimezone;
         private Map<String, Object> mappings;
+        private Map<String, FieldStats> fieldStats = Collections.emptyMap();
         private List<String> explanation;
 
         public Builder() {
@@ -438,6 +462,11 @@ public class LogStructure implements ToXContentObject {
             return this;
         }
 
+        public Builder setFieldStats(Map<String, FieldStats> fieldStats) {
+            this.fieldStats = Objects.requireNonNull(fieldStats);
+            return this;
+        }
+
         public Builder setExplanation(List<String> explanation) {
             this.explanation = Objects.requireNonNull(explanation);
             return this;
@@ -540,7 +569,7 @@ public class LogStructure implements ToXContentObject {
 
             return new LogStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
                 multilineStartPattern, excludeLinesPattern, inputFields, hasHeaderRow, delimiter, shouldTrimFields, grokPattern,
-                timestampField, timestampFormats, needClientTimezone, mappings, explanation);
+                timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
         }
     }
 }

+ 43 - 15
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtils.java

@@ -16,6 +16,7 @@ import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.stream.Collectors;
@@ -28,6 +29,7 @@ public final class LogStructureUtils {
     public static final String MAPPING_FORMAT_SETTING = "format";
     public static final String MAPPING_PROPERTIES_SETTING = "properties";
 
+    private static final int NUM_TOP_HITS = 10;
     // NUMBER Grok pattern doesn't support scientific notation, so we extend it
     private static final Grok NUMBER_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$");
     private static final Grok IP_GROK = new Grok(Grok.getBuiltinPatterns(), "^%{IP}$");
@@ -112,26 +114,39 @@ public final class LogStructureUtils {
      * @param sampleRecords The sampled records.
      * @return A tuple of (map of field name to mapping settings, map of field name to field stats).
      */
-    static SortedMap<String, Object> guessMappings(List<String> explanation, List<Map<String, ?>> sampleRecords) {
+    static Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>>
+        guessMappingsAndCalculateFieldStats(List<String> explanation, List<Map<String, ?>> sampleRecords) {
 
         SortedMap<String, Object> mappings = new TreeMap<>();
+        SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
 
-        for (Map<String, ?> sampleRecord : sampleRecords) {
-            for (String fieldName : sampleRecord.keySet()) {
-                mappings.computeIfAbsent(fieldName, key -> guessMapping(explanation, fieldName,
-                    sampleRecords.stream().flatMap(record -> {
-                            Object fieldValue = record.get(fieldName);
-                            return (fieldValue == null) ? Stream.empty() : Stream.of(fieldValue);
-                        }
-                    ).collect(Collectors.toList())));
+        Set<String> uniqueFieldNames = sampleRecords.stream().flatMap(record -> record.keySet().stream()).collect(Collectors.toSet());
+        // Each distinct field name is processed exactly once, however many records contain it
+        for (String fieldName : uniqueFieldNames) {
+            // Gather this field's values from every sample record, leaving out nulls
+            List<Object> fieldValues = sampleRecords.stream().flatMap(record -> {
+                    Object fieldValue = record.get(fieldName);
+                    return (fieldValue == null) ? Stream.empty() : Stream.of(fieldValue);
+                }
+            ).collect(Collectors.toList());
+            // The tuple, or either of its elements, may be null, so only the non-null parts are stored
+            Tuple<Map<String, String>, FieldStats> mappingAndFieldStats =
+                guessMappingAndCalculateFieldStats(explanation, fieldName, fieldValues);
+            if (mappingAndFieldStats != null) {
+                if (mappingAndFieldStats.v1() != null) {
+                    mappings.put(fieldName, mappingAndFieldStats.v1());
+                }
+                if (mappingAndFieldStats.v2() != null) {
+                    fieldStats.put(fieldName, mappingAndFieldStats.v2());
+                }
             }
         }
 
-        return mappings;
+        return new Tuple<>(mappings, fieldStats);
     }
 
-    static Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {
-
+    static Tuple<Map<String, String>, FieldStats> guessMappingAndCalculateFieldStats(List<String> explanation,
+                                                                                     String fieldName, List<Object> fieldValues) {
         if (fieldValues == null || fieldValues.isEmpty()) {
             // We can get here if all the records that contained a given field had a null value for it.
             // In this case it's best not to make any statement about what the mapping type should be.
@@ -140,7 +155,7 @@ public final class LogStructureUtils {
 
         if (fieldValues.stream().anyMatch(value -> value instanceof Map)) {
             if (fieldValues.stream().allMatch(value -> value instanceof Map)) {
-                return Collections.singletonMap(MAPPING_TYPE_SETTING, "object");
+                return new Tuple<>(Collections.singletonMap(MAPPING_TYPE_SETTING, "object"), null);
             }
             throw new IllegalArgumentException("Field [" + fieldName +
                 "] has both object and non-object values - this is not supported by Elasticsearch");
@@ -148,11 +163,12 @@ public final class LogStructureUtils {
 
         if (fieldValues.stream().anyMatch(value -> value instanceof List || value instanceof Object[])) {
             // Elasticsearch fields can be either arrays or single values, but array values must all have the same type
-            return guessMapping(explanation, fieldName,
+            return guessMappingAndCalculateFieldStats(explanation, fieldName,
                 fieldValues.stream().flatMap(LogStructureUtils::flatten).collect(Collectors.toList()));
         }
 
-        return guessScalarMapping(explanation, fieldName, fieldValues.stream().map(Object::toString).collect(Collectors.toList()));
+        Collection<String> fieldValuesAsStrings = fieldValues.stream().map(Object::toString).collect(Collectors.toList());
+        return new Tuple<>(guessScalarMapping(explanation, fieldName, fieldValuesAsStrings), calculateFieldStats(fieldValuesAsStrings));
     }
 
     private static Stream<Object> flatten(Object value) {
@@ -227,6 +243,18 @@ public final class LogStructureUtils {
         return Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword");
     }
 
+    /**
+     * Compute summary statistics for the values of a single field.
+     * @param fieldValues The string values of the field to be summarised.
+     * @return The stats for the supplied values, requesting {@link #NUM_TOP_HITS} top hits.
+     */
+    static FieldStats calculateFieldStats(Collection<String> fieldValues) {
+        FieldStatsCalculator statsCalculator = new FieldStatsCalculator();
+        statsCalculator.accept(fieldValues);
+        return statsCalculator.calculate(NUM_TOP_HITS);
+    }
+
     /**
      * The thinking is that the longer the field value and the more spaces it contains,
      * the more likely it is that it should be indexed as text rather than keyword.

+ 4 - 1
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/TextLogStructureFinder.java

@@ -82,10 +82,12 @@ public class TextLogStructureFinder implements LogStructureFinder {
         mappings.put("message", Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text"));
         mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));
 
+        SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
+
         // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove
         String interimTimestampField;
         String grokPattern;
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats);
         Tuple<String, String> timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern();
         if (timestampFieldAndFullMatchGrokPattern != null) {
             interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1();
@@ -101,6 +103,7 @@ public class TextLogStructureFinder implements LogStructureFinder {
             .setNeedClientTimezone(bestTimestamp.v1().hasTimezoneDependentParsing())
             .setGrokPattern(grokPattern)
             .setMappings(mappings)
+            .setFieldStats(fieldStats)
             .setExplanation(explanation)
             .build();
 

+ 8 - 1
x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/XmlLogStructureFinder.java

@@ -95,7 +95,14 @@ public class XmlLogStructureFinder implements LogStructureFinder {
                 .setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
         }
 
-        SortedMap<String, Object> innerMappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
+        Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
+            LogStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
+
+        if (mappingsAndFieldStats.v2() != null) {
+            structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
+        }
+
+        SortedMap<String, Object> innerMappings = mappingsAndFieldStats.v1();
         Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
         secondLevelProperties.put(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
         secondLevelProperties.put(LogStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);

+ 218 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsCalculatorTests.java

@@ -0,0 +1,218 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.DoubleSummaryStatistics;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Tests for {@link FieldStatsCalculator}: the mean, median and top hits calculations
+ * are checked individually, followed by the combined {@link FieldStats} output.
+ */
+public class FieldStatsCalculatorTests extends LogStructureTestCase {
+
+    public void testMean() {
+        FieldStatsCalculator calculator = calculatorFor(Arrays.asList("1", "3.5", "2.5", "9"));
+        assertEquals(4.0, calculator.calculateMean(), 1e-10);
+    }
+
+    public void testMedianGivenOddCount() {
+        FieldStatsCalculator calculator = calculatorFor(Arrays.asList("3", "23", "-1", "5", "1000"));
+        assertEquals(5.0, calculator.calculateMedian(), 1e-10);
+    }
+
+    public void testMedianGivenOddCountMinimal() {
+        FieldStatsCalculator calculator = calculatorFor(Collections.singletonList("3"));
+        assertEquals(3.0, calculator.calculateMedian(), 1e-10);
+    }
+
+    public void testMedianGivenEvenCountMiddleValuesDifferent() {
+        FieldStatsCalculator calculator = calculatorFor(Arrays.asList("3", "23", "-1", "5", "1000", "6"));
+        assertEquals(5.5, calculator.calculateMedian(), 1e-10);
+    }
+
+    public void testMedianGivenEvenCountMiddleValuesSame() {
+        FieldStatsCalculator calculator = calculatorFor(Arrays.asList("3", "23", "-1", "5", "1000", "5"));
+        assertEquals(5.0, calculator.calculateMedian(), 1e-10);
+    }
+
+    public void testMedianGivenEvenCountMinimal() {
+        FieldStatsCalculator calculator = calculatorFor(Arrays.asList("4", "4"));
+        assertEquals(4.0, calculator.calculateMedian(), 1e-10);
+    }
+
+    public void testTopHitsNumeric() {
+        FieldStatsCalculator calculator = calculatorFor(Arrays.asList("4", "4", "7", "4", "6", "5", "6", "5", "16", "4", "5"));
+
+        List<Map<String, Object>> topHits = calculator.findNumericTopHits(3);
+
+        assertEquals(3, topHits.size());
+        assertTopHit(4.0, 4, topHits.get(0));
+        assertTopHit(5.0, 3, topHits.get(1));
+        assertTopHit(6.0, 2, topHits.get(2));
+    }
+
+    public void testTopHitsString() {
+        FieldStatsCalculator calculator = calculatorFor(Arrays.asList("s", "s", "d", "s", "f", "x", "f", "x", "n", "s", "x"));
+
+        List<Map<String, Object>> topHits = calculator.findStringTopHits(3);
+
+        assertEquals(3, topHits.size());
+        assertTopHit("s", 4, topHits.get(0));
+        assertTopHit("x", 3, topHits.get(1));
+        assertTopHit("f", 2, topHits.get(2));
+    }
+
+    public void testCalculateGivenEmpty() {
+        FieldStats stats = calculatorFor(Collections.emptyList()).calculate(3);
+
+        assertEquals(0L, stats.getCount());
+        assertEquals(0, stats.getCardinality());
+        assertNoNumericStats(stats);
+        assertEquals(0, stats.getTopHits().size());
+    }
+
+    public void testCalculateGivenNumericField() {
+        FieldStats stats = calculatorFor(Arrays.asList("4", "4", "7", "4", "6", "5", "6", "5", "16", "4", "5")).calculate(3);
+
+        assertEquals(11L, stats.getCount());
+        assertEquals(5, stats.getCardinality());
+        assertEquals(4.0, stats.getMinValue(), 1e-10);
+        assertEquals(16.0, stats.getMaxValue(), 1e-10);
+        assertEquals(6.0, stats.getMeanValue(), 1e-10);
+        assertEquals(5.0, stats.getMedianValue(), 1e-10);
+
+        List<Map<String, Object>> topHits = stats.getTopHits();
+
+        assertEquals(3, topHits.size());
+        assertTopHit(4.0, 4, topHits.get(0));
+        assertTopHit(5.0, 3, topHits.get(1));
+        assertTopHit(6.0, 2, topHits.get(2));
+    }
+
+    public void testCalculateGivenStringField() {
+        FieldStats stats = calculatorFor(Arrays.asList("s", "s", "d", "s", "f", "x", "f", "x", "n", "s", "x")).calculate(3);
+
+        assertEquals(11L, stats.getCount());
+        assertEquals(5, stats.getCardinality());
+        assertNoNumericStats(stats);
+
+        List<Map<String, Object>> topHits = stats.getTopHits();
+
+        assertEquals(3, topHits.size());
+        assertTopHit("s", 4, topHits.get(0));
+        assertTopHit("x", 3, topHits.get(1));
+        assertTopHit("f", 2, topHits.get(2));
+    }
+
+    public void testCalculateGivenMixedField() {
+        FieldStats stats = calculatorFor(Arrays.asList("4", "4", "d", "4", "f", "x", "f", "x", "16", "4", "x")).calculate(3);
+
+        assertEquals(11L, stats.getCount());
+        assertEquals(5, stats.getCardinality());
+        assertNoNumericStats(stats);
+
+        List<Map<String, Object>> topHits = stats.getTopHits();
+
+        // The numeric-looking value is expected back as a string here, not as a number
+        assertEquals(3, topHits.size());
+        assertTopHit("4", 4, topHits.get(0));
+        assertTopHit("x", 3, topHits.get(1));
+        assertTopHit("f", 2, topHits.get(2));
+    }
+
+    public void testJavaStatsEquivalence() {
+        DoubleSummaryStatistics expectedStats = new DoubleSummaryStatistics();
+        FieldStatsCalculator calculator = new FieldStatsCalculator();
+
+        // Feed the same random values to both implementations, one value at a time
+        int numValues = randomIntBetween(1000, 10000);
+        for (int i = 0; i < numValues; ++i) {
+            double value = randomDouble();
+            expectedStats.accept(value);
+            calculator.accept(Collections.singletonList(Double.toString(value)));
+        }
+
+        FieldStats stats = calculator.calculate(1);
+
+        assertEquals(expectedStats.getCount(), stats.getCount());
+        assertEquals(expectedStats.getMin(), stats.getMinValue(), 1e-10);
+        assertEquals(expectedStats.getMax(), stats.getMaxValue(), 1e-10);
+        assertEquals(expectedStats.getAverage(), stats.getMeanValue(), 1e-10);
+    }
+
+    /** Create a calculator and pass it the given values in a single call. */
+    private static FieldStatsCalculator calculatorFor(List<String> fieldValues) {
+        FieldStatsCalculator calculator = new FieldStatsCalculator();
+        calculator.accept(fieldValues);
+        return calculator;
+    }
+
+    /** Assert that a top hit entry holds the expected "value" and "count". */
+    private static void assertTopHit(Object expectedValue, int expectedCount, Map<String, Object> topHit) {
+        assertEquals(expectedValue, topHit.get("value"));
+        assertEquals(expectedCount, topHit.get("count"));
+    }
+
+    /** Assert that min, max, mean and median are all absent. */
+    private static void assertNoNumericStats(FieldStats stats) {
+        assertNull(stats.getMinValue());
+        assertNull(stats.getMaxValue());
+        assertNull(stats.getMeanValue());
+        assertNull(stats.getMedianValue());
+    }
+}

+ 61 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/FieldStatsTests.java

@@ -0,0 +1,61 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.logstructurefinder;
+
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.test.AbstractXContentTestCase;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Tests for {@link FieldStats} driven by {@link AbstractXContentTestCase}: this class
+ * supplies random instances and the parser used to read them back.
+ */
+public class FieldStatsTests extends AbstractXContentTestCase<FieldStats> {
+
+    @Override
+    protected FieldStats createTestInstance() {
+        return createTestFieldStats();
+    }
+
+    /**
+     * Build a randomised {@link FieldStats}.  Half the time (on average) the instance
+     * looks like a numeric field, with min, max, mean and median set and numeric top hit
+     * values; otherwise those four stats are <code>null</code> and top hit values are strings.
+     * @return A random {@link FieldStats} with at most 10 top hits.
+     */
+    static FieldStats createTestFieldStats() {
+
+        long count = randomIntBetween(1, 100000);
+        int cardinality = randomIntBetween(1, (int) count);
+
+        Double minValue = null;
+        Double maxValue = null;
+        Double meanValue = null;
+        Double medianValue = null;
+        boolean isMetric = randomBoolean();
+        if (isMetric) {
+            // These are independent random values, so they are not mutually consistent (min may exceed max)
+            minValue = randomDouble();
+            maxValue = randomDouble();
+            meanValue = randomDouble();
+            medianValue = randomDouble();
+        }
+
+        List<Map<String, Object>> topHits = new ArrayList<>();
+        for (int i = 0; i < Math.min(10, cardinality); ++i) {
+            // LinkedHashMap keeps "value" before "count" in each top hit
+            Map<String, Object> topHit = new LinkedHashMap<>();
+            if (isMetric) {
+                topHit.put("value", randomDouble());
+            } else {
+                topHit.put("value", randomAlphaOfLength(20));
+            }
+            topHit.put("count", randomIntBetween(1, cardinality));
+            topHits.add(topHit);
+        }
+
+        return new FieldStats(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits);
+    }
+
+    @Override
+    protected FieldStats doParseInstance(XContentParser parser) {
+        return FieldStats.PARSER.apply(parser, null);
+    }
+
+    @Override
+    protected boolean supportsUnknownFields() {
+        return false;
+    }
+}

+ 17 - 17
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/GrokPatternCreatorTests.java

@@ -43,7 +43,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
         Collection<String> prefaces = new ArrayList<>();
         Collection<String> epilogues = new ArrayList<>();
 
-        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
+        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null);
 
         assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "["));
         assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG "));
@@ -60,7 +60,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
         Collection<String> prefaces = new ArrayList<>();
         Collection<String> epilogues = new ArrayList<>();
 
-        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null);
+        candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null);
 
         assertThat(prefaces, containsInAnyOrder("before ", "abc ", ""));
         assertThat(epilogues, containsInAnyOrder(" after", " xyz", ""));
@@ -73,7 +73,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             "junk [2018-01-22T07:33:23] INFO ",
             "[2018-01-21T03:33:23] DEBUG ");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ",
@@ -87,7 +87,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             " (4)",
             " (-5) ");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
@@ -99,7 +99,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             "prior to-3",
             "-4");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         // It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers
@@ -113,7 +113,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             " -123",
             "1f is hex");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
@@ -124,7 +124,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
         Collection<String> snippets = Arrays.asList("<host1.1.p2ps:",
             "<host2.1.p2ps:");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         // We don't want the .1. in the middle to get detected as a hex number
@@ -137,7 +137,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             "abc bob@acme.com xyz",
             "carol@acme.com");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         assertEquals(".*?%{EMAILADDRESS:email}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
@@ -149,7 +149,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             "https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section",
             "download today from https://www.elastic.co/downloads");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         assertEquals(".*?%{URI:uri}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
@@ -161,7 +161,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             "on Windows C:\\Users\\dave",
             "on Linux /home/dave");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         assertEquals(".*? .*? %{PATH:path}", grokPatternCreator.getOverallGrokPatternBuilder().toString());
@@ -174,7 +174,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             "foo=3 bar=c",
             " foo=1 bar=a ");
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0);
 
         assertEquals(".*?\\bfoo=%{USER:foo} .*?\\bbar=%{USER:bar}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString());
@@ -189,7 +189,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
             "Sep  8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53");
 
         Map<String, Object> mappings = new HashMap<>();
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null);
 
         assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " +
                 "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}",
@@ -215,7 +215,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
                 "Invalid chunk ignored.");
 
         Map<String, Object> mappings = new HashMap<>();
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null);
 
         assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*",
             grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", "timestamp"));
@@ -237,7 +237,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
                 "Info\tsshd\tsubsystem request for sftp");
 
         Map<String, Object> mappings = new HashMap<>();
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null);
 
         assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" +
                 "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*",
@@ -271,7 +271,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
                 "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"");
 
         Map<String, Object> mappings = new HashMap<>();
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null);
 
         assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern());
         assertEquals(10, mappings.size());
@@ -300,7 +300,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
                 ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\""
         );
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
 
         assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString());
@@ -317,7 +317,7 @@ public class GrokPatternCreatorTests extends LogStructureTestCase {
                 "was added by 'User1'(id:2) to servergroup 'GAME'(id:9)"
         );
 
-        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null);
+        GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null);
         Collection<String> adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets);
 
         assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString());

+ 8 - 0
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureTests.java

@@ -66,6 +66,14 @@ public class LogStructureTests extends AbstractXContentTestCase<LogStructure> {
         }
         builder.setMappings(mappings);
 
+        //if (randomBoolean()) {
+            Map<String, FieldStats> fieldStats = new TreeMap<>();
+            for (String field : generateRandomStringArray(5, 20, false, false)) {
+                fieldStats.put(field, FieldStatsTests.createTestFieldStats());
+            }
+            builder.setFieldStats(fieldStats);
+        //}
+
         builder.setExplanation(Arrays.asList(generateRandomStringArray(10, 150, false, false)));
 
         return builder.build();

+ 50 - 33
x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureUtilsTests.java

@@ -12,7 +12,9 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
+import java.util.List;
 import java.util.Map;
+import java.util.SortedMap;
 
 import static org.hamcrest.Matchers.contains;
 
@@ -178,96 +180,83 @@ public class LogStructureUtilsTests extends LogStructureTestCase {
     }
 
     // An empty list of sample values is expected to yield a null mapping.
     // The -/+ pair only reroutes the call through this class's private guessMapping helper.
     public void testGuessMappingGivenNothing() {
-        assertNull(LogStructureUtils.guessMapping(explanation, "foo", Collections.emptyList()));
+        assertNull(guessMapping(explanation, "foo", Collections.emptyList()));
     }
 
     // Expects the "keyword" type both for short plain strings and for a field that mixes a
     // timestamp-like value with a string that is not a date.
     public void testGuessMappingGivenKeyword() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
 
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date")));
     }
 
     // Expects the "text" type when the sample values include a multi-word sentence
     // alongside a single-character value.
     public void testGuessMappingGivenText() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "text");
 
-        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
-            Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("a", "the quick brown fox jumped over the lazy dog")));
     }
 
     // Expects the "ip" type when every sample value is a dotted-quad IPv4 address string.
     public void testGuessMappingGivenIp() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "ip");
 
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1")));
     }
 
     // Expects the "double" type for four kinds of sample: numeric strings where at least one has a
     // fractional part, integer strings where one exceeds the long range, Double objects, and
     // strings in exponent notation.
     public void testGuessMappingGivenDouble() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "double");
 
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8")));
         // 12345678901234567890 is too long for long
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("1", "2", "12345678901234567890")));
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(3.14159265359, 0.0, 1e-308)));
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("-1e-1", "-1e308", "1e-308")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("1", "2", "12345678901234567890")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(3.14159265359, 0.0, 1e-308)));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("-1e-1", "-1e308", "1e-308")));
     }
 
     // Expects the "long" type for integer-valued strings (including a negative one) and for
     // Integer objects.
     public void testGuessMappingGivenLong() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
 
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(500, 6, 0)));
     }
 
     // Expects a mapping containing only the "date" type setting when every sample value is a
     // timestamp string of the form yyyy-MM-ddTHH:mm:ssZ.
     public void testGuessMappingGivenDate() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date");
 
-        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
-            Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z")));
     }
 
     // Expects the "boolean" type for the strings "false"/"true" as well as for Boolean objects.
     public void testGuessMappingGivenBoolean() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "boolean");
 
-        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList("false", "true")));
-        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(true, false)));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("false", "true")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(true, false)));
     }
 
     // Covers sample values that are themselves containers: an Integer mixed with a nested List of
     // Integers is expected to map to "long", and a String[] mixed with a String to "keyword".
     // NOTE(review): presumably the nested elements are flattened before the type is guessed - confirm
     // against LogStructureUtils.
     public void testGuessMappingGivenArray() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long");
 
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99))));
 
         expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword");
 
-        assertEquals(expected,
-            LogStructureUtils.guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
+        assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z")));
     }
 
     // Expects the "object" type when every sample value is a Map.
     public void testGuessMappingGivenObject() {
         Map<String, String> expected = Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "object");
 
-        assertEquals(expected, LogStructureUtils.guessMapping(explanation, "foo",
+        assertEquals(expected, guessMapping(explanation, "foo",
             Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2"))));
     }
 
     // A field whose samples mix a Map with a plain String must be rejected: a RuntimeException
     // with the exact message asserted below is expected.
     public void testGuessMappingGivenObjectAndNonObject() {
-        RuntimeException e = expectThrows(RuntimeException.class, () -> LogStructureUtils.guessMapping(explanation,
+        RuntimeException e = expectThrows(RuntimeException.class, () -> guessMapping(explanation,
             "foo", Arrays.asList(Collections.singletonMap("name", "value1"), "value2")));
 
         assertEquals("Field [foo] has both object and non-object values - this is not supported by Elasticsearch", e.getMessage());
     }
 
-    public void testGuessMappings() {
+    public void testGuessMappingsAndCalculateFieldStats() {
         Map<String, Object> sample1 = new LinkedHashMap<>();
         sample1.put("foo", "not a time");
         sample1.put("time", "2018-05-24 17:28:31,735");
@@ -279,7 +268,11 @@ public class LogStructureUtilsTests extends LogStructureTestCase {
         sample2.put("bar", 17);
         sample2.put("nothing", null);
 
-        Map<String, Object> mappings = LogStructureUtils.guessMappings(explanation, Arrays.asList(sample1, sample2));
+        Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
+            LogStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2));
+        assertNotNull(mappingsAndFieldStats);
+
+        Map<String, Object> mappings = mappingsAndFieldStats.v1();
         assertNotNull(mappings);
         assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo"));
         Map<String, String> expectedTimeMapping = new HashMap<>();
@@ -288,5 +281,29 @@ public class LogStructureUtilsTests extends LogStructureTestCase {
         assertEquals(expectedTimeMapping, mappings.get("time"));
         assertEquals(Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar"));
         assertNull(mappings.get("nothing"));
+
+        Map<String, FieldStats> fieldStats = mappingsAndFieldStats.v2();
+        assertNotNull(fieldStats);
+        assertEquals(3, fieldStats.size());
+        assertEquals(new FieldStats(2, 2, makeTopHits("not a time", 1, "whatever", 1)), fieldStats.get("foo"));
+        assertEquals(new FieldStats(2, 2, makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1)), fieldStats.get("time"));
+        assertEquals(new FieldStats(2, 2, 17.0, 42.0, 29.5, 29.5, makeTopHits(17.0, 1, 42.0, 1)), fieldStats.get("bar"));
+        assertNull(fieldStats.get("nothing"));
+    }
+
+    private Map<String, String> guessMapping(List<String> explanation, String fieldName, List<Object> fieldValues) {
         // Test-only adapter: LogStructureUtils.guessMappingAndCalculateFieldStats returns a
         // (mapping, field stats) tuple, but the single-field tests in this class assert on the
         // mapping alone, so only v1() is handed back.
+        Tuple<Map<String, String>, FieldStats> mappingAndFieldStats =
+            LogStructureUtils.guessMappingAndCalculateFieldStats(explanation, fieldName, fieldValues);
         // A null tuple is passed through as a null mapping rather than dereferenced.
+        return (mappingAndFieldStats == null) ? null : mappingAndFieldStats.v1();
+    }
+
+    private List<Map<String, Object>> makeTopHits(Object value1, int count1, Object value2, int count2) {
         // Builds the two-entry top-hits list used as the expected value in FieldStats assertions.
         // Each hit is a map holding "value" then "count"; LinkedHashMap keeps that insertion order.
+        Map<String, Object> topHit1 = new LinkedHashMap<>();
+        topHit1.put("value", value1);
+        topHit1.put("count", count1);
+        Map<String, Object> topHit2 = new LinkedHashMap<>();
+        topHit2.put("value", value2);
+        topHit2.put("count", count2);
         // Hits are returned in argument order: (value1, count1) first, then (value2, count2).
+        return Arrays.asList(topHit1, topHit2);
     }
 }