Selaa lähdekoodia

[ML] Change dots in CSV column names to underscores (#42839)

Dots in column names cause an error in the ingest
pipeline, as dots are special characters in ingest pipelines.
This PR changes dots into underscores in CSV field names
suggested by the ML find_file_structure endpoint _unless_
the field names are specifically overridden.  The reason for
allowing them in overrides is that fields that are not
mentioned in the ingest pipeline can contain dots.  But it's
more consistent that the default behaviour is to replace
them all.

Fixes elastic/kibana#26800
David Roberts 6 vuotta sitten
vanhempi
commit
f196edd9a6

+ 2 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java

@@ -62,12 +62,12 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
             }
             columnNames = overriddenColumnNames.toArray(new String[0]);
         } else {
-            // The column names are the header names but with blanks named column1, column2, etc.
+            // The column names are the header names but with dots replaced with underscores and blanks named column1, column2, etc.
             columnNames = new String[header.length];
             for (int i = 0; i < header.length; ++i) {
                 assert header[i] != null;
                 String rawHeader = trimFields ? header[i].trim() : header[i];
-                columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader;
+                columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader.replace('.', '_');
             }
         }
 

+ 33 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java

@@ -364,6 +364,39 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
         assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss.SSSSSS"), structure.getJodaTimestampFormats());
     }
 
+    public void testCreateConfigsGivenDotInFieldName() throws Exception { // default (no overrides) dot -> underscore renaming
+        String sample = "time.iso8601,message\n" + // header row: the first column name contains a dot
+            "2018-05-17T13:41:23,hello\n" +
+            "2018-05-17T13:41:32,hello again\n";
+        assertTrue(csvFactory.canCreateFromSample(explanation, sample));
+
+        String charset = randomFrom(POSSIBLE_CHARSETS);
+        Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
+        FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
+            FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
+
+        FileStructure structure = structureFinder.getStructure();
+
+        assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
+        assertEquals(charset, structure.getCharset());
+        if (hasByteOrderMarker == null) {
+            assertNull(structure.getHasByteOrderMarker());
+        } else {
+            assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
+        }
+        // The exclude pattern has to match the raw text of the header line, so it keeps the original (regex-escaped) dotted name
+        assertEquals("^\"?time\\.iso8601\"?,\"?message\"?", structure.getExcludeLinesPattern());
+        assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
+        assertEquals(Character.valueOf(','), structure.getDelimiter());
+        assertEquals(Character.valueOf('"'), structure.getQuote());
+        assertTrue(structure.getHasHeaderRow());
+        assertNull(structure.getShouldTrimFields());
+        assertEquals(Arrays.asList("time_iso8601", "message"), structure.getColumnNames()); // dot replaced by underscore
+        assertNull(structure.getGrokPattern());
+        assertEquals("time_iso8601", structure.getTimestampField()); // timestamp field reported under the renamed column
+        assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
+    }
+
     public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException {
         String withHeader = "time,airline,responsetime,sourcetype\n" +
             "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" +