|
@@ -37,7 +37,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
String sample = "time,message\n" +
|
|
|
"2018-05-17T13:41:23,hello\n" +
|
|
|
"2018-05-17T13:41:32,hello again\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -65,6 +65,99 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
|
|
|
}
|
|
|
|
|
|
+ public void testCreateConfigsGivenIncompleteCsv() throws Exception {
|
|
|
+ String sample = "time,message\n" +
|
|
|
+ "2018-05-17T13:41:23,hello\n" +
|
|
|
+ "badrow\n" + // REALLY bad row
|
|
|
+ "2018-05-17T13:41:25,hello\n" +
|
|
|
+ "2018-05-17T13:41:26,hello\n" +
|
|
|
+ "2018-05-17T13:41:27,hello\n" +
|
|
|
+ "2018-05-17T13:41:28,hello\n" +
|
|
|
+ "2018-05-17T13:41:29,hello\n" +
|
|
|
+ "2018-05-17T13:41:30,hello\n" +
|
|
|
+ "2018-05-17T13:41:31,hello\n" +
|
|
|
+ "2018-05-17T13:41:32,hello\n" +
|
|
|
+ "2018-05-17T13:41:35\n" + // Just missing the column
|
|
|
+ "2018-05-17T13:41:33,hello again\n";
|
|
|
+ assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05));
|
|
|
+ assertTrue("assertion failed. Explanation " + explanation,
|
|
|
+ csvFactory.canCreateFromSample(explanation, sample, 0.10));
|
|
|
+
|
|
|
+ String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
+ Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
|
+ FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
|
|
|
+ FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER);
|
|
|
+
|
|
|
+
|
|
|
+ FileStructure structure = structureFinder.getStructure();
|
|
|
+
|
|
|
+ assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
|
|
|
+ assertEquals(charset, structure.getCharset());
|
|
|
+ if (hasByteOrderMarker == null) {
|
|
|
+ assertNull(structure.getHasByteOrderMarker());
|
|
|
+ } else {
|
|
|
+ assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
|
|
|
+ }
|
|
|
+ assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
|
|
|
+ assertEquals("time", structure.getTimestampField());
|
|
|
+ assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
|
|
|
+ assertEquals(Arrays.asList("time", "message"), structure.getColumnNames());
|
|
|
+ assertEquals(Character.valueOf(','), structure.getDelimiter());
|
|
|
+ assertEquals(Character.valueOf('"'), structure.getQuote());
|
|
|
+ assertEquals(structure.getNumMessagesAnalyzed(), 10);
|
|
|
+ assertTrue(structure.getHasHeaderRow());
|
|
|
+ assertNull(structure.getMultilineStartPattern());
|
|
|
+ assertNull(structure.getShouldTrimFields());
|
|
|
+ assertNull(structure.getGrokPattern());
|
|
|
+ }
|
|
|
+
|
|
|
+ public void testCreateConfigsGivenIncompleteCsvWithMultiLinedRows() throws Exception {
|
|
|
+ String sample = "time,message\n" +
|
|
|
+ "2018-05-17T13:41:23,\"hello\nnew line\"\n" +
|
|
|
+ "\"badrow\n\n\n\n\"\n" + // REALLY bad row
|
|
|
+ "2018-05-17T13:41:25,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:26,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:27,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:28,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:29,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:30,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:31,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:32,\"hello\nnew line\"\n" +
|
|
|
+ "2018-05-17T13:41:35\n" + // Just missing the column
|
|
|
+ "2018-05-17T13:41:33,\"hello again\nnew line\"\n";
|
|
|
+ assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05));
|
|
|
+ assertTrue("assertion failed. Explanation " + explanation,
|
|
|
+ csvFactory.canCreateFromSample(explanation, sample, 0.10));
|
|
|
+
|
|
|
+ String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
+ Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
|
+ FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker,
|
|
|
+ FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT,
|
|
|
+ FileStructureOverrides.builder().setQuote('"').build(),
|
|
|
+ NOOP_TIMEOUT_CHECKER);
|
|
|
+
|
|
|
+ FileStructure structure = structureFinder.getStructure();
|
|
|
+
|
|
|
+ assertEquals(FileStructure.Format.DELIMITED, structure.getFormat());
|
|
|
+ assertEquals(charset, structure.getCharset());
|
|
|
+ if (hasByteOrderMarker == null) {
|
|
|
+ assertNull(structure.getHasByteOrderMarker());
|
|
|
+ } else {
|
|
|
+ assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker());
|
|
|
+ }
|
|
|
+ assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern());
|
|
|
+ assertEquals("time", structure.getTimestampField());
|
|
|
+ assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats());
|
|
|
+ assertEquals(Arrays.asList("time", "message"), structure.getColumnNames());
|
|
|
+ assertEquals(Character.valueOf(','), structure.getDelimiter());
|
|
|
+ assertEquals(Character.valueOf('"'), structure.getQuote());
|
|
|
+ assertEquals(structure.getNumMessagesAnalyzed(), 10);
|
|
|
+ assertTrue(structure.getHasHeaderRow());
|
|
|
+ assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern());
|
|
|
+ assertNull(structure.getShouldTrimFields());
|
|
|
+ assertNull(structure.getGrokPattern());
|
|
|
+ }
|
|
|
+
|
|
|
public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exception {
|
|
|
|
|
|
FileStructureOverrides overrides = FileStructureOverrides.builder().setColumnNames(Arrays.asList("my_time", "my_message")).build();
|
|
@@ -72,7 +165,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
String sample = "time,message\n" +
|
|
|
"2018-05-17T13:41:23,hello\n" +
|
|
|
"2018-05-17T13:41:32,hello again\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -109,7 +202,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
String sample = "time,message\n" +
|
|
|
"2018-05-17T13:41:23,hello\n" +
|
|
|
"2018-05-17T13:41:32,hello again\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -142,7 +235,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
"2018-05-17T13:41:23,\"hello\n" +
|
|
|
"world\",1\n" +
|
|
|
"2019-01-18T14:46:57,\"hello again\n"; // note that this last record is truncated
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -177,7 +270,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
"2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -222,7 +315,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
"2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -262,7 +355,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
"2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -309,7 +402,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
"2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" +
|
|
|
"1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -347,7 +440,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" +
|
|
|
"\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" +
|
|
|
"\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -387,7 +480,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
"25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" +
|
|
|
"25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" +
|
|
|
"25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n";
|
|
|
- assertTrue(tsvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(tsvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|
|
@@ -421,7 +514,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase {
|
|
|
String sample = "time.iso8601,message\n" +
|
|
|
"2018-05-17T13:41:23,hello\n" +
|
|
|
"2018-05-17T13:41:32,hello again\n";
|
|
|
- assertTrue(csvFactory.canCreateFromSample(explanation, sample));
|
|
|
+ assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0));
|
|
|
|
|
|
String charset = randomFrom(POSSIBLE_CHARSETS);
|
|
|
Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset);
|