12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955 |
- [role="xpack"]
- [testenv="basic"]
- [[ml-find-file-structure]]
- = Find file structure API
- ++++
- <titleabbrev>Find file structure</titleabbrev>
- ++++
- deprecated::[7.12.0,Replaced by <<find-structure>>]
- Finds the structure of a text file. The text file must contain data that is
- suitable to be ingested into {es}.
- [[ml-find-file-structure-request]]
- == {api-request-title}
- `POST _ml/find_file_structure`
- [[ml-find-file-structure-prereqs]]
- == {api-prereq-title}
- * If the {es} {security-features} are enabled, you must have `monitor_ml` or
- `monitor` cluster privileges to use this API. See
- <<security-privileges>> and
- {ml-docs-setup-privileges}.
- [[ml-find-file-structure-desc]]
- == {api-description-title}
- This API provides a starting point for ingesting data into {es} in a format that
- is suitable for subsequent use with other {ml} functionality.
- Unlike other {es} endpoints, the data that is posted to this endpoint does not
- need to be UTF-8 encoded and in JSON format. It must, however, be text; binary
- file formats are not currently supported.
- The response from the API contains:
- * A couple of messages from the beginning of the file.
- * Statistics that reveal the most common values for all fields detected within
- the file and basic numeric statistics for numeric fields.
- * Information about the structure of the file, which is useful when you write
- ingest configurations to index the file contents.
- * Appropriate mappings for an {es} index, which you could use to ingest the file
- contents.
- All this information can be calculated by the structure finder with no guidance.
- However, you can optionally override some of the decisions about the file
- structure by specifying one or more query parameters.
- Details of the output can be seen in the
- <<ml-find-file-structure-examples,examples>>.
- If the structure finder produces unexpected results for a particular file,
- specify the `explain` query parameter. It causes an `explanation` to appear in
- the response, which should help in determining why the returned structure was
- chosen.
- [[ml-find-file-structure-query-parms]]
- == {api-query-parms-title}
- `charset`::
- (Optional, string) The file's character set. It must be a character set that
- is supported by the JVM that {es} uses. For example, `UTF-8`, `UTF-16LE`,
- `windows-1252`, or `EUC-JP`. If this parameter is not specified, the structure
- finder chooses an appropriate character set.
- `column_names`::
- (Optional, string) If you have set `format` to `delimited`, you can specify
- the column names in a comma-separated list. If this parameter is not specified,
- the structure finder uses the column names from the header row of the file. If
- the file does not have a header row, columns are named "column1", "column2",
- "column3", etc.
- `delimiter`::
- (Optional, string) If you have set `format` to `delimited`, you can specify
- the character used to delimit the values in each row. Only a single character
- is supported; the delimiter cannot have multiple characters. By default, the
- API considers the following possibilities: comma, tab, semi-colon, and pipe
- (`|`). In this default scenario, all rows must have the same number of fields
- for the delimited format to be detected. If you specify a delimiter, up to 10%
- of the rows can have a different number of columns than the first row.
- `explain`::
- (Optional, Boolean) If this parameter is set to `true`, the response includes
- a field named `explanation`, which is an array of strings that indicate how
- the structure finder produced its result. The default value is `false`.
- `format`::
- (Optional, string) The high level structure of the file. Valid values are
- `ndjson`, `xml`, `delimited`, and `semi_structured_text`. By default, the
- API chooses the format. In this default scenario, all rows must
- have the same number of fields for a delimited format to be detected. If the
- `format` is set to `delimited` and the `delimiter` is not set, however, the
- API tolerates up to 5% of rows that have a different number of
- columns than the first row.
- `grok_pattern`::
- (Optional, string) If you have set `format` to `semi_structured_text`, you can
- specify a Grok pattern that is used to extract fields from every message in
- the file. The name of the timestamp field in the Grok pattern must match what
- is specified in the `timestamp_field` parameter. If that parameter is not
- specified, the name of the timestamp field in the Grok pattern must match
- "timestamp". If `grok_pattern` is not specified, the structure finder creates
- a Grok pattern.
- `has_header_row`::
- (Optional, Boolean) If you have set `format` to `delimited`, you can use this
- parameter to indicate whether the column names are in the first row of the
- file. If this parameter is not specified, the structure finder guesses based
- on the similarity of the first row of the file to other rows.
- `line_merge_size_limit`::
- (Optional, unsigned integer) The maximum number of characters in a message
- when lines are merged to form messages while analyzing semi-structured files.
- The default is `10000`. If you have extremely long messages you may need to
- increase this, but be aware that this may lead to very long processing times
- if the way to group lines into messages is misdetected.
- `lines_to_sample`::
- (Optional, unsigned integer) The number of lines to include in the structural
- analysis, starting from the beginning of the file. The minimum is 2; the
- default is `1000`. If the value of this parameter is greater than the number
- of lines in the file, the analysis proceeds (as long as there are at least two
- lines in the file) for all of the lines. +
- +
- --
- NOTE: The number of lines and the variation of the lines affect the speed of
- the analysis. For example, if you upload a log file where the first 1000 lines
- are all variations on the same message, the analysis will find more commonality
- than would be seen with a bigger sample. If possible, however, it is more
- efficient to upload a sample file with more variety in the first 1000 lines than
- to request analysis of 100000 lines to achieve some variety.
- --
- `quote`::
- (Optional, string) If you have set `format` to `delimited`, you can specify
- the character used to quote the values in each row if they contain newlines or
- the delimiter character. Only a single character is supported. If this
- parameter is not specified, the default value is a double quote (`"`). If your
- delimited file format does not use quoting, a workaround is to set this
- argument to a character that does not appear anywhere in the sample.
- `should_trim_fields`::
- (Optional, Boolean) If you have set `format` to `delimited`, you can specify
- whether values between delimiters should have whitespace trimmed from them. If
- this parameter is not specified and the delimiter is pipe (`|`), the default
- value is `true`. Otherwise, the default value is `false`.
- `timeout`::
- (Optional, <<time-units,time units>>) Sets the maximum amount of time that the
- structure analysis may take. If the analysis is still running when the
- timeout expires then it will be aborted. The default value is 25 seconds.
- `timestamp_field`::
- (Optional, string) The name of the field that contains the primary timestamp
- of each record in the file. In particular, if the file were ingested into an
- index, this is the field that would be used to populate the `@timestamp` field.
- +
- --
- If the `format` is `semi_structured_text`, this field must match the name of the
- appropriate extraction in the `grok_pattern`. Therefore, for semi-structured
- file formats, it is best not to specify this parameter unless `grok_pattern` is
- also specified.
- For structured file formats, if you specify this parameter, the field must exist
- within the file.
- If this parameter is not specified, the structure finder makes a decision about
- which field (if any) is the primary timestamp field. For structured file
- formats, it is not compulsory to have a timestamp in the file.
- --
- `timestamp_format`::
- (Optional, string) The Java time format of the timestamp field in the file.
- +
- --
- Only a subset of Java time format letter groups are supported:
- * `a`
- * `d`
- * `dd`
- * `EEE`
- * `EEEE`
- * `H`
- * `HH`
- * `h`
- * `M`
- * `MM`
- * `MMM`
- * `MMMM`
- * `mm`
- * `ss`
- * `XX`
- * `XXX`
- * `yy`
- * `yyyy`
- * `zzz`
- Additionally, `S` letter groups (fractional seconds) of length one to nine are
- supported, provided they occur after `ss` and are separated from the `ss` by a
- `.`, `,` or `:`. Spacing and punctuation are also permitted, with the exception
- of `?`, newline and carriage return, together with literal text enclosed in
- single quotes. For example, `MM/dd HH.mm.ss,SSSSSS 'in' yyyy` is a valid
- override format.
- One valuable use case for this parameter is when the format is semi-structured
- text, there are multiple timestamp formats in the file, and you know which
- format corresponds to the primary timestamp, but you do not want to specify the
- full `grok_pattern`. Another is when the timestamp format is one that the
- structure finder does not consider by default.
- If this parameter is not specified, the structure finder chooses the best
- format from a built-in set.
- The following table provides the appropriate `timestamp_format` values for some example timestamps:
- |===
- | Timeformat | Presentation
- | yyyy-MM-dd HH:mm:ssZ | 2019-04-20 13:15:22+0000
- | EEE, d MMM yyyy HH:mm:ss Z | Sat, 20 Apr 2019 13:15:22 +0000
- | dd.MM.yy HH:mm:ss.SSS | 20.04.19 13:15:22.285
- |===
- See
- https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html[the Java date/time format documentation]
- for more information about date and time format syntax.
- --
- [[ml-find-file-structure-request-body]]
- == {api-request-body-title}
- The text file that you want to analyze. It must contain data that is suitable to
- be ingested into {es}. It does not need to be in JSON format and it does not
- need to be UTF-8 encoded. The size is limited to the {es} HTTP receive buffer
- size, which defaults to 100 MB.
- [[ml-find-file-structure-examples]]
- == {api-examples-title}
- [[ml-find-file-structure-example-nld-json]]
- === Ingesting newline-delimited JSON
- Suppose you have a newline-delimited JSON file that contains information about
- some books. You can send the contents to the `find_file_structure` endpoint:
- [source,console]
- ----
- POST _ml/find_file_structure
- {"name": "Leviathan Wakes", "author": "James S.A. Corey", "release_date": "2011-06-02", "page_count": 561}
- {"name": "Hyperion", "author": "Dan Simmons", "release_date": "1989-05-26", "page_count": 482}
- {"name": "Dune", "author": "Frank Herbert", "release_date": "1965-06-01", "page_count": 604}
- {"name": "Dune Messiah", "author": "Frank Herbert", "release_date": "1969-10-15", "page_count": 331}
- {"name": "Children of Dune", "author": "Frank Herbert", "release_date": "1976-04-21", "page_count": 408}
- {"name": "God Emperor of Dune", "author": "Frank Herbert", "release_date": "1981-05-28", "page_count": 454}
- {"name": "Consider Phlebas", "author": "Iain M. Banks", "release_date": "1987-04-23", "page_count": 471}
- {"name": "Pandora's Star", "author": "Peter F. Hamilton", "release_date": "2004-03-02", "page_count": 768}
- {"name": "Revelation Space", "author": "Alastair Reynolds", "release_date": "2000-03-15", "page_count": 585}
- {"name": "A Fire Upon the Deep", "author": "Vernor Vinge", "release_date": "1992-06-01", "page_count": 613}
- {"name": "Ender's Game", "author": "Orson Scott Card", "release_date": "1985-06-01", "page_count": 324}
- {"name": "1984", "author": "George Orwell", "release_date": "1985-06-01", "page_count": 328}
- {"name": "Fahrenheit 451", "author": "Ray Bradbury", "release_date": "1953-10-15", "page_count": 227}
- {"name": "Brave New World", "author": "Aldous Huxley", "release_date": "1932-06-01", "page_count": 268}
- {"name": "Foundation", "author": "Isaac Asimov", "release_date": "1951-06-01", "page_count": 224}
- {"name": "The Giver", "author": "Lois Lowry", "release_date": "1993-04-26", "page_count": 208}
- {"name": "Slaughterhouse-Five", "author": "Kurt Vonnegut", "release_date": "1969-06-01", "page_count": 275}
- {"name": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "release_date": "1979-10-12", "page_count": 180}
- {"name": "Snow Crash", "author": "Neal Stephenson", "release_date": "1992-06-01", "page_count": 470}
- {"name": "Neuromancer", "author": "William Gibson", "release_date": "1984-07-01", "page_count": 271}
- {"name": "The Handmaid's Tale", "author": "Margaret Atwood", "release_date": "1985-06-01", "page_count": 311}
- {"name": "Starship Troopers", "author": "Robert A. Heinlein", "release_date": "1959-12-01", "page_count": 335}
- {"name": "The Left Hand of Darkness", "author": "Ursula K. Le Guin", "release_date": "1969-06-01", "page_count": 304}
- {"name": "The Moon is a Harsh Mistress", "author": "Robert A. Heinlein", "release_date": "1966-04-01", "page_count": 288}
- ----
- // TEST[warning:[POST /_ml/find_file_structure] is deprecated! Use [POST /_text_structure/find_structure] instead.]
- If the request does not encounter errors, you receive the following result:
- [source,console-result]
- ----
- {
- "num_lines_analyzed" : 24, <1>
- "num_messages_analyzed" : 24, <2>
- "sample_start" : "{\"name\": \"Leviathan Wakes\", \"author\": \"James S.A. Corey\", \"release_date\": \"2011-06-02\", \"page_count\": 561}\n{\"name\": \"Hyperion\", \"author\": \"Dan Simmons\", \"release_date\": \"1989-05-26\", \"page_count\": 482}\n", <3>
- "charset" : "UTF-8", <4>
- "has_byte_order_marker" : false, <5>
- "format" : "ndjson", <6>
- "timestamp_field" : "release_date", <7>
- "joda_timestamp_formats" : [ <8>
- "ISO8601"
- ],
- "java_timestamp_formats" : [ <9>
- "ISO8601"
- ],
- "need_client_timezone" : true, <10>
- "mappings" : { <11>
- "properties" : {
- "@timestamp" : {
- "type" : "date"
- },
- "author" : {
- "type" : "keyword"
- },
- "name" : {
- "type" : "keyword"
- },
- "page_count" : {
- "type" : "long"
- },
- "release_date" : {
- "type" : "date",
- "format" : "iso8601"
- }
- }
- },
- "ingest_pipeline" : {
- "description" : "Ingest pipeline created by file structure finder",
- "processors" : [
- {
- "date" : {
- "field" : "release_date",
- "timezone" : "{{ event.timezone }}",
- "formats" : [
- "ISO8601"
- ]
- }
- }
- ]
- },
- "field_stats" : { <12>
- "author" : {
- "count" : 24,
- "cardinality" : 20,
- "top_hits" : [
- {
- "value" : "Frank Herbert",
- "count" : 4
- },
- {
- "value" : "Robert A. Heinlein",
- "count" : 2
- },
- {
- "value" : "Alastair Reynolds",
- "count" : 1
- },
- {
- "value" : "Aldous Huxley",
- "count" : 1
- },
- {
- "value" : "Dan Simmons",
- "count" : 1
- },
- {
- "value" : "Douglas Adams",
- "count" : 1
- },
- {
- "value" : "George Orwell",
- "count" : 1
- },
- {
- "value" : "Iain M. Banks",
- "count" : 1
- },
- {
- "value" : "Isaac Asimov",
- "count" : 1
- },
- {
- "value" : "James S.A. Corey",
- "count" : 1
- }
- ]
- },
- "name" : {
- "count" : 24,
- "cardinality" : 24,
- "top_hits" : [
- {
- "value" : "1984",
- "count" : 1
- },
- {
- "value" : "A Fire Upon the Deep",
- "count" : 1
- },
- {
- "value" : "Brave New World",
- "count" : 1
- },
- {
- "value" : "Children of Dune",
- "count" : 1
- },
- {
- "value" : "Consider Phlebas",
- "count" : 1
- },
- {
- "value" : "Dune",
- "count" : 1
- },
- {
- "value" : "Dune Messiah",
- "count" : 1
- },
- {
- "value" : "Ender's Game",
- "count" : 1
- },
- {
- "value" : "Fahrenheit 451",
- "count" : 1
- },
- {
- "value" : "Foundation",
- "count" : 1
- }
- ]
- },
- "page_count" : {
- "count" : 24,
- "cardinality" : 24,
- "min_value" : 180,
- "max_value" : 768,
- "mean_value" : 387.0833333333333,
- "median_value" : 329.5,
- "top_hits" : [
- {
- "value" : 180,
- "count" : 1
- },
- {
- "value" : 208,
- "count" : 1
- },
- {
- "value" : 224,
- "count" : 1
- },
- {
- "value" : 227,
- "count" : 1
- },
- {
- "value" : 268,
- "count" : 1
- },
- {
- "value" : 271,
- "count" : 1
- },
- {
- "value" : 275,
- "count" : 1
- },
- {
- "value" : 288,
- "count" : 1
- },
- {
- "value" : 304,
- "count" : 1
- },
- {
- "value" : 311,
- "count" : 1
- }
- ]
- },
- "release_date" : {
- "count" : 24,
- "cardinality" : 20,
- "earliest" : "1932-06-01",
- "latest" : "2011-06-02",
- "top_hits" : [
- {
- "value" : "1985-06-01",
- "count" : 3
- },
- {
- "value" : "1969-06-01",
- "count" : 2
- },
- {
- "value" : "1992-06-01",
- "count" : 2
- },
- {
- "value" : "1932-06-01",
- "count" : 1
- },
- {
- "value" : "1951-06-01",
- "count" : 1
- },
- {
- "value" : "1953-10-15",
- "count" : 1
- },
- {
- "value" : "1959-12-01",
- "count" : 1
- },
- {
- "value" : "1965-06-01",
- "count" : 1
- },
- {
- "value" : "1966-04-01",
- "count" : 1
- },
- {
- "value" : "1969-10-15",
- "count" : 1
- }
- ]
- }
- }
- }
- ----
- // TESTRESPONSE[s/"sample_start" : ".*",/"sample_start" : "$body.sample_start",/]
- // The substitution is because the "file" is pre-processed by the test harness,
- // so the fields may get reordered in the JSON the endpoint sees
- <1> `num_lines_analyzed` indicates how many lines of the file were analyzed.
- <2> `num_messages_analyzed` indicates how many distinct messages the lines contained.
- For NDJSON, this value is the same as `num_lines_analyzed`. For other file
- formats, messages can span several lines.
- <3> `sample_start` reproduces the first two messages in the file verbatim. This
- may help to diagnose parse errors or accidental uploads of the wrong file.
- <4> `charset` indicates the character encoding used to parse the file.
- <5> For UTF character encodings, `has_byte_order_marker` indicates whether the
- file begins with a byte order marker.
- <6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`.
- <7> The `timestamp_field` names the field considered most likely to be the
- primary timestamp of each document.
- <8> `joda_timestamp_formats` are used to tell Logstash how to parse timestamps.
- <9> `java_timestamp_formats` are the Java time formats recognized in the time
- fields. {es} mappings and ingest pipelines use this format.
- <10> If a timestamp format is detected that does not include a timezone,
- `need_client_timezone` will be `true`. The server that parses the file must
- therefore be told the correct timezone by the client.
- <11> `mappings` contains some suitable mappings for an index into which the data
- could be ingested. In this case, the `release_date` field has been given a
- `date` type with the `iso8601` format, because its values were recognized as
- ISO 8601 dates.
- <12> `field_stats` contains the most common values of each field, plus basic
- numeric statistics for the numeric `page_count` field. This information
- may provide clues that the data needs to be cleaned or transformed prior
- to use by other {ml} functionality.
- [[ml-find-file-structure-example-nyc]]
- === Finding the structure of NYC yellow cab example data
- The next example shows how it's possible to find the structure of some New York
- City yellow cab trip data. The first `curl` command downloads the data, the
- first 20000 lines of which are then piped into the `find_file_structure`
- endpoint. The `lines_to_sample` query parameter of the endpoint is set to 20000
- to match what is specified in the `head` command.
- [source,js]
- ----
- curl -s "s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-06.csv" | head -20000 | curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_ml/find_file_structure?pretty&lines_to_sample=20000" -T -
- ----
- // NOTCONSOLE
- // Not converting to console because this shows how curl can be used
- --
- NOTE: The `Content-Type: application/json` header must be set even though in
- this case the data is not JSON. (Alternatively the `Content-Type` can be set
- to any other supported by {es}, but it must be set.)
- --
- If the request does not encounter errors, you receive the following result:
- [source,js]
- ----
- {
- "num_lines_analyzed" : 20000,
- "num_messages_analyzed" : 19998, <1>
- "sample_start" : "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount\n\n1,2018-06-01 00:15:40,2018-06-01 00:16:46,1,.00,1,N,145,145,2,3,0.5,0.5,0,0,0.3,4.3\n",
- "charset" : "UTF-8",
- "has_byte_order_marker" : false,
- "format" : "delimited", <2>
- "multiline_start_pattern" : "^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}",
- "exclude_lines_pattern" : "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?,\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?,\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?",
- "column_names" : [ <3>
- "VendorID",
- "tpep_pickup_datetime",
- "tpep_dropoff_datetime",
- "passenger_count",
- "trip_distance",
- "RatecodeID",
- "store_and_fwd_flag",
- "PULocationID",
- "DOLocationID",
- "payment_type",
- "fare_amount",
- "extra",
- "mta_tax",
- "tip_amount",
- "tolls_amount",
- "improvement_surcharge",
- "total_amount"
- ],
- "has_header_row" : true, <4>
- "delimiter" : ",", <5>
- "quote" : "\"", <6>
- "timestamp_field" : "tpep_pickup_datetime", <7>
- "joda_timestamp_formats" : [ <8>
- "YYYY-MM-dd HH:mm:ss"
- ],
- "java_timestamp_formats" : [ <9>
- "yyyy-MM-dd HH:mm:ss"
- ],
- "need_client_timezone" : true, <10>
- "mappings" : {
- "properties" : {
- "@timestamp" : {
- "type" : "date"
- },
- "DOLocationID" : {
- "type" : "long"
- },
- "PULocationID" : {
- "type" : "long"
- },
- "RatecodeID" : {
- "type" : "long"
- },
- "VendorID" : {
- "type" : "long"
- },
- "extra" : {
- "type" : "double"
- },
- "fare_amount" : {
- "type" : "double"
- },
- "improvement_surcharge" : {
- "type" : "double"
- },
- "mta_tax" : {
- "type" : "double"
- },
- "passenger_count" : {
- "type" : "long"
- },
- "payment_type" : {
- "type" : "long"
- },
- "store_and_fwd_flag" : {
- "type" : "keyword"
- },
- "tip_amount" : {
- "type" : "double"
- },
- "tolls_amount" : {
- "type" : "double"
- },
- "total_amount" : {
- "type" : "double"
- },
- "tpep_dropoff_datetime" : {
- "type" : "date",
- "format" : "yyyy-MM-dd HH:mm:ss"
- },
- "tpep_pickup_datetime" : {
- "type" : "date",
- "format" : "yyyy-MM-dd HH:mm:ss"
- },
- "trip_distance" : {
- "type" : "double"
- }
- }
- },
- "ingest_pipeline" : {
- "description" : "Ingest pipeline created by file structure finder",
- "processors" : [
- {
- "csv" : {
- "field" : "message",
- "target_fields" : [
- "VendorID",
- "tpep_pickup_datetime",
- "tpep_dropoff_datetime",
- "passenger_count",
- "trip_distance",
- "RatecodeID",
- "store_and_fwd_flag",
- "PULocationID",
- "DOLocationID",
- "payment_type",
- "fare_amount",
- "extra",
- "mta_tax",
- "tip_amount",
- "tolls_amount",
- "improvement_surcharge",
- "total_amount"
- ]
- }
- },
- {
- "date" : {
- "field" : "tpep_pickup_datetime",
- "timezone" : "{{ event.timezone }}",
- "formats" : [
- "yyyy-MM-dd HH:mm:ss"
- ]
- }
- },
- {
- "convert" : {
- "field" : "DOLocationID",
- "type" : "long"
- }
- },
- {
- "convert" : {
- "field" : "PULocationID",
- "type" : "long"
- }
- },
- {
- "convert" : {
- "field" : "RatecodeID",
- "type" : "long"
- }
- },
- {
- "convert" : {
- "field" : "VendorID",
- "type" : "long"
- }
- },
- {
- "convert" : {
- "field" : "extra",
- "type" : "double"
- }
- },
- {
- "convert" : {
- "field" : "fare_amount",
- "type" : "double"
- }
- },
- {
- "convert" : {
- "field" : "improvement_surcharge",
- "type" : "double"
- }
- },
- {
- "convert" : {
- "field" : "mta_tax",
- "type" : "double"
- }
- },
- {
- "convert" : {
- "field" : "passenger_count",
- "type" : "long"
- }
- },
- {
- "convert" : {
- "field" : "payment_type",
- "type" : "long"
- }
- },
- {
- "convert" : {
- "field" : "tip_amount",
- "type" : "double"
- }
- },
- {
- "convert" : {
- "field" : "tolls_amount",
- "type" : "double"
- }
- },
- {
- "convert" : {
- "field" : "total_amount",
- "type" : "double"
- }
- },
- {
- "convert" : {
- "field" : "trip_distance",
- "type" : "double"
- }
- },
- {
- "remove" : {
- "field" : "message"
- }
- }
- ]
- },
- "field_stats" : {
- "DOLocationID" : {
- "count" : 19998,
- "cardinality" : 240,
- "min_value" : 1,
- "max_value" : 265,
- "mean_value" : 150.26532653265312,
- "median_value" : 148,
- "top_hits" : [
- {
- "value" : 79,
- "count" : 760
- },
- {
- "value" : 48,
- "count" : 683
- },
- {
- "value" : 68,
- "count" : 529
- },
- {
- "value" : 170,
- "count" : 506
- },
- {
- "value" : 107,
- "count" : 468
- },
- {
- "value" : 249,
- "count" : 457
- },
- {
- "value" : 230,
- "count" : 441
- },
- {
- "value" : 186,
- "count" : 432
- },
- {
- "value" : 141,
- "count" : 409
- },
- {
- "value" : 263,
- "count" : 386
- }
- ]
- },
- "PULocationID" : {
- "count" : 19998,
- "cardinality" : 154,
- "min_value" : 1,
- "max_value" : 265,
- "mean_value" : 153.4042404240424,
- "median_value" : 148,
- "top_hits" : [
- {
- "value" : 79,
- "count" : 1067
- },
- {
- "value" : 230,
- "count" : 949
- },
- {
- "value" : 148,
- "count" : 940
- },
- {
- "value" : 132,
- "count" : 897
- },
- {
- "value" : 48,
- "count" : 853
- },
- {
- "value" : 161,
- "count" : 820
- },
- {
- "value" : 234,
- "count" : 750
- },
- {
- "value" : 249,
- "count" : 722
- },
- {
- "value" : 164,
- "count" : 663
- },
- {
- "value" : 114,
- "count" : 646
- }
- ]
- },
- "RatecodeID" : {
- "count" : 19998,
- "cardinality" : 5,
- "min_value" : 1,
- "max_value" : 5,
- "mean_value" : 1.0656565656565653,
- "median_value" : 1,
- "top_hits" : [
- {
- "value" : 1,
- "count" : 19311
- },
- {
- "value" : 2,
- "count" : 468
- },
- {
- "value" : 5,
- "count" : 195
- },
- {
- "value" : 4,
- "count" : 17
- },
- {
- "value" : 3,
- "count" : 7
- }
- ]
- },
- "VendorID" : {
- "count" : 19998,
- "cardinality" : 2,
- "min_value" : 1,
- "max_value" : 2,
- "mean_value" : 1.59005900590059,
- "median_value" : 2,
- "top_hits" : [
- {
- "value" : 2,
- "count" : 11800
- },
- {
- "value" : 1,
- "count" : 8198
- }
- ]
- },
- "extra" : {
- "count" : 19998,
- "cardinality" : 3,
- "min_value" : -0.5,
- "max_value" : 0.5,
- "mean_value" : 0.4815981598159816,
- "median_value" : 0.5,
- "top_hits" : [
- {
- "value" : 0.5,
- "count" : 19281
- },
- {
- "value" : 0,
- "count" : 698
- },
- {
- "value" : -0.5,
- "count" : 19
- }
- ]
- },
- "fare_amount" : {
- "count" : 19998,
- "cardinality" : 208,
- "min_value" : -100,
- "max_value" : 300,
- "mean_value" : 13.937719771977209,
- "median_value" : 9.5,
- "top_hits" : [
- {
- "value" : 6,
- "count" : 1004
- },
- {
- "value" : 6.5,
- "count" : 935
- },
- {
- "value" : 5.5,
- "count" : 909
- },
- {
- "value" : 7,
- "count" : 903
- },
- {
- "value" : 5,
- "count" : 889
- },
- {
- "value" : 7.5,
- "count" : 854
- },
- {
- "value" : 4.5,
- "count" : 802
- },
- {
- "value" : 8.5,
- "count" : 790
- },
- {
- "value" : 8,
- "count" : 789
- },
- {
- "value" : 9,
- "count" : 711
- }
- ]
- },
- "improvement_surcharge" : {
- "count" : 19998,
- "cardinality" : 3,
- "min_value" : -0.3,
- "max_value" : 0.3,
- "mean_value" : 0.29915991599159913,
- "median_value" : 0.3,
- "top_hits" : [
- {
- "value" : 0.3,
- "count" : 19964
- },
- {
- "value" : -0.3,
- "count" : 22
- },
- {
- "value" : 0,
- "count" : 12
- }
- ]
- },
- "mta_tax" : {
- "count" : 19998,
- "cardinality" : 3,
- "min_value" : -0.5,
- "max_value" : 0.5,
- "mean_value" : 0.4962246224622462,
- "median_value" : 0.5,
- "top_hits" : [
- {
- "value" : 0.5,
- "count" : 19868
- },
- {
- "value" : 0,
- "count" : 109
- },
- {
- "value" : -0.5,
- "count" : 21
- }
- ]
- },
- "passenger_count" : {
- "count" : 19998,
- "cardinality" : 7,
- "min_value" : 0,
- "max_value" : 6,
- "mean_value" : 1.6201620162016201,
- "median_value" : 1,
- "top_hits" : [
- {
- "value" : 1,
- "count" : 14219
- },
- {
- "value" : 2,
- "count" : 2886
- },
- {
- "value" : 5,
- "count" : 1047
- },
- {
- "value" : 3,
- "count" : 804
- },
- {
- "value" : 6,
- "count" : 523
- },
- {
- "value" : 4,
- "count" : 406
- },
- {
- "value" : 0,
- "count" : 113
- }
- ]
- },
- "payment_type" : {
- "count" : 19998,
- "cardinality" : 4,
- "min_value" : 1,
- "max_value" : 4,
- "mean_value" : 1.315631563156316,
- "median_value" : 1,
- "top_hits" : [
- {
- "value" : 1,
- "count" : 13936
- },
- {
- "value" : 2,
- "count" : 5857
- },
- {
- "value" : 3,
- "count" : 160
- },
- {
- "value" : 4,
- "count" : 45
- }
- ]
- },
- "store_and_fwd_flag" : {
- "count" : 19998,
- "cardinality" : 2,
- "top_hits" : [
- {
- "value" : "N",
- "count" : 19910
- },
- {
- "value" : "Y",
- "count" : 88
- }
- ]
- },
- "tip_amount" : {
- "count" : 19998,
- "cardinality" : 717,
- "min_value" : 0,
- "max_value" : 128,
- "mean_value" : 2.010959095909593,
- "median_value" : 1.45,
- "top_hits" : [
- {
- "value" : 0,
- "count" : 6917
- },
- {
- "value" : 1,
- "count" : 1178
- },
- {
- "value" : 2,
- "count" : 624
- },
- {
- "value" : 3,
- "count" : 248
- },
- {
- "value" : 1.56,
- "count" : 206
- },
- {
- "value" : 1.46,
- "count" : 205
- },
- {
- "value" : 1.76,
- "count" : 196
- },
- {
- "value" : 1.45,
- "count" : 195
- },
- {
- "value" : 1.36,
- "count" : 191
- },
- {
- "value" : 1.5,
- "count" : 187
- }
- ]
- },
- "tolls_amount" : {
- "count" : 19998,
- "cardinality" : 26,
- "min_value" : 0,
- "max_value" : 35,
- "mean_value" : 0.2729697969796978,
- "median_value" : 0,
- "top_hits" : [
- {
- "value" : 0,
- "count" : 19107
- },
- {
- "value" : 5.76,
- "count" : 791
- },
- {
- "value" : 10.5,
- "count" : 36
- },
- {
- "value" : 2.64,
- "count" : 21
- },
- {
- "value" : 11.52,
- "count" : 8
- },
- {
- "value" : 5.54,
- "count" : 4
- },
- {
- "value" : 8.5,
- "count" : 4
- },
- {
- "value" : 17.28,
- "count" : 4
- },
- {
- "value" : 2,
- "count" : 2
- },
- {
- "value" : 2.16,
- "count" : 2
- }
- ]
- },
- "total_amount" : {
- "count" : 19998,
- "cardinality" : 1267,
- "min_value" : -100.3,
- "max_value" : 389.12,
- "mean_value" : 17.499898989898995,
- "median_value" : 12.35,
- "top_hits" : [
- {
- "value" : 7.3,
- "count" : 478
- },
- {
- "value" : 8.3,
- "count" : 443
- },
- {
- "value" : 8.8,
- "count" : 420
- },
- {
- "value" : 6.8,
- "count" : 406
- },
- {
- "value" : 7.8,
- "count" : 405
- },
- {
- "value" : 6.3,
- "count" : 371
- },
- {
- "value" : 9.8,
- "count" : 368
- },
- {
- "value" : 5.8,
- "count" : 362
- },
- {
- "value" : 9.3,
- "count" : 332
- },
- {
- "value" : 10.3,
- "count" : 332
- }
- ]
- },
- "tpep_dropoff_datetime" : {
- "count" : 19998,
- "cardinality" : 9066,
- "earliest" : "2018-05-31 06:18:15",
- "latest" : "2018-06-02 02:25:44",
- "top_hits" : [
- {
- "value" : "2018-06-01 01:12:12",
- "count" : 10
- },
- {
- "value" : "2018-06-01 00:32:15",
- "count" : 9
- },
- {
- "value" : "2018-06-01 00:44:27",
- "count" : 9
- },
- {
- "value" : "2018-06-01 00:46:42",
- "count" : 9
- },
- {
- "value" : "2018-06-01 01:03:22",
- "count" : 9
- },
- {
- "value" : "2018-06-01 01:05:13",
- "count" : 9
- },
- {
- "value" : "2018-06-01 00:11:20",
- "count" : 8
- },
- {
- "value" : "2018-06-01 00:16:03",
- "count" : 8
- },
- {
- "value" : "2018-06-01 00:19:47",
- "count" : 8
- },
- {
- "value" : "2018-06-01 00:25:17",
- "count" : 8
- }
- ]
- },
- "tpep_pickup_datetime" : {
- "count" : 19998,
- "cardinality" : 8760,
- "earliest" : "2018-05-31 06:08:31",
- "latest" : "2018-06-02 01:21:21",
- "top_hits" : [
- {
- "value" : "2018-06-01 00:01:23",
- "count" : 12
- },
- {
- "value" : "2018-06-01 00:04:31",
- "count" : 10
- },
- {
- "value" : "2018-06-01 00:05:38",
- "count" : 10
- },
- {
- "value" : "2018-06-01 00:09:50",
- "count" : 10
- },
- {
- "value" : "2018-06-01 00:12:01",
- "count" : 10
- },
- {
- "value" : "2018-06-01 00:14:17",
- "count" : 10
- },
- {
- "value" : "2018-06-01 00:00:34",
- "count" : 9
- },
- {
- "value" : "2018-06-01 00:00:40",
- "count" : 9
- },
- {
- "value" : "2018-06-01 00:02:53",
- "count" : 9
- },
- {
- "value" : "2018-06-01 00:05:40",
- "count" : 9
- }
- ]
- },
- "trip_distance" : {
- "count" : 19998,
- "cardinality" : 1687,
- "min_value" : 0,
- "max_value" : 64.63,
- "mean_value" : 3.6521062106210715,
- "median_value" : 2.16,
- "top_hits" : [
- {
- "value" : 0.9,
- "count" : 335
- },
- {
- "value" : 0.8,
- "count" : 320
- },
- {
- "value" : 1.1,
- "count" : 316
- },
- {
- "value" : 0.7,
- "count" : 304
- },
- {
- "value" : 1.2,
- "count" : 303
- },
- {
- "value" : 1,
- "count" : 296
- },
- {
- "value" : 1.3,
- "count" : 280
- },
- {
- "value" : 1.5,
- "count" : 268
- },
- {
- "value" : 1.6,
- "count" : 268
- },
- {
- "value" : 0.6,
- "count" : 256
- }
- ]
- }
- }
- }
- ----
- // NOTCONSOLE
- <1> `num_messages_analyzed` is 2 lower than `num_lines_analyzed` because only
- data records count as messages. The first line contains the column names
- and in this sample the second line is blank.
- <2> Unlike the first example, in this case the `format` has been identified as
- `delimited`.
- <3> Because the `format` is `delimited`, the `column_names` field in the output
- lists the column names in the order they appear in the sample.
- <4> `has_header_row` indicates that for this sample the column names were in
- the first row of the sample. (If they hadn't been then it would have been
- a good idea to specify them in the `column_names` query parameter.)
- <5> The `delimiter` for this sample is a comma, as it's a CSV file.
- <6> The `quote` character is the default double quote. (The structure finder
- does not attempt to deduce any other quote character, so if you have a
- delimited file that's quoted with some other character you must specify it
- using the `quote` query parameter.)
- <7> The `timestamp_field` has been chosen to be `tpep_pickup_datetime`.
- `tpep_dropoff_datetime` would work just as well, but `tpep_pickup_datetime`
- was chosen because it comes first in the column order. If you prefer
- `tpep_dropoff_datetime` then force it to be chosen using the
- `timestamp_field` query parameter.
- <8> `joda_timestamp_formats` are used to tell Logstash how to parse timestamps.
- <9> `java_timestamp_formats` are the Java time formats recognized in the time
- fields. Elasticsearch mappings and ingest pipelines use this format.
- <10> The timestamp format in this sample doesn't specify a timezone, so to
- accurately convert the timestamps to UTC for storage in Elasticsearch it's
- necessary to supply the timezone they relate to. `need_client_timezone`
- will be `false` for timestamp formats that include the timezone.
- [[ml-find-file-structure-example-timeout]]
- === Setting the timeout parameter
- If you try to analyze a lot of data then the analysis will take a long time.
- If you want to limit the amount of processing your {es} cluster performs for
- a request, use the `timeout` query parameter. The analysis will be aborted and
- an error returned when the timeout expires. For example, you can replace 20000
- lines in the previous example with 200000 and set a 1 second timeout on the
- analysis:
- [source,js]
- ----
- curl -s "s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-06.csv" | head -200000 | curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_ml/find_file_structure?pretty&lines_to_sample=200000&timeout=1s" -T -
- ----
- // NOTCONSOLE
- // Not converting to console because this shows how curl can be used
- Unless you are using an incredibly fast computer you'll receive a timeout error:
- [source,js]
- ----
- {
- "error" : {
- "root_cause" : [
- {
- "type" : "timeout_exception",
- "reason" : "Aborting structure analysis during [delimited record parsing] as it has taken longer than the timeout of [1s]"
- }
- ],
- "type" : "timeout_exception",
- "reason" : "Aborting structure analysis during [delimited record parsing] as it has taken longer than the timeout of [1s]"
- },
- "status" : 500
- }
- ----
- // NOTCONSOLE
- --
- NOTE: If you try the example above yourself you will note that the overall
- running time of the `curl` commands is considerably longer than 1 second. This
- is because it takes a while to download 200000 lines of CSV from the internet,
- and the timeout is measured from the time this endpoint starts to process the
- data.
- --
- [[ml-find-file-structure-example-eslog]]
- === Analyzing {es} log files
- This is an example of analyzing {es}'s own log file:
- [source,js]
- ----
- curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_ml/find_file_structure?pretty" -T "$ES_HOME/logs/elasticsearch.log"
- ----
- // NOTCONSOLE
- // Not converting to console because this shows how curl can be used
- If the request does not encounter errors, the result will look something like
- this:
- [source,js]
- ----
- {
- "num_lines_analyzed" : 53,
- "num_messages_analyzed" : 53,
- "sample_start" : "[2018-09-27T14:39:28,518][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], net usable_space [165.4gb], net total_space [464.7gb], types [hfs]\n[2018-09-27T14:39:28,521][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], compressed ordinary object pointers [true]\n",
- "charset" : "UTF-8",
- "has_byte_order_marker" : false,
- "format" : "semi_structured_text", <1>
- "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", <2>
- "grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel}.*", <3>
- "timestamp_field" : "timestamp",
- "joda_timestamp_formats" : [
- "ISO8601"
- ],
- "java_timestamp_formats" : [
- "ISO8601"
- ],
- "need_client_timezone" : true,
- "mappings" : {
- "properties" : {
- "@timestamp" : {
- "type" : "date"
- },
- "loglevel" : {
- "type" : "keyword"
- },
- "message" : {
- "type" : "text"
- }
- }
- },
- "ingest_pipeline" : {
- "description" : "Ingest pipeline created by file structure finder",
- "processors" : [
- {
- "grok" : {
- "field" : "message",
- "patterns" : [
- "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel}.*"
- ]
- }
- },
- {
- "date" : {
- "field" : "timestamp",
- "timezone" : "{{ event.timezone }}",
- "formats" : [
- "ISO8601"
- ]
- }
- },
- {
- "remove" : {
- "field" : "timestamp"
- }
- }
- ]
- },
- "field_stats" : {
- "loglevel" : {
- "count" : 53,
- "cardinality" : 3,
- "top_hits" : [
- {
- "value" : "INFO",
- "count" : 51
- },
- {
- "value" : "DEBUG",
- "count" : 1
- },
- {
- "value" : "WARN",
- "count" : 1
- }
- ]
- },
- "timestamp" : {
- "count" : 53,
- "cardinality" : 28,
- "earliest" : "2018-09-27T14:39:28,518",
- "latest" : "2018-09-27T14:39:37,012",
- "top_hits" : [
- {
- "value" : "2018-09-27T14:39:29,859",
- "count" : 10
- },
- {
- "value" : "2018-09-27T14:39:29,860",
- "count" : 9
- },
- {
- "value" : "2018-09-27T14:39:29,858",
- "count" : 6
- },
- {
- "value" : "2018-09-27T14:39:28,523",
- "count" : 3
- },
- {
- "value" : "2018-09-27T14:39:34,234",
- "count" : 2
- },
- {
- "value" : "2018-09-27T14:39:28,518",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:28,521",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:28,522",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:29,861",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:32,786",
- "count" : 1
- }
- ]
- }
- }
- }
- ----
- // NOTCONSOLE
- <1> This time the `format` has been identified as `semi_structured_text`.
- <2> The `multiline_start_pattern` is set on the basis that the timestamp appears
- in the first line of each multi-line log message.
- <3> A very simple `grok_pattern` has been created, which extracts the timestamp
- and recognizable fields that appear in every analyzed message. In this case
- the only field that was recognized beyond the timestamp was the log level.
- [[ml-find-file-structure-example-grok]]
- === Specifying `grok_pattern` as a query parameter
- If you recognize more fields than the simple `grok_pattern` produced by the
- structure finder unaided, you can resubmit the request specifying a more
- advanced `grok_pattern` as a query parameter. The structure finder will then
- calculate `field_stats` for your additional fields.
- In the case of the {es} log a more complete Grok pattern is
- `\[%{TIMESTAMP_ISO8601:timestamp}\]\[%{LOGLEVEL:loglevel} *\]\[%{JAVACLASS:class} *\] \[%{HOSTNAME:node}\] %{JAVALOGMESSAGE:message}`.
- You can analyze the same log file again, submitting this `grok_pattern` as a
- query parameter (appropriately URL escaped):
- [source,js]
- ----
- curl -s -H "Content-Type: application/json" -XPOST "localhost:9200/_ml/find_file_structure?pretty&format=semi_structured_text&grok_pattern=%5C%5B%25%7BTIMESTAMP_ISO8601:timestamp%7D%5C%5D%5C%5B%25%7BLOGLEVEL:loglevel%7D%20*%5C%5D%5C%5B%25%7BJAVACLASS:class%7D%20*%5C%5D%20%5C%5B%25%7BHOSTNAME:node%7D%5C%5D%20%25%7BJAVALOGMESSAGE:message%7D" -T "$ES_HOME/logs/elasticsearch.log"
- ----
- // NOTCONSOLE
- // Not converting to console because this shows how curl can be used
- If the request does not encounter errors, the result will look something like
- this:
- [source,js]
- ----
- {
- "num_lines_analyzed" : 53,
- "num_messages_analyzed" : 53,
- "sample_start" : "[2018-09-27T14:39:28,518][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], net usable_space [165.4gb], net total_space [464.7gb], types [hfs]\n[2018-09-27T14:39:28,521][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], compressed ordinary object pointers [true]\n",
- "charset" : "UTF-8",
- "has_byte_order_marker" : false,
- "format" : "semi_structured_text",
- "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}",
- "grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", <1>
- "timestamp_field" : "timestamp",
- "joda_timestamp_formats" : [
- "ISO8601"
- ],
- "java_timestamp_formats" : [
- "ISO8601"
- ],
- "need_client_timezone" : true,
- "mappings" : {
- "properties" : {
- "@timestamp" : {
- "type" : "date"
- },
- "class" : {
- "type" : "keyword"
- },
- "loglevel" : {
- "type" : "keyword"
- },
- "message" : {
- "type" : "text"
- },
- "node" : {
- "type" : "keyword"
- }
- }
- },
- "ingest_pipeline" : {
- "description" : "Ingest pipeline created by file structure finder",
- "processors" : [
- {
- "grok" : {
- "field" : "message",
- "patterns" : [
- "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}"
- ]
- }
- },
- {
- "date" : {
- "field" : "timestamp",
- "timezone" : "{{ event.timezone }}",
- "formats" : [
- "ISO8601"
- ]
- }
- },
- {
- "remove" : {
- "field" : "timestamp"
- }
- }
- ]
- },
- "field_stats" : { <2>
- "class" : {
- "count" : 53,
- "cardinality" : 14,
- "top_hits" : [
- {
- "value" : "o.e.p.PluginsService",
- "count" : 26
- },
- {
- "value" : "o.e.c.m.MetadataIndexTemplateService",
- "count" : 8
- },
- {
- "value" : "o.e.n.Node",
- "count" : 7
- },
- {
- "value" : "o.e.e.NodeEnvironment",
- "count" : 2
- },
- {
- "value" : "o.e.a.ActionModule",
- "count" : 1
- },
- {
- "value" : "o.e.c.s.ClusterApplierService",
- "count" : 1
- },
- {
- "value" : "o.e.c.s.MasterService",
- "count" : 1
- },
- {
- "value" : "o.e.d.DiscoveryModule",
- "count" : 1
- },
- {
- "value" : "o.e.g.GatewayService",
- "count" : 1
- },
- {
- "value" : "o.e.l.LicenseService",
- "count" : 1
- }
- ]
- },
- "loglevel" : {
- "count" : 53,
- "cardinality" : 3,
- "top_hits" : [
- {
- "value" : "INFO",
- "count" : 51
- },
- {
- "value" : "DEBUG",
- "count" : 1
- },
- {
- "value" : "WARN",
- "count" : 1
- }
- ]
- },
- "message" : {
- "count" : 53,
- "cardinality" : 53,
- "top_hits" : [
- {
- "value" : "Using REST wrapper from plugin org.elasticsearch.xpack.security.Security",
- "count" : 1
- },
- {
- "value" : "adding template [.monitoring-alerts] for index patterns [.monitoring-alerts-6]",
- "count" : 1
- },
- {
- "value" : "adding template [.monitoring-beats] for index patterns [.monitoring-beats-6-*]",
- "count" : 1
- },
- {
- "value" : "adding template [.monitoring-es] for index patterns [.monitoring-es-6-*]",
- "count" : 1
- },
- {
- "value" : "adding template [.monitoring-kibana] for index patterns [.monitoring-kibana-6-*]",
- "count" : 1
- },
- {
- "value" : "adding template [.monitoring-logstash] for index patterns [.monitoring-logstash-6-*]",
- "count" : 1
- },
- {
- "value" : "adding template [.triggered_watches] for index patterns [.triggered_watches*]",
- "count" : 1
- },
- {
- "value" : "adding template [.watch-history-9] for index patterns [.watcher-history-9*]",
- "count" : 1
- },
- {
- "value" : "adding template [.watches] for index patterns [.watches*]",
- "count" : 1
- },
- {
- "value" : "starting ...",
- "count" : 1
- }
- ]
- },
- "node" : {
- "count" : 53,
- "cardinality" : 1,
- "top_hits" : [
- {
- "value" : "node-0",
- "count" : 53
- }
- ]
- },
- "timestamp" : {
- "count" : 53,
- "cardinality" : 28,
- "earliest" : "2018-09-27T14:39:28,518",
- "latest" : "2018-09-27T14:39:37,012",
- "top_hits" : [
- {
- "value" : "2018-09-27T14:39:29,859",
- "count" : 10
- },
- {
- "value" : "2018-09-27T14:39:29,860",
- "count" : 9
- },
- {
- "value" : "2018-09-27T14:39:29,858",
- "count" : 6
- },
- {
- "value" : "2018-09-27T14:39:28,523",
- "count" : 3
- },
- {
- "value" : "2018-09-27T14:39:34,234",
- "count" : 2
- },
- {
- "value" : "2018-09-27T14:39:28,518",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:28,521",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:28,522",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:29,861",
- "count" : 1
- },
- {
- "value" : "2018-09-27T14:39:32,786",
- "count" : 1
- }
- ]
- }
- }
- }
- ----
- // NOTCONSOLE
- <1> The `grok_pattern` in the output is now the overridden one supplied in the
- query parameter.
- <2> The returned `field_stats` include entries for the fields from the
- overridden `grok_pattern`.
- The URL escaping is hard, so if you are working interactively it is best to use
- the {ml} UI!
|