
CONSOLEify analysis docs

Converts the analysis docs that were marked as json into `CONSOLE`
format. A few of them were in yaml but marked as json for historical
reasons. I added more complete examples for a few of the less
obvious-sounding ones.

Relates to #18160
Nik Everett, 8 years ago
commit ad69503dce
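
For context, the conversion pattern is the same across all of the files below: a bare settings fragment marked `[source,js]` becomes a complete, runnable index-creation request and gains a `// CONSOLE` marker so the docs build can execute it. A minimal sketch of the target shape (the index name `my_example_index` and the analyzer chain are illustrative only, not taken from the diffs below):

[source,js]
--------------------------------------------------
PUT /my_example_index
{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "default" : {
                    "tokenizer" : "standard",
                    "filter" : ["lowercase"]
                }
            }
        }
    }
}
--------------------------------------------------
// CONSOLE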

+ 4 - 12
docs/build.gradle

@@ -53,18 +53,6 @@ buildRestTests.expectedUnconvertedCandidates = [
   'reference/aggregations/pipeline/serial-diff-aggregation.asciidoc',
   'reference/aggregations/pipeline/stats-bucket-aggregation.asciidoc',
   'reference/aggregations/pipeline/sum-bucket-aggregation.asciidoc',
-  'reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/elision-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc',
   'reference/cat/snapshots.asciidoc',
   'reference/cat/templates.asciidoc',
   'reference/cat/thread_pool.asciidoc',
@@ -124,10 +112,14 @@ integTestCluster {
   configFile 'scripts/my_map_script.painless'
   configFile 'scripts/my_combine_script.painless'
   configFile 'scripts/my_reduce_script.painless'
+  configFile 'analysis/example_word_list.txt'
+  configFile 'analysis/hyphenation_patterns.xml'
   configFile 'analysis/synonym.txt'
   configFile 'analysis/stemmer_override.txt'
   configFile 'userdict_ja.txt'
   configFile 'KeywordTokenizer.rbbi'
+  extraConfigFile 'hunspell/en_US/en_US.aff', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff'
+  extraConfigFile 'hunspell/en_US/en_US.dic', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic'
   // Whitelist reindexing from the local node so we can test it.
   setting 'reindex.remote.whitelist', '127.0.0.1:*'
 }

+ 26 - 18
docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc

@@ -8,17 +8,21 @@ equivalents, if one exists.  Example:
 
 [source,js]
 --------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "asciifolding"]
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "asciifolding"]
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
 
 Accepts `preserve_original` setting which defaults to false but if true
 will keep the original token as well as emit the folded token.  For
@@ -26,20 +30,24 @@ example:
 
 [source,js]
 --------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "my_ascii_folding"]
-            }
-        },
-        "filter" : {
-            "my_ascii_folding" : {
-                "type" : "asciifolding",
-                "preserve_original" : true
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "my_ascii_folding"]
+                }
+            },
+            "filter" : {
+                "my_ascii_folding" : {
+                    "type" : "asciifolding",
+                    "preserve_original" : true
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE

+ 3 - 1
docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc

@@ -16,8 +16,9 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
 
 [source,js]
 --------------------------------------------------
+PUT /cjk_bigram_example
 {
-    "index" : {
+    "settings" : {
         "analysis" : {
             "analyzer" : {
                 "han_bigrams" : {
@@ -40,3 +41,4 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
     }
 }
 --------------------------------------------------
+// CONSOLE

+ 29 - 17
docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc

@@ -41,21 +41,33 @@ Here is an example:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            index_grams :
-                tokenizer : whitespace
-                filter : [common_grams]
-            search_grams :
-                tokenizer : whitespace
-                filter : [common_grams_query]
-        filter :
-            common_grams :
-                type : common_grams
-                common_words: [a, an, the]                
-            common_grams_query :
-                type : common_grams
-                query_mode: true
-                common_words: [a, an, the]                
+PUT /common_grams_example
+{
+    "settings": {
+        "analysis": {
+            "my_analyzer": {
+                "index_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams"]
+                },
+                "search_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams_query"]
+                }
+            },
+            "filter": {
+                "common_grams": {
+                    "type": "common_grams",
+                    "common_words": ["a", "an", "the"]
+                },
+                "common_grams_query": {
+                    "type": "common_grams",
+                    "query_mode": true,
+                    "common_words": ["a", "an", "the"]
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE

+ 28 - 17
docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc

@@ -1,5 +1,5 @@
 [[analysis-compound-word-tokenfilter]]
-=== Compound Word Token Filter
+=== Compound Word Token Filters
 
 The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
 decompose compound words found in many Germanic languages into word parts.
@@ -84,20 +84,31 @@ Here is an example:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : standard
-                filter : [myTokenFilter1, myTokenFilter2]
-        filter :
-            myTokenFilter1 :
-                type : dictionary_decompounder
-                word_list: [one, two, three]
-            myTokenFilter2 :
-                type : hyphenation_decompounder
-                word_list_path: path/to/words.txt
-                hyphenation_patterns_path: path/to/fop.xml
-                max_subword_size : 22
+PUT /compound_word_example
+{
+    "index": {
+        "analysis": {
+            "analyzer": {
+                "my_analyzer": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["dictionary_decompounder", "hyphenation_decompounder"]
+                }
+            },
+            "filter": {
+                "dictionary_decompounder": {
+                    "type": "dictionary_decompounder",
+                    "word_list": ["one", "two", "three"]
+                },
+                "hyphenation_decompounder": {
+                    "type" : "hyphenation_decompounder",
+                    "word_list_path": "analysis/example_word_list.txt",
+                    "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
+                    "max_subword_size": 22
+                }
+            }
+        }
+    }
+}
 --------------------------------------------------
+// CONSOLE

+ 16 - 12
docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc

@@ -9,20 +9,24 @@ example:
 
 [source,js]
 --------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "elision"]
-            }
-        },
-        "filter" : {
-            "elision" : {
-                "type" : "elision",
-                "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+PUT /elision_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "elision"]
+                }
+            },
+            "filter" : {
+                "elision" : {
+                    "type" : "elision",
+                    "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE

+ 17 - 13
docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc

@@ -10,7 +10,7 @@ one or more `*.dic` files (all of which will automatically be picked up).
 For example, assuming the default hunspell location is used, the
 following directory layout will define the `en_US` dictionary:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 - conf
     |-- hunspell
@@ -42,24 +42,28 @@ settings:
 
 [source,js]
 --------------------------------------------------
+PUT /hunspell_example
 {
-    "analysis" : {
-        "analyzer" : {
-            "en" : {
-                "tokenizer" : "standard",
-                "filter" : [ "lowercase", "en_US" ]
-            }
-        },
-        "filter" : {
-            "en_US" : {
-                "type" : "hunspell",
-                "locale" : "en_US",
-                "dedup" : true
+    "settings": {
+        "analysis" : {
+            "analyzer" : {
+                "en" : {
+                    "tokenizer" : "standard",
+                    "filter" : [ "lowercase", "en_US" ]
+                }
+            },
+            "filter" : {
+                "en_US" : {
+                    "type" : "hunspell",
+                    "locale" : "en_US",
+                    "dedup" : true
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
 
 The hunspell token filter accepts four options:
 

+ 41 - 4
docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc

@@ -1,7 +1,7 @@
 [[analysis-keep-types-tokenfilter]]
 === Keep Types Token Filter
 
-A token filter of type `keep_types` that only keeps tokens with a token type 
+A token filter of type `keep_types` that only keeps tokens with a token type
 contained in a predefined set.
 
 
@@ -14,24 +14,61 @@ types:: a list of types to keep
 [float]
 === Settings example
 
+You can set it up like:
+
 [source,js]
 --------------------------------------------------
+PUT /keep_types_example
 {
-    "index" : {
+    "settings" : {
         "analysis" : {
             "analyzer" : {
                 "my_analyzer" : {
                     "tokenizer" : "standard",
                     "filter" : ["standard", "lowercase", "extract_numbers"]
-                },
+                }
             },
             "filter" : {
                 "extract_numbers" : {
                     "type" : "keep_types",
                     "types" : [ "<NUM>" ]
-                },
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
+
+And test it like:
+
+[source,js]
+--------------------------------------------------
+POST /keep_types_example/_analyze
+{
+  "analyzer" : "my_analyzer",
+  "text" : "this is just 1 a test"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "1",
+      "start_offset": 13,
+      "end_offset": 14,
+      "type": "<NUM>",
+      "position": 3
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+Note how only the `<NUM>` token is in the output.

+ 8 - 6
docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc

@@ -20,17 +20,18 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
 
 [source,js]
 --------------------------------------------------
+PUT /keep_words_example
 {
-    "index" : {
+    "settings" : {
         "analysis" : {
             "analyzer" : {
-                "my_analyzer" : {
+                "example_1" : {
                     "tokenizer" : "standard",
                     "filter" : ["standard", "lowercase", "words_till_three"]
                 },
-                "my_analyzer1" : {
+                "example_2" : {
                     "tokenizer" : "standard",
-                    "filter" : ["standard", "lowercase", "words_on_file"]
+                    "filter" : ["standard", "lowercase", "words_in_file"]
                 }
             },
             "filter" : {
@@ -38,12 +39,13 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
                     "type" : "keep",
                     "keep_words" : [ "one", "two", "three"]
                 },
-                "words_on_file" : {
+                "words_in_file" : {
                     "type" : "keep",
-                    "keep_words_path" : "/path/to/word/file"
+                    "keep_words_path" : "analysis/example_word_list.txt"
                 }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE

+ 117 - 12
docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc

@@ -19,19 +19,124 @@ in the text.
 `false`.
 |=======================================================================
 
-Here is an example:
+You can configure it like:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, protwords, porter_stem]    
-        filter :
-            protwords :
-                type : keyword_marker
-                keywords_path : analysis/protwords.txt
+PUT /keyword_marker_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "protect_cats": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "protect_cats", "porter_stem"]
+        },
+        "normal": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "porter_stem"]
+        }
+      },
+      "filter": {
+        "protect_cats": {
+          "type": "keyword_marker",
+          "keywords": ["cats"]
+        }
+      }
+    }
+  }
+}
 --------------------------------------------------
+// CONSOLE
+
+And test it with:
+
+[source,js]
+--------------------------------------------------
+POST /keyword_marker_example/_analyze
+{
+  "analyzer" : "protect_cats",
+  "text" : "I like cats"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "i",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "like",
+      "start_offset": 2,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "cats",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+As compared to the `normal` analyzer which has `cats` stemmed to `cat`:
+
+[source,js]
+--------------------------------------------------
+POST /keyword_marker_example/_analyze
+{
+  "analyzer" : "normal",
+  "text" : "I like cats"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "i",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "like",
+      "start_offset": 2,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "cat",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE

+ 78 - 11
docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc

@@ -9,18 +9,85 @@ subsequent stemmer will be indexed twice. Therefore, consider adding a
 `unique` filter with `only_on_same_position` set to `true` to drop
 unnecessary duplicates.
 
-Here is an example:
+Here is an example of using the `keyword_repeat` token filter to
+preserve both the stemmed and unstemmed versions of tokens:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, keyword_repeat, porter_stem, unique_stem]    
-            unique_stem:
-                type: unique
-                only_on_same_position : true
+PUT /keyword_repeat_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "stemmed_and_unstemmed": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "keyword_repeat", "porter_stem", "unique_stem"]
+        }
+      },
+      "filter": {
+        "unique_stem": {
+          "type": "unique",
+          "only_on_same_position": true
+        }
+      }
+    }
+  }
+}
 --------------------------------------------------
+// CONSOLE
+
+And you can test it with:
+
+[source,js]
+--------------------------------------------------
+POST /keyword_repeat_example/_analyze
+{
+  "analyzer" : "stemmed_and_unstemmed",
+  "text" : "I like cats"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "i",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "like",
+      "start_offset": 2,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "cats",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "cat",
+      "start_offset": 7,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+This preserves both the `cat` and `cats` tokens. Compare this to the example
+for the <<analysis-keyword-marker-tokenfilter>>.

+ 21 - 11
docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc

@@ -18,15 +18,25 @@ Here is an example:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, five_token_limit]
-        filter :
-            five_token_limit :
-                type : limit
-                max_token_count : 5
+PUT /limit_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "limit_example": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase", "five_token_limit"]
+        }
+      },
+      "filter": {
+        "five_token_limit": {
+          "type": "limit",
+          "max_token_count": 5
+        }
+      }
+    }
+  }
+}
 --------------------------------------------------
+// CONSOLE

+ 26 - 24
docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc

@@ -10,28 +10,30 @@ custom analyzer
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer : 
-            myAnalyzer2 :
-                type : custom
-                tokenizer : myTokenizer1
-                filter : [myTokenFilter1, myGreekLowerCaseFilter]
-                char_filter : [my_html]
-        tokenizer :
-            myTokenizer1 :
-                type : standard
-                max_token_length : 900
-        filter :
-            myTokenFilter1 :
-                type : stop
-                stopwords : [stop1, stop2, stop3, stop4]
-            myGreekLowerCaseFilter :
-                type : lowercase
-                language : greek
-        char_filter :
-              my_html :
-                type : html_strip
-                escaped_tags : [xxx, yyy]
-                read_ahead : 1024
+PUT /lowercase_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "standard_lowercase_example": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["lowercase"]
+        },
+        "greek_lowercase_example": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": ["greek_lowercase"]
+        }
+      },
+      "filter": {
+        "greek_lowercase": {
+          "type": "lowercase",
+          "language": "greek"
+        }
+      }
+    }
+  }
+}
 --------------------------------------------------
+// CONSOLE

+ 4 - 0
docs/src/test/cluster/config/analysis/example_word_list.txt

@@ -0,0 +1,4 @@
+test
+list
+of
+words

+ 21 - 0
docs/src/test/cluster/config/analysis/hyphenation_patterns.xml

@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
+
+<!-- Example hyphenation patterns file. -->
+
+<hyphenation-info>
+
+<hyphen-char value="-"/>
+<hyphen-min before="2" after="2"/>
+
+<classes>
+aA
+</classes>
+
+<exceptions>
+</exceptions>
+
+<patterns>
+.a2
+</patterns>
+</hyphenation-info>