
Docs: Test examples that recreate lang analyzers (#29535)

We have a pile of documentation describing how to rebuild the built-in
language analyzers and, previously, our documentation testing framework
made sure that the examples successfully built *an* analyzer, but it
didn't assert that the analyzer built by the documentation matches the
built-in analyzer. Unsurprisingly, some of the examples aren't quite
right.

This adds a mechanism that tests that the analyzers built by the docs
match the built-in analyzers. The mechanism is fairly simple and brutal
but it seems to be working: build a hundred random unicode sequences,
send them through the `_analyze` API with the rebuilt analyzer and then
again with the built-in analyzer, and make sure both calls return the
same results. Each of these calls to `_analyze` takes about 20ms on my
laptop, which seems fine.
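A minimal sketch of the comparison at the heart of that mechanism, assuming the two token lists have already been fetched from `_analyze` (the real implementation is the `CompareAnalyzers` section added to `DocsClientYamlTestSuiteIT` below):

```java
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class TokenListComparison {
    /**
     * Assert that two token streams, as returned by the _analyze API,
     * are identical, failing with a descriptive message on mismatch.
     */
    static void assertSameTokens(String first, List<Map<String, Object>> firstTokens,
                                 String second, List<Map<String, Object>> secondTokens) {
        Iterator<Map<String, Object>> a = firstTokens.iterator();
        Iterator<Map<String, Object>> b = secondTokens.iterator();
        while (a.hasNext() && b.hasNext()) {
            Map<String, Object> left = a.next();
            Map<String, Object> right = b.next();
            // Tokens must match on every attribute: text, offsets, position, type
            if (false == left.equals(right)) {
                throw new AssertionError("token differs: " + first + " produced "
                    + left + " but " + second + " produced " + right);
            }
        }
        if (a.hasNext() || b.hasNext()) {
            // One analyzer emitted more tokens than the other
            throw new AssertionError((a.hasNext() ? second : first)
                + " ran out of tokens before " + (a.hasNext() ? first : second));
        }
    }
}
```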
Nik Everett, 7 years ago
commit f9dc86836d

+ 9 - 2
buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/RestTestsFromSnippetsTask.groovy

@@ -141,9 +141,11 @@ public class RestTestsFromSnippetsTask extends SnippetsTask {
         private static final String SYNTAX = {
             String method = /(?<method>GET|PUT|POST|HEAD|OPTIONS|DELETE)/
             String pathAndQuery = /(?<pathAndQuery>[^\n]+)/
-            String badBody = /GET|PUT|POST|HEAD|OPTIONS|DELETE|#/
+            String badBody = /GET|PUT|POST|HEAD|OPTIONS|DELETE|startyaml|#/
             String body = /(?<body>(?:\n(?!$badBody)[^\n]+)+)/
-            String nonComment = /$method\s+$pathAndQuery$body?/
+            String rawRequest = /(?:$method\s+$pathAndQuery$body?)/
+            String yamlRequest = /(?:startyaml(?s)(?<yaml>.+?)(?-s)endyaml)/
+            String nonComment = /(?:$rawRequest|$yamlRequest)/
             String comment = /(?<comment>#.+)/
             /(?:$comment|$nonComment)\n+/
         }()
@@ -333,6 +335,11 @@ public class RestTestsFromSnippetsTask extends SnippetsTask {
                     // Comment
                     return
                 }
+                String yamlRequest = matcher.group("yaml");
+                if (yamlRequest != null) {
+                    current.println(yamlRequest)
+                    return
+                }
                 String method = matcher.group("method")
                 String pathAndQuery = matcher.group("pathAndQuery")
                 String body = matcher.group("body")
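Groovy regexes compile down to `java.util.regex`, so the `yamlRequest` alternative added above can be exercised in isolation. A small standalone check, using the README example snippet below as input:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class YamlMarkerDemo {
    public static void main(String[] args) {
        // Same pattern as the yamlRequest alternative above: (?s) switches on
        // DOTALL so the body can span lines, (?-s) switches it back off.
        Pattern yamlRequest = Pattern.compile("(?:startyaml(?s)(?<yaml>.+?)(?-s)endyaml)");
        String snippet = "startyaml\n"
            + "  - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\n"
            + "endyaml";
        Matcher m = yamlRequest.matcher(snippet);
        if (m.find()) {
            System.out.print(m.group("yaml")); // the raw yaml between the markers
        }
    }
}
```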

+ 17 - 0
docs/README.asciidoc

@@ -68,6 +68,23 @@ for its modifiers:
   but rather than the setup defined in `docs/build.gradle` the setup is defined
   right in the documentation file.
 
+In addition to the standard CONSOLE syntax, these snippets can contain blocks
+of yaml surrounded by markers like this:
+
+```
+startyaml
+  - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}
+endyaml
+```
+
+This allows slightly more expressive testing of the snippets. Since that syntax
+is not supported by CONSOLE, the usual way to incorporate it is with a
+`// TEST[s//]` marker like this:
+
+```
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\nendyaml\n/]
+```
+
 Any place you can use json you can use elements like `$body.path.to.thing`
 which is replaced on the fly with the contents of the thing at `path.to.thing`
 in the last response.

+ 2 - 0
docs/build.gradle

@@ -60,6 +60,8 @@ buildRestTests.docs = fileTree(projectDir) {
   exclude 'build.gradle'
   // That is where the snippets go, not where they come from!
   exclude 'build'
+  // Just syntax examples
+  exclude 'README.asciidoc'
 }
 
 Closure setupTwitter = { String name, int count ->

+ 128 - 40
docs/reference/analysis/analyzers/lang-analyzer.asciidoc

@@ -97,10 +97,11 @@ PUT /arabic_example
         }
       },
       "analyzer": {
-        "arabic": {
+        "rebuilt_arabic": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
+            "decimal_digit",
             "arabic_stop",
             "arabic_normalization",
             "arabic_keywords",
@@ -113,6 +114,8 @@ PUT /arabic_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"arabic_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: arabic_example, first: arabic, second: rebuilt_arabic}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -144,7 +147,7 @@ PUT /armenian_example
         }
       },
       "analyzer": {
-        "armenian": {
+        "rebuilt_armenian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -159,6 +162,8 @@ PUT /armenian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"armenian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: armenian_example, first: armenian, second: rebuilt_armenian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -190,7 +195,7 @@ PUT /basque_example
         }
       },
       "analyzer": {
-        "basque": {
+        "rebuilt_basque": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -205,6 +210,8 @@ PUT /basque_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"basque_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: basque_example, first: basque, second: rebuilt_basque}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -236,14 +243,15 @@ PUT /bengali_example
         }
       },
       "analyzer": {
-        "bengali": {
+        "rebuilt_bengali": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
+            "decimal_digit",
+            "bengali_keywords",
             "indic_normalization",
             "bengali_normalization",
             "bengali_stop",
-            "bengali_keywords",
             "bengali_stemmer"
           ]
         }
@@ -253,6 +261,8 @@ PUT /bengali_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"bengali_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: bengali_example, first: bengali, second: rebuilt_bengali}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -284,7 +294,7 @@ PUT /brazilian_example
         }
       },
       "analyzer": {
-        "brazilian": {
+        "rebuilt_brazilian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -299,6 +309,8 @@ PUT /brazilian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"brazilian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: brazilian_example, first: brazilian, second: rebuilt_brazilian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -330,7 +342,7 @@ PUT /bulgarian_example
         }
       },
       "analyzer": {
-        "bulgarian": {
+        "rebuilt_bulgarian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -345,6 +357,8 @@ PUT /bulgarian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"bulgarian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: bulgarian_example, first: bulgarian, second: rebuilt_bulgarian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -380,7 +394,7 @@ PUT /catalan_example
         }
       },
       "analyzer": {
-        "catalan": {
+        "rebuilt_catalan": {
           "tokenizer":  "standard",
           "filter": [
             "catalan_elision",
@@ -396,6 +410,8 @@ PUT /catalan_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"catalan_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: catalan_example, first: catalan, second: rebuilt_catalan}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -415,11 +431,17 @@ PUT /cjk_example
       "filter": {
         "english_stop": {
           "type":       "stop",
-          "stopwords":  "_english_" <1>
+          "stopwords":  [ <1>
+            "a", "and", "are", "as", "at", "be", "but", "by", "for",
+            "if", "in", "into", "is", "it", "no", "not", "of", "on",
+            "or", "s", "such", "t", "that", "the", "their", "then",
+            "there", "these", "they", "this", "to", "was", "will",
+            "with", "www"
+          ]
         }
       },
       "analyzer": {
-        "cjk": {
+        "rebuilt_cjk": {
           "tokenizer":  "standard",
           "filter": [
             "cjk_width",
@@ -434,8 +456,12 @@ PUT /cjk_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"cjk_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: cjk_example, first: cjk, second: rebuilt_cjk}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
-    or `stopwords_path` parameters.
+    or `stopwords_path` parameters. The default stop words are
+    *almost* the same as the `_english_` set, but not exactly
+    the same.
 
 [[czech-analyzer]]
 ===== `czech` analyzer
@@ -463,7 +489,7 @@ PUT /czech_example
         }
       },
       "analyzer": {
-        "czech": {
+        "rebuilt_czech": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -478,6 +504,8 @@ PUT /czech_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"czech_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: czech_example, first: czech, second: rebuilt_czech}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -509,7 +537,7 @@ PUT /danish_example
         }
       },
       "analyzer": {
-        "danish": {
+        "rebuilt_danish": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -524,6 +552,8 @@ PUT /danish_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"danish_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: danish_example, first: danish, second: rebuilt_danish}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -564,7 +594,7 @@ PUT /dutch_example
         }
       },
       "analyzer": {
-        "dutch": {
+        "rebuilt_dutch": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -580,6 +610,8 @@ PUT /dutch_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"dutch_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: dutch_example, first: dutch, second: rebuilt_dutch}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -615,7 +647,7 @@ PUT /english_example
         }
       },
       "analyzer": {
-        "english": {
+        "rebuilt_english": {
           "tokenizer":  "standard",
           "filter": [
             "english_possessive_stemmer",
@@ -631,6 +663,8 @@ PUT /english_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"english_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: english_example, first: english, second: rebuilt_english}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -662,7 +696,7 @@ PUT /finnish_example
         }
       },
       "analyzer": {
-        "finnish": {
+        "rebuilt_finnish": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -677,6 +711,8 @@ PUT /finnish_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"finnish_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: finnish_example, first: finnish, second: rebuilt_finnish}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -717,7 +753,7 @@ PUT /french_example
         }
       },
       "analyzer": {
-        "french": {
+        "rebuilt_french": {
           "tokenizer":  "standard",
           "filter": [
             "french_elision",
@@ -733,6 +769,8 @@ PUT /french_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"french_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: french_example, first: french, second: rebuilt_french}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -764,7 +802,7 @@ PUT /galician_example
         }
       },
       "analyzer": {
-        "galician": {
+        "rebuilt_galician": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -779,6 +817,8 @@ PUT /galician_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"galician_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: galician_example, first: galician, second: rebuilt_galician}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -810,7 +850,7 @@ PUT /german_example
         }
       },
       "analyzer": {
-        "german": {
+        "rebuilt_german": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -826,6 +866,8 @@ PUT /german_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"german_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: german_example, first: german, second: rebuilt_german}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -861,7 +903,7 @@ PUT /greek_example
         }
       },
       "analyzer": {
-        "greek": {
+        "rebuilt_greek": {
           "tokenizer":  "standard",
           "filter": [
             "greek_lowercase",
@@ -876,6 +918,8 @@ PUT /greek_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"greek_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: greek_example, first: greek, second: rebuilt_greek}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -907,14 +951,15 @@ PUT /hindi_example
         }
       },
       "analyzer": {
-        "hindi": {
+        "rebuilt_hindi": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
+            "decimal_digit",
+            "hindi_keywords",
             "indic_normalization",
             "hindi_normalization",
             "hindi_stop",
-            "hindi_keywords",
             "hindi_stemmer"
           ]
         }
@@ -924,6 +969,8 @@ PUT /hindi_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"hindi_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: hindi_example, first: hindi, second: rebuilt_hindi}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -955,7 +1002,7 @@ PUT /hungarian_example
         }
       },
       "analyzer": {
-        "hungarian": {
+        "rebuilt_hungarian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -970,6 +1017,8 @@ PUT /hungarian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"hungarian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: hungarian_example, first: hungarian, second: rebuilt_hungarian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1002,7 +1051,7 @@ PUT /indonesian_example
         }
       },
       "analyzer": {
-        "indonesian": {
+        "rebuilt_indonesian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1017,6 +1066,8 @@ PUT /indonesian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"indonesian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: indonesian_example, first: indonesian, second: rebuilt_indonesian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1034,9 +1085,15 @@ PUT /irish_example
   "settings": {
     "analysis": {
       "filter": {
+        "irish_hyphenation": {
+          "type":       "stop",
+          "stopwords":  [ "h", "n", "t" ],
+          "ignore_case": true
+        },
         "irish_elision": {
           "type":       "elision",
-          "articles": [ "h", "n", "t" ]
+          "articles":   [ "d", "m", "b" ],
+          "articles_case": true
         },
         "irish_stop": {
           "type":       "stop",
@@ -1056,12 +1113,13 @@ PUT /irish_example
         }
       },
       "analyzer": {
-        "irish": {
+        "rebuilt_irish": {
           "tokenizer":  "standard",
           "filter": [
-            "irish_stop",
+            "irish_hyphenation",
             "irish_elision",
             "irish_lowercase",
+            "irish_stop",
             "irish_keywords",
             "irish_stemmer"
           ]
@@ -1072,6 +1130,8 @@ PUT /irish_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"irish_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: irish_example, first: irish, second: rebuilt_irish}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1112,7 +1172,7 @@ PUT /italian_example
         }
       },
       "analyzer": {
-        "italian": {
+        "rebuilt_italian": {
           "tokenizer":  "standard",
           "filter": [
             "italian_elision",
@@ -1128,6 +1188,8 @@ PUT /italian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"italian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: italian_example, first: italian, second: rebuilt_italian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1159,7 +1221,7 @@ PUT /latvian_example
         }
       },
       "analyzer": {
-        "latvian": {
+        "rebuilt_latvian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1174,6 +1236,8 @@ PUT /latvian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"latvian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: latvian_example, first: latvian, second: rebuilt_latvian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1205,7 +1269,7 @@ PUT /lithuanian_example
         }
       },
       "analyzer": {
-        "lithuanian": {
+        "rebuilt_lithuanian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1220,6 +1284,8 @@ PUT /lithuanian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"lithuanian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: lithuanian_example, first: lithuanian, second: rebuilt_lithuanian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1251,7 +1317,7 @@ PUT /norwegian_example
         }
       },
       "analyzer": {
-        "norwegian": {
+        "rebuilt_norwegian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1266,6 +1332,8 @@ PUT /norwegian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"norwegian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: norwegian_example, first: norwegian, second: rebuilt_norwegian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1295,11 +1363,12 @@ PUT /persian_example
         }
       },
       "analyzer": {
-        "persian": {
+        "rebuilt_persian": {
           "tokenizer":     "standard",
           "char_filter": [ "zero_width_spaces" ],
           "filter": [
             "lowercase",
+            "decimal_digit",
             "arabic_normalization",
             "persian_normalization",
             "persian_stop"
@@ -1311,6 +1380,7 @@ PUT /persian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: persian_example, first: persian, second: rebuilt_persian}\nendyaml\n/]
 <1> Replaces zero-width non-joiners with an ASCII space.
 <2> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
@@ -1341,7 +1411,7 @@ PUT /portuguese_example
         }
       },
       "analyzer": {
-        "portuguese": {
+        "rebuilt_portuguese": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1356,6 +1426,8 @@ PUT /portuguese_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"portuguese_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: portuguese_example, first: portuguese, second: rebuilt_portuguese}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1387,7 +1459,7 @@ PUT /romanian_example
         }
       },
       "analyzer": {
-        "romanian": {
+        "rebuilt_romanian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1402,6 +1474,8 @@ PUT /romanian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"romanian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: romanian_example, first: romanian, second: rebuilt_romanian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1434,7 +1508,7 @@ PUT /russian_example
         }
       },
       "analyzer": {
-        "russian": {
+        "rebuilt_russian": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1449,6 +1523,8 @@ PUT /russian_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"russian_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: russian_example, first: russian, second: rebuilt_russian}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1480,11 +1556,12 @@ PUT /sorani_example
         }
       },
       "analyzer": {
-        "sorani": {
+        "rebuilt_sorani": {
           "tokenizer":  "standard",
           "filter": [
             "sorani_normalization",
             "lowercase",
+            "decimal_digit",
             "sorani_stop",
             "sorani_keywords",
             "sorani_stemmer"
@@ -1496,6 +1573,8 @@ PUT /sorani_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"sorani_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: sorani_example, first: sorani, second: rebuilt_sorani}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1527,7 +1606,7 @@ PUT /spanish_example
         }
       },
       "analyzer": {
-        "spanish": {
+        "rebuilt_spanish": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1542,6 +1621,8 @@ PUT /spanish_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"spanish_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: spanish_example, first: spanish, second: rebuilt_spanish}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1573,7 +1654,7 @@ PUT /swedish_example
         }
       },
       "analyzer": {
-        "swedish": {
+        "rebuilt_swedish": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
@@ -1588,6 +1669,8 @@ PUT /swedish_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"swedish_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: swedish_example, first: swedish, second: rebuilt_swedish}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1623,7 +1706,7 @@ PUT /turkish_example
         }
       },
       "analyzer": {
-        "turkish": {
+        "rebuilt_turkish": {
           "tokenizer":  "standard",
           "filter": [
             "apostrophe",
@@ -1639,6 +1722,8 @@ PUT /turkish_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"turkish_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: turkish_example, first: turkish, second: rebuilt_turkish}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
 <2> This filter should be removed unless there are words which should
@@ -1662,10 +1747,11 @@ PUT /thai_example
         }
       },
       "analyzer": {
-        "thai": {
+        "rebuilt_thai": {
           "tokenizer":  "thai",
           "filter": [
             "lowercase",
+            "decimal_digit",
             "thai_stop"
           ]
         }
@@ -1675,5 +1761,7 @@ PUT /thai_example
 }
 ----------------------------------------------------
 // CONSOLE
+// TEST[s/"thai_keywords",//]
+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\nendyaml\n/]
 <1> The default stopwords can be overridden with the `stopwords`
     or `stopwords_path` parameters.
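Each `// TEST[s/…/…/]` marker above is a sed-style substitution applied to the snippet before it is turned into a test, so the `s/\n$/\nstartyaml…endyaml\n/` markers splice a `compare_analyzers` block onto the end of each example. A minimal sketch of that splice, over a simplified stand-in for a snippet body:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TestMarkerDemo {
    public static void main(String[] args) {
        // Simplified stand-in for a snippet body that ends with a newline
        String snippet = "PUT /thai_example\n{ \"settings\": {} }\n";
        String yamlBlock = "\nstartyaml\n"
            + "  - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\n"
            + "endyaml\n";
        // Mirrors the s/\n$/.../ substitution: replace the trailing newline
        // with a newline followed by the yaml block.
        String rewritten = Pattern.compile("\n$").matcher(snippet)
            .replaceAll(Matcher.quoteReplacement(yamlBlock));
        System.out.print(rewritten);
    }
}
```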

+ 140 - 2
docs/src/test/java/org/elasticsearch/smoketest/DocsClientYamlTestSuiteIT.java

@@ -20,18 +20,39 @@
 package org.elasticsearch.smoketest;
 
 import org.apache.http.HttpHost;
+import org.apache.lucene.util.BytesRef;
+
 import com.carrotsearch.randomizedtesting.annotations.Name;
 import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
 import org.elasticsearch.Version;
 import org.elasticsearch.client.RestClient;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
+import org.elasticsearch.common.xcontent.XContentLocation;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.common.xcontent.XContentParser.Token;
 import org.elasticsearch.test.rest.yaml.ClientYamlDocsTestClient;
 import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
 import org.elasticsearch.test.rest.yaml.ClientYamlTestClient;
+import org.elasticsearch.test.rest.yaml.ClientYamlTestExecutionContext;
+import org.elasticsearch.test.rest.yaml.ClientYamlTestResponse;
 import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
 import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestSpec;
+import org.elasticsearch.test.rest.yaml.section.ExecutableSection;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
+
+import static java.util.Collections.emptyMap;
+import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
 
 public class DocsClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
 
@@ -41,7 +62,12 @@ public class DocsClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
 
     @ParametersFactory
     public static Iterable<Object[]> parameters() throws Exception {
-        return ESClientYamlSuiteTestCase.createParameters();
+        List<NamedXContentRegistry.Entry> entries = new ArrayList<>(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS.size() + 1);
+        entries.addAll(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS);
+        entries.add(new NamedXContentRegistry.Entry(ExecutableSection.class,
+                new ParseField("compare_analyzers"), CompareAnalyzers::parse));
+        NamedXContentRegistry executeableSectionRegistry = new NamedXContentRegistry(entries);
+        return ESClientYamlSuiteTestCase.createParameters(executeableSectionRegistry);
     }
 
     @Override
@@ -64,5 +90,117 @@ public class DocsClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
                                                             List<HttpHost> hosts, Version esVersion) throws IOException {
         return new ClientYamlDocsTestClient(restSpec, restClient, hosts, esVersion);
     }
-}
 
+    /**
+     * Compares the results of running two analyzers against many random
+     * strings. The goal is to figure out if two analyzers are "the same" by
+     * comparing their results. This is far from perfect but should be fairly
+     * accurate, especially for gross things like missing {@code decimal_digit}
+     * token filters, and should be fairly fast because it compares a fairly
+     * small number of tokens.
+     */
+    private static class CompareAnalyzers implements ExecutableSection {
+        private static ConstructingObjectParser<CompareAnalyzers, XContentLocation> PARSER =
+            new ConstructingObjectParser<>("test_analyzer", false, (a, location) -> {
+                String index = (String) a[0];
+                String first = (String) a[1];
+                String second = (String) a[2];
+                return new CompareAnalyzers(location, index, first, second);
+            });
+        static {
+            PARSER.declareString(constructorArg(), new ParseField("index"));
+            PARSER.declareString(constructorArg(), new ParseField("first"));
+            PARSER.declareString(constructorArg(), new ParseField("second"));
+        }
+        private static CompareAnalyzers parse(XContentParser parser) throws IOException {
+            XContentLocation location = parser.getTokenLocation();
+            CompareAnalyzers section = PARSER.parse(parser, location);
+            assert parser.currentToken() == Token.END_OBJECT : "End of object required";
+            parser.nextToken(); // throw out the END_OBJECT to conform with other ExecutableSections
+            return section;
+        }
+
+        private final XContentLocation location;
+        private final String index;
+        private final String first;
+        private final String second;
+
+        private CompareAnalyzers(XContentLocation location, String index, String first, String second) {
+            this.location = location;
+            this.index = index;
+            this.first = first;
+            this.second = second;
+        }
+
+        @Override
+        public XContentLocation getLocation() {
+            return location;
+        }
+
+        @Override
+        public void execute(ClientYamlTestExecutionContext executionContext) throws IOException {
+            int size = 100;
+            int maxLength = 15;
+            List<String> testText = new ArrayList<>(size);
+            for (int i = 0; i < size; i++) {
+                /**
+                 * Build a string with a few unicode sequences separated by
+                 * spaces. The unicode sequences aren't going to be of the same
+                 * code page which is a shame because it makes the entire
+                 * string less realistic. But this still provides a fairly
+                 * nice string to compare.
+                 */
+                int spaces = between(0, 5);
+                StringBuilder b = new StringBuilder((spaces + 1) * maxLength);
+                b.append(randomRealisticUnicodeOfCodepointLengthBetween(1, maxLength));
+                for (int t = 0; t < spaces; t++) {
+                    b.append(' ');
+                    b.append(randomRealisticUnicodeOfCodepointLengthBetween(1, maxLength));
+                }
+                testText.add(b.toString()
+                    // Don't look up stashed values
+                    .replace("$", "\\$"));
+            }
+            Map<String, Object> body = new HashMap<>(2);
+            body.put("analyzer", first);
+            body.put("text", testText);
+            ClientYamlTestResponse response = executionContext.callApi("indices.analyze", singletonMap("index", index),
+                    singletonList(body), emptyMap());
+            Iterator<?> firstTokens = ((List<?>) response.evaluate("tokens")).iterator();
+            body.put("analyzer", second);
+            response = executionContext.callApi("indices.analyze", singletonMap("index", index),
+                    singletonList(body), emptyMap());
+            Iterator<?> secondTokens = ((List<?>) response.evaluate("tokens")).iterator();
+
+            Object previousFirst = null;
+            Object previousSecond = null;
+            while (firstTokens.hasNext()) {
+                if (false == secondTokens.hasNext()) {
+                    fail(second + " has fewer tokens than " + first + ". "
+                        + first + " has [" + firstTokens.next() + "] but " + second + " is out of tokens. "
+                        + first + "'s last token was [" + previousFirst + "] and "
+                        + second + "'s last token was [" + previousSecond + "]");
+                }
+                Map<?, ?> firstToken = (Map<?, ?>) firstTokens.next();
+                Map<?, ?> secondToken = (Map<?, ?>) secondTokens.next();
+                String firstText = (String) firstToken.get("token");
+                String secondText = (String) secondToken.get("token");
+                // Check the text and produce an error message with the utf8 sequence if they don't match.
+                if (false == secondText.equals(firstText)) {
+                    fail("text differs: " + first + " was [" + firstText + "] but " + second + " was [" + secondText
+                        + "]. In utf8 those are\n" + new BytesRef(firstText) + " and\n" + new BytesRef(secondText));
+                }
+                // Now check the whole map just in case the text matches but something else differs
+                assertEquals(firstToken, secondToken);
+                previousFirst = firstToken;
+                previousSecond = secondToken;
+            }
+            if (secondTokens.hasNext()) {
+                fail(second + " has more tokens than " + first + ". "
+                    + second + " has [" + secondTokens.next() + "] but " + first + " is out of tokens. "
+                    + first + "'s last token was [" + previousFirst + "] and "
+                    + second + "'s last token was [" + previousSecond + "]");
+            }
+        }
+    }
+}
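The `parameters()` override above doubles as a recipe: any docs-style suite can splice custom sections into the default list before handing the registry to `createParameters`. A sketch, where `my_check` and `MyCheck::parse` are hypothetical stand-ins for a custom `ExecutableSection` and its parser:

```java
@ParametersFactory
public static Iterable<Object[]> parameters() throws Exception {
    List<NamedXContentRegistry.Entry> entries =
            new ArrayList<>(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS);
    // "my_check" and MyCheck::parse are hypothetical; supply your own
    // ExecutableSection implementation and parser here.
    entries.add(new NamedXContentRegistry.Entry(ExecutableSection.class,
            new ParseField("my_check"), MyCheck::parse));
    return ESClientYamlSuiteTestCase.createParameters(new NamedXContentRegistry(entries));
}
```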

+ 19 - 8
test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ClientYamlTestClient.java

@@ -121,7 +121,7 @@ public class ClientYamlTestClient {
             }
             String contentType = entity.getContentType().getValue();
             //randomly test the GET with source param instead of GET/POST with body
-            if (sendBodyAsSourceParam(supportedMethods, contentType)) {
+            if (sendBodyAsSourceParam(supportedMethods, contentType, entity.getContentLength())) {
                 logger.debug("sending the request body as source param with GET method");
                 queryStringParams.put("source", EntityUtils.toString(entity));
                 queryStringParams.put("source_content_type", contentType);
@@ -177,14 +177,25 @@ public class ClientYamlTestClient {
         }
     }
 
-    private static boolean sendBodyAsSourceParam(List<String> supportedMethods, String contentType) {
-        if (supportedMethods.contains(HttpGet.METHOD_NAME)) {
-            if (contentType.startsWith(ContentType.APPLICATION_JSON.getMimeType()) ||
-                    contentType.startsWith(YAML_CONTENT_TYPE.getMimeType())) {
-                return RandomizedTest.rarely();
-            }
+    private static boolean sendBodyAsSourceParam(List<String> supportedMethods, String contentType, long contentLength) {
+        if (false == supportedMethods.contains(HttpGet.METHOD_NAME)) {
+            // The API doesn't claim to support GET anyway
+            return false;
+        }
+        if (contentLength < 0) {
+            // Negative length means "unknown" or "huge" in this case. Either way we can't send it as a parameter
+            return false;
+        }
+        if (contentLength > 2000) {
+            // Long bodies won't fit in the parameter and will cause a too_long_frame_exception
+            return false;
+        }
+        if (false == contentType.startsWith(ContentType.APPLICATION_JSON.getMimeType())
+                && false == contentType.startsWith(YAML_CONTENT_TYPE.getMimeType())) {
+            // We can only encode JSON or YAML this way.
+            return false;
         }
-        return false;
+        return RandomizedTest.rarely();
     }
 
     private ClientYamlSuiteRestApi restApi(String apiName) {

+ 14 - 1
test/framework/src/main/java/org/elasticsearch/test/rest/yaml/ESClientYamlSuiteTestCase.java

@@ -28,6 +28,7 @@ import org.elasticsearch.client.RestClient;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.PathUtils;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.test.rest.ESRestTestCase;
 import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestApi;
 import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestSpec;
@@ -143,7 +144,19 @@ public abstract class ESClientYamlSuiteTestCase extends ESRestTestCase {
         return new ClientYamlTestClient(restSpec, restClient, hosts, esVersion);
     }
 
+    /**
+     * Create parameters for this parameterized test. Uses the
+     * {@link ExecutableSection#XCONTENT_REGISTRY list} of executable sections
+     * defined in {@link ExecutableSection}.
+     */
     public static Iterable<Object[]> createParameters() throws Exception {
+        return createParameters(ExecutableSection.XCONTENT_REGISTRY);
+    }
+
+    /**
+     * Create parameters for this parameterized test.
+     */
+    public static Iterable<Object[]> createParameters(NamedXContentRegistry executeableSectionRegistry) throws Exception {
         String[] paths = resolvePathsProperty(REST_TESTS_SUITE, ""); // default to all tests under the test root
         List<Object[]> tests = new ArrayList<>();
         Map<String, Set<Path>> yamlSuites = loadSuites(paths);
@@ -151,7 +164,7 @@ public abstract class ESClientYamlSuiteTestCase extends ESRestTestCase {
         for (String api : yamlSuites.keySet()) {
             List<Path> yamlFiles = new ArrayList<>(yamlSuites.get(api));
             for (Path yamlFile : yamlFiles) {
-                ClientYamlTestSuite restTestSuite = ClientYamlTestSuite.parse(api, yamlFile);
+                ClientYamlTestSuite restTestSuite = ClientYamlTestSuite.parse(executeableSectionRegistry, api, yamlFile);
                 for (ClientYamlTestSection testSection : restTestSuite.getTestSections()) {
                     tests.add(new Object[]{ new ClientYamlTestCandidate(restTestSuite, testSection) });
                 }

+ 3 - 2
test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ClientYamlTestSuite.java

@@ -21,6 +21,7 @@ package org.elasticsearch.test.rest.yaml.section;
 import org.elasticsearch.common.ParsingException;
 import org.elasticsearch.common.xcontent.DeprecationHandler;
 import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.yaml.YamlXContent;
 
@@ -40,7 +41,7 @@ import java.util.TreeSet;
  * Supports a setup section and multiple test sections.
  */
 public class ClientYamlTestSuite {
-    public static ClientYamlTestSuite parse(String api, Path file) throws IOException {
+    public static ClientYamlTestSuite parse(NamedXContentRegistry executeableSectionRegistry, String api, Path file) throws IOException {
         if (!Files.isRegularFile(file)) {
             throw new IllegalArgumentException(file.toAbsolutePath() + " is not a file");
         }
@@ -64,7 +65,7 @@ public class ClientYamlTestSuite {
             }
         }
 
-        try (XContentParser parser = YamlXContent.yamlXContent.createParser(ExecutableSection.XCONTENT_REGISTRY,
+        try (XContentParser parser = YamlXContent.yamlXContent.createParser(executeableSectionRegistry,
             LoggingDeprecationHandler.INSTANCE, Files.newInputStream(file))) {
             return parse(api, filename, parser);
         } catch(Exception e) {

+ 12 - 3
test/framework/src/main/java/org/elasticsearch/test/rest/yaml/section/ExecutableSection.java

@@ -26,15 +26,18 @@ import org.elasticsearch.test.rest.yaml.ClientYamlTestExecutionContext;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
+
+import static java.util.Collections.unmodifiableList;
 
 /**
  * Represents a test fragment that can be executed (e.g. api call, assertion)
  */
 public interface ExecutableSection {
     /**
-     * {@link NamedXContentRegistry} needed in the {@link XContentParser} before calling {@link ExecutableSection#parse(XContentParser)}.
+     * Default list of {@link ExecutableSection}s available for tests.
      */
-    NamedXContentRegistry XCONTENT_REGISTRY = new NamedXContentRegistry(Arrays.asList(
+    List<NamedXContentRegistry.Entry> DEFAULT_EXECUTABLE_CONTEXTS = unmodifiableList(Arrays.asList(
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("do"), DoSection::parse),
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("set"), SetSection::parse),
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("match"), MatchAssertion::parse),
@@ -46,6 +49,12 @@ public interface ExecutableSection {
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("lte"), LessThanOrEqualToAssertion::parse),
             new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("length"), LengthAssertion::parse)));
 
+    /**
+     * {@link NamedXContentRegistry} that parses the default list of
+     * {@link ExecutableSection}s available for tests.
+     */
+    NamedXContentRegistry XCONTENT_REGISTRY = new NamedXContentRegistry(DEFAULT_EXECUTABLE_CONTEXTS);
+
     static ExecutableSection parse(XContentParser parser) throws IOException {
         ParserUtils.advanceToFieldName(parser);
         String section = parser.currentName();
@@ -60,7 +69,7 @@ public interface ExecutableSection {
     }
 
     /**
-     * Get the location in the test that this was defined. 
+     * Get the location in the test that this was defined.
      */
     XContentLocation getLocation();