10 years ago · 3a69b65e88
--- a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc
@@ -7,16 +7,13 @@ via a regular expression. Accepts the following settings:
 
				 The following are settings that can be set for a `pattern` analyzer
			
 
				 type:
			
 
				 
			
 
				-[cols="<,<",options="header",]
			
 
				-|===================================================================
			
 
				-|Setting |Description
			
 
				-|`lowercase` |Should terms be lowercased or not. Defaults to `true`.
			
 
				-|`pattern` |The regular expression pattern, defaults to `\W+`.
			
 
				-|`flags` |The regular expression flags.
			
 
				-|`stopwords` |A list of stopwords to initialize the stop filter with.
			
 
				-Defaults to an 'empty' stopword list Check
			
 
				-<<analysis-stop-analyzer,Stop Analyzer>> for more details.
			
 
				-|===================================================================
			
 
				+[horizontal]
			
 
				+`lowercase`::   Should terms be lowercased or not. Defaults to `true`.
			
 
				+`pattern`::     The regular expression pattern, defaults to `\W+`.
			
 
				+`flags`::       The regular expression flags.
			
 
				+`stopwords`::   A list of stopwords to initialize the stop filter with.
			
 
				+                Defaults to an 'empty' stopword list Check
			
 
				+                <<analysis-stop-analyzer,Stop Analyzer>> for more details.
			
 
				 
			
 
				 *IMPORTANT*: The regular expression should match the *token separators*,
			
 
				 not the tokens themselves.
			
@@ -29,101 +26,103 @@ Pattern API] for more details about `flags` options.
 
				 ==== Pattern Analyzer Examples
			
 
				 
			
 
				 In order to try out these examples, you should delete the `test` index
			
 
				-before running each example:
			
 
				-
			
 
				-[source,js]
			
 
				---------------------------------------------------
			
 
				-    curl -XDELETE localhost:9200/test
			
 
				---------------------------------------------------
			
 
				+before running each example.
			
 
				 
			
 
				 [float]
			
 
				 ===== Whitespace tokenizer
			
 
				 
			
 
				 [source,js]
			
 
				 --------------------------------------------------
			
 
				-    curl -XPUT 'localhost:9200/test' -d '
			
 
				-    {
			
 
				-        "settings":{
			
 
				-            "analysis": {
			
 
				-                "analyzer": {
			
 
				-                    "whitespace":{
			
 
				-                        "type": "pattern",
			
 
				-                        "pattern":"\\\\s+"
			
 
				-                    }
			
 
				-                }
			
 
				-            }
			
 
				+DELETE test
			
 
				+
			
 
				+PUT /test
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "whitespace": {
			
 
				+          "type": "pattern",
			
 
				+          "pattern": "\\s+"
			
 
				         }
			
 
				-    }'
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				 
			
 
				-    curl 'localhost:9200/test/_analyze?pretty=1&analyzer=whitespace' -d 'foo,bar baz'
			
 
				-    # "foo,bar", "baz"
			
 
				+GET /test/_analyze?analyzer=whitespace&text=foo,bar baz
			
 
				+
			
 
				+# "foo,bar", "baz"
			
 
				 --------------------------------------------------
			
 
				+// AUTOSENSE
			
 
				 
			
 
				 [float]
			
 
				 ===== Non-word character tokenizer
			
 
				 
			
 
				 [source,js]
			
 
				 --------------------------------------------------
			
 
				-
			
 
				-    curl -XPUT 'localhost:9200/test' -d '
			
 
				-    {
			
 
				-        "settings":{
			
 
				-            "analysis": {
			
 
				-                "analyzer": {
			
 
				-                    "nonword":{
			
 
				-                        "type": "pattern",
			
 
				-                        "pattern":"[^\\\\w]+"
			
 
				-                    }
			
 
				-                }
			
 
				-            }
			
 
				+DELETE test
			
 
				+
			
 
				+PUT /test
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "nonword": {
			
 
				+          "type": "pattern",
			
 
				+          "pattern": "[^\\w]+" <1>
			
 
				         }
			
 
				-    }'
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				 
			
 
				-    curl 'localhost:9200/test/_analyze?pretty=1&analyzer=nonword' -d 'foo,bar baz'
			
 
				-    # "foo,bar baz" becomes "foo", "bar", "baz"
			
 
				+GET /test/_analyze?analyzer=nonword&text=foo,bar baz
			
 
				+# "foo,bar baz" becomes "foo", "bar", "baz"
			
 
				 
			
 
				-    curl 'localhost:9200/test/_analyze?pretty=1&analyzer=nonword' -d 'type_1-type_4'
			
 
				-    # "type_1","type_4"
			
 
				+GET /test/_analyze?analyzer=nonword&text=type_1-type_4
			
 
				+# "type_1","type_4"
			
 
				 --------------------------------------------------
			
 
				+// AUTOSENSE
			
 
				+
			
 
				 
			
 
				 [float]
			
 
				 ===== CamelCase tokenizer
			
 
				 
			
 
				 [source,js]
			
 
				 --------------------------------------------------
			
 
				-
			
 
				-    curl -XPUT 'localhost:9200/test?pretty=1' -d '
			
 
				-    {
			
 
				-        "settings":{
			
 
				-            "analysis": {
			
 
				-                "analyzer": {
			
 
				-                    "camel":{
			
 
				-                        "type": "pattern",
			
 
				-                        "pattern":"([^\\\\p{L}\\\\d]+)|(?<=\\\\D)(?=\\\\d)|(?<=\\\\d)(?=\\\\D)|(?<=[\\\\p{L}&&[^\\\\p{Lu}]])(?=\\\\p{Lu})|(?<=\\\\p{Lu})(?=\\\\p{Lu}[\\\\p{L}&&[^\\\\p{Lu}]])"
			
 
				-                    }
			
 
				-                }
			
 
				-            }
			
 
				+DELETE test
			
 
				+
			
 
				+PUT /test?pretty=1
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "camel": {
			
 
				+          "type": "pattern",
			
 
				+          "pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
			
 
				         }
			
 
				-    }'
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				 
			
 
				-    curl 'localhost:9200/test/_analyze?pretty=1&analyzer=camel' -d '
			
 
				-        MooseX::FTPClass2_beta
			
 
				-    '
			
 
				-    # "moose","x","ftp","class","2","beta"
			
 
				+GET /test/_analyze?analyzer=camel&text=MooseX::FTPClass2_beta
			
 
				+# "moose","x","ftp","class","2","beta"
			
 
				 --------------------------------------------------
			
 
				+// AUTOSENSE
			
 
				 
			
 
				 The regex above is easier to understand as:
			
 
				 
			
 
				 [source,js]
			
 
				 --------------------------------------------------
			
 
				 
			
 
				-      ([^\\p{L}\\d]+)                 # swallow non letters and numbers,
			
 
				-    | (?<=\\D)(?=\\d)                 # or non-number followed by number,
			
 
				-    | (?<=\\d)(?=\\D)                 # or number followed by non-number,
			
 
				-    | (?<=[ \\p{L} && [^\\p{Lu}]])    # or lower case
			
 
				-      (?=\\p{Lu})                    #   followed by upper case,
			
 
				-    | (?<=\\p{Lu})                   # or upper case
			
 
				-      (?=\\p{Lu}                     #   followed by upper case
			
 
				-        [\\p{L}&&[^\\p{Lu}]]          #   then lower case
			
 
				-      )
			
 
				+  ([^\p{L}\d]+)                 # swallow non letters and numbers,
			
 
				+| (?<=\D)(?=\d)                 # or non-number followed by number,
			
 
				+| (?<=\d)(?=\D)                 # or number followed by non-number,
			
 
				+| (?<=[ \p{L} && [^\p{Lu}]])    # or lower case
			
 
				+  (?=\p{Lu})                    #   followed by upper case,
			
 
				+| (?<=\p{Lu})                   # or upper case
			
 
				+  (?=\p{Lu}                     #   followed by upper case
			
 
				+    [\p{L}&&[^\p{Lu}]]          #   then lower case
			
 
				+  )
			
 
				 --------------------------------------------------