5 years ago · bb66d594d1
--- a/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc
@@ -4,23 +4,157 @@
 
				 <titleabbrev>Pattern replace</titleabbrev>
			
 
				 ++++
			
 
				 
			
 
				-The `pattern_replace` token filter allows to easily handle string
			
 
				-replacements based on a regular expression. The regular expression is
			
 
				-defined using the `pattern` parameter, and the replacement string can be
			
 
				-provided using the `replacement` parameter (supporting referencing the
			
 
				-original text, as explained
			
 
				-http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]).
			
 
				+Uses a regular expression to match and replace token substrings.
			
 
				+
			
 
				+The `pattern_replace` filter uses
			
 
				+http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java's
			
 
				+regular expression syntax]. By default, the filter replaces matching
			
 
				+substrings with an empty substring (`""`).
			
 
				+
			
 
				+Regular expressions cannot be anchored to the
			
 
				+beginning or end of a token. Replacement substrings can use Java's
			
 
				+https://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#appendReplacement-java.lang.StringBuffer-java.lang.String-[`$g` syntax] to reference capture groups
			
 
				+from the original token text.
			
 
				 
			
 
				 [WARNING]
			
 
				-.Beware of Pathological Regular Expressions
			
 
				-========================================
			
 
				+====
			
 
				+A poorly-written regular expression may run slowly or return a
			
 
				+StackOverflowError, causing the node running the expression to exit suddenly.
			
 
				+
			
 
				+Read more about
			
 
				+http://www.regular-expressions.info/catastrophic.html[pathological regular
			
 
				+expressions and how to avoid them].
			
 
				+====
			
 
				+
			
 
				+This filter uses Lucene's
			
 
				+{lucene-analysis-docs}/pattern/PatternReplaceFilter.html[PatternReplaceFilter].
			
 
				+
			
 
				+[[analysis-pattern-replace-tokenfilter-analyze-ex]]
			
 
				+==== Example
			
 
				+
			
 
				+The following <<indices-analyze,analyze API>> request uses the `pattern_replace`
			
 
				+filter to prepend `watch` to the substring `dog` in `foxes jump lazy dogs`.
			
 
				+
			
 
				+[source,console]
			
 
				+----
			
 
				+GET /_analyze
			
 
				+{
			
 
				+  "tokenizer": "whitespace",
			
 
				+  "filter": [
			
 
				+    {
			
 
				+      "type": "pattern_replace",
			
 
				+      "pattern": "(dog)",
			
 
				+      "replacement": "watch$1"
			
 
				+    }
			
 
				+  ],
			
 
				+  "text": "foxes jump lazy dogs"
			
 
				+}
			
 
				+----
			
 
				+
			
 
				+The filter produces the following tokens.
			
 
				+
			
 
				+[source,text]
			
 
				+----
			
 
				+[ foxes, jump, lazy, watchdogs ]
			
 
				+----
			
 
				+
			
 
				+////
			
 
				+[source,console-result]
			
 
				+----
			
 
				+{
			
 
				+  "tokens": [
			
 
				+    {
			
 
				+      "token": "foxes",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 5,
			
 
				+      "type": "word",
			
 
				+      "position": 0
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "jump",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 10,
			
 
				+      "type": "word",
			
 
				+      "position": 1
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "lazy",
			
 
				+      "start_offset": 11,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "word",
			
 
				+      "position": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "watchdogs",
			
 
				+      "start_offset": 16,
			
 
				+      "end_offset": 20,
			
 
				+      "type": "word",
			
 
				+      "position": 3
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
 
				+----
			
 
				+////
			
 
				+
			
 
				+[[analysis-pattern-replace-tokenfilter-configure-parms]]
			
 
				+==== Configurable parameters
			
 
				+
			
 
				+`all`::
			
 
				+(Optional, boolean)
			
 
				+If `true`, all substrings matching the `pattern` parameter's regular expression
			
 
				+are replaced. If `false`, the filter replaces only the first matching substring
			
 
				+in each token. Defaults to `true`.
			
 
				+
			
 
				+`pattern`::
			
 
				+(Required, string)
			
 
				+Regular expression, written in
			
 
				+http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java's
			
 
				+regular expression syntax]. The filter replaces token substrings matching this
			
 
				+pattern with the substring in the `replacement` parameter.
			
 
				+
			
 
				+`replacement`::
			
 
				+(Optional, string)
			
 
				+Replacement substring. Defaults to an empty substring (`""`).
			
 
				+
			
 
				+[[analysis-pattern-replace-tokenfilter-customize]]
			
 
				+==== Customize and add to an analyzer
			
 
				 
			
 
				-The pattern replace token filter uses
			
 
				-http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java Regular Expressions].
			
 
				+To customize the `pattern_replace` filter, duplicate it to create the basis
			
 
				+for a new custom token filter. You can modify the filter using its configurable
			
 
				+parameters.
			
 
				 
			
 
				-A badly written regular expression could run very slowly or even throw a
			
 
				-StackOverflowError and cause the node it is running on to exit suddenly.
			
 
				+The following <<indices-create-index,create index API>> request
			
 
				+configures a new <<analysis-custom-analyzer,custom analyzer>> using a custom
			
 
				+`pattern_replace` filter, `my_pattern_replace_filter`.
			
 
				 
			
 
				-Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them].
			
 
				+The `my_pattern_replace_filter` filter uses the regular expression `[£|€]` to
			
 
				+match and remove the currency symbols `£` and `€`. The filter's `all`
			
 
				+parameter is `false`, meaning only the first matching symbol in each token is
			
 
				+removed.
			
 
				 
			
 
				-========================================
			
 
				+[source,console]
			
 
				+----
			
 
				+PUT /my_index
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "my_analyzer": {
			
 
				+          "tokenizer": "keyword",
			
 
				+          "filter": [
			
 
				+            "my_pattern_replace_filter"
			
 
				+          ]
			
 
				+        }
			
 
				+      },
			
 
				+      "filter": {
			
 
				+        "my_pattern_replace_filter": {
			
 
				+          "type": "pattern_replace",
			
 
				+          "pattern": "[£|€]",
			
 
				+          "replacement": "",
			
 
				+          "all": false
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+----