5 years ago · 98a64da87c
--- a/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc
+++ b/docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc
@@ -1,29 +1,44 @@
 
				 [[analysis-htmlstrip-charfilter]]
			
 
				-=== HTML Strip Char Filter
			
 
				+=== HTML strip character filter
			
 
				+++++
			
 
				+<titleabbrev>HTML strip</titleabbrev>
			
 
				+++++
			
 
				 
			
 
				-The `html_strip` character filter strips HTML elements from the text and
			
 
				-replaces HTML entities with their decoded value (e.g. replacing `&amp;` with
			
 
				-`&`).
			
 
				+Strips HTML elements from a text and replaces HTML entities with their decoded
			
 
				+value (e.g., replaces `&amp;` with `&`).
			
 
				 
			
 
				-[float]
			
 
				-=== Example output
			
 
				+The `html_strip` filter uses Lucene's
			
 
				+{lucene-analysis-docs}/charfilter/HTMLStripCharFilter.html[HTMLStripCharFilter].
			
 
				+
			
 
				+[[analysis-htmlstrip-charfilter-analyze-ex]]
			
 
				+==== Example
			
 
				+
			
 
				+The following <<indices-analyze,analyze API>> request uses the
			
 
				+`html_strip` filter to change the text `<p>I&apos;m so <b>happy</b>!</p>` to
			
 
				+`\nI'm so happy!\n`.
			
 
				 
			
 
				 [source,console]
			
 
				----------------------------
			
 
				-POST _analyze
			
 
				+----
			
 
				+GET /_analyze
			
 
				 {
			
 
				-  "tokenizer":      "keyword", <1>
			
 
				-  "char_filter":  [ "html_strip" ],
			
 
				+  "tokenizer": "keyword",
			
 
				+  "char_filter": [
			
 
				+    "html_strip"
			
 
				+  ],
			
 
				   "text": "<p>I&apos;m so <b>happy</b>!</p>"
			
 
				 }
			
 
				----------------------------
			
 
				+----
			
 
				 
			
 
				-<1> The <<analysis-keyword-tokenizer,`keyword` tokenizer>> returns a single term.
			
 
				+The filter produces the following text:
			
 
				 
			
 
				-/////////////////////
			
 
				+[source,text]
			
 
				+----
			
 
				+[ \nI'm so happy!\n ]
			
 
				+----
			
 
				 
			
 
				+////
			
 
				 [source,console-result]
			
 
				-----------------------------
			
 
				+----
			
 
				 {
			
 
				   "tokens": [
			
 
				     {
			
@@ -35,43 +50,60 @@ POST _analyze
 
				     }
			
 
				   ]
			
 
				 }
			
 
				-----------------------------
			
 
				-
			
 
				-/////////////////////
			
 
				-
			
 
				+----
			
 
				+////
			
 
				 
			
 
				-The above example returns the term:
			
 
				+[[analysis-htmlstrip-charfilter-analyzer-ex]]
			
 
				+==== Add to an analyzer
			
 
				 
			
 
				-[source,text]
			
 
				----------------------------
			
 
				-[ \nI'm so happy!\n ]
			
 
				----------------------------
			
 
				-
			
 
				-The same example with the `standard` tokenizer would return the following terms:
			
 
				+The following <<indices-create-index,create index API>> request uses the
			
 
				+`html_strip` filter to configure a new
			
 
				+<<analysis-custom-analyzer,custom analyzer>>.
			
 
				 
			
 
				-[source,text]
			
 
				----------------------------
			
 
				-[ I'm, so, happy ]
			
 
				----------------------------
			
 
				-
			
 
				-[float]
			
 
				-=== Configuration
			
 
				+[source,console]
			
 
				+----
			
 
				+PUT /my_index
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "my_analyzer": {
			
 
				+          "tokenizer": "keyword",
			
 
				+          "char_filter": [
			
 
				+            "html_strip"
			
 
				+          ]
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+----
			
 
				 
			
 
				-The `html_strip` character filter accepts the following parameter:
			
 
				+[[analysis-htmlstrip-charfilter-configure-parms]]
			
 
				+==== Configurable parameters
			
 
				 
			
 
				-[horizontal]
			
 
				 `escaped_tags`::
			
 
				+(Optional, array of strings)
			
 
				+Array of HTML elements without enclosing angle brackets (`< >`). The filter
			
 
				+skips these HTML elements when stripping HTML from the text. For example, a
			
 
				+value of `[ "p" ]` skips the `<p>` HTML element.
			
 
				 
			
 
				-    An array of HTML tags which should not be stripped from the original text.
			
 
				+[[analysis-htmlstrip-charfilter-customize]]
			
 
				+==== Customize
			
 
				 
			
 
				-[float]
			
 
				-=== Example configuration
			
 
				+To customize the `html_strip` filter, duplicate it to create the basis
			
 
				+for a new custom character filter. You can modify the filter using its configurable
			
 
				+parameters.
			
 
				 
			
 
				-In this example, we configure the `html_strip` character filter to leave `<b>`
			
 
				-tags in place:
			
 
				+The following <<indices-create-index,create index API>> request
			
 
				+configures a new <<analysis-custom-analyzer,custom analyzer>> using a custom
			
 
				+`html_strip` filter, `my_custom_html_strip_char_filter`.
			
 
				+
			
 
				+The `my_custom_html_strip_char_filter` filter skips the removal of the `<b>`
			
 
				+HTML element.
			
 
				 
			
 
				 [source,console]
			
 
				-----------------------------
			
 
				+----
			
 
				 PUT my_index
			
 
				 {
			
 
				   "settings": {
			
@@ -79,49 +111,20 @@ PUT my_index
 
				       "analyzer": {
			
 
				         "my_analyzer": {
			
 
				           "tokenizer": "keyword",
			
 
				-          "char_filter": ["my_char_filter"]
			
 
				+          "char_filter": [
			
 
				+            "my_custom_html_strip_char_filter"
			
 
				+          ]
			
 
				         }
			
 
				       },
			
 
				       "char_filter": {
			
 
				-        "my_char_filter": {
			
 
				+        "my_custom_html_strip_char_filter": {
			
 
				           "type": "html_strip",
			
 
				-          "escaped_tags": ["b"]
			
 
				+          "escaped_tags": [
			
 
				+            "b"
			
 
				+          ]
			
 
				         }
			
 
				       }
			
 
				     }
			
 
				   }
			
 
				 }
			
 
				-
			
 
				-POST my_index/_analyze
			
 
				-{
			
 
				-  "analyzer": "my_analyzer",
			
 
				-  "text": "<p>I&apos;m so <b>happy</b>!</p>"
			
 
				-}
			
 
				-----------------------------
			
 
				-
			
 
				-/////////////////////
			
 
				-
			
 
				-[source,console-result]
			
 
				-----------------------------
			
 
				-{
			
 
				-  "tokens": [
			
 
				-    {
			
 
				-      "token": "\nI'm so <b>happy</b>!\n",
			
 
				-      "start_offset": 0,
			
 
				-      "end_offset": 32,
			
 
				-      "type": "word",
			
 
				-      "position": 0
			
 
				-    }
			
 
				-  ]
			
 
				-}
			
 
				-----------------------------
			
 
				-
			
 
				-/////////////////////
			
 
				-
			
 
				-
			
 
				-The above example produces the following term:
			
 
				-
			
 
				-[source,text]
			
 
				----------------------------
			
 
				-[ \nI'm so <b>happy</b>!\n ]
			
 
				----------------------------
			
 
				+----