
New plugin - Annotated_text field type (#30364)

New plugin for the annotated_text field type.
Largely a copy of the `text` field type, but adds the ability to include markdown-like syntax in the text.
The “AnnotatedText” class parses text+markup and converts it into plain text and AnnotationTokens.
The annotation token values are injected unchanged alongside the regular text tokens to provide a
form of additional indexed overlay useful in positional searches and highlighting.
Annotated_text fields do not support fielddata as we want to phase this out.
Also includes a new "annotated" highlighter type that retains annotations and merges in search
hits as additional annotation markup.
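For example, indexing "Investors in [Apple](Apple+Inc.) rejoiced." adds the single token
"Apple Inc." to the index at the same position as the text token "apple".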

Closes #29467
markharwood 7 years ago
parent commit 2fa09f062e
18 changed files with 2523 additions and 30 deletions
  1. docs/plugins/mapper-annotated-text.asciidoc (+328 -0)
  2. docs/plugins/mapper.asciidoc (+8 -0)
  3. docs/reference/cat/plugins.asciidoc (+1 -0)
  4. docs/reference/mapping/types.asciidoc (+1 -0)
  5. plugins/mapper-annotated-text/build.gradle (+23 -0)
  6. plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java (+776 -0)
  7. plugins/mapper-annotated-text/src/main/java/org/elasticsearch/plugin/mapper/AnnotatedTextPlugin.java (+44 -0)
  8. plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java (+201 -0)
  9. plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java (+64 -0)
  10. plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextClientYamlTestSuiteIT.java (+39 -0)
  11. plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java (+681 -0)
  12. plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextParsingTests.java (+73 -0)
  13. plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java (+185 -0)
  14. plugins/mapper-annotated-text/src/test/resources/rest-api-spec/test/mapper_annotatedtext/10_basic.yml (+44 -0)
  15. qa/vagrant/src/test/resources/packaging/tests/module_and_plugin_test_cases.bash (+8 -0)
  16. server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java (+16 -3)
  17. server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java (+1 -2)
  18. server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java (+30 -25)

+ 328 - 0
docs/plugins/mapper-annotated-text.asciidoc

@@ -0,0 +1,328 @@
+[[mapper-annotated-text]]
+=== Mapper Annotated Text Plugin
+
+experimental[]
+
+The mapper-annotated-text plugin provides the ability to index text that is a
+combination of free-text and special markup that is typically used to identify
+items of interest such as people or organisations (see NER or Named Entity Recognition
+tools). 
+
+
+The Elasticsearch markup allows one or more additional tokens to be injected, unchanged, into the token
+stream at the same position as the underlying text it annotates.
+
+:plugin_name: mapper-annotated-text
+include::install_remove.asciidoc[]
+
+[[mapper-annotated-text-usage]]
+==== Using the `annotated-text` field
+
+The `annotated-text` field tokenizes text content as per the more common `text` field (see
+"limitations" below) but also injects any marked-up annotation tokens directly into
+the search index:
+
+[source,js]
+--------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_field": {
+          "type": "annotated_text"
+        }
+      }
+    }
+  }
+}
+--------------------------
+// CONSOLE
+
+Such a mapping would allow marked-up text, e.g. Wikipedia articles, to be indexed as both text
+and structured tokens. The annotations use a markdown-like syntax with URL encoding of
+one or more values separated by the `&` symbol.
+
+
+We can use the `_analyze` API to test how an example annotation would be stored as tokens
+in the search index:
+
+
+[source,js]
+--------------------------
+GET my_index/_analyze
+{
+  "field": "my_field", 
+  "text":"Investors in [Apple](Apple+Inc.) rejoiced."
+}
+--------------------------
+// NOTCONSOLE
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "investors",
+      "start_offset": 0,
+      "end_offset": 9,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "in",
+      "start_offset": 10,
+      "end_offset": 12,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "Apple Inc.", <1> 
+      "start_offset": 13,
+      "end_offset": 18,
+      "type": "annotation",
+      "position": 2
+    },
+    {
+      "token": "apple",
+      "start_offset": 13,
+      "end_offset": 18,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "rejoiced",
+      "start_offset": 19,
+      "end_offset": 27,
+      "type": "<ALPHANUM>",
+      "position": 3
+    }
+  ]
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+<1> Note the whole annotation token `Apple Inc.` is placed, unchanged as a single token in
+the token stream and at the same position (position 2) as the text token (`apple`) it annotates.
+
+
+We can now perform searches for annotations using regular `term` queries that don't tokenize
+the provided search values. Annotations are a more precise way of matching, as can be seen
+in this example where a search for `Beck` will not match `Jeff Beck`:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/_doc/1
+{
+  "my_field": "[Beck](Beck) announced a new tour"<2>
+}
+
+PUT my_index/_doc/2
+{
+  "my_field": "[Jeff Beck](Jeff+Beck&Guitarist) plays a strat"<1>
+}
+
+# Example search
+GET my_index/_search
+{
+  "query": {
+    "term": {
+        "my_field": "Beck" <3>
+    }
+  }
+}
+--------------------------
+// CONSOLE
+
+<1> As well as tokenising the plain text into single words e.g. `beck`, here we
+inject the single token value `Beck` at the same position as `beck` in the token stream.
+<2> Note annotations can inject multiple tokens at the same position - here we inject both
+the very specific value `Jeff Beck` and the broader term `Guitarist`. This enables
+broader positional queries e.g. finding mentions of a `Guitarist` near to `strat` (see the
+sketch below).
+<3> A benefit of searching with these carefully defined annotation tokens is that a query for
+`Beck` will not match document 2, which contains the tokens `jeff`, `beck` and `Jeff Beck`.
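+
+As a sketch of such a positional query (not part of the documented examples; the field and
+values come from the documents above), a `span_near` query can combine an exact-match
+`span_term` on an annotation token with one on a regular text token. `span_term` does not
+analyze its value, so it matches the injected `Guitarist` token exactly:
+
+[source,js]
+--------------------------
+GET my_index/_search
+{
+  "query": {
+    "span_near": {
+      "clauses": [
+        { "span_term": { "my_field": "Guitarist" } },
+        { "span_term": { "my_field": "strat" } }
+      ],
+      "slop": 5,
+      "in_order": false
+    }
+  }
+}
+--------------------------
+// NOTCONSOLE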
+
+WARNING: Any use of `=` signs in annotation values e.g. `[Prince](person=Prince)` will
+cause the document to be rejected with a parse failure. In future we hope to have a use for
+the equals signs so we will actively reject documents that contain this today.
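+
+For example, a hypothetical request like this would be rejected:
+
+[source,js]
+--------------------------
+PUT my_index/_doc/3
+{
+  "my_field": "[Prince](person=Prince) performed"
+}
+--------------------------
+// NOTCONSOLE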
+
+
+[[mapper-annotated-text-tips]]
+==== Data modelling tips
+===== Use structured and unstructured fields
+
+Annotations are normally a way of weaving structured information into unstructured text for
+higher-precision search.
+
+`Entity resolution` is a form of document enrichment undertaken by specialist software or people 
+where references to entities in a document are disambiguated by attaching a canonical ID.
+The ID is used to resolve any number of aliases or distinguish between people with the
+same name. The hyperlinks connecting Wikipedia's articles are a good example of resolved 
+entity IDs woven into text. 
+
+These IDs can be embedded as annotations in an annotated_text field but it often makes 
+sense to include them in dedicated structured fields to support discovery via aggregations:
+
+[source,js]
+--------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_unstructured_text_field": {
+          "type": "annotated_text"
+        },
+        "my_structured_people_field": {
+          "type": "text",
+          "fields": {
+          	"keyword" :{
+          	  "type": "keyword"
+          	}
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------
+// CONSOLE
+
+Applications would then typically provide content and discover it as follows:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/_doc/1
+{
+  "my_unstructured_text_field": "[Shay](%40kimchy) created elasticsearch",
+  "my_twitter_handles": ["@kimchy"] <1>
+}
+
+GET my_index/_search
+{
+  "query": {
+    "query_string": {
+        "query": "elasticsearch OR logstash OR kibana",<2>
+        "default_field": "my_unstructured_text_field"
+    }
+  },
+  "aggregations": {
+  	"top_people" :{
+  	    "significant_terms" : { <3>
+	       "field" : "my_twitter_handles.keyword"
+  	    }
+  	}
+  }
+}
+--------------------------
+// CONSOLE
+
+<1> Note the `my_twitter_handles` field contains a list of the annotation values
+also used in the unstructured text (note the annotated_text syntax requires URL
+escaping, hence `%40kimchy` for `@kimchy`).
+By repeating the annotation values in a structured field this application has ensured that
+the tokens discovered in the structured field can be used for search and highlighting
+in the unstructured field.
+<2> In this example we search for documents that talk about components of the elastic stack.
+<3> We use the `my_twitter_handles` field here to discover people who are significantly
+associated with the elastic stack.
+
+===== Avoiding over-matching annotations
+By design, the regular text tokens and the annotation tokens co-exist in the same indexed 
+field but in rare cases this can lead to some over-matching.
+
+The value of an annotation often denotes a _named entity_ (a person, place or company).
+The tokens for these named entities are inserted untokenized, and differ from typical text 
+tokens because they are normally:
+
+* Mixed case e.g. `Madonna`
+* Multiple words e.g. `Jeff Beck`
+* Can have punctuation or numbers e.g. `Apple Inc.` or `@kimchy`
+
+This means, for the most part, a search for a named entity in the annotated text field will
+not have any false positives e.g. when selecting `Apple Inc.` from an aggregation result 
+you can drill down to highlight uses in the text without "over matching" on any text tokens 
+like the word `apple` in this context:
+
+    the apple was very juicy
+    
+However, a problem arises if your named entity happens to be a single term and lower-case e.g. the 
+company `elastic`. In this case, a search on the annotated text field for the token `elastic`
+may match a text document such as this:
+
+    he fired an elastic band
+
+To avoid such false matches users should consider prefixing annotation values to ensure
+they don't clash with text tokens e.g.
+
+    [elastic](Company_elastic) released version 7.0 of the elastic stack today
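+
+A regular `term` query can then target the prefixed value without matching the text token
+`elastic` (a sketch reusing the hypothetical `Company_elastic` value above):
+
+[source,js]
+--------------------------
+GET my_index/_search
+{
+  "query": {
+    "term": {
+      "my_field": "Company_elastic"
+    }
+  }
+}
+--------------------------
+// NOTCONSOLE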
+
+
+
+
+[[mapper-annotated-text-highlighter]]
+==== Using the `annotated` highlighter
+
+The `annotated-text` plugin includes a custom highlighter designed to mark up search hits
+in a way which is respectful of the original markup:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/_doc/1
+{
+  "my_field": "The cat sat on the [mat](sku3578)"
+}
+
+GET my_index/_search
+{
+  "query": {
+    "query_string": {
+        "query": "cats" 
+    }
+  },
+  "highlight": {
+    "fields": {
+      "my_field": {
+        "type": "annotated", <1>
+        "require_field_match": false
+      }
+    }
+  }
+}
+--------------------------
+// CONSOLE
+<1> The `annotated` highlighter type is designed for use with annotated_text fields
+
+The annotated highlighter is based on the `unified` highlighter and supports the same
+settings but does not use the `pre_tags` or `post_tags` parameters. Rather than using
+html-like markup such as `<em>cat</em>`, the annotated highlighter uses the same
+markdown-like syntax used for annotations and injects a key=value annotation where `_hit_term`
+is the key and the matched search term is the value e.g.
+
+    The [cat](_hit_term=cat) sat on the [mat](sku3578)
+
+The annotated highlighter tries to be respectful of any existing markup in the original 
+text:
+
+* If the search term matches exactly the location of an existing annotation then the 
+`_hit_term` key is merged into the url-like syntax used in the `(...)` part of the
+existing annotation. 
+* However, if the search term overlaps the span of an existing annotation it would break
+the markup formatting, so the original annotation is removed in favour of a new annotation
+with just the search hit information in the results.
+* Any non-overlapping annotations in the original text are preserved in highlighter
+selections.
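+
+For example, given the document indexed in the request above, a search for `mat` should
+produce a fragment like this (an illustration based on the merge rule above, not captured
+output):
+
+    The cat sat on the [mat](_hit_term=mat&sku3578)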
+
+
+[[mapper-annotated-text-limitations]]
+==== Limitations
+
+The annotated_text field type supports the same mapping settings as the `text` field type
+but with the following exceptions:
+
+* No support for `fielddata` or `fielddata_frequency_filter`
+* No support for `index_prefixes` or `index_phrases` indexing

+ 8 - 0
docs/plugins/mapper.asciidoc

@@ -19,5 +19,13 @@ indexes the size in bytes of the original
 The mapper-murmur3 plugin allows hashes to be computed at index-time and stored
 in the index for later use with the `cardinality` aggregation.
 
+<<mapper-annotated-text>>::
+
+The annotated text plugin provides the ability to index text that is a
+combination of free-text and special markup that is typically used to identify
+items of interest such as people or organisations (see NER or Named Entity Recognition
+tools).
+
 include::mapper-size.asciidoc[]
 include::mapper-murmur3.asciidoc[]
+include::mapper-annotated-text.asciidoc[]

+ 1 - 0
docs/reference/cat/plugins.asciidoc

@@ -28,6 +28,7 @@ U7321H6 discovery-gce           {version} The Google Compute Engine (GCE) Discov
 U7321H6 ingest-attachment       {version} Ingest processor that uses Apache Tika to extract contents
 U7321H6 ingest-geoip            {version} Ingest processor that looks up geo data based on IP addresses using the Maxmind geo database
 U7321H6 ingest-user-agent       {version} Ingest processor that extracts information from a user agent
+U7321H6 mapper-annotated-text   {version} The Mapper Annotated_text plugin adds support for text fields with markup used to inject annotation tokens into the index.
 U7321H6 mapper-murmur3          {version} The Mapper Murmur3 plugin allows to compute hashes of a field's values at index-time and to store them in the index.
 U7321H6 mapper-size             {version} The Mapper Size plugin allows document to record their uncompressed size at index time.
 U7321H6 store-smb               {version} The Store SMB plugin adds support for SMB stores.

+ 1 - 0
docs/reference/mapping/types.asciidoc

@@ -35,6 +35,7 @@ string::        <<text,`text`>> and <<keyword,`keyword`>>
                     `completion` to provide auto-complete suggestions
 <<token-count>>::   `token_count` to count the number of tokens in a string
 {plugins}/mapper-murmur3.html[`mapper-murmur3`]:: `murmur3` to compute hashes of values at index-time and store them in the index
+{plugins}/mapper-annotated-text.html[`mapper-annotated-text`]:: `annotated-text` to index text containing special markup (typically used for identifying named entities)
 
 <<percolator>>::    Accepts queries from the query-dsl
 

+ 23 - 0
plugins/mapper-annotated-text/build.gradle

@@ -0,0 +1,23 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+esplugin {
+  description 'The Mapper Annotated_text plugin adds support for text fields with markup used to inject annotation tokens into the index.'
+  classname 'org.elasticsearch.plugin.mapper.AnnotatedTextPlugin'
+}

+ 776 - 0
plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java

@@ -0,0 +1,776 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.annotatedtext;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
+import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.NormsFieldExistsQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.support.XContentMapValues;
+import org.elasticsearch.index.analysis.AnalyzerScope;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
+import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.Mapper;
+import org.elasticsearch.index.mapper.MapperParsingException;
+import org.elasticsearch.index.mapper.ParseContext;
+import org.elasticsearch.index.mapper.StringFieldType;
+import org.elasticsearch.index.mapper.TextFieldMapper;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;
+import org.elasticsearch.index.query.QueryShardContext;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.UncheckedIOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.elasticsearch.index.mapper.TypeParsers.parseTextField;
+
+/** A {@link FieldMapper} for full-text fields with annotation markup e.g.
+ * 
+ *    "New mayor is [John Smith](type=person&amp;value=John%20Smith) "
+ * 
+ * A special Analyzer wraps the default choice of analyzer in order
+ * to strip the text field of annotation markup and inject the related
+ * entity annotation tokens as supplementary tokens at the relevant points
+ * in the token stream.
+ * This code is largely a copy of TextFieldMapper which is less than ideal - 
+ * my attempts to subclass TextFieldMapper failed but we can revisit this.
+ **/
+public class AnnotatedTextFieldMapper extends FieldMapper {
+
+    public static final String CONTENT_TYPE = "annotated_text";
+    private static final int POSITION_INCREMENT_GAP_USE_ANALYZER = -1;
+
+    public static class Defaults {
+        public static final MappedFieldType FIELD_TYPE = new AnnotatedTextFieldType();
+        static {
+            FIELD_TYPE.freeze();
+        }
+    }
+
+    public static class Builder extends FieldMapper.Builder<Builder, AnnotatedTextFieldMapper> {
+
+        private int positionIncrementGap = POSITION_INCREMENT_GAP_USE_ANALYZER;
+        
+        public Builder(String name) {
+            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
+            builder = this;
+        }
+
+        @Override
+        public AnnotatedTextFieldType fieldType() {
+            return (AnnotatedTextFieldType) super.fieldType();
+        }
+
+        public Builder positionIncrementGap(int positionIncrementGap) {
+            if (positionIncrementGap < 0) {
+                throw new MapperParsingException("[positions_increment_gap] must be positive, got " + positionIncrementGap);
+            }
+            this.positionIncrementGap = positionIncrementGap;
+            return this;
+        }
+        
+        @Override
+        public Builder docValues(boolean docValues) {
+            if (docValues) {
+                throw new IllegalArgumentException("[" + CONTENT_TYPE + "] fields do not support doc values");
+            }
+            return super.docValues(docValues);
+        }
+
+        @Override
+        public AnnotatedTextFieldMapper build(BuilderContext context) {
+            if (fieldType().indexOptions() == IndexOptions.NONE ) {
+                throw new IllegalArgumentException("[" + CONTENT_TYPE + "] fields must be indexed");
+            }
+            if (positionIncrementGap != POSITION_INCREMENT_GAP_USE_ANALYZER) {
+                if (fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+                    throw new IllegalArgumentException("Cannot set position_increment_gap on field ["
+                        + name + "] without positions enabled");
+                }
+                fieldType.setIndexAnalyzer(new NamedAnalyzer(fieldType.indexAnalyzer(), positionIncrementGap));
+                fieldType.setSearchAnalyzer(new NamedAnalyzer(fieldType.searchAnalyzer(), positionIncrementGap));
+                fieldType.setSearchQuoteAnalyzer(new NamedAnalyzer(fieldType.searchQuoteAnalyzer(), positionIncrementGap));
+            } else {
+                //Using the analyzer's default BUT need to do the same thing AnalysisRegistry.processAnalyzerFactory 
+                // does to splice in new default of posIncGap=100 by wrapping the analyzer                
+                if (fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
+                    int overrideInc = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
+                    fieldType.setIndexAnalyzer(new NamedAnalyzer(fieldType.indexAnalyzer(), overrideInc));
+                    fieldType.setSearchAnalyzer(new NamedAnalyzer(fieldType.searchAnalyzer(), overrideInc));
+                    fieldType.setSearchQuoteAnalyzer(new NamedAnalyzer(fieldType.searchQuoteAnalyzer(),overrideInc));
+                }
+            }
+            setupFieldType(context);
+            return new AnnotatedTextFieldMapper(
+                    name, fieldType(), defaultFieldType, positionIncrementGap,
+                    context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
+        }
+    }
+
+    public static class TypeParser implements Mapper.TypeParser {
+        @Override
+        public Mapper.Builder<AnnotatedTextFieldMapper.Builder, AnnotatedTextFieldMapper> parse(
+                String fieldName, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
+            AnnotatedTextFieldMapper.Builder builder = new AnnotatedTextFieldMapper.Builder(fieldName);
+            
+            builder.fieldType().setIndexAnalyzer(parserContext.getIndexAnalyzers().getDefaultIndexAnalyzer());
+            builder.fieldType().setSearchAnalyzer(parserContext.getIndexAnalyzers().getDefaultSearchAnalyzer());
+            builder.fieldType().setSearchQuoteAnalyzer(parserContext.getIndexAnalyzers().getDefaultSearchQuoteAnalyzer());
+            parseTextField(builder, fieldName, node, parserContext);
+            for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
+                Map.Entry<String, Object> entry = iterator.next();
+                String propName = entry.getKey();
+                Object propNode = entry.getValue();
+                if (propName.equals("position_increment_gap")) {
+                    int newPositionIncrementGap = XContentMapValues.nodeIntegerValue(propNode, -1);
+                    builder.positionIncrementGap(newPositionIncrementGap);
+                    iterator.remove();
+                }
+            }
+            return builder;
+        }
+    }
+
+    
+    /**
+     * Parses markdown-like syntax into plain text and AnnotationTokens with offsets for
+     * annotations found in texts
+     */
+    public static final class AnnotatedText {
+        public final String textPlusMarkup;
+        public final String textMinusMarkup;
+        List<AnnotationToken> annotations;
+        
+        // Format is markdown-like syntax for URLs eg:
+        //   "New mayor is [John Smith](type=person&value=John%20Smith) "
+        static Pattern markdownPattern = Pattern.compile("\\[([^\\]\\[]*)\\]\\(([^\\)\\(]*)\\)");  
+        
+        public static AnnotatedText parse(String textPlusMarkup) {
+            List<AnnotationToken> annotations = new ArrayList<>();
+            Matcher m = markdownPattern.matcher(textPlusMarkup);
+            int lastPos = 0;
+            StringBuilder sb = new StringBuilder();
+            while (m.find()) {
+                if (m.start() > lastPos) {
+                    sb.append(textPlusMarkup.substring(lastPos, m.start()));
+                }
+
+                int startOffset = sb.length();
+                int endOffset = sb.length() + m.group(1).length();
+                sb.append(m.group(1));
+                lastPos = m.end();
+
+                String[] pairs = m.group(2).split("&");
+                String value = null;
+                for (String pair : pairs) {
+                    String[] kv = pair.split("=");
+                    try {
+                        if (kv.length == 2) {
+                            throw new ElasticsearchParseException("key=value pairs are not supported in annotations");
+                        }
+                        if (kv.length == 1) {
+                            // Check the "=" sign wasn't in the pair string
+                            if (kv[0].length() == pair.length()) {
+                                // untyped value
+                                value = URLDecoder.decode(kv[0], "UTF-8");
+                            }
+                        }
+                        if (value != null && value.length() > 0) {
+                            annotations.add(new AnnotationToken(startOffset, endOffset, value));
+                        }
+                    } catch (UnsupportedEncodingException uee) {
+                        throw new ElasticsearchParseException("Unsupported encoding parsing annotated text", uee);
+                    }
+                }
+            }
+            if (lastPos < textPlusMarkup.length()) {
+                sb.append(textPlusMarkup.substring(lastPos));
+            }
+            return new AnnotatedText(sb.toString(), textPlusMarkup, annotations);
+        }
+
+        protected AnnotatedText(String textMinusMarkup, String textPlusMarkup, List<AnnotationToken> annotations) {
+            this.textMinusMarkup = textMinusMarkup;
+            this.textPlusMarkup = textPlusMarkup;
+            this.annotations = annotations;    
+        }
+        
+        public static final class AnnotationToken {
+            public final int offset;
+            public final int endOffset;
+            
+            public final String value;
+            public AnnotationToken(int offset, int endOffset, String value) {
+                this.offset = offset;
+                this.endOffset = endOffset;
+                this.value = value;
+            }
+            @Override
+            public String toString() {
+               return value +" ("+offset+" - "+endOffset+")";
+            }
+            
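+            // True if [start, end] overlaps [offset, endOffset], whether partially or by containment.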
+            public boolean intersects(int start, int end) {
+                return (start <= offset && end >= offset) || (start <= endOffset && end >= endOffset)
+                        || (start >= offset && end <= endOffset);
+            }
+            
+            @Override
+            public int hashCode() {
+                final int prime = 31;
+                int result = 1;
+                result = prime * result + endOffset;
+                result = prime * result + offset;
+                result = prime * result + Objects.hashCode(value);
+                return result;
+            }
+            
+            @Override
+            public boolean equals(Object obj) {
+                if (this == obj)
+                    return true;
+                if (obj == null)
+                    return false;
+                if (getClass() != obj.getClass())
+                    return false;
+                AnnotationToken other = (AnnotationToken) obj;
+                return Objects.equals(endOffset, other.endOffset) && Objects.equals(offset, other.offset)
+                        && Objects.equals(value, other.value);
+            }
+            
+        }
+        
+        @Override
+        public String toString() {
+           StringBuilder sb = new StringBuilder();
+           sb.append(textMinusMarkup);
+           sb.append("\n");
+           annotations.forEach(a -> {sb.append(a); sb.append("\n");});
+           return sb.toString();
+        }
+
+        public int numAnnotations() {
+            return annotations.size();
+        }
+
+        public AnnotationToken getAnnotation(int index) {
+            return annotations.get(index);
+        }   
+    }
+    
+    // A utility class for use with highlighters where the content being highlighted 
+    // needs plain text format for highlighting but marked-up format for token discovery.
+    // The class takes markedup format field values and returns plain text versions.
+    // When asked to tokenize plain-text versions by the highlighter it tokenizes the
+    // original markup form in order to inject annotations.
+    public static final class AnnotatedHighlighterAnalyzer extends AnalyzerWrapper {
+        private Analyzer delegate;
+        private AnnotatedText[] annotations;
+        public AnnotatedHighlighterAnalyzer(Analyzer delegate){
+            super(delegate.getReuseStrategy());
+            this.delegate = delegate;
+        }
+
+        public void init(String[] markedUpFieldValues) {
+            this.annotations = new AnnotatedText[markedUpFieldValues.length];
+            for (int i = 0; i < markedUpFieldValues.length; i++) {
+                annotations[i] = AnnotatedText.parse(markedUpFieldValues[i]);
+            }
+        }
+        
+        public String[] getPlainTextValuesForHighlighter() {
+            String[] result = new String[annotations.length];
+            for (int i = 0; i < annotations.length; i++) {
+                result[i] = annotations[i].textMinusMarkup;
+            }
+            return result;
+        }
+        
+        public AnnotationToken[] getIntersectingAnnotations(int start, int end) {
+            List<AnnotationToken> intersectingAnnotations = new ArrayList<>();
+            int fieldValueOffset = 0;
+            for (AnnotatedText fieldValueAnnotations : this.annotations) {
+                //This is called from a highlighter where all of the field values are concatenated
+                // so each annotation offset will need to be adjusted so that it takes into account
+                // the previous values AND the MULTIVAL delimiter
+                for (AnnotationToken token : fieldValueAnnotations.annotations) {
+                    if (token.intersects(start - fieldValueOffset, end - fieldValueOffset)) {
+                        intersectingAnnotations.add(new AnnotationToken(token.offset + fieldValueOffset,
+                                token.endOffset + fieldValueOffset, token.value));
+                    }
+                }
+                //add 1 for the fieldvalue separator character
+                fieldValueOffset += fieldValueAnnotations.textMinusMarkup.length() + 1;
+            }
+            return intersectingAnnotations.toArray(new AnnotationToken[intersectingAnnotations.size()]);
+        }
+        
+        @Override
+        public Analyzer getWrappedAnalyzer(String fieldName) {
+          return delegate;
+        }   
+        
+        @Override
+        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+            if(components instanceof AnnotatedHighlighterTokenStreamComponents){
+                // already wrapped.
+                return components;
+            }
+            AnnotationsInjector injector = new AnnotationsInjector(components.getTokenStream());
+            return new AnnotatedHighlighterTokenStreamComponents(components.getTokenizer(), injector, this.annotations);
+        }        
+    }
+    private static final class AnnotatedHighlighterTokenStreamComponents extends TokenStreamComponents{
+
+        private AnnotationsInjector annotationsInjector;
+        private AnnotatedText[] annotations;
+        int readerNum = 0;
+
+        AnnotatedHighlighterTokenStreamComponents(Tokenizer source, AnnotationsInjector annotationsFilter,
+                AnnotatedText[] annotations) {
+            super(source, annotationsFilter);
+            this.annotationsInjector = annotationsFilter;
+            this.annotations = annotations;            
+        }
+
+        @Override
+        protected void setReader(Reader reader) {
+            String plainText = readToString(reader);
+            AnnotatedText at = this.annotations[readerNum++];
+            assert at.textMinusMarkup.equals(plainText);
+            // This code is reliant on the behaviour of highlighter logic - it 
+            // takes plain text multi-value fields and then calls the same analyzer 
+            // for each field value in turn. This class has cached the annotations
+            // associated with each plain-text value and are arranged in the same order
+            annotationsInjector.setAnnotations(at);
+            super.setReader(new StringReader(at.textMinusMarkup));  
+        }
+               
+    }    
+    
+    
+    public static final class AnnotationAnalyzerWrapper extends AnalyzerWrapper {
+        
+
+        private final Analyzer delegate;
+
+        public AnnotationAnalyzerWrapper (Analyzer delegate) {
+          super(delegate.getReuseStrategy());
+          this.delegate = delegate;
+        }
+
+        /**
+         * Wraps {@link StandardAnalyzer}. 
+         */
+        public AnnotationAnalyzerWrapper() {
+          this(new StandardAnalyzer());
+        }
+        
+
+        @Override
+        public Analyzer getWrappedAnalyzer(String fieldName) {
+          return delegate;
+        }     
+
+        @Override
+        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+            if(components instanceof AnnotatedTokenStreamComponents){
+                // already wrapped.
+                return components;
+            }
+            AnnotationsInjector injector = new AnnotationsInjector(components.getTokenStream());
+            return new AnnotatedTokenStreamComponents(components.getTokenizer(), injector);
+        }
+      }
+    
+    
+    //This Analyzer is not "wrappable" because of a limitation in Lucene https://issues.apache.org/jira/browse/LUCENE-8352    
+    private static final class AnnotatedTokenStreamComponents extends TokenStreamComponents{
+        private AnnotationsInjector annotationsInjector;
+
+        AnnotatedTokenStreamComponents(Tokenizer source, AnnotationsInjector annotationsInjector) {
+            super(source, annotationsInjector);
+            this.annotationsInjector = annotationsInjector;
+        }
+
+        @Override
+        protected void setReader(Reader reader) {
+            // Sneaky code to change the content downstream components will parse.
+            // Replace the marked-up content Reader with a plain text Reader and prime the 
+            // annotations injector with the AnnotatedTokens that need to be injected 
+            // as plain-text parsing progresses.
+            AnnotatedText annotations = AnnotatedText.parse(readToString(reader));
+            annotationsInjector.setAnnotations(annotations);
+            super.setReader(new StringReader(annotations.textMinusMarkup));
+        }
+    }
+    
+    static String readToString(Reader reader) {       
+        char[] arr = new char[8 * 1024];
+        StringBuilder buffer = new StringBuilder();
+        int numCharsRead;
+        try {
+            while ((numCharsRead = reader.read(arr, 0, arr.length)) != -1) {
+                buffer.append(arr, 0, numCharsRead);
+            }
+            reader.close();
+            return buffer.toString();            
+        } catch (IOException e) {
+            throw new UncheckedIOException("IO Error reading field content", e);
+        }
+    }         
+
+    
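+    /**
+     * A TokenFilter that injects each annotation value as an extra token,
+     * anchored at the first text token the annotation spans: the first
+     * annotation at an offset reuses that token's position increment, any
+     * further annotations at the same offset are stacked with a position
+     * increment of zero, and the position length covers all spanned tokens.
+     */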
+    public static final class AnnotationsInjector extends TokenFilter {
+        
+        private AnnotatedText annotatedText;
+        AnnotatedText.AnnotationToken nextAnnotationForInjection = null;
+        private int currentAnnotationIndex = 0;
+        List<State> pendingStates = new ArrayList<>();
+        int pendingStatePos = 0;
+        boolean inputExhausted = false;
+
+        private final OffsetAttribute textOffsetAtt = addAttribute(OffsetAttribute.class);
+        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+        private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
+        private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+        private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+        public AnnotationsInjector(TokenStream in) {
+          super(in);
+        }
+
+        public void setAnnotations(AnnotatedText annotatedText) {
+          this.annotatedText = annotatedText;
+          currentAnnotationIndex = 0;
+          if (annotatedText != null && annotatedText.numAnnotations() > 0) {
+              nextAnnotationForInjection = annotatedText.getAnnotation(0);
+          } else {
+              nextAnnotationForInjection = null;
+          }
+        }
+        
+        
+
+        @Override
+        public void reset() throws IOException {
+            pendingStates.clear();
+            pendingStatePos = 0;
+            inputExhausted = false;
+            super.reset();
+        }
+        
+        // Abstracts if we are pulling from some pre-cached buffer of
+        // text tokens or directly from the wrapped TokenStream
+        private boolean internalNextToken() throws IOException {
+            if (pendingStatePos < pendingStates.size()) {
+                restoreState(pendingStates.get(pendingStatePos));
+                pendingStatePos++;
+                if (pendingStatePos >= pendingStates.size()) {
+                    pendingStatePos = 0;
+                    pendingStates.clear();
+                }
+                return true;
+            }
+            if (inputExhausted) {
+                return false;
+            }
+            return input.incrementToken();
+        }
+
+        @Override
+        public boolean incrementToken() throws IOException {
+            if (internalNextToken()) {
+                if (nextAnnotationForInjection != null) {
+                    // If we are at the right point to inject an annotation....
+                    if (textOffsetAtt.startOffset() >= nextAnnotationForInjection.offset) {
+                        int firstSpannedTextPosInc = posAtt.getPositionIncrement();
+                        int annotationPosLen = 1;
+
+                        // Capture the text token's state for later replay - but
+                        // with a zero pos increment so is same as annotation
+                        // that is injected before it
+                        posAtt.setPositionIncrement(0);
+                        pendingStates.add(captureState());
+
+                        while (textOffsetAtt.endOffset() <= nextAnnotationForInjection.endOffset) {
+                            // Buffer up all the other tokens spanned by this annotation to determine length.
+                            if (input.incrementToken()) {
+                                if (textOffsetAtt.endOffset() <= nextAnnotationForInjection.endOffset
+                                        && textOffsetAtt.startOffset() < nextAnnotationForInjection.endOffset) {
+                                    annotationPosLen += posAtt.getPositionIncrement();
+                                }
+                                pendingStates.add(captureState());
+                            } else {
+                                inputExhausted = true;
+                                break;
+                            }
+                        }
+                        emitAnnotation(firstSpannedTextPosInc, annotationPosLen);
+                        return true;
+                    }
+                }
+                return true;
+            } else {
+                inputExhausted = true;
+                return false;
+            }
+        }
+        private void setType(AnnotationToken token) {
+            //Default annotation type - in future AnnotationTokens may contain custom type info
+            typeAtt.setType("annotation");
+        }
+
+        private void emitAnnotation(int firstSpannedTextPosInc, int annotationPosLen) throws IOException {
+            // Set the annotation's attributes
+            posLenAtt.setPositionLength(annotationPosLen);
+            textOffsetAtt.setOffset(nextAnnotationForInjection.offset, nextAnnotationForInjection.endOffset);
+            setType(nextAnnotationForInjection);
+            
+            // We may have multiple annotations at this location - stack them up
+            final int annotationOffset = nextAnnotationForInjection.offset;
+            final AnnotatedText.AnnotationToken firstAnnotationAtThisPos = nextAnnotationForInjection;
+            while (nextAnnotationForInjection != null && nextAnnotationForInjection.offset == annotationOffset) {
+
+                
+                setType(nextAnnotationForInjection);
+                termAtt.resizeBuffer(nextAnnotationForInjection.value.length());
+                termAtt.copyBuffer(nextAnnotationForInjection.value.toCharArray(), 0, nextAnnotationForInjection.value.length());
+                
+                if (nextAnnotationForInjection == firstAnnotationAtThisPos) {
+                    posAtt.setPositionIncrement(firstSpannedTextPosInc);
+                    //Put at the head of the queue of tokens to be emitted
+                    pendingStates.add(0, captureState());                
+                } else {
+                    posAtt.setPositionIncrement(0);                    
+                    //Put after the head of the queue of tokens to be emitted
+                    pendingStates.add(1, captureState());                
+                }
+                
+                
+                // Advance to the next annotation (or null) to prevent re-injection of this one.
+                currentAnnotationIndex++;
+                if (currentAnnotationIndex < annotatedText.numAnnotations()) {
+                    nextAnnotationForInjection = annotatedText.getAnnotation(currentAnnotationIndex);
+                } else {
+                    nextAnnotationForInjection = null;
+                }
+            }
+            // Now pop the first of many potential buffered tokens:
+            internalNextToken();
+        }
+
+      }
+  
+
+    public static final class AnnotatedTextFieldType extends StringFieldType {
+
+        public AnnotatedTextFieldType() {
+            setTokenized(true);
+        }
+
+        protected AnnotatedTextFieldType(AnnotatedTextFieldType ref) {
+            super(ref);
+        }
+        
+        @Override
+        public void setIndexAnalyzer(NamedAnalyzer delegate) {
+            if(delegate.analyzer() instanceof AnnotationAnalyzerWrapper){
+                // Already wrapped the Analyzer with an AnnotationAnalyzer
+                super.setIndexAnalyzer(delegate);
+            } else {
+                // Wrap the analyzer with an AnnotationAnalyzer that will inject required annotations
+                super.setIndexAnalyzer(new NamedAnalyzer(delegate.name(), AnalyzerScope.INDEX,
+                    new AnnotationAnalyzerWrapper(delegate.analyzer())));
+            }
+        }
+
+        public AnnotatedTextFieldType clone() {
+            return new AnnotatedTextFieldType(this);
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public Query existsQuery(QueryShardContext context) {
+            if (omitNorms()) {
+                return new TermQuery(new Term(FieldNamesFieldMapper.NAME, name()));
+            } else {
+                return new NormsFieldExistsQuery(name());
+            }
+        }
+        
+        @Override
+        public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
+            PhraseQuery.Builder builder = new PhraseQuery.Builder();
+            builder.setSlop(slop);
+
+            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+            PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
+            int position = -1;
+
+            stream.reset();
+            while (stream.incrementToken()) {
+                if (enablePosIncrements) {
+                    position += posIncrAtt.getPositionIncrement();
+                }
+                else {
+                    position += 1;
+                }
+                builder.add(new Term(field, termAtt.getBytesRef()), position);
+            }
+
+            return builder.build();
+        }
+        
+        @Override
+        public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
+
+            MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
+            mpqb.setSlop(slop);
+
+            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+
+            PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
+            int position = -1;
+
+            List<Term> multiTerms = new ArrayList<>();
+            stream.reset();
+            while (stream.incrementToken()) {
+                int positionIncrement = posIncrAtt.getPositionIncrement();
+
+                if (positionIncrement > 0 && multiTerms.size() > 0) {
+                    if (enablePositionIncrements) {
+                        mpqb.add(multiTerms.toArray(new Term[0]), position);
+                    } else {
+                        mpqb.add(multiTerms.toArray(new Term[0]));
+                    }
+                    multiTerms.clear();
+                }
+                position += positionIncrement;
+                multiTerms.add(new Term(field, termAtt.getBytesRef()));
+            }
+
+            if (enablePositionIncrements) {
+                mpqb.add(multiTerms.toArray(new Term[0]), position);
+            } else {
+                mpqb.add(multiTerms.toArray(new Term[0]));
+            }
+            return mpqb.build();
+        }        
+    }
+    
+    private int positionIncrementGap;
+    protected AnnotatedTextFieldMapper(String simpleName, AnnotatedTextFieldType fieldType, MappedFieldType defaultFieldType,
+                                int positionIncrementGap, 
+                                Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
+        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
+        assert fieldType.tokenized();
+        assert fieldType.hasDocValues() == false;
+        this.positionIncrementGap = positionIncrementGap;
+    }
+
+    @Override
+    protected AnnotatedTextFieldMapper clone() {
+        return (AnnotatedTextFieldMapper) super.clone();
+    }
+
+    public int getPositionIncrementGap() {
+        return this.positionIncrementGap;
+    }
+
+    @Override
+    protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
+        final String value;
+        if (context.externalValueSet()) {
+            value = context.externalValue().toString();
+        } else {
+            value = context.parser().textOrNull();
+        }
+
+        if (value == null) {
+            return;
+        }
+
+        if (fieldType().indexOptions() != IndexOptions.NONE || fieldType().stored()) {
+            Field field = new Field(fieldType().name(), value, fieldType());
+            fields.add(field);
+            if (fieldType().omitNorms()) {
+                createFieldNamesField(context, fields);
+            }
+        }
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+
+    @Override
+    public AnnotatedTextFieldType fieldType() {
+        return (AnnotatedTextFieldType) super.fieldType();
+    }
+
+    @Override
+    protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
+        super.doXContentBody(builder, includeDefaults, params);
+        doXContentAnalyzers(builder, includeDefaults);
+
+        if (includeDefaults || positionIncrementGap != POSITION_INCREMENT_GAP_USE_ANALYZER) {
+            builder.field("position_increment_gap", positionIncrementGap);
+        }        
+    }
+}

+ 44 - 0
plugins/mapper-annotated-text/src/main/java/org/elasticsearch/plugin/mapper/AnnotatedTextPlugin.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.plugin.mapper;
+
+import java.util.Collections;
+import java.util.Map;
+
+import org.elasticsearch.index.mapper.Mapper;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper;
+import org.elasticsearch.plugins.MapperPlugin;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.plugins.SearchPlugin;
+import org.elasticsearch.search.fetch.subphase.highlight.AnnotatedTextHighlighter;
+import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
+
+public class AnnotatedTextPlugin extends Plugin implements MapperPlugin, SearchPlugin {
+
+    @Override
+    public Map<String, Mapper.TypeParser> getMappers() {
+        return Collections.singletonMap(AnnotatedTextFieldMapper.CONTENT_TYPE, new AnnotatedTextFieldMapper.TypeParser());
+    }
+    
+    @Override
+    public Map<String, Highlighter> getHighlighters() {
+        return Collections.singletonMap(AnnotatedTextHighlighter.NAME, new AnnotatedTextHighlighter());   
+    }
+}

+ 201 - 0
plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java

@@ -0,0 +1,201 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.search.fetch.subphase.highlight;
+
+import org.apache.lucene.search.highlight.Encoder;
+import org.apache.lucene.search.uhighlight.Passage;
+import org.apache.lucene.search.uhighlight.PassageFormatter;
+import org.apache.lucene.search.uhighlight.Snippet;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Custom passage formatter that :
+ * 1) marks up search hits in markdown-like syntax for URLs ({@link Snippet})
+ * 2) injects any annotations from the original text that don't conflict with search hit highlighting
+ */
+public class AnnotatedPassageFormatter extends PassageFormatter {
+
+
+    public static final String SEARCH_HIT_TYPE = "_hit_term";
+    private final Encoder encoder;
+    private AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer;
+
+    public AnnotatedPassageFormatter(AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer, Encoder encoder) {
+        this.annotatedHighlighterAnalyzer = annotatedHighlighterAnalyzer;
+        this.encoder = encoder;
+    }
+
+    static class MarkupPassage {
+        List<Markup> markups = new ArrayList<>();
+        int lastMarkupEnd = -1;
+
+        public void addUnlessOverlapping(Markup newMarkup) {
+
+            // Fast exit: the new markup starts after everything seen so far.
+            if (newMarkup.start > lastMarkupEnd) {
+                markups.add(newMarkup);
+                lastMarkupEnd = newMarkup.end;
+                return;
+            }
+
+            // Check to see if this new markup overlaps with any prior markup.
+            int index = 0;
+            for (Markup existingMarkup : markups) {
+                if (existingMarkup.samePosition(newMarkup)) {
+                    existingMarkup.merge(newMarkup);
+                    return;
+                }
+                if (existingMarkup.overlaps(newMarkup)) {
+                    // Existing markup wins - we throw away the new markup that would span this position.
+                    return;
+                }
+                // The markup list is in start-offset order, so we can insert at this position and shift others right.
+                if (existingMarkup.isAfter(newMarkup)) {
+                    markups.add(index, newMarkup);
+                    return;
+                }
+                index++;
+            }
+            markups.add(newMarkup);
+            lastMarkupEnd = newMarkup.end;
+        }
+    }
+
+    static class Markup {
+        int start;
+        int end;
+        String metadata;
+
+        Markup(int start, int end, String metadata) {
+            this.start = start;
+            this.end = end;
+            this.metadata = metadata;
+        }
+        boolean isAfter(Markup other) {
+            return start > other.end;
+        }
+        void merge(Markup newMarkup) {
+            // Metadata uses the key1=value&key2=value&... syntax used for URLs.
+            assert samePosition(newMarkup);
+            metadata += "&" + newMarkup.metadata;
+        }
+        boolean samePosition(Markup other) {
+            return this.start == other.start && this.end == other.end;
+        }
+        boolean overlaps(Markup other) {
+            return (start <= other.start && end >= other.start)
+                    || (start <= other.end && end >= other.end)
+                    || (start >= other.start && end <= other.end);
+        }
+        @Override
+        public String toString() {
+            return "Markup [start=" + start + ", end=" + end + ", metadata=" + metadata + "]";
+        }
+    }
+
+    // Merge original annotations and search hits into a single set of markups for each passage.
+    static MarkupPassage mergeAnnotations(AnnotationToken[] annotations, Passage passage) {
+        try {
+            MarkupPassage markupPassage = new MarkupPassage();
+
+            // Add search hits first - they take precedence over any other markup.
+            for (int i = 0; i < passage.getNumMatches(); i++) {
+                int start = passage.getMatchStarts()[i];
+                int end = passage.getMatchEnds()[i];
+                String searchTerm = passage.getMatchTerms()[i].utf8ToString();
+                Markup markup = new Markup(start, end, SEARCH_HIT_TYPE + "=" + URLEncoder.encode(searchTerm, StandardCharsets.UTF_8.name()));
+                markupPassage.addUnlessOverlapping(markup);
+            }
+
+            // Now add the original text's annotations - ignoring any that conflict with the search-hit markup.
+            for (AnnotationToken token : annotations) {
+                int start = token.offset;
+                int end = token.endOffset;
+                if (start >= passage.getStartOffset() && end <= passage.getEndOffset()) {
+                    String escapedValue = URLEncoder.encode(token.value, StandardCharsets.UTF_8.name());
+                    Markup markup = new Markup(start, end, escapedValue);
+                    markupPassage.addUnlessOverlapping(markup);
+                }
+            }
+            return markupPassage;
+
+        } catch (UnsupportedEncodingException e) {
+            // We should always have UTF-8 support.
+            throw new IllegalStateException(e);
+        }
+    }
+
+    @Override
+    public Snippet[] format(Passage[] passages, String content) {
+        Snippet[] snippets = new Snippet[passages.length];
+
+        int pos;
+        int j = 0;
+        for (Passage passage : passages) {
+            AnnotationToken[] annotations = annotatedHighlighterAnalyzer.getIntersectingAnnotations(passage.getStartOffset(),
+                    passage.getEndOffset());
+            MarkupPassage mergedMarkup = mergeAnnotations(annotations, passage);
+
+            StringBuilder sb = new StringBuilder();
+            pos = passage.getStartOffset();
+            for (Markup markup : mergedMarkup.markups) {
+                int start = markup.start;
+                int end = markup.end;
+                // It's possible to have overlapping terms.
+                if (start > pos) {
+                    append(sb, content, pos, start);
+                }
+                if (end > pos) {
+                    sb.append("[");
+                    append(sb, content, Math.max(pos, start), end);
+                    sb.append("](");
+                    sb.append(markup.metadata);
+                    sb.append(")");
+                    pos = end;
+                }
+            }
+            // It's possible a "term" from the analyzer could span a sentence boundary.
+            append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
+            // We remove the paragraph separator if present at the end of the snippet (we used it as a separator between values).
+            if (sb.charAt(sb.length() - 1) == HighlightUtils.PARAGRAPH_SEPARATOR) {
+                sb.deleteCharAt(sb.length() - 1);
+            } else if (sb.charAt(sb.length() - 1) == HighlightUtils.NULL_SEPARATOR) {
+                sb.deleteCharAt(sb.length() - 1);
+            }
+            // ... and we trim the snippets too.
+            snippets[j++] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
+        }
+        return snippets;
+    }
+
+    private void append(StringBuilder dest, String content, int start, int end) {
+        dest.append(encoder.encodeText(content.substring(start, end)));
+    }
+}

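The precedence rule implemented by MarkupPassage above is worth seeing end to end: search-hit
markup is added first, an annotation at exactly the same offsets is merged into it with "&",
and any other overlapping annotation is discarded. Below is a minimal standalone sketch of
that rule; SimpleMarkup and MarkupMergeDemo are hypothetical stand-ins for illustration, not
plugin classes.

    import java.util.ArrayList;
    import java.util.List;

    public class MarkupMergeDemo {
        static class SimpleMarkup {
            final int start, end;
            String metadata;
            SimpleMarkup(int start, int end, String metadata) {
                this.start = start;
                this.end = end;
                this.metadata = metadata;
            }
        }

        // Mirrors MarkupPassage.addUnlessOverlapping: same position merges, overlap loses.
        static void add(List<SimpleMarkup> markups, SimpleMarkup candidate) {
            for (SimpleMarkup existing : markups) {
                if (existing.start == candidate.start && existing.end == candidate.end) {
                    existing.metadata += "&" + candidate.metadata; // merge at identical positions
                    return;
                }
                if (candidate.start < existing.end && candidate.end > existing.start) {
                    return; // earlier (search-hit) markup wins; drop the newcomer
                }
            }
            markups.add(candidate);
        }

        public static void main(String[] args) {
            List<SimpleMarkup> markups = new ArrayList<>();
            // Plain text: "He paid Stormy Daniels hush money"
            add(markups, new SimpleMarkup(8, 14, "_hit_term=stormy"));   // search hit on "Stormy"
            add(markups, new SimpleMarkup(8, 22, "Stephanie+Clifford")); // overlapping annotation: dropped
            add(markups, new SimpleMarkup(8, 14, "Payee"));              // same position: merged
            markups.forEach(m -> System.out.println(m.start + ".." + m.end + " -> " + m.metadata));
            // Prints: 8..14 -> _hit_term=stormy&Payee
        }
    }
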
+ 64 - 0
plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java

@@ -0,0 +1,64 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.search.fetch.subphase.highlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.highlight.Encoder;
+import org.apache.lucene.search.uhighlight.PassageFormatter;
+import org.elasticsearch.index.mapper.DocumentMapper;
+import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
+import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
+import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.Field;
+import org.elasticsearch.search.internal.SearchContext;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+public class AnnotatedTextHighlighter extends UnifiedHighlighter {
+
+    public static final String NAME = "annotated";
+
+    AnnotatedHighlighterAnalyzer annotatedHighlighterAnalyzer = null;
+
+    @Override
+    protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
+        annotatedHighlighterAnalyzer = new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper, type));
+        return annotatedHighlighterAnalyzer;
+    }
+
+    // Convert the marked-up values held on-disk to plain-text versions for highlighting
+    @Override
+    protected List<Object> loadFieldValues(MappedFieldType fieldType, Field field, SearchContext context, HitContext hitContext)
+            throws IOException {
+        List<Object> fieldValues = super.loadFieldValues(fieldType, field, context, hitContext);
+        String[] fieldValuesAsString = fieldValues.toArray(new String[fieldValues.size()]);
+        annotatedHighlighterAnalyzer.init(fieldValuesAsString);
+        return Arrays.asList((Object[]) annotatedHighlighterAnalyzer.getPlainTextValuesForHighlighter());
+    }
+
+    @Override
+    protected PassageFormatter getPassageFormatter(SearchContextHighlight.Field field, Encoder encoder) {
+        return new AnnotatedPassageFormatter(annotatedHighlighterAnalyzer, encoder);
+    }
+
+}

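Note the division of labour in AnnotatedTextHighlighter: the values stored on disk still
contain annotation markup, so loadFieldValues() converts them to plain text before the
unified highlighter computes passage offsets, and AnnotatedPassageFormatter re-injects
markup afterwards. The sketch below shows the idea of that plain-text conversion using a
hypothetical regex-based stripper; the plugin's real parser is more careful, but the
input/output relationship is the same.

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class StripMarkupDemo {
        // Matches [text](value) spans - a simplification of the plugin's grammar.
        private static final Pattern MARKUP = Pattern.compile("\\[([^\\]]*)\\]\\(([^)]*)\\)");

        static String toPlainText(String annotated) {
            Matcher m = MARKUP.matcher(annotated);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                // Keep only the visible text; the annotation value is discarded here.
                m.appendReplacement(sb, Matcher.quoteReplacement(m.group(1)));
            }
            m.appendTail(sb);
            return sb.toString();
        }

        public static void main(String[] args) {
            System.out.println(toPlainText("The [quick brown fox](entity_3789) is brown."));
            // Prints: The quick brown fox is brown.
        }
    }
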
+ 39 - 0
plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextClientYamlTestSuiteIT.java

@@ -0,0 +1,39 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.annotatedtext;
+
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+
+import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
+import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
+
+public class AnnotatedTextClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
+
+    public AnnotatedTextClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
+        super(testCandidate);
+    }
+
+    @ParametersFactory
+    public static Iterable<Object[]> parameters() throws Exception {
+        return createParameters();
+    }
+}
+

+ 681 - 0
plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapperTests.java

@@ -0,0 +1,681 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.annotatedtext;
+
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.IndexableFieldType;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.termvectors.TermVectorsRequest;
+import org.elasticsearch.action.termvectors.TermVectorsResponse;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.common.compress.CompressedXContent;
+import org.elasticsearch.common.lucene.uid.Versions;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.ToXContent;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.index.IndexService;
+import org.elasticsearch.index.VersionType;
+import org.elasticsearch.index.engine.Engine;
+import org.elasticsearch.index.mapper.DocumentMapper;
+import org.elasticsearch.index.mapper.DocumentMapperParser;
+import org.elasticsearch.index.mapper.MapperParsingException;
+import org.elasticsearch.index.mapper.MapperService.MergeReason;
+import org.elasticsearch.index.mapper.ParsedDocument;
+import org.elasticsearch.index.mapper.SourceToParse;
+import org.elasticsearch.index.mapper.TextFieldMapper;
+import org.elasticsearch.index.shard.IndexShard;
+import org.elasticsearch.index.termvectors.TermVectorsService;
+import org.elasticsearch.indices.IndicesService;
+import org.elasticsearch.plugin.mapper.AnnotatedTextPlugin;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+import org.junit.Before;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.notNullValue;
+
+public class AnnotatedTextFieldMapperTests extends ESSingleNodeTestCase {
+
+    IndexService indexService;
+    DocumentMapperParser parser;
+
+    @Before
+    public void setup() {
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.mySynonyms.type", "synonym")
+            .putList("index.analysis.filter.mySynonyms.synonyms", Collections.singletonList("car, auto"))
+            .put("index.analysis.analyzer.synonym.tokenizer", "standard")
+            .put("index.analysis.analyzer.synonym.filter", "mySynonyms")
+            // Stop filter remains in server as it is part of lucene-core
+            .put("index.analysis.analyzer.my_stop_analyzer.tokenizer", "standard")
+            .put("index.analysis.analyzer.my_stop_analyzer.filter", "stop")
+            .build();
+        indexService = createIndex("test", settings);
+        parser = indexService.mapperService().documentMapperParser();
+    }
+
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        List<Class<? extends Plugin>> classpathPlugins = new ArrayList<>();
+        classpathPlugins.add(AnnotatedTextPlugin.class);
+        return classpathPlugins;
+    }
+
+    protected String getFieldType() {
+        return "annotated_text";
+    }
+
+    public void testAnnotationInjection() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);        
+
+        // Use example of typed and untyped annotations
+        String annotatedText = "He paid [Stormy Daniels](Stephanie+Clifford&Payee) hush money";
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", annotatedText)
+                        .endObject()),
+            XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+
+        assertEquals(annotatedText, fields[0].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+            sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+            
+            assertTrue(terms.seekExact(new BytesRef("stormy")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(2, postings.nextPosition());   
+            
+            assertTrue(terms.seekExact(new BytesRef("Stephanie Clifford")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(2, postings.nextPosition());
+
+            assertTrue(terms.seekExact(new BytesRef("Payee")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(2, postings.nextPosition());
+
+            assertTrue(terms.seekExact(new BytesRef("hush")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(4, postings.nextPosition());
+        }
+    }
+    
+    public void testToleranceForBadAnnotationMarkup() throws IOException {
+        
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);        
+
+        String annotatedText = "foo [bar](MissingEndBracket baz";
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", annotatedText)
+                        .endObject()),
+            XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+
+        assertEquals(annotatedText, fields[0].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+            sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+            
+            assertTrue(terms.seekExact(new BytesRef("foo")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(0, postings.nextPosition());   
+            
+            assertTrue(terms.seekExact(new BytesRef("bar")));
+            postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(1, postings.nextPosition());
+
+            assertFalse(terms.seekExact(new BytesRef("MissingEndBracket")));
+            // Bad markup means value is treated as plain text and fed through tokenisation
+            assertTrue(terms.seekExact(new BytesRef("missingendbracket")));
+            
+        }
+    }  
+    
+    public void testAgainstTermVectorsAPI() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("tvfield").field("type", getFieldType())
+                .field("term_vector", "with_positions_offsets_payloads")
+                .endObject().endObject()
+                .endObject().endObject());        
+        indexService.mapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);          
+        
+
+        int max = between(3, 10);
+        BulkRequestBuilder bulk = client().prepareBulk();
+        for (int i = 0; i < max; i++) {
+            bulk.add(client().prepareIndex("test", "type", Integer.toString(i))
+                    .setSource("tvfield", "the quick [brown](Color) fox jumped over the lazy dog"));
+        }
+        bulk.get();
+
+        TermVectorsRequest request = new TermVectorsRequest("test", "type", "0").termStatistics(true);
+        
+        IndicesService indicesService = getInstanceFromNode(IndicesService.class);
+        IndexService test = indicesService.indexService(resolveIndex("test"));
+        IndexShard shard = test.getShardOrNull(0);
+        assertThat(shard, notNullValue());
+        TermVectorsResponse response = TermVectorsService.getTermVectors(shard, request);        
+        assertEquals(1, response.getFields().size());   
+
+        Terms terms = response.getFields().terms("tvfield");
+        TermsEnum iterator = terms.iterator();
+        BytesRef term;
+        Set<String> foundTerms = new HashSet<>();
+        while ((term = iterator.next()) != null) {
+            foundTerms.add(term.utf8ToString());
+        }        
+        //Check we have both text and annotation tokens
+        assertTrue(foundTerms.contains("brown"));
+        assertTrue(foundTerms.contains("Color"));
+        assertTrue(foundTerms.contains("fox"));
+        
+    }    
+        
+    // ===== Code below copied from TextFieldMapperTests ========
+
+    public void testDefaults() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+
+        assertEquals("1234", fields[0].stringValue());
+        IndexableFieldType fieldType = fields[0].fieldType();
+        assertThat(fieldType.omitNorms(), equalTo(false));
+        assertTrue(fieldType.tokenized());
+        assertFalse(fieldType.stored());
+        assertThat(fieldType.indexOptions(), equalTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
+        assertThat(fieldType.storeTermVectors(), equalTo(false));
+        assertThat(fieldType.storeTermVectorOffsets(), equalTo(false));
+        assertThat(fieldType.storeTermVectorPositions(), equalTo(false));
+        assertThat(fieldType.storeTermVectorPayloads(), equalTo(false));
+        assertEquals(DocValuesType.NONE, fieldType.docValuesType());
+    }
+
+    public void testEnableStore() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).field("store", true).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+        assertTrue(fields[0].fieldType().stored());
+    }
+
+    public void testDisableNorms() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field")
+                    .field("type", getFieldType())
+                    .field("norms", false)
+                .endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+        assertTrue(fields[0].fieldType().omitNorms());
+    }
+
+    public void testIndexOptions() throws IOException {
+        Map<String, IndexOptions> supportedOptions = new HashMap<>();
+        supportedOptions.put("docs", IndexOptions.DOCS);
+        supportedOptions.put("freqs", IndexOptions.DOCS_AND_FREQS);
+        supportedOptions.put("positions", IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+        supportedOptions.put("offsets", IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+
+        XContentBuilder mappingBuilder = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties");
+        for (String option : supportedOptions.keySet()) {
+            mappingBuilder.startObject(option).field("type", getFieldType()).field("index_options", option).endObject();
+        }
+        String mapping = Strings.toString(mappingBuilder.endObject().endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        XContentBuilder jsonDoc = XContentFactory.jsonBuilder().startObject();
+        for (String option : supportedOptions.keySet()) {
+            jsonDoc.field(option, "1234");
+        }
+        ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference.bytes(jsonDoc.endObject()),
+                XContentType.JSON));
+
+        for (Map.Entry<String, IndexOptions> entry : supportedOptions.entrySet()) {
+            String field = entry.getKey();
+            IndexOptions options = entry.getValue();
+            IndexableField[] fields = doc.rootDoc().getFields(field);
+            assertEquals(1, fields.length);
+            assertEquals(options, fields[0].fieldType().indexOptions());
+        }
+    }
+
+    public void testDefaultPositionIncrementGap() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field").field("type", getFieldType()).endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .array("field", new String[] {"a", "b"})
+                        .endObject()),
+            XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(2, fields.length);
+
+        assertEquals("a", fields[0].stringValue());
+        assertEquals("b", fields[1].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+            sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+            assertTrue(terms.seekExact(new BytesRef("b")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(TextFieldMapper.Defaults.POSITION_INCREMENT_GAP + 1, postings.nextPosition());
+        }
+    }
+
+    public void testPositionIncrementGap() throws IOException {
+        final int positionIncrementGap = randomIntBetween(1, 1000);
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field")
+                    .field("type", getFieldType())
+                    .field("position_increment_gap", positionIncrementGap)
+                .endObject().endObject()
+                .endObject().endObject());
+
+        DocumentMapper mapper = indexService.mapperService().merge("type",
+                new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE);
+
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        SourceToParse sourceToParse = SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .array("field", new String[]{"a", "b"})
+                        .endObject()),
+            XContentType.JSON);
+        ParsedDocument doc = mapper.parse(sourceToParse);
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(2, fields.length);
+
+        assertEquals("a", fields[0].stringValue());
+        assertEquals("b", fields[1].stringValue());
+
+        IndexShard shard = indexService.getShard(0);
+        shard.applyIndexOperationOnPrimary(Versions.MATCH_ANY, VersionType.INTERNAL,
+            sourceToParse, IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, false);
+        shard.refresh("test");
+        try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
+            LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
+            TermsEnum terms = leaf.terms("field").iterator();
+            assertTrue(terms.seekExact(new BytesRef("b")));
+            PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
+            assertEquals(0, postings.nextDoc());
+            assertEquals(positionIncrementGap + 1, postings.nextPosition());
+        }
+    }
+
+    public void testSearchAnalyzerSerialization() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "standard")
+                        .field("search_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping,  mapper.mappingSource().toString());
+
+        // special case: default index analyzer
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "default")
+                        .field("search_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping,  mapper.mappingSource().toString());
+
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+            .startObject("properties")
+            .startObject("field")
+            .field("type", getFieldType())
+            .field("analyzer", "keyword")
+            .endObject()
+            .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping,  mapper.mappingSource().toString());
+
+        // special case: default search analyzer
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+            .startObject("properties")
+            .startObject("field")
+            .field("type", getFieldType())
+            .field("analyzer", "keyword")
+            .field("search_analyzer", "default")
+            .endObject()
+            .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping,  mapper.mappingSource().toString());
+
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+            .startObject("properties")
+            .startObject("field")
+            .field("type", getFieldType())
+            .field("analyzer", "keyword")
+            .endObject()
+            .endObject().endObject().endObject());
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+
+        XContentBuilder builder = XContentFactory.jsonBuilder();
+        builder.startObject();
+        mapper.toXContent(builder, new ToXContent.MapParams(Collections.singletonMap("include_defaults", "true")));
+        builder.endObject();
+
+        String mappingString = Strings.toString(builder);
+        assertTrue(mappingString.contains("analyzer"));
+        assertTrue(mappingString.contains("search_analyzer"));
+        assertTrue(mappingString.contains("search_quote_analyzer"));
+    }
+
+    public void testSearchQuoteAnalyzerSerialization() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "standard")
+                        .field("search_analyzer", "standard")
+                        .field("search_quote_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping,  mapper.mappingSource().toString());
+
+        // special case: default index/search analyzer
+        mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                    .startObject("field")
+                        .field("type", getFieldType())
+                        .field("analyzer", "default")
+                        .field("search_analyzer", "default")
+                        .field("search_quote_analyzer", "keyword")
+                    .endObject()
+                .endObject().endObject().endObject());
+
+        mapper = parser.parse("type", new CompressedXContent(mapping));
+        assertEquals(mapping,  mapper.mappingSource().toString());
+    }
+
+    public void testTermVectors() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties")
+                .startObject("field1")
+                    .field("type", getFieldType())
+                    .field("term_vector", "no")
+                .endObject()
+                .startObject("field2")
+                    .field("type", getFieldType())
+                    .field("term_vector", "yes")
+                .endObject()
+                .startObject("field3")
+                    .field("type", getFieldType())
+                    .field("term_vector", "with_offsets")
+                .endObject()
+                .startObject("field4")
+                    .field("type", getFieldType())
+                    .field("term_vector", "with_positions")
+                .endObject()
+                .startObject("field5")
+                    .field("type", getFieldType())
+                    .field("term_vector", "with_positions_offsets")
+                .endObject()
+                .startObject("field6")
+                    .field("type", getFieldType())
+                    .field("term_vector", "with_positions_offsets_payloads")
+                .endObject()
+                .endObject()
+                .endObject().endObject());
+
+        DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));
+
+        ParsedDocument doc = defaultMapper.parse(SourceToParse.source("test", "type", "1", BytesReference
+                .bytes(XContentFactory.jsonBuilder()
+                        .startObject()
+                        .field("field1", "1234")
+                        .field("field2", "1234")
+                        .field("field3", "1234")
+                        .field("field4", "1234")
+                        .field("field5", "1234")
+                        .field("field6", "1234")
+                        .endObject()),
+                XContentType.JSON));
+
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectors(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectorOffsets(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectorPositions(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field1").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectorOffsets(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectorPositions(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field2").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectorOffsets(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectorPositions(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field3").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectorOffsets(), equalTo(false));
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectorPositions(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field4").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectorOffsets(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectorPositions(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field5").fieldType().storeTermVectorPayloads(), equalTo(false));
+
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectors(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectorOffsets(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectorPositions(), equalTo(true));
+        assertThat(doc.rootDoc().getField("field6").fieldType().storeTermVectorPayloads(), equalTo(true));
+    }
+   
+    public void testNullConfigValuesFail() throws MapperParsingException, IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject()
+                .startObject("type")
+                    .startObject("properties")
+                        .startObject("field")
+                            .field("type", getFieldType())
+                            .field("analyzer", (String) null)
+                        .endObject()
+                    .endObject()
+                .endObject().endObject());
+
+        Exception e = expectThrows(MapperParsingException.class, () -> parser.parse("type", new CompressedXContent(mapping)));
+        assertEquals("[analyzer] must not have a [null] value", e.getMessage());
+    }
+
+    public void testNotIndexedField() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+            .startObject("properties").startObject("field")
+            .field("type", getFieldType())
+            .field("index", false)
+            .endObject().endObject().endObject().endObject());
+
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> parser.parse("type", new CompressedXContent(mapping)));
+        assertEquals("[annotated_text] fields must be indexed", e.getMessage());
+    }
+
+    public void testAnalyzedFieldPositionIncrementWithoutPositions() throws IOException {
+        for (String indexOptions : Arrays.asList("docs", "freqs")) {
+            String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
+                .startObject("properties").startObject("field")
+                .field("type", getFieldType())
+                .field("index_options", indexOptions)
+                .field("position_increment_gap", 10)
+                .endObject().endObject().endObject().endObject());
+
+            IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+                () -> parser.parse("type", new CompressedXContent(mapping)));
+            assertEquals("Cannot set position_increment_gap on field [field] without positions enabled", e.getMessage());
+        }
+    }
+
+    public void testEmptyName() throws IOException {
+        String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject()
+                .startObject("type")
+                    .startObject("properties")
+                        .startObject("")
+                            .field("type", getFieldType())
+                        .endObject()
+                    .endObject()
+                .endObject().endObject());
+
+        // Empty name not allowed in index created after 5.0
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> parser.parse("type", new CompressedXContent(mapping))
+        );
+        assertThat(e.getMessage(), containsString("name cannot be empty string"));
+    }
+}

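The two position-increment-gap tests above both assert `gap + 1` rather than `gap` for the
first token of the second value, and the arithmetic is easy to miss: with values ["a", "b"],
token "a" sits at position 0, the gap is applied between values, and "b" then advances by its
own increment of 1. A quick worked sketch; the gap value of 100 is an assumption here, taken
from TextFieldMapper.Defaults.POSITION_INCREMENT_GAP.

    public class PositionGapDemo {
        public static void main(String[] args) {
            int gap = 100;                    // assumed default position increment gap
            int lastPositionFirstValue = 0;   // "a" is the only token of the first value
            // The gap is applied between values, then "b" advances by its own increment of 1.
            int firstPositionSecondValue = lastPositionFirstValue + gap + 1;
            System.out.println(firstPositionSecondValue); // 101, i.e. gap + 1 as asserted above
        }
    }
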
+ 73 - 0
plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextParsingTests.java

@@ -0,0 +1,73 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.annotatedtext;
+
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;
+import org.elasticsearch.test.ESTestCase;
+
+import java.util.List;
+
+import static org.hamcrest.Matchers.equalTo;
+
+public class AnnotatedTextParsingTests extends ESTestCase {
+    
+    private void checkParsing(String markup, String expectedPlainText, AnnotationToken... expectedTokens) {
+        AnnotatedText at = AnnotatedText.parse(markup);
+        assertEquals(expectedPlainText, at.textMinusMarkup);
+        List<AnnotationToken> actualAnnotations = at.annotations;
+        assertEquals(expectedTokens.length, actualAnnotations.size());
+        for (int i = 0; i < expectedTokens.length; i++) {
+            assertEquals(expectedTokens[i], actualAnnotations.get(i));
+        }
+    }
+
+    public void testSingleValueMarkup() {
+        checkParsing("foo [bar](Y)", "foo bar", new AnnotationToken(4, 7, "Y"));
+    }
+
+    public void testMultiValueMarkup() {
+        checkParsing("foo [bar](Y&B)", "foo bar", new AnnotationToken(4, 7, "Y"),
+                new AnnotationToken(4, 7, "B"));
+    }
+
+    public void testBlankTextAnnotation() {
+        checkParsing("It sounded like this:[](theSoundOfOneHandClapping)", "It sounded like this:",
+                new AnnotationToken(21, 21, "theSoundOfOneHandClapping"));
+    }
+
+    public void testMissingBracket() {
+        checkParsing("[foo](MissingEndBracket bar",
+                "[foo](MissingEndBracket bar", new AnnotationToken[0]);
+    }
+
+    public void testAnnotationWithType() {
+        Exception expectedException = expectThrows(ElasticsearchParseException.class,
+                () -> checkParsing("foo [bar](type=foo) baz", "foo bar baz", new AnnotationToken(4, 7, "noType")));
+        assertThat(expectedException.getMessage(), equalTo("key=value pairs are not supported in annotations"));
+    }
+
+    public void testMissingValue() {
+        checkParsing("[foo]() bar", "foo bar", new AnnotationToken[0]);
+    }
+}

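Taken together, these tests pin down the markup grammar: `[text](value1&value2)` produces one
AnnotationToken per `&`-separated value, all sharing the annotated text's plain-text offsets;
malformed markup falls back to being treated as plain text; and `key=value` pairs are rejected.
A small sketch of driving the parser directly, assuming the plugin classes are on the classpath
and accessible (the fields are read this way by the plugin's own same-package tests):

    import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
    import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;

    public class ParseDemo {
        public static void main(String[] args) {
            AnnotatedText at = AnnotatedText.parse("foo [bar](Y&B) baz");
            System.out.println(at.textMinusMarkup); // "foo bar baz"
            for (AnnotationToken token : at.annotations) {
                // Two tokens, both spanning offsets 4..7 ("bar"): values "Y" and "B".
                System.out.println(token.offset + ".." + token.endOffset + " " + token.value);
            }
        }
    }
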
+ 185 - 0
plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java

@@ -0,0 +1,185 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.search.highlight;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.highlight.DefaultEncoder;
+import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
+import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
+import org.apache.lucene.search.uhighlight.PassageFormatter;
+import org.apache.lucene.search.uhighlight.Snippet;
+import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
+import org.apache.lucene.store.Directory;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
+import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
+import org.elasticsearch.search.fetch.subphase.highlight.AnnotatedPassageFormatter;
+import org.elasticsearch.test.ESTestCase;
+
+import java.net.URLEncoder;
+import java.text.BreakIterator;
+import java.util.Locale;
+
+import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
+import static org.hamcrest.CoreMatchers.equalTo;
+
+public class AnnotatedTextHighlighterTests extends ESTestCase {
+
+    private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
+            Query query, Locale locale, BreakIterator breakIterator,
+            int noMatchSize, String[] expectedPassages) throws Exception {
+
+        // Annotated fields wrap the usual analyzer with one that injects extra tokens
+        Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
+        AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
+        hiliteAnalyzer.init(markedUpInputs);
+        PassageFormatter passageFormatter = new AnnotatedPassageFormatter(hiliteAnalyzer, new DefaultEncoder());
+        String[] plainTextForHighlighter = hiliteAnalyzer.getPlainTextValuesForHighlighter();
+
+        Directory dir = newDirectory();
+        IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
+        iwc.setMergePolicy(newTieredMergePolicy(random()));
+        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+        FieldType ft = new FieldType(TextField.TYPE_STORED);
+        if (randomBoolean()) {
+            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+        } else {
+            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+        }
+        ft.freeze();
+        Document doc = new Document();
+        for (String input : markedUpInputs) {
+            Field field = new Field(fieldName, "", ft);
+            field.setStringValue(input);
+            doc.add(field);
+        }
+        iw.addDocument(doc);
+        DirectoryReader reader = iw.getReader();
+        IndexSearcher searcher = newSearcher(reader);
+        iw.close();
+        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
+        assertThat(topDocs.totalHits.value, equalTo(1L));
+        String rawValue = Strings.arrayToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
+        
+        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, hiliteAnalyzer, null,
+                passageFormatter, locale,
+                breakIterator, rawValue, noMatchSize);
+        highlighter.setFieldMatcher((name) -> "text".equals(name));
+        final Snippet[] snippets =
+            highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
+        assertEquals(expectedPassages.length, snippets.length);
+        for (int i = 0; i < snippets.length; i++) {
+            assertEquals(expectedPassages[i], snippets[i].getText());
+        }
+        reader.close();
+        dir.close();
+    }
+    
+
+    public void testAnnotatedTextStructuredMatch() throws Exception {
+        // Check that a structured token, e.g. a URL, can be highlighted in a query
+        // on marked-up content using an "annotated_text" type field.
+        String url = "https://en.wikipedia.org/wiki/Key_Word_in_Context";
+        String encodedUrl = URLEncoder.encode(url, "UTF-8");
+        String annotatedWord = "[highlighting](" + encodedUrl + ")";
+        String highlightedAnnotatedWord = "[highlighting](" + AnnotatedPassageFormatter.SEARCH_HIT_TYPE + "=" + encodedUrl + "&"
+                + encodedUrl + ")";
+        final String[] markedUpInputs = { "This is a test. Just a test1 " + annotatedWord + " from [annotated](bar) highlighter.",
+                "This is the second " + annotatedWord + " value to perform highlighting on a longer text that gets scored lower." };
+
+        String[] expectedPassages = {
+                "This is a test. Just a test1 " + highlightedAnnotatedWord + " from [annotated](bar) highlighter.",
+                "This is the second " + highlightedAnnotatedWord + " value to perform highlighting on a"
+                        + " longer text that gets scored lower." };
+        Query query = new TermQuery(new Term("text", url));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+
+    public void testAnnotatedTextOverlapsWithUnstructuredSearchTerms() throws Exception {
+        final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore",
+                "Donald duck is a [Disney](Disney+Inc) invention" };
+
+        String[] expectedPassages = { "[Donald](_hit_term=donald) Trump visited Singapore",
+                "[Donald](_hit_term=donald) duck is a [Disney](Disney+Inc) invention" };
+        Query query = new TermQuery(new Term("text", "donald"));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+
+    public void testAnnotatedTextMultiFieldWithBreakIterator() throws Exception {
+        final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore. Kim shook hands with Donald",
+                "Donald duck is a [Disney](Disney+Inc) invention" };
+        String[] expectedPassages = { "[Donald](_hit_term=donald) Trump visited Singapore",
+                "Kim shook hands with [Donald](_hit_term=donald)",
+                "[Donald](_hit_term=donald) duck is a [Disney](Disney+Inc) invention" };
+        Query query = new TermQuery(new Term("text", "donald"));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        breakIterator = new SplittingBreakIterator(breakIterator, '.');
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+    
+    public void testAnnotatedTextSingleFieldWithBreakIterator() throws Exception {
+        final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore. Kim shook hands with Donald"};
+        String[] expectedPassages = { "[Donald](_hit_term=donald) Trump visited Singapore",
+                "Kim shook hands with [Donald](_hit_term=donald)"};
+        Query query = new TermQuery(new Term("text", "donald"));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        breakIterator = new SplittingBreakIterator(breakIterator, '.');
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+
+    public void testAnnotatedTextSingleFieldWithPhraseQuery() throws Exception {
+        final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore", 
+                "Donald Jr was with Melania Trump"};
+        String[] expectedPassages = { "[Donald](_hit_term=donald) [Trump](_hit_term=trump) visited Singapore"};
+        Query query = new PhraseQuery("text", "donald", "trump");
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+
+    public void testBadAnnotation() throws Exception {
+        final String[] markedUpInputs = { "Missing bracket for [Donald Trump](Donald+Trump visited Singapore"};
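+        // The malformed annotation is left as plain text; terms outside it
+        // still highlight normally.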
+        String[] expectedPassages = { "Missing bracket for [Donald Trump](Donald+Trump visited [Singapore](_hit_term=singapore)"};
+        Query query = new TermQuery(new Term("text", "singapore"));
+        BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+        assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
+    }
+
+}

+ 44 - 0
plugins/mapper-annotated-text/src/test/resources/rest-api-spec/test/mapper_annotatedtext/10_basic.yml

@@ -0,0 +1,44 @@
+# Integration tests for Mapper Annotated_text components
+#
+
+---
+"annotated highlighter on annotated text":
+  - skip:
+      version: " - 6.99.99"
+      reason: Annotated text type introduced in 7.0.0-alpha1
+
+  - do:
+      indices.create:
+        index: annotated
+        body:
+          settings:
+            number_of_shards: "1"
+            number_of_replicas: "0"
+          mappings:
+            doc:
+              properties:
+                text:
+                  type: annotated_text
+                entityID:
+                  type: keyword
+
+  - do:
+      index:
+        index: annotated
+        type: doc
+        body:
+            "text": "The [quick brown fox](entity_3789) is brown."
+            "entityID": "entity_3789"
+        refresh: true
+
+  - do:
+      search:
+        body: { "query" : {"term" : { "entityID" : "entity_3789" } }, "highlight" : { "type" : "annotated", "require_field_match": false, "fields" : { "text" : {} } } }
+
+  - match: {hits.hits.0.highlight.text.0: "The [quick brown fox](_hit_term=entity_3789&entity_3789) is brown."}
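+  # '&' joins the injected search-hit annotation (_hit_term=<value>) with the
+  # document's original annotation value.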
+
+  - do:
+      search:
+        body: { "query" : {"term" : { "text" : "quick" } }, "highlight" : { "type" : "annotated", "require_field_match": false, "fields" : { "text" : {} } } }
+
+  - match: {hits.hits.0.highlight.text.0: "The [quick](_hit_term=quick) brown fox is brown."}

+ 8 - 0
qa/vagrant/src/test/resources/packaging/tests/module_and_plugin_test_cases.bash

@@ -266,6 +266,10 @@ fi
     install_and_check_plugin mapper murmur3
 }
 
+@test "[$GROUP] install annotated-text mapper plugin" {
+    install_and_check_plugin mapper annotated-text
+}
+
 @test "[$GROUP] check reindex module" {
     check_module reindex
 }
@@ -380,6 +384,10 @@ fi
     remove_plugin mapper-murmur3
 }
 
+@test "[$GROUP] remove annotated-text mapper plugin" {
+    remove_plugin mapper-annotated-text
+}
+
 @test "[$GROUP] remove size mapper plugin" {
     remove_plugin mapper-size
 }

+ 16 - 3
server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightUtils.java

@@ -18,10 +18,13 @@
  */
 package org.elasticsearch.search.fetch.subphase.highlight;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.highlight.DefaultEncoder;
 import org.apache.lucene.search.highlight.Encoder;
 import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
 import org.elasticsearch.index.fieldvisitor.CustomFieldsVisitor;
+import org.elasticsearch.index.mapper.DocumentMapper;
+import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.search.fetch.FetchSubPhase;
 import org.elasticsearch.search.internal.SearchContext;
@@ -70,8 +73,18 @@ public final class HighlightUtils {
         return textsToHighlight;
     }
 
-    static class Encoders {
-        static final Encoder DEFAULT = new DefaultEncoder();
-        static final Encoder HTML = new SimpleHTMLEncoder();
+    public static class Encoders {
+        public static final Encoder DEFAULT = new DefaultEncoder();
+        public static final Encoder HTML = new SimpleHTMLEncoder();
     }
+
+    static Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
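+        // A keyword field may define a normalizer rather than an analyzer;
+        // prefer it so values are re-analyzed exactly as they were indexed.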
+        if (type instanceof KeywordFieldMapper.KeywordFieldType) {
+            KeywordFieldMapper.KeywordFieldType keywordFieldType = (KeywordFieldMapper.KeywordFieldType) type;
+            if (keywordFieldType.normalizer() != null) {
+                return keywordFieldType.normalizer();
+            }
+        }
+        return docMapper.mappers().indexAnalyzer();
+    }
 }
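
The `getAnalyzer` helper moved here from `UnifiedHighlighter` so that `PlainHighlighter` and the unified highlighter family can share it. A minimal usage sketch (the `context`, `hitContext` and `fieldType` variables are illustrative, not part of this change):

    // Resolve the analyzer used to re-analyze field values at highlight time.
    // Keyword fields with a normalizer get that normalizer, so highlighted
    // terms line up with the indexed (normalized) form.
    DocumentMapper docMapper = context.mapperService().documentMapper(hitContext.hit().getType());
    Analyzer analyzer = HighlightUtils.getAnalyzer(docMapper, fieldType);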

+ 1 - 2
server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java

@@ -49,7 +49,6 @@ import java.util.List;
 import java.util.Map;
 
 import static org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter.convertFieldValue;
-import static org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter.getAnalyzer;
 
 public class PlainHighlighter implements Highlighter {
     private static final String CACHE_KEY = "highlight-plain";
@@ -102,7 +101,7 @@ public class PlainHighlighter implements Highlighter {
         int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1 : field.fieldOptions().numberOfFragments();
         ArrayList<TextFragment> fragsList = new ArrayList<>();
         List<Object> textsToHighlight;
-        Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
+        Analyzer analyzer = HighlightUtils.getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
         final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
 
         try {

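The `UnifiedHighlighter` change below turns private helpers into `protected` extension points (`getPassageFormatter`, `getAnalyzer`, `loadFieldValues`, `getBreakIterator`, and friends), which is what allows the plugin's `AnnotatedTextHighlighter` to subclass it rather than copy it. A minimal sketch of the kind of subclass this enables; the class name and tags are hypothetical, while the plugin itself returns its `AnnotatedPassageFormatter` from this hook:

    import org.apache.lucene.search.highlight.Encoder;
    import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
    import org.apache.lucene.search.uhighlight.PassageFormatter;

    // Assumes the same package as UnifiedHighlighter / SearchContextHighlight.
    public class MarkTagHighlighter extends UnifiedHighlighter {
        @Override
        protected PassageFormatter getPassageFormatter(SearchContextHighlight.Field field, Encoder encoder) {
            // Hypothetical override: always wrap hits in <mark> tags instead
            // of the per-request pre/post tags.
            return new CustomPassageFormatter("<mark>", "</mark>", encoder);
        }
    }
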
+ 30 - 25
server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java

@@ -26,6 +26,7 @@ import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
 import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
 import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
 import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
+import org.apache.lucene.search.uhighlight.PassageFormatter;
 import org.apache.lucene.search.uhighlight.Snippet;
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource;
 import org.apache.lucene.util.BytesRef;
@@ -34,7 +35,6 @@ import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.text.Text;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.DocumentMapper;
-import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
@@ -54,7 +54,7 @@ public class UnifiedHighlighter implements Highlighter {
     public boolean canHighlight(MappedFieldType fieldType) {
         return true;
     }
 
     @Override
     public HighlightField highlight(HighlighterContext highlighterContext) {
         MappedFieldType fieldType = highlighterContext.fieldType;
@@ -62,23 +62,18 @@ public class UnifiedHighlighter implements Highlighter {
         SearchContext context = highlighterContext.context;
         FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
         Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
-        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
-            field.fieldOptions().postTags()[0], encoder);
         final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
 
         List<Snippet> snippets = new ArrayList<>();
         int numberOfFragments;
         try {
 
-            final Analyzer analyzer =
-                getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
-            List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldType, context, hitContext);
-            fieldValues = fieldValues.stream()
-                .map((s) -> convertFieldValue(fieldType, s))
-                .collect(Collectors.toList());
+            final Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), fieldType);
+            List<Object> fieldValues = loadFieldValues(fieldType, field, context, hitContext);
             if (fieldValues.size() == 0) {
                 return null;
             }
+            final PassageFormatter passageFormatter = getPassageFormatter(field, encoder);
             final IndexSearcher searcher = new IndexSearcher(hitContext.reader());
             final CustomUnifiedHighlighter highlighter;
             final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
@@ -145,7 +140,27 @@ public class UnifiedHighlighter implements Highlighter {
         return null;
     }
 
-    private BreakIterator getBreakIterator(SearchContextHighlight.Field field) {
+    protected PassageFormatter getPassageFormatter(SearchContextHighlight.Field field, Encoder encoder) {
+        return new CustomPassageFormatter(field.fieldOptions().preTags()[0],
+            field.fieldOptions().postTags()[0], encoder);
+    }
+
+    protected Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
+        return HighlightUtils.getAnalyzer(docMapper, type);
+    }
+
+    protected List<Object> loadFieldValues(MappedFieldType fieldType, SearchContextHighlight.Field field, SearchContext context,
+            FetchSubPhase.HitContext hitContext) throws IOException {
+        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldType, context, hitContext);
+        fieldValues = fieldValues.stream()
+            .map((s) -> convertFieldValue(fieldType, s))
+            .collect(Collectors.toList());
+        return fieldValues;
+    }
+
+    protected BreakIterator getBreakIterator(SearchContextHighlight.Field field) {
         final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions();
         final Locale locale =
             fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() :
@@ -168,7 +183,7 @@ public class UnifiedHighlighter implements Highlighter {
         }
     }
 
-    private static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
+    protected static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
 
         //We need to filter the snippets as due to no_match_size we could have
         //either highlighted snippets or non highlighted ones and we don't want to mix those up
@@ -203,17 +218,7 @@ public class UnifiedHighlighter implements Highlighter {
         return filteredSnippets;
     }
 
-    static Analyzer getAnalyzer(DocumentMapper docMapper, MappedFieldType type) {
-        if (type instanceof KeywordFieldMapper.KeywordFieldType) {
-            KeywordFieldMapper.KeywordFieldType keywordFieldType = (KeywordFieldMapper.KeywordFieldType) type;
-            if (keywordFieldType.normalizer() != null) {
-                return  keywordFieldType.normalizer();
-            }
-        }
-        return docMapper.mappers().indexAnalyzer();
-    }
-
-    static String convertFieldValue(MappedFieldType type, Object value) {
+    protected static String convertFieldValue(MappedFieldType type, Object value) {
         if (value instanceof BytesRef) {
             return type.valueForDisplay(value).toString();
         } else {
@@ -221,14 +226,14 @@ public class UnifiedHighlighter implements Highlighter {
         }
     }
 
-    private static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
+    protected static String mergeFieldValues(List<Object> fieldValues, char valuesSeparator) {
         //postings highlighter accepts all values in a single string, as offsets etc. need to match with content
         //loaded from stored fields, we merge all values using a proper separator
         String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(valuesSeparator));
         return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
     }
 
-    private OffsetSource getOffsetSource(MappedFieldType fieldType) {
+    protected OffsetSource getOffsetSource(MappedFieldType fieldType) {
         if (fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
             return fieldType.storeTermVectors() ? OffsetSource.POSTINGS_WITH_TERM_VECTORS : OffsetSource.POSTINGS;
         }