
Analysis: Add multi-valued text support

Add support for text as a multi-valued array in AnalyzeRequestBuilder
Add support for text as a multi-valued array in the Analyze REST API
Add docs

Closes #3023
Jun Ohtani 10 years ago
parent
commit
3a1a4d3e89
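
The snippet below is a hedged usage sketch, not part of this commit: it shows how the multi-valued builder API introduced here might be called from the Java client, using only methods visible in the diff (prepareAnalyze(), setText(String...), setIndex, setField, and the AnalyzeResponse getters exercised by the tests). The index name "test", the field name "simple", and the client variable are illustrative assumptions.

    import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
    import org.elasticsearch.client.Client;

    public class AnalyzeMultiValueExample {
        // Illustrative only: "test" and "simple" are assumed index/field names.
        static void analyzeMultipleValues(Client client) {
            AnalyzeResponse response = client.admin().indices()
                    .prepareAnalyze()                               // no-arg overload added in this commit
                    .setIndex("test")
                    .setField("simple")
                    .setText("this is a test", "the second text")  // new setText(String...) on AnalyzeRequestBuilder
                    .get();
            for (AnalyzeResponse.AnalyzeToken token : response.getTokens()) {
                System.out.println(token.getTerm() + " pos=" + token.getPosition()
                        + " [" + token.getStartOffset() + "," + token.getEndOffset() + "]");
            }
        }
    }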

+ 13 - 0
docs/reference/indices/analyze.asciidoc

@@ -18,6 +18,19 @@ curl -XGET 'localhost:9200/_analyze' -d '
 
 coming[2.0.0, body based parameters were added in 2.0.0]
 
+If the text parameter is provided as an array of strings, it is analyzed as a multi-valued field.
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/_analyze' -d '
+{
+  "analyzer" : "standard",
+  "text" : ["this is a test", "the second text"]
+}'
+--------------------------------------------------
+
+coming[2.0.0, body based parameters were added in 2.0.0]
+
 Or by building a custom transient analyzer out of tokenizers,
 token filters and char filters. Token filters can use the shorter 'filters'
 parameter name:

+ 1 - 1
rest-api-spec/api/indices.analyze.json

@@ -37,7 +37,7 @@
           "description" : "With `true`, specify that a local shard should be used if available, with `false`, use a random shard (default: true)"
         },
         "text": {
-          "type" : "string",
+          "type" : "list",
           "description" : "The text on which the analysis should be performed (when request body is not used)"
         },
         "tokenizer": {

+ 8 - 0
rest-api-spec/test/indices.analyze/10_analyze.yaml

@@ -63,3 +63,11 @@ setup:
           body: { "text": "Bar Foo", "filters": ["lowercase"], "tokenizer": keyword }
     - length: {tokens: 1 }
     - match:     { tokens.0.token: bar foo }
+---
+"Array text":
+    - do:
+        indices.analyze:
+          body: { "text": ["Foo Bar", "Baz"], "filters": ["lowercase"], "tokenizer": keyword }
+    - length: {tokens: 2 }
+    - match:     { tokens.0.token: foo bar }
+    - match:     { tokens.1.token: baz }

+ 6 - 6
src/main/java/org/elasticsearch/action/admin/indices/analyze/AnalyzeRequest.java

@@ -36,7 +36,7 @@ import static org.elasticsearch.action.ValidateActions.addValidationError;
  */
 public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> {
 
-    private String text;
+    private String[] text;
 
     private String analyzer;
 
@@ -61,11 +61,11 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
         this.index(index);
     }
 
-    public String text() {
+    public String[] text() {
         return this.text;
     }
 
-    public AnalyzeRequest text(String text) {
+    public AnalyzeRequest text(String... text) {
         this.text = text;
         return this;
     }
@@ -118,7 +118,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
     @Override
     public ActionRequestValidationException validate() {
         ActionRequestValidationException validationException = super.validate();
-        if (text == null) {
+        if (text == null || text.length == 0) {
             validationException = addValidationError("text is missing", validationException);
         }
         if (tokenFilters == null) {
@@ -133,7 +133,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
     @Override
     public void readFrom(StreamInput in) throws IOException {
         super.readFrom(in);
-        text = in.readString();
+        text = in.readStringArray();
         analyzer = in.readOptionalString();
         tokenizer = in.readOptionalString();
         tokenFilters = in.readStringArray();
@@ -144,7 +144,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         super.writeTo(out);
-        out.writeString(text);
+        out.writeStringArray(text);
         out.writeOptionalString(analyzer);
         out.writeOptionalString(tokenizer);
         out.writeStringArray(tokenFilters);

+ 9 - 1
src/main/java/org/elasticsearch/action/admin/indices/analyze/AnalyzeRequestBuilder.java

@@ -30,7 +30,7 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
         super(client, action, new AnalyzeRequest());
     }
 
-    public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String text) {
+    public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String... text) {
         super(client, action, new AnalyzeRequest(index).text(text));
     }
 
@@ -86,4 +86,12 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
         request.charFilters(charFilters);
         return this;
     }
+
+    /**
+     * Sets texts to analyze
+     */
+    public AnalyzeRequestBuilder setText(String... texts) {
+        request.text(texts);
+        return this;
+    }
 }

+ 34 - 28
src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java

@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.IOUtils;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.support.ActionFilters;
@@ -210,36 +211,41 @@ public class TransportAnalyzeAction extends TransportSingleCustomOperationAction
 
         List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
         TokenStream stream = null;
-        try {
-            stream = analyzer.tokenStream(field, request.text());
-            stream.reset();
-            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
-            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
-            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
-            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
-
-            int position = -1;
-            while (stream.incrementToken()) {
-                int increment = posIncr.getPositionIncrement();
-                if (increment > 0) {
-                    position = position + increment;
-                }
-                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type()));
-            }
-            stream.end();
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to analyze", e);
-        } finally {
-            if (stream != null) {
-                try {
-                    stream.close();
-                } catch (IOException e) {
-                    // ignore
+        int lastPosition = -1;
+        int lastOffset = 0;
+        for (String text : request.text()) {
+            try {
+                stream = analyzer.tokenStream(field, text);
+                stream.reset();
+                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
+                PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
+                OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
+                TypeAttribute type = stream.addAttribute(TypeAttribute.class);
+
+                while (stream.incrementToken()) {
+                    int increment = posIncr.getPositionIncrement();
+                    if (increment > 0) {
+                        lastPosition = lastPosition + increment;
+                    }
+                    tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type()));
+
                 }
+                stream.end();
+                lastOffset += offset.endOffset();
+                lastPosition += posIncr.getPositionIncrement();
+
+                lastPosition += analyzer.getPositionIncrementGap(field);
+                lastOffset += analyzer.getOffsetGap(field);
+
+            } catch (IOException e) {
+                throw new ElasticsearchException("failed to analyze", e);
+            } finally {
+                IOUtils.closeWhileHandlingException(stream);
             }
-            if (closeAnalyzer) {
-                analyzer.close();
-            }
+        }
+
+        if (closeAnalyzer) {
+            analyzer.close();
         }
 
         return new AnalyzeResponse(tokens);
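
To make the position/offset bookkeeping in the loop above easier to follow, here is a self-contained sketch against the plain Lucene analysis API (an assumption: Lucene 5.x, not Elasticsearch code) that mirrors the same accumulation: token positions and offsets from each value are shifted by the running totals, and getPositionIncrementGap()/getOffsetGap() add the gap between values. The field name "f" and StandardAnalyzer are illustrative.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

    public class MultiValueAnalyzeSketch {
        public static void main(String[] args) throws Exception {
            String[] texts = {"this is a test", "the second text"};
            try (Analyzer analyzer = new StandardAnalyzer()) {
                int lastPosition = -1;
                int lastOffset = 0;
                for (String text : texts) {
                    try (TokenStream stream = analyzer.tokenStream("f", text)) {
                        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
                        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
                        stream.reset();
                        while (stream.incrementToken()) {
                            lastPosition += posIncr.getPositionIncrement();
                            // offsets of later values are shifted by the length of earlier ones
                            System.out.println(term.toString() + " pos=" + lastPosition
                                    + " [" + (lastOffset + offset.startOffset())
                                    + "," + (lastOffset + offset.endOffset()) + "]");
                        }
                        stream.end();
                        // carry the running totals into the next value, plus the analyzer's gaps,
                        // matching what the patched TransportAnalyzeAction does per value
                        lastOffset += offset.endOffset();
                        lastPosition += posIncr.getPositionIncrement();
                        lastPosition += analyzer.getPositionIncrementGap("f");
                        lastOffset += analyzer.getOffsetGap("f");
                    }
                }
            }
        }
    }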

+ 6 - 0
src/main/java/org/elasticsearch/client/IndicesAdminClient.java

@@ -587,6 +587,12 @@ public interface IndicesAdminClient extends ElasticsearchClient {
      */
     AnalyzeRequestBuilder prepareAnalyze(String text);
 
+    /**
+     * Analyze text/texts.
+     *
+     */
+    AnalyzeRequestBuilder prepareAnalyze();
+
     /**
      * Puts an index template.
      */

+ 5 - 0
src/main/java/org/elasticsearch/client/support/AbstractClient.java

@@ -1478,6 +1478,11 @@ public abstract class AbstractClient extends AbstractComponent implements Client
             return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE, null, text);
         }
 
+        @Override
+        public AnalyzeRequestBuilder prepareAnalyze() {
+            return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE);
+        }
+
         @Override
         public ActionFuture<PutIndexTemplateResponse> putTemplate(final PutIndexTemplateRequest request) {
             return execute(PutIndexTemplateAction.INSTANCE, request);

+ 18 - 8
src/main/java/org/elasticsearch/rest/action/admin/indices/analyze/RestAnalyzeAction.java

@@ -22,6 +22,7 @@ import com.google.common.collect.Lists;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
 import org.elasticsearch.client.Client;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
@@ -55,10 +56,10 @@ public class RestAnalyzeAction extends BaseRestHandler {
     @Override
     public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) {
 
-        String text = request.param("text");
+        String[] texts = request.paramAsStringArrayOrEmptyIfAll("text");
 
         AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index"));
-        analyzeRequest.text(text);
+        analyzeRequest.text(texts);
         analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard()));
         analyzeRequest.analyzer(request.param("analyzer"));
         analyzeRequest.field(request.param("field"));
@@ -69,9 +70,9 @@ public class RestAnalyzeAction extends BaseRestHandler {
         if (RestActions.hasBodyContent(request)) {
             XContentType type = RestActions.guessBodyContentType(request);
             if (type == null) {
-                if (text == null) {
-                    text = RestActions.getRestContent(request).toUtf8();
-                    analyzeRequest.text(text);
+                if (texts == null || texts.length == 0) {
+                    texts = new String[]{ RestActions.getRestContent(request).toUtf8() };
+                    analyzeRequest.text(texts);
                 }
             } else {
                 // NOTE: if rest request with xcontent body has request parameters, the parameters does not override xcontent values
@@ -95,7 +96,16 @@ public class RestAnalyzeAction extends BaseRestHandler {
                     } else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) {
                         analyzeRequest.preferLocal(parser.booleanValue());
                     } else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
-                            analyzeRequest.text(parser.text());
+                        analyzeRequest.text(parser.text());
+                    } else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
+                        List<String> texts = Lists.newArrayList();
+                        while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
+                            if (token.isValue() == false) {
+                                throw new IllegalArgumentException(currentFieldName + " array element should only contain text");
+                            }
+                            texts.add(parser.text());
+                        }
+                        analyzeRequest.text(texts.toArray(Strings.EMPTY_ARRAY));
                     } else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
                         analyzeRequest.analyzer(parser.text());
                     } else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
@@ -110,7 +120,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
                             }
                             filters.add(parser.text());
                         }
-                        analyzeRequest.tokenFilters(filters.toArray(new String[0]));
+                        analyzeRequest.tokenFilters(filters.toArray(Strings.EMPTY_ARRAY));
                     } else if ("char_filters".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
                         List<String> charFilters = Lists.newArrayList();
                         while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
@@ -119,7 +129,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
                             }
                             charFilters.add(parser.text());
                         }
-                        analyzeRequest.tokenFilters(charFilters.toArray(new String[0]));
+                        analyzeRequest.tokenFilters(charFilters.toArray(Strings.EMPTY_ARRAY));
                     } else {
                         throw new IllegalArgumentException("Unknown parameter [" + currentFieldName + "] in request body or parameter is of the wrong type[" + token + "] ");
                     }

+ 34 - 14
src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionTests.java

@@ -158,18 +158,7 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
         ensureGreen();
 
         client().admin().indices().preparePutMapping("test")
-                .setType("document").setSource(
-                "{\n" +
-                        "    \"document\":{\n" +
-                        "        \"properties\":{\n" +
-                        "            \"simple\":{\n" +
-                        "                \"type\":\"string\",\n" +
-                        "                \"analyzer\": \"simple\"\n" +
-                        "            }\n" +
-                        "        }\n" +
-                        "    }\n" +
-                        "}"
-        ).get();
+                .setType("document").setSource("simple", "type=string,analyzer=simple").get();
 
         for (int i = 0; i < 10; i++) {
             final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze("THIS IS A TEST");
@@ -220,7 +209,8 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
 
         RestAnalyzeAction.buildFromContent(content, analyzeRequest);
 
-        assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST"));
+        assertThat(analyzeRequest.text().length, equalTo(1));
+        assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"}));
         assertThat(analyzeRequest.tokenizer(), equalTo("keyword"));
         assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"}));
     }
@@ -239,7 +229,6 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
         }
     }
 
-
     @Test
     public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception {
         AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test");
@@ -258,4 +247,35 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
         }
     }
 
+    @Test
+    public void analyzerWithMultiValues() throws Exception {
+
+        assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
+        ensureGreen();
+
+        client().admin().indices().preparePutMapping("test")
+            .setType("document").setSource("simple", "type=string,analyzer=simple,position_offset_gap=100").get();
+
+        String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"};
+
+        final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze();
+        requestBuilder.setText(texts);
+        requestBuilder.setIndex(indexOrAlias());
+        requestBuilder.setField("simple");
+        AnalyzeResponse analyzeResponse = requestBuilder.get();
+        assertThat(analyzeResponse.getTokens().size(), equalTo(7));
+        AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3);
+        assertThat(token.getTerm(), equalTo("test"));
+        assertThat(token.getPosition(), equalTo(3));
+        assertThat(token.getStartOffset(), equalTo(10));
+        assertThat(token.getEndOffset(), equalTo(14));
+
+        token = analyzeResponse.getTokens().get(5);
+        assertThat(token.getTerm(), equalTo("second"));
+        assertThat(token.getPosition(), equalTo(105));
+        assertThat(token.getStartOffset(), equalTo(19));
+        assertThat(token.getEndOffset(), equalTo(25));
+
+    }
+
 }