Ver código fonte

Added support for char filters in the analyze API

Closes #5148
Brusic 11 anos atrás
pai
commit
95274c18c5

+ 6 - 2
docs/reference/indices/analyze.asciidoc

@@ -12,12 +12,16 @@ analyzers:
 curl -XGET 'localhost:9200/_analyze?analyzer=standard' -d 'this is a test'
 --------------------------------------------------
 
-Or by building a custom transient analyzer out of tokenizers and
-filters:
+Or by building a custom transient analyzer out of tokenizers,
+token filters and char filters. Token filters can use the shorter 'filters'
+parameter name:
 
 [source,js]
 --------------------------------------------------
 curl -XGET 'localhost:9200/_analyze?tokenizer=keyword&filters=lowercase' -d 'this is a test'
+
+curl -XGET 'localhost:9200/_analyze?tokenizer=keyword&token_filters=lowercase&char_filters=html_strip' -d 'this is a <b>test</b>'
+
 --------------------------------------------------
 
 It can also run against a specific index:

+ 26 - 14
src/main/java/org/elasticsearch/action/admin/indices/analyze/AnalyzeRequest.java

@@ -18,9 +18,11 @@
  */
 package org.elasticsearch.action.admin.indices.analyze;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.action.ActionRequestValidationException;
 import org.elasticsearch.action.support.single.custom.SingleCustomOperationRequest;
 import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 
@@ -42,7 +44,9 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
 
     private String tokenizer;
 
-    private String[] tokenFilters;
+    private String[] tokenFilters = Strings.EMPTY_ARRAY;
+
+    private String[] charFilters = Strings.EMPTY_ARRAY;
 
     private String field;
 
@@ -110,6 +114,15 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
         return this.tokenFilters;
     }
 
+    public AnalyzeRequest charFilters(String... charFilters) {
+        this.charFilters = charFilters;
+        return this;
+    }
+
+    public String[] charFilters() {
+        return this.charFilters;
+    }
+
     public AnalyzeRequest field(String field) {
         this.field = field;
         return this;
@@ -125,6 +138,12 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
         if (text == null) {
             validationException = addValidationError("text is missing", validationException);
         }
+        if (tokenFilters == null) {
+            validationException = addValidationError("token filters must not be null", validationException);
+        }
+        if (charFilters == null) {
+            validationException = addValidationError("char filters must not be null", validationException);
+        }
         return validationException;
     }
 
@@ -135,12 +154,9 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
         text = in.readString();
         analyzer = in.readOptionalString();
         tokenizer = in.readOptionalString();
-        int size = in.readVInt();
-        if (size > 0) {
-            tokenFilters = new String[size];
-            for (int i = 0; i < size; i++) {
-                tokenFilters[i] = in.readString();
-            }
+        tokenFilters = in.readStringArray();
+        if (in.getVersion().onOrAfter(Version.V_1_1_0)) {
+            charFilters = in.readStringArray();
         }
         field = in.readOptionalString();
     }
@@ -152,13 +168,9 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
         out.writeString(text);
         out.writeOptionalString(analyzer);
         out.writeOptionalString(tokenizer);
-        if (tokenFilters == null) {
-            out.writeVInt(0);
-        } else {
-            out.writeVInt(tokenFilters.length);
-            for (String tokenFilter : tokenFilters) {
-                out.writeString(tokenFilter);
-            }
+        out.writeStringArray(tokenFilters);
+        if (out.getVersion().onOrAfter(Version.V_1_1_0)) {
+            out.writeStringArray(charFilters);
         }
         out.writeOptionalString(field);
     }

+ 8 - 0
src/main/java/org/elasticsearch/action/admin/indices/analyze/AnalyzeRequestBuilder.java

@@ -81,6 +81,14 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
         return this;
     }
 
+    /**
+     * Sets char filters that will be used before the tokenizer.
+     */
+    public AnalyzeRequestBuilder setCharFilters(String... charFilters) {
+        request.charFilters(charFilters);
+        return this;
+    }
+
     @Override
     protected void doExecute(ActionListener<AnalyzeResponse> listener) {
         ((IndicesAdminClient) client).analyze(request, listener);

+ 29 - 4
src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java

@@ -162,6 +162,7 @@ public class TransportAnalyzeAction extends TransportSingleCustomOperationAction
                     throw new ElasticsearchIllegalArgumentException("failed to find tokenizer under [" + request.tokenizer() + "]");
                 }
             }
+
             TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
             if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
                 tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
@@ -170,21 +171,45 @@ public class TransportAnalyzeAction extends TransportSingleCustomOperationAction
                     if (indexService == null) {
                         TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService.tokenFilterFactoryFactory(tokenFilterName);
                         if (tokenFilterFactoryFactory == null) {
-                            throw new ElasticsearchIllegalArgumentException("failed to find global token filter under [" + request.tokenizer() + "]");
+                            throw new ElasticsearchIllegalArgumentException("failed to find global token filter under [" + tokenFilterName + "]");
                         }
                         tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName, ImmutableSettings.Builder.EMPTY_SETTINGS);
                     } else {
                         tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
                         if (tokenFilterFactories[i] == null) {
-                            throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + request.tokenizer() + "]");
+                            throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + tokenFilterName + "]");
                         }
                     }
                     if (tokenFilterFactories[i] == null) {
-                        throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + request.tokenizer() + "]");
+                        throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + tokenFilterName + "]");
                     }
                 }
             }
-            analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);
+
+            CharFilterFactory[] charFilterFactories = new CharFilterFactory[0];
+            if (request.charFilters() != null && request.charFilters().length > 0) {
+                charFilterFactories = new CharFilterFactory[request.charFilters().length];
+                for (int i = 0; i < request.charFilters().length; i++) {
+                    String charFilterName = request.charFilters()[i];
+                    if (indexService == null) {
+                        CharFilterFactoryFactory charFilterFactoryFactory = indicesAnalysisService.charFilterFactoryFactory(charFilterName);
+                        if (charFilterFactoryFactory == null) {
+                            throw new ElasticsearchIllegalArgumentException("failed to find global char filter under [" + charFilterName + "]");
+                        }
+                        charFilterFactories[i] = charFilterFactoryFactory.create(charFilterName, ImmutableSettings.Builder.EMPTY_SETTINGS);
+                    } else {
+                        charFilterFactories[i] = indexService.analysisService().charFilter(charFilterName);
+                        if (charFilterFactories[i] == null) {
+                            throw new ElasticsearchIllegalArgumentException("failed to find char filter under [" + charFilterName + "]");
+                        }
+                    }
+                    if (charFilterFactories[i] == null) {
+                        throw new ElasticsearchIllegalArgumentException("failed to find char filter under [" + charFilterName + "]");
+                    }
+                }
+            }
+
+            analyzer = new CustomAnalyzer(tokenizerFactory, charFilterFactories, tokenFilterFactories);
             closeAnalyzer = true;
         } else if (analyzer == null) {
             if (indexService == null) {

+ 2 - 1
src/main/java/org/elasticsearch/rest/action/admin/indices/analyze/RestAnalyzeAction.java

@@ -70,7 +70,8 @@ public class RestAnalyzeAction extends BaseRestHandler {
         analyzeRequest.analyzer(request.param("analyzer"));
         analyzeRequest.field(request.param("field"));
         analyzeRequest.tokenizer(request.param("tokenizer"));
-        analyzeRequest.tokenFilters(request.paramAsStringArray("token_filters", request.paramAsStringArray("filters", null)));
+        analyzeRequest.tokenFilters(request.paramAsStringArray("token_filters", request.paramAsStringArray("filters", analyzeRequest.tokenFilters())));
+        analyzeRequest.charFilters(request.paramAsStringArray("char_filters", analyzeRequest.charFilters()));
         client.admin().indices().analyze(analyzeRequest, new ActionListener<AnalyzeResponse>() {
             @Override
             public void onResponse(AnalyzeResponse response) {

+ 45 - 0
src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionTests.java

@@ -29,6 +29,8 @@ import org.junit.Test;
 
 import java.io.IOException;
 
+import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.hamcrest.Matchers.equalTo;
 
 /**
@@ -106,6 +108,49 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
         analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("keyword").setTokenFilters("lowercase").execute().actionGet();
         assertThat(analyzeResponse.getTokens().size(), equalTo(1));
         assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));
+
+        analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").setTokenFilters("lowercase", "reverse").execute().actionGet();
+        assertThat(analyzeResponse.getTokens().size(), equalTo(4));
+        AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
+        assertThat(token.getTerm(), equalTo("siht"));
+        token = analyzeResponse.getTokens().get(1);
+        assertThat(token.getTerm(), equalTo("si"));
+        token = analyzeResponse.getTokens().get(2);
+        assertThat(token.getTerm(), equalTo("a"));
+        token = analyzeResponse.getTokens().get(3);
+        assertThat(token.getTerm(), equalTo("tset"));
+    }
+
+    @Test
+    public void analyzeWithCharFilters() throws Exception {
+
+        assertAcked(prepareCreate("test").setSettings(settingsBuilder()
+                .put(indexSettings())
+                .put("index.analysis.char_filter.custom_mapping.type", "mapping")
+                .putArray("index.analysis.char_filter.custom_mapping.mappings", "ph=>f", "qu=>q")
+                .put("index.analysis.analyzer.custom_with_char_filter.tokenizer", "standard")
+                .putArray("index.analysis.analyzer.custom_with_char_filter.char_filter", "custom_mapping")));
+        ensureGreen();
+
+        AnalyzeResponse analyzeResponse = client().admin().indices().prepareAnalyze("<h2><b>THIS</b> IS A</h2> <a href=\"#\">TEST</a>").setTokenizer("standard").setCharFilters("html_strip").execute().actionGet();
+        assertThat(analyzeResponse.getTokens().size(), equalTo(4));
+
+        analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A <b>TEST</b>").setTokenizer("keyword").setTokenFilters("lowercase").setCharFilters("html_strip").execute().actionGet();
+        assertThat(analyzeResponse.getTokens().size(), equalTo(1));
+        assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));
+
+        analyzeResponse = client().admin().indices().prepareAnalyze("test", "jeff quit phish").setTokenizer("keyword").setTokenFilters("lowercase").setCharFilters("custom_mapping").execute().actionGet();
+        assertThat(analyzeResponse.getTokens().size(), equalTo(1));
+        assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("jeff qit fish"));
+
+        analyzeResponse = client().admin().indices().prepareAnalyze("test", "<a href=\"#\">jeff quit fish</a>").setTokenizer("standard").setCharFilters("html_strip", "custom_mapping").execute().actionGet();
+        assertThat(analyzeResponse.getTokens().size(), equalTo(3));
+        AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
+        assertThat(token.getTerm(), equalTo("jeff"));
+        token = analyzeResponse.getTokens().get(1);
+        assertThat(token.getTerm(), equalTo("qit"));
+        token = analyzeResponse.getTokens().get(2);
+        assertThat(token.getTerm(), equalTo("fish"));
     }
 
     @Test