@@ -0,0 +1,426 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.search.aggregations.bucket;
+
+import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.action.search.SearchPhaseExecutionException;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.index.query.FilterBuilders;
+import org.elasticsearch.index.query.QueryParsingException;
+import org.elasticsearch.plugins.AbstractPlugin;
+import org.elasticsearch.search.aggregations.Aggregation;
+import org.elasticsearch.search.aggregations.Aggregations;
+import org.elasticsearch.search.aggregations.bucket.filter.FilterAggregationBuilder;
+import org.elasticsearch.search.aggregations.bucket.filter.InternalFilter;
+import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
+import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory;
+import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsBuilder;
+import org.elasticsearch.search.aggregations.bucket.significant.heuristics.*;
+import org.elasticsearch.search.aggregations.bucket.terms.StringTerms;
+import org.elasticsearch.search.aggregations.bucket.terms.Terms;
+import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder;
+import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+
+import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
+import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
+import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
+import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
+import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
+import static org.hamcrest.Matchers.*;
+/**
+ * Integration tests for the pluggable significance heuristics of the significant_terms
+ * aggregation, including a custom heuristic registered through a plugin.
+ */
+@ClusterScope(scope = Scope.SUITE)
+public class SignificantTermsSignificanceScoreTests extends ElasticsearchIntegrationTest {
+
+    static final String INDEX_NAME = "testidx";
+    static final String DOC_TYPE = "doc";
+    static final String TEXT_FIELD = "text";
+    static final String CLASS_FIELD = "class";
+
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal) {
+        return settingsBuilder()
+                .put("plugin.types", CustomSignificanceHeuristicPlugin.class.getName())
+                .put(super.nodeSettings(nodeOrdinal))
+                .build();
+    }
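+    // "plugin.types" makes every node of the suite-scoped test cluster load
+    // CustomSignificanceHeuristicPlugin, so the custom heuristic can be parsed and streamed on the
+    // data nodes as well as on the coordinating node.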
+
+    public String randomExecutionHint() {
+        return randomBoolean() ? null : randomFrom(SignificantTermsAggregatorFactory.ExecutionMode.values()).toString();
+    }
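+    // Returning null lets Elasticsearch pick the execution mode on its own; otherwise each run
+    // randomly exercises one of the available collection strategies.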
+
+    @Test
+    public void testPlugin() throws Exception {
+        String type = randomBoolean() ? "string" : "long";
+        String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
+        index01Docs(type, settings);
+        SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
+                .addAggregation(new TermsBuilder("class")
+                        .field(CLASS_FIELD)
+                        .subAggregation(new SignificantTermsBuilder("sig_terms")
+                                .field(TEXT_FIELD)
+                                .significanceHeuristic(new SimpleHeuristic.SimpleHeuristicBuilder())
+                                .minDocCount(1)
+                        )
+                )
+                .execute()
+                .actionGet();
+        assertSearchResponse(response);
+        StringTerms classes = (StringTerms) response.getAggregations().get("class");
+        assertThat(classes.getBuckets().size(), equalTo(2));
+        for (Terms.Bucket classBucket : classes.getBuckets()) {
+            Map<String, Aggregation> aggs = classBucket.getAggregations().asMap();
+            assertTrue(aggs.containsKey("sig_terms"));
+            SignificantTerms agg = (SignificantTerms) aggs.get("sig_terms");
+            assertThat(agg.getBuckets().size(), equalTo(2));
+            Iterator<SignificantTerms.Bucket> bucketIterator = agg.iterator();
+            SignificantTerms.Bucket sigBucket = bucketIterator.next();
+            String term = sigBucket.getKey();
+            String classTerm = classBucket.getKey();
+            assertTrue(term.equals(classTerm));
+            assertThat(sigBucket.getSignificanceScore(), closeTo(2.0, 1.e-8));
+            sigBucket = bucketIterator.next();
+            assertThat(sigBucket.getSignificanceScore(), closeTo(1.0, 1.e-8));
+        }
+
+        // we run the same test again, but this time do not call assertSearchResponse() before the assertions:
+        // assertSearchResponse() triggers toXContent, and we want to verify that rendering the response has no side effects
+
+        response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
+                .addAggregation(new TermsBuilder("class")
+                        .field(CLASS_FIELD)
+                        .subAggregation(new SignificantTermsBuilder("sig_terms")
+                                .field(TEXT_FIELD)
+                                .significanceHeuristic(new SimpleHeuristic.SimpleHeuristicBuilder())
+                                .minDocCount(1)
+                        )
+                )
+                .execute()
+                .actionGet();
+
+        classes = (StringTerms) response.getAggregations().get("class");
+        assertThat(classes.getBuckets().size(), equalTo(2));
+        for (Terms.Bucket classBucket : classes.getBuckets()) {
+            Map<String, Aggregation> aggs = classBucket.getAggregations().asMap();
+            assertTrue(aggs.containsKey("sig_terms"));
+            SignificantTerms agg = (SignificantTerms) aggs.get("sig_terms");
+            assertThat(agg.getBuckets().size(), equalTo(2));
+            Iterator<SignificantTerms.Bucket> bucketIterator = agg.iterator();
+            SignificantTerms.Bucket sigBucket = bucketIterator.next();
+            String term = sigBucket.getKey();
+            String classTerm = classBucket.getKey();
+            assertTrue(term.equals(classTerm));
+            assertThat(sigBucket.getSignificanceScore(), closeTo(2.0, 1.e-8));
+            sigBucket = bucketIterator.next();
+            assertThat(sigBucket.getSignificanceScore(), closeTo(1.0, 1.e-8));
+        }
+    }
+
+    public static class CustomSignificanceHeuristicPlugin extends AbstractPlugin {
+
+        @Override
+        public String name() {
+            return "test-plugin-significance-heuristic";
+        }
+
+        @Override
+        public String description() {
+            return "Significance heuristic plugin";
+        }
+
+        public void onModule(SignificantTermsHeuristicModule significanceModule) {
+            significanceModule.registerHeuristic(SimpleHeuristic.SimpleHeuristicParser.class, SimpleHeuristic.STREAM);
+        }
+    }
+
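+    // SimpleHeuristic is the custom heuristic the plugin registers: STREAM names it and reads the
+    // binary form used for node-to-node transport, SimpleHeuristicParser parses its JSON form in
+    // search requests, and SimpleHeuristicBuilder writes that JSON on the client side.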
+    public static class SimpleHeuristic implements SignificanceHeuristic {
+
+        protected static final String[] NAMES = {"simple"};
+
+        public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
+            @Override
+            public SignificanceHeuristic readResult(StreamInput in) throws IOException {
+                return readFrom(in);
+            }
+
+            @Override
+            public String getName() {
+                return NAMES[0];
+            }
+        };
+
+        public static SignificanceHeuristic readFrom(StreamInput in) throws IOException {
+            return new SimpleHeuristic();
+        }
+
+        /**
+         * @param subsetFreq   The frequency of the term in the selected sample
+         * @param subsetSize   The size of the selected sample (typically number of docs)
+         * @param supersetFreq The frequency of the term in the superset from which the sample was taken
+         * @param supersetSize The size of the superset from which the sample was taken (typically number of docs)
+         * @return a "significance" score
+         */
+        @Override
+        public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
+            return subsetFreq / subsetSize > supersetFreq / supersetSize ? 2.0 : 1.0;
+        }
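+        // Note that the division above is long division: subsetFreq / subsetSize truncates to 1
+        // only when the term occurs in every document of the subset and to 0 otherwise, and the
+        // background ratio (5/7 with the data from index01Docs()) truncates to 0. The term matching
+        // the class therefore scores 2.0 and the other term 1.0, which is what testPlugin() asserts.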
+
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            out.writeString(STREAM.getName());
+        }
+
+        public static class SimpleHeuristicParser implements SignificanceHeuristicParser {
+
+            @Override
+            public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException {
+                parser.nextToken();
+                return new SimpleHeuristic();
+            }
+
+            @Override
+            public String[] getNames() {
+                return NAMES;
+            }
+        }
+
+        public static class SimpleHeuristicBuilder implements SignificanceHeuristicBuilder {
+
+            @Override
+            public void toXContent(XContentBuilder builder) throws IOException {
+                builder.startObject(STREAM.getName()).endObject();
+            }
+        }
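+        // toXContent above renders the heuristic as an empty object under its registered name, so
+        // within the "significant_terms" body of a request it appears as
+        //     "simple": {}
+        // which is the token SimpleHeuristicParser consumes on the shard side.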
+    }
+
+    @Test
+    public void testXContentResponse() throws Exception {
+
+        String type = randomBoolean() ? "string" : "long";
+        String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
+        index01Docs(type, settings);
+        SearchResponse response = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
+                .addAggregation(new TermsBuilder("class").field(CLASS_FIELD).subAggregation(new SignificantTermsBuilder("sig_terms").field(TEXT_FIELD)))
+                .execute()
+                .actionGet();
+        assertSearchResponse(response);
+        StringTerms classes = (StringTerms) response.getAggregations().get("class");
+        assertThat(classes.getBuckets().size(), equalTo(2));
+        for (Terms.Bucket classBucket : classes.getBuckets()) {
+            Map<String, Aggregation> aggs = classBucket.getAggregations().asMap();
+            assertTrue(aggs.containsKey("sig_terms"));
+            SignificantTerms agg = (SignificantTerms) aggs.get("sig_terms");
+            assertThat(agg.getBuckets().size(), equalTo(1));
+            String term = agg.iterator().next().getKey();
+            String classTerm = classBucket.getKey();
+            assertTrue(term.equals(classTerm));
+        }
+
+        XContentBuilder responseBuilder = XContentFactory.jsonBuilder();
+        classes.toXContent(responseBuilder, null);
+        String result = null;
+        if (type.equals("long")) {
+            result = "\"class\"{\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":0,\"key_as_string\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":1,\"key_as_string\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        } else {
+            result = "\"class\"{\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}";
+        }
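+        // For reference: assuming the default heuristic is JLH, score = (subsetProb - supersetProb)
+        // * subsetProb / supersetProb, which gives (4/4 - 5/7) * (4/4) / (5/7) = 0.4 for term "0"
+        // and (3/3 - 4/7) * (3/3) / (4/7) = 0.75 for term "1", matching the strings above.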
+        assertThat(responseBuilder.string(), equalTo(result));
+    }
+
+    // compute the significance score in two ways:
+    // 1. a terms agg on the class field with a significant_terms sub-agg (the background is the whole index)
+    // 2. a filter agg per class with the background filter set to the other class (background_is_superset = false)
+    // both must yield exactly the same scores
+    @Test
+    public void testBackgroundVsSeparateSet() throws Exception {
+        String type = randomBoolean() ? "string" : "long";
+        String settings = "{\"index.number_of_shards\": 1, \"index.number_of_replicas\": 0}";
+        index01Docs(type, settings);
+        SearchResponse response1 = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
+                .addAggregation(new TermsBuilder("class")
+                        .field(CLASS_FIELD)
+                        .subAggregation(
+                                new SignificantTermsBuilder("sig_terms")
+                                        .field(TEXT_FIELD)
+                                        .minDocCount(1)
+                                        .significanceHeuristic(
+                                                new MutualInformation.MutualInformationBuilder(true, true))))
+                .execute()
+                .actionGet();
+        assertSearchResponse(response1);
+        SearchResponse response2 = client().prepareSearch(INDEX_NAME).setTypes(DOC_TYPE)
+                .addAggregation(new FilterAggregationBuilder("0")
+                        .filter(FilterBuilders.termFilter(CLASS_FIELD, "0"))
+                        .subAggregation(new SignificantTermsBuilder("sig_terms")
+                                .field(TEXT_FIELD)
+                                .minDocCount(1)
+                                .backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
+                                .significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
+                .addAggregation(new FilterAggregationBuilder("1")
+                        .filter(FilterBuilders.termFilter(CLASS_FIELD, "1"))
+                        .subAggregation(new SignificantTermsBuilder("sig_terms")
+                                .field(TEXT_FIELD)
+                                .minDocCount(1)
+                                .backgroundFilter(FilterBuilders.termFilter(CLASS_FIELD, "0"))
+                                .significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, false))))
+                .execute()
+                .actionGet();
+
+        SignificantTerms sigTerms0 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("0").getAggregations().asMap().get("sig_terms")));
+        assertThat(sigTerms0.getBuckets().size(), equalTo(2));
+        double score00Background = sigTerms0.getBucketByKey("0").getSignificanceScore();
+        double score01Background = sigTerms0.getBucketByKey("1").getSignificanceScore();
+        SignificantTerms sigTerms1 = ((SignificantTerms) (((StringTerms) response1.getAggregations().get("class")).getBucketByKey("1").getAggregations().asMap().get("sig_terms")));
+        double score10Background = sigTerms1.getBucketByKey("0").getSignificanceScore();
+        double score11Background = sigTerms1.getBucketByKey("1").getSignificanceScore();
+
+        double score00SeparateSets = ((SignificantTerms) ((InternalFilter) response2.getAggregations().get("0")).getAggregations().getAsMap().get("sig_terms")).getBucketByKey("0").getSignificanceScore();
+        double score01SeparateSets = ((SignificantTerms) ((InternalFilter) response2.getAggregations().get("0")).getAggregations().getAsMap().get("sig_terms")).getBucketByKey("1").getSignificanceScore();
+        double score10SeparateSets = ((SignificantTerms) ((InternalFilter) response2.getAggregations().get("1")).getAggregations().getAsMap().get("sig_terms")).getBucketByKey("0").getSignificanceScore();
+        double score11SeparateSets = ((SignificantTerms) ((InternalFilter) response2.getAggregations().get("1")).getAggregations().getAsMap().get("sig_terms")).getBucketByKey("1").getSignificanceScore();
+
+        assertThat(score00Background, equalTo(score00SeparateSets));
+        assertThat(score01Background, equalTo(score01SeparateSets));
+        assertThat(score10Background, equalTo(score10SeparateSets));
+        assertThat(score11Background, equalTo(score11SeparateSets));
+    }
+
+    private void index01Docs(String type, String settings) throws ExecutionException, InterruptedException {
+        String mappings = "{\"doc\": {\"properties\":{\"text\": {\"type\":\"" + type + "\"}}}}";
+        assertAcked(prepareCreate(INDEX_NAME).setSettings(settings).addMapping("doc", mappings));
+        String[] gb = {"0", "1"};
+        List<IndexRequestBuilder> indexRequestBuilderList = new ArrayList<>();
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "1")
+                .setSource(TEXT_FIELD, "1", CLASS_FIELD, "1"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "2")
+                .setSource(TEXT_FIELD, "1", CLASS_FIELD, "1"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "3")
+                .setSource(TEXT_FIELD, "0", CLASS_FIELD, "0"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "4")
+                .setSource(TEXT_FIELD, "0", CLASS_FIELD, "0"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "5")
+                .setSource(TEXT_FIELD, gb, CLASS_FIELD, "1"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "6")
+                .setSource(TEXT_FIELD, gb, CLASS_FIELD, "0"));
+        indexRequestBuilderList.add(client().prepareIndex(INDEX_NAME, DOC_TYPE, "7")
+                .setSource(TEXT_FIELD, "0", CLASS_FIELD, "0"));
+        indexRandom(true, indexRequestBuilderList);
+    }
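+    // Resulting contingency table: class "0" has four docs, all containing term "0", one of which
+    // (doc 6) also contains term "1"; class "1" has three docs, all containing term "1", one of
+    // which (doc 5) also contains term "0". Hence the doc_count/bg_count pairs of 4/5 for term "0"
+    // and 3/4 for term "1" that testXContentResponse() expects.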
+
+    @Test
+    public void testMutualInformationEqual() throws Exception {
+        indexEqualTestData();
+        // with include_negatives = true, mutual information is symmetric in the two classes, so both class buckets must yield the same scores
+        SearchResponse response = client().prepareSearch("test")
+                .addAggregation(new TermsBuilder("class").field("class").subAggregation(new SignificantTermsBuilder("mySignificantTerms")
+                        .field("text")
+                        .executionHint(randomExecutionHint())
+                        .significanceHeuristic(new MutualInformation.MutualInformationBuilder(true, true))
+                        .minDocCount(1).shardSize(1000).size(1000)))
+                .execute()
+                .actionGet();
+        assertSearchResponse(response);
+        StringTerms classes = (StringTerms) response.getAggregations().get("class");
+        assertThat(classes.getBuckets().size(), equalTo(2));
+        Iterator<Terms.Bucket> classBuckets = classes.getBuckets().iterator();
+        Collection<SignificantTerms.Bucket> classA = ((SignificantTerms) classBuckets.next().getAggregations().get("mySignificantTerms")).getBuckets();
+        Iterator<SignificantTerms.Bucket> classBBucketIterator = ((SignificantTerms) classBuckets.next().getAggregations().get("mySignificantTerms")).getBuckets().iterator();
+        assertThat(classA.size(), greaterThan(0));
+        for (SignificantTerms.Bucket classABucket : classA) {
+            SignificantTerms.Bucket classBBucket = classBBucketIterator.next();
+            assertThat(classABucket.getKey(), equalTo(classBBucket.getKey()));
+            assertThat(classABucket.getSignificanceScore(), closeTo(classBBucket.getSignificanceScore(), 1.e-5));
+        }
+    }
+
+    private void indexEqualTestData() throws ExecutionException, InterruptedException {
+        assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 1, SETTING_NUMBER_OF_REPLICAS, 0).addMapping("doc",
+                "text", "type=string", "class", "type=string"));
+        createIndex("idx_unmapped");
+
+        ensureGreen();
+        String[] data = {
+                "A\ta",
+                "A\ta",
+                "A\tb",
+                "A\tb",
+                "A\tb",
+                "B\tc",
+                "B\tc",
+                "B\tc",
+                "B\tc",
+                "B\td",
+                "B\td",
+                "B\td",
+                "B\td",
+                "B\td",
+                "A\tc d",
+                "B\ta b"
+        };
+
+        List<IndexRequestBuilder> indexRequestBuilders = new ArrayList<>();
+        for (int i = 0; i < data.length; i++) {
+            String[] parts = data[i].split("\t");
+            indexRequestBuilders.add(client().prepareIndex("test", "doc", "" + i)
+                    .setSource("class", parts[0], "text", parts[1]));
+        }
+        indexRandom(true, indexRequestBuilders);
+    }
+}