
Term Vectors: terms filtering

This adds a new feature to the Term Vectors API which allows for filtering of
terms based on their tf-idf scores. With the `dfs` option on, this can be
useful for finding a good characteristic vector of a document or a set of
documents. The parameters are similar to the ones used in the MLT Query.

Closes #9561
Alex Ksikes 10 years ago
parent
commit
d339ee4005

+ 100 - 0
docs/reference/docs/termvectors.asciidoc

@@ -85,6 +85,34 @@ Setting `dfs` to `true` (default is `false`) will return the term statistics
 or the field statistics of the entire index, and not just at the shard. Use it
 with caution as distributed frequencies can have a serious performance impact.
 
+[float]
+==== Terms Filtering coming[2.0]
+
+With the parameter `filter`, the terms returned can also be filtered based
+on their tf-idf scores. This can be useful for finding a good characteristic
+vector of a document. This feature works in a similar manner to the
+<<mlt-query-term-selection,second phase>> of the
+<<query-dsl-mlt-query,More Like This Query>>. See <<docs-termvectors-terms-filtering,example 5>>
+for usage.
+
+The following sub-parameters are supported:
+
+[horizontal]
+`max_num_terms`::
+  Maximum number of terms to be returned per field. Defaults to `25`.
+`min_term_freq`::
+  Ignore words with less than this frequency in the source doc. Defaults to `1`.
+`max_term_freq`::
+  Ignore words with more than this frequency in the source doc. Defaults to unbounded.
+`min_doc_freq`::
+  Ignore terms which do not occur in at least this many docs. Defaults to `1`.
+`max_doc_freq`::
+  Ignore words which occur in more than this many docs. Defaults to unbounded.
+`min_word_length`::
+  The minimum word length below which words will be ignored. Defaults to `0`.
+`max_word_length`::
+  The maximum word length above which words will be ignored. Defaults to unbounded (`0`).
+
 [float]
 === Behaviour
 
@@ -337,3 +365,75 @@ Response:
   }
 }
 --------------------------------------------------
+
+[float]
+[[docs-termvectors-terms-filtering]]
+=== Example 5
+
+Finally, the terms returned can be filtered based on their tf-idf scores. In
+the example below, we obtain the three most "interesting" keywords from an
+artificial document with the given "plot" field value. Additionally, we ask
+for distributed frequencies to obtain more accurate results. Notice that the
+keyword "Tony" and the stop words are not part of the response, as their
+tf-idf scores are too low.
+
+[source,js]
+--------------------------------------------------
+GET /imdb/movies/_termvectors
+{
+    "doc": {
+      "plot": "When wealthy industrialist Tony Stark is forced to build an armored suit after a life-threatening incident, he ultimately decides to use its technology to fight against evil."
+    },
+    "term_statistics" : true,
+    "field_statistics" : true,
+    "dfs": true,
+    "positions": false,
+    "offsets": false,
+    "filter" : {
+      "max_num_terms" : 3,
+      "min_term_freq" : 1,
+      "min_doc_freq" : 1
+    }
+}
+--------------------------------------------------
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+   "_index": "imdb",
+   "_type": "movies",
+   "_version": 0,
+   "found": true,
+   "term_vectors": {
+      "plot": {
+         "field_statistics": {
+            "sum_doc_freq": 3384269,
+            "doc_count": 176214,
+            "sum_ttf": 3753460
+         },
+         "terms": {
+            "armored": {
+               "doc_freq": 27,
+               "ttf": 27,
+               "term_freq": 1,
+               "score": 9.74725
+            },
+            "industrialist": {
+               "doc_freq": 88,
+               "ttf": 88,
+               "term_freq": 1,
+               "score": 8.590818
+            },
+            "stark": {
+               "doc_freq": 44,
+               "ttf": 47,
+               "term_freq": 1,
+               "score": 9.272792
+            }
+         }
+      }
+   }
+}
+--------------------------------------------------
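
The same filter can be applied through the Java API added in this change. Below is a hedged sketch mirroring example 5 against a stored document ("imdb"/"movies"/"1" are illustrative placeholders); the builder methods used here all appear in this commit or its tests.

[source,java]
--------------------------------------------------
// Sketch: request the top 3 terms by tf-idf score for the "plot" field.
TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
filterSettings.maxNumTerms = 3;
filterSettings.minTermFreq = 1;
filterSettings.minDocFreq = 1;

TermVectorsResponse response = client.prepareTermVectors("imdb", "movies", "1")
        .setSelectedFields("plot")
        .setTermStatistics(true)
        .setFieldStatistics(true)
        .setFilterSettings(filterSettings)
        .get();
--------------------------------------------------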

+ 1 - 0
docs/reference/query-dsl/queries/mlt-query.asciidoc

@@ -178,6 +178,7 @@ The text to find documents like it.
 A list of documents following the same syntax as the <<docs-multi-get,Multi GET API>>.
 
 [float]
+[[mlt-query-term-selection]]
 ==== Term Selection Parameters
 
 [horizontal]

+ 15 - 4
src/main/java/org/elasticsearch/action/termvectors/TermVectorsFields.java

@@ -21,7 +21,11 @@ package org.elasticsearch.action.termvectors;
 
 import com.carrotsearch.hppc.ObjectLongOpenHashMap;
 import com.carrotsearch.hppc.cursors.ObjectLongCursor;
-import org.apache.lucene.index.*;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BoostAttribute;
 import org.apache.lucene.util.*;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.io.stream.BytesStreamInput;
@@ -40,7 +44,7 @@ import static org.apache.lucene.util.ArrayUtil.grow;
 * <tt>-1</tt>, if no positions were returned by the {@link TermVectorsRequest}.
  * <p/>
  * The data is stored in two byte arrays ({@code headerRef} and
- * {@code termVectors}, both {@link ByteRef}) that have the following format:
+ * {@code termVectors}, both {@link BytesRef}) that have the following format:
  * <p/>
  * {@code headerRef}: Stores offsets per field in the {@code termVectors} array
  * and some header information as {@link BytesRef}. Format is
@@ -113,6 +117,7 @@ public final class TermVectorsFields extends Fields {
     private final BytesReference termVectors;
     final boolean hasTermStatistic;
     final boolean hasFieldStatistic;
+    public final boolean hasScores;
 
     /**
      * @param headerRef   Stores offsets per field in the {@code termVectors} and some
@@ -130,6 +135,7 @@ public final class TermVectorsFields extends Fields {
         assert version == -1;
         hasTermStatistic = header.readBoolean();
         hasFieldStatistic = header.readBoolean();
+        hasScores = header.readBoolean();
         final int numFields = header.readVInt();
         for (int i = 0; i < numFields; i++) {
             fieldMap.put((header.readString()), header.readVLong());
@@ -226,6 +232,7 @@ public final class TermVectorsFields extends Fields {
                 int[] endOffsets = new int[1];
                 BytesRefBuilder[] payloads = new BytesRefBuilder[1];
                 final BytesRefBuilder spare = new BytesRefBuilder();
+                BoostAttribute boostAtt = this.attributes().addAttribute(BoostAttribute.class);
 
                 @Override
                 public BytesRef next() throws IOException {
@@ -250,6 +257,11 @@ public final class TermVectorsFields extends Fields {
                         // currentPosition etc. so that we can just iterate
                         // later
                         writeInfos(perFieldTermVectorInput);
+
+                        // read the score if available
+                        if (hasScores) {
+                            boostAtt.setBoost(perFieldTermVectorInput.readFloat());
+                        }
                         return spare.get();
 
                     } else {
@@ -482,5 +494,4 @@ public final class TermVectorsFields extends Fields {
     long readPotentiallyNegativeVLong(BytesStreamInput stream) throws IOException {
         return stream.readVLong() - 1;
     }
-
-}
+}
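
Because the score travels through Lucene's `BoostAttribute` rather than a dedicated accessor, a consumer of the deserialized fields reads it off the `TermsEnum` while iterating. A hedged sketch (assuming a response produced with a filter, so that `hasScores` is set):

[source,java]
--------------------------------------------------
// Sketch: iterate a filtered field and read each term's score via the
// BoostAttribute that TermVectorsFields registers on its TermsEnum.
Terms terms = response.getFields().terms("plot");
TermsEnum termsEnum = terms.iterator();
BoostAttribute boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
BytesRef term;
while ((term = termsEnum.next()) != null) {
    System.out.println(term.utf8ToString() + " score=" + boostAtt.getBoost());
}
--------------------------------------------------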

+ 325 - 0
src/main/java/org/elasticsearch/action/termvectors/TermVectorsFilter.java

@@ -0,0 +1,325 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.action.termvectors;
+
+import com.google.common.util.concurrent.AtomicLongMap;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.search.dfs.AggregatedDfs;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+public class TermVectorsFilter {
+    public static final int DEFAULT_MAX_QUERY_TERMS = 25;
+    public static final int DEFAULT_MIN_TERM_FREQ = 0;
+    public static final int DEFAULT_MAX_TERM_FREQ = Integer.MAX_VALUE;
+    public static final int DEFAULT_MIN_DOC_FREQ = 0;
+    public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
+    public static final int DEFAULT_MIN_WORD_LENGTH = 0;
+    public static final int DEFAULT_MAX_WORD_LENGTH = 0;
+
+    private int maxNumTerms = DEFAULT_MAX_QUERY_TERMS;
+    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+    private int maxTermFreq = DEFAULT_MAX_TERM_FREQ;
+    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
+    private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
+    private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
+    private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;
+
+    private Fields fields;
+    private Fields topLevelFields;
+    private final Set<String> selectedFields;
+    private AggregatedDfs dfs;
+    private Map<Term, ScoreTerm> scoreTerms;
+    private AtomicLongMap<String> sizes;
+    private TFIDFSimilarity similarity;
+
+    public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
+        this.fields = termVectorsByField;
+        this.topLevelFields = topLevelFields;
+        this.selectedFields = selectedFields;
+
+        this.dfs = dfs;
+        this.scoreTerms = new HashMap<>();
+        this.sizes = AtomicLongMap.create();
+        this.similarity = new DefaultSimilarity();
+    }
+
+    public void setSettings(TermVectorsRequest.FilterSettings settings) {
+        if (settings.maxNumTerms != null) {
+            setMaxNumTerms(settings.maxNumTerms);
+        }
+        if (settings.minTermFreq != null) {
+            setMinTermFreq(settings.minTermFreq);
+        }
+        if (settings.maxTermFreq != null) {
+            setMaxTermFreq(settings.maxTermFreq);
+        }
+        if (settings.minDocFreq != null) {
+            setMinDocFreq(settings.minDocFreq);
+        }
+        if (settings.maxDocFreq != null) {
+            setMaxDocFreq(settings.maxDocFreq);
+        }
+        if (settings.minWordLength != null) {
+            setMinWordLength(settings.minWordLength);
+        }
+        if (settings.maxWordLength != null) {
+            setMaxWordLength(settings.maxWordLength);
+        }
+    }
+
+    public ScoreTerm getScoreTerm(Term term) {
+        return scoreTerms.get(term);
+    }
+
+    public boolean hasScoreTerm(Term term) {
+        return getScoreTerm(term) != null;
+    }
+
+    public long size(String fieldName) {
+        return sizes.get(fieldName);
+    }
+
+    public int getMaxNumTerms() {
+        return maxNumTerms;
+    }
+
+    public int getMinTermFreq() {
+        return minTermFreq;
+    }
+
+    public int getMaxTermFreq() {
+        return maxTermFreq;
+    }
+
+    public int getMinDocFreq() {
+        return minDocFreq;
+    }
+
+    public int getMaxDocFreq() {
+        return maxDocFreq;
+    }
+
+    public int getMinWordLength() {
+        return minWordLength;
+    }
+
+    public int getMaxWordLength() {
+        return maxWordLength;
+    }
+
+    public void setMaxNumTerms(int maxNumTerms) {
+        this.maxNumTerms = maxNumTerms;
+    }
+
+    public void setMinTermFreq(int minTermFreq) {
+        this.minTermFreq = minTermFreq;
+    }
+
+    public void setMaxTermFreq(int maxTermFreq) {
+        this.maxTermFreq = maxTermFreq;
+    }
+
+    public void setMinDocFreq(int minDocFreq) {
+        this.minDocFreq = minDocFreq;
+    }
+
+    public void setMaxDocFreq(int maxDocFreq) {
+        this.maxDocFreq = maxDocFreq;
+    }
+
+    public void setMinWordLength(int minWordLength) {
+        this.minWordLength = minWordLength;
+    }
+
+    public void setMaxWordLength(int maxWordLength) {
+        this.maxWordLength = maxWordLength;
+    }
+
+    public static final class ScoreTerm {
+        public String field;
+        public String word;
+        public float score;
+
+        ScoreTerm(String field, String word, float score) {
+            this.field = field;
+            this.word = word;
+            this.score = score;
+        }
+
+        void update(String field, String word, float score) {
+            this.field = field;
+            this.word = word;
+            this.score = score;
+        }
+    }
+
+    public void selectBestTerms() throws IOException {
+        PostingsEnum docsEnum = null;
+
+        for (String fieldName : fields) {
+            if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
+                continue;
+            }
+
+            Terms terms = fields.terms(fieldName);
+            Terms topLevelTerms = topLevelFields.terms(fieldName);
+
+            // if no terms found, take the retrieved term vector fields for stats
+            if (topLevelTerms == null) {
+                topLevelTerms = terms;
+            }
+
+            long numDocs = getDocCount(fieldName, topLevelTerms);
+
+            // one queue per field name
+            ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
+
+            // select terms with highest tf-idf
+            TermsEnum termsEnum = terms.iterator();
+            TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
+            while (termsEnum.next() != null) {
+                BytesRef termBytesRef = termsEnum.term();
+                boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
+                assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
+                
+                Term term = new Term(fieldName, termBytesRef);
+                
+                // remove noise words
+                int freq = getTermFreq(termsEnum, docsEnum);
+                if (isNoise(term.bytes().utf8ToString(), freq)) {
+                    continue;
+                }
+                
+                // filter based on doc frequency
+                long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
+                if (!isAccepted(docFreq)) {
+                    continue;
+                }
+                
+                // filter based on score
+                float score = computeScore(docFreq, freq, numDocs);
+                queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
+            }
+
+            // retain the best terms for quick lookups
+            ScoreTerm scoreTerm;
+            while ((scoreTerm = queue.pop()) != null) {
+                scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
+                sizes.incrementAndGet(scoreTerm.field);
+            }
+        }
+    }
+
+    private boolean isNoise(String word, int freq) {
+        // filter out words based on length
+        int len = word.length();
+        if (minWordLength > 0 && len < minWordLength) {
+            return true;
+        }
+        if (maxWordLength > 0 && len > maxWordLength) {
+            return true;
+        }
+        // filter out words that don't occur enough times in the source
+        if (minTermFreq > 0 && freq < minTermFreq) {
+            return true;
+        }
+        // filter out words that occur too many times in the source
+        if (freq > maxTermFreq) {
+            return true;
+        }
+        return false;
+    }
+
+    private boolean isAccepted(long docFreq) {
+        // filter out words that don't occur in enough docs
+        if (minDocFreq > 0 && docFreq < minDocFreq) {
+            return false;
+        }
+        // filter out words that occur in too many docs
+        if (docFreq > maxDocFreq) {
+            return false;
+        }
+        // index update problem?
+        if (docFreq == 0) {
+            return false;
+        }
+        return true;
+    }
+
+    private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
+        if (dfs != null) {
+            return dfs.fieldStatistics().get(fieldName).docCount();
+        }
+        return topLevelTerms.getDocCount();
+    }
+
+    private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
+        if (dfs != null) {
+            return dfs.termStatistics().get(term);
+        }
+        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
+    }
+
+    private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
+        docsEnum = termsEnum.postings(null, docsEnum);
+        docsEnum.nextDoc();
+        return docsEnum.freq();
+    }
+
+    private float computeScore(long docFreq, int freq, long numDocs) {
+        return freq * similarity.idf(docFreq, numDocs);
+    }
+
+    private static class ScoreTermsQueue extends org.apache.lucene.util.PriorityQueue<ScoreTerm> {
+        private final int limit;
+
+        ScoreTermsQueue(int maxSize) {
+            super(maxSize);
+            this.limit = maxSize;
+        }
+
+        @Override
+        protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
+            return a.score < b.score;
+        }
+
+        public void addOrUpdate(ScoreTerm scoreTerm) {
+            if (this.size() < limit) {
+                // there is still space in the queue
+                this.add(scoreTerm);
+            } else {
+                // otherwise update the smallest in the queue in place and update the queue
+                ScoreTerm scoreTermTop = this.top();
+                if (scoreTermTop.score < scoreTerm.score) {
+                    scoreTermTop.update(scoreTerm.field, scoreTerm.word, scoreTerm.score);
+                    this.updateTop();
+                }
+            }
+        }
+    }
+}
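
`computeScore` multiplies the in-document term frequency by `DefaultSimilarity`'s idf. Assuming the standard Lucene formula `idf = 1 + ln(docCount / (docFreq + 1))`, the scores from example 5 can be reproduced by hand; a small sketch:

[source,java]
--------------------------------------------------
// Sketch: recompute example 5's scores, assuming Lucene's
// DefaultSimilarity idf of 1 + ln(docCount / (docFreq + 1)).
static float score(int termFreq, long docFreq, long docCount) {
    float idf = (float) (Math.log(docCount / (double) (docFreq + 1)) + 1.0);
    return termFreq * idf;
}

// score(1, 27, 176214) ≈ 9.74725   ("armored")
// score(1, 88, 176214) ≈ 8.590818  ("industrialist")
// score(1, 44, 176214) ≈ 9.272792  ("stark")
--------------------------------------------------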

+ 105 - 0
src/main/java/org/elasticsearch/action/termvectors/TermVectorsRequest.java

@@ -28,6 +28,7 @@ import org.elasticsearch.action.DocumentRequest;
 import org.elasticsearch.action.ValidateActions;
 import org.elasticsearch.action.get.MultiGetRequest;
 import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
+import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -74,6 +75,54 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
 
     private Map<String, String> perFieldAnalyzer;
 
+    private FilterSettings filterSettings;
+
+    public static final class FilterSettings {
+        public Integer maxNumTerms;
+        public Integer minTermFreq;
+        public Integer maxTermFreq;
+        public Integer minDocFreq;
+        public Integer maxDocFreq;
+        public Integer minWordLength;
+        public Integer maxWordLength;
+
+        public FilterSettings() {
+        }
+
+        public FilterSettings(@Nullable Integer maxNumTerms, @Nullable Integer minTermFreq, @Nullable Integer maxTermFreq,
+                              @Nullable Integer minDocFreq, @Nullable Integer maxDocFreq, @Nullable Integer minWordLength,
+                              @Nullable Integer maxWordLength) {
+            this.maxNumTerms = maxNumTerms;
+            this.minTermFreq = minTermFreq;
+            this.maxTermFreq = maxTermFreq;
+            this.minDocFreq = minDocFreq;
+            this.maxDocFreq = maxDocFreq;
+            this.minWordLength = minWordLength;
+            this.maxWordLength = maxWordLength;
+        }
+
+        public void readFrom(StreamInput in) throws IOException {
+            maxNumTerms = in.readOptionalVInt();
+            minTermFreq = in.readOptionalVInt();
+            maxTermFreq = in.readOptionalVInt();
+            minDocFreq = in.readOptionalVInt();
+            maxDocFreq = in.readOptionalVInt();
+            minWordLength = in.readOptionalVInt();
+            maxWordLength = in.readOptionalVInt();
+        }
+
+        public void writeTo(StreamOutput out) throws IOException {
+            out.writeOptionalVInt(maxNumTerms);
+            out.writeOptionalVInt(minTermFreq);
+            out.writeOptionalVInt(maxTermFreq);
+            out.writeOptionalVInt(minDocFreq);
+            out.writeOptionalVInt(maxDocFreq);
+            out.writeOptionalVInt(minWordLength);
+            out.writeOptionalVInt(maxWordLength);
+        }
+    }
+
     private EnumSet<Flag> flagsEnum = EnumSet.of(Flag.Positions, Flag.Offsets, Flag.Payloads,
             Flag.FieldStatistics);
 
@@ -375,6 +424,21 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
         return this;
     }
 
+    /**
+     * Return the settings for filtering out terms.
+     */
+    public FilterSettings filterSettings() {
+        return this.filterSettings;
+    }
+
+    /**
+     * Sets the settings for filtering out terms.
+     */
+    public TermVectorsRequest filterSettings(FilterSettings settings) {
+        this.filterSettings = settings;
+        return this;
+    }
+
     public long version() {
         return version;
     }
@@ -454,6 +518,10 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
         if (in.readBoolean()) {
             perFieldAnalyzer = readPerFieldAnalyzer(in.readMap());
         }
+        if (in.readBoolean()) {
+            filterSettings = new FilterSettings();
+            filterSettings.readFrom(in);
+        }
         realtime = in.readBoolean();
         versionType = VersionType.fromValue(in.readByte());
         version = in.readLong();
@@ -488,6 +556,10 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
         if (perFieldAnalyzer != null) {
             out.writeGenericValue(perFieldAnalyzer);
         }
+        out.writeBoolean(filterSettings != null);
+        if (filterSettings != null) {
+            filterSettings.writeTo(out);
+        }
         out.writeBoolean(realtime());
         out.writeByte(versionType.getValue());
         out.writeLong(version);
@@ -533,6 +605,8 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
                     termVectorsRequest.dfs(parser.booleanValue());
                 } else if (currentFieldName.equals("per_field_analyzer") || currentFieldName.equals("perFieldAnalyzer")) {
                     termVectorsRequest.perFieldAnalyzer(readPerFieldAnalyzer(parser.map()));
+                } else if (currentFieldName.equals("filter")) {
+                    termVectorsRequest.filterSettings(readFilterSettings(parser, termVectorsRequest));
                 } else if ("_index".equals(currentFieldName)) { // the following is important for multi request parsing.
                     termVectorsRequest.index = parser.text();
                 } else if ("_type".equals(currentFieldName)) {
@@ -577,4 +651,35 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
         }
         return mapStrStr;
     }
+
+    private static FilterSettings readFilterSettings(XContentParser parser, TermVectorsRequest termVectorsRequest) throws IOException {
+        FilterSettings settings = new FilterSettings();
+        XContentParser.Token token;
+        String currentFieldName = null;
+        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
+            if (token == XContentParser.Token.FIELD_NAME) {
+                currentFieldName = parser.currentName();
+            } else if (currentFieldName != null) {
+                if (currentFieldName.equals("max_num_terms")) {
+                    settings.maxNumTerms = parser.intValue();
+                } else if (currentFieldName.equals("min_term_freq")) {
+                    settings.minTermFreq = parser.intValue();
+                } else if (currentFieldName.equals("max_term_freq")) {
+                    settings.maxTermFreq = parser.intValue();
+                } else if (currentFieldName.equals("min_doc_freq")) {
+                    settings.minDocFreq = parser.intValue();
+                } else if (currentFieldName.equals("max_doc_freq")) {
+                    settings.maxDocFreq = parser.intValue();
+                } else if (currentFieldName.equals("min_word_length")) {
+                    settings.minWordLength = parser.intValue();
+                } else if (currentFieldName.equals("max_word_length")) {
+                    settings.maxWordLength = parser.intValue();
+                } else {
+                    throw new ElasticsearchParseException("The parameter " + currentFieldName
+                            + " is not valid for the filter parameter of a term vectors request!");
+                }
+            }
+        }
+        return settings;
+    }
 }
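
For programmatic use, the all-args constructor accepts `null` for any bound that should stay at its default; a brief sketch:

[source,java]
--------------------------------------------------
// Sketch: set only max_num_terms and min_doc_freq; the remaining five
// bounds stay at their defaults by passing null.
TermVectorsRequest.FilterSettings settings =
        new TermVectorsRequest.FilterSettings(3, null, null, 1, null, null, null);
termVectorsRequest.filterSettings(settings);
--------------------------------------------------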

+ 8 - 0
src/main/java/org/elasticsearch/action/termvectors/TermVectorsRequestBuilder.java

@@ -200,6 +200,14 @@ public class TermVectorsRequestBuilder extends ActionRequestBuilder<TermVectorsR
         return this;
     }
 
+    /**
+     * Sets the settings for filtering out terms.
+     */
+    public TermVectorsRequestBuilder setFilterSettings(TermVectorsRequest.FilterSettings filterSettings) {
+        request.filterSettings(filterSettings);
+        return this;
+    }
+
     @Override
     protected void doExecute(ActionListener<TermVectorsResponse> listener) {
         client.termVectors(request, listener);

+ 20 - 7
src/main/java/org/elasticsearch/action/termvectors/TermVectorsResponse.java

@@ -20,11 +20,11 @@
 package org.elasticsearch.action.termvectors;
 
 import com.google.common.collect.Iterators;
-
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BoostAttribute;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRefBuilder;
@@ -55,6 +55,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
         public static final XContentBuilderString TTF = new XContentBuilderString("ttf");
         public static final XContentBuilderString DOC_FREQ = new XContentBuilderString("doc_freq");
         public static final XContentBuilderString TERM_FREQ = new XContentBuilderString("term_freq");
+        public static final XContentBuilderString SCORE = new XContentBuilderString("score");
 
         // field statistics strings
         public static final XContentBuilderString FIELD_STATISTICS = new XContentBuilderString("field_statistics");
@@ -86,6 +87,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
     private boolean exists = false;
     private boolean artificial = false;
     private long tookInMillis;
+    private boolean hasScores = false;
 
     private boolean sourceCopied = false;
 
@@ -146,7 +148,9 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
                 headerRef = headerRef.copyBytesArray();
                 termVectors = termVectors.copyBytesArray();
             }
-            return new TermVectorsFields(headerRef, termVectors);
+            TermVectorsFields termVectorsFields = new TermVectorsFields(headerRef, termVectors);
+            hasScores = termVectorsFields.hasScores;
+            return termVectorsFields;
         } else {
             return new Fields() {
                 @Override
@@ -202,14 +206,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
         buildFieldStatistics(builder, curTerms);
         builder.startObject(FieldStrings.TERMS);
         TermsEnum termIter = curTerms.iterator();
+        BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
         for (int i = 0; i < curTerms.size(); i++) {
-            buildTerm(builder, spare, curTerms, termIter);
+            buildTerm(builder, spare, curTerms, termIter, boostAtt);
         }
         builder.endObject();
         builder.endObject();
     }
 
-    private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter) throws IOException {
+    private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
         // start term, optimized writing
         BytesRef term = termIter.next();
         spare.copyUTF8Bytes(term);
@@ -222,6 +227,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
         initMemory(curTerms, termFreq);
         initValues(curTerms, posEnum, termFreq);
         buildValues(builder, curTerms, termFreq);
+        buildScore(builder, boostAtt);
         builder.endObject();
     }
 
@@ -332,6 +338,12 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
     public long getTookInMillis() {
         return tookInMillis;
     }
+    
+    private void buildScore(XContentBuilder builder, BoostAttribute boostAtt) throws IOException {
+        if (hasScores) {
+            builder.field(FieldStrings.SCORE, boostAtt.getBoost());
+        }
+    }
 
     public boolean isExists() {
         return exists;
@@ -342,14 +354,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
     }
 
     public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
-        setFields(termVectorsByField, selectedFields, flags, topLevelFields, null);
+        setFields(termVectorsByField, selectedFields, flags, topLevelFields, null, null);
     }
 
-    public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
+    public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs,
+                          TermVectorsFilter termVectorsFilter) throws IOException {
         TermVectorsWriter tvw = new TermVectorsWriter(this);
 
         if (termVectorsByField != null) {
-            tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs);
+            tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs, termVectorsFilter);
         }
     }
 

+ 34 - 13
src/main/java/org/elasticsearch/action/termvectors/TermVectorsWriter.java

@@ -50,10 +50,13 @@ final class TermVectorsWriter {
         response = termVectorsResponse;
     }
 
-    void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
+    void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields,
+                   @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
         int numFieldsWritten = 0;
         PostingsEnum docsAndPosEnum = null;
         PostingsEnum docsEnum = null;
+        boolean hasScores = termVectorsFilter != null;
+
         for (String field : termVectorsByField) {
             if ((selectedFields != null) && (!selectedFields.contains(field))) {
                 continue;
@@ -71,7 +74,13 @@ final class TermVectorsWriter {
             boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
             boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
             boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
-            startField(field, fieldTermVector.size(), positions, offsets, payloads);
+
+            long termsSize = fieldTermVector.size();
+            if (hasScores) {
+                termsSize = Math.min(termsSize, termVectorsFilter.size(field));
+            }
+            startField(field, termsSize, positions, offsets, payloads);
+
             if (flags.contains(Flag.FieldStatistics)) {
                 if (dfs != null) {
                     writeFieldStatistics(dfs.fieldStatistics().get(field));
@@ -81,15 +90,21 @@ final class TermVectorsWriter {
             }
             TermsEnum iterator = fieldTermVector.iterator();
             final boolean useDocsAndPos = positions || offsets || payloads;
-            while (iterator.next() != null) { // iterate all terms of the
-                // current field
-                // get the doc frequency
-                BytesRef term = iterator.term();
-                boolean foundTerm = topLevelIterator.seekExact(term);
-                startTerm(term);
+            while (iterator.next() != null) { // iterate all terms of the current field
+                BytesRef termBytesRef = iterator.term();
+                boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
+                Term term = new Term(field, termBytesRef);
+
+                // with filtering we only keep the best terms
+                if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
+                    continue;
+                }
+
+                startTerm(termBytesRef);
                 if (flags.contains(Flag.TermStatistics)) {
+                    // get the doc frequency
                     if (dfs != null) {
-                        writeTermStatistics(dfs.termStatistics().get(new Term(field, term.utf8ToString())));
+                        writeTermStatistics(dfs.termStatistics().get(term));
                     } else {
                         writeTermStatistics(topLevelIterator);
                     }
@@ -102,14 +117,17 @@ final class TermVectorsWriter {
                     // get the frequency from a PostingsEnum.
                     docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
                 }
+                if (hasScores) {
+                    writeScoreTerm(termVectorsFilter.getScoreTerm(term));
+                }
             }
             numFieldsWritten++;
         }
         response.setTermVectorsField(output);
-        response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics)));
+        response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
     }
 
-    private BytesReference writeHeader(int numFieldsWritten, boolean getTermStatistics, boolean getFieldStatistics) throws IOException {
+    private BytesReference writeHeader(int numFieldsWritten, boolean getTermStatistics, boolean getFieldStatistics, boolean scores) throws IOException {
         // now, write the information about offset of the terms in the
         // termVectors field
         BytesStreamOutput header = new BytesStreamOutput();
@@ -117,6 +135,7 @@ final class TermVectorsWriter {
         header.writeInt(CURRENT_VERSION);
         header.writeBoolean(getTermStatistics);
         header.writeBoolean(getFieldStatistics);
+        header.writeBoolean(scores);
         header.writeVInt(numFieldsWritten);
         for (int i = 0; i < fields.size(); i++) {
             header.writeString(fields.get(i));
@@ -205,7 +224,6 @@ final class TermVectorsWriter {
     private void startTerm(BytesRef term) throws IOException {
         output.writeVInt(term.length);
         output.writeBytes(term.bytes, term.offset, term.length);
-
     }
 
     private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException {
@@ -250,6 +268,10 @@ final class TermVectorsWriter {
         writePotentiallyNegativeVInt(dc);
     }
 
+    private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException {
+        output.writeFloat(Math.max(0, scoreTerm.score));
+    }
+
     private void writePotentiallyNegativeVInt(int value) throws IOException {
         // term freq etc. can be negative if not present... we transport that
         // further...
@@ -261,5 +283,4 @@ final class TermVectorsWriter {
         // further...
         output.writeVLong(Math.max(0, value + 1));
     }
-
 }

+ 8 - 0
src/main/java/org/elasticsearch/common/io/stream/StreamInput.java

@@ -229,6 +229,14 @@ public abstract class StreamInput extends InputStream {
         return null;
     }
 
+    @Nullable
+    public Integer readOptionalVInt() throws IOException {
+        if (readBoolean()) {
+            return readVInt();
+        }
+        return null;
+    }
+
     private final CharsRefBuilder spare = new CharsRefBuilder();
 
     public String readString() throws IOException {

+ 9 - 0
src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java

@@ -176,6 +176,15 @@ public abstract class StreamOutput extends OutputStream {
         }
     }
 
+    public void writeOptionalVInt(@Nullable Integer integer) throws IOException {
+        if (integer == null) {
+            writeBoolean(false);
+        } else {
+            writeBoolean(true);
+            writeVInt(integer);
+        }
+    }
+
     public void writeOptionalText(@Nullable Text text) throws IOException {
         if (text == null) {
             writeInt(-1);
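
The optional variants prefix the value with a presence flag, so a `null` costs a single byte on the wire. A hedged round-trip sketch using the in-memory streams (assuming the `BytesStreamInput(BytesReference)` constructor used elsewhere in this change):

[source,java]
--------------------------------------------------
// Sketch: round-trip the new optional VInt encoding.
BytesStreamOutput out = new BytesStreamOutput();
out.writeOptionalVInt(null); // written as a single 'false' byte
out.writeOptionalVInt(25);   // 'true' byte followed by the VInt

StreamInput in = new BytesStreamInput(out.bytes());
assert in.readOptionalVInt() == null;
assert in.readOptionalVInt() == 25;
--------------------------------------------------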

+ 27 - 12
src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorsService.java

@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.*;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.action.termvectors.TermVectorsFilter;
 import org.elasticsearch.action.termvectors.TermVectorsRequest;
 import org.elasticsearch.action.termvectors.TermVectorsResponse;
 import org.elasticsearch.action.termvectors.dfs.DfsOnlyRequest;
@@ -82,8 +83,10 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
 
         Engine.GetResult get = indexShard.get(new Engine.Get(request.realtime(), uidTerm).version(request.version()).versionType(request.versionType()));
 
+        Fields termVectorsByField = null;
         boolean docFromTranslog = get.source() != null;
         AggregatedDfs dfs = null;
+        TermVectorsFilter termVectorsFilter = null;
 
         /* fetched from translog is treated as an artificial document */
         if (docFromTranslog) {
@@ -102,22 +105,18 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
             Versions.DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
             /* from an artificial document */
             if (request.doc() != null) {
-                Fields termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
+                termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
                 // if no document indexed in shard, take the queried document itself for stats
                 if (topLevelFields == null) {
                     topLevelFields = termVectorsByField;
                 }
-                if (termVectorsByField != null && useDfs(request)) {
-                    dfs = getAggregatedDfs(termVectorsByField, request);
-                }
-                termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
-                termVectorsResponse.setExists(true);
                 termVectorsResponse.setArtificial(!docFromTranslog);
+                termVectorsResponse.setExists(true);
             }
             /* or from an existing document */
             else if (docIdAndVersion != null) {
                 // fields with stored term vectors
-                Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
+                termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
                 Set<String> selectedFields = request.selectedFields();
                 // generate tvs for fields where analyzer is overridden
                 if (selectedFields == null && request.perFieldAnalyzer() != null) {
@@ -127,15 +126,31 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
                 if (selectedFields != null) {
                     termVectorsByField = addGeneratedTermVectors(get, termVectorsByField, request, selectedFields);
                 }
-                if (termVectorsByField != null && useDfs(request)) {
-                    dfs = getAggregatedDfs(termVectorsByField, request);
-                }
-                termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
                 termVectorsResponse.setDocVersion(docIdAndVersion.version);
                 termVectorsResponse.setExists(true);
-            } else {
+            }
+            /* no term vectors generated or found */
+            else {
                 termVectorsResponse.setExists(false);
             }
+            /* if there are term vectors, optionally compute dfs and/or apply terms filtering */
+            if (termVectorsByField != null) {
+                if (useDfs(request)) {
+                    dfs = getAggregatedDfs(termVectorsByField, request);
+                }
+
+                if (request.filterSettings() != null) {
+                    termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields(), dfs);
+                    termVectorsFilter.setSettings(request.filterSettings());
+                    try {
+                        termVectorsFilter.selectBestTerms();
+                    } catch (IOException e) {
+                        throw new ElasticsearchException("failed to select best terms", e);
+                    }
+                }
+                // write term vectors
+                termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs, termVectorsFilter);
+            }
         } catch (Throwable ex) {
             throw new ElasticsearchException("failed to execute term vector request", ex);
         } finally {

+ 129 - 0
src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java

@@ -1225,4 +1225,133 @@ public class GetTermVectorsTests extends AbstractTermVectorsTests {
         assertThat(response.getIndex(), equalTo("test"));
         assertThat(response.getVersion(), equalTo(2l));
     }
+
+    @Test
+    public void testFilterLength() throws ExecutionException, InterruptedException, IOException {
+        logger.info("Setting up the index ...");
+        ImmutableSettings.Builder settings = settingsBuilder()
+                .put(indexSettings())
+                .put("index.analysis.analyzer", "keyword");
+        assertAcked(prepareCreate("test")
+                .setSettings(settings)
+                .addMapping("type1", "tags", "type=string"));
+        ensureYellow();
+
+        int numTerms = scaledRandomIntBetween(10, 50);
+        logger.info("Indexing one document with tags of increasing length ...");
+        List<String> tags = new ArrayList<>();
+        for (int i = 0; i < numTerms; i++) {
+            String tag = "a";
+            for (int j = 0; j < i; j++) {
+                tag += "a";
+            }
+            tags.add(tag);
+        }
+        indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));
+
+        logger.info("Checking best tags by longest to shortest size ...");
+        TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
+        filterSettings.maxNumTerms = numTerms;
+        TermVectorsResponse response;
+        for (int i = 0; i < numTerms; i++) {
+            filterSettings.minWordLength = numTerms - i;
+            response = client().prepareTermVectors("test", "type1", "1")
+                    .setSelectedFields("tags")
+                    .setFieldStatistics(true)
+                    .setTermStatistics(true)
+                    .setFilterSettings(filterSettings)
+                    .get();
+            checkBestTerms(response.getFields().terms("tags"), tags.subList((numTerms - i - 1), numTerms));
+        }
+    }
+
+    @Test
+    public void testFilterTermFreq() throws ExecutionException, InterruptedException, IOException {
+        logger.info("Setting up the index ...");
+        ImmutableSettings.Builder settings = settingsBuilder()
+                .put(indexSettings())
+                .put("index.analysis.analyzer", "keyword");
+        assertAcked(prepareCreate("test")
+                .setSettings(settings)
+                .addMapping("type1", "tags", "type=string"));
+        ensureYellow();
+
+        logger.info("Indexing one document with tags of increasing frequencies ...");
+        int numTerms = scaledRandomIntBetween(10, 50);
+        List<String> tags = new ArrayList<>();
+        List<String> uniqueTags = new ArrayList<>();
+        String tag;
+        for (int i = 0; i < numTerms; i++) {
+            tag = "tag_" + i;
+            tags.add(tag);
+            for (int j = 0; j < i; j++) {
+                tags.add(tag);
+            }
+            uniqueTags.add(tag);
+        }
+        indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));
+
+        logger.info("Checking best tags by highest to lowest term freq ...");
+        TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
+        TermVectorsResponse response;
+        for (int i = 0; i < numTerms; i++) {
+            filterSettings.maxNumTerms = i + 1;
+            response = client().prepareTermVectors("test", "type1", "1")
+                    .setSelectedFields("tags")
+                    .setFieldStatistics(true)
+                    .setTermStatistics(true)
+                    .setFilterSettings(filterSettings)
+                    .get();
+            checkBestTerms(response.getFields().terms("tags"), uniqueTags.subList((numTerms - i - 1), numTerms));
+        }
+    }
+
+    @Test
+    public void testFilterDocFreq() throws ExecutionException, InterruptedException, IOException {
+        logger.info("Setting up the index ...");
+        ImmutableSettings.Builder settings = settingsBuilder()
+                .put(indexSettings())
+                .put("index.analysis.analyzer", "keyword")
+                .put("index.number_of_shards", 1); // no dfs
+        assertAcked(prepareCreate("test")
+                .setSettings(settings)
+                .addMapping("type1", "tags", "type=string"));
+        ensureYellow();
+
+        int numDocs = scaledRandomIntBetween(10, 50); // as many terms as there are docs
+        logger.info("Indexing {} documents with tags of increasing dfs ...", numDocs);
+        List<IndexRequestBuilder> builders = new ArrayList<>();
+        List<String> tags = new ArrayList<>();
+        for (int i = 0; i < numDocs; i++) {
+            tags.add("tag_" + i);
+            builders.add(client().prepareIndex("test", "type1", i + "").setSource("tags", tags));
+        }
+        indexRandom(true, builders);
+
+        logger.info("Checking best terms by highest to lowest idf ...");
+        TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
+        TermVectorsResponse response;
+        for (int i = 0; i < numDocs; i++) {
+            filterSettings.maxNumTerms = i + 1;
+            response = client().prepareTermVectors("test", "type1", (numDocs - 1) + "")
+                    .setSelectedFields("tags")
+                    .setFieldStatistics(true)
+                    .setTermStatistics(true)
+                    .setFilterSettings(filterSettings)
+                    .get();
+            checkBestTerms(response.getFields().terms("tags"), tags.subList((numDocs - i - 1), numDocs));
+        }
+    }
+
+    private void checkBestTerms(Terms terms, List<String> expectedTerms) throws IOException {
+        final TermsEnum termsEnum = terms.iterator();
+        List<String> bestTerms = new ArrayList<>();
+        BytesRef text;
+        while((text = termsEnum.next()) != null) {
+            bestTerms.add(text.utf8ToString());
+        }
+        Collections.sort(expectedTerms);
+        Collections.sort(bestTerms);
+        assertArrayEquals(expectedTerms.toArray(), bestTerms.toArray());
+    }
 }