
Speed up phrase suggestion scoring

Two changes:
1.  In the StupidBackoffScorer, only look up the trigram if there is a bigram (the backoff formula sketched below shows what this short-circuits).
2.  Cache the frequencies in WordScorer so we don't look them up again and
again and again.  This is implemented by wrapping the TermsEnum in a
special-purpose wrapper that really only works in the context of the
WordScorer; a simplified sketch of the caching pattern follows the WordScorer diff below.

This provides a pretty substantial speedup when there are many candidates.

Closes #5395
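Roughly, the patched scoreTrigram below computes the Stupid Backoff chain (notation here is informal and not from the patch; w_1 and w_2 are the previous two words, d is the configured discount, c(.) is a shingle frequency read through WordScorer#frequency, and the tiny epsilon in the code only guards the divisions):

    S(w \mid w_1, w_2) =
    \begin{cases}
      c(w_2\, w_1\, w) / c(w_1\, w)   & \text{if } c(w_2\, w_1\, w) > 0 \\
      d \cdot c(w_1\, w) / c(w_1)     & \text{else if } c(w_1\, w) > 0 \\
      d \cdot S_{\mathrm{unigram}}(w) & \text{otherwise}
    \end{cases}

Assuming the suggest field is a shingle field that emits both bigrams and trigrams (as the test below configures), any document containing the trigram w_2 w_1 w also contains the bigram w_1 w, so when the bigram count is zero neither of the first two cases can apply and the trigram lookup can be skipped outright.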
Nik Everett committed 11 years ago (commit 917c93d7ee)

+ 9 - 9
src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java

@@ -54,18 +54,18 @@ public class StupidBackoffScorer extends WordScorer {
 
     @Override
     protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
+        // First see if there is a bigram.  If there isn't then skip looking up the trigram.  This saves a lookup
+        // when the bigram is missing and costs nothing otherwise, since we need both frequencies anyway.
+        SuggestUtils.join(separator, spare, w_1.term, w.term);
+        long bigramCount = frequency(spare);
+        if (bigramCount < 1) {
+            return discount * scoreUnigram(w);
+        }
         SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
-        final long trigramCount = frequency(spare);
+        long trigramCount = frequency(spare);
         if (trigramCount < 1) {
-            SuggestUtils.join(separator, spare, w_1.term, w.term);
-            final long count = frequency(spare);
-            if (count < 1) {
-                return discount * scoreUnigram(w);
-            }
-            return discount * (count / (w_1.frequency + 0.00000000001d));
+            return discount * (bigramCount / (w_1.frequency + 0.00000000001d));
         }
-        SuggestUtils.join(separator, spare, w_1.term, w.term);
-        final long bigramCount = frequency(spare);
         return trigramCount / (bigramCount + 0.00000000001d);
     }
 

+ 101 - 19
src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java

@@ -18,10 +18,11 @@
  */
 package org.elasticsearch.search.suggest.phrase;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
+import com.carrotsearch.hppc.ObjectObjectMap;
+import com.carrotsearch.hppc.ObjectObjectOpenHashMap;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.FilterAtomicReader.FilterTermsEnum;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
@@ -35,17 +36,17 @@ public abstract class WordScorer {
     protected final String field;
     protected final Terms terms;
     protected final long vocabluarySize;
-    protected double realWordLikelyhood;
+    protected final double realWordLikelyhood;
     protected final BytesRef spare = new BytesRef();
     protected final BytesRef separator;
     protected final TermsEnum termsEnum;
     private final long numTerms;
     private final boolean useTotalTermFreq;
-    
+
     public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
         this(reader, MultiFields.getTerms(reader, field), field, realWordLikelyHood, separator);
     }
-    
+
     public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
         this.field = field;
         if (terms == null) {
@@ -56,19 +57,19 @@ public abstract class WordScorer {
         this.vocabluarySize =  vocSize == -1 ? reader.maxDoc() : vocSize;
         this.useTotalTermFreq = vocSize != -1;
         this.numTerms = terms.size();
-        this.termsEnum = terms.iterator(null);
+        this.termsEnum = new FrequencyCachingTermsEnumWrapper(terms.iterator(null));
         this.reader = reader;
         this.realWordLikelyhood = realWordLikelyHood;
         this.separator = separator;
+    }
+
+    public long frequency(BytesRef term) throws IOException {
+        if (termsEnum.seekExact(term)) {
+            return useTotalTermFreq ? termsEnum.totalTermFreq() : termsEnum.docFreq();
+        }
+        return 0;
    }
-    
-   public long frequency(BytesRef term) throws IOException {
-      if (termsEnum.seekExact(term)) {
-          return useTotalTermFreq ? termsEnum.totalTermFreq() : termsEnum.docFreq();
-      }
-      return 0;
-   }
-   
+
    protected double channelScore(Candidate candidate, Candidate original) throws IOException {
        if (candidate.stringDistance == 1.0d) {
            return realWordLikelyhood;
@@ -82,7 +83,7 @@ public abstract class WordScorer {
        } else if (at == 1 || gramSize == 2) {
            return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreBigram(path[at], path[at - 1]));
        } else {
-           return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreTrigram(path[at], path[at - 1], path[at - 2]));           
+           return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreTrigram(path[at], path[at - 1], path[at - 2]));
        }
    }
    
@@ -94,12 +95,93 @@ public abstract class WordScorer {
        return scoreUnigram(word);
    }
    
-   protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2)  throws IOException {
+   protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
        return scoreBigram(word, w_1);
    }
-   
+
    public static interface WordScorerFactory {
        public WordScorer newScorer(IndexReader reader, Terms terms,
             String field, double realWordLikelyhood, BytesRef separator) throws IOException;
    }
+
+   /**
+    * Terms enum wrapper that caches term frequencies in an effort to outright skip seeks.  Only works with seekExact(BytesRef), not next or
+    * seekCeil.  Because of this it really only makes sense in this context.
+    */
+   private static class FrequencyCachingTermsEnumWrapper extends FilterTermsEnum {
+       private ObjectObjectMap<BytesRef, CacheEntry> cache = new ObjectObjectOpenHashMap<BytesRef, CacheEntry>();
+       /**
+        * The cache entry for the last term that the caller attempted to seek to.
+        */
+       private CacheEntry last;
+
+       public FrequencyCachingTermsEnumWrapper(TermsEnum in) {
+           super(in);
+       }
+
+       @Override
+       public boolean seekExact(BytesRef text) throws IOException {
+           last = cache.get(text);
+           if (last != null) {
+               // This'll fail to work properly if the user seeks but doesn't check the frequency, causing us to cache it.
+               // That is OK because WordScorer only seeks to check the frequency.
+               return last.ttf != 0 || last.df != 0;
+           }
+           last = new CacheEntry();
+           cache.put(BytesRef.deepCopyOf(text), last);
+           if (in.seekExact(text)) {
+               // Found so mark the term uncached.
+               last.df = -1;
+               last.ttf = -1;
+               return true;
+           }
+           // Not found.  The cache will default to 0 for the freqs, meaning not found.
+           return false;
+       }
+
+       @Override
+       public long totalTermFreq() throws IOException {
+           if (last.ttf == -1) {
+               last.ttf = in.totalTermFreq();
+           }
+           return last.ttf;
+       }
+
+       @Override
+       public int docFreq() throws IOException {
+           if (last.df == -1) {
+               last.df = in.docFreq();
+           }
+           return last.df;
+       }
+
+       @Override
+       public void seekExact(long ord) throws IOException {
+           throw new UnsupportedOperationException();
+       }
+
+       @Override
+       public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+           throw new UnsupportedOperationException();
+       }
+
+       @Override
+       public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+           throw new UnsupportedOperationException();
+       }
+
+       public SeekStatus seekCeil(BytesRef text) throws IOException {
+           throw new UnsupportedOperationException();
+       }
+
+       @Override
+       public BytesRef next() {
+           throw new UnsupportedOperationException();
+       }
+
+       private static class CacheEntry {
+           private long ttf;
+           private int df;
+       }
+   }
 }
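In WordScorer#frequency the seek and the frequency read always happen back to back, so the wrapper above effectively memoizes one number per distinct shingle, for misses as well as hits. A minimal sketch of that pattern, with hypothetical names and a plain HashMap standing in for the hppc map and the Lucene TermsEnum:

import java.util.HashMap;
import java.util.Map;

// Simplified, hypothetical sketch of the caching idea in FrequencyCachingTermsEnumWrapper:
// remember both hits and misses so repeated lookups of the same shingle never touch the
// expensive source twice.  The class and the ExpensiveSource interface are illustrative only.
class CachingFrequencyLookup {
    private final Map<String, Long> cache = new HashMap<String, Long>();
    private final ExpensiveSource source;

    CachingFrequencyLookup(ExpensiveSource source) {
        this.source = source;
    }

    long frequency(String term) {
        Long cached = cache.get(term);
        if (cached != null) {
            return cached;  // hit: no lookup against the source at all
        }
        // Miss: do the expensive lookup once and remember the answer,
        // even (especially) when the answer is "term not present" (0).
        long freq = source.exists(term) ? source.frequency(term) : 0L;
        cache.put(term, freq);
        return freq;
    }

    /** Stand-in for the real TermsEnum; any slow frequency source works. */
    interface ExpensiveSource {
        boolean exists(String term);
        long frequency(String term);
    }
}

The real wrapper keeps an extra lazy step (a found-but-unfetched entry marked with -1) because TermsEnum exposes the seek and the docFreq/totalTermFreq reads separately, and it keys the map on BytesRef.deepCopyOf(text) since the scorer reuses one scratch BytesRef for every join.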

+ 152 - 0
src/test/java/org/elasticsearch/search/suggest/SuggestSearchTests.java

@@ -20,11 +20,13 @@
 package org.elasticsearch.search.suggest;
 
 import com.google.common.base.Charsets;
+import com.google.common.collect.ImmutableList;
 import com.google.common.io.Resources;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchIllegalStateException;
 import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
+import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.search.*;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentFactory;
@@ -930,6 +932,156 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
                 .size(1));
         assertSuggestion(searchSuggest, 0, 0, "simple_phrase", "nobel prize");
     }
+
+    /**
+     * If the suggester finds tons of options then picking the right one is slow without <<<INSERT SOLUTION HERE>>>.
+     */
+    @Test
+    public void suggestWithManyCandidates() throws InterruptedException, ExecutionException, IOException {
+        CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
+                .put(indexSettings())
+                .put(SETTING_NUMBER_OF_SHARDS, 1) // A single shard will help to keep the tests repeatable.
+                .put(SETTING_NUMBER_OF_REPLICAS, between(0, cluster().size() - 1))
+                .put("index.analysis.analyzer.text.tokenizer", "standard")
+                .putArray("index.analysis.analyzer.text.filter", "lowercase", "my_shingle")
+                .put("index.analysis.filter.my_shingle.type", "shingle")
+                .put("index.analysis.filter.my_shingle.output_unigrams", true)
+                .put("index.analysis.filter.my_shingle.min_shingle_size", 2)
+                .put("index.analysis.filter.my_shingle.max_shingle_size", 3));
+
+        XContentBuilder mapping = XContentFactory.jsonBuilder()
+                .startObject()
+                    .startObject("type1")
+                        .startObject("properties")
+                            .startObject("title")
+                                .field("type", "string")
+                                .field("analyzer", "text")
+                            .endObject()
+                        .endObject()
+                    .endObject()
+                .endObject();
+        assertAcked(builder.addMapping("type1", mapping));
+        ensureGreen();
+
+        ImmutableList.Builder<String> titles = ImmutableList.<String>builder();
+
+        // We're going to be searching for:
+        //   united states house of representatives elections in washington 2006
+        // But we need to make sure we generate a ton of suggestions so we add a bunch of candidates.
+        // Many of these candidates are drawn from page names on English Wikipedia.
+
+        // Tons of different options very near the exact query term
+        titles.add("United States House of Representatives Elections in Washington 1789");
+        for (int year = 1790; year < 2014; year+= 2) {
+            titles.add("United States House of Representatives Elections in Washington " + year);
+        }
+        // Six of these are near enough to be viable suggestions, just not the top one
+
+        // But we can't stop there!  Titles that are just a year are pretty common so let's just add one per year
+        // since 0.  Why not?
+        for (int year = 0; year < 2015; year++) {
+            titles.add(Integer.toString(year));
+        }
+        // That ought to provide plenty of less-good candidates for the last term
+
+        // Now add or remove a plural on every term we can
+        titles.add("State");
+        titles.add("Houses of Parliament");
+        titles.add("Representative Government");
+        titles.add("Election");
+
+        // Now some possessive
+        titles.add("Washington's Birthday");
+
+        // And some conjugation
+        titles.add("Unified Modeling Language");
+        titles.add("Unite Against Fascism");
+        titles.add("Stated Income Tax");
+        titles.add("Media organizations housed within colleges");
+
+        // And other stuff
+        titles.add("Untied shoelaces");
+        titles.add("Unit circle");
+        titles.add("Untitled");
+        titles.add("Unicef");
+        titles.add("Unrated");
+        titles.add("UniRed");
+        titles.add("Jalan Uniten–Dengkil"); // Highway in Malaysia
+        titles.add("UNITAS");
+        titles.add("UNITER");
+        titles.add("Un-Led-Ed");
+        titles.add("STATS LLC");
+        titles.add("Staples");
+        titles.add("Skates");
+        titles.add("Statues of the Liberators");
+        titles.add("Staten Island");
+        titles.add("Statens Museum for Kunst");
+        titles.add("Hause"); // The last name or the German word, whichever.
+        titles.add("Hose");
+        titles.add("Hoses");
+        titles.add("Howse Peak");
+        titles.add("The Hoose-Gow");
+        titles.add("Hooser");
+        titles.add("Electron");
+        titles.add("Electors");
+        titles.add("Evictions");
+        titles.add("Coronal mass ejection");
+        titles.add("Wasington"); // A film?
+        titles.add("Warrington"); // A town in England
+        titles.add("Waddington"); // Lots of places have this name
+        titles.add("Watlington"); // Ditto
+        titles.add("Waplington"); // Yup, also a town
+        titles.add("Washing of the Spears"); // Book
+
+        for (char c = 'A'; c <= 'Z'; c++) {
+            // Can't forget lists, glorious lists!
+            titles.add("List of former members of the United States House of Representatives (" + c + ")");
+
+            // Lots of people are named Washington <Middle Initial>. LastName
+            titles.add("Washington " + c + ". Lastname");
+
+            // Let's just add some more to be evil
+            titles.add("United " + c);
+            titles.add("States " + c);
+            titles.add("House " + c);
+            titles.add("Elections " + c);
+            titles.add("2006 " + c);
+            titles.add(c + " United");
+            titles.add(c + " States");
+            titles.add(c + " House");
+            titles.add(c + " Elections");
+            titles.add(c + " 2006");
+        }
+
+        List<IndexRequestBuilder> builders = new ArrayList<IndexRequestBuilder>();
+        for (String title: titles.build()) {
+            builders.add(client().prepareIndex("test", "type1").setSource("title", title));
+        }
+        indexRandom(true, builders);
+
+        PhraseSuggestionBuilder suggest = phraseSuggestion("title")
+                .field("title")
+                .addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("title")
+                        .suggestMode("always")
+                        .maxTermFreq(.99f)
+                        .size(1000) // Setting a silly high size helps to generate a larger list of candidates for testing.
+                        .maxInspections(1000) // This too
+                )
+                .confidence(0f)
+                .maxErrors(2f)
+                .shardSize(30000)
+                .size(30000);
+        Suggest searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", suggest);
+        assertSuggestion(searchSuggest, 0, 0, "title", "united states house of representatives elections in washington 2006");
+        assertSuggestionSize(searchSuggest, 0, 25480, "title");  // Just to prove that we've run through a ton of options
+
+        suggest.size(1);
+        long start = System.currentTimeMillis();
+        searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", suggest);
+        long total = System.currentTimeMillis() - start;
+        assertSuggestion(searchSuggest, 0, 0, "title", "united states house of representatives elections in washington 2006");
+        assertThat(total, lessThan(1000L)); // Takes many seconds without fix
+    }
     
     protected Suggest searchSuggest(SuggestionBuilder<?>... suggestion) {
         return searchSuggest(null, suggestion);