123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- /*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
- package org.apache.lucene.queries;
- import org.apache.lucene.analysis.MockAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.FieldType;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.FieldInfo;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.*;
- import org.apache.lucene.search.similarities.BM25Similarity;
- import org.apache.lucene.search.similarities.DefaultSimilarity;
- import org.apache.lucene.search.similarities.Similarity;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.util._TestUtil;
- import org.elasticsearch.test.ElasticsearchLuceneTestCase;
- import org.junit.Test;
- import java.io.IOException;
- import java.util.*;
- import static org.hamcrest.Matchers.containsInAnyOrder;
- import static org.hamcrest.Matchers.equalTo;
- /**
- */
- public class BlendedTermQueryTest extends ElasticsearchLuceneTestCase {
- @Test
- public void testBooleanQuery() throws IOException {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
- String[] firstNames = new String[]{
- "simon", "paul"
- };
- String[] surNames = new String[]{
- "willnauer", "simon"
- };
- for (int i = 0; i < surNames.length; i++) {
- Document d = new Document();
- d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
- d.add(new TextField("firstname", firstNames[i], Field.Store.NO));
- d.add(new TextField("surname", surNames[i], Field.Store.NO));
- w.addDocument(d);
- }
- int iters = scaledRandomIntBetween(25, 100);
- for (int j = 0; j < iters; j++) {
- Document d = new Document();
- d.add(new TextField("id", Integer.toString(firstNames.length + j), Field.Store.YES));
- d.add(new TextField("firstname", rarely() ? "some_other_name" :
- "simon the sorcerer", Field.Store.NO)); // make sure length-norm is the tie-breaker
- d.add(new TextField("surname", "bogus", Field.Store.NO));
- w.addDocument(d);
- }
- w.commit();
- DirectoryReader reader = DirectoryReader.open(w, true);
- IndexSearcher searcher = setSimilarity(newSearcher(reader));
- {
- Term[] terms = new Term[]{new Term("firstname", "simon"), new Term("surname", "simon")};
- BlendedTermQuery query = BlendedTermQuery.booleanBlendedQuery(terms, true);
- TopDocs search = searcher.search(query, 3);
- ScoreDoc[] scoreDocs = search.scoreDocs;
- assertEquals(3, scoreDocs.length);
- assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue());
- }
- {
- BooleanQuery query = new BooleanQuery(false);
- query.add(new TermQuery(new Term("firstname", "simon")), BooleanClause.Occur.SHOULD);
- query.add(new TermQuery(new Term("surname", "simon")), BooleanClause.Occur.SHOULD);
- TopDocs search = searcher.search(query, 1);
- ScoreDoc[] scoreDocs = search.scoreDocs;
- assertEquals(Integer.toString(1), reader.document(scoreDocs[0].doc).getField("id").stringValue());
- }
- reader.close();
- w.close();
- dir.close();
- }
- @Test
- public void testDismaxQuery() throws IOException {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
- String[] username = new String[]{
- "foo fighters", "some cool fan", "cover band"};
- String[] song = new String[]{
- "generator", "foo fighers - generator", "foo fighters generator"
- };
- FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
- ft.setIndexOptions(random().nextBoolean() ? FieldInfo.IndexOptions.DOCS_ONLY : FieldInfo.IndexOptions.DOCS_AND_FREQS);
- ft.setOmitNorms(random().nextBoolean());
- ft.freeze();
- FieldType ft1 = new FieldType(TextField.TYPE_NOT_STORED);
- ft1.setIndexOptions(random().nextBoolean() ? FieldInfo.IndexOptions.DOCS_ONLY : FieldInfo.IndexOptions.DOCS_AND_FREQS);
- ft1.setOmitNorms(random().nextBoolean());
- ft1.freeze();
- for (int i = 0; i < username.length; i++) {
- Document d = new Document();
- d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
- d.add(new Field("username", username[i], ft));
- d.add(new Field("song", song[i], ft));
- w.addDocument(d);
- }
- int iters = scaledRandomIntBetween(25, 100);
- for (int j = 0; j < iters; j++) {
- Document d = new Document();
- d.add(new TextField("id", Integer.toString(username.length + j), Field.Store.YES));
- d.add(new Field("username", "foo fighters", ft1));
- d.add(new Field("song", "some bogus text to bump up IDF", ft1));
- w.addDocument(d);
- }
- w.commit();
- DirectoryReader reader = DirectoryReader.open(w, true);
- IndexSearcher searcher = setSimilarity(newSearcher(reader));
- {
- String[] fields = new String[]{"username", "song"};
- BooleanQuery query = new BooleanQuery(false);
- query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f), BooleanClause.Occur.SHOULD);
- query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "fighters"), 0.1f), BooleanClause.Occur.SHOULD);
- query.add(BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "generator"), 0.1f), BooleanClause.Occur.SHOULD);
- TopDocs search = searcher.search(query, 10);
- ScoreDoc[] scoreDocs = search.scoreDocs;
- assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue());
- }
- {
- BooleanQuery query = new BooleanQuery(false);
- DisjunctionMaxQuery uname = new DisjunctionMaxQuery(0.0f);
- uname.add(new TermQuery(new Term("username", "foo")));
- uname.add(new TermQuery(new Term("song", "foo")));
- DisjunctionMaxQuery s = new DisjunctionMaxQuery(0.0f);
- s.add(new TermQuery(new Term("username", "fighers")));
- s.add(new TermQuery(new Term("song", "fighers")));
- DisjunctionMaxQuery gen = new DisjunctionMaxQuery(0f);
- gen.add(new TermQuery(new Term("username", "generator")));
- gen.add(new TermQuery(new Term("song", "generator")));
- query.add(uname, BooleanClause.Occur.SHOULD);
- query.add(s, BooleanClause.Occur.SHOULD);
- query.add(gen, BooleanClause.Occur.SHOULD);
- TopDocs search = searcher.search(query, 4);
- ScoreDoc[] scoreDocs = search.scoreDocs;
- assertEquals(Integer.toString(1), reader.document(scoreDocs[0].doc).getField("id").stringValue());
- }
- reader.close();
- w.close();
- dir.close();
- }
- @Test
- public void testBasics() {
- final int iters = scaledRandomIntBetween(5, 25);
- for (int j = 0; j < iters; j++) {
- String[] fields = new String[1 + random().nextInt(10)];
- for (int i = 0; i < fields.length; i++) {
- fields[i] = _TestUtil.randomRealisticUnicodeString(random(), 1, 10);
- }
- String term = _TestUtil.randomRealisticUnicodeString(random(), 1, 10);
- Term[] terms = toTerms(fields, term);
- boolean disableCoord = random().nextBoolean();
- boolean useBoolean = random().nextBoolean();
- float tieBreaker = random().nextFloat();
- BlendedTermQuery query = useBoolean ? BlendedTermQuery.booleanBlendedQuery(terms, disableCoord) : BlendedTermQuery.dismaxBlendedQuery(terms, tieBreaker);
- QueryUtils.check(query);
- terms = toTerms(fields, term);
- BlendedTermQuery query2 = useBoolean ? BlendedTermQuery.booleanBlendedQuery(terms, disableCoord) : BlendedTermQuery.dismaxBlendedQuery(terms, tieBreaker);
- assertEquals(query, query2);
- }
- }
- public Term[] toTerms(String[] fields, String term) {
- Term[] terms = new Term[fields.length];
- List<String> fieldsList = Arrays.asList(fields);
- Collections.shuffle(fieldsList, random());
- fields = fieldsList.toArray(new String[0]);
- for (int i = 0; i < fields.length; i++) {
- terms[i] = new Term(fields[i], term);
- }
- return terms;
- }
- public IndexSearcher setSimilarity(IndexSearcher searcher) {
- Similarity similarity = random().nextBoolean() ? new BM25Similarity() : new DefaultSimilarity();
- searcher.setSimilarity(similarity);
- return searcher;
- }
- @Test
- public void testExtractTerms() {
- Set<Term> terms = new HashSet<>();
- int num = scaledRandomIntBetween(1, 10);
- for (int i = 0; i < num; i++) {
- terms.add(new Term(_TestUtil.randomRealisticUnicodeString(random(), 1, 10), _TestUtil.randomRealisticUnicodeString(random(), 1, 10)));
- }
- BlendedTermQuery blendedTermQuery = random().nextBoolean() ? BlendedTermQuery.dismaxBlendedQuery(terms.toArray(new Term[0]), random().nextFloat()) :
- BlendedTermQuery.booleanBlendedQuery(terms.toArray(new Term[0]), random().nextBoolean());
- Set<Term> extracted = new HashSet<>();
- blendedTermQuery.extractTerms(extracted);
- assertThat(extracted.size(), equalTo(terms.size()));
- assertThat(extracted, containsInAnyOrder(terms.toArray(new Term[0])));
- }
- }
|