Browse Source

add lucene language model similarities (Dirichlet & JelinekMercer)

Kevin Wang 11 years ago
parent
commit
ecab74fe6c

+ 25 - 0
docs/reference/index-modules/similarity.asciidoc

@@ -121,6 +121,31 @@ based model] . This similarity has the following options:
 
 Type name: `IB`
 
+[float]
+[[lm_dirichlet]]
+==== LM Dirichlet similarity.
+
+http://lucene.apache.org/core/4_7_1/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html[LM
+Dirichlet similarity] . This similarity has the following options:
+
+[horizontal]
+`mu`::  The smoothing parameter of the Dirichlet prior. Defaults to `2000`.
+
+Type name: `LMDirichlet`
+
+[float]
+[[lm_jelinek_mercer]]
+==== LM Jelinek Mercer similarity.
+
+http://lucene.apache.org/core/4_7_1/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html[LM
+Jelinek Mercer similarity] . This similarity has the following options:
+
+[horizontal]
+`lambda`::  The optimal value depends on both the collection and the query: it is around `0.1`
+for title queries and `0.7` for long queries. Defaults to `0.1`.
+
+Type name: `LMJelinekMercer`
+
 [float]
 [[default-base]]
 ==== Default and Base Similarities

+ 55 - 0
src/main/java/org/elasticsearch/index/similarity/LMDirichletSimilarityProvider.java

@@ -0,0 +1,55 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.similarity;
+
+import org.apache.lucene.search.similarities.LMDirichletSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * {@link SimilarityProvider} backed by a single {@link LMDirichletSimilarity}
+ * that is created from the provider's settings.
+ * <p/>
+ * Recognised settings:
+ * <ul>
+ *     <li>mu (read as a float, {@code 2000} if not set)</li>
+ * </ul>
+ * @see LMDirichletSimilarity For more information about configuration
+ */
+public class LMDirichletSimilarityProvider extends AbstractSimilarityProvider {
+
+    /** Created once in the constructor and handed out by {@link #get()}. */
+    private final LMDirichletSimilarity dirichlet;
+
+    @Inject
+    public LMDirichletSimilarityProvider(@Assisted String name, @Assisted Settings settings) {
+        super(name);
+        // "mu" is optional; 2000f is the fallback passed to getAsFloat.
+        this.dirichlet = new LMDirichletSimilarity(settings.getAsFloat("mu", 2000f));
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Similarity get() {
+        return this.dirichlet;
+    }
+}

+ 55 - 0
src/main/java/org/elasticsearch/index/similarity/LMJelinekMercerSimilarityProvider.java

@@ -0,0 +1,55 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.similarity;
+
+import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * {@link SimilarityProvider} backed by a single {@link LMJelinekMercerSimilarity}
+ * that is created from the provider's settings.
+ * <p/>
+ * Recognised settings:
+ * <ul>
+ *     <li>lambda (read as a float, {@code 0.1} if not set)</li>
+ * </ul>
+ * @see LMJelinekMercerSimilarity For more information about configuration
+ */
+public class LMJelinekMercerSimilarityProvider extends AbstractSimilarityProvider {
+
+    /** Created once in the constructor and handed out by {@link #get()}. */
+    private final LMJelinekMercerSimilarity jelinekMercer;
+
+    @Inject
+    public LMJelinekMercerSimilarityProvider(@Assisted String name, @Assisted Settings settings) {
+        super(name);
+        // "lambda" is optional; 0.1f is the fallback passed to getAsFloat.
+        this.jelinekMercer = new LMJelinekMercerSimilarity(settings.getAsFloat("lambda", 0.1f));
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Similarity get() {
+        return this.jelinekMercer;
+    }
+}

+ 41 - 0
src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java

@@ -151,6 +151,47 @@ public class SimilarityTests extends ElasticsearchTestCase {
         assertThat(((NormalizationH2) similarity.getNormalization()).getC(), equalTo(3f));
     }
 
+    @Test
+    public void testResolveSimilaritiesFromMapping_LMDirichlet() throws IOException {
+        // Declare a custom LMDirichlet similarity with a non-default mu.
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.similarity.my_similarity.type", "LMDirichlet")
+                .put("index.similarity.my_similarity.mu", 3000f)
+                .build();
+        // Mapping whose only field refers to that similarity by name.
+        String mappingJson = XContentFactory.jsonBuilder()
+                .startObject().startObject("type").startObject("properties")
+                .startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
+                .endObject().endObject().endObject().string();
+
+        DocumentMapper parsed = similarityService(settings).mapperService().documentMapperParser().parse(mappingJson);
+        SimilarityProvider provider = parsed.mappers().name("field1").mapper().similarity();
+        assertThat(provider, instanceOf(LMDirichletSimilarityProvider.class));
+        // The configured mu must reach the underlying Lucene similarity.
+        assertThat(((LMDirichletSimilarity) provider.get()).getMu(), equalTo(3000f));
+    }
+
+    @Test
+    public void testResolveSimilaritiesFromMapping_LMJelinekMercer() throws IOException {
+        // Declare a custom LMJelinekMercer similarity with a non-default lambda.
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.similarity.my_similarity.type", "LMJelinekMercer")
+                .put("index.similarity.my_similarity.lambda", 0.7f)
+                .build();
+        // Mapping whose only field refers to that similarity by name.
+        String mappingJson = XContentFactory.jsonBuilder()
+                .startObject().startObject("type").startObject("properties")
+                .startObject("field1").field("type", "string").field("similarity", "my_similarity").endObject()
+                .endObject().endObject().endObject().string();
+
+        DocumentMapper parsed = similarityService(settings).mapperService().documentMapperParser().parse(mappingJson);
+        SimilarityProvider provider = parsed.mappers().name("field1").mapper().similarity();
+        assertThat(provider, instanceOf(LMJelinekMercerSimilarityProvider.class));
+        // The configured lambda must reach the underlying Lucene similarity.
+        assertThat(((LMJelinekMercerSimilarity) provider.get()).getLambda(), equalTo(0.7f));
+    }
+
+
     private static SimilarityService similarityService() {
         return similarityService(ImmutableSettings.Builder.EMPTY_SETTINGS); // delegates to the Settings overload with EMPTY_SETTINGS
     }