ソースを参照

[core] add best_compression option for Lucene 5.0

Upgrades Lucene to the latest snapshot, and adds support for the BEST_COMPRESSION
parameter now available (with backwards compatibility, etc.) in Lucene.
This option uses deflate, tuned for highly compressible data.

index.codec::
The default value compresses stored data with LZ4 compression, but
this can be set to best_compression for a higher compression ratio,
at the expense of slower stored fields performance.

IMO it's safest to implement as a named codec here, because ES already
has logic to handle this correctly, and because it's unrealistic to have
a plethora of options to Lucene's default codec... we are practically
limited in Lucene to what we can support with back compat, so I don't
think we should overengineer this and add additional unnecessary plumbing.

See also:
https://issues.apache.org/jira/browse/LUCENE-5914
https://issues.apache.org/jira/browse/LUCENE-6089
https://issues.apache.org/jira/browse/LUCENE-6090
https://issues.apache.org/jira/browse/LUCENE-6100

Closes #8863
Robert Muir 11 年 前
コミット
a2ffe494ae

+ 5 - 0
docs/reference/index-modules.asciidoc

@@ -41,6 +41,11 @@ otherwise it is written in non-compound format.
 	refresh operation will be executed. Defaults to `1s`. Can be set to `-1`
 	in order to disable it.
 
+`index.codec`::
+        The `default` value compresses stored data with LZ4 compression, but
+        this can be set to `best_compression` for a higher compression ratio,
+        at the expense of slower stored fields performance.
+
 `index.shard.check_on_startup`::
         Should shard consistency be checked upon opening.
         When `true`, the shard will be checked, preventing it from being open in

+ 2 - 2
pom.xml

@@ -32,7 +32,7 @@
 
     <properties>
         <lucene.version>5.0.0</lucene.version>
-        <lucene.maven.version>5.0.0-snapshot-1642891</lucene.maven.version>
+        <lucene.maven.version>5.0.0-snapshot-1644303</lucene.maven.version>
         <tests.jvms>auto</tests.jvms>
         <tests.shuffle>true</tests.shuffle>
         <tests.output>onerror</tests.output>
@@ -54,7 +54,7 @@
         </repository>
         <repository>
             <id>Lucene snapshots</id>
-            <url>https://download.elasticsearch.org/lucenesnapshots/1642891</url>
+            <url>https://download.elasticsearch.org/lucenesnapshots/1644303</url>
         </repository>
     </repositories>
 

+ 15 - 2
src/main/java/org/elasticsearch/index/codec/CodecService.java

@@ -20,7 +20,11 @@
 package org.elasticsearch.index.codec;
 
 import com.google.common.collect.ImmutableMap;
+
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.common.inject.Inject;
@@ -50,6 +54,7 @@ public class CodecService extends AbstractIndexComponent {
     private final ImmutableMap<String, Codec> codecs;
 
     public final static String DEFAULT_CODEC = "default";
+    public final static String BEST_COMPRESSION_CODEC = "best_compression";
 
     public CodecService(Index index) {
         this(index, ImmutableSettings.Builder.EMPTY_SETTINGS);
@@ -68,9 +73,17 @@ public class CodecService extends AbstractIndexComponent {
         this.mapperService = mapperService;
         MapBuilder<String, Codec> codecs = MapBuilder.<String, Codec>newMapBuilder();
         if (mapperService == null) {
-            codecs.put(DEFAULT_CODEC, Codec.getDefault());
+            codecs.put(DEFAULT_CODEC, new Lucene50Codec());
+            codecs.put(BEST_COMPRESSION_CODEC, new Lucene50Codec(Mode.BEST_COMPRESSION));
         } else {
-            codecs.put(DEFAULT_CODEC, new PerFieldMappingPostingFormatCodec(mapperService,
+            codecs.put(DEFAULT_CODEC, 
+                    new PerFieldMappingPostingFormatCodec(Mode.BEST_SPEED,
+                    mapperService,
+                    postingsFormatService.get(PostingsFormatService.DEFAULT_FORMAT).get(),
+                    docValuesFormatService.get(DocValuesFormatService.DEFAULT_FORMAT).get(), logger));
+            codecs.put(BEST_COMPRESSION_CODEC, 
+                    new PerFieldMappingPostingFormatCodec(Mode.BEST_COMPRESSION,
+                    mapperService,
                     postingsFormatService.get(PostingsFormatService.DEFAULT_FORMAT).get(),
                     docValuesFormatService.get(DocValuesFormatService.DEFAULT_FORMAT).get(), logger));
         }

+ 3 - 1
src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java

@@ -23,6 +23,7 @@ import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
@@ -49,7 +50,8 @@ public class PerFieldMappingPostingFormatCodec extends Lucene50Codec {
         assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingPostingFormatCodec.class) : "PerFieldMappingPostingFormatCodec must subclass the latest lucene codec: " + Lucene.LATEST_CODEC;
     }
 
-    public PerFieldMappingPostingFormatCodec(MapperService mapperService, PostingsFormat defaultPostingFormat, DocValuesFormat defaultDocValuesFormat, ESLogger logger) {
+    public PerFieldMappingPostingFormatCodec(Lucene50StoredFieldsFormat.Mode compressionMode, MapperService mapperService, PostingsFormat defaultPostingFormat, DocValuesFormat defaultDocValuesFormat, ESLogger logger) {
+        super(compressionMode);
         this.mapperService = mapperService;
         this.logger = logger;
         this.defaultPostingFormat = defaultPostingFormat;

+ 7 - 2
src/main/java/org/elasticsearch/index/engine/internal/InternalEngine.java

@@ -20,6 +20,7 @@
 package org.elasticsearch.index.engine.internal;
 
 import com.google.common.collect.Lists;
+
 import org.apache.lucene.index.*;
 import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
 import org.apache.lucene.search.*;
@@ -69,6 +70,7 @@ import org.elasticsearch.threadpool.ThreadPool;
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.lang.reflect.Method;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -1054,10 +1056,13 @@ public class InternalEngine implements Engine {
         }
     }
 
+    // TODO: can we please remove this method?!
     private void waitForMerges(boolean flushAfter) {
         try {
-            currentIndexWriter().waitForMerges();
-        } catch (IOException e) {
+            Method method = IndexWriter.class.getDeclaredMethod("waitForMerges");
+            method.setAccessible(true);
+            method.invoke(currentIndexWriter());
+        } catch (ReflectiveOperationException e) {
             throw new OptimizeFailedEngineException(shardId, e);
         }
         if (flushAfter) {

+ 36 - 0
src/test/java/org/elasticsearch/index/codec/CodecTests.java

@@ -33,7 +33,15 @@ import org.apache.lucene.codecs.lucene46.Lucene46Codec;
 import org.apache.lucene.codecs.lucene49.Lucene49Codec;
 import org.apache.lucene.codecs.lucene50.Lucene50Codec;
 import org.apache.lucene.codecs.lucene50.Lucene50DocValuesFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.SegmentReader;
+import org.apache.lucene.store.Directory;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
@@ -173,6 +181,34 @@ public class CodecTests extends ElasticsearchSingleNodeLuceneTestCase {
         assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider(), instanceOf(PreBuiltDocValuesFormatProvider.class));
         assertThat(documentMapper.rootMapper(VersionFieldMapper.class).docValuesFormatProvider().get(), instanceOf(Lucene410DocValuesFormat.class));
     }
+    
+    public void testDefault() throws Exception {
+        Codec codec = createCodecService().codec("default");
+        assertCompressionEquals(Mode.BEST_SPEED, codec);
+    }
+    
+    public void testBestCompression() throws Exception {
+        Codec codec = createCodecService().codec("best_compression");
+        assertCompressionEquals(Mode.BEST_COMPRESSION, codec);
+    }
+    
+    // write some docs with it, inspect .si to see this was the used compression
+    private void assertCompressionEquals(Mode expected, Codec actual) throws Exception {
+        Directory dir = newDirectory();
+        IndexWriterConfig iwc = newIndexWriterConfig(null);
+        iwc.setCodec(actual);
+        IndexWriter iw = new IndexWriter(dir, iwc);
+        iw.addDocument(new Document());
+        iw.commit();
+        iw.close();
+        DirectoryReader ir = DirectoryReader.open(dir);
+        SegmentReader sr = (SegmentReader) ir.leaves().get(0).reader();
+        String v = sr.getSegmentInfo().info.getAttribute(Lucene50StoredFieldsFormat.MODE_KEY);
+        assertNotNull(v);
+        assertEquals(expected, Mode.valueOf(v));
+        ir.close();
+        dir.close();
+    }
 
     private static CodecService createCodecService() {
         return createCodecService(ImmutableSettings.Builder.EMPTY_SETTINGS);

+ 3 - 0
src/test/java/org/elasticsearch/index/store/StoreTest.java

@@ -187,6 +187,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
         IOUtils.close(verifyingOutput, dir);
     }
 
+    // TODO: remove this, its too fragile. just use a static old index instead.
     private static final class OldSIMockingCodec extends FilterCodec {
 
         protected OldSIMockingCodec() {
@@ -232,6 +233,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
                             }
                         }
                         output.writeStringSet(files);
+                        output.writeStringStringMap(si.getAttributes());
                         CodecUtil.writeFooter(output);
                         success = true;
                     } finally {
@@ -245,6 +247,7 @@ public class StoreTest extends ElasticsearchLuceneTestCase {
         }
     }
 
+    // IF THIS TEST FAILS ON UPGRADE GO LOOK AT THE OldSIMockingCodec!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
     @Test
     public void testWriteLegacyChecksums() throws IOException {
         final ShardId shardId = new ShardId(new Index("index"), 1);

+ 2 - 0
src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java

@@ -315,6 +315,8 @@ public abstract class ElasticsearchIntegrationTest extends ElasticsearchTestCase
 
             randomSettingsBuilder.put(SETTING_NUMBER_OF_SHARDS, numberOfShards())
                     .put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas());
+            
+            randomSettingsBuilder.put("index.codec", randomFrom("default", "best_compression"));
             XContentBuilder mappings = null;
             if (frequently() && randomDynamicTemplates()) {
                 mappings = XContentFactory.jsonBuilder().startObject().startObject("_default_");