@@ -46,6 +46,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.SetOnce;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.ClusterState;
@@ -53,14 +54,19 @@ import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.Metadata;
 import org.elasticsearch.common.CheckedConsumer;
 import org.elasticsearch.common.Nullable;
-import org.elasticsearch.common.io.stream.ReleasableBytesStreamOutput;
+import org.elasticsearch.common.bytes.PagedBytesReference;
+import org.elasticsearch.common.bytes.RecyclingBytesStreamOutput;
+import org.elasticsearch.common.io.Streams;
 import org.elasticsearch.common.lease.Releasable;
+import org.elasticsearch.common.lease.Releasables;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.util.BigArrays;
+import org.elasticsearch.common.util.ByteArray;
+import org.elasticsearch.common.util.PageCacheRecycler;
 import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.ToXContent;
@@ -73,7 +79,6 @@ import org.elasticsearch.env.NodeMetadata;
 import org.elasticsearch.index.Index;

 import java.io.Closeable;
-import java.io.FilterOutputStream;
 import java.io.IOError;
 import java.io.IOException;
 import java.nio.file.Files;
@@ -452,29 +457,6 @@ public class PersistedClusterStateService {
         FORMAT_PARAMS = new ToXContent.MapParams(params);
     }

-    /**
-     * A {@link Document} with a stored field containing serialized metadata written to a {@link ReleasableBytesStreamOutput} which must be
-     * released when no longer needed.
-     */
-    private static class ReleasableDocument implements Releasable {
-        private final Document document;
-        private final Releasable releasable;
-
-        ReleasableDocument(Document document, Releasable releasable) {
-            this.document = document;
-            this.releasable = releasable;
-        }
-
-        Document getDocument() {
-            return document;
-        }
-
-        @Override
-        public void close() {
-            releasable.close();
-        }
-    }
-
     /**
      * Encapsulates a single {@link IndexWriter} with its {@link Directory} for ease of closing, and a {@link Logger}. There is one of these
      * for each data path.
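
The ReleasableDocument wrapper removed above existed to tie each Lucene Document to the lifetime of the BigArrays-backed buffer holding its serialized stored field, so callers had to keep the wrapper open until every IndexWriter had consumed the document. A condensed sketch of the old call pattern (taken from the code replaced further down):

    // Old pattern: one releasable buffer per document.
    try (ReleasableDocument wrapper = makeGlobalMetadataDocument(metadata)) {
        for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
            metadataIndexWriter.updateGlobalMetadata(wrapper.getDocument());
        }
    } // only now are the buffer pages returned to BigArrays
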
@@ -547,6 +529,10 @@ public class PersistedClusterStateService {
         boolean fullStateWritten = false;
         private final AtomicBoolean closed = new AtomicBoolean();

+        // The size of the document buffer that was used for the last write operation, used as a hint for allocating the buffer for the
+        // next one.
+        private int documentBufferUsed;
+
         private Writer(List<MetadataIndexWriter> metadataIndexWriters, String nodeId, BigArrays bigArrays,
                        LongSupplier relativeTimeMillisSupplier, Supplier<TimeValue> slowWriteLoggingThresholdSupplier) {
             this.metadataIndexWriters = metadataIndexWriters;
@@ -650,56 +636,60 @@ public class PersistedClusterStateService {
             logger.trace("currentTerm [{}] matches previous currentTerm, writing changes only",
                 metadata.coordinationMetadata().term());

-            final boolean updateGlobalMeta = Metadata.isGlobalStateEquals(previouslyWrittenMetadata, metadata) == false;
-            if (updateGlobalMeta) {
-                try (ReleasableDocument globalMetadataDocument = makeGlobalMetadataDocument(metadata)) {
+            try (DocumentBuffer documentBuffer = allocateBuffer()) {
+
+                final boolean updateGlobalMeta = Metadata.isGlobalStateEquals(previouslyWrittenMetadata, metadata) == false;
+                if (updateGlobalMeta) {
+                    final Document globalMetadataDocument = makeGlobalMetadataDocument(metadata, documentBuffer);
                     for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
-                        metadataIndexWriter.updateGlobalMetadata(globalMetadataDocument.getDocument());
+                        metadataIndexWriter.updateGlobalMetadata(globalMetadataDocument);
                     }
                 }
-            }

-            final Map<String, Long> indexMetadataVersionByUUID = new HashMap<>(previouslyWrittenMetadata.indices().size());
-            for (ObjectCursor<IndexMetadata> cursor : previouslyWrittenMetadata.indices().values()) {
-                final IndexMetadata indexMetadata = cursor.value;
-                final Long previousValue = indexMetadataVersionByUUID.putIfAbsent(indexMetadata.getIndexUUID(), indexMetadata.getVersion());
-                assert previousValue == null : indexMetadata.getIndexUUID() + " already mapped to " + previousValue;
-            }
+                final Map<String, Long> indexMetadataVersionByUUID = new HashMap<>(previouslyWrittenMetadata.indices().size());
+                for (ObjectCursor<IndexMetadata> cursor : previouslyWrittenMetadata.indices().values()) {
+                    final IndexMetadata indexMetadata = cursor.value;
+                    final Long previousValue
+                        = indexMetadataVersionByUUID.putIfAbsent(indexMetadata.getIndexUUID(), indexMetadata.getVersion());
+                    assert previousValue == null : indexMetadata.getIndexUUID() + " already mapped to " + previousValue;
+                }

-            int numIndicesUpdated = 0;
-            int numIndicesUnchanged = 0;
-            for (ObjectCursor<IndexMetadata> cursor : metadata.indices().values()) {
-                final IndexMetadata indexMetadata = cursor.value;
-                final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID());
-                if (previousVersion == null || indexMetadata.getVersion() != previousVersion) {
-                    logger.trace("updating metadata for [{}], changing version from [{}] to [{}]",
-                        indexMetadata.getIndex(), previousVersion, indexMetadata.getVersion());
-                    numIndicesUpdated++;
-                    try (ReleasableDocument indexMetadataDocument = makeIndexMetadataDocument(indexMetadata)) {
+                int numIndicesUpdated = 0;
+                int numIndicesUnchanged = 0;
+                for (ObjectCursor<IndexMetadata> cursor : metadata.indices().values()) {
+                    final IndexMetadata indexMetadata = cursor.value;
+                    final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID());
+                    if (previousVersion == null || indexMetadata.getVersion() != previousVersion) {
+                        logger.trace("updating metadata for [{}], changing version from [{}] to [{}]",
+                            indexMetadata.getIndex(), previousVersion, indexMetadata.getVersion());
+                        numIndicesUpdated++;
+                        final Document indexMetadataDocument = makeIndexMetadataDocument(indexMetadata, documentBuffer);
                         for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
-                            metadataIndexWriter.updateIndexMetadataDocument(indexMetadataDocument.getDocument(), indexMetadata.getIndex());
+                            metadataIndexWriter.updateIndexMetadataDocument(indexMetadataDocument, indexMetadata.getIndex());
                         }
+                    } else {
+                        numIndicesUnchanged++;
+                        logger.trace("no action required for [{}]", indexMetadata.getIndex());
                     }
-                } else {
-                    numIndicesUnchanged++;
-                    logger.trace("no action required for [{}]", indexMetadata.getIndex());
+                    indexMetadataVersionByUUID.remove(indexMetadata.getIndexUUID());
                 }
-                indexMetadataVersionByUUID.remove(indexMetadata.getIndexUUID());
-            }

-            for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) {
+                documentBufferUsed = documentBuffer.getMaxUsed();
+
+                for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) {
+                    for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
+                        metadataIndexWriter.deleteIndexMetadata(removedIndexUUID);
+                    }
+                }
+
+                // Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
+                // gracefully than one that occurs during the commit process.
                 for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
-                    metadataIndexWriter.deleteIndexMetadata(removedIndexUUID);
+                    metadataIndexWriter.flush();
                 }
-            }

-            // Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
-            // gracefully than one that occurs during the commit process.
-            for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
-                metadataIndexWriter.flush();
+                return new WriterStats(updateGlobalMeta, numIndicesUpdated, numIndicesUnchanged);
             }
-
-            return new WriterStats(updateGlobalMeta, numIndicesUpdated, numIndicesUnchanged);
         }

         /**
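
The incremental path above computes a delta against the last-written state using a version map: seed a map with each index's previously written version, drop every UUID seen in the current state, and whatever remains in the map must have been deleted. A standalone sketch of that idiom (hypothetical names; needs java.util.HashMap and java.util.Map):

    static void diffIndices(Map<String, Long> previousVersions, Map<String, Long> currentVersions) {
        final Map<String, Long> remaining = new HashMap<>(previousVersions);
        for (Map.Entry<String, Long> index : currentVersions.entrySet()) {
            final Long previousVersion = remaining.remove(index.getKey());
            if (previousVersion == null || previousVersion.longValue() != index.getValue().longValue()) {
                System.out.println("rewrite " + index.getKey()); // new or changed index
            } else {
                System.out.println("unchanged " + index.getKey());
            }
        }
        remaining.keySet().forEach(uuid -> System.out.println("delete " + uuid)); // leftovers were removed
    }
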
@@ -716,28 +706,39 @@ public class PersistedClusterStateService {
          * Add documents for the metadata of the given cluster state, assuming that there are currently no documents.
          */
         private WriterStats addMetadata(Metadata metadata) throws IOException {
-            try (ReleasableDocument globalMetadataDocument = makeGlobalMetadataDocument(metadata)) {
+            try (DocumentBuffer documentBuffer = allocateBuffer()) {
+
+                final Document globalMetadataDocument = makeGlobalMetadataDocument(metadata, documentBuffer);
                 for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
-                    metadataIndexWriter.updateGlobalMetadata(globalMetadataDocument.getDocument());
+                    metadataIndexWriter.updateGlobalMetadata(globalMetadataDocument);
                 }
-            }

-            for (ObjectCursor<IndexMetadata> cursor : metadata.indices().values()) {
-                final IndexMetadata indexMetadata = cursor.value;
-                try (ReleasableDocument indexMetadataDocument = makeIndexMetadataDocument(indexMetadata)) {
+                for (ObjectCursor<IndexMetadata> cursor : metadata.indices().values()) {
+                    final IndexMetadata indexMetadata = cursor.value;
+                    final Document indexMetadataDocument = makeIndexMetadataDocument(indexMetadata, documentBuffer);
                     for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
-                        metadataIndexWriter.updateIndexMetadataDocument(indexMetadataDocument.getDocument(), indexMetadata.getIndex());
+                        metadataIndexWriter.updateIndexMetadataDocument(indexMetadataDocument, indexMetadata.getIndex());
                     }
                 }
-            }

-            // Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
-            // gracefully than one that occurs during the commit process.
-            for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
-                metadataIndexWriter.flush();
+                documentBufferUsed = documentBuffer.getMaxUsed();
+
+                // Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
+                // gracefully than one that occurs during the commit process.
+                for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
+                    metadataIndexWriter.flush();
+                }
+
+                return new WriterStats(true, metadata.indices().size(), 0);
             }
+        }

-            return new WriterStats(true, metadata.indices().size(), 0);
+        private DocumentBuffer allocateBuffer() {
+            // heuristics for picking the initial buffer size based on the buffer we needed last time: try and fit within a single page,
+            // but if we needed more than a single page last time then allow a bit more space to try and avoid needing to grow the buffer
+            // later on.
+            final int extraSpace = documentBufferUsed <= PageCacheRecycler.PAGE_SIZE_IN_BYTES ? 0 : PageCacheRecycler.PAGE_SIZE_IN_BYTES;
+            return new DocumentBuffer(documentBufferUsed + extraSpace, bigArrays);
         }

         public void writeIncrementalTermUpdateAndCommit(long currentTerm, long lastAcceptedVersion) throws IOException {
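
allocateBuffer() turns the documentBufferUsed hint recorded after each write into an initial buffer size: anything that fit in a single recycled page last time is assumed to fit again, while anything larger gets one extra page of headroom so a slightly bigger cluster state does not force an immediate regrow. A worked example, assuming the usual 16KiB value of PageCacheRecycler.PAGE_SIZE_IN_BYTES (values illustrative):

    //   hint (bytes)   extraSpace   requested
    //   0              0            0         -> DocumentBuffer still claims one 16KiB page
    //   10_000         0            10_000    -> fits the single recycled page
    //   100_000        16_384       116_384   -> plain heap array with a page of slack
    static int requestedSize(int documentBufferUsed) {
        final int extraSpace = documentBufferUsed <= 16_384 ? 0 : 16_384;
        return documentBufferUsed + extraSpace;
    }
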
@@ -802,59 +803,97 @@ public class PersistedClusterStateService {
             }
         }

-        private ReleasableDocument makeIndexMetadataDocument(IndexMetadata indexMetadata) throws IOException {
-            final ReleasableDocument indexMetadataDocument = makeDocument(INDEX_TYPE_NAME, indexMetadata);
-            boolean success = false;
-            try {
-                final String indexUUID = indexMetadata.getIndexUUID();
-                assert indexUUID.equals(IndexMetadata.INDEX_UUID_NA_VALUE) == false;
-                indexMetadataDocument.getDocument().add(new StringField(INDEX_UUID_FIELD_NAME, indexUUID, Field.Store.NO));
-                success = true;
-                return indexMetadataDocument;
-            } finally {
-                if (success == false) {
-                    IOUtils.closeWhileHandlingException(indexMetadataDocument);
-                }
-            }
+        private Document makeIndexMetadataDocument(IndexMetadata indexMetadata, DocumentBuffer documentBuffer) throws IOException {
+            final Document indexMetadataDocument = makeDocument(INDEX_TYPE_NAME, indexMetadata, documentBuffer);
+            final String indexUUID = indexMetadata.getIndexUUID();
+            assert indexUUID.equals(IndexMetadata.INDEX_UUID_NA_VALUE) == false;
+            indexMetadataDocument.add(new StringField(INDEX_UUID_FIELD_NAME, indexUUID, Field.Store.NO));
+            return indexMetadataDocument;
         }

-        private ReleasableDocument makeGlobalMetadataDocument(Metadata metadata) throws IOException {
-            return makeDocument(GLOBAL_TYPE_NAME, metadata);
+        private Document makeGlobalMetadataDocument(Metadata metadata, DocumentBuffer documentBuffer) throws IOException {
+            return makeDocument(GLOBAL_TYPE_NAME, metadata, documentBuffer);
         }

-        private ReleasableDocument makeDocument(String typeName, ToXContent metadata) throws IOException {
+        private Document makeDocument(String typeName, ToXContent metadata, DocumentBuffer documentBuffer) throws IOException {
             final Document document = new Document();
             document.add(new StringField(TYPE_FIELD_NAME, typeName, Field.Store.NO));

-            boolean success = false;
-            final ReleasableBytesStreamOutput releasableBytesStreamOutput = new ReleasableBytesStreamOutput(bigArrays);
-            try {
-                final FilterOutputStream outputStream = new FilterOutputStream(releasableBytesStreamOutput) {
-
-                    @Override
-                    public void write(byte[] b, int off, int len) throws IOException {
-                        out.write(b, off, len);
-                    }
-
-                    @Override
-                    public void close() {
-                        // closing the XContentBuilder should not release the bytes yet
-                    }
-                };
-                try (XContentBuilder xContentBuilder = XContentFactory.contentBuilder(XContentType.SMILE, outputStream)) {
+            try (RecyclingBytesStreamOutput streamOutput = documentBuffer.streamOutput()) {
+                try (XContentBuilder xContentBuilder = XContentFactory.contentBuilder(XContentType.SMILE,
+                    Streams.flushOnCloseStream(streamOutput))) {
                     xContentBuilder.startObject();
                     metadata.toXContent(xContentBuilder, FORMAT_PARAMS);
                     xContentBuilder.endObject();
                 }
-                document.add(new StoredField(DATA_FIELD_NAME, releasableBytesStreamOutput.bytes().toBytesRef()));
-                final ReleasableDocument releasableDocument = new ReleasableDocument(document, releasableBytesStreamOutput);
-                success = true;
-                return releasableDocument;
-            } finally {
-                if (success == false) {
-                    IOUtils.closeWhileHandlingException(releasableBytesStreamOutput);
+                document.add(new StoredField(DATA_FIELD_NAME, streamOutput.toBytesRef()));
+            }
+
+            return document;
+        }
+    }
+
+    /**
+     * Holds the current buffer, keeping track of new allocations as it grows.
+     */
+    private static class DocumentBuffer implements Releasable {
+        private final BigArrays bigArrays;
+
+        @Nullable // if the initial page doesn't need releasing
+        private final Releasable releasable;
+        private byte[] buffer;
+        private int maxUsed;
+
+        DocumentBuffer(int size, BigArrays bigArrays) {
+            if (size <= PageCacheRecycler.PAGE_SIZE_IN_BYTES) {
+                final ByteArray byteArray = bigArrays.newByteArray(PageCacheRecycler.PAGE_SIZE_IN_BYTES);
+                final BytesRefIterator iterator = new PagedBytesReference(byteArray, Math.toIntExact(byteArray.size())).iterator();
+                final BytesRef firstPage;
+                try {
+                    firstPage = iterator.next();
+                    assert iterator.next() == null : "should be one page";
+                } catch (IOException e) {
+                    throw new AssertionError("impossible", e);
                 }
+
+                // we require that we have the whole page to ourselves
+                assert firstPage.offset == 0 : firstPage.offset;
+                assert firstPage.bytes.length == PageCacheRecycler.PAGE_SIZE_IN_BYTES : firstPage.bytes.length;
+                buffer = firstPage.bytes;
+                releasable = byteArray;
+            } else {
+                buffer = new byte[size];
+                releasable = null;
             }
+            this.bigArrays = bigArrays;
+            maxUsed = 0;
+        }
+
+        RecyclingBytesStreamOutput streamOutput() {
+            return new RecyclingBytesStreamOutput(buffer, bigArrays) {
+                @Override
+                public BytesRef toBytesRef() {
+                    final BytesRef bytesRef = super.toBytesRef();
+                    maxUsed = Math.max(maxUsed, bytesRef.length);
+                    if (buffer != bytesRef.bytes) {
+                        assert bytesRef.length > buffer.length;
+                        logger.trace("growing document buffer from [{}] to [{}]", buffer.length, maxUsed);
+                        buffer = bytesRef.bytes;
+                    }
+                    assert maxUsed <= buffer.length;
+                    return bytesRef;
+                }
+            };
+        }
+
+        int getMaxUsed() {
+            return maxUsed;
+        }
+
+        @Override
+        public void close() {
+            Releasables.close(releasable);
         }
     }
 }
+
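
DocumentBuffer and RecyclingBytesStreamOutput together give each write operation a single flat byte[] that every document of that write is serialized into in turn; toBytesRef() records the high-water mark and adopts any larger array the stream had to grow into. A hypothetical driver (sketch only; serialize(...) and metadataParts stand in for the SMILE serialization in makeDocument above):

    int hint = 0; // plays the role of documentBufferUsed in the Writer
    try (DocumentBuffer documentBuffer = new DocumentBuffer(hint, bigArrays)) {
        for (ToXContent part : metadataParts) {
            try (RecyclingBytesStreamOutput streamOutput = documentBuffer.streamOutput()) {
                serialize(streamOutput, part);                    // hypothetical helper
                final BytesRef bytes = streamOutput.toBytesRef(); // updates maxUsed, may swap in a grown buffer
                // bytes points into the shared buffer, so the document built from it must be
                // handed to the IndexWriter before the next iteration reuses the buffer
            }
        }
        hint = documentBuffer.getMaxUsed(); // feeds allocateBuffer() on the next write
    } // close() returns the recycled page, if one was claimed, to BigArrays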