浏览代码

Add detail to slow cluster state warning message (#83221)

Today if writing the cluster state is slow then we report how many index
metadata documents we wrote but we don't mention how many documents were
deleted. Deleting docs might trigger a merge and explain why the update
is slower than expected.

This commit adds info about the number of removed indices and also
distinguishes new indices from updates to existing indices, since they
also have different behaviour in terms of deletes.
David Turner 3 年之前
父节点
当前提交
3d0655fb42

+ 5 - 0
docs/changelog/83221.yaml

@@ -0,0 +1,5 @@
+pr: 83221
+summary: Add detail to slow cluster state warning message
+area: Cluster Coordination
+type: enhancement
+issues: []

+ 42 - 33
server/src/main/java/org/elasticsearch/gateway/PersistedClusterStateService.java

@@ -84,6 +84,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -770,18 +771,13 @@ public class PersistedClusterStateService {
                 final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
                 if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
                     logger.warn(
-                        "writing cluster state took [{}ms] which is above the warn threshold of [{}]; "
-                            + "wrote full state with [{}] indices",
+                        "writing full cluster state took [{}ms] which is above the warn threshold of [{}]; {}",
                         durationMillis,
                         finalSlowWriteLoggingThreshold,
-                        stats.numIndicesUpdated
+                        stats
                     );
                 } else {
-                    logger.debug(
-                        "writing cluster state took [{}ms]; " + "wrote full state with [{}] indices",
-                        durationMillis,
-                        stats.numIndicesUpdated
-                    );
+                    logger.debug("writing full cluster state took [{}ms]; {}", durationMillis, stats);
                 }
             } finally {
                 closeIfAnyIndexWriterHasTragedyOrIsClosed();
@@ -809,23 +805,13 @@ public class PersistedClusterStateService {
                 final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
                 if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
                     logger.warn(
-                        "writing cluster state took [{}ms] which is above the warn threshold of [{}]; "
-                            + "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices",
+                        "writing cluster state took [{}ms] which is above the warn threshold of [{}]; {}",
                         durationMillis,
                         finalSlowWriteLoggingThreshold,
-                        stats.globalMetaUpdated,
-                        stats.numIndicesUpdated,
-                        stats.numIndicesUnchanged
+                        stats
                     );
                 } else {
-                    logger.debug(
-                        "writing cluster state took [{}ms]; "
-                            + "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices",
-                        durationMillis,
-                        stats.globalMetaUpdated,
-                        stats.numIndicesUpdated,
-                        stats.numIndicesUnchanged
-                    );
+                    logger.debug("writing cluster state took [{}ms]; {}", durationMillis, stats);
                 }
             } finally {
                 closeIfAnyIndexWriterHasTragedyOrIsClosed();
@@ -864,7 +850,9 @@ public class PersistedClusterStateService {
                 assert previousValue == null : indexMetadata.getIndexUUID() + " already mapped to " + previousValue;
             }
 
+            int numIndicesAdded = 0;
             int numIndicesUpdated = 0;
+            int numIndicesRemoved = 0;
             int numIndicesUnchanged = 0;
             for (IndexMetadata indexMetadata : metadata.indices().values()) {
                 final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID());
@@ -875,7 +863,11 @@ public class PersistedClusterStateService {
                         previousVersion,
                         indexMetadata.getVersion()
                     );
-                    numIndicesUpdated++;
+                    if (previousVersion == null) {
+                        numIndicesAdded++;
+                    } else {
+                        numIndicesUpdated++;
+                    }
 
                     for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
                         metadataIndexWriter.deleteIndexMetadata(indexMetadata.getIndexUUID());
@@ -891,6 +883,7 @@ public class PersistedClusterStateService {
 
             for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) {
                 for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
+                    numIndicesRemoved++;
                     metadataIndexWriter.deleteIndexMetadata(removedIndexUUID);
                 }
             }
@@ -901,7 +894,7 @@ public class PersistedClusterStateService {
                 metadataIndexWriter.flush();
             }
 
-            return new WriterStats(updateGlobalMeta, numIndicesUpdated, numIndicesUnchanged);
+            return new WriterStats(false, updateGlobalMeta, numIndicesUnchanged, numIndicesAdded, numIndicesUpdated, numIndicesRemoved);
         }
 
         private static int lastPageValue(boolean isLastPage) {
@@ -977,7 +970,7 @@ public class PersistedClusterStateService {
                 metadataIndexWriter.flush();
             }
 
-            return new WriterStats(true, metadata.indices().size(), 0);
+            return new WriterStats(true, true, 0, 0, metadata.indices().size(), 0);
         }
 
         public void writeIncrementalTermUpdateAndCommit(long currentTerm, long lastAcceptedVersion, Version oldestIndexVersion)
@@ -1056,15 +1049,31 @@ public class PersistedClusterStateService {
             }
         }
 
-        static class WriterStats {
-            final boolean globalMetaUpdated;
-            final long numIndicesUpdated;
-            final long numIndicesUnchanged;
-
-            WriterStats(boolean globalMetaUpdated, long numIndicesUpdated, long numIndicesUnchanged) {
-                this.globalMetaUpdated = globalMetaUpdated;
-                this.numIndicesUpdated = numIndicesUpdated;
-                this.numIndicesUnchanged = numIndicesUnchanged;
+        private record WriterStats(
+            boolean isFullWrite,
+            boolean globalMetaUpdated,
+            int numIndicesUnchanged,
+            int numIndicesAdded,
+            int numIndicesUpdated,
+            int numIndicesRemoved
+        ) {
+            @Override
+            public String toString() {
+                if (isFullWrite) {
+                    return String.format(Locale.ROOT, "wrote global metadata and metadata for [%d] indices", numIndicesUpdated);
+                } else {
+                    return String.format(
+                        Locale.ROOT,
+                        """
+                            [%s] global metadata, wrote metadata for [%d] new indices and [%d] existing indices, \
+                            removed metadata for [%d] indices and skipped [%d] unchanged indices""",
+                        globalMetaUpdated ? "wrote" : "skipped writing",
+                        numIndicesAdded,
+                        numIndicesUpdated,
+                        numIndicesRemoved,
+                        numIndicesUnchanged
+                    );
+                }
             }
         }
     }

+ 13 - 5
server/src/test/java/org/elasticsearch/gateway/PersistedClusterStateServiceTests.java

@@ -1192,7 +1192,9 @@ public class PersistedClusterStateServiceTests extends ESTestCase {
                         "should see warning at threshold",
                         PersistedClusterStateService.class.getCanonicalName(),
                         Level.WARN,
-                        "writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices"
+                        """
+                            writing full cluster state took [*] which is above the warn threshold of [*]; \
+                            wrote global metadata and metadata for [0] indices"""
                     )
                 );
 
@@ -1206,7 +1208,9 @@ public class PersistedClusterStateServiceTests extends ESTestCase {
                         "should see warning above threshold",
                         PersistedClusterStateService.class.getCanonicalName(),
                         Level.WARN,
-                        "writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices"
+                        """
+                            writing full cluster state took [*] which is above the warn threshold of [*]; \
+                            wrote global metadata and metadata for [0] indices"""
                     )
                 );
 
@@ -1238,7 +1242,9 @@ public class PersistedClusterStateServiceTests extends ESTestCase {
                         "should see warning at reduced threshold",
                         PersistedClusterStateService.class.getCanonicalName(),
                         Level.WARN,
-                        "writing cluster state took [*] which is above the warn threshold of [*]; " + "wrote full state with [0] indices"
+                        """
+                            writing full cluster state took [*] which is above the warn threshold of [*]; \
+                            wrote global metadata and metadata for [0] indices"""
                     )
                 );
 
@@ -1269,8 +1275,10 @@ public class PersistedClusterStateServiceTests extends ESTestCase {
                         "should see warning at threshold",
                         PersistedClusterStateService.class.getCanonicalName(),
                         Level.WARN,
-                        "writing cluster state took [*] which is above the warn threshold of [*]; "
-                            + "wrote global metadata [false] and metadata for [1] indices and skipped [0] unchanged indices"
+                        """
+                            writing cluster state took [*] which is above the warn threshold of [*]; [skipped writing] global metadata, \
+                            wrote metadata for [1] new indices and [0] existing indices, removed metadata for [0] indices and \
+                            skipped [0] unchanged indices"""
                     )
                 );