Permit metadata updates on flood-stage-blocked indices (#81781)

If a node reaches the flood stage watermark then we automatically apply
the `read_only_allow_delete` block to all its indices to prevent any
further growth in data. Users are expected to fix the disk space issue
by adding more space or deleting indices. However, some users may prefer
to fix the disk space issue by modifying some of the index settings,
perhaps removing replicas or adjusting an allocation filter to move
shards onto nodes with more space. Today this isn't possible since the
`read_only_allow_delete` block also applies to metadata writes. Blocking
metadata writes isn't necessary to protect against further increases in
disk usage, and makes it harder for users to resolve the disk space
issue, so this commit removes the `METADATA_WRITE` level from the block
definition.
David Turner, 3 years ago
commit b1db7b67c6
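
To illustrate what this change permits, here is a minimal integration-test-style sketch (hypothetical class and test names, not part of this commit; it assumes the usual ESIntegTestCase plumbing such as enableIndexBlock, disableIndexBlock and assertAcked). It applies the read-only-allow-delete block by hand and then checks that a metadata write, here a replica-count reduction, is accepted while the block is in place:

import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ESIntegTestCase;

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;

// Hypothetical sketch, not part of this commit: with METADATA_WRITE removed from the
// read-only-allow-delete block definition, settings updates succeed under the block.
public class ReadOnlyAllowDeleteSettingsUpdateIT extends ESIntegTestCase {

    public void testSettingsUpdatePermittedUnderBlock() {
        createIndex("test");
        ensureGreen("test");
        try {
            // apply the same block that the disk threshold monitor sets at the flood stage
            enableIndexBlock("test", IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE);

            // a metadata write such as reducing the replica count is no longer rejected
            assertAcked(
                client().admin()
                    .indices()
                    .prepareUpdateSettings("test")
                    .setSettings(Settings.builder().put("index.number_of_replicas", 0))
            );
        } finally {
            disableIndexBlock("test", IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE);
        }
    }
}

Before this commit the update-settings request above would have been rejected with INDEX_READ_ONLY_ALLOW_DELETE_BLOCK, which is exactly the assertion removed from DeleteIndexBlocksIT below.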

+ 13 - 6
docs/reference/index-modules/blocks.asciidoc

@@ -25,13 +25,19 @@ index:
 
 `index.blocks.read_only_allow_delete`::
 
-    Similar to `index.blocks.read_only`, but also allows deleting the index to
+    Similar to `index.blocks.write`, but also allows deleting the index to
     make more resources available. The <<disk-based-shard-allocation,disk-based shard
     allocator>> may add and remove this block automatically.
 +
-Deleting documents from an index to release resources - rather than deleting the index itself - can increase the index size over time. When `index.blocks.read_only_allow_delete` is set to `true`, deleting documents is not permitted. However, deleting the index itself releases the read-only index block and makes resources available almost immediately.
+Deleting documents from an index to release resources - rather than deleting
+the index itself - can increase the index size over time. When
+`index.blocks.read_only_allow_delete` is set to `true`, deleting documents is
+not permitted. However, deleting the index itself releases the read-only index
+block and makes resources available almost immediately.
 +
-IMPORTANT: {es} adds and removes the read-only index block automatically when the disk utilization falls below the high watermark, controlled by <<cluster-routing-flood-stage,cluster.routing.allocation.disk.watermark.flood_stage>>.
+IMPORTANT: {es} adds and removes the read-only index block automatically when
+the disk utilization falls below the high watermark, controlled by
+<<cluster-routing-flood-stage,cluster.routing.allocation.disk.watermark.flood_stage>>.
 
 `index.blocks.read`::
 
@@ -39,9 +45,10 @@ IMPORTANT: {es} adds and removes the read-only index block automatically when th
 
 `index.blocks.write`::
 
-    Set to `true` to disable data write operations against the index. Unlike `read_only`,
-    this setting does not affect metadata. For instance, you can close an index with a `write`
-    block, but you cannot close an index with a `read_only` block.
+    Set to `true` to disable data write operations against the index. Unlike
+    `read_only`, this setting does not affect metadata. For instance, you can
+    adjust the settings of an index with a `write` block, but you cannot adjust
+    the settings of an index with a `read_only` block.
 
 `index.blocks.metadata`::
 

+ 1 - 2
server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/cache/clear/ClearIndicesCacheBlocksIT.java

@@ -17,7 +17,6 @@ import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_ME
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_READ;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_WRITE;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_READ_ONLY;
-import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertBlocked;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
 import static org.hamcrest.Matchers.equalTo;
@@ -49,7 +48,7 @@ public class ClearIndicesCacheBlocksIT extends ESIntegTestCase {
             }
         }
         // Request is blocked
-        for (String blockSetting : Arrays.asList(SETTING_READ_ONLY, SETTING_BLOCKS_METADATA, SETTING_READ_ONLY_ALLOW_DELETE)) {
+        for (String blockSetting : Arrays.asList(SETTING_READ_ONLY, SETTING_BLOCKS_METADATA)) {
             try {
                 enableIndexBlock("test", blockSetting);
                 assertBlocked(

+ 10 - 4
server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/delete/DeleteIndexBlocksIT.java

@@ -15,11 +15,21 @@ import org.elasticsearch.cluster.metadata.Metadata;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ESIntegTestCase;
 
+import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertBlocked;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits;
 
 public class DeleteIndexBlocksIT extends ESIntegTestCase {
+
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
+        return Settings.builder()
+            .put(super.nodeSettings(nodeOrdinal, otherSettings))
+            .put(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false) // we control the read-only-allow-delete block
+            .build();
+    }
+
     public void testDeleteIndexWithBlocks() {
         createIndex("test");
         ensureGreen("test");
@@ -44,10 +54,6 @@ public class DeleteIndexBlocksIT extends ESIntegTestCase {
                 client().prepareIndex().setIndex("test").setId("2").setSource("foo", "bar"),
                 IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK
             );
-            assertBlocked(
-                client().admin().indices().prepareUpdateSettings("test").setSettings(Settings.builder().put("index.number_of_replicas", 2)),
-                IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK
-            );
             assertSearchHits(client().prepareSearch().get(), "1");
             assertAcked(client().admin().indices().prepareDelete("test"));
         } finally {

+ 12 - 2
server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/forcemerge/ForceMergeBlocksIT.java

@@ -8,6 +8,7 @@
 
 package org.elasticsearch.action.admin.indices.forcemerge;
 
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ESIntegTestCase;
 import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
 
@@ -18,6 +19,7 @@ import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_RE
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_WRITE;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_READ_ONLY;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE;
+import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertBlocked;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
 import static org.hamcrest.Matchers.equalTo;
@@ -25,6 +27,14 @@ import static org.hamcrest.Matchers.equalTo;
 @ClusterScope(scope = ESIntegTestCase.Scope.TEST)
 public class ForceMergeBlocksIT extends ESIntegTestCase {
 
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
+        return Settings.builder()
+            .put(super.nodeSettings(nodeOrdinal, otherSettings))
+            .put(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false) // we control the read-only-allow-delete block
+            .build();
+    }
+
     public void testForceMergeWithBlocks() {
         createIndex("test");
         ensureGreen("test");
@@ -37,7 +47,7 @@ public class ForceMergeBlocksIT extends ESIntegTestCase {
         }
 
         // Request is not blocked
-        for (String blockSetting : Arrays.asList(SETTING_BLOCKS_READ, SETTING_BLOCKS_WRITE)) {
+        for (String blockSetting : Arrays.asList(SETTING_BLOCKS_READ, SETTING_BLOCKS_WRITE, SETTING_READ_ONLY_ALLOW_DELETE)) {
             try {
                 enableIndexBlock("test", blockSetting);
                 ForceMergeResponse response = client().admin().indices().prepareForceMerge("test").execute().actionGet();
@@ -49,7 +59,7 @@ public class ForceMergeBlocksIT extends ESIntegTestCase {
         }
 
         // Request is blocked
-        for (String blockSetting : Arrays.asList(SETTING_READ_ONLY, SETTING_BLOCKS_METADATA, SETTING_READ_ONLY_ALLOW_DELETE)) {
+        for (String blockSetting : Arrays.asList(SETTING_READ_ONLY, SETTING_BLOCKS_METADATA)) {
             try {
                 enableIndexBlock("test", blockSetting);
                 assertBlocked(client().admin().indices().prepareForceMerge("test"));

+ 48 - 6
server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorIT.java

@@ -8,10 +8,11 @@
 
 package org.elasticsearch.cluster.routing.allocation;
 
-import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.DiskUsageIntegTestCase;
-import org.elasticsearch.cluster.InternalClusterInfoService;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.ShardRoutingState;
+import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
@@ -20,6 +21,7 @@ import org.elasticsearch.test.ESIntegTestCase;
 
 import java.util.Locale;
 
+import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_SETTING;
 import static org.elasticsearch.index.store.Store.INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertBlocked;
@@ -45,10 +47,6 @@ public class DiskThresholdMonitorIT extends DiskUsageIntegTestCase {
         internalCluster().startMasterOnlyNode();
         final String dataNodeName = internalCluster().startDataOnlyNode();
 
-        final InternalClusterInfoService clusterInfoService = (InternalClusterInfoService) internalCluster().getCurrentMasterNodeInstance(
-            ClusterInfoService.class
-        );
-
         final String indexName = randomAlphaOfLength(10).toLowerCase(Locale.ROOT);
         createIndex(
             indexName,
@@ -56,6 +54,7 @@ public class DiskThresholdMonitorIT extends DiskUsageIntegTestCase {
                 .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
                 .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
                 .put(INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING.getKey(), "0ms")
+                .put(INDEX_ROUTING_REQUIRE_GROUP_SETTING.getConcreteSettingForNamespace("_name").getKey(), dataNodeName)
                 .build()
         );
         // ensure we have a system index on the data node too.
@@ -78,5 +77,48 @@ public class DiskThresholdMonitorIT extends DiskUsageIntegTestCase {
                 equalTo("true")
             );
         });
+
+        // Verify that we can adjust things like allocation filters even while blocked
+        assertAcked(
+            client().admin()
+                .indices()
+                .prepareUpdateSettings(indexName)
+                .setSettings(
+                    Settings.builder().putNull(INDEX_ROUTING_REQUIRE_GROUP_SETTING.getConcreteSettingForNamespace("_name").getKey())
+                )
+        );
+
+        // Verify that we can still move shards around even while blocked
+        final String newDataNodeName = internalCluster().startDataOnlyNode();
+        final String newDataNodeId = client().admin().cluster().prepareNodesInfo(newDataNodeName).get().getNodes().get(0).getNode().getId();
+        assertBusy(() -> {
+            final ShardRouting primaryShard = client().admin()
+                .cluster()
+                .prepareState()
+                .clear()
+                .setRoutingTable(true)
+                .setNodes(true)
+                .setIndices(indexName)
+                .get()
+                .getState()
+                .routingTable()
+                .index(indexName)
+                .shard(0)
+                .primaryShard();
+            assertThat(primaryShard.state(), equalTo(ShardRoutingState.STARTED));
+            assertThat(primaryShard.currentNodeId(), equalTo(newDataNodeId));
+        });
+
+        // Verify that the block is removed once the shard migration is complete
+        refreshClusterInfo();
+        assertFalse(client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get().isTimedOut());
+        assertNull(
+            client().admin()
+                .indices()
+                .prepareGetSettings(indexName)
+                .setNames(IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE)
+                .get()
+                .getSetting(indexName, IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE)
+        );
     }
 }

+ 1 - 0
server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/MockDiskUsagesIT.java

@@ -433,6 +433,7 @@ public class MockDiskUsagesIT extends ESIntegTestCase {
                         .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%")
                         .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "90%")
                         .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "100%")
+                        .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "0ms")
                 )
         );
 

+ 1 - 2
server/src/internalClusterTest/java/org/elasticsearch/indices/state/OpenCloseIndexIT.java

@@ -35,7 +35,6 @@ import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_ME
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_READ;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_BLOCKS_WRITE;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_READ_ONLY;
-import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE;
 import static org.elasticsearch.indices.state.CloseIndexIT.assertIndexIsClosed;
 import static org.elasticsearch.indices.state.CloseIndexIT.assertIndexIsOpened;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
@@ -352,7 +351,7 @@ public class OpenCloseIndexIT extends ESIntegTestCase {
         assertIndexIsClosed("test");
 
         // Opening an index is blocked
-        for (String blockSetting : Arrays.asList(SETTING_READ_ONLY, SETTING_READ_ONLY_ALLOW_DELETE, SETTING_BLOCKS_METADATA)) {
+        for (String blockSetting : Arrays.asList(SETTING_READ_ONLY, SETTING_BLOCKS_METADATA)) {
             try {
                 enableIndexBlock("test", blockSetting);
                 assertBlocked(client().admin().indices().prepareOpen("test"));

+ 1 - 1
server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java

@@ -117,7 +117,7 @@ public class IndexMetadata implements Diffable<IndexMetadata>, ToXContentFragmen
         false,
         true,
         RestStatus.TOO_MANY_REQUESTS,
-        EnumSet.of(ClusterBlockLevel.METADATA_WRITE, ClusterBlockLevel.WRITE)
+        EnumSet.of(ClusterBlockLevel.WRITE)
     );
 
     public enum State {

+ 18 - 0
server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java

@@ -36,6 +36,7 @@ import org.elasticsearch.common.util.set.Sets;
 import org.elasticsearch.index.Index;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -91,6 +92,14 @@ public class DiskThresholdMonitor {
      */
     private final Set<String> nodesOverHighThresholdAndRelocating = Sets.newConcurrentHashSet();
 
+    /**
+     * The IDs of the nodes in the last info received. Tracked because when a new node joins we consider its disk usage to be equal to
+     * the average disk usage in the cluster but we don't keep track of whether this puts it over any of the watermarks so when we receive
+     * its actual disk usage we may be able to move some more shards around. No need for synchronization, all access is protected by
+     * {@code checkInProgress}.
+     */
+    private Set<String> lastNodes = Collections.emptySet();
+
     public DiskThresholdMonitor(
         Settings settings,
         Supplier<ClusterState> clusterStateSupplier,
@@ -123,6 +132,7 @@ public class DiskThresholdMonitor {
         final ImmutableOpenMap<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
         if (usages == null) {
             logger.trace("skipping monitor as no disk usage information is available");
+            lastNodes = Collections.emptySet();
             checkFinished();
             return;
         }
@@ -140,6 +150,14 @@ public class DiskThresholdMonitor {
         cleanUpRemovedNodes(nodes, nodesOverHighThreshold);
         cleanUpRemovedNodes(nodes, nodesOverHighThresholdAndRelocating);
 
+        if (lastNodes.equals(nodes) == false) {
+            if (lastNodes.containsAll(nodes) == false) {
+                logger.debug("rerouting because disk usage info received from new nodes");
+                reroute = true;
+            }
+            lastNodes = Collections.unmodifiableSet(nodes);
+        }
+
         final ClusterState state = clusterStateSupplier.get();
         final Set<String> indicesToMarkReadOnly = new HashSet<>();
         RoutingNodes routingNodes = state.getRoutingNodes();

+ 34 - 6
server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java

@@ -44,6 +44,7 @@ import org.elasticsearch.test.junit.annotations.TestLogging;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
@@ -130,10 +131,16 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
         builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 30));
         builder.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 100)));
-        monitor.onNewInfo(clusterInfo(builder.build()));
-        assertFalse(reroute.get());
+        final ClusterInfo initialClusterInfo = clusterInfo(builder.build());
+        monitor.onNewInfo(initialClusterInfo);
+        assertTrue(reroute.get()); // reroute on new nodes
         assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indices.get());
 
+        indices.set(null);
+        reroute.set(false);
+        monitor.onNewInfo(initialClusterInfo);
+        assertFalse(reroute.get()); // no reroute if no change
+
         indices.set(null);
         builder = ImmutableOpenMap.builder();
         builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
@@ -230,13 +237,31 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         oneDiskAboveWatermarkBuilder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 50));
         final ImmutableOpenMap<String, DiskUsage> oneDiskAboveWatermark = oneDiskAboveWatermarkBuilder.build();
 
-        // should not reroute when all disks are ok
+        // should reroute when receiving info about previously-unknown nodes
         currentTime.addAndGet(randomLongBetween(0, 120000));
         monitor.onNewInfo(clusterInfo(allDisksOk));
-        assertNull(listenerReference.get());
+        assertNotNull(listenerReference.get());
+        listenerReference.getAndSet(null).onResponse(clusterState);
 
-        // should reroute when one disk goes over the watermark
+        // should not reroute when all disks are ok and no new info received
         currentTime.addAndGet(randomLongBetween(0, 120000));
+        monitor.onNewInfo(clusterInfo(allDisksOk));
+        assertNull(listenerReference.get());
+
+        // might or might not reroute when a node crosses a watermark, depending on whether the reroute interval has elapsed or not
+        if (randomBoolean()) {
+            currentTime.addAndGet(randomLongBetween(0, 120000));
+            monitor.onNewInfo(clusterInfo(oneDiskAboveWatermark));
+            Optional.ofNullable(listenerReference.getAndSet(null)).ifPresent(l -> l.onResponse(clusterState));
+        }
+
+        // however once the reroute interval has elapsed then we must reroute again
+        currentTime.addAndGet(
+            randomLongBetween(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(Settings.EMPTY).millis(),
+                120000
+            )
+        );
         monitor.onNewInfo(clusterInfo(oneDiskAboveWatermark));
         assertNotNull(listenerReference.get());
         listenerReference.getAndSet(null).onResponse(clusterState);
@@ -672,7 +697,7 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
             .nodes(DiscoveryNodes.builder().add(newNormalNode("node1")).add(newFrozenOnlyNode("frozen")))
             .build();
         final AtomicReference<ClusterState> clusterStateRef = new AtomicReference<>(clusterState);
-        final AtomicBoolean advanceTime = new AtomicBoolean(randomBoolean());
+        final AtomicBoolean advanceTime = new AtomicBoolean(true);
 
         final LongSupplier timeSupplier = new LongSupplier() {
             long time;
@@ -764,6 +789,9 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         );
         final ImmutableOpenMap<String, DiskUsage> frozenAboveFloodStageMaxHeadroom = frozenAboveFloodStageMaxHeadroomBuilder.build();
 
+        advanceTime.set(true); // first check sees new nodes and triggers a reroute
+        assertNoLogging(monitor, allDisksOk);
+        advanceTime.set(randomBoolean()); // no new nodes so no reroute delay needed
         assertNoLogging(monitor, allDisksOk);
 
         assertSingleInfoMessage(