
Disk decider respect watermarks for single data node (#55805)

The disk decider had special handling for the single data node case,
allowing any allocation (skipping watermark checks) for such clusters.
This special handling can now be avoided via a setting.
Henning Andersen · 5 years ago · commit 0bd28aed4e
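
In short, the commit turns the long-standing single-data-node bypass in the disk threshold decider into behavior gated by a new node setting. A condensed before/after sketch of that gate, abbreviated from the DiskThresholdDecider hunk below (the exact decision message wording is simplified here for illustration):

    // Before this commit (condensed): any cluster with at most one data node
    // bypassed the disk watermark checks unconditionally.
    if (allocation.nodes().getDataNodes().size() <= 1) {
        return allocation.decision(Decision.YES, NAME, "only a single data node is present");
    }

    // After this commit (condensed): the bypass only applies while the new
    // enableForSingleDataNode flag is left at its default of false; when it is
    // set to true, execution falls through to the normal low/high watermark
    // checks, which may return Decision.NO even on a one-node cluster.
    if (enableForSingleDataNode == false && allocation.nodes().getDataNodes().size() <= 1) {
        return allocation.decision(Decision.YES, NAME, "only a single data node is present");
    }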

+ 6 - 0
docs/reference/modules/cluster/disk_allocator.asciidoc

@@ -31,6 +31,12 @@ file or updated dynamically on a live cluster with the
     than the specified amount of free space. This setting affects the
     allocation of all shards, whether previously allocated or not.
 
+`cluster.routing.allocation.disk.watermark.enable_for_single_data_node`::
+    For a single data node, the default is to disregard disk watermarks when
+    making an allocation decision. This is deprecated behavior and will be
+    changed in 8.0. This setting can be set to `true` to enable the
+    disk watermarks for a single data node cluster (will become default in 8.0).
+
 `cluster.routing.allocation.disk.watermark.flood_stage`::
 +
 --
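
Because the new setting is registered with only `Setting.Property.NodeScope` (see the decider hunk below), it is a static node setting rather than a dynamically updatable cluster setting. A minimal hedged sketch of enabling it programmatically and reading it back the way the decider does; the class and variable names here are purely illustrative:

    import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
    import org.elasticsearch.common.settings.Settings;

    public class SingleDataNodeWatermarkExample {
        public static void main(String[] args) {
            // NodeScope (not Dynamic): the flag is supplied as a node setting,
            // e.g. in elasticsearch.yml, not via the cluster settings update API.
            Settings nodeSettings = Settings.builder()
                .put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), true)
                .build();
            // The decider reads the flag once, at construction time.
            boolean enforced = DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.get(nodeSettings);
            System.out.println("watermarks enforced on single data node: " + enforced); // true
        }
    }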

+ 10 - 2
server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java

@@ -7,7 +7,7 @@
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *    http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -22,6 +22,7 @@ package org.elasticsearch.cluster.routing.allocation.decider;
 import com.carrotsearch.hppc.cursors.ObjectCursor;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.elasticsearch.Version;
 import org.elasticsearch.cluster.ClusterInfo;
 import org.elasticsearch.cluster.DiskUsage;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
@@ -37,6 +38,7 @@ import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.collect.ImmutableOpenMap;
 import org.elasticsearch.common.settings.ClusterSettings;
+import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.index.Index;
@@ -76,10 +78,16 @@ public class DiskThresholdDecider extends AllocationDecider {
 
     public static final String NAME = "disk_threshold";
 
+    public static final Setting<Boolean> ENABLE_FOR_SINGLE_DATA_NODE =
+        Setting.boolSetting("cluster.routing.allocation.disk.watermark.enable_for_single_data_node", false, Setting.Property.NodeScope);
+
     private final DiskThresholdSettings diskThresholdSettings;
+    private final boolean enableForSingleDataNode;
 
     public DiskThresholdDecider(Settings settings, ClusterSettings clusterSettings) {
         this.diskThresholdSettings = new DiskThresholdSettings(settings, clusterSettings);
+        assert Version.CURRENT.major < 9 : "remove enable_for_single_data_node in 9";
+        this.enableForSingleDataNode = ENABLE_FOR_SINGLE_DATA_NODE.get(settings);
     }
 
     /**
@@ -415,7 +423,7 @@ public class DiskThresholdDecider extends AllocationDecider {
         }
 
         // Allow allocation regardless if only a single data node is available
-        if (allocation.nodes().getDataNodes().size() <= 1) {
+        if (enableForSingleDataNode == false && allocation.nodes().getDataNodes().size() <= 1) {
             if (logger.isTraceEnabled()) {
                 logger.trace("only a single data node is present, allowing allocation");
             }

+ 3 - 1
server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

@@ -7,7 +7,7 @@
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *    http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -49,6 +49,7 @@ import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllo
 import org.elasticsearch.cluster.routing.allocation.decider.AwarenessAllocationDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.ConcurrentRebalanceAllocationDecider;
+import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.FilterAllocationDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.SameShardAllocationDecider;
@@ -217,6 +218,7 @@ public final class ClusterSettings extends AbstractScopedSettings {
             ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING,
             ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING,
             ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING,
+            DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING,

+ 82 - 1
server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java

@@ -7,7 +7,7 @@
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- *    http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -50,6 +50,7 @@ import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationComman
 import org.elasticsearch.common.collect.ImmutableOpenMap;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.test.gateway.TestGatewayAllocator;
 
 import java.util.Arrays;
@@ -894,6 +895,7 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
     }
 
     public void testForSingleDataNode() {
+        // remove test in 9.0
         Settings diskSettings = Settings.builder()
                 .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
                 .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%")
@@ -1020,6 +1022,85 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(result.routingTable().index("test").getShards().get(1).primaryShard().relocatingNodeId(), equalTo("node3"));
     }
 
+    public void testWatermarksEnabledForSingleDataNode() {
+        Settings diskSettings = Settings.builder()
+            .put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), true)
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%").build();
+
+        ImmutableOpenMap.Builder<String, DiskUsage> usagesBuilder = ImmutableOpenMap.builder();
+        usagesBuilder.put("data", new DiskUsage("data", "data", "/dev/null", 100, 20));  // 80% used
+        ImmutableOpenMap<String, DiskUsage> usages = usagesBuilder.build();
+
+        // We have an index with 1 primary shard, taking 40 bytes. The single data node has only 20 bytes free.
+        ImmutableOpenMap.Builder<String, Long> shardSizes = ImmutableOpenMap.builder();
+        shardSizes.put("[test][0][p]", 40L);
+        final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes.build());
+
+        DiskThresholdDecider diskThresholdDecider = makeDecider(diskSettings);
+        Metadata metadata = Metadata.builder()
+            .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0))
+            .build();
+        RoutingTable initialRoutingTable = RoutingTable.builder()
+            .addAsNew(metadata.index("test"))
+            .build();
+
+        DiscoveryNode masterNode = new DiscoveryNode("master", "master", buildNewFakeTransportAddress(), emptyMap(),
+            singleton(DiscoveryNodeRole.MASTER_ROLE), Version.CURRENT);
+        DiscoveryNode dataNode = new DiscoveryNode("data", "data", buildNewFakeTransportAddress(), emptyMap(),
+            singleton(DiscoveryNodeRole.DATA_ROLE), Version.CURRENT);
+        DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder().add(dataNode);
+        if (randomBoolean()) {
+            discoveryNodesBuilder.add(masterNode);
+        }
+        DiscoveryNodes discoveryNodes = discoveryNodesBuilder.build();
+
+        ClusterState clusterState = ClusterState.builder(new ClusterName("test"))
+            .nodes(discoveryNodes)
+            .metadata(metadata)
+            .routingTable(initialRoutingTable).build();
+
+        // validate that the shard cannot be allocated
+        ClusterInfoService cis = () -> clusterInfo;
+        AllocationDeciders deciders = new AllocationDeciders(new HashSet<>(Arrays.asList(
+            new SameShardAllocationDecider(
+                Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)
+            ),
+            diskThresholdDecider
+        )));
+        AllocationService strategy = new AllocationService(deciders,
+            new TestGatewayAllocator(), new BalancedShardsAllocator(Settings.EMPTY), cis);
+        ClusterState result = strategy.reroute(clusterState, "reroute");
+
+        ShardRouting shardRouting = result.routingTable().index("test").getShards().get(0).primaryShard();
+        assertThat(shardRouting.state(), equalTo(UNASSIGNED));
+        assertThat(shardRouting.currentNodeId(), nullValue());
+        assertThat(shardRouting.relocatingNodeId(), nullValue());
+
+        // force assign shard and validate that it cannot remain.
+        ShardId shardId = shardRouting.shardId();
+        ShardRouting startedShard = shardRouting.initialize("data", null, 40L).moveToStarted();
+        RoutingTable forceAssignedRoutingTable = RoutingTable.builder().add(
+            IndexRoutingTable.builder(shardId.getIndex())
+                .addIndexShard(new IndexShardRoutingTable.Builder(shardId)
+                    .addShard(startedShard)
+                    .build()
+                )
+        ).build();
+        clusterState = ClusterState.builder(clusterState).routingTable(forceAssignedRoutingTable).build();
+
+        RoutingAllocation routingAllocation = new RoutingAllocation(null, new RoutingNodes(clusterState), clusterState, clusterInfo,
+            System.nanoTime());
+        routingAllocation.debugDecision(true);
+        Decision decision = diskThresholdDecider.canRemain(startedShard, clusterState.getRoutingNodes().node("data"), routingAllocation);
+        assertThat(decision.type(), equalTo(Decision.Type.NO));
+        assertThat(decision.getExplanation(), containsString(
+            "the shard cannot remain on this node because it is above the high watermark cluster setting" +
+                " [cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30.0%] free disk on node," +
+                " actual free: [20.0%]"));
+    }
+
     public void logShardStates(ClusterState state) {
         RoutingNodes rn = state.getRoutingNodes();
         logger.info("--> counts: total: {}, unassigned: {}, initializing: {}, relocating: {}, started: {}",