Browse Source

Enable testSnapshotBasedRecovery (#97654)

Cancel shard allocation command is broken for initial desired balance versions
and might allocate shard on the node where it is not supposed to be. This
is fixed by https://github.com/elastic/elasticsearch/pull/93635. Disabling test
when upgrading from affected versions.
Ievgen Degtiarenko 2 năm trước cách đây
mục cha
commit
32fb774976

+ 1 - 4
qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/DesiredNodesUpgradeIT.java

@@ -38,10 +38,7 @@ public class DesiredNodesUpgradeIT extends AbstractRollingTestCase {
     }
 
     public void testUpgradeDesiredNodes() throws Exception {
-        // Desired nodes was introduced in 8.1
-        if (UPGRADE_FROM_VERSION.before(Version.V_8_1_0)) {
-            return;
-        }
+        assumeTrue("Desired nodes was introduced in 8.1", UPGRADE_FROM_VERSION.onOrAfter(Version.V_8_1_0));
 
         if (UPGRADE_FROM_VERSION.onOrAfter(Processors.DOUBLE_PROCESSORS_SUPPORT_VERSION)) {
             assertUpgradedNodesCanReadDesiredNodes();

+ 9 - 21
qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SnapshotBasedRecoveryIT.java

@@ -42,8 +42,14 @@ import static org.hamcrest.Matchers.notNullValue;
 
 public class SnapshotBasedRecoveryIT extends AbstractRollingTestCase {
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/93271")
     public void testSnapshotBasedRecovery() throws Exception {
+
+        assumeFalse(
+            "Cancel shard allocation command is broken for initial desired balance versions and might allocate shard "
+                + "on the node where it is not supposed to be. Fixed by https://github.com/elastic/elasticsearch/pull/93635",
+            UPGRADE_FROM_VERSION == Version.V_8_6_0 || UPGRADE_FROM_VERSION == Version.V_8_6_1 || UPGRADE_FROM_VERSION == Version.V_8_7_0
+        );
+
         final String indexName = "snapshot_based_recovery";
         final String repositoryName = "snapshot_based_recovery_repo";
         final int numDocs = 200;
@@ -122,12 +128,8 @@ public class SnapshotBasedRecoveryIT extends AbstractRollingTestCase {
                     Settings.builder().put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 1)
                 );
                 logger.info("--> finished adding replicas from [{}]", indexName);
-                try {
-                    ensureGreen(indexName);
-                } catch (AssertionError e) {
-                    logAllocationExplain();
-                    throw e;
-                }
+                ensureGreen(indexName);
+
                 assertMatchAllReturnsAllDocuments(indexName, numDocs);
                 assertMatchQueryReturnsAllDocuments(indexName, numDocs);
             }
@@ -135,20 +137,6 @@ public class SnapshotBasedRecoveryIT extends AbstractRollingTestCase {
         }
     }
 
-    private void logAllocationExplain() throws Exception {
-        // Used to debug #91383
-        var request = new Request(HttpGet.METHOD_NAME, "_cluster/allocation/explain?include_disk_info=true&include_yes_decisions=true");
-        request.setJsonEntity("""
-                    {
-                      "index": "snapshot_based_recovery",
-                      "shard": 0,
-                      "primary": false
-                    }
-            """);
-        var response = client().performRequest(request);
-        logger.info("--> allocation explain {}", EntityUtils.toString(response.getEntity()));
-    }
-
     private List<String> getUpgradedNodeIds() throws IOException {
         Request request = new Request(HttpGet.METHOD_NAME, "_nodes/_all");
         Response response = client().performRequest(request);

+ 1 - 3
server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java

@@ -12,7 +12,6 @@ import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.elasticsearch.cluster.ClusterInfoSimulator;
-import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.RoutingNodes;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.UnassignedInfo;
@@ -39,7 +38,6 @@ import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.function.Predicate;
 
-import static java.util.stream.Collectors.toSet;
 import static java.util.stream.Collectors.toUnmodifiableSet;
 
 /**
@@ -81,9 +79,9 @@ public class DesiredBalanceComputer {
 
         final var routingAllocation = desiredBalanceInput.routingAllocation().mutableCloneForSimulation();
         final var routingNodes = routingAllocation.routingNodes();
+        final var knownNodeIds = routingNodes.getAllNodeIds();
         final var changes = routingAllocation.changes();
         final var ignoredShards = getIgnoredShardsWithDiscardedAllocationStatus(desiredBalanceInput.ignoredShards());
-        final var knownNodeIds = routingAllocation.nodes().stream().map(DiscoveryNode::getId).collect(toSet());
         final var clusterInfoSimulator = new ClusterInfoSimulator(routingAllocation.clusterInfo());
 
         if (routingNodes.size() == 0) {

+ 58 - 0
server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationCommandsTests.java

@@ -22,11 +22,15 @@ import org.elasticsearch.cluster.metadata.Metadata;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.cluster.routing.AllocationId;
+import org.elasticsearch.cluster.routing.IndexRoutingTable;
 import org.elasticsearch.cluster.routing.RecoverySource;
 import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.RoutingTable;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.ShardRoutingState;
+import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator;
+import org.elasticsearch.cluster.routing.allocation.allocator.ShardAssignment;
 import org.elasticsearch.cluster.routing.allocation.command.AbstractAllocateAllocationCommand;
 import org.elasticsearch.cluster.routing.allocation.command.AllocateEmptyPrimaryAllocationCommand;
 import org.elasticsearch.cluster.routing.allocation.command.AllocateReplicaAllocationCommand;
@@ -36,6 +40,7 @@ import org.elasticsearch.cluster.routing.allocation.command.CancelAllocationComm
 import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand;
 import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
 import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
+import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
@@ -44,6 +49,7 @@ import org.elasticsearch.common.network.NetworkModule;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNotFoundException;
+import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.shard.ShardNotFoundException;
 import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
@@ -63,6 +69,7 @@ import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
+import static org.elasticsearch.cluster.routing.TestShardRouting.newShardRouting;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
@@ -689,6 +696,57 @@ public class AllocationCommandsTests extends ESAllocationTestCase {
         }
     }
 
+    public void testCanceledShardIsInitializedRespectingAllocationDeciders() {
+
+        var allocationId1 = AllocationId.newInitializing(UUIDs.randomBase64UUID());
+        var allocationId2 = AllocationId.newInitializing(UUIDs.randomBase64UUID());
+
+        var indexMetadata = IndexMetadata.builder("test")
+            .settings(indexSettings(IndexVersion.current(), 1, 1).put("index.routing.allocation.exclude._id", "node-0"))
+            .putInSyncAllocationIds(0, Set.of(allocationId1.getId(), allocationId2.getId()))
+            .build();
+        var shardId = new ShardId(indexMetadata.getIndex(), 0);
+
+        ShardRouting primary = newShardRouting(shardId, "node-0", null, true, STARTED, allocationId1);
+        ShardRouting replica = newShardRouting(shardId, "node-1", null, false, STARTED, allocationId2);
+
+        ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT)
+            .nodes(
+                DiscoveryNodes.builder()
+                    .add(newNode("node-0", Version.V_8_10_0))
+                    .add(newNode("node-1", Version.V_8_9_0))
+                    .add(newNode("node-2", Version.V_8_9_0))
+            )
+            .metadata(Metadata.builder().put(indexMetadata, false))
+            .routingTable(RoutingTable.builder().add(IndexRoutingTable.builder(shardId.getIndex()).addShard(primary).addShard(replica)))
+            .build();
+
+        var allocation = createAllocationService();
+        clusterState = startInitializingShardsAndReroute(allocation, clusterState);
+        if (allocation.shardsAllocator instanceof DesiredBalanceShardsAllocator dbsa) {
+            // ShardAssignment still contains `node-0` even though `can_remain_decision=no` for it
+            assertThat(dbsa.getDesiredBalance().getAssignment(shardId), equalTo(new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0)));
+        }
+
+        clusterState = allocation.reroute(
+            clusterState,
+            new AllocationCommands(new CancelAllocationCommand(shardId.getIndexName(), 0, "node-0", true)),
+            false,
+            false,
+            false,
+            ActionListener.noop()
+        ).clusterState();
+        clusterState = startInitializingShardsAndReroute(allocation, clusterState);
+
+        assertThat(clusterState.getRoutingNodes().node("node-0").size(), equalTo(0));
+        assertThat(clusterState.getRoutingNodes().node("node-1").numberOfShardsWithState(STARTED), equalTo(1));
+        assertThat(clusterState.getRoutingNodes().node("node-2").numberOfShardsWithState(STARTED), equalTo(1));
+
+        if (allocation.shardsAllocator instanceof DesiredBalanceShardsAllocator dbsa) {
+            assertThat(dbsa.getDesiredBalance().getAssignment(shardId), equalTo(new ShardAssignment(Set.of("node-1", "node-2"), 2, 0, 0)));
+        }
+    }
+
     public void testSerialization() throws Exception {
         AllocationCommands commands = new AllocationCommands(
             new AllocateEmptyPrimaryAllocationCommand("test", 1, "node1", true),