Browse Source

[ML] Fix NPE in Get Deployment Stats (#115404) (#115474)

If a node has been removed from the cluster and the trained model 
assignment has not been updated the GET stats action can have an
inconsistent view where it thinks a model is deployed on the removed
node. The bug only affected nodes with failed deployments.
David Kyle 1 year ago
parent
commit
cd2a8845cd

+ 5 - 0
docs/changelog/115404.yaml

@@ -0,0 +1,5 @@
+pr: 115404
+summary: Fix NPE in Get Deployment Stats
+area: Machine Learning
+type: bug
+issues: []

+ 1 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsAction.java

@@ -220,7 +220,7 @@ public class TransportGetDeploymentStatsAction extends TransportTasksAction<
 
 
                 // add nodes from the failures that were not in the task responses
                 // add nodes from the failures that were not in the task responses
                 for (var nodeRoutingState : nodeToRoutingStates.entrySet()) {
                 for (var nodeRoutingState : nodeToRoutingStates.entrySet()) {
-                    if (visitedNodes.contains(nodeRoutingState.getKey()) == false) {
+                    if ((visitedNodes.contains(nodeRoutingState.getKey()) == false) && nodes.nodeExists(nodeRoutingState.getKey())) {
                         updatedNodeStats.add(
                         updatedNodeStats.add(
                             AssignmentStats.NodeStats.forNotStartedState(
                             AssignmentStats.NodeStats.forNotStartedState(
                                 nodes.get(nodeRoutingState.getKey()),
                                 nodes.get(nodeRoutingState.getKey()),

+ 39 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportGetDeploymentStatsActionTests.java

@@ -148,6 +148,45 @@ public class TransportGetDeploymentStatsActionTests extends ESTestCase {
         assertEquals(RoutingState.FAILED, results.get(0).getNodeStats().get(1).getRoutingState().getState());
         assertEquals(RoutingState.FAILED, results.get(0).getNodeStats().get(1).getRoutingState().getState());
     }
     }
 
 
+    public void testAddFailedRoutes_MissingNode() throws UnknownHostException {
+        DiscoveryNodes nodes = buildNodes("node1", "node2");
+        var missingNode = DiscoveryNodeUtils.create(
+            "node3",
+            new TransportAddress(InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 }), 9203)
+        );
+
+        List<AssignmentStats.NodeStats> nodeStatsList = new ArrayList<>();
+        nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node1")));
+        nodeStatsList.add(AssignmentStatsTests.randomNodeStats(nodes.get("node2")));
+
+        var model1 = new AssignmentStats(
+            "model1",
+            "deployment1",
+            randomBoolean() ? null : randomIntBetween(1, 8),
+            randomBoolean() ? null : randomIntBetween(1, 8),
+            null,
+            randomBoolean() ? null : randomIntBetween(1, 10000),
+            randomBoolean() ? null : ByteSizeValue.ofBytes(randomLongBetween(1, 1000000)),
+            Instant.now(),
+            nodeStatsList,
+            randomFrom(Priority.values())
+        );
+        var response = new GetDeploymentStatsAction.Response(Collections.emptyList(), Collections.emptyList(), List.of(model1), 1);
+
+        // failed state for node 3 conflicts
+        Map<TrainedModelAssignment, Map<String, RoutingInfo>> badRoutes = new HashMap<>();
+        Map<String, RoutingInfo> nodeRoutes = new HashMap<>();
+        nodeRoutes.put("node3", new RoutingInfo(1, 1, RoutingState.FAILED, "failed on node3"));
+        badRoutes.put(createAssignment("model1"), nodeRoutes);
+
+        var modified = TransportGetDeploymentStatsAction.addFailedRoutes(response, badRoutes, nodes);
+        List<AssignmentStats> results = modified.getStats().results();
+        assertThat(results, hasSize(1));
+        assertThat(results.get(0).getNodeStats(), hasSize(2)); // 3
+        assertEquals("node1", results.get(0).getNodeStats().get(0).getNode().getId());
+        assertEquals("node2", results.get(0).getNodeStats().get(1).getNode().getId());
+    }
+
     private DiscoveryNodes buildNodes(String... nodeIds) throws UnknownHostException {
     private DiscoveryNodes buildNodes(String... nodeIds) throws UnknownHostException {
         InetAddress inetAddress = InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 });
         InetAddress inetAddress = InetAddress.getByAddress(new byte[] { (byte) 192, (byte) 168, (byte) 0, (byte) 1 });
         DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
         DiscoveryNodes.Builder builder = DiscoveryNodes.builder();