
[ML] autoscaling context current capacity could be null; this commit handles that (#74822)

The autoscaling context's current capacity may be null. This should only happen early in a cluster's life cycle, or when a node has just been brought online, because the current node sizes have not yet been discovered and cached.

This change should really have been part of #74691
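For illustration, here is a minimal standalone sketch of the defensive pattern the fix applies (simplified types and names, not the actual Elasticsearch API): when the current capacity is still unknown, propagate null rather than dereferencing it, and let the caller treat null as "no decision yet".

```java
// Illustrative sketch only -- a simplified stand-in for the real
// MlAutoscalingDeciderService.ensureScaleDown, which takes AutoscalingCapacity.
public class EnsureScaleDownSketch {

    // Returns null when either capacity is unknown, instead of throwing an NPE.
    static Long ensureScaleDown(Long scaleDownBytes, Long currentBytes) {
        if (scaleDownBytes == null || currentBytes == null) {
            return null; // node sizes not discovered and cached yet
        }
        // Clamp the scale-down target so it can never exceed current capacity.
        return Math.min(scaleDownBytes, currentBytes);
    }

    public static void main(String[] args) {
        System.out.println(ensureScaleDown(512L, null));  // null -> no decision
        System.out.println(ensureScaleDown(512L, 1024L)); // 512
    }
}
```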
Benjamin Trent, 4 years ago
parent commit c8c420094f

+ 4 - 3
x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AutoscalingIT.java

@@ -46,7 +46,7 @@ public class AutoscalingIT extends MlNativeAutodetectIntegTestCase {
 
     // This test assumes that xpack.ml.max_machine_memory_percent is 30
     // and that xpack.ml.use_auto_machine_memory_percent is false
-    public void testMLAutoscalingCapacity() {
+    public void testMLAutoscalingCapacity() throws Exception {
         SortedMap<String, Settings> deciders = new TreeMap<>();
         deciders.put(MlAutoscalingDeciderService.NAME,
             Settings.builder().put(MlAutoscalingDeciderService.DOWN_SCALE_DELAY.getKey(), TimeValue.ZERO).build());
@@ -57,14 +57,15 @@ public class AutoscalingIT extends MlNativeAutodetectIntegTestCase {
         );
         assertAcked(client().execute(PutAutoscalingPolicyAction.INSTANCE, request).actionGet());
 
-        assertMlCapacity(
+        assertBusy(() -> assertMlCapacity(
             client().execute(
                 GetAutoscalingCapacityAction.INSTANCE,
                 new GetAutoscalingCapacityAction.Request()
             ).actionGet(),
             "Requesting scale down as tier and/or node size could be smaller",
             0L,
-            0L);
+            0L)
+        );
 
         putJob("job1", 100);
         putJob("job2", 200);

+ 10 - 3
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java

@@ -534,9 +534,13 @@ public class MlAutoscalingDeciderService implements AutoscalingDeciderService,
             // Due to weird rounding errors, it may be that a scale down result COULD cause a scale up
             // Ensuring the scaleDown here forces the scale down result to always be lower than the current capacity.
             // This is safe as we know that ALL jobs are assigned at the current capacity
-            .map(result -> new AutoscalingDeciderResult(
-                ensureScaleDown(result.requiredCapacity(), context.currentCapacity()), result.reason()
-            ));
+            .map(result -> {
+                AutoscalingCapacity capacity = ensureScaleDown(result.requiredCapacity(), context.currentCapacity());
+                if (capacity == null) {
+                    return null;
+                }
+                return new AutoscalingDeciderResult(capacity, result.reason());
+            });
 
         if (maybeScaleDown.isPresent()) {
             final AutoscalingDeciderResult scaleDownDecisionResult = maybeScaleDown.get();
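Note that the new lambda leans on standard `java.util.Optional` behavior: when the mapper passed to `Optional.map` returns null, the result is `Optional.empty()`, so `maybeScaleDown.isPresent()` above is simply false when the capacity is unknown. A quick standalone demonstration:

```java
import java.util.Optional;

public class OptionalNullMapDemo {
    public static void main(String[] args) {
        // Optional.map wraps the mapper's result with Optional.ofNullable,
        // so a null result collapses to Optional.empty() rather than an
        // Optional containing null.
        Optional<String> maybe = Optional.of("scale-down")
            .map(r -> (String) null); // stands in for a null ensureScaleDown result
        System.out.println(maybe.isPresent()); // prints: false
    }
}
```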
@@ -599,6 +603,9 @@ public class MlAutoscalingDeciderService implements AutoscalingDeciderService,
     }
 
     static AutoscalingCapacity ensureScaleDown(AutoscalingCapacity scaleDownResult, AutoscalingCapacity currentCapacity) {
+        if (scaleDownResult == null || currentCapacity == null) {
+            return null;
+        }
         AutoscalingCapacity newCapacity = new AutoscalingCapacity(
             new AutoscalingCapacity.AutoscalingResources(
                 currentCapacity.total().storage(),