Browse code

[ML] Consider xpack.ml.max_ml_node_size in effective_model_memory_limit (#70473)

Changes the calculation of effective_model_memory_limit in the
_ml/info response to take account of xpack.ml.max_ml_node_size
if it is set and xpack.ml.max_lazy_ml_nodes would allow more
ML nodes to be added to the cluster. The assumption is that
if necessary the size of the newly added nodes would be
xpack.ml.max_ml_node_size, so it's reasonable for newly created
jobs to have a model_memory_limit that would fit on a node of
that size.

Fixes #70069
David Roberts 4 years ago
Parent
Current commit
25cb095f76

+ 21 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportMlInfoAction.java

@@ -41,6 +41,11 @@ import java.util.Map;
 import java.util.OptionalLong;
 import java.util.concurrent.TimeoutException;
 
+import static org.elasticsearch.xpack.ml.MachineLearning.MAX_LAZY_ML_NODES;
+import static org.elasticsearch.xpack.ml.MachineLearning.MAX_MACHINE_MEMORY_PERCENT;
+import static org.elasticsearch.xpack.ml.MachineLearning.MAX_ML_NODE_SIZE;
+import static org.elasticsearch.xpack.ml.MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT;
+
 public class TransportMlInfoAction extends HandledTransportAction<MlInfoAction.Request, MlInfoAction.Response> {
 
     private static final Logger logger = LogManager.getLogger(TransportMlInfoAction.class);
@@ -137,6 +142,7 @@ public class TransportMlInfoAction extends HandledTransportAction<MlInfoAction.R
     static ByteSizeValue calculateEffectiveMaxModelMemoryLimit(ClusterSettings clusterSettings, DiscoveryNodes nodes) {
 
         long maxMlMemory = -1;
+        int numMlNodes = 0;
 
         for (DiscoveryNode node : nodes) {
             OptionalLong limit = NativeMemoryCalculator.allowedBytesForMl(node, clusterSettings);
@@ -144,11 +150,24 @@ public class TransportMlInfoAction extends HandledTransportAction<MlInfoAction.R
                 continue;
             }
             maxMlMemory = Math.max(maxMlMemory, limit.getAsLong());
+            ++numMlNodes;
+        }
+
+        // It is possible that there is scope for more ML nodes to be added
+        // to the cluster, in which case take those into account too
+        long maxMlNodeSize = clusterSettings.get(MAX_ML_NODE_SIZE).getBytes();
+        int maxLazyNodes = clusterSettings.get(MAX_LAZY_ML_NODES);
+        if (maxMlNodeSize > 0 && numMlNodes < maxLazyNodes) {
+            maxMlMemory = Math.max(maxMlMemory, NativeMemoryCalculator.allowedBytesForMl(
+                maxMlNodeSize,
+                clusterSettings.get(MAX_MACHINE_MEMORY_PERCENT),
+                clusterSettings.get(USE_AUTO_MACHINE_MEMORY_PERCENT)));
         }
 
         if (maxMlMemory <= 0) {
-            // This implies there are currently no ML nodes in the cluster, so we
-            // have no idea what the effective limit would be if one were added
+            // This implies there are currently no ML nodes in the cluster, and
+            // no automatic mechanism for adding one, so we have no idea what
+            // the effective limit would be if one were added
             return null;
         }
 

+ 130 - 25
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportMlInfoActionTests.java

@@ -23,8 +23,11 @@ import org.elasticsearch.xpack.ml.MachineLearning;
 import java.net.InetAddress;
 import java.util.Collections;
 
+import static org.elasticsearch.xpack.ml.MachineLearning.MAX_LAZY_ML_NODES;
 import static org.elasticsearch.xpack.ml.MachineLearning.MAX_MACHINE_MEMORY_PERCENT;
+import static org.elasticsearch.xpack.ml.MachineLearning.MAX_ML_NODE_SIZE;
 import static org.elasticsearch.xpack.ml.MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT;
+import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.hamcrest.Matchers.notNullValue;
@@ -32,45 +35,31 @@ import static org.hamcrest.Matchers.nullValue;
 
 public class TransportMlInfoActionTests extends ESTestCase {
 
-    public void testCalculateEffectiveMaxModelMemoryLimit() {
+    public void testCalculateEffectiveMaxModelMemoryLimitWithoutMaxMlNodeSize() {
 
         int mlMemoryPercent = randomIntBetween(5, 90);
+        long mlMachineMemory = randomLongBetween(2000000000L, 100000000000L);
+        int numMlNodes = randomIntBetween(0, 10);
+        int numNonMlNodes = randomIntBetween(0, 10);
         ClusterSettings clusterSettings = new ClusterSettings(
             Settings.builder().put(MAX_MACHINE_MEMORY_PERCENT.getKey(), mlMemoryPercent).build(),
-            Sets.newHashSet(MAX_MACHINE_MEMORY_PERCENT, USE_AUTO_MACHINE_MEMORY_PERCENT));
-        long highestMlMachineMemoryBytes = -1;
-        long totalMlMemoryBytes = 0;
+            Sets.newHashSet(MAX_LAZY_ML_NODES, MAX_MACHINE_MEMORY_PERCENT, MAX_ML_NODE_SIZE, USE_AUTO_MACHINE_MEMORY_PERCENT));
+        long totalMlMemoryBytes = numMlNodes * mlMachineMemory * mlMemoryPercent / 100;
 
-        DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
-        for (int i = randomIntBetween(1, 10); i > 0; --i) {
-            String nodeName = "_node_name" + i;
-            String nodeId = "_node_id" + i;
-            TransportAddress ta = new TransportAddress(InetAddress.getLoopbackAddress(), 9300 + i);
-            if (randomBoolean()) {
-                // Not an ML node
-                builder.add(new DiscoveryNode(nodeName, nodeId, ta, Collections.emptyMap(), Collections.emptySet(), Version.CURRENT));
-            } else {
-                // ML node
-                long machineMemory = randomLongBetween(2000000000L, 100000000000L);
-                highestMlMachineMemoryBytes = Math.max(machineMemory, highestMlMachineMemoryBytes);
-                totalMlMemoryBytes += machineMemory * mlMemoryPercent / 100;
-                builder.add(new DiscoveryNode(nodeName, nodeId, ta,
-                    Collections.singletonMap(MachineLearning.MACHINE_MEMORY_NODE_ATTR, String.valueOf(machineMemory)),
-                    Collections.emptySet(), Version.CURRENT));
-            }
-        }
-        DiscoveryNodes nodes = builder.build();
+        DiscoveryNodes nodes = randomNodes(numMlNodes, numNonMlNodes, mlMachineMemory);
 
         ByteSizeValue effectiveMaxModelMemoryLimit = TransportMlInfoAction.calculateEffectiveMaxModelMemoryLimit(clusterSettings, nodes);
 
-        if (highestMlMachineMemoryBytes < 0) {
+        if (numMlNodes == 0) {
+            // "Don't know"
             assertThat(effectiveMaxModelMemoryLimit, nullValue());
         } else {
+            // Expect configured percentage of current node size (allowing for small rounding errors)
             assertThat(effectiveMaxModelMemoryLimit, notNullValue());
             assertThat(effectiveMaxModelMemoryLimit.getBytes()
                     + Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
                     + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
-                lessThanOrEqualTo(highestMlMachineMemoryBytes * mlMemoryPercent / 100));
+                lessThanOrEqualTo(mlMachineMemory * mlMemoryPercent / 100));
         }
 
         ByteSizeValue totalMlMemory = TransportMlInfoAction.calculateTotalMlMemory(clusterSettings, nodes);
@@ -78,4 +67,120 @@ public class TransportMlInfoActionTests extends ESTestCase {
         assertThat(totalMlMemory, notNullValue());
         assertThat(totalMlMemory, is(ByteSizeValue.ofMb(totalMlMemoryBytes / (1024 * 1024))));
     }
+
+    public void testCalculateEffectiveMaxModelMemoryLimitNoMlNodesButMaxMlNodeSizeAndLazyNodesAllowed() {
+
+        int mlMemoryPercent = randomIntBetween(5, 90);
+        long mlMaxNodeSize = randomLongBetween(2000000000L, 100000000000L);
+        int numNonMlNodes = randomIntBetween(0, 10);
+        ClusterSettings clusterSettings = new ClusterSettings(
+            Settings.builder().put(MAX_ML_NODE_SIZE.getKey(), mlMaxNodeSize + "b")
+                .put(MAX_LAZY_ML_NODES.getKey(), randomIntBetween(1, 100))
+                .put(MAX_MACHINE_MEMORY_PERCENT.getKey(), mlMemoryPercent).build(),
+            Sets.newHashSet(MAX_LAZY_ML_NODES, MAX_MACHINE_MEMORY_PERCENT, MAX_ML_NODE_SIZE, USE_AUTO_MACHINE_MEMORY_PERCENT));
+
+        DiscoveryNodes nodes = randomNodes(0, numNonMlNodes, 0);
+
+        ByteSizeValue effectiveMaxModelMemoryLimit = TransportMlInfoAction.calculateEffectiveMaxModelMemoryLimit(clusterSettings, nodes);
+
+        // Expect configured percentage of maximum declared node size (allowing for small rounding errors)
+        assertThat(effectiveMaxModelMemoryLimit, notNullValue());
+        assertThat(effectiveMaxModelMemoryLimit.getBytes()
+                + Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
+                + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
+            lessThanOrEqualTo(mlMaxNodeSize * mlMemoryPercent / 100));
+
+        ByteSizeValue totalMlMemory = TransportMlInfoAction.calculateTotalMlMemory(clusterSettings, nodes);
+
+        assertThat(totalMlMemory, notNullValue());
+        assertThat(totalMlMemory, is(ByteSizeValue.ofMb(0)));
+    }
+
+    public void testCalculateEffectiveMaxModelMemoryLimitSmallMlNodesButMaxMlNodeSizeBiggerAndLazyNodesAllowed() {
+
+        int mlMemoryPercent = randomIntBetween(5, 90);
+        long mlMaxNodeSize = randomLongBetween(2000000000L, 100000000000L);
+        long mlMachineMemory = mlMaxNodeSize / randomLongBetween(3, 5);
+        int numMlNodes = randomIntBetween(1, 10);
+        int numNonMlNodes = randomIntBetween(0, 10);
+        ClusterSettings clusterSettings = new ClusterSettings(
+            Settings.builder().put(MAX_ML_NODE_SIZE.getKey(), mlMaxNodeSize + "b")
+                .put(MAX_LAZY_ML_NODES.getKey(), randomIntBetween(numMlNodes + 1, 100))
+                .put(MAX_MACHINE_MEMORY_PERCENT.getKey(), mlMemoryPercent).build(),
+            Sets.newHashSet(MAX_LAZY_ML_NODES, MAX_MACHINE_MEMORY_PERCENT, MAX_ML_NODE_SIZE, USE_AUTO_MACHINE_MEMORY_PERCENT));
+        long totalMlMemoryBytes = numMlNodes * mlMachineMemory * mlMemoryPercent / 100;
+
+        DiscoveryNodes nodes = randomNodes(numMlNodes, numNonMlNodes, mlMachineMemory);
+
+        ByteSizeValue effectiveMaxModelMemoryLimit = TransportMlInfoAction.calculateEffectiveMaxModelMemoryLimit(clusterSettings, nodes);
+
+        // Expect configured percentage of maximum declared node size (allowing for small rounding errors) - bigger than current node size
+        assertThat(effectiveMaxModelMemoryLimit, notNullValue());
+        assertThat(effectiveMaxModelMemoryLimit.getBytes()
+                + Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
+                + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
+            lessThanOrEqualTo(mlMaxNodeSize * mlMemoryPercent / 100));
+        assertThat(effectiveMaxModelMemoryLimit.getBytes()
+                + Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
+                + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
+            greaterThan(2 * mlMachineMemory * mlMemoryPercent / 100));
+
+        ByteSizeValue totalMlMemory = TransportMlInfoAction.calculateTotalMlMemory(clusterSettings, nodes);
+
+        assertThat(totalMlMemory, notNullValue());
+        assertThat(totalMlMemory, is(ByteSizeValue.ofMb(totalMlMemoryBytes / (1024 * 1024))));
+    }
+
+    public void testCalculateEffectiveMaxModelMemoryLimitSmallMlNodesButMaxMlNodeSizeBiggerAndLazyNodesExhausted() {
+
+        int mlMemoryPercent = randomIntBetween(5, 90);
+        long mlMaxNodeSize = randomLongBetween(2000000000L, 100000000000L);
+        long mlMachineMemory = mlMaxNodeSize / randomLongBetween(3, 5);
+        int numMlNodes = randomIntBetween(2, 10);
+        int numNonMlNodes = randomIntBetween(0, 10);
+        ClusterSettings clusterSettings = new ClusterSettings(
+            Settings.builder().put(MAX_ML_NODE_SIZE.getKey(), mlMaxNodeSize + "b")
+                .put(MAX_LAZY_ML_NODES.getKey(), randomIntBetween(1, numMlNodes - 1))
+                .put(MAX_MACHINE_MEMORY_PERCENT.getKey(), mlMemoryPercent).build(),
+            Sets.newHashSet(MAX_LAZY_ML_NODES, MAX_MACHINE_MEMORY_PERCENT, MAX_ML_NODE_SIZE, USE_AUTO_MACHINE_MEMORY_PERCENT));
+        long totalMlMemoryBytes = numMlNodes * mlMachineMemory * mlMemoryPercent / 100;
+
+        DiscoveryNodes nodes = randomNodes(numMlNodes, numNonMlNodes, mlMachineMemory);
+
+        ByteSizeValue effectiveMaxModelMemoryLimit = TransportMlInfoAction.calculateEffectiveMaxModelMemoryLimit(clusterSettings, nodes);
+
+        // Expect configured percentage of current node size (allowing for small rounding errors) - max is bigger but can't be added
+        assertThat(effectiveMaxModelMemoryLimit, notNullValue());
+        assertThat(effectiveMaxModelMemoryLimit.getBytes()
+                + Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
+                + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
+            lessThanOrEqualTo(mlMachineMemory * mlMemoryPercent / 100));
+
+        ByteSizeValue totalMlMemory = TransportMlInfoAction.calculateTotalMlMemory(clusterSettings, nodes);
+
+        assertThat(totalMlMemory, notNullValue());
+        assertThat(totalMlMemory, is(ByteSizeValue.ofMb(totalMlMemoryBytes / (1024 * 1024))));
+    }
+
+    DiscoveryNodes randomNodes(int numMlNodes, int numNonMlNodes, long mlMachineMemory) {
+
+        DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
+
+        for (int i = 0; i < numMlNodes + numNonMlNodes; ++i) {
+            String nodeName = "_node_name" + i;
+            String nodeId = "_node_id" + i;
+            TransportAddress ta = new TransportAddress(InetAddress.getLoopbackAddress(), 9300 + i);
+            if (i < numMlNodes) {
+                // ML node
+                builder.add(new DiscoveryNode(nodeName, nodeId, ta,
+                    Collections.singletonMap(MachineLearning.MACHINE_MEMORY_NODE_ATTR, String.valueOf(mlMachineMemory)),
+                    Collections.emptySet(), Version.CURRENT));
+            } else {
+                // Not an ML node
+                builder.add(new DiscoveryNode(nodeName, nodeId, ta, Collections.emptyMap(), Collections.emptySet(), Version.CURRENT));
+            }
+        }
+
+        return builder.build();
+    }
 }