Browse Source

[ML] Add total ML memory to ML info (#65195)

This change adds an extra piece of information,
limits.total_ml_memory, to the ML info response.
This returns the total amount of memory that ML
is permitted to use for native processes across
all ML nodes in the cluster.  Some of this may
already be in use; the value returned is the total
ML memory, not the amount currently available.
David Roberts 4 years ago
parent
commit
e4ce39845b

+ 5 - 2
docs/reference/ml/anomaly-detection/apis/get-ml-info.asciidoc

@@ -30,7 +30,8 @@ privileges. See <<security-privileges>>, <<built-in-roles>> and
 This endpoint is designed to be used by a user interface that needs to fully
 understand machine learning configurations where some options are not specified,
 meaning that the defaults should be used.  This endpoint may be used to find out
-what those defaults are.
+what those defaults are.  It also provides information about the maximum size
+of {ml} jobs that could run in the current cluster configuration.
 
 [[get-ml-info-example]]
 == {api-examples-title}
@@ -115,7 +116,8 @@ This is a possible response:
     "build_hash": "99a07c016d5a73"
   },
   "limits" : {
-    "effective_max_model_memory_limit": "28961mb"
+    "effective_max_model_memory_limit": "28961mb",
+    "total_ml_memory": "86883mb"
   }
 }
 ----
@@ -123,3 +125,4 @@ This is a possible response:
 // TESTRESPONSE[s/"version": "7.0.0",/"version": "$body.native_code.version",/]
 // TESTRESPONSE[s/"build_hash": "99a07c016d5a73"/"build_hash": "$body.native_code.build_hash"/]
 // TESTRESPONSE[s/"effective_max_model_memory_limit": "28961mb"/"effective_max_model_memory_limit": "$body.limits.effective_max_model_memory_limit"/]
+// TESTRESPONSE[s/"total_ml_memory": "86883mb"/"total_ml_memory": "$body.limits.total_ml_memory"/]

+ 21 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportMlInfoAction.java

@@ -15,6 +15,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.ClusterSettings;
+import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.tasks.Task;
@@ -115,6 +116,23 @@ public class TransportMlInfoAction extends HandledTransportAction<MlInfoAction.R
         return anomalyDetectorsDefaults;
     }
 
+    static ByteSizeValue calculateTotalMlMemory(ClusterSettings clusterSettings, DiscoveryNodes nodes) {
+
+        long totalMlMemory = 0;
+
+        for (DiscoveryNode node : nodes) {
+            OptionalLong limit = NativeMemoryCalculator.allowedBytesForMl(node, clusterSettings);
+            if (limit.isEmpty()) {
+                continue;
+            }
+            totalMlMemory += limit.getAsLong();
+        }
+
+        // Round down to a whole number of megabytes, since we generally deal with model
+        // memory limits in whole megabytes
+        return ByteSizeValue.ofMb(ByteSizeUnit.BYTES.toMB(totalMlMemory));
+    }
+
     static ByteSizeValue calculateEffectiveMaxModelMemoryLimit(ClusterSettings clusterSettings, DiscoveryNodes nodes) {
 
         long maxMlMemory = -1;
@@ -135,7 +153,7 @@ public class TransportMlInfoAction extends HandledTransportAction<MlInfoAction.R
 
         maxMlMemory -= Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes());
         maxMlMemory -= MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes();
-        return ByteSizeValue.ofMb(Math.max(0L, maxMlMemory) / 1024 / 1024);
+        return ByteSizeValue.ofMb(ByteSizeUnit.BYTES.toMB(Math.max(0L, maxMlMemory)));
     }
 
     private Map<String, Object> limits() {
@@ -153,6 +171,8 @@ public class TransportMlInfoAction extends HandledTransportAction<MlInfoAction.R
         if (effectiveMaxModelMemoryLimit != null) {
             limits.put("effective_max_model_memory_limit", effectiveMaxModelMemoryLimit.getStringRep());
         }
+        limits.put("total_ml_memory",
+            calculateTotalMlMemory(clusterService.getClusterSettings(), clusterService.state().getNodes()).getStringRep());
         return limits;
     }
 }

+ 12 - 4
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/action/TransportMlInfoActionTests.java

@@ -24,6 +24,7 @@ import java.util.Collections;
 
 import static org.elasticsearch.xpack.ml.MachineLearning.MAX_MACHINE_MEMORY_PERCENT;
 import static org.elasticsearch.xpack.ml.MachineLearning.USE_AUTO_MACHINE_MEMORY_PERCENT;
+import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;
@@ -36,7 +37,8 @@ public class TransportMlInfoActionTests extends ESTestCase {
         ClusterSettings clusterSettings = new ClusterSettings(
             Settings.builder().put(MAX_MACHINE_MEMORY_PERCENT.getKey(), mlMemoryPercent).build(),
             Sets.newHashSet(MAX_MACHINE_MEMORY_PERCENT, USE_AUTO_MACHINE_MEMORY_PERCENT));
-        long highestMlMachineMemory = -1;
+        long highestMlMachineMemoryBytes = -1;
+        long totalMlMemoryBytes = 0;
 
         DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
         for (int i = randomIntBetween(1, 10); i > 0; --i) {
@@ -49,7 +51,8 @@ public class TransportMlInfoActionTests extends ESTestCase {
             } else {
                 // ML node
                 long machineMemory = randomLongBetween(2000000000L, 100000000000L);
-                highestMlMachineMemory = Math.max(machineMemory, highestMlMachineMemory);
+                highestMlMachineMemoryBytes = Math.max(machineMemory, highestMlMachineMemoryBytes);
+                totalMlMemoryBytes += machineMemory * mlMemoryPercent / 100;
                 builder.add(new DiscoveryNode(nodeName, nodeId, ta,
                     Collections.singletonMap(MachineLearning.MACHINE_MEMORY_NODE_ATTR, String.valueOf(machineMemory)),
                     Collections.emptySet(), Version.CURRENT));
@@ -59,14 +62,19 @@ public class TransportMlInfoActionTests extends ESTestCase {
 
         ByteSizeValue effectiveMaxModelMemoryLimit = TransportMlInfoAction.calculateEffectiveMaxModelMemoryLimit(clusterSettings, nodes);
 
-        if (highestMlMachineMemory < 0) {
+        if (highestMlMachineMemoryBytes < 0) {
             assertThat(effectiveMaxModelMemoryLimit, nullValue());
         } else {
             assertThat(effectiveMaxModelMemoryLimit, notNullValue());
             assertThat(effectiveMaxModelMemoryLimit.getBytes()
                     + Math.max(Job.PROCESS_MEMORY_OVERHEAD.getBytes(), DataFrameAnalyticsConfig.PROCESS_MEMORY_OVERHEAD.getBytes())
                     + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes(),
-                lessThanOrEqualTo(highestMlMachineMemory * mlMemoryPercent / 100));
+                lessThanOrEqualTo(highestMlMachineMemoryBytes * mlMemoryPercent / 100));
         }
+
+        ByteSizeValue totalMlMemory = TransportMlInfoAction.calculateTotalMlMemory(clusterSettings, nodes);
+
+        assertThat(totalMlMemory, notNullValue());
+        assertThat(totalMlMemory, is(ByteSizeValue.ofMb(totalMlMemoryBytes / (1024 * 1024))));
     }
 }

+ 9 - 4
x-pack/plugin/src/test/resources/rest-api-spec/test/ml/ml_info.yml

@@ -17,8 +17,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - is_false: limits.max_model_memory_limit
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -36,8 +37,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - match: { limits.max_model_memory_limit: "512mb" }
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -55,8 +57,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - match: { limits.max_model_memory_limit: "6gb" }
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -74,8 +77,9 @@ teardown:
   - match: { defaults.anomaly_detectors.daily_model_snapshot_retention_after_days: 1 }
   - match: { defaults.datafeeds.scroll_size: 1000 }
   - match: { limits.max_model_memory_limit: "6gb" }
-  # We cannot assert an exact value for the next one as it will vary depending on the test machine
+  # We cannot assert an exact value for the next two as they will vary depending on the test machine
   - match: { limits.effective_max_model_memory_limit: "/\\d+[kmg]?b/" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }
 
   - do:
@@ -95,4 +99,5 @@ teardown:
   - match: { limits.max_model_memory_limit: "1mb" }
   # This time we can assert an exact value for the next one because the hard limit is so low
   - match: { limits.effective_max_model_memory_limit: "1mb" }
+  - match: { limits.total_ml_memory: "/\\d+mb/" }
   - match: { upgrade_mode: false }