瀏覽代碼

Merge pull request #15907 from jasontedor/load-average

Reintroduce five-minute and fifteen-minute load averages on Linux

Relates #12049, relates #14741
Jason Tedor 9 年之前
父節點
當前提交
d9fd6e2fe3

+ 4 - 4
core/src/main/java/org/elasticsearch/env/Environment.java

@@ -40,7 +40,7 @@ import static org.elasticsearch.common.Strings.cleanPath;
  * The environment of where things exists.
  * The environment of where things exists.
  */
  */
 @SuppressForbidden(reason = "configures paths for the system")
 @SuppressForbidden(reason = "configures paths for the system")
-// TODO: move PathUtils to be package-private here instead of 
+// TODO: move PathUtils to be package-private here instead of
 // public+forbidden api!
 // public+forbidden api!
 public class Environment {
 public class Environment {
 
 
@@ -72,7 +72,7 @@ public class Environment {
 
 
     /** Path to the PID file (can be null if no PID file is configured) **/
     /** Path to the PID file (can be null if no PID file is configured) **/
     private final Path pidFile;
     private final Path pidFile;
-    
+
     /** Path to the temporary file directory used by the JDK */
     /** Path to the temporary file directory used by the JDK */
     private final Path tmpFile = PathUtils.get(System.getProperty("java.io.tmpdir"));
     private final Path tmpFile = PathUtils.get(System.getProperty("java.io.tmpdir"));
 
 
@@ -292,7 +292,7 @@ public class Environment {
     public Path pidFile() {
     public Path pidFile() {
         return pidFile;
         return pidFile;
     }
     }
-    
+
     /** Path to the default temp directory used by the JDK */
     /** Path to the default temp directory used by the JDK */
     public Path tmpFile() {
     public Path tmpFile() {
         return tmpFile;
         return tmpFile;
@@ -317,7 +317,7 @@ public class Environment {
     public static FileStore getFileStore(Path path) throws IOException {
     public static FileStore getFileStore(Path path) throws IOException {
         return ESFileStore.getMatchingFileStore(path, fileStores);
         return ESFileStore.getMatchingFileStore(path, fileStores);
     }
     }
-    
+
     /**
     /**
      * Returns true if the path is writable.
      * Returns true if the path is writable.
      * Acts just like {@link Files#isWritable(Path)}, except won't
      * Acts just like {@link Files#isWritable(Path)}, except won't

+ 33 - 5
core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java

@@ -20,11 +20,16 @@
 package org.elasticsearch.monitor.os;
 package org.elasticsearch.monitor.os;
 
 
 import org.apache.lucene.util.Constants;
 import org.apache.lucene.util.Constants;
+import org.apache.lucene.util.SuppressForbidden;
+import org.elasticsearch.common.io.PathUtils;
 import org.elasticsearch.monitor.Probes;
 import org.elasticsearch.monitor.Probes;
 
 
+import java.io.IOException;
 import java.lang.management.ManagementFactory;
 import java.lang.management.ManagementFactory;
 import java.lang.management.OperatingSystemMXBean;
 import java.lang.management.OperatingSystemMXBean;
 import java.lang.reflect.Method;
 import java.lang.reflect.Method;
+import java.nio.file.Files;
+import java.util.List;
 
 
 public class OsProbe {
 public class OsProbe {
 
 
@@ -103,17 +108,40 @@ public class OsProbe {
     }
     }
 
 
     /**
     /**
-     * Returns the system load average for the last minute.
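+     * Returns the one-minute, five-minute and fifteen-minute system load averages, or null if unavailable (a component of -1 means that average is not available)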
      */
      */
-    public double getSystemLoadAverage() {
+    public double[] getSystemLoadAverage() {
+        if (Constants.LINUX) {
+            double[] loadAverage = readProcLoadavg("/proc/loadavg");
+            if (loadAverage != null) {
+                return loadAverage;
+            }
+            // fallback
+        }
         if (getSystemLoadAverage == null) {
         if (getSystemLoadAverage == null) {
-            return -1;
+            return null;
         }
         }
         try {
         try {
-            return (double) getSystemLoadAverage.invoke(osMxBean);
+            double oneMinuteLoadAverage = (double) getSystemLoadAverage.invoke(osMxBean);
+            return new double[] { oneMinuteLoadAverage, -1, -1 };
         } catch (Throwable t) {
         } catch (Throwable t) {
-            return -1;
+            return null;
+        }
+    }
+
+    @SuppressForbidden(reason = "access /proc")
+    private static double[] readProcLoadavg(String procLoadavg) {
+        try {
+            List<String> lines = Files.readAllLines(PathUtils.get(procLoadavg));
+            if (!lines.isEmpty()) {
+                String[] fields = lines.get(0).split("\\s+");
+                return new double[] { Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2]) };
+            }
+        } catch (IOException e) {
+            // do not fail Elasticsearch if something unexpected
+            // happens here
         }
         }
+        return null;
     }
     }
 
 
     public short getSystemCpuPercent() {
     public short getSystemCpuPercent() {

+ 21 - 5
core/src/main/java/org/elasticsearch/monitor/os/OsStats.java

@@ -87,7 +87,13 @@ public class OsStats implements Streamable, ToXContent {
         if (cpu != null) {
         if (cpu != null) {
             builder.startObject(Fields.CPU);
             builder.startObject(Fields.CPU);
             builder.field(Fields.PERCENT, cpu.getPercent());
             builder.field(Fields.PERCENT, cpu.getPercent());
-            builder.field(Fields.LOAD_AVERAGE, cpu.getLoadAverage());
+            if (cpu.getLoadAverage() != null) {
+                builder.startArray(Fields.LOAD_AVERAGE);
+                builder.value(cpu.getLoadAverage()[0]);
+                builder.value(cpu.getLoadAverage()[1]);
+                builder.value(cpu.getLoadAverage()[2]);
+                builder.endArray();
+            }
             builder.endObject();
             builder.endObject();
         }
         }
 
 
@@ -152,8 +158,9 @@ public class OsStats implements Streamable, ToXContent {
     }
     }
 
 
     public static class Cpu implements Streamable {
     public static class Cpu implements Streamable {
+
         short percent = -1;
         short percent = -1;
-        double loadAverage = -1;
+        double[] loadAverage = null;
 
 
         Cpu() {}
         Cpu() {}
 
 
@@ -166,20 +173,29 @@ public class OsStats implements Streamable, ToXContent {
         @Override
         @Override
         public void readFrom(StreamInput in) throws IOException {
         public void readFrom(StreamInput in) throws IOException {
             percent = in.readShort();
             percent = in.readShort();
-            loadAverage = in.readDouble();
+            if (in.readBoolean()) {
+                loadAverage = in.readDoubleArray();
+            } else {
+                loadAverage = null;
+            }
         }
         }
 
 
         @Override
         @Override
         public void writeTo(StreamOutput out) throws IOException {
         public void writeTo(StreamOutput out) throws IOException {
             out.writeShort(percent);
             out.writeShort(percent);
-            out.writeDouble(loadAverage);
+            if (loadAverage == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeDoubleArray(loadAverage);
+            }
         }
         }
 
 
         public short getPercent() {
         public short getPercent() {
             return percent;
             return percent;
         }
         }
 
 
-        public double getLoadAverage() {
+        public double[] getLoadAverage() {
             return loadAverage;
             return loadAverage;
         }
         }
     }
     }

+ 7 - 2
core/src/main/java/org/elasticsearch/rest/action/cat/RestNodesAction.java

@@ -134,7 +134,9 @@ public class RestNodesAction extends AbstractCatAction {
         table.addCell("file_desc.max", "default:false;alias:fdm,fileDescriptorMax;text-align:right;desc:max file descriptors");
         table.addCell("file_desc.max", "default:false;alias:fdm,fileDescriptorMax;text-align:right;desc:max file descriptors");
 
 
         table.addCell("cpu", "alias:cpu;text-align:right;desc:recent cpu usage");
         table.addCell("cpu", "alias:cpu;text-align:right;desc:recent cpu usage");
-        table.addCell("load", "alias:l;text-align:right;desc:most recent load avg");
+        table.addCell("load_1m", "alias:l;text-align:right;desc:1m load avg");
+        table.addCell("load_5m", "alias:l;text-align:right;desc:5m load avg");
+        table.addCell("load_15m", "alias:l;text-align:right;desc:15m load avg");
         table.addCell("uptime", "default:false;alias:u;text-align:right;desc:node uptime");
         table.addCell("uptime", "default:false;alias:u;text-align:right;desc:node uptime");
         table.addCell("node.role", "alias:r,role,dc,nodeRole;desc:d:data node, c:client node");
         table.addCell("node.role", "alias:r,role,dc,nodeRole;desc:d:data node, c:client node");
         table.addCell("master", "alias:m;desc:m:master-eligible, *:current master");
         table.addCell("master", "alias:m;desc:m:master-eligible, *:current master");
@@ -263,7 +265,10 @@ public class RestNodesAction extends AbstractCatAction {
             table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());
             table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());
 
 
             table.addCell(osStats == null ? null : Short.toString(osStats.getCpu().getPercent()));
             table.addCell(osStats == null ? null : Short.toString(osStats.getCpu().getPercent()));
-            table.addCell(osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()));
+            boolean hasLoadAverage = osStats != null && osStats.getCpu().getLoadAverage() != null;
+            table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[0]));
+            table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[1]));
+            table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[2]));
             table.addCell(jvmStats == null ? null : jvmStats.getUptime());
             table.addCell(jvmStats == null ? null : jvmStats.getUptime());
             table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
             table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
             table.addCell(masterId == null ? "x" : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");
             table.addCell(masterId == null ? "x" : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");

+ 3 - 0
core/src/main/resources/org/elasticsearch/bootstrap/security.policy

@@ -115,4 +115,7 @@ grant {
 
 
   // needed by JDKESLoggerTests
   // needed by JDKESLoggerTests
   permission java.util.logging.LoggingPermission "control";
   permission java.util.logging.LoggingPermission "control";
+
+  // load averages on Linux
+  permission java.io.FilePermission "/proc/loadavg", "read";
 };
 };

+ 23 - 4
core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java

@@ -50,12 +50,31 @@ public class OsProbeTests extends ESTestCase {
         assertNotNull(stats);
         assertNotNull(stats);
         assertThat(stats.getTimestamp(), greaterThan(0L));
         assertThat(stats.getTimestamp(), greaterThan(0L));
         assertThat(stats.getCpu().getPercent(), anyOf(equalTo((short) -1), is(both(greaterThanOrEqualTo((short) 0)).and(lessThanOrEqualTo((short) 100)))));
         assertThat(stats.getCpu().getPercent(), anyOf(equalTo((short) -1), is(both(greaterThanOrEqualTo((short) 0)).and(lessThanOrEqualTo((short) 100)))));
+        double[] loadAverage = stats.getCpu().loadAverage;
+        if (loadAverage != null) {
+            assertThat(loadAverage.length, equalTo(3));
+        }
         if (Constants.WINDOWS) {
         if (Constants.WINDOWS) {
-            // Load average is always -1 on Windows platforms
-            assertThat(stats.getCpu().getLoadAverage(), equalTo((double) -1));
+            // load average is unavailable on Windows
+            if (loadAverage != null) {
+                assertThat(loadAverage[0], equalTo((double) -1));
+                assertThat(loadAverage[1], equalTo((double) -1));
+                assertThat(loadAverage[2], equalTo((double) -1));
+            }
+        } else if (Constants.LINUX) {
+            // we should be able to get the load average
+            assertNotNull(loadAverage);
+            assertThat(loadAverage[0], greaterThanOrEqualTo((double) 0));
+            assertThat(loadAverage[1], greaterThanOrEqualTo((double) 0));
+            assertThat(loadAverage[2], greaterThanOrEqualTo((double) 0));
         } else {
         } else {
-            // Load average can be negative if not available or not computed yet, otherwise it should be >= 0
-            assertThat(stats.getCpu().getLoadAverage(), anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
+            // one-minute load average is available, but 5-minute and 15-minute load averages are not
+            // load average can be negative if not available or not computed yet, otherwise it should be >= 0
+            if (loadAverage != null) {
+                assertThat(loadAverage[0], anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
+                assertThat(loadAverage[1], equalTo((double) -1));
+                assertThat(loadAverage[2], equalTo((double) -1));
+            }
         }
         }
 
 
         assertNotNull(stats.getMem());
         assertNotNull(stats.getMem());

+ 2 - 1
docs/reference/cluster/nodes-stats.asciidoc

@@ -132,7 +132,8 @@ the operating system:
     Recent CPU usage for the whole system, or -1 if not supported
     Recent CPU usage for the whole system, or -1 if not supported
 
 
 `os.cpu.load_average`::
 `os.cpu.load_average`::
-	System load average for the last minute, or -1 if not supported
+	Array of system load averages for the last one minute, five
+	minutes and fifteen minutes (a value of -1 indicates not supported)
 
 
 `os.mem.total_in_bytes`::
 `os.mem.total_in_bytes`::
 	Total amount of physical memory in bytes
 	Total amount of physical memory in bytes

+ 26 - 11
docs/reference/migration/migrate_3_0.asciidoc

@@ -552,17 +552,32 @@ and high risk of being misused. The ability to change the thread pool type for a
 that it is still possible to adjust relevant thread pool parameters for each of the thread pools (e.g., depending on
 that it is still possible to adjust relevant thread pool parameters for each of the thread pools (e.g., depending on
 the thread pool type, `keep_alive`, `queue_size`, etc.).
 the thread pool type, `keep_alive`, `queue_size`, etc.).
 
 
-=== Adding system CPU percent to OS stats
-
-The recent CPU usage (as a percent) has been added to the OS stats reported under the node stats API and the cat nodes
-API. The breaking change here is that there is a new object in the "os" object in the node stats response. This object
-is called "cpu" and includes "percent" and "load_average" as fields. This moves the "load_average" field that was
-previously a top-level field in the "os" object to the "cpu" object. Additionally, the "cpu" field in the cat nodes API
-response is output by default.
-
-Finally, the API for org.elasticsearch.monitor.os.OsStats has changed. The `getLoadAverage` method has been removed. The
-value for this can now be obtained from `OsStats.Cpu#getLoadAverage`. Additionally, the recent CPU usage can be obtained
-from `OsStats.Cpu#getPercent`.
+=== System CPU stats
+
+The recent CPU usage (as a percent) has been added to the OS stats
+reported under the node stats API and the cat nodes API. The breaking
+change here is that there is a new object in the "os" object in the node
+stats response. This object is called "cpu" and includes "percent" and
+"load_average" as fields. This moves the "load_average" field that was
+previously a top-level field in the "os" object to the "cpu" object. The
+format of the "load_average" field has changed to an array of length
+three representing the one-minute, five-minute and fifteen-minute load
+averages (a value of -1 for any of the array components indicates that the
+corresponding metric is not available).
+
+In the cat nodes API response, the "cpu" field is output by default. The
+previous "load" field has been removed and is replaced by "load_1m",
+"load_5m", and "load_15m" which represent the one-minute, five-minute
+and fifteen-minute loads respectively. These values are output by
+default, and a value of -1 indicates that the corresponding metric is
+not available.
+
+Finally, the API for org.elasticsearch.monitor.os.OsStats has
+changed. The `getLoadAverage` method has been removed. The value for
+this can now be obtained from `OsStats.Cpu#getLoadAverage` but it is no
+longer a double and is instead an array encapsulating the one-minute,
+five-minute and fifteen-minute load averages. Additionally, the recent
+CPU usage can be obtained from `OsStats.Cpu#getPercent`.
 
 
 === Fields option
 === Fields option
 Only stored fields are retrievable with this option.
 Only stored fields are retrievable with this option.

+ 4 - 4
rest-api-spec/src/main/resources/rest-api-spec/test/cat.nodes/10_basic.yaml

@@ -6,8 +6,8 @@
 
 
   - match:
   - match:
       $body: |
       $body: |
-               /  #host       ip                          heap.percent        ram.percent     cpu         load                    node.role        master          name
-               ^  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+            \s+  \d*         \s+ (-)?\d* \s+ (-)?\d*(\.\d+)?    \s+  [-dc]       \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
+               /  #host       ip                          heap.percent        ram.percent     cpu         load_1m                load_5m                load_15m               node.role        master          name
+               ^  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+            \s+  \d*         \s+ (-)?\d* \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc]       \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
 
 
   - do:
   - do:
       cat.nodes:
       cat.nodes:
@@ -15,8 +15,8 @@
 
 
   - match:
   - match:
       $body: |
       $body: |
-               /^  host  \s+  ip                     \s+  heap\.percent   \s+  ram\.percent \s+ cpu      \s+ load           \s+  node\.role   \s+  master   \s+   name  \n
-                  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+             \s+  \d*          \s+ (-)?\d* \s+ (-)?\d*(\.\d+)?    \s+  [-dc]        \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
+               /^  host  \s+  ip                     \s+  heap\.percent   \s+  ram\.percent \s+ cpu      \s+ load_1m            \s+ load_5m            \s+ load_15m           \s+ node\.role   \s+  master   \s+   name  \n
+                  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+             \s+  \d*          \s+ (-)?\d* \s+  ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc]        \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
 
 
   - do:
   - do:
       cat.nodes:
       cat.nodes: