Browse Source

Reintroduce five-minute and fifteen-minute load averages on Linux

This commit reintroduces the five-minute and fifteen-minute load stats
on Linux, and changes the format of the load_average field back to an
array.
Jason Tedor 9 years ago
parent
commit
1de2081ed3

+ 4 - 4
core/src/main/java/org/elasticsearch/env/Environment.java

@@ -40,7 +40,7 @@ import static org.elasticsearch.common.Strings.cleanPath;
  * The environment of where things exists.
  */
 @SuppressForbidden(reason = "configures paths for the system")
-// TODO: move PathUtils to be package-private here instead of 
+// TODO: move PathUtils to be package-private here instead of
 // public+forbidden api!
 public class Environment {
 
@@ -72,7 +72,7 @@ public class Environment {
 
     /** Path to the PID file (can be null if no PID file is configured) **/
     private final Path pidFile;
-    
+
     /** Path to the temporary file directory used by the JDK */
     private final Path tmpFile = PathUtils.get(System.getProperty("java.io.tmpdir"));
 
@@ -292,7 +292,7 @@ public class Environment {
     public Path pidFile() {
         return pidFile;
     }
-    
+
     /** Path to the default temp directory used by the JDK */
     public Path tmpFile() {
         return tmpFile;
@@ -317,7 +317,7 @@ public class Environment {
     public static FileStore getFileStore(Path path) throws IOException {
         return ESFileStore.getMatchingFileStore(path, fileStores);
     }
-    
+
     /**
      * Returns true if the path is writable.
      * Acts just like {@link Files#isWritable(Path)}, except won't

+ 33 - 5
core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java

@@ -20,11 +20,16 @@
 package org.elasticsearch.monitor.os;
 
 import org.apache.lucene.util.Constants;
+import org.apache.lucene.util.SuppressForbidden;
+import org.elasticsearch.common.io.PathUtils;
 import org.elasticsearch.monitor.Probes;
 
+import java.io.IOException;
 import java.lang.management.ManagementFactory;
 import java.lang.management.OperatingSystemMXBean;
 import java.lang.reflect.Method;
+import java.nio.file.Files;
+import java.util.List;
 
 public class OsProbe {
 
@@ -103,17 +108,40 @@ public class OsProbe {
     }
 
     /**
-     * Returns the system load average for the last minute.
+     * Returns the system load averages
      */
-    public double getSystemLoadAverage() {
+    public double[] getSystemLoadAverage() {
+        if (Constants.LINUX) {
+            double[] loadAverage = readProcLoadavg("/proc/loadavg");
+            if (loadAverage != null) {
+                return loadAverage;
+            }
+            // fallback
+        }
         if (getSystemLoadAverage == null) {
-            return -1;
+            return null;
         }
         try {
-            return (double) getSystemLoadAverage.invoke(osMxBean);
+            double oneMinuteLoadAverage = (double) getSystemLoadAverage.invoke(osMxBean);
+            return new double[] { oneMinuteLoadAverage, -1, -1 };
         } catch (Throwable t) {
-            return -1;
+            return null;
+        }
+    }
+
+    @SuppressForbidden(reason = "access /proc")
+    private static double[] readProcLoadavg(String procLoadavg) {
+        try {
+            List<String> lines = Files.readAllLines(PathUtils.get(procLoadavg));
+            if (!lines.isEmpty()) {
+                String[] fields = lines.get(0).split("\\s+");
+                return new double[] { Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2]) };
+            }
+        } catch (IOException e) {
+            // do not fail Elasticsearch if something unexpected
+            // happens here
         }
+        return null;
     }
 
     public short getSystemCpuPercent() {

+ 21 - 5
core/src/main/java/org/elasticsearch/monitor/os/OsStats.java

@@ -87,7 +87,13 @@ public class OsStats implements Streamable, ToXContent {
         if (cpu != null) {
             builder.startObject(Fields.CPU);
             builder.field(Fields.PERCENT, cpu.getPercent());
-            builder.field(Fields.LOAD_AVERAGE, cpu.getLoadAverage());
+            if (cpu.getLoadAverage() != null) {
+                builder.startArray(Fields.LOAD_AVERAGE);
+                builder.value(cpu.getLoadAverage()[0]);
+                builder.value(cpu.getLoadAverage()[1]);
+                builder.value(cpu.getLoadAverage()[2]);
+                builder.endArray();
+            }
             builder.endObject();
         }
 
@@ -152,8 +158,9 @@ public class OsStats implements Streamable, ToXContent {
     }
 
     public static class Cpu implements Streamable {
+
         short percent = -1;
-        double loadAverage = -1;
+        double[] loadAverage = null;
 
         Cpu() {}
 
@@ -166,20 +173,29 @@ public class OsStats implements Streamable, ToXContent {
         @Override
         public void readFrom(StreamInput in) throws IOException {
             percent = in.readShort();
-            loadAverage = in.readDouble();
+            if (in.readBoolean()) {
+                loadAverage = in.readDoubleArray();
+            } else {
+                loadAverage = null;
+            }
         }
 
         @Override
         public void writeTo(StreamOutput out) throws IOException {
             out.writeShort(percent);
-            out.writeDouble(loadAverage);
+            if (loadAverage == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeDoubleArray(loadAverage);
+            }
         }
 
         public short getPercent() {
             return percent;
         }
 
-        public double getLoadAverage() {
+        public double[] getLoadAverage() {
             return loadAverage;
         }
     }

+ 7 - 2
core/src/main/java/org/elasticsearch/rest/action/cat/RestNodesAction.java

@@ -134,7 +134,9 @@ public class RestNodesAction extends AbstractCatAction {
         table.addCell("file_desc.max", "default:false;alias:fdm,fileDescriptorMax;text-align:right;desc:max file descriptors");
 
         table.addCell("cpu", "alias:cpu;text-align:right;desc:recent cpu usage");
-        table.addCell("load", "alias:l;text-align:right;desc:most recent load avg");
+        table.addCell("load_1m", "alias:l;text-align:right;desc:1m load avg");
+        table.addCell("load_5m", "alias:l;text-align:right;desc:5m load avg");
+        table.addCell("load_15m", "alias:l;text-align:right;desc:15m load avg");
         table.addCell("uptime", "default:false;alias:u;text-align:right;desc:node uptime");
         table.addCell("node.role", "alias:r,role,dc,nodeRole;desc:d:data node, c:client node");
         table.addCell("master", "alias:m;desc:m:master-eligible, *:current master");
@@ -263,7 +265,10 @@ public class RestNodesAction extends AbstractCatAction {
             table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());
 
             table.addCell(osStats == null ? null : Short.toString(osStats.getCpu().getPercent()));
-            table.addCell(osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()));
+            boolean hasLoadAverage = osStats != null && osStats.getCpu().getLoadAverage() != null;
+            table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[0]));
+            table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[1]));
+            table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[2]));
             table.addCell(jvmStats == null ? null : jvmStats.getUptime());
             table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
             table.addCell(masterId == null ? "x" : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");

+ 3 - 0
core/src/main/resources/org/elasticsearch/bootstrap/security.policy

@@ -115,4 +115,7 @@ grant {
 
   // needed by JDKESLoggerTests
   permission java.util.logging.LoggingPermission "control";
+
+  // load averages on Linux
+  permission java.io.FilePermission "/proc/loadavg", "read";
 };

+ 23 - 4
core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java

@@ -50,12 +50,31 @@ public class OsProbeTests extends ESTestCase {
         assertNotNull(stats);
         assertThat(stats.getTimestamp(), greaterThan(0L));
         assertThat(stats.getCpu().getPercent(), anyOf(equalTo((short) -1), is(both(greaterThanOrEqualTo((short) 0)).and(lessThanOrEqualTo((short) 100)))));
+        double[] loadAverage = stats.getCpu().loadAverage;
+        if (loadAverage != null) {
+            assertThat(loadAverage.length, equalTo(3));
+        }
         if (Constants.WINDOWS) {
-            // Load average is always -1 on Windows platforms
-            assertThat(stats.getCpu().getLoadAverage(), equalTo((double) -1));
+            // load average is unavailable on Windows
+            if (loadAverage != null) {
+                assertThat(loadAverage[0], equalTo((double) -1));
+                assertThat(loadAverage[1], equalTo((double) -1));
+                assertThat(loadAverage[2], equalTo((double) -1));
+            }
+        } else if (Constants.LINUX) {
+            // we should be able to get the load average
+            assertNotNull(loadAverage);
+            assertThat(loadAverage[0], greaterThanOrEqualTo((double) 0));
+            assertThat(loadAverage[1], greaterThanOrEqualTo((double) 0));
+            assertThat(loadAverage[2], greaterThanOrEqualTo((double) 0));
         } else {
-            // Load average can be negative if not available or not computed yet, otherwise it should be >= 0
-            assertThat(stats.getCpu().getLoadAverage(), anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
+            // one-minute load average is available, but five-minute and fifteen-minute load averages are not
+            // load average can be negative if not available or not computed yet, otherwise it should be >= 0
+            if (loadAverage != null) {
+                assertThat(loadAverage[0], anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
+                assertThat(loadAverage[1], equalTo((double) -1));
+                assertThat(loadAverage[2], equalTo((double) -1));
+            }
         }
 
         assertNotNull(stats.getMem());

+ 2 - 1
docs/reference/cluster/nodes-stats.asciidoc

@@ -132,7 +132,8 @@ the operating system:
     Recent CPU usage for the whole system, or -1 if not supported
 
 `os.cpu.load_average`::
-	System load average for the last minute, or -1 if not supported
+	Array of system load averages for the last one minute, five
+	minutes and fifteen minutes (a value of -1 indicates not supported)
 
 `os.mem.total_in_bytes`::
 	Total amount of physical memory in bytes

+ 26 - 11
docs/reference/migration/migrate_3_0.asciidoc

@@ -552,17 +552,32 @@ and high risk of being misused. The ability to change the thread pool type for a
 that it is still possible to adjust relevant thread pool parameters for each of the thread pools (e.g., depending on
 the thread pool type, `keep_alive`, `queue_size`, etc.).
 
-=== Adding system CPU percent to OS stats
-
-The recent CPU usage (as a percent) has been added to the OS stats reported under the node stats API and the cat nodes
-API. The breaking change here is that there is a new object in the "os" object in the node stats response. This object
-is called "cpu" and includes "percent" and "load_average" as fields. This moves the "load_average" field that was
-previously a top-level field in the "os" object to the "cpu" object. Additionally, the "cpu" field in the cat nodes API
-response is output by default.
-
-Finally, the API for org.elasticsearch.monitor.os.OsStats has changed. The `getLoadAverage` method has been removed. The
-value for this can now be obtained from `OsStats.Cpu#getLoadAverage`. Additionally, the recent CPU usage can be obtained
-from `OsStats.Cpu#getPercent`.
+=== System CPU stats
+
+The recent CPU usage (as a percent) has been added to the OS stats
+reported under the node stats API and the cat nodes API. The breaking
+change here is that there is a new object in the "os" object in the node
+stats response. This object is called "cpu" and includes "percent" and
+"load_average" as fields. This moves the "load_average" field that was
+previously a top-level field in the "os" object to the "cpu" object. The
+format of the "load_average" field has changed to an array of length
+three representing the one-minute, five-minute and fifteen-minute load
+averages (a value of -1 for any of the array components indicates that the
+corresponding metric is not available).
+
+In the cat nodes API response, the "cpu" field is output by default. The
+previous "load" field has been removed and is replaced by "load_1m",
+"load_5m", and "load_15m" which represent the one-minute, five-minute
+and fifteen-minute loads respectively. These values are output by
+default, and a value of -1 indicates that the corresponding metric is
+not available.
+
+Finally, the API for org.elasticsearch.monitor.os.OsStats has
+changed. The `getLoadAverage` method has been removed. The value for
+this can now be obtained from `OsStats.Cpu#getLoadAverage` but it is no
+longer a double and is instead an array containing the one-minute,
+five-minute and fifteen-minute load averages. Additionally, the recent
+CPU usage can be obtained from `OsStats.Cpu#getPercent`.
 
 === Fields option
 Only stored fields are retrievable with this option.

+ 4 - 4
rest-api-spec/src/main/resources/rest-api-spec/test/cat.nodes/10_basic.yaml

@@ -6,8 +6,8 @@
 
   - match:
       $body: |
-               /  #host       ip                          heap.percent        ram.percent     cpu         load                    node.role        master          name
-               ^  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+            \s+  \d*         \s+ (-)?\d* \s+ (-)?\d*(\.\d+)?    \s+  [-dc]       \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
+               /  #host       ip                          heap.percent        ram.percent     cpu         load_1m                load_5m                load_15m               node.role        master          name
+               ^  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+            \s+  \d*         \s+ (-)?\d* \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc]       \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
 
   - do:
       cat.nodes:
@@ -15,8 +15,8 @@
 
   - match:
       $body: |
-               /^  host  \s+  ip                     \s+  heap\.percent   \s+  ram\.percent \s+ cpu      \s+ load           \s+  node\.role   \s+  master   \s+   name  \n
-                  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+             \s+  \d*          \s+ (-)?\d* \s+ (-)?\d*(\.\d+)?    \s+  [-dc]        \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
+               /^  host  \s+  ip                     \s+  heap\.percent   \s+  ram\.percent \s+ cpu      \s+ load_1m            \s+ load_5m            \s+ load_15m           \s+ node\.role   \s+  master   \s+   name  \n
+                  (\S+   \s+  (\d{1,3}\.){3}\d{1,3}  \s+  \d+             \s+  \d*          \s+ (-)?\d* \s+  ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc]        \s+  [-*mx]    \s+   (\S+\s?)+     \n)+  $/
 
   - do:
       cat.nodes: