瀏覽代碼

Return 0 for negative "free" and "total" memory reported by the OS (#42725)

* Return 0 for negative "free" and "total" memory reported by the OS

We've had a situation where the MX bean reported negative values for the
free memory of the OS, in those rare cases we want to return a value of
0 rather than blowing up later down the pipeline.

In the event that there is a serialization or creation error with regard
to memory use, this adds asserts so the failure will occur as soon as
possible and give us a better location for investigation.

Resolves #42157

* Fix test passing in invalid memory value

* Fix another test passing in invalid memory value

* Also change mem check in MachineLearning.machineMemoryFromStats

* Add background documentation for why we prevent negative return values

* Clarify comment a bit more
Lee Hinman 6 年之前
父節點
當前提交
1ae9f91e14

+ 38 - 6
server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java

@@ -42,6 +42,24 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
+/**
+ * The {@link OsProbe} class retrieves information about the physical and swap size of the machine
+ * memory, as well as the system load average and cpu load.
+ *
+ * In some exceptional cases, it's possible the underlying native method used by
+ * {@link #getFreePhysicalMemorySize()} and {@link #getTotalPhysicalMemorySize()} can return a
+ * negative value. Because of this, we prevent those methods from returning negative values,
+ * returning 0 instead.
+ *
+ * The OS can report a negative number in a number of cases:
+ * - Non-supported OSes (HP-UX, or AIX)
+ * - A failure of macOS to initialize host statistics
+ * - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call
+ * - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
+ * - An error case retrieving these values from a linux kernel
+ * - A non-standard libc implementation not implementing the required values
+ * For a more exhaustive explanation, see https://github.com/elastic/elasticsearch/pull/42725
+ */
 public class OsProbe {
 
     private static final OperatingSystemMXBean osMxBean = ManagementFactory.getOperatingSystemMXBean();
@@ -67,12 +85,19 @@ public class OsProbe {
      */
     public long getFreePhysicalMemorySize() {
         if (getFreePhysicalMemorySize == null) {
-            return -1;
+            logger.warn("getFreePhysicalMemorySize is not available");
+            return 0;
         }
         try {
-            return (long) getFreePhysicalMemorySize.invoke(osMxBean);
+            final long freeMem = (long) getFreePhysicalMemorySize.invoke(osMxBean);
+            if (freeMem < 0) {
+                logger.warn("OS reported a negative free memory value [{}]", freeMem);
+                return 0;
+            }
+            return freeMem;
         } catch (Exception e) {
-            return -1;
+            logger.warn("exception retrieving free physical memory", e);
+            return 0;
         }
     }
 
@@ -81,12 +106,19 @@ public class OsProbe {
      */
     public long getTotalPhysicalMemorySize() {
         if (getTotalPhysicalMemorySize == null) {
-            return -1;
+            logger.warn("getTotalPhysicalMemorySize is not available");
+            return 0;
         }
         try {
-            return (long) getTotalPhysicalMemorySize.invoke(osMxBean);
+            final long totalMem = (long) getTotalPhysicalMemorySize.invoke(osMxBean);
+            if (totalMem < 0) {
+                logger.warn("OS reported a negative total memory value [{}]", totalMem);
+                return 0;
+            }
+            return totalMem;
         } catch (Exception e) {
-            return -1;
+            logger.warn("exception retrieving total physical memory", e);
+            return 0;
         }
     }
 

+ 4 - 0
server/src/main/java/org/elasticsearch/monitor/os/OsStats.java

@@ -228,13 +228,17 @@ public class OsStats implements Writeable, ToXContentFragment {
         private final long free;
 
         public Mem(long total, long free) {
+            assert total >= 0 : "expected total memory to be positive, got: " + total;
+            assert free >= 0 : "expected free memory to be positive, got: " + total;
             this.total = total;
             this.free = free;
         }
 
         public Mem(StreamInput in) throws IOException {
             this.total = in.readLong();
+            assert total >= 0 : "expected total memory to be positive, got: " + total;
             this.free = in.readLong();
+            assert free >= 0 : "expected free memory to be positive, got: " + total;
         }
 
         @Override

+ 2 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

@@ -801,8 +801,8 @@ public class MachineLearning extends Plugin implements ActionPlugin, AnalysisPlu
             if (containerLimitStr != null) {
                 BigInteger containerLimit = new BigInteger(containerLimitStr);
                 if ((containerLimit.compareTo(BigInteger.valueOf(mem)) < 0 && containerLimit.compareTo(BigInteger.ZERO) > 0)
-                        // mem < 0 means the value couldn't be obtained for some reason
-                        || (mem < 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
+                        // mem <= 0 means the value couldn't be obtained for some reason
+                        || (mem <= 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
                     mem = containerLimit.longValue();
                 }
             }

+ 2 - 2
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MachineLearningTests.java

@@ -97,8 +97,8 @@ public class MachineLearningTests extends ESTestCase {
 
     public void testMachineMemory_givenStatsFailure() throws IOException {
         OsStats stats = mock(OsStats.class);
-        when(stats.getMem()).thenReturn(new OsStats.Mem(-1, -1));
-        assertEquals(-1L, MachineLearning.machineMemoryFromStats(stats));
+        when(stats.getMem()).thenReturn(new OsStats.Mem(0, 0));
+        assertEquals(0L, MachineLearning.machineMemoryFromStats(stats));
     }
 
     public void testMachineMemory_givenNoCgroup() throws IOException {

+ 1 - 1
x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsMonitoringDocTests.java

@@ -329,7 +329,7 @@ public class NodeStatsMonitoringDocTests extends BaseFilteredMonitoringDocTestCa
         final OsStats.Cgroup osCgroup = new OsStats.Cgroup("_cpu_acct_ctrl_group", ++iota, "_cpu_ctrl_group", ++iota, ++iota, osCpuStat,
                 "_memory_ctrl_group", "2000000000", "1000000000");
 
-        final OsStats.Mem osMem = new OsStats.Mem(no, no);
+        final OsStats.Mem osMem = new OsStats.Mem(0, 0);
         final OsStats.Swap osSwap = new OsStats.Swap(no, no);
         final OsStats os = new OsStats(no, osCpu, osMem, osSwap, osCgroup);