
Introduce max headroom for disk watermark stages (#88639)

Introduce max headroom settings for the low, high, and flood disk watermark stages, similar to the existing max headroom setting for the flood stage of the frozen tier. Introduce the new max headrooms in HealthMetadata and in ReactiveStorageDeciderService. Add multiple tests in DiskThresholdDeciderUnitTests, DiskThresholdDeciderTests, and DiskThresholdMonitorTests. Also add addition, subtraction, and min operations to ByteSizeValue.
Iraklis Psaroudakis, 3 years ago · commit 34471b1cd2
22 changed files with 2067 additions and 392 deletions
  1. docs/changelog/88639.yaml (+6, -0)
  2. docs/reference/how-to/fix-common-cluster-issues.asciidoc (+28, -18)
  3. docs/reference/index-modules/blocks.asciidoc (+9, -3)
  4. docs/reference/modules/cluster/disk_allocator.asciidoc (+19, -3)
  5. docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc (+13, -3)
  6. docs/reference/troubleshooting/fix-common-cluster-issues.asciidoc (+5, -5)
  7. server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/MockDiskUsagesIT.java (+26, -24)
  8. server/src/internalClusterTest/java/org/elasticsearch/health/HealthMetadataServiceIT.java (+61, -39)
  9. server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java (+227, -21)
  10. server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java (+9, -1)
  11. server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java (+3, -0)
  12. server/src/main/java/org/elasticsearch/common/settings/Setting.java (+18, -0)
  13. server/src/main/java/org/elasticsearch/common/unit/ByteSizeValue.java (+41, -3)
  14. server/src/main/java/org/elasticsearch/common/unit/RelativeByteSizeValue.java (+4, -1)
  15. server/src/main/java/org/elasticsearch/health/metadata/HealthMetadata.java (+75, -10)
  16. server/src/main/java/org/elasticsearch/health/metadata/HealthMetadataService.java (+21, -0)
  17. server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java (+496, -108)
  18. server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java (+442, -9)
  19. server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java (+300, -101)
  20. server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java (+93, -38)
  21. server/src/test/java/org/elasticsearch/common/unit/ByteSizeValueTests.java (+153, -0)
  22. server/src/test/java/org/elasticsearch/health/metadata/HealthMetadataSerializationTests.java (+18, -5)

+ 6 - 0
docs/changelog/88639.yaml

@@ -0,0 +1,6 @@
+pr: 88639
+summary: Introduce max headroom for disk watermark stages
+area: Infra/Settings
+type: enhancement
+issues:
+ - 81406

+ 28 - 18
docs/reference/how-to/fix-common-cluster-issues.asciidoc

@@ -51,8 +51,13 @@ PUT _cluster/settings
 {
   "persistent": {
     "cluster.routing.allocation.disk.watermark.low": "90%",
+    "cluster.routing.allocation.disk.watermark.low.max_headroom": "100GB",
     "cluster.routing.allocation.disk.watermark.high": "95%",
-    "cluster.routing.allocation.disk.watermark.flood_stage": "97%"
+    "cluster.routing.allocation.disk.watermark.high.max_headroom": "20GB",
+    "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
+    "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5GB",
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%",
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5GB"
   }
 }
 
@@ -82,8 +87,13 @@ PUT _cluster/settings
 {
   "persistent": {
     "cluster.routing.allocation.disk.watermark.low": null,
+    "cluster.routing.allocation.disk.watermark.low.max_headroom": null,
     "cluster.routing.allocation.disk.watermark.high": null,
-    "cluster.routing.allocation.disk.watermark.flood_stage": null
+    "cluster.routing.allocation.disk.watermark.high.max_headroom": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": null
   }
 }
 ----
@@ -674,8 +684,8 @@ for tips on diagnosing and preventing them.
 [[task-queue-backlog]]
 === Task queue backlog
 
-A backlogged task queue can prevent tasks from completing and 
-put the cluster into an unhealthy state. 
+A backlogged task queue can prevent tasks from completing and
+put the cluster into an unhealthy state.
 Resource constraints, a large number of tasks being triggered at once,
 and long running tasks can all contribute to a backlogged task queue.
 
@@ -685,11 +695,11 @@ and long running tasks can all contribute to a backlogged task queue.
 
 **Check the thread pool status**
 
-A <<high-cpu-usage,depleted thread pool>> can result in <<rejected-requests,rejected requests>>. 
+A <<high-cpu-usage,depleted thread pool>> can result in <<rejected-requests,rejected requests>>.
 
-You can use the <<cat-thread-pool,cat thread pool API>> to 
+You can use the <<cat-thread-pool,cat thread pool API>> to
 see the number of active threads in each thread pool and
-how many tasks are queued, how many have been rejected, and how many have completed. 
+how many tasks are queued, how many have been rejected, and how many have completed.
 
 [source,console]
 ----
@@ -698,9 +708,9 @@ GET /_cat/thread_pool?v&s=t,n&h=type,name,node_name,active,queue,rejected,comple
 
 **Inspect the hot threads on each node**
 
-If a particular thread pool queue is backed up, 
-you can periodically poll the <<cluster-nodes-hot-threads,Nodes hot threads>> API 
-to determine if the thread has sufficient 
+If a particular thread pool queue is backed up,
+you can periodically poll the <<cluster-nodes-hot-threads,Nodes hot threads>> API
+to determine if the thread has sufficient
 resources to progress and gauge how quickly it is progressing.
 
 [source,console]
@@ -710,9 +720,9 @@ GET /_nodes/hot_threads
 
 **Look for long running tasks**
 
-Long-running tasks can also cause a backlog. 
-You can use the <<tasks,task management>> API to get information about the tasks that are running. 
-Check the `running_time_in_nanos` to identify tasks that are taking an excessive amount of time to complete. 
+Long-running tasks can also cause a backlog.
+You can use the <<tasks,task management>> API to get information about the tasks that are running.
+Check the `running_time_in_nanos` to identify tasks that are taking an excessive amount of time to complete.
 
 [source,console]
 ----
@@ -723,10 +733,10 @@ GET /_tasks?filter_path=nodes.*.tasks
 [[resolve-task-queue-backlog]]
 ==== Resolve a task queue backlog
 
-**Increase available resources** 
+**Increase available resources**
 
-If tasks are progressing slowly and the queue is backing up, 
-you might need to take steps to <<reduce-cpu-usage>>. 
+If tasks are progressing slowly and the queue is backing up,
+you might need to take steps to <<reduce-cpu-usage>>.
 
 In some cases, increasing the thread pool size might help.
 For example, the `force_merge` thread pool defaults to a single thread.
@@ -734,5 +744,5 @@ Increasing the size to 2 might help reduce a backlog of force merge requests.
 
 **Cancel stuck tasks**
 
-If you find the active task's hot thread isn't progressing and there's a backlog, 
-consider canceling the task. 
+If you find the active task's hot thread isn't progressing and there's a backlog,
+consider canceling the task.

+ 9 - 3
docs/reference/index-modules/blocks.asciidoc

@@ -35,9 +35,15 @@ the index itself - can increase the index size over time. When
 not permitted. However, deleting the index itself releases the read-only index
 block and makes resources available almost immediately.
 +
-IMPORTANT: {es} adds and removes the read-only index block automatically when
-the disk utilization falls below the high watermark, controlled by
-<<cluster-routing-flood-stage,cluster.routing.allocation.disk.watermark.flood_stage>>.
+IMPORTANT: {es} adds the read-only index block automatically when the disk
+utilization exceeds the flood stage watermark, controlled by the
+<<cluster-routing-flood-stage,cluster.routing.allocation.disk.watermark.flood_stage>>
+and <<cluster-routing-flood-stage,cluster.routing.allocation.disk.watermark.flood_stage.max_headroom>>
+settings, and removes the block automatically when the disk utilization falls
+under the high watermark, controlled by the
+<<cluster-routing-flood-stage,cluster.routing.allocation.disk.watermark.high>>
+and <<cluster-routing-flood-stage,cluster.routing.allocation.disk.watermark.high.max_headroom>>
+settings.
 
 `index.blocks.read`::
 

+ 19 - 3
docs/reference/modules/cluster/disk_allocator.asciidoc

@@ -75,6 +75,11 @@ Defaults to `true`. Set to `false` to disable the disk allocation decider. Upon
 Controls the low watermark for disk usage. It defaults to `85%`, meaning that {es} will not allocate shards to nodes that have more than 85% disk used. It can alternatively be set to a ratio value, e.g., `0.85`. It can also be set to an absolute byte value (like `500mb`) to prevent {es} from allocating shards if less than the specified amount of space is available. This setting has no effect on the primary shards of newly-created indices but will prevent their replicas from being allocated.
 // end::cluster-routing-watermark-low-tag[]
 
+`cluster.routing.allocation.disk.watermark.low.max_headroom`::
+(<<dynamic-cluster-setting,Dynamic>>) Controls the max headroom for the low watermark (in case of a percentage/ratio value).
+Defaults to 200GB when `cluster.routing.allocation.disk.watermark.low` is not explicitly set.
+This caps the amount of free space required.
+
 [[cluster-routing-watermark-high]]
 // tag::cluster-routing-watermark-high-tag[]
 `cluster.routing.allocation.disk.watermark.high` {ess-icon}::
@@ -82,6 +87,11 @@ Controls the low watermark for disk usage. It defaults to `85%`, meaning that {e
 Controls the high watermark. It defaults to `90%`, meaning that {es} will attempt to relocate shards away from a node whose disk usage is above 90%. It can alternatively be set to a ratio value, e.g., `0.9`. It can also be set to an absolute byte value (similarly to the low watermark) to relocate shards away from a node if it has less than the specified amount of free space. This setting affects the allocation of all shards, whether previously allocated or not.
 // end::cluster-routing-watermark-high-tag[]
 
+`cluster.routing.allocation.disk.watermark.high.max_headroom`::
+(<<dynamic-cluster-setting,Dynamic>>) Controls the max headroom for the high watermark (in case of a percentage/ratio value).
+Defaults to 150GB when `cluster.routing.allocation.disk.watermark.high` is not explicitly set.
+This caps the amount of free space required.
+
 `cluster.routing.allocation.disk.watermark.enable_for_single_data_node`::
     (<<static-cluster-setting,Static>>)
 In earlier releases, the default behaviour was to disregard disk watermarks for a single
@@ -97,8 +107,14 @@ is now `true`. The setting will be removed in a future release.
 (<<dynamic-cluster-setting,Dynamic>>)
 Controls the flood stage watermark, which defaults to 95%. {es} enforces a read-only index block (`index.blocks.read_only_allow_delete`) on every index that has one or more shards allocated on the node, and that has at least one disk exceeding the flood stage. This setting is a last resort to prevent nodes from running out of disk space. The index block is automatically released when the disk utilization falls below the high watermark. Similarly to the low and high watermark values, it can alternatively be set to a ratio value, e.g., `0.95`, or an absolute byte value.
 
+`cluster.routing.allocation.disk.watermark.flood_stage.max_headroom`::
+(<<dynamic-cluster-setting,Dynamic>>) Controls the max headroom for the flood stage watermark (in case of a percentage/ratio value).
+Defaults to 100GB when
+`cluster.routing.allocation.disk.watermark.flood_stage` is not explicitly set.
+This caps the amount of free space required.
+
 NOTE: You cannot mix the usage of percentage/ratio values and byte values within
-the watermark settings. Either all values are set to percentage/ratio values, or all are set to byte values. This enforcement is so that {es} can validate that the settings are internally consistent, ensuring that the low disk threshold is less than the high disk threshold, and the high disk threshold is less than the flood stage threshold.
+the watermark settings. Either all values are set to percentage/ratio values, or all are set to byte values. This enforcement is so that {es} can validate that the settings are internally consistent, ensuring that the low disk threshold is less than the high disk threshold, and the high disk threshold is less than the flood stage threshold. A similar check is done for the max headroom values.
 
 An example of resetting the read-only index block on the `my-index-000001` index:
 
@@ -122,8 +138,8 @@ Controls the flood stage watermark for dedicated frozen nodes, which defaults to
 
 `cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom` {ess-icon}::
 (<<dynamic-cluster-setting,Dynamic>>)
-Controls the max headroom for the flood stage watermark for dedicated frozen
-nodes. Defaults to 20GB when
+Controls the max headroom for the flood stage watermark (in case of a
+percentage/ratio value) for dedicated frozen nodes. Defaults to 20GB when
 `cluster.routing.allocation.disk.watermark.flood_stage.frozen` is not explicitly
 set. This caps the amount of free space required on dedicated frozen nodes.
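For orientation, here is a minimal worked sketch (not the {es} implementation; the helper name is made up for illustration) of how a max headroom caps the free space that a percentage watermark would otherwise require:

    // Effective free-space requirement for a percentage watermark with a max headroom,
    // roughly: min(total * (1 - watermark), maxHeadroom). A negative headroom means "unset".
    static long requiredFreeBytes(long totalBytes, double watermarkRatio, long maxHeadroomBytes) {
        long fromPercentage = (long) Math.ceil(totalBytes * (1.0 - watermarkRatio));
        return maxHeadroomBytes < 0 ? fromPercentage : Math.min(fromPercentage, maxHeadroomBytes);
    }

With the defaults above (low watermark 85%, low max headroom 200GB), a 100GB disk needs about 15GB free, so the headroom has no effect, while a 10TB disk needs only 200GB free instead of the 1.5TB implied by the percentage alone.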
 

+ 13 - 3
docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc

@@ -46,8 +46,13 @@ PUT _cluster/settings
 {
   "persistent": {
     "cluster.routing.allocation.disk.watermark.low": "90%",
+    "cluster.routing.allocation.disk.watermark.low.max_headroom": "100GB",
     "cluster.routing.allocation.disk.watermark.high": "95%",
-    "cluster.routing.allocation.disk.watermark.flood_stage": "97%"
+    "cluster.routing.allocation.disk.watermark.high.max_headroom": "20GB",
+    "cluster.routing.allocation.disk.watermark.flood_stage": "97%",
+    "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5GB",
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%",
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5GB"
   }
 }
 
@@ -77,8 +82,13 @@ PUT _cluster/settings
 {
   "persistent": {
     "cluster.routing.allocation.disk.watermark.low": null,
+    "cluster.routing.allocation.disk.watermark.low.max_headroom": null,
     "cluster.routing.allocation.disk.watermark.high": null,
-    "cluster.routing.allocation.disk.watermark.flood_stage": null
+    "cluster.routing.allocation.disk.watermark.high.max_headroom": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen": null,
+    "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": null
   }
 }
-----
+----

+ 5 - 5
docs/reference/troubleshooting/fix-common-cluster-issues.asciidoc

@@ -16,8 +16,8 @@ the operation and returns an error.
 The most common causes of high CPU usage and their solutions.
 
 <<high-jvm-memory-pressure,High JVM memory pressure>>::
-High JVM memory usage can degrade cluster performance and trigger circuit 
-breaker errors. 
+High JVM memory usage can degrade cluster performance and trigger circuit
+breaker errors.
 
 <<red-yellow-cluster-status,Red or yellow cluster status>>::
 A red or yellow cluster status indicates one or more shards are missing or
@@ -29,8 +29,8 @@ When {es} rejects a request, it stops the operation and returns an error with a
 `429` response code.
 
 <<task-queue-backlog,Task queue backlog>>::
-A backlogged task queue can prevent tasks from completing and put the cluster 
-into an unhealthy state. 
+A backlogged task queue can prevent tasks from completing and put the cluster
+into an unhealthy state.
 
 <<diagnose-unassigned-shards,Diagnose unassigned shards>>::
 There are multiple reasons why shards might get unassigned, ranging from 
@@ -47,4 +47,4 @@ include::common-issues/high-jvm-memory-pressure.asciidoc[]
 include::common-issues/red-yellow-cluster-status.asciidoc[]
 include::common-issues/rejected-requests.asciidoc[]
 include::common-issues/task-queue-backlog.asciidoc[]
-include::common-issues/diagnose-unassigned-shards.asciidoc[]
+include::common-issues/diagnose-unassigned-shards.asciidoc[]

+ 26 - 24
server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/MockDiskUsagesIT.java

@@ -34,8 +34,11 @@ import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicReference;
 
+import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING;
+import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING;
+import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING;
@@ -92,18 +95,18 @@ public class MockDiskUsagesIT extends ESIntegTestCase {
         clusterInfoService.setDiskUsageFunctionAndRefresh((discoveryNode, fsInfoPath) -> setDiskUsage(fsInfoPath, 100, between(10, 100)));
 
         final boolean watermarkBytes = randomBoolean(); // we have to consistently use bytes or percentage for the disk watermark settings
-        assertAcked(
-            client().admin()
-                .cluster()
-                .prepareUpdateSettings()
-                .setPersistentSettings(
-                    Settings.builder()
-                        .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
-                        .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
-                        .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), watermarkBytes ? "0b" : "100%")
-                        .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "0ms")
-                )
-        );
+        Settings.Builder settings = Settings.builder()
+            .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
+            .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
+            .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), watermarkBytes ? "0b" : "100%")
+            .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "0ms");
+        if (watermarkBytes == false && randomBoolean()) {
+            String headroom = randomIntBetween(10, 100) + "b";
+            settings = settings.put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), headroom)
+                .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), headroom)
+                .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), headroom);
+        }
+        assertAcked(client().admin().cluster().prepareUpdateSettings().setPersistentSettings(settings));
         // Create an index with 10 shards so we can check allocation for it
         assertAcked(prepareCreate("test").setSettings(Settings.builder().put("number_of_shards", 10).put("number_of_replicas", 0)));
         ensureGreen("test");
@@ -172,18 +175,17 @@ public class MockDiskUsagesIT extends ESIntegTestCase {
         clusterInfoService.setDiskUsageFunctionAndRefresh((discoveryNode, fsInfoPath) -> setDiskUsage(fsInfoPath, 100, between(15, 100)));
 
         final boolean watermarkBytes = randomBoolean(); // we have to consistently use bytes or percentage for the disk watermark settings
-        assertAcked(
-            client().admin()
-                .cluster()
-                .prepareUpdateSettings()
-                .setPersistentSettings(
-                    Settings.builder()
-                        .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
-                        .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
-                        .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), watermarkBytes ? "5b" : "95%")
-                        .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "150ms")
-                )
-        );
+        Settings.Builder builder = Settings.builder()
+            .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
+            .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%")
+            .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), watermarkBytes ? "5b" : "95%")
+            .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "150ms");
+        if (watermarkBytes == false) {
+            builder = builder.put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "10b")
+                .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "10b")
+                .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "5b");
+        }
+        assertAcked(client().admin().cluster().prepareUpdateSettings().setPersistentSettings(builder));
 
         // Create an index with 6 shards so we can check allocation for it
         prepareCreate("test").setSettings(Settings.builder().put("number_of_shards", 6).put("number_of_replicas", 0)).get();

+ 61 - 39
server/src/internalClusterTest/java/org/elasticsearch/health/HealthMetadataServiceIT.java

@@ -39,12 +39,14 @@ public class HealthMetadataServiceIT extends ESIntegTestCase {
         try (InternalTestCluster internalCluster = internalCluster()) {
             int numberOfNodes = 3;
             Map<String, String> watermarkByNode = new HashMap<>();
+            Map<String, ByteSizeValue> maxHeadroomByNode = new HashMap<>();
             for (int i = 0; i < numberOfNodes; i++) {
-                String customWatermark = percentageMode
-                    ? randomIntBetween(86, 94) + "%"
-                    : new ByteSizeValue(randomIntBetween(6, 19)).toString();
-                String nodeName = startNode(internalCluster, customWatermark);
+                ByteSizeValue randomBytes = new ByteSizeValue(randomLongBetween(6, 19));
+                String customWatermark = percentageMode ? randomIntBetween(86, 94) + "%" : randomBytes.toString();
+                ByteSizeValue customMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
+                String nodeName = startNode(internalCluster, customWatermark, customMaxHeadroom.toString());
                 watermarkByNode.put(nodeName, customWatermark);
+                maxHeadroomByNode.put(nodeName, customMaxHeadroom);
             }
             ensureStableCluster(numberOfNodes);
 
@@ -53,6 +55,7 @@ public class HealthMetadataServiceIT extends ESIntegTestCase {
                 HealthMetadata.Disk diskMetadata = HealthMetadata.getFromClusterState(internalCluster.clusterService().state())
                     .getDiskMetadata();
                 assertThat(diskMetadata.describeHighWatermark(), equalTo(watermarkByNode.get(electedMaster)));
+                assertThat(diskMetadata.highMaxHeadroom(), equalTo(maxHeadroomByNode.get(electedMaster)));
             }
 
             // Stop the master to ensure another node will become master with a different watermark
@@ -63,6 +66,7 @@ public class HealthMetadataServiceIT extends ESIntegTestCase {
                 HealthMetadata.Disk diskMetadata = HealthMetadata.getFromClusterState(internalCluster.clusterService().state())
                     .getDiskMetadata();
                 assertThat(diskMetadata.describeHighWatermark(), equalTo(watermarkByNode.get(electedMaster)));
+                assertThat(diskMetadata.highMaxHeadroom(), equalTo(maxHeadroomByNode.get(electedMaster)));
             }
         }
     }
@@ -70,68 +74,76 @@ public class HealthMetadataServiceIT extends ESIntegTestCase {
     public void testWatermarkSettingUpdate() throws Exception {
         try (InternalTestCluster internalCluster = internalCluster()) {
             int numberOfNodes = 3;
-            String initialWatermark = percentageMode
-                ? randomIntBetween(86, 94) + "%"
-                : new ByteSizeValue(randomIntBetween(6, 19)).toString();
+            ByteSizeValue randomBytes = new ByteSizeValue(randomLongBetween(6, 19));
+            String initialWatermark = percentageMode ? randomIntBetween(86, 94) + "%" : randomBytes.toString();
+            ByteSizeValue initialMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
             for (int i = 0; i < numberOfNodes; i++) {
-                startNode(internalCluster, initialWatermark);
+                startNode(internalCluster, initialWatermark, initialMaxHeadroom.toString());
             }
 
-            String updatedLowWatermark = percentageMode
-                ? randomIntBetween(40, 59) + "%"
-                : new ByteSizeValue(randomIntBetween(101, 200)).toString();
-            String updatedHighWatermark = percentageMode
-                ? randomIntBetween(60, 90) + "%"
-                : new ByteSizeValue(randomIntBetween(50, 100)).toString();
-            String updatedFloodStageWatermark = percentageMode
-                ? randomIntBetween(91, 95) + "%"
-                : new ByteSizeValue(randomIntBetween(5, 10)).toString();
+            randomBytes = new ByteSizeValue(randomLongBetween(101, 200));
+            String updatedLowWatermark = percentageMode ? randomIntBetween(40, 59) + "%" : randomBytes.toString();
+            ByteSizeValue updatedLowMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
+            randomBytes = new ByteSizeValue(randomLongBetween(50, 100));
+            String updatedHighWatermark = percentageMode ? randomIntBetween(60, 90) + "%" : randomBytes.toString();
+            ByteSizeValue updatedHighMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
+            randomBytes = new ByteSizeValue(randomLongBetween(5, 10));
+            String updatedFloodStageWatermark = percentageMode ? randomIntBetween(91, 95) + "%" : randomBytes.toString();
+            ByteSizeValue updatedFloodStageMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
 
             ensureStableCluster(numberOfNodes);
             {
                 HealthMetadata.Disk diskMetadata = HealthMetadata.getFromClusterState(internalCluster.clusterService().state())
                     .getDiskMetadata();
                 assertThat(diskMetadata.describeHighWatermark(), equalTo(initialWatermark));
+                assertThat(diskMetadata.highMaxHeadroom(), equalTo(initialMaxHeadroom));
+            }
+            Settings.Builder builder = Settings.builder()
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), updatedLowWatermark)
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), updatedHighWatermark)
+                .put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(),
+                    updatedFloodStageWatermark
+                );
+            if (percentageMode) {
+                builder = builder.put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    updatedLowMaxHeadroom
+                )
+                    .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), updatedHighMaxHeadroom)
+                    .put(
+                        DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(),
+                        updatedFloodStageMaxHeadroom
+                    );
             }
             internalCluster.client()
                 .admin()
                 .cluster()
-                .updateSettings(
-                    new ClusterUpdateSettingsRequest().persistentSettings(
-                        Settings.builder()
-                            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), updatedLowWatermark)
-                            .put(
-                                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
-                                updatedHighWatermark
-                            )
-                            .put(
-                                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(),
-                                updatedFloodStageWatermark
-                            )
-                    )
-                )
+                .updateSettings(new ClusterUpdateSettingsRequest().persistentSettings(builder))
                 .actionGet();
             assertBusy(() -> {
                 HealthMetadata.Disk diskMetadata = HealthMetadata.getFromClusterState(internalCluster.clusterService().state())
                     .getDiskMetadata();
                 assertThat(diskMetadata.describeHighWatermark(), equalTo(updatedHighWatermark));
+                assertThat(diskMetadata.highMaxHeadroom(), equalTo(updatedHighMaxHeadroom));
                 assertThat(diskMetadata.describeFloodStageWatermark(), equalTo(updatedFloodStageWatermark));
+                assertThat(diskMetadata.floodStageMaxHeadroom(), equalTo(updatedFloodStageMaxHeadroom));
             });
         }
     }
 
-    private String startNode(InternalTestCluster internalCluster, String customWatermark) {
+    private String startNode(InternalTestCluster internalCluster, String customWatermark, String customMaxHeadroom) {
         return internalCluster.startNode(
             Settings.builder()
                 .put(onlyRoles(Set.of(DiscoveryNodeRole.MASTER_ROLE, DiscoveryNodeRole.DATA_ROLE)))
-                .put(createWatermarkSettings(customWatermark))
+                .put(createWatermarkSettings(customWatermark, customMaxHeadroom))
                 .build()
         );
     }
 
-    private Settings createWatermarkSettings(String highWatermark) {
+    private Settings createWatermarkSettings(String highWatermark, String highMaxHeadroom) {
         // We define both thresholds to avoid inconsistencies over the type of the thresholds
-        return Settings.builder()
+        Settings.Builder settings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), percentageMode ? "85%" : "20b")
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), highWatermark)
             .put(
@@ -139,10 +151,20 @@ public class HealthMetadataServiceIT extends ESIntegTestCase {
                 percentageMode ? "95%" : "1b"
             )
             .put(
-                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(),
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(),
                 percentageMode ? "95%" : "5b"
-            )
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "5b")
-            .build();
+            );
+        if (percentageMode) {
+            settings = settings.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "20b")
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "1b")
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "5b");
+            if (highMaxHeadroom.equals("-1") == false) {
+                settings = settings.put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    highMaxHeadroom
+                );
+            }
+        }
+        return settings.build();
     }
 }

+ 227 - 21
server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java

@@ -39,6 +39,20 @@ public class DiskThresholdSettings {
         Setting.Property.Dynamic,
         Setting.Property.NodeScope
     );
+    public static final Setting<ByteSizeValue> CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING = new Setting<>(
+        "cluster.routing.allocation.disk.watermark.low.max_headroom",
+        (settings) -> {
+            if (CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.exists(settings)) {
+                return "-1";
+            } else {
+                return "200GB";
+            }
+        },
+        (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.low.max_headroom"),
+        new MaxHeadroomValidator(),
+        Setting.Property.Dynamic,
+        Setting.Property.NodeScope
+    );
     public static final Setting<RelativeByteSizeValue> CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING = new Setting<>(
         "cluster.routing.allocation.disk.watermark.high",
         "90%",
@@ -47,6 +61,20 @@ public class DiskThresholdSettings {
         Setting.Property.Dynamic,
         Setting.Property.NodeScope
     );
+    public static final Setting<ByteSizeValue> CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING = new Setting<>(
+        "cluster.routing.allocation.disk.watermark.high.max_headroom",
+        (settings) -> {
+            if (CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.exists(settings)) {
+                return "-1";
+            } else {
+                return "150GB";
+            }
+        },
+        (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.high.max_headroom"),
+        new MaxHeadroomValidator(),
+        Setting.Property.Dynamic,
+        Setting.Property.NodeScope
+    );
     public static final Setting<RelativeByteSizeValue> CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING = new Setting<>(
         "cluster.routing.allocation.disk.watermark.flood_stage",
         "95%",
@@ -55,6 +83,20 @@ public class DiskThresholdSettings {
         Setting.Property.Dynamic,
         Setting.Property.NodeScope
     );
+    public static final Setting<ByteSizeValue> CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING = new Setting<>(
+        "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom",
+        (settings) -> {
+            if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.exists(settings)) {
+                return "-1";
+            } else {
+                return "100GB";
+            }
+        },
+        (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom"),
+        new MaxHeadroomValidator(),
+        Setting.Property.Dynamic,
+        Setting.Property.NodeScope
+    );
     public static final Setting<RelativeByteSizeValue> CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING = new Setting<>(
         "cluster.routing.allocation.disk.watermark.flood_stage.frozen",
         "95%",
@@ -72,6 +114,7 @@ public class DiskThresholdSettings {
             }
         },
         (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom"),
+        new MaxHeadroomValidator(),
         Setting.Property.Dynamic,
         Setting.Property.NodeScope
     );
@@ -81,10 +124,28 @@ public class DiskThresholdSettings {
         Setting.Property.Dynamic,
         Setting.Property.NodeScope
     );
+    private static final List<Setting<?>> WATERMARK_VALIDATOR_SETTINGS_LIST = List.of(
+        CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING,
+        CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING,
+        CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING
+    );
+    private static final List<Setting<?>> MAX_HEADROOM_VALIDATOR_SETTINGS_LIST = List.of(
+        CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING,
+        CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING,
+        CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING,
+        CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING,
+        // The headroom validator also depends on the low and frozen flood watermark settings to check whether they are ratios/percentages
+        // (we do not need to check the other watermarks, since the watermark validator checks that they are all ratios/percentages or not)
+        CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING,
+        CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING
+    );
 
     private volatile RelativeByteSizeValue lowStageWatermark;
+    private volatile ByteSizeValue lowStageMaxHeadroom;
     private volatile RelativeByteSizeValue highStageWatermark;
+    private volatile ByteSizeValue highStageMaxHeadroom;
     private volatile RelativeByteSizeValue floodStageWatermark;
+    private volatile ByteSizeValue floodStageMaxHeadroom;
     private volatile RelativeByteSizeValue frozenFloodStageWatermark;
     private volatile ByteSizeValue frozenFloodStageMaxHeadroom;
     private volatile boolean enabled;
@@ -101,18 +162,27 @@ public class DiskThresholdSettings {
 
     public DiskThresholdSettings(Settings settings, ClusterSettings clusterSettings) {
         setLowWatermark(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.get(settings));
+        setLowStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.get(settings));
         setHighWatermark(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.get(settings));
+        setHighStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.get(settings));
         setFloodStageWatermark(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.get(settings));
+        setFloodStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.get(settings));
         setFrozenFloodStageWatermark(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.get(settings));
         setFrozenFloodStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.get(settings));
         this.rerouteInterval = CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(settings);
         this.enabled = CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.get(settings);
         clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING, this::setLowWatermark);
+        clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING, this::setLowStageMaxHeadroom);
         clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING, this::setHighWatermark);
+        clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING, this::setHighStageMaxHeadroom);
         clusterSettings.addSettingsUpdateConsumer(
             CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING,
             this::setFloodStageWatermark
         );
+        clusterSettings.addSettingsUpdateConsumer(
+            CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING,
+            this::setFloodStageMaxHeadroom
+        );
         clusterSettings.addSettingsUpdateConsumer(
             CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING,
             this::setFrozenFloodStageWatermark
@@ -197,12 +267,121 @@ public class DiskThresholdSettings {
 
         @Override
         public Iterator<Setting<?>> settings() {
-            final List<Setting<?>> settings = List.of(
-                CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING,
-                CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING,
-                CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING
+            return WATERMARK_VALIDATOR_SETTINGS_LIST.iterator();
+        }
+
+    }
+
+    /**
+     * Validates that low, high and flood stage max headrooms adhere to the comparison: flood &lt; high &lt; low.
+     * Also validates that if the low max headroom is set, then the high max headroom must be set as well.
+     * Also validates that if the high max headroom is set, then the flood stage max headroom must be set as well.
+     * Also validates that if max headrooms are set, the respective watermark values should be ratios/percentages.
+     * Else, throws an exception.
+     */
+    static class MaxHeadroomValidator implements Setting.Validator<ByteSizeValue> {
+
+        @Override
+        public void validate(ByteSizeValue value) {
+
+        }
+
+        @Override
+        public void validate(final ByteSizeValue value, final Map<Setting<?>, Object> settings, boolean isPresent) {
+            if (isPresent && value.equals(ByteSizeValue.MINUS_ONE)) {
+                throw new IllegalArgumentException("setting a headroom value to less than 0 is not supported");
+            }
+
+            final ByteSizeValue lowHeadroom = (ByteSizeValue) settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING);
+            final ByteSizeValue highHeadroom = (ByteSizeValue) settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING);
+            final ByteSizeValue floodHeadroom = (ByteSizeValue) settings.get(
+                CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING
+            );
+            final ByteSizeValue frozenFloodHeadroom = (ByteSizeValue) settings.get(
+                CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING
             );
-            return settings.iterator();
+
+            // Ensure that if max headroom values are set, then watermark values are ratios/percentages.
+            final RelativeByteSizeValue low = (RelativeByteSizeValue) settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING);
+            if (low.isAbsolute()
+                && (lowHeadroom.equals(ByteSizeValue.MINUS_ONE) == false
+                    || highHeadroom.equals(ByteSizeValue.MINUS_ONE) == false
+                    || floodHeadroom.equals(ByteSizeValue.MINUS_ONE) == false)) {
+                // No need to check that the high or flood stage watermarks are absolute as well, since there is another check in
+                // WatermarkValidator that all low/high/flood watermarks should be either ratios/percentages or absolute values.
+                throw new IllegalArgumentException(
+                    "At least one of the disk max headroom settings is set [low="
+                        + lowHeadroom.getStringRep()
+                        + ", high="
+                        + highHeadroom.getStringRep()
+                        + ", flood="
+                        + floodHeadroom.getStringRep()
+                        + "], while the disk watermark values are set to absolute values instead of ratios/percentages, e.g., "
+                        + "the low watermark is ["
+                        + low.getStringRep()
+                        + "]"
+                );
+            }
+
+            // Similar check for the frozen flood watermark and max headroom settings
+            final RelativeByteSizeValue frozenFlood = (RelativeByteSizeValue) settings.get(
+                CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING
+            );
+            if (frozenFlood.isAbsolute() && frozenFloodHeadroom.equals(ByteSizeValue.MINUS_ONE) == false) {
+                throw new IllegalArgumentException(
+                    "The frozen flood stage disk max headroom setting is set ["
+                        + frozenFloodHeadroom.getStringRep()
+                        + "], while the frozen flood stage disk watermark setting is set to an absolute value "
+                        + "instead of a ratio/percentage ["
+                        + frozenFlood.getStringRep()
+                        + "]"
+                );
+            }
+
+            if (lowHeadroom.equals(ByteSizeValue.MINUS_ONE) == false && highHeadroom.equals(ByteSizeValue.MINUS_ONE)) {
+                throw new IllegalArgumentException(
+                    "high disk max headroom ["
+                        + highHeadroom.getStringRep()
+                        + "] is not set, while the low disk max headroom is set ["
+                        + lowHeadroom.getStringRep()
+                        + "]"
+                );
+            }
+            if (highHeadroom.equals(ByteSizeValue.MINUS_ONE) == false && floodHeadroom.equals(ByteSizeValue.MINUS_ONE)) {
+                throw new IllegalArgumentException(
+                    "flood disk max headroom ["
+                        + floodHeadroom.getStringRep()
+                        + "] is not set, while the high disk max headroom is set ["
+                        + highHeadroom.getStringRep()
+                        + "]"
+                );
+            }
+
+            // For the comparisons, we need to mind that headroom values can default to -1.
+
+            if (highHeadroom.compareTo(lowHeadroom) > 0 && lowHeadroom.equals(ByteSizeValue.MINUS_ONE) == false) {
+                throw new IllegalArgumentException(
+                    "high disk max headroom ["
+                        + highHeadroom.getStringRep()
+                        + "] more than low disk max headroom ["
+                        + lowHeadroom.getStringRep()
+                        + "]"
+                );
+            }
+            if (floodHeadroom.compareTo(highHeadroom) > 0 && highHeadroom.equals(ByteSizeValue.MINUS_ONE) == false) {
+                throw new IllegalArgumentException(
+                    "flood disk max headroom ["
+                        + floodHeadroom.getStringRep()
+                        + "] more than high disk max headroom ["
+                        + highHeadroom.getStringRep()
+                        + "]"
+                );
+            }
+        }
+
+        @Override
+        public Iterator<Setting<?>> settings() {
+            return MAX_HEADROOM_VALIDATOR_SETTINGS_LIST.iterator();
         }
 
     }
@@ -219,14 +398,26 @@ public class DiskThresholdSettings {
         this.lowStageWatermark = lowWatermark;
     }
 
+    private void setLowStageMaxHeadroom(ByteSizeValue maxHeadroom) {
+        this.lowStageMaxHeadroom = maxHeadroom;
+    }
+
     private void setHighWatermark(RelativeByteSizeValue highWatermark) {
         this.highStageWatermark = highWatermark;
     }
 
+    private void setHighStageMaxHeadroom(ByteSizeValue maxHeadroom) {
+        this.highStageMaxHeadroom = maxHeadroom;
+    }
+
     private void setFloodStageWatermark(RelativeByteSizeValue floodStage) {
         this.floodStageWatermark = floodStage;
     }
 
+    private void setFloodStageMaxHeadroom(ByteSizeValue maxHeadroom) {
+        this.floodStageMaxHeadroom = maxHeadroom;
+    }
+
     private void setFrozenFloodStageWatermark(RelativeByteSizeValue floodStage) {
         this.frozenFloodStageWatermark = floodStage;
     }
@@ -240,42 +431,57 @@ public class DiskThresholdSettings {
         if (watermark.isAbsolute()) {
             return watermark.getAbsolute();
         }
-        return ByteSizeValue.ofBytes(total.getBytes() - watermark.calculateValue(total, maxHeadroom).getBytes());
+        return ByteSizeValue.subtract(total, watermark.calculateValue(total, maxHeadroom));
     }
 
     public ByteSizeValue getFreeBytesThresholdLowStage(ByteSizeValue total) {
-        return getFreeBytesThreshold(total, lowStageWatermark, ByteSizeValue.MINUS_ONE);
+        return getFreeBytesThreshold(total, lowStageWatermark, lowStageMaxHeadroom);
     }
 
     public ByteSizeValue getFreeBytesThresholdHighStage(ByteSizeValue total) {
-        return getFreeBytesThreshold(total, highStageWatermark, ByteSizeValue.MINUS_ONE);
+        return getFreeBytesThreshold(total, highStageWatermark, highStageMaxHeadroom);
     }
 
     public ByteSizeValue getFreeBytesThresholdFloodStage(ByteSizeValue total) {
-        return getFreeBytesThreshold(total, floodStageWatermark, ByteSizeValue.MINUS_ONE);
+        return getFreeBytesThreshold(total, floodStageWatermark, floodStageMaxHeadroom);
     }
 
     public ByteSizeValue getFreeBytesThresholdFrozenFloodStage(ByteSizeValue total) {
         return getFreeBytesThreshold(total, frozenFloodStageWatermark, frozenFloodStageMaxHeadroom);
     }
 
-    public ByteSizeValue getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue used) {
+    private ByteSizeValue getMinimumTotalSizeForBelowWatermark(
+        ByteSizeValue used,
+        RelativeByteSizeValue watermark,
+        ByteSizeValue maxHeadroom
+    ) {
         // If watermark is absolute, simply return total disk = used disk + free disk, where free disk bytes is the watermark value.
-        if (lowStageWatermark.isAbsolute()) {
-            return ByteSizeValue.ofBytes(lowStageWatermark.getAbsolute().getBytes() + used.getBytes());
+        if (watermark.isAbsolute()) {
+            return ByteSizeValue.add(watermark.getAbsolute(), used);
         }
 
-        // If watermark is percentage/ratio, calculate the total needed disk space.
-        double percentThreshold = lowStageWatermark.getRatio().getAsPercent();
+        double percentThreshold = watermark.getRatio().getAsPercent();
         if (percentThreshold >= 0.0 && percentThreshold < 100.0) {
-            // Use percentage instead of ratio, and multiple bytes with 100, to make division with double more accurate (issue #88791).
+            // If watermark is percentage/ratio, calculate the total needed disk space.
+            // Use percentage instead of ratio, and multiply bytes with 100, to make division with double more accurate (issue #88791).
             ByteSizeValue totalBytes = ByteSizeValue.ofBytes((long) Math.ceil((100 * used.getBytes()) / percentThreshold));
+
+            if (maxHeadroom.equals(ByteSizeValue.MINUS_ONE) == false) {
+                // If a max headroom is applicable, it can potentially require a smaller total size (used + maxHeadroom) to stay below
+                // the watermark.
+                totalBytes = ByteSizeValue.min(totalBytes, ByteSizeValue.add(used, maxHeadroom));
+            }
+
             return totalBytes;
         } else {
             return used;
         }
     }
 
+    public ByteSizeValue getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue used) {
+        return getMinimumTotalSizeForBelowWatermark(used, lowStageWatermark, lowStageMaxHeadroom);
+    }
+
     public boolean isEnabled() {
         return enabled;
     }
@@ -308,10 +514,10 @@ public class DiskThresholdSettings {
         return describeThreshold(
             total,
             lowStageWatermark,
-            ByteSizeValue.MINUS_ONE,
+            lowStageMaxHeadroom,
             includeSettingKey,
             CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(),
-            null
+            CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey()
         );
     }
 
@@ -319,10 +525,10 @@ public class DiskThresholdSettings {
         return describeThreshold(
             total,
             highStageWatermark,
-            ByteSizeValue.MINUS_ONE,
+            highStageMaxHeadroom,
             includeSettingKey,
             CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
-            null
+            CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey()
         );
     }
 
@@ -330,10 +536,10 @@ public class DiskThresholdSettings {
         return describeThreshold(
             total,
             floodStageWatermark,
-            ByteSizeValue.MINUS_ONE,
+            floodStageMaxHeadroom,
             includeSettingKey,
             CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(),
-            null
+            CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey()
         );
     }
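To make the wiring above concrete, here is a hedged usage sketch (test-style setup; the total size is arbitrary and the expected values are approximate):

    Settings settings = Settings.builder()
        .put("cluster.routing.allocation.disk.watermark.low", "85%")
        .put("cluster.routing.allocation.disk.watermark.low.max_headroom", "200gb")
        .put("cluster.routing.allocation.disk.watermark.high", "90%")
        .put("cluster.routing.allocation.disk.watermark.high.max_headroom", "150gb")
        .put("cluster.routing.allocation.disk.watermark.flood_stage", "95%")
        .put("cluster.routing.allocation.disk.watermark.flood_stage.max_headroom", "100gb")
        .build();
    DiskThresholdSettings diskThresholds = new DiskThresholdSettings(
        settings,
        new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)
    );

    ByteSizeValue total = ByteSizeValue.ofBytes(10L * 1024 * 1024 * 1024 * 1024); // 10 TiB
    // The headrooms cap the free-space thresholds at roughly 200gb/150gb/100gb,
    // instead of the ~1.5tb/~1tb/~512gb implied by the 85%/90%/95% watermarks alone.
    ByteSizeValue lowThreshold = diskThresholds.getFreeBytesThresholdLowStage(total);
    ByteSizeValue highThreshold = diskThresholds.getFreeBytesThresholdHighStage(total);
    ByteSizeValue floodThreshold = diskThresholds.getFreeBytesThresholdFloodStage(total);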
 

+ 9 - 1
server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java

@@ -40,19 +40,27 @@ import java.util.Set;
  * The {@link DiskThresholdDecider} checks that the node a shard is potentially
  * being allocated to has enough disk space.
  *
- * It has three configurable settings, all of which can be changed dynamically:
+ * It has the following configurable settings, all of which can be changed dynamically:
  *
  * <code>cluster.routing.allocation.disk.watermark.low</code> is the low disk
  * watermark. New shards will not allocated to a node with usage higher than this,
  * although this watermark may be passed by allocating a shard. It defaults to
  * 0.85 (85.0%).
  *
+ * <code>cluster.routing.allocation.disk.watermark.low.max_headroom</code> is the
+ * max headroom for the low watermark. Defaults to 200GB when the low watermark
+ * is not explicitly set. This caps the amount of free space required.
+ *
  * <code>cluster.routing.allocation.disk.watermark.high</code> is the high disk
  * watermark. If a node has usage higher than this, shards are not allowed to
  * remain on the node. In addition, if allocating a shard to a node causes the
  * node to pass this watermark, it will not be allowed. It defaults to
  * 0.90 (90.0%).
  *
+ * <code>cluster.routing.allocation.disk.watermark.high.max_headroom</code> is the
+ * max headroom for the high watermark. Defaults to 150GB when the high watermark
+ * is not explicitly set. This caps the amount of free space required.
+ *
  * The watermark settings are expressed in terms of used disk percentage/ratio, or
  * exact byte values for free space (like "500mb").
  *
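For intuition only, a simplified sketch of the kind of check the decider performs (the real decider distinguishes new shards, relocations, and the different watermarks; the helper below is hypothetical):

    // Would allocating a shard of the given size drop the node's free space below
    // the low-watermark free-bytes threshold? (Plain long arithmetic avoids the
    // -1 special case of ByteSizeValue.)
    static boolean allocationWouldBreachLowWatermark(
        DiskThresholdSettings thresholds,
        ByteSizeValue totalDisk,
        ByteSizeValue freeDisk,
        ByteSizeValue shardSize
    ) {
        long freeAfter = freeDisk.getBytes() - shardSize.getBytes();
        return freeAfter < thresholds.getFreeBytesThresholdLowStage(totalDisk).getBytes();
    }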

+ 3 - 0
server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

@@ -244,8 +244,11 @@ public final class ClusterSettings extends AbstractScopedSettings {
         ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING,
         DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE,
         DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING,
+        DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING,
         DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING,
+        DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING,
         DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING,
+        DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING,
         DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING,
         DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING,
         DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING,

+ 18 - 0
server/src/main/java/org/elasticsearch/common/settings/Setting.java

@@ -269,6 +269,24 @@ public class Setting<T> implements ToXContentObject {
         this(new SimpleKey(key), defaultValue, parser, properties);
     }
 
+    /**
+     * Creates a new Setting instance
+     * @param key the settings key for this setting.
+     * @param defaultValue a default value function that returns the default value's string representation.
+     * @param parser a parser that parses the string rep into a complex datatype.
+     * @param validator a {@link Validator} for validating this setting
+     * @param properties properties for this setting like scope, filtering...
+     */
+    public Setting(
+        String key,
+        Function<Settings, String> defaultValue,
+        Function<String, T> parser,
+        Validator<T> validator,
+        Property... properties
+    ) {
+        this(new SimpleKey(key), defaultValue, parser, validator, properties);
+    }
+
     /**
      * Creates a new Setting instance
      * @param key the settings key for this setting.

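A hedged sketch of how this new overload can be used to declare a dynamic byte-size setting with a validator. The setting key, default, and validator body below are illustrative assumptions, not code from this PR; only the constructor shape (key, default function, parser, validator, properties) comes from the change above.

    Setting<ByteSizeValue> exampleMaxHeadroom = new Setting<>(
        "example.disk.max_headroom",                                             // hypothetical key
        settings -> "-1",                                                        // default supplied as a Function<Settings, String>
        s -> ByteSizeValue.parseBytesSizeValue(s, "example.disk.max_headroom"),  // parser
        value -> {                                                               // Validator<ByteSizeValue>, hypothetical check
            if (value.equals(ByteSizeValue.MINUS_ONE) == false && value.getBytes() < 0) {
                throw new IllegalArgumentException("[example.disk.max_headroom] must be -1 or a non-negative size");
            }
        },
        Setting.Property.Dynamic,
        Setting.Property.NodeScope
    );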
+ 41 - 3
server/src/main/java/org/elasticsearch/common/unit/ByteSizeValue.java

@@ -323,13 +323,51 @@ public class ByteSizeValue implements Writeable, Comparable<ByteSizeValue>, ToXC
 
     @Override
     public int compareTo(ByteSizeValue other) {
-        long thisValue = size * unit.toBytes(1);
-        long otherValue = other.size * other.unit.toBytes(1);
-        return Long.compare(thisValue, otherValue);
+        return Long.compare(getBytes(), other.getBytes());
     }
 
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
         return builder.value(toString());
     }
+
+    /**
+     * @return Constructs a {@link ByteSizeValue} with the bytes resulting from the addition of the arguments' bytes. Note that the
+     *         resulting {@link ByteSizeUnit} is bytes.
+     * @throws IllegalArgumentException if any of the arguments have -1 bytes
+     */
+    public static ByteSizeValue add(ByteSizeValue x, ByteSizeValue y) {
+        if (x.equals(ByteSizeValue.MINUS_ONE) || y.equals(ByteSizeValue.MINUS_ONE)) {
+            throw new IllegalArgumentException("one of the arguments has -1 bytes");
+        }
+        return ByteSizeValue.ofBytes(Math.addExact(x.getBytes(), y.getBytes()));
+    }
+
+    /**
+     * @return Constructs a {@link ByteSizeValue} with the bytes resulting from the difference of the arguments' bytes. Note that the
+     *         resulting {@link ByteSizeUnit} is bytes.
+     * @throws IllegalArgumentException if any of the arguments or the result have -1 bytes
+     */
+    public static ByteSizeValue subtract(ByteSizeValue x, ByteSizeValue y) {
+        if (x.equals(ByteSizeValue.MINUS_ONE) || y.equals(ByteSizeValue.MINUS_ONE)) {
+            throw new IllegalArgumentException("one of the arguments has -1 bytes");
+        }
+        // No need to use Math.subtractExact here, since we know both arguments are >= 0.
+        ByteSizeValue res = ByteSizeValue.ofBytes(x.getBytes() - y.getBytes());
+        if (res.equals(ByteSizeValue.MINUS_ONE)) {
+            throw new IllegalArgumentException("subtraction result has -1 bytes");
+        }
+        return res;
+    }
+
+    /**
+     * @return Returns the lesser of the two given {@link ByteSizeValue} arguments. In case of equality, the first argument is returned.
+     * @throws IllegalArgumentException if any of the arguments have -1 bytes
+     */
+    public static ByteSizeValue min(ByteSizeValue x, ByteSizeValue y) {
+        if (x.equals(ByteSizeValue.MINUS_ONE) || y.equals(ByteSizeValue.MINUS_ONE)) {
+            throw new IllegalArgumentException("one of the arguments has -1 bytes");
+        }
+        return x.compareTo(y) <= 0 ? x : y;
+    }
 }

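A short usage sketch of the new static helpers (the values are arbitrary examples). Note that add and subtract always return a value whose unit is bytes, and that all three reject the MINUS_ONE sentinel.

    ByteSizeValue a = ByteSizeValue.ofGb(150);
    ByteSizeValue b = ByteSizeValue.ofMb(500);
    ByteSizeValue sum        = ByteSizeValue.add(a, b);       // 150.5GB, expressed in bytes
    ByteSizeValue difference = ByteSizeValue.subtract(a, b);  // 149.5GB, expressed in bytes
    ByteSizeValue smaller    = ByteSizeValue.min(a, b);       // returns b (500MB), keeping its original unit
    // ByteSizeValue.add(a, ByteSizeValue.MINUS_ONE) would throw IllegalArgumentException.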
+ 4 - 1
server/src/main/java/org/elasticsearch/common/unit/RelativeByteSizeValue.java

@@ -45,13 +45,16 @@ public class RelativeByteSizeValue {
 
     /**
      * Calculate the size to use, optionally catering for a max headroom.
+     * If a ratio/percentage is used, the resulting byte count is rounded up to the next integer.
      * @param total the total size to use
      * @param maxHeadroom the max headroom to cater for or null (or -1) to ignore.
      * @return the size to use
      */
     public ByteSizeValue calculateValue(ByteSizeValue total, ByteSizeValue maxHeadroom) {
         if (ratio != null) {
-            long ratioBytes = (long) Math.ceil(ratio.getAsRatio() * total.getBytes());
+            // Use percentage instead of ratio, and divide bytes by 100, to make the calculation with double more accurate.
+            double res = total.getBytes() * ratio.getAsPercent() / 100;
+            long ratioBytes = (long) Math.ceil(res);
             if (maxHeadroom != null && maxHeadroom.getBytes() != -1) {
                 return ByteSizeValue.ofBytes(Math.max(ratioBytes, total.getBytes() - maxHeadroom.getBytes()));
             } else {

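A worked example of the headroom-aware calculation, using the parse/calculate methods shown above. The 10,000GB total is an assumed figure and the 90%/150GB pair mirrors the high-watermark defaults mentioned earlier in this diff; the snippet is illustrative, not taken from the PR.

    RelativeByteSizeValue highWatermark =
        RelativeByteSizeValue.parseRelativeByteSizeValue("90%", "example.watermark.high");
    ByteSizeValue total = ByteSizeValue.ofGb(10_000);
    ByteSizeValue maxHeadroom = ByteSizeValue.ofGb(150);
    // Percentage alone: ceil(0.90 * 10000GB) = 9000GB used before the watermark trips.
    // With the headroom: max(9000GB, 10000GB - 150GB) = 9850GB used, i.e. at most 150GB must stay free.
    ByteSizeValue threshold = highWatermark.calculateValue(total, maxHeadroom);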
+ 75 - 10
server/src/main/java/org/elasticsearch/health/metadata/HealthMetadata.java

@@ -40,7 +40,7 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
     }
 
     public HealthMetadata(StreamInput in) throws IOException {
-        this.diskMetadata = new Disk(in);
+        this.diskMetadata = Disk.readFrom(in);
     }
 
     @Override
@@ -102,24 +102,50 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
      */
     public record Disk(
         RelativeByteSizeValue highWatermark,
+        ByteSizeValue highMaxHeadroom,
         RelativeByteSizeValue floodStageWatermark,
+        ByteSizeValue floodStageMaxHeadroom,
         RelativeByteSizeValue frozenFloodStageWatermark,
         ByteSizeValue frozenFloodStageMaxHeadroom
     ) implements ToXContentFragment, Writeable {
 
         public static final String TYPE = "disk";
+        public static final Version VERSION_SUPPORTING_HEADROOM_FIELDS = Version.V_8_5_0;
 
         private static final ParseField HIGH_WATERMARK_FIELD = new ParseField("high_watermark");
+        private static final ParseField HIGH_MAX_HEADROOM_FIELD = new ParseField("high_max_headroom");
         private static final ParseField FLOOD_STAGE_WATERMARK_FIELD = new ParseField("flood_stage_watermark");
+        private static final ParseField FLOOD_STAGE_MAX_HEADROOM_FIELD = new ParseField("flood_stage_max_headroom");
         private static final ParseField FROZEN_FLOOD_STAGE_WATERMARK_FIELD = new ParseField("frozen_flood_stage_watermark");
         private static final ParseField FROZEN_FLOOD_STAGE_MAX_HEADROOM_FIELD = new ParseField("frozen_flood_stage_max_headroom");
 
-        Disk(StreamInput in) throws IOException {
-            this(
-                RelativeByteSizeValue.parseRelativeByteSizeValue(in.readString(), HIGH_WATERMARK_FIELD.getPreferredName()),
-                RelativeByteSizeValue.parseRelativeByteSizeValue(in.readString(), FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()),
-                RelativeByteSizeValue.parseRelativeByteSizeValue(in.readString(), FROZEN_FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()),
-                new ByteSizeValue(in)
+        static Disk readFrom(StreamInput in) throws IOException {
+            RelativeByteSizeValue highWatermark = RelativeByteSizeValue.parseRelativeByteSizeValue(
+                in.readString(),
+                HIGH_WATERMARK_FIELD.getPreferredName()
+            );
+            RelativeByteSizeValue floodStageWatermark = RelativeByteSizeValue.parseRelativeByteSizeValue(
+                in.readString(),
+                FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()
+            );
+            RelativeByteSizeValue frozenFloodStageWatermark = RelativeByteSizeValue.parseRelativeByteSizeValue(
+                in.readString(),
+                FROZEN_FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()
+            );
+            ByteSizeValue frozenFloodStageMaxHeadroom = new ByteSizeValue(in);
+            ByteSizeValue highMaxHeadroom = in.getVersion().onOrAfter(VERSION_SUPPORTING_HEADROOM_FIELDS)
+                ? new ByteSizeValue(in)
+                : ByteSizeValue.MINUS_ONE;
+            ByteSizeValue floodStageMaxHeadroom = in.getVersion().onOrAfter(VERSION_SUPPORTING_HEADROOM_FIELDS)
+                ? new ByteSizeValue(in)
+                : ByteSizeValue.MINUS_ONE;
+            return new Disk(
+                highWatermark,
+                highMaxHeadroom,
+                floodStageWatermark,
+                floodStageMaxHeadroom,
+                frozenFloodStageWatermark,
+                frozenFloodStageMaxHeadroom
             );
         }
 
@@ -129,6 +155,10 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
             out.writeString(describeFloodStageWatermark());
             out.writeString(describeFrozenFloodStageWatermark());
             frozenFloodStageMaxHeadroom.writeTo(out);
+            if (out.getVersion().onOrAfter(VERSION_SUPPORTING_HEADROOM_FIELDS)) {
+                highMaxHeadroom.writeTo(out);
+                floodStageMaxHeadroom.writeTo(out);
+            }
         }
 
         @Override
@@ -139,7 +169,9 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
         @Override
         public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
             builder.field(HIGH_WATERMARK_FIELD.getPreferredName(), describeHighWatermark());
+            builder.field(HIGH_MAX_HEADROOM_FIELD.getPreferredName(), highMaxHeadroom);
             builder.field(FLOOD_STAGE_WATERMARK_FIELD.getPreferredName(), describeFloodStageWatermark());
+            builder.field(FLOOD_STAGE_MAX_HEADROOM_FIELD.getPreferredName(), floodStageMaxHeadroom);
             builder.field(FROZEN_FLOOD_STAGE_WATERMARK_FIELD.getPreferredName(), describeFrozenFloodStageWatermark());
             builder.field(FROZEN_FLOOD_STAGE_MAX_HEADROOM_FIELD.getPreferredName(), frozenFloodStageMaxHeadroom);
             return builder;
@@ -149,7 +181,7 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
             if (watermark.isAbsolute()) {
                 return watermark.getAbsolute();
             }
-            return ByteSizeValue.ofBytes(total.getBytes() - watermark.calculateValue(total, maxHeadroom).getBytes());
+            return ByteSizeValue.subtract(total, watermark.calculateValue(total, maxHeadroom));
         }
 
         public ByteSizeValue getFreeBytesHighWatermark(ByteSizeValue total) {
@@ -190,7 +222,9 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
             if (o == null || getClass() != o.getClass()) return false;
             Disk disk = (Disk) o;
             return Objects.equals(describeHighWatermark(), disk.describeHighWatermark())
+                && Objects.equals(highMaxHeadroom, disk.highMaxHeadroom)
                 && Objects.equals(describeFloodStageWatermark(), disk.describeFloodStageWatermark())
+                && Objects.equals(floodStageMaxHeadroom, disk.floodStageMaxHeadroom)
                 && Objects.equals(describeFrozenFloodStageWatermark(), disk.describeFrozenFloodStageWatermark())
                 && Objects.equals(frozenFloodStageMaxHeadroom, disk.frozenFloodStageMaxHeadroom);
         }
@@ -199,7 +233,9 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
         public int hashCode() {
             return Objects.hash(
                 describeHighWatermark(),
+                highMaxHeadroom,
                 describeFloodStageWatermark(),
+                floodStageMaxHeadroom,
                 describeFrozenFloodStageWatermark(),
                 frozenFloodStageMaxHeadroom
             );
@@ -216,13 +252,17 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
         public static class Builder {
 
             private RelativeByteSizeValue highWatermark;
+            private ByteSizeValue highMaxHeadroom;
             private RelativeByteSizeValue floodStageWatermark;
+            private ByteSizeValue floodStageMaxHeadroom;
             private RelativeByteSizeValue frozenFloodStageWatermark;
             private ByteSizeValue frozenFloodStageMaxHeadroom;
 
             private Builder(Disk disk) {
                 this.highWatermark = disk.highWatermark;
+                this.highMaxHeadroom = disk.highMaxHeadroom;
                 this.floodStageWatermark = disk.floodStageWatermark;
+                this.floodStageMaxHeadroom = disk.floodStageMaxHeadroom;
                 this.frozenFloodStageWatermark = disk.frozenFloodStageWatermark;
                 this.frozenFloodStageMaxHeadroom = disk.frozenFloodStageMaxHeadroom;
             }
@@ -238,15 +278,33 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
                 return highWatermark(RelativeByteSizeValue.parseRelativeByteSizeValue(highWatermark, setting));
             }
 
+            public Disk.Builder highMaxHeadroom(ByteSizeValue highMaxHeadroom) {
+                this.highMaxHeadroom = highMaxHeadroom;
+                return this;
+            }
+
+            public Disk.Builder highMaxHeadroom(String highMaxHeadroom, String setting) {
+                return highMaxHeadroom(ByteSizeValue.parseBytesSizeValue(highMaxHeadroom, setting));
+            }
+
             public Disk.Builder floodStageWatermark(RelativeByteSizeValue floodStageWatermark) {
                 this.floodStageWatermark = floodStageWatermark;
                 return this;
             }
 
-            public Builder floodStageWatermark(String floodStageWatermark, String setting) {
+            public Disk.Builder floodStageWatermark(String floodStageWatermark, String setting) {
                 return floodStageWatermark(RelativeByteSizeValue.parseRelativeByteSizeValue(floodStageWatermark, setting));
             }
 
+            public Disk.Builder floodStageMaxHeadroom(ByteSizeValue floodStageMaxHeadroom) {
+                this.floodStageMaxHeadroom = floodStageMaxHeadroom;
+                return this;
+            }
+
+            public Disk.Builder floodStageMaxHeadroom(String floodStageMaxHeadroom, String setting) {
+                return floodStageMaxHeadroom(ByteSizeValue.parseBytesSizeValue(floodStageMaxHeadroom, setting));
+            }
+
             public Disk.Builder frozenFloodStageWatermark(RelativeByteSizeValue frozenFloodStageWatermark) {
                 this.frozenFloodStageWatermark = frozenFloodStageWatermark;
                 return this;
@@ -266,7 +324,14 @@ public final class HealthMetadata extends AbstractNamedDiffable<ClusterState.Cus
             }
 
             public Disk build() {
-                return new Disk(highWatermark, floodStageWatermark, frozenFloodStageWatermark, frozenFloodStageMaxHeadroom);
+                return new Disk(
+                    highWatermark,
+                    highMaxHeadroom,
+                    floodStageWatermark,
+                    floodStageMaxHeadroom,
+                    frozenFloodStageWatermark,
+                    frozenFloodStageMaxHeadroom
+                );
             }
         }
     }

+ 21 - 0
server/src/main/java/org/elasticsearch/health/metadata/HealthMetadataService.java

@@ -29,7 +29,9 @@ import java.util.List;
 
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING;
+import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING;
+import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING;
 import static org.elasticsearch.health.node.selection.HealthNodeTaskExecutor.ENABLED_SETTING;
 
@@ -95,6 +97,17 @@ public class HealthMetadataService {
             )
         );
         clusterService.getClusterSettings().addSettingsUpdateConsumer(ENABLED_SETTING, this::enable);
+        clusterSettings.addSettingsUpdateConsumer(
+            CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING,
+            value -> updateOnSettingsUpdated(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), value.getStringRep())
+        );
+        clusterSettings.addSettingsUpdateConsumer(
+            CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING,
+            value -> updateOnSettingsUpdated(
+                CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(),
+                value.getStringRep()
+            )
+        );
     }
 
     private void enable(boolean enabled) {
@@ -207,6 +220,12 @@ public class HealthMetadataService {
             if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey().equals(setting)) {
                 builder.frozenFloodStageMaxHeadroom(value, setting);
             }
+            if (CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey().equals(setting)) {
+                builder.highMaxHeadroom(value, setting);
+            }
+            if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey().equals(setting)) {
+                builder.floodStageMaxHeadroom(value, setting);
+            }
             final var finalHealthMetadata = new HealthMetadata(builder.build());
             return finalHealthMetadata.equals(initialHealthMetadata)
                 ? clusterState
@@ -232,7 +251,9 @@ public class HealthMetadataService {
             final var finalHealthMetadata = new HealthMetadata(
                 new HealthMetadata.Disk(
                     CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.get(settings),
+                    CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.get(settings),
                     CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.get(settings),
+                    CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.get(settings),
                     CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.get(settings),
                     CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.get(settings)
                 )

+ 496 - 108
server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java

@@ -60,7 +60,7 @@ import static org.hamcrest.Matchers.equalTo;
 
 public class DiskThresholdMonitorTests extends ESAllocationTestCase {
 
-    public void testMarkFloodStageIndicesReadOnly() {
+    private void doTestMarkFloodStageIndicesReadOnly(boolean testMaxHeadroom) {
         AllocationService allocation = createAllocationService(
             Settings.builder().put("cluster.routing.allocation.node_concurrent_recoveries", 10).build()
         );
@@ -128,10 +128,26 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
             }
         };
 
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
         Map<String, DiskUsage> builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 30));
-        builder.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 100)));
+        builder.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(99).getBytes() : 4)
+        );
+        builder.put(
+            "node2",
+            new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(250).getBytes() : 30)
+        );
+        builder.put(
+            "frozen",
+            new DiskUsage(
+                "frozen",
+                "frozen",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100)
+            )
+        );
         final ClusterInfo initialClusterInfo = clusterInfo(builder);
         monitor.onNewInfo(initialClusterInfo);
         assertTrue(reroute.get()); // reroute on new nodes
@@ -144,9 +160,24 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
 
         indices.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 5));
-        builder.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(99).getBytes() : 4)
+        );
+        builder.put(
+            "node2",
+            new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(100).getBytes() : 5)
+        );
+        builder.put(
+            "frozen",
+            new DiskUsage(
+                "frozen",
+                "frozen",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4)
+            )
+        );
         currentTime.addAndGet(randomLongBetween(60000, 120000));
         monitor.onNewInfo(clusterInfo(builder));
         assertTrue(reroute.get());
@@ -195,15 +226,38 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indices.set(null);
         reroute.set(false);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 5));
-        builder.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(99).getBytes() : 4)
+        );
+        builder.put(
+            "node2",
+            new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(100).getBytes() : 5)
+        );
+        builder.put(
+            "frozen",
+            new DiskUsage(
+                "frozen",
+                "frozen",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder));
         assertTrue(reroute.get());
         assertEquals(Collections.singleton("test_1"), indices.get());
     }
 
-    public void testDoesNotSubmitRerouteTaskTooFrequently() {
+    public void testMarkFloodStageIndicesReadOnlyWithPercentages() {
+        doTestMarkFloodStageIndicesReadOnly(false);
+    }
+
+    public void testMarkFloodStageIndicesReadOnlyWithMaxHeadroom() {
+        doTestMarkFloodStageIndicesReadOnly(true);
+    }
+
+    private void doTestDoesNotSubmitRerouteTaskTooFrequently(boolean testMaxHeadroom) {
         final ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
             .nodes(DiscoveryNodes.builder().add(newNormalNode("node1")).add(newNormalNode("node2")))
             .build();
@@ -227,13 +281,32 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
             }
         };
 
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
         Map<String, DiskUsage> allDisksOk = new HashMap<>();
-        allDisksOk.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 50));
-        allDisksOk.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 50));
+        allDisksOk.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(500).getBytes() : 50)
+        );
+        allDisksOk.put(
+            "node2",
+            new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(500).getBytes() : 50)
+        );
 
         Map<String, DiskUsage> oneDiskAboveWatermark = new HashMap<>();
-        oneDiskAboveWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9)));
-        oneDiskAboveWatermark.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 50));
+        oneDiskAboveWatermark.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(101, 149)).getBytes() : between(5, 9)
+            )
+        );
+        oneDiskAboveWatermark.put(
+            "node2",
+            new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(500).getBytes() : 50)
+        );
 
         // should reroute when receiving info about previously-unknown nodes
         currentTime.addAndGet(randomLongBetween(0, 120000));
@@ -326,7 +399,10 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         // should reroute again when one disk has reserved space that pushes it over the high watermark
         Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpaces = Map.of(
             new ClusterInfo.NodeAndPath("node1", "/foo/bar"),
-            new ClusterInfo.ReservedSpace.Builder().add(new ShardId("baz", "quux", 0), between(41, 100)).build()
+            new ClusterInfo.ReservedSpace.Builder().add(
+                new ShardId("baz", "quux", 0),
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(401, 10000)).getBytes() : between(41, 100)
+            ).build()
         );
 
         currentTime.addAndGet(
@@ -338,10 +414,17 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         monitor.onNewInfo(clusterInfo(allDisksOk, reservedSpaces));
         assertNotNull(listenerReference.get());
         listenerReference.getAndSet(null).onResponse(null);
+    }
+
+    public void testDoesNotSubmitRerouteTaskTooFrequentlyWithPercentages() {
+        doTestDoesNotSubmitRerouteTaskTooFrequently(false);
+    }
 
+    public void testDoesNotSubmitRerouteTaskTooFrequentlyWithMaxHeadroom() {
+        doTestDoesNotSubmitRerouteTaskTooFrequently(true);
     }
 
-    public void testAutoReleaseIndices() {
+    private void doTestAutoReleaseIndices(boolean testMaxHeadroom) {
         AtomicReference<Set<String>> indicesToMarkReadOnly = new AtomicReference<>();
         AtomicReference<Set<String>> indicesToRelease = new AtomicReference<>();
         AllocationService allocation = createAllocationService(
@@ -362,13 +445,15 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         );
         assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(8));
 
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+
         Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpaces = new HashMap<>();
-        final int reservedSpaceNode1 = between(0, 10);
+        final long reservedSpaceNode1 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 150)).getBytes() : between(0, 10);
         reservedSpaces.put(
             new ClusterInfo.NodeAndPath("node1", "/foo/bar"),
             new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode1).build()
         );
-        final int reservedSpaceNode2 = between(0, 10);
+        final long reservedSpaceNode2 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 150)).getBytes() : between(0, 10);
         reservedSpaces.put(
             new ClusterInfo.NodeAndPath("node2", "/foo/bar"),
             new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode2).build()
@@ -399,8 +484,26 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         Map<String, DiskUsage> builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indicesToMarkReadOnly.get());
         assertNull(indicesToRelease.get());
@@ -409,8 +512,26 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 90)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 90)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 9850)).getBytes() : between(5, 90)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 9850)).getBytes() : between(5, 90)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertNull(indicesToMarkReadOnly.get());
         assertNull(indicesToRelease.get());
@@ -452,22 +573,58 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
                 listener.onResponse(null);
             }
         };
-        // When free disk on any of node1 or node2 goes below 5% flood watermark, then apply index block on indices not having the block
+        // When free disk on any of node1 or node2 goes below the flood watermark, then apply index block on indices not having the block
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 100)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertThat(indicesToMarkReadOnly.get(), contains("test_1"));
         assertNull(indicesToRelease.get());
 
-        // When free disk on node1 and node2 goes above 10% high watermark then release index block, ignoring reserved space
+        // When free disk on node1 and node2 goes above the high watermark then release index block, ignoring reserved space
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 100)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(10, 100)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(150, 10000)).getBytes() : between(10, 100)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(150, 10000)).getBytes() : between(10, 100)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertNull(indicesToMarkReadOnly.get());
         assertThat(indicesToRelease.get(), contains("test_2"));
@@ -476,7 +633,16 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder));
         assertThat(indicesToMarkReadOnly.get(), contains("test_1"));
         assertNull(indicesToRelease.get());
@@ -485,10 +651,37 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 100)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 149)).getBytes() : between(5, 9)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(5, 100)
+            )
+        );
         if (randomBoolean()) {
-            builder.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100)));
+            builder.put(
+                "node3",
+                new DiskUsage(
+                    "node3",
+                    "node3",
+                    "/foo/bar",
+                    totalBytes,
+                    testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100)
+                )
+            );
         }
         monitor.onNewInfo(clusterInfo(builder));
         assertNull(indicesToMarkReadOnly.get());
@@ -498,9 +691,27 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 100)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(5, 100)
+            )
+        );
         if (randomBoolean()) {
-            builder.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100)));
+            builder.put(
+                "node3",
+                new DiskUsage(
+                    "node3",
+                    "node3",
+                    "/foo/bar",
+                    totalBytes,
+                    testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100)
+                )
+            );
         }
         monitor.onNewInfo(clusterInfo(builder));
         assertNull(indicesToMarkReadOnly.get());
@@ -510,16 +721,42 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
         if (randomBoolean()) {
-            builder.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100)));
+            builder.put(
+                "node3",
+                new DiskUsage(
+                    "node3",
+                    "node3",
+                    "/foo/bar",
+                    totalBytes,
+                    testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100)
+                )
+            );
         }
         monitor.onNewInfo(clusterInfo(builder));
         assertThat(indicesToMarkReadOnly.get(), contains("test_1"));
         assertNull(indicesToRelease.get());
     }
 
-    public void testNoAutoReleaseOfIndicesOnReplacementNodes() {
+    public void testAutoReleaseIndicesWithPercentages() {
+        doTestAutoReleaseIndices(false);
+    }
+
+    public void testAutoReleaseIndicesWithMaxHeadroom() {
+        doTestAutoReleaseIndices(true);
+    }
+
+    private void doTestNoAutoReleaseOfIndicesOnReplacementNodes(boolean testMaxHeadroom) {
         AtomicReference<Set<String>> indicesToMarkReadOnly = new AtomicReference<>();
         AtomicReference<Set<String>> indicesToRelease = new AtomicReference<>();
         AtomicReference<ClusterState> currentClusterState = new AtomicReference<>();
@@ -541,13 +778,15 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         );
         assertThat(RoutingNodesHelper.shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(8));
 
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+
         Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpaces = new HashMap<>();
-        final int reservedSpaceNode1 = between(0, 10);
+        final long reservedSpaceNode1 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 150)).getBytes() : between(0, 10);
         reservedSpaces.put(
             new ClusterInfo.NodeAndPath("node1", "/foo/bar"),
             new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode1).build()
         );
-        final int reservedSpaceNode2 = between(0, 10);
+        final long reservedSpaceNode2 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 150)).getBytes() : between(0, 10);
         reservedSpaces.put(
             new ClusterInfo.NodeAndPath("node2", "/foo/bar"),
             new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode2).build()
@@ -580,8 +819,26 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         Map<String, DiskUsage> builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indicesToMarkReadOnly.get());
         assertNull(indicesToRelease.get());
@@ -590,8 +847,26 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 90)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 90)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 9850)).getBytes() : between(5, 90)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 9850)).getBytes() : between(5, 90)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertNull(indicesToMarkReadOnly.get());
         assertNull(indicesToRelease.get());
@@ -643,12 +918,30 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
 
         currentClusterState.set(clusterStateWithBlocks);
 
-        // When free disk on any of node1 or node2 goes below 5% flood watermark, then apply index block on indices not having the block
+        // When free disk on any of node1 or node2 goes below the flood watermark, then apply index block on indices not having the block
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 100)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 99)).getBytes() : between(0, 4)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertThat(indicesToMarkReadOnly.get(), contains("test_1"));
         assertNull(indicesToRelease.get());
@@ -657,8 +950,26 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 100)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(10, 100)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(150, 10000)).getBytes() : between(10, 100)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(150, 10000)).getBytes() : between(10, 100)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertNull(indicesToMarkReadOnly.get());
         assertNull(indicesToRelease.get());
@@ -676,15 +987,40 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         indicesToMarkReadOnly.set(null);
         indicesToRelease.set(null);
         builder = new HashMap<>();
-        builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 100)));
-        builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(10, 100)));
+        builder.put(
+            "node1",
+            new DiskUsage(
+                "node1",
+                "node1",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(150, 10000)).getBytes() : between(10, 100)
+            )
+        );
+        builder.put(
+            "node2",
+            new DiskUsage(
+                "node2",
+                "node2",
+                "/foo/bar",
+                totalBytes,
+                testMaxHeadroom ? ByteSizeValue.ofGb(between(150, 10000)).getBytes() : between(10, 100)
+            )
+        );
         monitor.onNewInfo(clusterInfo(builder, reservedSpaces));
         assertNull(indicesToMarkReadOnly.get());
         assertThat(indicesToRelease.get(), contains("test_2"));
     }
 
-    @TestLogging(value = "org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor:INFO", reason = "testing INFO/WARN logging")
-    public void testDiskMonitorLogging() throws IllegalAccessException {
+    public void testNoAutoReleaseOfIndicesOnReplacementNodesWithPercentages() {
+        doTestNoAutoReleaseOfIndicesOnReplacementNodes(false);
+    }
+
+    public void testNoAutoReleaseOfIndicesOnReplacementNodesWithMaxHeadroom() {
+        doTestNoAutoReleaseOfIndicesOnReplacementNodes(true);
+    }
+
+    private void doTestDiskMonitorLogging(boolean testHeadroom) throws IllegalAccessException {
         final ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
             .nodes(DiscoveryNodes.builder().add(newNormalNode("node1")).add(newFrozenOnlyNode("frozen")))
             .build();
@@ -725,53 +1061,62 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
             }
         };
 
+        long thousandTb = ByteSizeValue.ofTb(1000).getBytes();
+        long total = testHeadroom ? thousandTb : 100;
+
         Map<String, DiskUsage> allDisksOk = new HashMap<>();
-        allDisksOk.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(15, 100)));
-        if (randomBoolean()) {
-            allDisksOk.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(15, 100)));
-        } else {
-            allDisksOk.put(
+        allDisksOk.put("node1", new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(200, 1000) : between(15, 100)));
+        allDisksOk.put(
+            "frozen",
+            new DiskUsage(
                 "frozen",
-                new DiskUsage(
-                    "frozen",
-                    "frozen",
-                    "/foo/bar",
-                    ByteSizeValue.ofGb(1000).getBytes(),
-                    (randomBoolean() ? ByteSizeValue.ofGb(between(20, 1000)) : ByteSizeValue.ofGb(between(20, 50))).getBytes()
-                )
-            );
-        }
+                "frozen",
+                "/foo/bar",
+                total,
+                testHeadroom ? (randomBoolean() ? betweenGb(20, 1000) : betweenGb(20, 50)) : between(15, 100)
+            )
+        );
 
         Map<String, DiskUsage> aboveLowWatermark = new HashMap<>();
-        aboveLowWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 14)));
-        aboveLowWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(10, 14)));
+        aboveLowWatermark.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(150, 199) : between(10, 14))
+        );
+        aboveLowWatermark.put(
+            "frozen",
+            new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(150, 199) : between(10, 14))
+        );
 
         Map<String, DiskUsage> aboveHighWatermark = new HashMap<>();
-        aboveHighWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9)));
-        aboveHighWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(5, 9)));
+        aboveHighWatermark.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(100, 149) : between(5, 9))
+        );
+        aboveHighWatermark.put(
+            "frozen",
+            new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(20, 99) : between(5, 9))
+        );
 
         Map<String, DiskUsage> aboveFloodStageWatermark = new HashMap<>();
-        aboveFloodStageWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
+        aboveFloodStageWatermark.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(0, 99) : between(0, 4))
+        );
         // frozen is below flood stage, so no logging from it.
-        aboveFloodStageWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(5, 9)));
+        aboveFloodStageWatermark.put(
+            "frozen",
+            new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(20, 99) : between(5, 9))
+        );
 
         Map<String, DiskUsage> frozenAboveFloodStageWatermark = new HashMap<>();
         // node1 is below low watermark, so no logging from it.
-        frozenAboveFloodStageWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(15, 100)));
-        frozenAboveFloodStageWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 4)));
-
-        Map<String, DiskUsage> frozenAboveFloodStageMaxHeadroom = new HashMap<>();
-        // node1 is below low watermark, so no logging from it.
-        frozenAboveFloodStageMaxHeadroom.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(15, 100)));
-        frozenAboveFloodStageMaxHeadroom.put(
+        frozenAboveFloodStageWatermark.put(
+            "node1",
+            new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(200, 1000) : between(15, 100))
+        );
+        frozenAboveFloodStageWatermark.put(
             "frozen",
-            new DiskUsage(
-                "frozen",
-                "frozen",
-                "/foo/bar",
-                ByteSizeValue.ofGb(1000).getBytes(),
-                ByteSizeValue.ofGb(between(0, 19)).getBytes()
-            )
+            new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(0, 19) : between(0, 4))
         );
 
         advanceTime.set(true); // first check sees new nodes and triggers a reroute
@@ -779,17 +1124,24 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         advanceTime.set(randomBoolean()); // no new nodes so no reroute delay needed
         assertNoLogging(monitor, allDisksOk);
 
+        String lowWatermarkString = testHeadroom ? "max_headroom=200gb" : "85%";
+        String highWatermarkString = testHeadroom ? "max_headroom=150gb" : "90%";
+        String floodWatermarkString = testHeadroom ? "max_headroom=100gb" : "95%";
+        String frozenFloodWatermarkString = testHeadroom ? "max_headroom=20gb" : "95%";
+
         assertSingleInfoMessage(
             monitor,
             aboveLowWatermark,
-            "low disk watermark [85%] exceeded on *node1* replicas will not be assigned to this node"
+            "low disk watermark [" + lowWatermarkString + "] exceeded on *node1* replicas will not be assigned to this node"
         );
 
         advanceTime.set(false); // will do one reroute and emit warnings, but subsequent reroutes and associated messages are delayed
         assertSingleWarningMessage(
             monitor,
             aboveHighWatermark,
-            "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* "
+            "high disk watermark ["
+                + highWatermarkString
+                + "] exceeded on *node1* shards will be relocated away from this node* "
                 + "the node is expected to continue to exceed the high disk watermark when these relocations are complete"
         );
 
@@ -797,7 +1149,9 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         assertRepeatedWarningMessages(
             monitor,
             aboveHighWatermark,
-            "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* "
+            "high disk watermark ["
+                + highWatermarkString
+                + "] exceeded on *node1* shards will be relocated away from this node* "
                 + "the node is expected to continue to exceed the high disk watermark when these relocations are complete"
         );
 
@@ -805,15 +1159,19 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         assertRepeatedWarningMessages(
             monitor,
             aboveFloodStageWatermark,
-            "flood stage disk watermark [95%] exceeded on *node1* all indices on this node will be marked read-only"
+            "flood stage disk watermark ["
+                + floodWatermarkString
+                + "] exceeded on *node1* all indices on this node will be marked read-only"
         );
 
-        relocatingShardSizeRef.set(-5L);
+        relocatingShardSizeRef.set(testHeadroom ? (-1L) * ByteSizeValue.ofGb(100).getBytes() : -5L);
         advanceTime.set(true);
         assertSingleInfoMessage(
             monitor,
             aboveHighWatermark,
-            "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* "
+            "high disk watermark ["
+                + highWatermarkString
+                + "] exceeded on *node1* shards will be relocated away from this node* "
                 + "the node is expected to be below the high disk watermark when these relocations are complete"
         );
 
@@ -823,7 +1181,9 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         assertSingleWarningMessage(
             monitor,
             aboveHighWatermark,
-            "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* "
+            "high disk watermark ["
+                + highWatermarkString
+                + "] exceeded on *node1* shards will be relocated away from this node* "
                 + "the node is expected to continue to exceed the high disk watermark when these relocations are complete"
         );
 
@@ -831,7 +1191,9 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         assertRepeatedWarningMessages(
             monitor,
             aboveHighWatermark,
-            "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* "
+            "high disk watermark ["
+                + highWatermarkString
+                + "] exceeded on *node1* shards will be relocated away from this node* "
                 + "the node is expected to continue to exceed the high disk watermark when these relocations are complete"
         );
 
@@ -839,56 +1201,78 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         assertSingleInfoMessage(
             monitor,
             aboveLowWatermark,
-            "high disk watermark [90%] no longer exceeded on *node1* but low disk watermark [85%] is still exceeded"
+            "high disk watermark ["
+                + highWatermarkString
+                + "] no longer exceeded on *node1* but low disk watermark ["
+                + lowWatermarkString
+                + "] is still exceeded"
         );
 
         advanceTime.set(true); // only log about dropping below the low disk watermark on a reroute
-        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*");
+        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*");
 
         advanceTime.set(randomBoolean());
         assertRepeatedWarningMessages(
             monitor,
             aboveFloodStageWatermark,
-            "flood stage disk watermark [95%] exceeded on *node1* all indices on this node will be marked read-only"
+            "flood stage disk watermark ["
+                + floodWatermarkString
+                + "] exceeded on *node1* all indices on this node will be marked read-only"
         );
 
-        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*");
+        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*");
 
         advanceTime.set(true);
         assertRepeatedWarningMessages(
             monitor,
             aboveHighWatermark,
-            "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* "
+            "high disk watermark ["
+                + highWatermarkString
+                + "] exceeded on *node1* shards will be relocated away from this node* "
                 + "the node is expected to continue to exceed the high disk watermark when these relocations are complete"
         );
 
-        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*");
+        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*");
 
         assertRepeatedWarningMessages(
             monitor,
             aboveFloodStageWatermark,
-            "flood stage disk watermark [95%] exceeded on *node1* all indices on this node will be marked read-only"
+            "flood stage disk watermark ["
+                + floodWatermarkString
+                + "] exceeded on *node1* all indices on this node will be marked read-only"
         );
 
         assertSingleInfoMessage(
             monitor,
             aboveLowWatermark,
-            "high disk watermark [90%] no longer exceeded on *node1* but low disk watermark [85%] is still exceeded"
+            "high disk watermark ["
+                + highWatermarkString
+                + "] no longer exceeded on *node1* but low disk watermark ["
+                + lowWatermarkString
+                + "] is still exceeded"
         );
 
-        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*");
-
-        assertRepeatedWarningMessages(monitor, frozenAboveFloodStageWatermark, "flood stage disk watermark [95%] exceeded on *frozen*");
+        assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*");
 
         assertRepeatedWarningMessages(
             monitor,
-            frozenAboveFloodStageMaxHeadroom,
-            "flood stage disk watermark [max_headroom=20gb] exceeded on *frozen*"
+            frozenAboveFloodStageWatermark,
+            "flood stage disk watermark [" + frozenFloodWatermarkString + "] exceeded on *frozen*"
         );
 
         assertNoLogging(monitor, allDisksOk);
     }
 
+    @TestLogging(value = "org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor:INFO", reason = "testing INFO/WARN logging")
+    public void testDiskMonitorLoggingWithPercentages() throws IllegalAccessException {
+        doTestDiskMonitorLogging(false);
+    }
+
+    @TestLogging(value = "org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor:INFO", reason = "testing INFO/WARN logging")
+    public void testDiskMonitorLoggingWithMaxHeadrooms() throws IllegalAccessException {
+        doTestDiskMonitorLogging(true);
+    }
+
     private void assertNoLogging(DiskThresholdMonitor monitor, Map<String, DiskUsage> diskUsages) throws IllegalAccessException {
         MockLogAppender mockAppender = new MockLogAppender();
         mockAppender.start();
@@ -956,6 +1340,10 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase {
         mockAppender.stop();
     }
 
+    private static long betweenGb(int min, int max) {
+        return ByteSizeValue.ofGb(between(min, max)).getBytes();
+    }
+
     private static ClusterInfo clusterInfo(Map<String, DiskUsage> diskUsages) {
         return clusterInfo(diskUsages, Map.of());
     }
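
A note on the expected log strings above: the assertions switch from fixed percentages such as [90%] to variables like highWatermarkString because, with the new settings, the free space required for each stage is effectively the smaller of the percentage-derived amount and the configured max headroom. Small disks therefore report a percentage while very large disks report a max_headroom value. A minimal sketch of that selection, with hypothetical names and simplified rounding (the real logic lives in DiskThresholdSettings, which also handles absolute watermarks):

    import org.elasticsearch.common.unit.ByteSizeValue;

    final class HeadroomSelectionSketch {
        // Illustration only: pick the tighter of the percentage-derived free-space requirement
        // and the configured max headroom; a negative headroom means "not set".
        static long requiredFreeBytes(long totalBytes, double usedWatermarkPercent, long maxHeadroomBytes) {
            long fromPercentage = (long) Math.ceil(totalBytes * (100.0 - usedWatermarkPercent) / 100.0);
            return maxHeadroomBytes < 0 ? fromPercentage : Math.min(fromPercentage, maxHeadroomBytes);
        }

        public static void main(String[] args) {
            long smallDisk = 100;                                 // bytes, as in the percentage variants above
            long hugeDisk = ByteSizeValue.ofTb(1000).getBytes();  // as in the settings tests below
            long defaultHighHeadroom = ByteSizeValue.ofGb(150).getBytes();
            // Small disk: the 90% high watermark is the tighter bound (10 bytes free required),
            // so the threshold is described as a percentage.
            System.out.println(requiredFreeBytes(smallDisk, 90.0, defaultHighHeadroom));
            // Huge disk: the 150GB default max headroom is the tighter bound,
            // so the threshold is described as max_headroom=150gb.
            System.out.println(requiredFreeBytes(hugeDisk, 90.0, defaultHighHeadroom));
        }
    }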

+ 442 - 9
server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java

@@ -59,9 +59,30 @@ public class DiskThresholdSettingsTests extends ESTestCase {
 
         // Test default watermark max headroom values
         ByteSizeValue thousandTb = ByteSizeValue.parseBytesSizeValue("1000tb", "test");
+        ByteSizeValue lowHeadroom = ByteSizeValue.parseBytesSizeValue("200gb", "test");
+        ByteSizeValue highHeadroom = ByteSizeValue.parseBytesSizeValue("150gb", "test");
+        ByteSizeValue floodHeadroom = ByteSizeValue.parseBytesSizeValue("100gb", "test");
         ByteSizeValue frozenFloodHeadroom = ByteSizeValue.parseBytesSizeValue("20gb", "test");
+        assertEquals(lowHeadroom, diskThresholdSettings.getFreeBytesThresholdLowStage(thousandTb));
+        assertEquals(highHeadroom, diskThresholdSettings.getFreeBytesThresholdHighStage(thousandTb));
+        assertEquals(floodHeadroom, diskThresholdSettings.getFreeBytesThresholdFloodStage(thousandTb));
         assertEquals(frozenFloodHeadroom, diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(thousandTb));
+        assertEquals("max_headroom=200gb", diskThresholdSettings.describeLowThreshold(thousandTb, false));
+        assertEquals("max_headroom=150gb", diskThresholdSettings.describeHighThreshold(thousandTb, false));
+        assertEquals("max_headroom=100gb", diskThresholdSettings.describeFloodStageThreshold(thousandTb, false));
         assertEquals("max_headroom=20gb", diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, false));
+        assertEquals(
+            DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey() + "=" + "200gb",
+            diskThresholdSettings.describeLowThreshold(thousandTb, true)
+        );
+        assertEquals(
+            DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey() + "=" + "150gb",
+            diskThresholdSettings.describeHighThreshold(thousandTb, true)
+        );
+        assertEquals(
+            DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey() + "=" + "100gb",
+            diskThresholdSettings.describeFloodStageThreshold(thousandTb, true)
+        );
         assertEquals(
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey() + "=" + "20gb",
             diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, true)
@@ -79,8 +100,13 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             ByteSizeValue.ofBytes(1000),
             diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(850))
         );
+        // For 100TB used bytes, the max headroom should cap the minimum required free space at 200GB. So we need 100TB+200GB total bytes.
+        assertEquals(
+            ByteSizeValue.add(ByteSizeValue.ofTb(100), ByteSizeValue.ofGb(200)),
+            diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofTb(100))
+        );
 
-        // Test random factor.
+        // Test random factor. Keep the values low so that the max headroom does not apply.
         final long factor = between(1, 1000);
         assertThat(
             diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(85 * factor)),
@@ -98,17 +124,18 @@ public class DiskThresholdSettingsTests extends ESTestCase {
 
         // For 850 used bytes, we need 850b + 1GB total bytes.
         assertEquals(
-            ByteSizeValue.ofBytes(ByteSizeValue.ofGb(1).getBytes() + ByteSizeValue.ofBytes(850).getBytes()),
+            ByteSizeValue.add(ByteSizeValue.ofGb(1), ByteSizeValue.ofBytes(850)),
             diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(850))
         );
         // For 100TB used bytes, we need 100TB+1GB total bytes.
         assertEquals(
-            ByteSizeValue.ofBytes(ByteSizeValue.ofTb(100).getBytes() + ByteSizeValue.ofGb(1).getBytes()),
+            ByteSizeValue.add(ByteSizeValue.ofTb(100), ByteSizeValue.ofGb(1)),
             diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofTb(100))
         );
 
         newSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "0.50")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "-1")
             .build();
         nss.applySettings(newSettings);
 
@@ -125,6 +152,7 @@ public class DiskThresholdSettingsTests extends ESTestCase {
 
         newSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "0.50")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "500gb")
             .build();
         nss.applySettings(newSettings);
 
@@ -133,6 +161,11 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             ByteSizeValue.ofBytes(1700),
             diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(850))
         );
+        // For 100TB used bytes, the max headroom should cap the minimum required free space at 500GB. So we need 100TB+500GB total bytes.
+        assertEquals(
+            ByteSizeValue.add(ByteSizeValue.ofTb(100), ByteSizeValue.ofGb(500)),
+            diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofTb(100))
+        );
 
         // Test random percentage
 
@@ -148,6 +181,26 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             Matchers.equalTo(ByteSizeValue.ofBytes(100L * factor))
         );
 
+        // Test case for 32547 used bytes & threshold 0.57. Should return 57100 bytes.
+        newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "57%")
+            .build();
+        nss.applySettings(newSettings);
+        assertThat(
+            diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(32547L)),
+            Matchers.equalTo(ByteSizeValue.ofBytes(57100L))
+        );
+
+        // Test case for 4080 used bytes & threshold 0.68. Should return 6000 bytes.
+        newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "68%")
+            .build();
+        nss.applySettings(newSettings);
+        assertThat(
+            diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(4080L)),
+            Matchers.equalTo(ByteSizeValue.ofBytes(6000))
+        );
+
         // Test case for 17777 used bytes & threshold 0.29. Should return 61300 bytes. Test case originates from issue #88791.
         newSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "29%")
@@ -158,6 +211,42 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             Matchers.equalTo(ByteSizeValue.ofBytes(61300))
         );
 
+        // Test case for 90 used bytes & threshold 0.90. Should return 100 bytes.
+        newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%")
+            .build();
+        nss.applySettings(newSettings);
+        assertThat(
+            diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(90L)),
+            Matchers.equalTo(ByteSizeValue.ofBytes(100L))
+        );
+
+        // Test case for 90 used bytes & threshold 0.90 & max headroom of 1 byte. Should return 91 bytes.
+        newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "1b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "1b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "1b")
+            .build();
+        nss.applySettings(newSettings);
+        assertThat(
+            diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(90L)),
+            Matchers.equalTo(ByteSizeValue.ofBytes(91L))
+        );
+
+        // Test case for 90 used bytes & threshold 0.90 & max headroom of 0 bytes. Should return 90 bytes.
+        newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "0b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "0b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "0b")
+            .build();
+        nss.applySettings(newSettings);
+        assertThat(
+            diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(90L)),
+            Matchers.equalTo(ByteSizeValue.ofBytes(90L))
+        );
+
         // Test random absolute values
 
         final long absolute = between(1, 1000);
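
The extra cases above (32547 bytes at 57%, 4080 at 68%, 90 at 90%, and the 1-byte and 0-byte headrooms) pin down both the rounding of getMinimumTotalSizeForBelowLowWatermark and the way a max headroom caps it. A rough sketch of the asserted arithmetic, using a hypothetical signature on raw longs and an integer percentage (the production method takes ByteSizeValue values and also handles absolute watermarks):

    // Illustration only, ignoring overflow: the smallest total size that keeps the node at or
    // below the low watermark, optionally capped by the low max headroom (-1 means "not set").
    static long minimumTotalForBelowLowWatermark(long usedBytes, int lowWatermarkPercent, long maxHeadroomBytes) {
        // ceiling division: smallest total such that usedBytes <= total * lowWatermarkPercent / 100
        long fromPercentage = (usedBytes * 100 + lowWatermarkPercent - 1) / lowWatermarkPercent;
        if (maxHeadroomBytes >= 0 && usedBytes + maxHeadroomBytes < fromPercentage) {
            return usedBytes + maxHeadroomBytes; // the max headroom is the tighter bound
        }
        return fromPercentage;
    }
    // Matches the cases above: (32547, 57, -1) -> 57100, (4080, 68, -1) -> 6000,
    // (90, 90, -1) -> 100, (90, 90, 1) -> 91, (90, 90, 0) -> 90.
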
@@ -230,6 +319,138 @@ public class DiskThresholdSettingsTests extends ESTestCase {
         assertFalse(diskThresholdSettings.isEnabled());
     }
 
+    public void testUpdateMaxHeadroomValues() {
+        ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss);
+
+        Settings newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false)
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "1000mb")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "500mb")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "250mb")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "150mb")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s")
+            .build();
+        nss.applySettings(newSettings);
+
+        // Test that default percentage values apply
+        ByteSizeValue hundredBytes = ByteSizeValue.parseBytesSizeValue("100b", "test");
+        assertEquals(ByteSizeValue.ofBytes(15), diskThresholdSettings.getFreeBytesThresholdLowStage(hundredBytes));
+        assertEquals(ByteSizeValue.ofBytes(10), diskThresholdSettings.getFreeBytesThresholdHighStage(hundredBytes));
+        assertEquals(ByteSizeValue.ofBytes(5), diskThresholdSettings.getFreeBytesThresholdFloodStage(hundredBytes));
+        assertEquals(ByteSizeValue.ofBytes(5), diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(hundredBytes));
+
+        // Test that max headroom values apply
+        ByteSizeValue thousandTb = ByteSizeValue.parseBytesSizeValue("1000tb", "test");
+        ByteSizeValue lowHeadroom = ByteSizeValue.parseBytesSizeValue("1000mb", "test");
+        ByteSizeValue highHeadroom = ByteSizeValue.parseBytesSizeValue("500mb", "test");
+        ByteSizeValue floodHeadroom = ByteSizeValue.parseBytesSizeValue("250mb", "test");
+        ByteSizeValue frozenFloodHeadroom = ByteSizeValue.parseBytesSizeValue("150mb", "test");
+        assertEquals(lowHeadroom, diskThresholdSettings.getFreeBytesThresholdLowStage(thousandTb));
+        assertEquals(highHeadroom, diskThresholdSettings.getFreeBytesThresholdHighStage(thousandTb));
+        assertEquals(floodHeadroom, diskThresholdSettings.getFreeBytesThresholdFloodStage(thousandTb));
+        assertEquals(frozenFloodHeadroom, diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(thousandTb));
+    }
+
+    public void testUpdateMaxHeadroomValuesLowValues() {
+        ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss);
+
+        Settings newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false)
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "1b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "0b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "0b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "0b")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s")
+            .build();
+        nss.applySettings(newSettings);
+
+        // Test that the max headroom values prevail over default watermark ratios
+        ByteSizeValue hundredBytes = ByteSizeValue.parseBytesSizeValue("100b", "test");
+        assertEquals(ByteSizeValue.ONE, diskThresholdSettings.getFreeBytesThresholdLowStage(hundredBytes));
+        assertEquals(ByteSizeValue.ZERO, diskThresholdSettings.getFreeBytesThresholdHighStage(hundredBytes));
+        assertEquals(ByteSizeValue.ZERO, diskThresholdSettings.getFreeBytesThresholdFloodStage(hundredBytes));
+        assertEquals(ByteSizeValue.ZERO, diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(hundredBytes));
+    }
+
+    public void testUpdateWatermarkAndMaxHeadroomValues() {
+        ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss);
+
+        boolean watermarksAbsolute = randomBoolean();
+        boolean lowHeadroomEnabled = (watermarksAbsolute == false) && randomBoolean();
+        boolean highHeadroomEnabled = lowHeadroomEnabled || ((watermarksAbsolute == false) && randomBoolean());
+        boolean floodHeadroomEnabled = highHeadroomEnabled || ((watermarksAbsolute == false) && randomBoolean());
+        boolean frozenFloodHeadroomEnabled = (watermarksAbsolute == false) && randomBoolean();
+
+        Settings.Builder builder = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false)
+            .put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(),
+                watermarksAbsolute ? "50b" : randomBoolean() ? "50%" : "0.50"
+            )
+            .put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
+                watermarksAbsolute ? "40b" : randomBoolean() ? "60%" : "0.60"
+            )
+            .put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(),
+                watermarksAbsolute ? "30b" : randomBoolean() ? "70%" : "0.70"
+            )
+            .put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(),
+                watermarksAbsolute ? "15b" : randomBoolean() ? "85%" : "0.85"
+            )
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s");
+        if (lowHeadroomEnabled) {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "1000mb");
+        }
+        if (highHeadroomEnabled) {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "500mb");
+        }
+        if (floodHeadroomEnabled) {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "250mb");
+        }
+        if (frozenFloodHeadroomEnabled) {
+            builder = builder.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(),
+                "150mb"
+            );
+        }
+        nss.applySettings(builder.build());
+
+        // Test that watermark values apply
+        ByteSizeValue hundredBytes = ByteSizeValue.parseBytesSizeValue("100b", "test");
+        assertEquals(ByteSizeValue.ofBytes(50), diskThresholdSettings.getFreeBytesThresholdLowStage(hundredBytes));
+        assertEquals(ByteSizeValue.ofBytes(40), diskThresholdSettings.getFreeBytesThresholdHighStage(hundredBytes));
+        assertEquals(ByteSizeValue.ofBytes(30), diskThresholdSettings.getFreeBytesThresholdFloodStage(hundredBytes));
+        assertEquals(ByteSizeValue.ofBytes(15), diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(hundredBytes));
+
+        // Test that max headroom values (if enabled) prevail over percentage watermark values
+        ByteSizeValue thousandTb = ByteSizeValue.parseBytesSizeValue("1000tb", "test");
+        ByteSizeValue lowExpected = ByteSizeValue.parseBytesSizeValue(
+            watermarksAbsolute ? "50b" : lowHeadroomEnabled ? "1000mb" : "500tb",
+            "test"
+        );
+        ByteSizeValue highExpected = ByteSizeValue.parseBytesSizeValue(
+            watermarksAbsolute ? "40b" : highHeadroomEnabled ? "500mb" : "400tb",
+            "test"
+        );
+        ByteSizeValue floodExpected = ByteSizeValue.parseBytesSizeValue(
+            watermarksAbsolute ? "30b" : floodHeadroomEnabled ? "250mb" : "300tb",
+            "test"
+        );
+        ByteSizeValue frozenFloodExpected = ByteSizeValue.parseBytesSizeValue(
+            watermarksAbsolute ? "15b" : frozenFloodHeadroomEnabled ? "150mb" : "150tb",
+            "test"
+        );
+        assertEquals(lowExpected, diskThresholdSettings.getFreeBytesThresholdLowStage(thousandTb));
+        assertEquals(highExpected, diskThresholdSettings.getFreeBytesThresholdHighStage(thousandTb));
+        assertEquals(floodExpected, diskThresholdSettings.getFreeBytesThresholdFloodStage(thousandTb));
+        assertEquals(frozenFloodExpected, diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(thousandTb));
+    }
+
     public void testInvalidConstruction() {
         final Settings settings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%")
@@ -347,6 +568,79 @@ public class DiskThresholdSettingsTests extends ESTestCase {
         assertThat(cause, hasToString(containsString(incompatibleExpected)));
     }
 
+    public void testIncompatibleMaxHeadroomUpdate() {
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater
+
+        Settings.Builder settings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "300g")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "200g")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "100g");
+
+        String lowHeadroom = "-1";
+        String highHeadroom = "-1";
+        String floodHeadroom = "-1";
+        if (randomBoolean()) {
+            lowHeadroom = "100gb";
+            settings = settings.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), lowHeadroom);
+        } else if (randomBoolean()) {
+            highHeadroom = "100gb";
+            settings = settings.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), highHeadroom);
+        } else {
+            floodHeadroom = "100gb";
+            settings = settings.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(),
+                floodHeadroom
+            );
+        }
+        final Settings builtSettings = settings.build();
+
+        final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(builtSettings));
+        final String expected = "illegal value can't update [cluster.routing.allocation.disk.watermark.low.max_headroom] from [200GB] to ["
+            + lowHeadroom
+            + "]";
+        assertThat(e, hasToString(containsString(expected)));
+        assertNotNull(e.getCause());
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        final IllegalArgumentException cause = (IllegalArgumentException) e.getCause();
+        final String incompatibleExpected = String.format(
+            Locale.ROOT,
+            "At least one of the disk max headroom settings is set [low=%s, high=%s, flood=%s], while the disk watermark values "
+                + "are set to absolute values instead of ratios/percentages, e.g., the low watermark is [%s]",
+            lowHeadroom,
+            highHeadroom,
+            floodHeadroom,
+            "300gb"
+        );
+        assertThat(cause, hasToString(containsString(incompatibleExpected)));
+    }
+
+    public void testIncompatibleFrozenMaxHeadroomUpdate() {
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater
+
+        final Settings newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), "300g")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "100g")
+            .build();
+
+        final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings));
+        final String expected =
+            "illegal value can't update [cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom] from [20GB] to [100g]";
+        assertThat(e, hasToString(containsString(expected)));
+        assertNotNull(e.getCause());
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        final IllegalArgumentException cause = (IllegalArgumentException) e.getCause();
+        final String incompatibleExpected = String.format(
+            Locale.ROOT,
+            "The frozen flood stage disk max headroom setting is set [%s], while the frozen flood stage disk watermark setting "
+                + "is set to an absolute value instead of a ratio/percentage [%s]",
+            "100gb",
+            "300gb"
+        );
+        assertThat(cause, hasToString(containsString(incompatibleExpected)));
+    }
+
     public void testInvalidHighDiskThreshold() {
         final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater
@@ -365,6 +659,125 @@ public class DiskThresholdSettingsTests extends ESTestCase {
         assertThat(cause, hasToString(containsString("low disk watermark [85%] more than high disk watermark [75%]")));
     }
 
+    public void testInvalidLowHighMaxHeadroomUpdate() {
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater
+
+        final Settings newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "300m")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "750m")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "500m")
+            .build();
+
+        final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings));
+        final String expected =
+            "illegal value can't update [cluster.routing.allocation.disk.watermark.low.max_headroom] from [200GB] to [300m]";
+        assertThat(e, hasToString(containsString(expected)));
+        assertNotNull(e.getCause());
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        final IllegalArgumentException cause = (IllegalArgumentException) e.getCause();
+        assertThat(cause, hasToString(containsString("high disk max headroom [750mb] more than low disk max headroom [300mb]")));
+    }
+
+    public void testInvalidHighFloodMaxHeadroomUpdate() {
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater
+
+        final Settings newSettings = Settings.builder()
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "400m")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "500m")
+            .build();
+
+        final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings));
+        final String expected =
+            "illegal value can't update [cluster.routing.allocation.disk.watermark.high.max_headroom] from [150GB] to [400m]";
+        assertThat(e, hasToString(containsString(expected)));
+        assertNotNull(e.getCause());
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        final IllegalArgumentException cause = (IllegalArgumentException) e.getCause();
+        assertThat(cause, hasToString(containsString("flood disk max headroom [500mb] more than high disk max headroom [400mb]")));
+    }
+
+    public void testInvalidHeadroomSetToMinusOne() {
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        new DiskThresholdSettings(Settings.EMPTY, clusterSettings);
+
+        Settings.Builder builder = Settings.builder();
+        if (randomBoolean()) {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "-1");
+        } else if (randomBoolean()) {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "-1");
+        } else {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "-1");
+        }
+        final Settings newSettings = builder.build();
+
+        final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings));
+        final String expected = "illegal value can't update";
+        assertThat(e, hasToString(containsString(expected)));
+        assertNotNull(e.getCause());
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        final IllegalArgumentException cause = (IllegalArgumentException) e.getCause();
+        assertThat(cause, hasToString(containsString("setting a headroom value to less than 0 is not supported")));
+    }
+
+    public void testInvalidLowHeadroomSetAndHighNotSet() {
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        new DiskThresholdSettings(Settings.EMPTY, clusterSettings);
+
+        Settings.Builder builder = Settings.builder()
+            // The following settings combination for a 1000TiB hard disk would result in the required minimum free disk space for the low
+            // watermark to be 150GiB, and for the high 100TiB. So it could hit the high watermark before the low watermark.
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "85%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "90%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "95%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "150GB");
+        if (randomBoolean()) {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "20GB");
+        }
+        final Settings newSettings = builder.build();
+
+        final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings));
+        final String expected =
+            "illegal value can't update [cluster.routing.allocation.disk.watermark.low.max_headroom] from [200GB] to [150GB]";
+        assertThat(e, hasToString(containsString(expected)));
+        assertNotNull(e.getCause());
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        final IllegalArgumentException cause = (IllegalArgumentException) e.getCause();
+        assertThat(
+            cause,
+            hasToString(containsString("high disk max headroom [-1] is not set, while the low disk max headroom is set [150gb]"))
+        );
+    }
+
+    public void testInvalidHighHeadroomSetAndFloodNotSet() {
+        final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
+        new DiskThresholdSettings(Settings.EMPTY, clusterSettings);
+
+        Settings.Builder builder = Settings.builder()
+            // For a 1000TiB disk, the following settings combination would result in a required minimum free disk space of 150GiB for
+            // the high watermark and 50TiB for the flood watermark. So a node could hit the flood watermark before the high watermark.
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "85%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "90%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "95%")
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "150GB");
+        if (randomBoolean()) {
+            builder = builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "200GB");
+        }
+        final Settings newSettings = builder.build();
+
+        final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings));
+        final String expected = "illegal value can't update";
+        assertThat(e, hasToString(containsString(expected)));
+        assertNotNull(e.getCause());
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        final IllegalArgumentException cause = (IllegalArgumentException) e.getCause();
+        assertThat(
+            cause,
+            hasToString(containsString("flood disk max headroom [-1] is not set, while the high disk max headroom is set [150gb]"))
+        );
+    }
+
     public void testSequenceOfUpdates() {
         final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater
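
Taken together, the invalid-update tests above describe the cross-setting validation added for the new headroom settings. A condensed sketch of those rules, with hypothetical parameter names and -1 standing for "not set" (the real checks are the DiskThresholdSettings validators, which produce the exact messages asserted above):

    // Illustration only; it does not model the separate rule that explicitly updating a
    // headroom to a negative value is rejected ("setting a headroom value to less than 0...").
    static void validateMaxHeadrooms(boolean watermarksAreRatios, long low, long high, long flood) {
        boolean anyHeadroomSet = low >= 0 || high >= 0 || flood >= 0;
        if (watermarksAreRatios == false && anyHeadroomSet) {
            throw new IllegalArgumentException("max headrooms require ratio/percentage watermarks");
        }
        if (low >= 0 && high < 0) {
            throw new IllegalArgumentException("high disk max headroom is not set, while the low disk max headroom is set");
        }
        if (high >= 0 && flood < 0) {
            throw new IllegalArgumentException("flood disk max headroom is not set, while the high disk max headroom is set");
        }
        if (low >= 0 && high > low) {
            throw new IllegalArgumentException("high disk max headroom more than low disk max headroom");
        }
        if (high >= 0 && flood > high) {
            throw new IllegalArgumentException("flood disk max headroom more than high disk max headroom");
        }
    }
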
@@ -430,10 +843,21 @@ public class DiskThresholdSettingsTests extends ESTestCase {
         String frozenFloodWatermarkPrefix = includeKey
             ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey() + "="
             : "";
+        String lowMaxHeadroomPrefix = includeKey
+            ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey() + "="
+            : "max_headroom=";
+        String highMaxHeadroomPrefix = includeKey
+            ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey() + "="
+            : "max_headroom=";
+        String floodMaxHeadroomPrefix = includeKey
+            ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey() + "="
+            : "max_headroom=";
         String frozenFloodMaxHeadroomPrefix = includeKey
             ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey() + "="
             : "max_headroom=";
 
+        // Test default settings for watermarks
+
         DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, clusterSettings);
         assertThat(diskThresholdSettings.describeLowThreshold(hundredBytes, includeKey), equalTo(lowWatermarkPrefix + "85%"));
         assertThat(diskThresholdSettings.describeHighThreshold(hundredBytes, includeKey), equalTo(highWatermarkPrefix + "90%"));
@@ -443,11 +867,16 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             equalTo(frozenFloodWatermarkPrefix + "95%")
         );
 
+        assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowMaxHeadroomPrefix + "200gb"));
+        assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highMaxHeadroomPrefix + "150gb"));
+        assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodMaxHeadroomPrefix + "100gb"));
         assertThat(
             diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, includeKey),
             equalTo(frozenFloodMaxHeadroomPrefix + "20gb")
         );
 
+        // Test a mixture of percentages without max headroom values
+
         diskThresholdSettings = new DiskThresholdSettings(
             Settings.builder()
                 .put(
@@ -487,14 +916,14 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             equalTo(frozenFloodWatermarkPrefix + "91.5%")
         );
 
+        // Test absolute values
+
         diskThresholdSettings = new DiskThresholdSettings(
             Settings.builder()
                 .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "1GB")
                 .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "10MB")
                 .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "2B")
                 .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), "1B")
-                // Max headroom values should be ignored since the watermark values are set to absolute values
-                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "10mb")
                 .build(),
             clusterSettings
         );
@@ -507,7 +936,7 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             equalTo(frozenFloodWatermarkPrefix + "1b")
         );
 
-        // Even for 1000TB, the watermarks apply since they are set to absolute values (max headroom values should be ignored)
+        // Even for 1000TB, the watermarks apply since they are set to absolute values (so no max headroom applies)
         assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowWatermarkPrefix + "1gb"));
         assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highWatermarkPrefix + "10mb"));
         assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodWatermarkPrefix + "2b"));
@@ -517,6 +946,7 @@ public class DiskThresholdSettingsTests extends ESTestCase {
         );
 
         // Test a mixture of percentages and max headroom values
+
         diskThresholdSettings = new DiskThresholdSettings(
             Settings.builder()
                 .put(
@@ -535,6 +965,9 @@ public class DiskThresholdSettingsTests extends ESTestCase {
                     DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(),
                     randomBoolean() ? "31.5%" : "0.315"
                 )
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "100gb")
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "50gb")
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "10gb")
                 .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "10gb")
                 .build(),
             clusterSettings
@@ -548,9 +981,9 @@ public class DiskThresholdSettingsTests extends ESTestCase {
             equalTo(frozenFloodWatermarkPrefix + "31.5%")
         );
 
-        assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowWatermarkPrefix + "31.2%"));
-        assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highWatermarkPrefix + "31.3%"));
-        assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodWatermarkPrefix + "31.4%"));
+        assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowMaxHeadroomPrefix + "100gb"));
+        assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highMaxHeadroomPrefix + "50gb"));
+        assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodMaxHeadroomPrefix + "10gb"));
         assertThat(
             diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, includeKey),
             equalTo(frozenFloodMaxHeadroomPrefix + "10gb")
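
Several of the rewritten assertions in this file rely on the ByteSizeValue.add helper that this commit introduces (see the ByteSizeValue.java and ByteSizeValueTests.java entries in the file list), replacing the earlier hand-rolled getBytes() arithmetic. A minimal usage sketch, assuming only that add sums the two sizes byte-exactly, which is what the replaced assertions imply:

    import org.elasticsearch.common.unit.ByteSizeValue;

    final class ByteSizeValueAddSketch {
        public static void main(String[] args) {
            // Hedged example of the new helper; the expected value mirrors the old hand-rolled form
            // ByteSizeValue.ofBytes(ByteSizeValue.ofTb(100).getBytes() + ByteSizeValue.ofGb(200).getBytes()).
            ByteSizeValue total = ByteSizeValue.add(ByteSizeValue.ofTb(100), ByteSizeValue.ofGb(200));
            assert total.getBytes() == ByteSizeValue.ofTb(100).getBytes() + ByteSizeValue.ofGb(200).getBytes();
        }
    }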

+ 300 - 101
server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java

@@ -45,6 +45,7 @@ import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.repositories.IndexId;
@@ -83,27 +84,47 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         return new DiskThresholdDecider(settings, new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));
     }
 
-    public void testDiskThreshold() {
-        Settings diskSettings = Settings.builder()
+    private void doTestDiskThreshold(boolean testMaxHeadroom) {
+        Settings.Builder diskSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7)
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8)
-            .build();
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8);
+        if (testMaxHeadroom) {
+            diskSettings = diskSettings.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(),
+                ByteSizeValue.ofGb(200).toString()
+            )
+                .put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    ByteSizeValue.ofGb(150).toString()
+                );
+        }
 
         Map<String, DiskUsage> usages = new HashMap<>();
-        usages.put("node1", new DiskUsage("node1", "node1", "/dev/null", 100, 10)); // 90% used
-        usages.put("node2", new DiskUsage("node2", "node2", "/dev/null", 100, 35)); // 65% used
-        usages.put("node3", new DiskUsage("node3", "node3", "/dev/null", 100, 60)); // 40% used
-        usages.put("node4", new DiskUsage("node4", "node4", "/dev/null", 100, 80)); // 20% used
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+        final long exactFreeSpaceForHighWatermark = testMaxHeadroom ? ByteSizeValue.ofGb(150).getBytes() : 10;
+        usages.put("node1", new DiskUsage("node1", "node1", "/dev/null", totalBytes, exactFreeSpaceForHighWatermark));
+        usages.put(
+            "node2",
+            new DiskUsage("node2", "node2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(350).getBytes() : 35)
+        );
+        usages.put(
+            "node3",
+            new DiskUsage("node3", "node3", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(600).getBytes() : 60)
+        );
+        usages.put(
+            "node4",
+            new DiskUsage("node4", "node4", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(800).getBytes() : 80)
+        );
 
         Map<String, Long> shardSizes = new HashMap<>();
-        shardSizes.put("[test][0][p]", 10L); // 10 bytes
-        shardSizes.put("[test][0][r]", 10L);
+        shardSizes.put("[test][0][p]", exactFreeSpaceForHighWatermark);
+        shardSizes.put("[test][0][r]", exactFreeSpaceForHighWatermark);
         final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes);
 
         ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         AllocationDeciders deciders = new AllocationDeciders(
-            new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings)))
+            new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings.build())))
         );
 
         ClusterInfoService cis = () -> {
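
For the max headroom variant above, the disk sizes are scaled up to 10000GB so that the headrooms, rather than the 70%/80% ratios, become the binding constraint. Assuming as before that the free-space requirement per stage is the minimum of the two bounds, the numbers work out as:

    low stage:  min(30% of 10000GB = 3000GB, 200GB max headroom) = 200GB free required
    high stage: min(20% of 10000GB = 2000GB, 150GB max headroom) = 150GB free required

So the nodes with 350GB, 600GB and 800GB free play the same roles as the 35-, 60- and 80-bytes-free nodes of the percentage variant, and node1 with 150GB free sits exactly at the high-stage bound.
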
@@ -177,17 +198,26 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
 
         logger.info("--> changing decider settings");
 
-        // Set the low threshold to 60 instead of 70
-        // Set the high threshold to 70 instead of 80
-        // node2 now should not have new shards allocated to it, but shards can remain
-        diskSettings = Settings.builder()
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%")
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.7)
-            .build();
+        if (testMaxHeadroom) {
+            // Set the low max headroom to 250GB
+            // Set the high max headroom to 150GB
+            // node2 (with 200GB free space) now should not have new shards allocated to it, but shards can remain
+            diskSettings = Settings.builder()
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(250))
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(150));
+        } else {
+            // Set the low threshold to 60 instead of 70
+            // Set the high threshold to 70 instead of 80
+            // node2 (with 75% used space) now should not have new shards allocated to it, but shards can remain
+            diskSettings = Settings.builder()
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%")
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.7);
+        }
 
         deciders = new AllocationDeciders(
-            new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings)))
+            new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings.build())))
         );
 
         strategy = new AllocationService(
@@ -209,17 +239,28 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
 
         logger.info("--> changing settings again");
 
-        // Set the low threshold to 50 instead of 60
-        // Set the high threshold to 60 instead of 70
-        // node2 now should not have new shards allocated to it, and shards cannot remain
-        diskSettings = Settings.builder()
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.5)
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.6)
-            .build();
+        if (testMaxHeadroom) {
+            // Set the low max headroom to 500GB
+            // Set the high max headroom to 400GB
+            // node2 (with 200GB free space) now should not have new shards allocated to it, and shards cannot remain
+            // Note that node3 (with 500GB free space) should not receive the shard, so that it does not go over the high threshold
+            diskSettings = Settings.builder()
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(500))
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(400));
+        } else {
+            // Set the low threshold to 50 instead of 60
+            // Set the high threshold to 60 instead of 70
+            // node2 (with 75% used space) now should not have new shards allocated to it, and shards cannot remain
+            // Note that node3 (with 50% used space) should not receive the shard, so that it does not go over the high threshold
+            diskSettings = Settings.builder()
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.5)
+                .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.6);
+        }
 
         deciders = new AllocationDeciders(
-            new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings)))
+            new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings.build())))
         );
 
         strategy = new AllocationService(
@@ -261,6 +302,14 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(clusterState.getRoutingNodes().node("node4").size(), equalTo(1));
     }
 
+    public void testDiskThresholdWithPercentages() {
+        doTestDiskThreshold(false);
+    }
+
+    public void testDiskThresholdWithMaxHeadroom() {
+        doTestDiskThreshold(true);
+    }
+
     public void testDiskThresholdWithAbsoluteSizes() {
         Settings diskSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
@@ -505,18 +554,37 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(clusterState.getRoutingNodes().node("node5").size(), equalTo(1));
     }
 
-    public void testDiskThresholdWithShardSizes() {
-        Settings diskSettings = Settings.builder()
+    private void doTestDiskThresholdWithShardSizes(boolean testMaxHeadroom) {
+        Settings.Builder diskSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7)
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "71%")
-            .build();
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "71%");
+        if (testMaxHeadroom) {
+            diskSettings = diskSettings.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(),
+                ByteSizeValue.ofGb(200).toString()
+            )
+                .put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    ByteSizeValue.ofGb(199).toString()
+                );
+        }
 
         Map<String, DiskUsage> usages = new HashMap<>();
-        usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 31)); // 69% used
-        usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 1));  // 99% used
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+        // below but close to the low watermark
+        usages.put(
+            "node1",
+            new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(201).getBytes() : 31)
+        );
+        // almost fully used
+        usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(1).getBytes() : 1));
 
-        final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, Map.of("[test][0][p]", 10L));
+        final ClusterInfo clusterInfo = new DevNullClusterInfo(
+            usages,
+            usages,
+            Map.of("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(10).getBytes() : 10L)
+        );
 
         AllocationDeciders deciders = new AllocationDeciders(
             new HashSet<>(
@@ -525,7 +593,7 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
                         Settings.EMPTY,
                         new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)
                     ),
-                    makeDecider(diskSettings)
+                    makeDecider(diskSettings.build())
                 )
             )
         );
@@ -554,11 +622,9 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
             .routingTable(routingTable)
             .build();
         logger.info("--> adding node1");
+        // node2 is added because DiskThresholdDecider automatically ignores single-node clusters
         clusterState = ClusterState.builder(clusterState)
-            .nodes(
-                DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")) // node2 is added because DiskThresholdDecider
-                                                                                     // automatically ignore single-node clusters
-            )
+            .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")))
             .build();
         routingTable = strategy.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
@@ -573,6 +639,14 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0));
     }
 
+    public void testDiskThresholdWithShardSizesWithPercentages() {
+        doTestDiskThresholdWithShardSizes(false);
+    }
+
+    public void testDiskThresholdWithShardSizesWithMaxHeadroom() {
+        doTestDiskThresholdWithShardSizes(true);
+    }
+
     public void testUnknownDiskUsage() {
         Settings diskSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
@@ -663,26 +737,45 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(node1Usage.getFreeBytes(), equalTo(25L));
     }
 
-    public void testShardRelocationsTakenIntoAccount() {
-        Settings diskSettings = Settings.builder()
+    private void doTestShardRelocationsTakenIntoAccount(boolean testMaxHeadroom) {
+        Settings.Builder diskSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7)
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8)
-            .build();
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8);
+        if (testMaxHeadroom) {
+            diskSettings = diskSettings.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(),
+                ByteSizeValue.ofGb(150).toString()
+            )
+                .put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    ByteSizeValue.ofGb(110).toString()
+                );
+        }
 
         Map<String, DiskUsage> usages = new HashMap<>();
-        usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 40)); // 60% used
-        usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 40)); // 60% used
-        usages.put("node3", new DiskUsage("node3", "n3", "/dev/null", 100, 40)); // 60% used
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+        usages.put(
+            "node1",
+            new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(160).getBytes() : 40)
+        );
+        usages.put(
+            "node2",
+            new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(160).getBytes() : 40)
+        );
+        usages.put(
+            "node3",
+            new DiskUsage("node3", "n3", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(160).getBytes() : 40)
+        );
 
         Map<String, Long> shardSizes = new HashMap<>();
-        shardSizes.put("[test][0][p]", 14L); // 14 bytes
-        shardSizes.put("[test][0][r]", 14L);
-        shardSizes.put("[test2][0][p]", 1L); // 1 bytes
-        shardSizes.put("[test2][0][r]", 1L);
+        shardSizes.put("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L);
+        shardSizes.put("[test][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L);
+        shardSizes.put("[test2][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(1).getBytes() : 1L);
+        shardSizes.put("[test2][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(1).getBytes() : 1L);
         final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes);
 
-        DiskThresholdDecider decider = makeDecider(diskSettings);
+        DiskThresholdDecider decider = makeDecider(diskSettings.build());
         final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         AllocationDeciders deciders = new AllocationDeciders(
             new HashSet<>(
@@ -752,15 +845,21 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         }
 
         Map<String, DiskUsage> overfullUsages = new HashMap<>();
-        overfullUsages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 40)); // 60% used
-        overfullUsages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 40)); // 60% used
-        overfullUsages.put("node3", new DiskUsage("node3", "n3", "/dev/null", 100, 0));  // 100% used
+        overfullUsages.put(
+            "node1",
+            new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(160).getBytes() : 40)
+        );
+        overfullUsages.put(
+            "node2",
+            new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(160).getBytes() : 40)
+        );
+        overfullUsages.put("node3", new DiskUsage("node3", "n3", "/dev/null", totalBytes, 0));  // 100% used
 
         Map<String, Long> largerShardSizes = new HashMap<>();
-        largerShardSizes.put("[test][0][p]", 14L);
-        largerShardSizes.put("[test][0][r]", 14L);
-        largerShardSizes.put("[test2][0][p]", 2L);
-        largerShardSizes.put("[test2][0][r]", 2L);
+        largerShardSizes.put("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L);
+        largerShardSizes.put("[test][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L);
+        largerShardSizes.put("[test2][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(2).getBytes() : 2L);
+        largerShardSizes.put("[test2][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(2).getBytes() : 2L);
 
         final ClusterInfo overfullClusterInfo = new DevNullClusterInfo(overfullUsages, overfullUsages, largerShardSizes);
 
@@ -774,8 +873,12 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
                 expectThrows(IllegalArgumentException.class, () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false))
                     .getMessage(),
                 containsString(
-                    "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=70%], "
-                        + "having less than the minimum required [30b] free space, actual free: [26b], actual used: [74%]"
+                    testMaxHeadroom
+                        ? "the node is above the low watermark cluster setting "
+                            + "[cluster.routing.allocation.disk.watermark.low.max_headroom=150gb], "
+                            + "having less than the minimum required [150gb] free space, actual free: [146gb], actual used: [98.5%]"
+                        : "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=70%], "
+                            + "having less than the minimum required [30b] free space, actual free: [26b], actual used: [74%]"
                 )
             );
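Note: in the max headroom branch the asserted figures follow from the usages and shard sizes set up earlier in this test. Each node reports 10,000 GB total with 160 GB free, and the reroute reserves the incoming 14 GB [test] shard, so the projected free space is 160 GB - 14 GB = 146 GB, which is below the 150 GB low-watermark max headroom; the used percentage is (10,000 GB - 146 GB) / 10,000 GB ≈ 98.5%, matching the message above.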
 
@@ -821,7 +924,10 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
                     shardSizes,
                     Map.of(
                         new ClusterInfo.NodeAndPath("node1", "/dev/null"),
-                        new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), between(51, 200)).build()
+                        new ClusterInfo.ReservedSpace.Builder().add(
+                            new ShardId("", "", 0),
+                            testMaxHeadroom ? ByteSizeValue.ofGb(between(200, 250)).getBytes() : between(51, 200)
+                        ).build()
                     )
                 )
             );
@@ -835,26 +941,49 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         }
     }
 
-    public void testCanRemainWithShardRelocatingAway() {
-        Settings diskSettings = Settings.builder()
+    public void testShardRelocationsTakenIntoAccountWithPercentages() {
+        doTestShardRelocationsTakenIntoAccount(false);
+    }
+
+    public void testShardRelocationsTakenIntoAccountWithMaxHeadroom() {
+        doTestShardRelocationsTakenIntoAccount(true);
+    }
+
+    private void doTestCanRemainWithShardRelocatingAway(boolean testMaxHeadroom) {
+        Settings.Builder diskSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%")
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%")
-            .build();
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%");
+        if (testMaxHeadroom) {
+            diskSettings = diskSettings.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(),
+                ByteSizeValue.ofGb(150).toString()
+            )
+                .put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    ByteSizeValue.ofGb(110).toString()
+                );
+        }
 
-        // We have an index with 2 primary shards each taking 40 bytes. Each node has 100 bytes available
         Map<String, DiskUsage> usages = new HashMap<>();
-        usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 20)); // 80% used
-        usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 100)); // 0% used
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+        usages.put(
+            "node1",
+            new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(40).getBytes() : 20)
+        );
+        usages.put(
+            "node2",
+            new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100)
+        );
 
         Map<String, Long> shardSizes = new HashMap<>();
-        shardSizes.put("[test][0][p]", 40L);
-        shardSizes.put("[test][1][p]", 40L);
-        shardSizes.put("[foo][0][p]", 10L);
+        shardSizes.put("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(4980).getBytes() : 40L);
+        shardSizes.put("[test][1][p]", testMaxHeadroom ? ByteSizeValue.ofGb(4980).getBytes() : 40L);
+        shardSizes.put("[foo][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(10).getBytes() : 10L);
 
         final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes);
 
-        DiskThresholdDecider diskThresholdDecider = makeDecider(diskSettings);
+        DiskThresholdDecider diskThresholdDecider = makeDecider(diskSettings.build());
         Metadata metadata = Metadata.builder()
             .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(0))
             .put(IndexMetadata.builder("foo").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0))
@@ -914,9 +1043,13 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(
             decision.getExplanation(),
             containsString(
-                "the shard cannot remain on this node because it is above the high watermark cluster setting "
-                    + "[cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30b] free space "
-                    + "on node, actual free: [20b], actual used: [80%]"
+                testMaxHeadroom
+                    ? "the shard cannot remain on this node because it is above the high watermark cluster setting "
+                        + "[cluster.routing.allocation.disk.watermark.high.max_headroom=110gb] and there is less than the required [110gb] "
+                        + "free space on node, actual free: [40gb], actual used: [99.6%]"
+                    : "the shard cannot remain on this node because it is above the high watermark cluster setting "
+                        + "[cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30b] free space "
+                        + "on node, actual free: [20b], actual used: [80%]"
             )
         );
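The expected strings here suggest how the new settings combine: the max headroom caps the amount of free space that the percentage watermark alone would demand. A rough sketch of that arithmetic for the high watermark in this test (illustrative only, not the production code):

    long totalBytes = ByteSizeValue.ofGb(10_000).getBytes();
    long fromPercentage = (long) (totalBytes * (1.0 - 0.70));      // the 70% high watermark alone would demand 3,000 GB free
    long fromMaxHeadroom = ByteSizeValue.ofGb(110).getBytes();     // cluster.routing.allocation.disk.watermark.high.max_headroom
    long requiredFree = Math.min(fromPercentage, fromMaxHeadroom); // 110 GB, the figure quoted in the message above

With only 40 GB free on node1, the shard cannot remain, and the used percentage is (10,000 GB - 40 GB) / 10,000 GB = 99.6%.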
 
@@ -949,7 +1082,9 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         );
         assertThat(decision.type(), equalTo(Decision.Type.YES));
         assertEquals(
-            "there is enough disk on this node for the shard to remain, free: [60b]",
+            testMaxHeadroom
+                ? "there is enough disk on this node for the shard to remain, free: [4.9tb]"
+                : "there is enough disk on this node for the shard to remain, free: [60b]",
             ((Decision.Single) decision).getExplanation()
         );
         decision = diskThresholdDecider.canAllocate(fooRouting, firstRoutingNode, routingAllocation);
@@ -958,16 +1093,24 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
             assertThat(
                 ((Decision.Single) decision).getExplanation(),
                 containsString(
-                    "the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=70%], "
-                        + "having less than the minimum required [30b] free space, actual free: [20b], actual used: [80%]"
+                    testMaxHeadroom
+                        ? "the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark"
+                            + ".high.max_headroom=110gb], having less than the minimum required [110gb] free space, actual free: "
+                            + "[40gb], actual used: [99.6%]"
+                        : "the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=70%], "
+                            + "having less than the minimum required [30b] free space, actual free: [20b], actual used: [80%]"
                 )
             );
         } else {
             assertThat(
                 ((Decision.Single) decision).getExplanation(),
                 containsString(
-                    "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=60%], "
-                        + "having less than the minimum required [40b] free space, actual free: [20b], actual used: [80%]"
+                    testMaxHeadroom
+                        ? "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low"
+                            + ".max_headroom=150gb], having less than the minimum required [150gb] free space, actual free: [40gb], actual "
+                            + "used: [99.6%]"
+                        : "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=60%], "
+                            + "having less than the minimum required [40b] free space, actual free: [20b], actual used: [80%]"
                 )
             );
         }
@@ -1007,20 +1150,42 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(result.routingTable().index("test").shard(1).primaryShard().relocatingNodeId(), equalTo("node2"));
     }
 
-    public void testWatermarksEnabledForSingleDataNode() {
+    public void testCanRemainWithShardRelocatingAwayWithPercentages() {
+        doTestCanRemainWithShardRelocatingAway(false);
+    }
+
+    public void testCanRemainWithShardRelocatingAwayWithMaxHeadroom() {
+        doTestCanRemainWithShardRelocatingAway(true);
+    }
+
+    private void doTestWatermarksEnabledForSingleDataNode(boolean testMaxHeadroom) {
         Settings.Builder builder = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%")
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%");
+        if (testMaxHeadroom) {
+            builder = builder.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(),
+                ByteSizeValue.ofGb(150).toString()
+            )
+                .put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    ByteSizeValue.ofGb(110).toString()
+                );
+        }
         if (randomBoolean()) {
-            builder.put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), true);
+            builder = builder.put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), true);
         }
         Settings diskSettings = builder.build();
 
-        Map<String, DiskUsage> usages = Map.of("data", new DiskUsage("data", "data", "/dev/null", 100, 20));  // 80% used
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+        Map<String, DiskUsage> usages = Map.of(
+            "data",
+            new DiskUsage("data", "data", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(40).getBytes() : 20)
+        );
 
-        // We have an index with 1 primary shard, taking 40 bytes. The single data node has only 20 bytes free.
-        Map<String, Long> shardSizes = Map.of("[test][0][p]", 40L);
+        // We have an index with 1 primary shard, taking more bytes than the free space of the single data node.
+        Map<String, Long> shardSizes = Map.of("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(60).getBytes() : 40L);
         final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes);
 
         DiskThresholdDecider diskThresholdDecider = makeDecider(diskSettings);
@@ -1114,9 +1279,13 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(
             decision.getExplanation(),
             containsString(
-                "the shard cannot remain on this node because it is above the high watermark cluster setting"
-                    + " [cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30b] free space "
-                    + "on node, actual free: [20b], actual used: [80%]"
+                testMaxHeadroom
+                    ? "the shard cannot remain on this node because it is above the high watermark cluster setting [cluster"
+                        + ".routing.allocation.disk.watermark.high.max_headroom=110gb] and there is less than the required [110gb] free "
+                        + "space on node, actual free: [40gb], actual used: [99.6%]"
+                    : "the shard cannot remain on this node because it is above the high watermark cluster setting"
+                        + " [cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30b] free space "
+                        + "on node, actual free: [20b], actual used: [80%]"
             )
         );
 
@@ -1125,6 +1294,14 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         }
     }
 
+    public void testWatermarksEnabledForSingleDataNodeWithPercentages() {
+        doTestWatermarksEnabledForSingleDataNode(false);
+    }
+
+    public void testWatermarksEnabledForSingleDataNodeWithMaxHeadroom() {
+        doTestWatermarksEnabledForSingleDataNode(true);
+    }
+
     public void testSingleDataNodeDeprecationWarning() {
         Settings settings = Settings.builder().put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), false).build();
 
@@ -1144,19 +1321,34 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertSettingDeprecationsAndWarnings(new Setting<?>[] { DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE });
     }
 
-    public void testDiskThresholdWithSnapshotShardSizes() {
-        final long shardSizeInBytes = randomBoolean() ? 10L : 50L;
+    private void doTestDiskThresholdWithSnapshotShardSizes(boolean testMaxHeadroom) {
+        final long shardSizeInBytes = randomBoolean()
+            ? (testMaxHeadroom ? ByteSizeValue.ofGb(99).getBytes() : 10L) // fits in node1's free space
+            : (testMaxHeadroom ? ByteSizeValue.ofGb(350).getBytes() : 50L); // does not fit in node1's free space
         logger.info("--> using shard size [{}]", shardSizeInBytes);
 
-        final Settings diskSettings = Settings.builder()
+        Settings.Builder diskSettings = Settings.builder()
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
             .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%")
-            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "95%")
-            .build();
+            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "95%");
+        if (testMaxHeadroom) {
+            diskSettings = diskSettings.put(
+                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(),
+                ByteSizeValue.ofGb(150).toString()
+            )
+                .put(
+                    DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(),
+                    ByteSizeValue.ofGb(110).toString()
+                );
+        }
 
         Map<String, DiskUsage> usages = new HashMap<>();
-        usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 21));  // 79% used
-        usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 1)); // 99% used
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+        usages.put(
+            "node1",
+            new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(210).getBytes() : 21)
+        );
+        usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(1).getBytes() : 1));
         final ClusterInfoService clusterInfoService = () -> new DevNullClusterInfo(usages, usages, Map.of());
 
         final AllocationDeciders deciders = new AllocationDeciders(
@@ -1167,7 +1359,7 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
                         Settings.EMPTY,
                         new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)
                     ),
-                    makeDecider(diskSettings)
+                    makeDecider(diskSettings.build())
                 )
             )
         );
@@ -1204,9 +1396,8 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
             .metadata(metadata)
             .routingTable(routingTable)
             .putCustom(RestoreInProgress.TYPE, restores.build())
-            .nodes(
-                DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")) // node2 is added because DiskThresholdDecider
-                                                                                     // automatically ignore single-node clusters
+            .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")) // node2 is added because DiskThresholdDecider
+            // automatically ignores single-node clusters
             )
             .build();
 
@@ -1271,6 +1462,14 @@ public class DiskThresholdDeciderTests extends ESAllocationTestCase {
         assertThat(shardsWithState(clusterState.getRoutingNodes(), "test", INITIALIZING, STARTED).size(), equalTo(shouldAllocate ? 1 : 0));
     }
 
+    public void testDiskThresholdWithSnapshotShardSizesWithPercentages() {
+        doTestDiskThresholdWithSnapshotShardSizes(false);
+    }
+
+    public void testDiskThresholdWithSnapshotShardSizesWithMaxHeadroom() {
+        doTestDiskThresholdWithSnapshotShardSizes(true);
+    }
+
     public void logShardStates(ClusterState state) {
         RoutingNodes rn = state.getRoutingNodes();
         logger.info(

+ 93 - 38
server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java

@@ -35,6 +35,7 @@ import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.shard.ShardId;
 
@@ -53,6 +54,17 @@ import static org.hamcrest.Matchers.equalTo;
  */
 public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
 
+    private static long getExpectedShardSize(ShardRouting shardRouting, long defaultSize, RoutingAllocation allocation) {
+        return DiskThresholdDecider.getExpectedShardSize(
+            shardRouting,
+            defaultSize,
+            allocation.clusterInfo(),
+            allocation.snapshotShardSizeInfo(),
+            allocation.metadata(),
+            allocation.routingTable()
+        );
+    }
+
     public void testCanAllocateUsesMaxAvailableSpace() {
         ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         DiskThresholdDecider decider = new DiskThresholdDecider(Settings.EMPTY, nss);
@@ -134,7 +146,7 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
         );
     }
 
-    public void testCannotAllocateDueToLackOfDiskResources() {
+    private void doTestCannotAllocateDueToLackOfDiskResources(boolean testMaxHeadroom) {
         ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         DiskThresholdDecider decider = new DiskThresholdDecider(Settings.EMPTY, nss);
 
@@ -176,12 +188,17 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
 
         // actual test -- after all that bloat :)
 
-        Map<String, DiskUsage> leastAvailableUsages = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", 100, 0)); // all full
-        final int freeBytes = randomIntBetween(20, 100);
-        Map<String, DiskUsage> mostAvailableUsage = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", 100, freeBytes));
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+
+        // all full
+        Map<String, DiskUsage> leastAvailableUsages = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", totalBytes, 0));
+        final long freeBytes = testMaxHeadroom
+            ? ByteSizeValue.ofGb(randomIntBetween(500, 10000)).getBytes()
+            : randomLongBetween(50, totalBytes);
+        Map<String, DiskUsage> mostAvailableUsage = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", totalBytes, freeBytes));
 
         // way bigger than available space
-        final long shardSize = randomIntBetween(110, 1000);
+        final long shardSize = randomLongBetween(totalBytes + 10, totalBytes * 10);
         ClusterInfo clusterInfo = new ClusterInfo(
             leastAvailableUsages,
             mostAvailableUsage,
@@ -201,26 +218,45 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
         Decision decision = decider.canAllocate(test_0, RoutingNodesHelper.routingNode("node_0", node_0), allocation);
         assertEquals(Decision.Type.NO, decision.type());
 
-        double usedPercentage = 100.0 * (100 - freeBytes) / 100;
+        double usedPercentage = 100.0 * (totalBytes - freeBytes) / totalBytes;
 
         assertThat(
             decision.getExplanation(),
             containsString(
-                "allocating the shard to this node will bring the node above the high watermark cluster setting "
-                    + "[cluster.routing.allocation.disk.watermark.high=90%] "
-                    + "and cause it to have less than the minimum required [10b] of free space "
-                    + "(free: ["
-                    + freeBytes
-                    + "b], used: ["
-                    + Strings.format1Decimals(usedPercentage, "%")
-                    + "], estimated shard size: ["
-                    + shardSize
-                    + "b])"
+                testMaxHeadroom
+                    ? "allocating the shard to this node will bring the node above the high watermark cluster setting "
+                        + "[cluster.routing.allocation.disk.watermark.high.max_headroom=150gb] "
+                        + "and cause it to have less than the minimum required [150gb] of free space "
+                        + "(free: ["
+                        + ByteSizeValue.ofBytes(freeBytes)
+                        + "], used: ["
+                        + Strings.format1Decimals(usedPercentage, "%")
+                        + "], estimated shard size: ["
+                        + ByteSizeValue.ofBytes(shardSize)
+                        + "])"
+                    : "allocating the shard to this node will bring the node above the high watermark cluster setting "
+                        + "[cluster.routing.allocation.disk.watermark.high=90%] "
+                        + "and cause it to have less than the minimum required [10b] of free space "
+                        + "(free: ["
+                        + freeBytes
+                        + "b], used: ["
+                        + Strings.format1Decimals(usedPercentage, "%")
+                        + "], estimated shard size: ["
+                        + shardSize
+                        + "b])"
             )
         );
     }
 
-    public void testCanRemainUsesLeastAvailableSpace() {
+    public void testCannotAllocateDueToLackOfDiskResourcesWithPercentages() {
+        doTestCannotAllocateDueToLackOfDiskResources(false);
+    }
+
+    public void testCannotAllocateDueToLackOfDiskResourcesWithMaxHeadroom() {
+        doTestCannotAllocateDueToLackOfDiskResources(true);
+    }
+
+    private void doTestCanRemainUsesLeastAvailableSpace(boolean testMaxHeadroom) {
         ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         DiskThresholdDecider decider = new DiskThresholdDecider(Settings.EMPTY, nss);
         Map<ShardRouting, String> shardRoutingMap = new HashMap<>();
@@ -295,18 +331,28 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
         clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(node_0).add(node_1)).build();
 
         // actual test -- after all that bloat :)
+
+        final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100;
+        final long exactFreeSpaceForHighWatermark = testMaxHeadroom ? ByteSizeValue.ofGb(150).getBytes() : 10;
+        final long exactFreeSpaceForBelowHighWatermark = exactFreeSpaceForHighWatermark - 1;
+        final double exactUsedSpaceForBelowHighWatermark = 100.0 * (totalBytes - exactFreeSpaceForBelowHighWatermark) / totalBytes;
+        final long ninetyPercentFreeSpace = (long) (totalBytes * 0.9);
+
         Map<String, DiskUsage> leastAvailableUsages = new HashMap<>();
-        leastAvailableUsages.put("node_0", new DiskUsage("node_0", "node_0", "/node0/least", 100, 10)); // 90% used
-        leastAvailableUsages.put("node_1", new DiskUsage("node_1", "node_1", "/node1/least", 100, 9)); // 91% used
+        leastAvailableUsages.put("node_0", new DiskUsage("node_0", "node_0", "/node0/least", totalBytes, exactFreeSpaceForHighWatermark));
+        leastAvailableUsages.put(
+            "node_1",
+            new DiskUsage("node_1", "node_1", "/node1/least", totalBytes, exactFreeSpaceForBelowHighWatermark)
+        );
 
         Map<String, DiskUsage> mostAvailableUsage = new HashMap<>();
-        mostAvailableUsage.put("node_0", new DiskUsage("node_0", "node_0", "/node0/most", 100, 90)); // 10% used
-        mostAvailableUsage.put("node_1", new DiskUsage("node_1", "node_1", "/node1/most", 100, 90)); // 10% used
+        mostAvailableUsage.put("node_0", new DiskUsage("node_0", "node_0", "/node0/most", totalBytes, ninetyPercentFreeSpace));
+        mostAvailableUsage.put("node_1", new DiskUsage("node_1", "node_1", "/node1/most", totalBytes, ninetyPercentFreeSpace));
 
         Map<String, Long> shardSizes = new HashMap<>();
-        shardSizes.put("[test][0][p]", 10L); // 10 bytes
-        shardSizes.put("[test][1][p]", 10L);
-        shardSizes.put("[test][2][p]", 10L);
+        shardSizes.put("[test][0][p]", exactFreeSpaceForHighWatermark);
+        shardSizes.put("[test][1][p]", exactFreeSpaceForHighWatermark);
+        shardSizes.put("[test][2][p]", exactFreeSpaceForHighWatermark);
 
         final ClusterInfo clusterInfo = new ClusterInfo(
             leastAvailableUsages,
@@ -328,7 +374,11 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
         assertEquals(Decision.Type.YES, decision.type());
         assertThat(
             ((Decision.Single) decision).getExplanation(),
-            containsString("there is enough disk on this node for the shard to remain, free: [10b]")
+            containsString(
+                "there is enough disk on this node for the shard to remain, free: ["
+                    + ByteSizeValue.ofBytes(exactFreeSpaceForHighWatermark)
+                    + "]"
+            )
         );
         decision = decider.canRemain(indexMetadata, test_1, RoutingNodesHelper.routingNode("node_1", node_1), allocation);
         assertEquals(Decision.Type.NO, decision.type());
@@ -336,8 +386,16 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
             ((Decision.Single) decision).getExplanation(),
             containsString(
                 "the shard cannot remain on this node because it is above the high watermark cluster setting "
-                    + "[cluster.routing.allocation.disk.watermark.high=90%] and there is less than the required [10b] "
-                    + "free space on node, actual free: [9b], actual used: [91%]"
+                    + "[cluster.routing.allocation.disk.watermark.high"
+                    + (testMaxHeadroom ? ".max_headroom=150gb" : "=90%")
+                    + "] and there is less than the required ["
+                    + ByteSizeValue.ofBytes(exactFreeSpaceForHighWatermark)
+                    + "] free space on "
+                    + "node, actual free: ["
+                    + ByteSizeValue.ofBytes(exactFreeSpaceForBelowHighWatermark)
+                    + "], actual used: ["
+                    + Strings.format1Decimals(exactUsedSpaceForBelowHighWatermark, "%")
+                    + "]"
             )
         );
         try {
@@ -368,6 +426,14 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
         );
     }
 
+    public void testCanRemainUsesLeastAvailableSpaceWithPercentages() {
+        doTestCanRemainUsesLeastAvailableSpace(false);
+    }
+
+    public void testCanRemainUsesLeastAvailableSpaceWithMaxHeadroom() {
+        doTestCanRemainUsesLeastAvailableSpace(true);
+    }
+
     public void testShardSizeAndRelocatingSize() {
         Map<String, Long> shardSizes = new HashMap<>();
         shardSizes.put("[test][0][r]", 10L);
@@ -622,17 +688,6 @@ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase {
         assertEquals(42L, getExpectedShardSize(target2, 42L, allocationWithMissingSourceIndex));
     }
 
-    private static long getExpectedShardSize(ShardRouting shardRouting, long defaultSize, RoutingAllocation allocation) {
-        return DiskThresholdDecider.getExpectedShardSize(
-            shardRouting,
-            defaultSize,
-            allocation.clusterInfo(),
-            allocation.snapshotShardSizeInfo(),
-            allocation.metadata(),
-            allocation.routingTable()
-        );
-    }
-
     public void testDiskUsageWithRelocations() {
         assertThat(
             new DiskThresholdDecider.DiskUsageWithRelocations(new DiskUsage("n", "n", "/dev/null", 1000L, 1000L), 0).getFreeBytes(),

+ 153 - 0
server/src/test/java/org/elasticsearch/common/unit/ByteSizeValueTests.java

@@ -372,4 +372,157 @@ public class ByteSizeValueTests extends AbstractWireSerializingTestCase<ByteSize
             assertThat(actual, equalTo(expected));
         }
     }
+
+    public void testAddition() {
+        assertThat(ByteSizeValue.add(ByteSizeValue.ZERO, ByteSizeValue.ZERO), is(ByteSizeValue.ZERO));
+        assertThat(ByteSizeValue.add(ByteSizeValue.ZERO, ByteSizeValue.ONE), is(ByteSizeValue.ONE));
+        assertThat(ByteSizeValue.add(ByteSizeValue.ONE, ByteSizeValue.ONE), is(ByteSizeValue.ofBytes(2L)));
+        assertThat(ByteSizeValue.add(ByteSizeValue.ofBytes(100L), ByteSizeValue.ONE), is(ByteSizeValue.ofBytes(101L)));
+        assertThat(ByteSizeValue.add(ByteSizeValue.ofBytes(100L), ByteSizeValue.ofBytes(2L)), is(ByteSizeValue.ofBytes(102L)));
+        assertThat(
+            ByteSizeValue.add(new ByteSizeValue(8, ByteSizeUnit.KB), new ByteSizeValue(4, ByteSizeUnit.KB)),
+            is(ByteSizeValue.ofBytes(12288L))
+        );
+        assertThat(
+            ByteSizeValue.add(new ByteSizeValue(8, ByteSizeUnit.MB), new ByteSizeValue(4, ByteSizeUnit.MB)),
+            is(ByteSizeValue.ofBytes(12582912L))
+        );
+        assertThat(
+            ByteSizeValue.add(new ByteSizeValue(8, ByteSizeUnit.GB), new ByteSizeValue(4, ByteSizeUnit.GB)),
+            is(ByteSizeValue.ofBytes(12884901888L))
+        );
+        assertThat(
+            ByteSizeValue.add(new ByteSizeValue(8, ByteSizeUnit.TB), new ByteSizeValue(4, ByteSizeUnit.TB)),
+            is(ByteSizeValue.ofBytes(13194139533312L))
+        );
+        assertThat(
+            ByteSizeValue.add(new ByteSizeValue(8, ByteSizeUnit.PB), new ByteSizeValue(4, ByteSizeUnit.PB)),
+            is(ByteSizeValue.ofBytes(13510798882111488L))
+        );
+        assertThat(
+            ByteSizeValue.add(new ByteSizeValue(8, ByteSizeUnit.PB), new ByteSizeValue(4, ByteSizeUnit.GB)),
+            is(ByteSizeValue.ofBytes(9007203549708288L))
+        );
+
+        Exception e = expectThrows(
+            ArithmeticException.class,
+            () -> ByteSizeValue.add(ByteSizeValue.ofBytes(Long.MAX_VALUE), ByteSizeValue.ONE)
+        );
+        assertThat(e.getMessage(), containsString("long overflow"));
+
+        String exceptionMessage = "one of the arguments has -1 bytes";
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.add(ByteSizeValue.MINUS_ONE, ByteSizeValue.ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.add(ByteSizeValue.ZERO, ByteSizeValue.MINUS_ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.add(ByteSizeValue.MINUS_ONE, ByteSizeValue.MINUS_ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+    }
+
+    public void testSubtraction() {
+        assertThat(ByteSizeValue.subtract(ByteSizeValue.ZERO, ByteSizeValue.ZERO), is(ByteSizeValue.ZERO));
+        assertThat(ByteSizeValue.subtract(ByteSizeValue.ONE, ByteSizeValue.ZERO), is(ByteSizeValue.ONE));
+        assertThat(ByteSizeValue.subtract(ByteSizeValue.ONE, ByteSizeValue.ONE), is(ByteSizeValue.ZERO));
+        assertThat(ByteSizeValue.subtract(ByteSizeValue.ofBytes(100L), ByteSizeValue.ONE), is(ByteSizeValue.ofBytes(99L)));
+        assertThat(ByteSizeValue.subtract(ByteSizeValue.ofBytes(100L), ByteSizeValue.ofBytes(2L)), is(ByteSizeValue.ofBytes(98L)));
+        assertThat(
+            ByteSizeValue.subtract(new ByteSizeValue(8, ByteSizeUnit.KB), new ByteSizeValue(4, ByteSizeUnit.KB)),
+            is(ByteSizeValue.ofBytes(4096L))
+        );
+        assertThat(
+            ByteSizeValue.subtract(new ByteSizeValue(8, ByteSizeUnit.MB), new ByteSizeValue(4, ByteSizeUnit.MB)),
+            is(ByteSizeValue.ofBytes(4194304L))
+        );
+        assertThat(
+            ByteSizeValue.subtract(new ByteSizeValue(8, ByteSizeUnit.GB), new ByteSizeValue(4, ByteSizeUnit.GB)),
+            is(ByteSizeValue.ofBytes(4294967296L))
+        );
+        assertThat(
+            ByteSizeValue.subtract(new ByteSizeValue(8, ByteSizeUnit.TB), new ByteSizeValue(4, ByteSizeUnit.TB)),
+            is(ByteSizeValue.ofBytes(4398046511104L))
+        );
+        assertThat(
+            ByteSizeValue.subtract(new ByteSizeValue(8, ByteSizeUnit.PB), new ByteSizeValue(4, ByteSizeUnit.PB)),
+            is(ByteSizeValue.ofBytes(4503599627370496L))
+        );
+        assertThat(
+            ByteSizeValue.subtract(new ByteSizeValue(8, ByteSizeUnit.PB), new ByteSizeValue(4, ByteSizeUnit.GB)),
+            is(ByteSizeValue.ofBytes(9007194959773696L))
+        );
+
+        Exception e = expectThrows(
+            IllegalArgumentException.class,
+            () -> ByteSizeValue.subtract(ByteSizeValue.ofBytes(100L), ByteSizeValue.ofBytes(102L))
+        );
+        assertThat(e.getMessage(), containsString("Values less than -1 bytes are not supported: -2b"));
+
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.subtract(ByteSizeValue.ZERO, ByteSizeValue.ONE));
+        assertThat(e.getMessage(), containsString("subtraction result has -1 bytes"));
+
+        String exceptionMessage = "one of the arguments has -1 bytes";
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.subtract(ByteSizeValue.MINUS_ONE, ByteSizeValue.ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.subtract(ByteSizeValue.ZERO, ByteSizeValue.MINUS_ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.subtract(ByteSizeValue.MINUS_ONE, ByteSizeValue.MINUS_ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+    }
+
+    public void testMinimum() {
+        assertThat(ByteSizeValue.min(ByteSizeValue.ZERO, ByteSizeValue.ZERO), is(ByteSizeValue.ZERO));
+        assertThat(ByteSizeValue.min(ByteSizeValue.ZERO, ByteSizeValue.ONE), is(ByteSizeValue.ZERO));
+        assertThat(ByteSizeValue.min(ByteSizeValue.ONE, ByteSizeValue.ZERO), is(ByteSizeValue.ZERO));
+        assertThat(ByteSizeValue.min(ByteSizeValue.ONE, ByteSizeValue.ONE), is(ByteSizeValue.ONE));
+        assertThat(ByteSizeValue.min(ByteSizeValue.ofBytes(100L), ByteSizeValue.ONE), is(ByteSizeValue.ONE));
+        assertThat(ByteSizeValue.min(ByteSizeValue.ONE, ByteSizeValue.ofBytes(100L)), is(ByteSizeValue.ONE));
+        assertThat(ByteSizeValue.min(ByteSizeValue.ofBytes(100L), ByteSizeValue.ofBytes(2L)), is(ByteSizeValue.ofBytes(2L)));
+        assertThat(ByteSizeValue.min(ByteSizeValue.ofBytes(2L), ByteSizeValue.ofBytes(100L)), is(ByteSizeValue.ofBytes(2L)));
+
+        assertThat(
+            ByteSizeValue.min(new ByteSizeValue(8, ByteSizeUnit.KB), new ByteSizeValue(4, ByteSizeUnit.KB)),
+            is(new ByteSizeValue(4, ByteSizeUnit.KB))
+        );
+        assertThat(
+            ByteSizeValue.min(new ByteSizeValue(4, ByteSizeUnit.MB), new ByteSizeValue(8, ByteSizeUnit.MB)),
+            is(new ByteSizeValue(4, ByteSizeUnit.MB))
+        );
+        assertThat(
+            ByteSizeValue.min(new ByteSizeValue(16, ByteSizeUnit.GB), new ByteSizeValue(15, ByteSizeUnit.GB)),
+            is(new ByteSizeValue(15, ByteSizeUnit.GB))
+        );
+        assertThat(
+            ByteSizeValue.min(new ByteSizeValue(90, ByteSizeUnit.TB), new ByteSizeValue(91, ByteSizeUnit.TB)),
+            is(new ByteSizeValue(90, ByteSizeUnit.TB))
+        );
+        assertThat(
+            ByteSizeValue.min(new ByteSizeValue(2, ByteSizeUnit.PB), new ByteSizeValue(1, ByteSizeUnit.PB)),
+            is(new ByteSizeValue(1, ByteSizeUnit.PB))
+        );
+        assertThat(
+            ByteSizeValue.min(new ByteSizeValue(1, ByteSizeUnit.PB), new ByteSizeValue(1, ByteSizeUnit.GB)),
+            is(new ByteSizeValue(1, ByteSizeUnit.GB))
+        );
+
+        ByteSizeValue equalityResult = ByteSizeValue.min(new ByteSizeValue(1024, ByteSizeUnit.MB), new ByteSizeValue(1, ByteSizeUnit.GB));
+        assertThat(equalityResult, is(new ByteSizeValue(1024, ByteSizeUnit.MB)));
+        assertThat(equalityResult.getUnit(), is(ByteSizeUnit.MB));
+
+        equalityResult = ByteSizeValue.min(new ByteSizeValue(1, ByteSizeUnit.GB), new ByteSizeValue(1024, ByteSizeUnit.MB));
+        assertThat(equalityResult, is(new ByteSizeValue(1, ByteSizeUnit.GB)));
+        assertThat(equalityResult.getUnit(), is(ByteSizeUnit.GB));
+
+        String exceptionMessage = "one of the arguments has -1 bytes";
+        Exception e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.min(ByteSizeValue.MINUS_ONE, ByteSizeValue.ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.min(ByteSizeValue.ONE, ByteSizeValue.MINUS_ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+
+        e = expectThrows(IllegalArgumentException.class, () -> ByteSizeValue.min(ByteSizeValue.MINUS_ONE, ByteSizeValue.MINUS_ONE));
+        assertThat(e.getMessage(), containsString(exceptionMessage));
+    }
 }
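Taken together, these assertions pin down the contract of the three new helpers. A minimal sketch of what they could look like inside ByteSizeValue, inferred purely from the tests above (the PR's actual implementation may differ):

    // Sketch only: behaviour inferred from the assertions in ByteSizeValueTests, not copied from the source change.
    public static ByteSizeValue add(ByteSizeValue x, ByteSizeValue y) {
        if (x.getBytes() == -1L || y.getBytes() == -1L) {
            throw new IllegalArgumentException("one of the arguments has -1 bytes");
        }
        return ByteSizeValue.ofBytes(Math.addExact(x.getBytes(), y.getBytes())); // overflow surfaces as ArithmeticException("long overflow")
    }

    public static ByteSizeValue subtract(ByteSizeValue x, ByteSizeValue y) {
        if (x.getBytes() == -1L || y.getBytes() == -1L) {
            throw new IllegalArgumentException("one of the arguments has -1 bytes");
        }
        ByteSizeValue result = ByteSizeValue.ofBytes(x.getBytes() - y.getBytes()); // ofBytes rejects anything below -1 bytes
        if (result.getBytes() == -1L) {
            throw new IllegalArgumentException("subtraction result has -1 bytes");
        }
        return result;
    }

    public static ByteSizeValue min(ByteSizeValue x, ByteSizeValue y) {
        if (x.getBytes() == -1L || y.getBytes() == -1L) {
            throw new IllegalArgumentException("one of the arguments has -1 bytes");
        }
        return x.compareTo(y) <= 0 ? x : y; // on a tie the first argument, and therefore its unit, wins
    }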

+ 18 - 5
server/src/test/java/org/elasticsearch/health/metadata/HealthMetadataSerializationTests.java

@@ -58,7 +58,9 @@ public class HealthMetadataSerializationTests extends SimpleDiffableWireSerializ
     private static HealthMetadata.Disk randomDiskMetadata() {
         return new HealthMetadata.Disk(
             randomRelativeByteSizeValue(),
+            ByteSizeValue.ofGb(randomIntBetween(10, 999)),
             randomRelativeByteSizeValue(),
+            ByteSizeValue.ofGb(randomIntBetween(10, 999)),
             randomRelativeByteSizeValue(),
             ByteSizeValue.ofGb(randomIntBetween(10, 999))
         );
@@ -74,16 +76,27 @@ public class HealthMetadataSerializationTests extends SimpleDiffableWireSerializ
 
     static HealthMetadata.Disk mutateDiskMetadata(HealthMetadata.Disk base) {
         RelativeByteSizeValue highWatermark = base.highWatermark();
+        ByteSizeValue highWatermarkMaxHeadRoom = base.highMaxHeadroom();
         RelativeByteSizeValue floodStageWatermark = base.floodStageWatermark();
+        ByteSizeValue floodStageWatermarkMaxHeadRoom = base.floodStageMaxHeadroom();
         RelativeByteSizeValue floodStageWatermarkFrozen = base.frozenFloodStageWatermark();
         ByteSizeValue floodStageWatermarkFrozenMaxHeadRoom = base.frozenFloodStageMaxHeadroom();
-        switch (randomInt(3)) {
+        switch (randomInt(5)) {
             case 0 -> highWatermark = randomRelativeByteSizeValue();
-            case 1 -> floodStageWatermark = randomRelativeByteSizeValue();
-            case 2 -> floodStageWatermarkFrozen = randomRelativeByteSizeValue();
-            case 3 -> ByteSizeValue.ofGb(randomIntBetween(10, 999));
+            case 1 -> highWatermarkMaxHeadRoom = ByteSizeValue.ofGb(randomIntBetween(10, 999));
+            case 2 -> floodStageWatermark = randomRelativeByteSizeValue();
+            case 3 -> floodStageWatermarkMaxHeadRoom = ByteSizeValue.ofGb(randomIntBetween(10, 999));
+            case 4 -> floodStageWatermarkFrozen = randomRelativeByteSizeValue();
+            case 5 -> floodStageWatermarkFrozenMaxHeadRoom = ByteSizeValue.ofGb(randomIntBetween(10, 999));
         }
-        return new HealthMetadata.Disk(highWatermark, floodStageWatermark, floodStageWatermarkFrozen, floodStageWatermarkFrozenMaxHeadRoom);
+        return new HealthMetadata.Disk(
+            highWatermark,
+            highWatermarkMaxHeadRoom,
+            floodStageWatermark,
+            floodStageWatermarkMaxHeadRoom,
+            floodStageWatermarkFrozen,
+            floodStageWatermarkFrozenMaxHeadRoom
+        );
     }
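The constructor calls and accessors used here imply that HealthMetadata.Disk now pairs every watermark with a max headroom. A plausible shape, sketched from this test rather than from the production class, would be:

    public record Disk(
        RelativeByteSizeValue highWatermark,
        ByteSizeValue highMaxHeadroom,
        RelativeByteSizeValue floodStageWatermark,
        ByteSizeValue floodStageMaxHeadroom,
        RelativeByteSizeValue frozenFloodStageWatermark,
        ByteSizeValue frozenFloodStageMaxHeadroom
    ) {
        // plus the existing wire-serialization and XContent plumbing
    }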
 
     private HealthMetadata mutate(HealthMetadata base) {