
Add index disk usage API (#74051)

This change adds a new API that supports analyzing the disk usage of 
each field of an index.
Nhat Nguyen 4 years ago
parent
commit
c23cb99b71
20 changed files with 2603 additions and 47 deletions
  1. 2 0
      docs/reference/indices.asciidoc
  2. 184 0
      docs/reference/indices/diskusage.asciidoc
  3. 61 0
      rest-api-spec/src/main/resources/rest-api-spec/api/indices.disk_usage.json
  4. 112 0
      rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.stats/50_disk_usage.yml
  5. 111 0
      server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerIT.java
  6. 6 0
      server/src/main/java/org/elasticsearch/action/ActionModule.java
  7. 56 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeDiskUsageShardRequest.java
  8. 37 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeDiskUsageShardResponse.java
  9. 20 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeIndexDiskUsageAction.java
  10. 73 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeIndexDiskUsageRequest.java
  11. 58 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeIndexDiskUsageResponse.java
  12. 738 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
  13. 265 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageStats.java
  14. 145 0
      server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/TransportAnalyzeIndexDiskUsageAction.java
  15. 92 0
      server/src/main/java/org/elasticsearch/common/lucene/FilterIndexCommit.java
  16. 4 47
      server/src/main/java/org/elasticsearch/index/engine/CombinedDeletionPolicy.java
  17. 6 0
      server/src/main/java/org/elasticsearch/index/store/LuceneFilesExtensions.java
  18. 75 0
      server/src/main/java/org/elasticsearch/rest/action/admin/indices/RestAnalyzeIndexDiskUsageAction.java
  19. 557 0
      server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
  20. 1 0
      x-pack/plugin/security/qa/operator-privileges-tests/src/javaRestTest/java/org/elasticsearch/xpack/security/operator/Constants.java
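For orientation, here is a condensed sketch of how the new transport action is invoked, based on the `IndexDiskUsageAnalyzerIT` integration test added in this commit. The index name `my-index` is illustrative, and the snippet assumes it runs inside a test in the `org.elasticsearch.action.admin.indices.diskusage` package (as the IT does), since the response's stats accessor is package-private.

[source,java]
--------------------------------------------------
// Condensed from IndexDiskUsageAnalyzerIT#testSimple (same package as the new action classes).
PlainActionFuture<AnalyzeIndexDiskUsageResponse> future = PlainActionFuture.newFuture();
client().execute(
    AnalyzeIndexDiskUsageAction.INSTANCE,
    new AnalyzeIndexDiskUsageRequest(
        new String[] { "my-index" },                          // indices to analyze
        AnalyzeIndexDiskUsageRequest.DEFAULT_INDICES_OPTIONS, // the request's default indices options
        true),                                                // flush before analyzing
    future);
IndexDiskUsageStats stats = future.actionGet().getStats().get("my-index");
--------------------------------------------------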

+ 2 - 0
docs/reference/indices.asciidoc

@@ -30,6 +30,7 @@ index settings, aliases, mappings, and index templates.
 * <<indices-put-mapping>>
 * <<indices-get-mapping>>
 * <<indices-get-field-mapping>>
+* <<indices-disk-usage>>
 
 [discrete]
 [[alias-management]]
@@ -92,6 +93,7 @@ For more information, see <<index-templates, Index Templates>>.
 include::indices/alias-exists.asciidoc[]
 include::indices/aliases.asciidoc[]
 include::indices/analyze.asciidoc[]
+include::indices/diskusage.asciidoc[]
 include::indices/clearcache.asciidoc[]
 include::indices/clone-index.asciidoc[]
 include::indices/close.asciidoc[]

+ 184 - 0
docs/reference/indices/diskusage.asciidoc

@@ -0,0 +1,184 @@
+[[indices-disk-usage]]
+=== Analyze index disk usage API
+++++
+<titleabbrev>Analyze index disk usage</titleabbrev>
+++++
+
+experimental[]
+
+Analyzes the disk usage of each field of an index or data stream.
+This API might not support indices created in previous {es} versions.
+The result for a small index can be inaccurate because some parts of an index
+might not be analyzed by the API.
+
+[source,console]
+--------------------------------------------------
+POST /my-index-000001/_disk_usage?run_expensive_tasks=true
+--------------------------------------------------
+// TEST[setup:messages]
+
+[[analyze-index-disk-usage-api-request]]
+==== {api-request-title}
+
+`POST /<target>/_disk_usage`
+
+[[analyze-index-disk-usage-api-request-prereqs]]
+==== {api-prereq-title}
+
+* If the {es} {security-features} are enabled, you must have the `manage`
+<<privileges-list-indices,index privilege>> for the target index, data stream,
+or alias.
+
+[[analyze-index-disk-usage-api-path-params]]
+==== {api-path-parms-title}
+
+`<target>`::
+(Required, string) Comma-separated list of data streams, indices, and aliases
+used to limit the request. It is recommended to run this API against a single
+index (or the latest backing index of a data stream) because the API consumes
+significant resources.
+
+[[analyze-index-disk-usage-api-query-params]]
+==== {api-query-parms-title}
+
+include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=allow-no-indices]
++
+Defaults to `true`.
+
+include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=expand-wildcards]
++
+Defaults to `open`.
+
+`flush`::
+(Optional, Boolean) If `true`, the API performs a flush before analysis. If
+`false`, the response may not include uncommitted data. Defaults to `true`.
+
+include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=index-ignore-unavailable]
+
+include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=timeoutparms]
+
+`run_expensive_tasks`::
+(Required, Boolean) Analyzing field disk usage is resource-intensive. To use the
+API, this parameter must be set to `true`. Defaults to `false`.
+
+include::{es-repo-dir}/rest-api/common-parms.asciidoc[tag=wait_for_active_shards]
+
+
+[[analyze-index-disk-usage-api-example]]
+==== {api-examples-title}
+
+[source,console]
+--------------------------------------------------
+POST /my-index-000001/_disk_usage?run_expensive_tasks=true
+--------------------------------------------------
+// TEST[setup:messages]
+
+The API returns:
+
+[source,console-response]
+--------------------------------------------------
+{
+    "_shards": {
+        "total": 1,
+        "successful": 1,
+        "failed": 0
+    },
+    "my-index-000001": {
+        "store_size": "929mb", <1>
+        "store_size_in_bytes": 974192723,
+        "all_fields": {
+            "total": "928.9mb", <2>
+            "total_in_bytes": 973977084,
+            "inverted_index": {
+                "total": "107.8mb",
+                "total_in_bytes": 113128526
+            },
+            "stored_fields": "623.5mb",
+            "stored_fields_in_bytes": 653819143,
+            "doc_values": "125.7mb",
+            "doc_values_in_bytes": 131885142,
+            "points": "59.9mb",
+            "points_in_bytes": 62885773,
+            "norms": "2.3kb",
+            "norms_in_bytes": 2356,
+            "term_vectors": "2.2kb",
+            "term_vectors_in_bytes": 2310
+        },
+        "fields": {
+            "_id": {
+                "total": "49.3mb",
+                "total_in_bytes": 51709993,
+                "inverted_index": {
+                    "total": "29.7mb",
+                    "total_in_bytes": 31172745
+                },
+                "stored_fields": "19.5mb", <3>
+                "stored_fields_in_bytes": 20537248,
+                "doc_values": "0b",
+                "doc_values_in_bytes": 0,
+                "points": "0b",
+                "points_in_bytes": 0,
+                "norms": "0b",
+                "norms_in_bytes": 0,
+                "term_vectors": "0b",
+                "term_vectors_in_bytes": 0
+            },
+            "_primary_term": {...},
+            "_seq_no": {...},
+            "_version": {...},
+            "_source": {
+                "total": "603.9mb",
+                "total_in_bytes": 633281895,
+                "inverted_index": {...},
+                "stored_fields": "603.9mb", <4>
+                "stored_fields_in_bytes": 633281895,
+                "doc_values": "0b",
+                "doc_values_in_bytes": 0,
+                "points": "0b",
+                "points_in_bytes": 0,
+                "norms": "0b",
+                "norms_in_bytes": 0,
+                "term_vectors": "0b",
+                "term_vectors_in_bytes": 0
+            },
+            "context": {
+                "total": "28.6mb",
+                "total_in_bytes": 30060405,
+                "inverted_index": {
+                    "total": "22mb",
+                    "total_in_bytes": 23090908
+                },
+                "stored_fields": "0b",
+                "stored_fields_in_bytes": 0,
+                "doc_values": "0b",
+                "doc_values_in_bytes": 0,
+                "points": "0b",
+                "points_in_bytes": 0,
+                "norms": "2.3kb",
+                "norms_in_bytes": 2356,
+                "term_vectors": "2.2kb",
+                "term_vectors_in_bytes": 2310
+            },
+            "context.keyword": {...},
+            "message": {...},
+            "message.keyword": {...}
+        }
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/: \{\.\.\.\}/: $body.$_path/]
+// TESTRESPONSE[s/: (\-)?[0-9]+/: $body.$_path/]
+// TESTRESPONSE[s/: "[^"]*"/: $body.$_path/]
+
+<1> The store size of only the analyzed shards of the index.
+
+<2> The total size of the fields of the analyzed shards of the index. This total
+is usually smaller than the index size shown in <1> because some small metadata
+files are ignored and some parts of data files might not be scanned by the API.
+
+<3> The stored size of the `_id` field.
+
+<4> The stored size of the `_source` field. As stored fields are stored
+together in a compressed format, the estimated sizes of stored fields are
+best-effort and can be inaccurate. The stored size of the `_id` field
+is likely underestimated, while that of the `_source` field is likely overestimated.
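Callouts <3> and <4> describe an estimate rather than an exact value: the analyzer tracks the uncompressed bytes each stored field contributes and scales them by the ratio of compressed bytes actually read from disk (see `IndexDiskUsageAnalyzer#analyzeStoredFields` further down in this diff). The following is a minimal, simplified sketch of that attribution; the class name, method name, and inputs are illustrative and not part of the commit.

[source,java]
--------------------------------------------------
import java.util.HashMap;
import java.util.Map;

final class StoredFieldEstimateSketch {
    /**
     * uncompressedPerField: bytes each field contributed before compression.
     * compressedBytesRead: bytes the directory actually read for the stored-fields files.
     */
    static Map<String, Long> estimate(Map<String, Long> uncompressedPerField, long compressedBytesRead) {
        final long totalUncompressed = uncompressedPerField.values().stream().mapToLong(Long::longValue).sum();
        final double ratio = (double) compressedBytesRead / (double) totalUncompressed;
        final Map<String, Long> estimates = new HashMap<>();
        // Each field gets a share proportional to its uncompressed contribution; highly compressible
        // fields such as _source therefore tend to be overestimated, while _id tends to be underestimated.
        uncompressedPerField.forEach((field, bytes) -> estimates.put(field, (long) Math.ceil(bytes * ratio)));
        return estimates;
    }
}
--------------------------------------------------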

+ 61 - 0
rest-api-spec/src/main/resources/rest-api-spec/api/indices.disk_usage.json

@@ -0,0 +1,61 @@
+{
+  "indices.disk_usage": {
+    "documentation": {
+      "url": "https://www.elastic.co/guide/en/elasticsearch/reference/master/indices-disk-usage.html",
+      "description": "Analyzes the disk usage of each field of an index or data stream"
+    },
+    "stability": "experimental",
+    "visibility": "public",
+    "headers": {
+      "accept": [
+        "application/json"
+      ]
+    },
+    "url": {
+      "paths": [
+        {
+          "path": "/{index}/_disk_usage",
+          "methods": [
+            "POST"
+          ],
+          "parts": {
+            "index": {
+              "type": "string",
+              "description": "Comma-separated list of indices or data streams to analyze the disk usage"
+            }
+          }
+        }
+      ]
+    },
+    "params": {
+      "run_expensive_tasks": {
+        "type": "boolean",
+        "description": "Must be set to [true] in order for the task to be performed. Defaults to false."
+      },
+      "flush": {
+        "type": "boolean",
+        "description": "Whether flush or not before analyzing the index disk usage. Defaults to true"
+      },
+      "ignore_unavailable": {
+        "type": "boolean",
+        "description": "Whether specified concrete indices should be ignored when unavailable (missing or closed)"
+      },
+      "allow_no_indices": {
+        "type": "boolean",
+        "description": "Whether to ignore if a wildcard indices expression resolves into no concrete indices. (This includes `_all` string or when no indices have been specified)"
+      },
+      "expand_wildcards": {
+        "type": "enum",
+        "options": [
+          "open",
+          "closed",
+          "hidden",
+          "none",
+          "all"
+        ],
+        "default": "open",
+        "description": "Whether to expand wildcard expression to concrete indices that are open, closed or both."
+      }
+    }
+  }
+}
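The spec above only describes the endpoint; below is a hedged sketch of issuing the same request through the low-level Java REST client. The host, port, and index name are assumptions for illustration and are not part of this commit.

[source,java]
--------------------------------------------------
import org.apache.http.HttpHost;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class DiskUsageRequestExample {
    public static void main(String[] args) throws Exception {
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            // POST /<index>/_disk_usage with the mandatory opt-in flag defined in the spec above.
            Request request = new Request("POST", "/my-index-000001/_disk_usage");
            request.addParameter("run_expensive_tasks", "true");
            Response response = client.performRequest(request);
            System.out.println(response.getStatusLine());
        }
    }
}
--------------------------------------------------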

+ 112 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.stats/50_disk_usage.yml

@@ -0,0 +1,112 @@
+---
+setup:
+  - skip:
+      version: " - 7.99.99"
+      reason: analyze index disk usage API is introduced in 8.0
+
+---
+"Disk usage stats":
+  - do:
+      indices.create:
+        index: testindex
+        body:
+          mappings:
+            properties:
+              name:
+                type: text
+              quantity:
+                type: long
+                doc_values: false
+              genre:
+                type: keyword
+                doc_values: true
+  - do:
+      index:
+        index: testindex
+        body: { "name": "foo", "quantity": 0, "genre": [ "rock", "pop" ] }
+  - do:
+      index:
+        index: testindex
+        body: { "name": "bar", "quantity": 99, "genre": "pop" }
+  - do:
+      index:
+        index: testindex
+        body: { "name": "baz", "quantity": 50, "genre": "jazz" }
+  - do:
+      index:
+        index: testindex
+        body: { "name": "bar & baz", "quantity": 1000, "genre": "blue" }
+  - do:
+      index:
+        index: testindex
+        body: { "name": "foobar", "quantity": 1000, "genre": "country" }
+  - do:
+      indices.disk_usage: { index: "testindex", "run_expensive_tasks": true }
+
+  - gt: { testindex.store_size_in_bytes: 100 }
+  # all_fields
+  - set: { testindex.all_fields: all }
+  - gt: { $all.total_in_bytes: 0 }
+  - gt: { $all.inverted_index.total_in_bytes: 0 }
+  - gt: { $all.stored_fields_in_bytes: 0 }
+  - gt: { $all.doc_values_in_bytes: 0 }
+  - gt: { $all.points_in_bytes: 0 }
+  - match: { $all.term_vectors_in_bytes: 0 }
+
+  # genre
+  - set: { testindex.fields.genre: genre }
+  - gt: { $genre.total_in_bytes: 0 }
+  - gt: { $genre.inverted_index.total_in_bytes: 0 }
+  - match: { $genre.stored_fields_in_bytes: 0 }
+  - gt: { $genre.doc_values_in_bytes: 0 }
+  - match: { $genre.points_in_bytes: 0 }
+  - match: { $genre.norms_in_bytes: 0 }
+  - match: { $genre.term_vectors_in_bytes: 0 }
+
+  # name
+  - set: { testindex.fields.name: name }
+  - gt: { $name.total_in_bytes: 0 }
+  - gt: { $name.inverted_index.total_in_bytes: 0 }
+  - match: { $name.stored_fields_in_bytes: 0 }
+  - match: { $name.doc_values_in_bytes: 0 }
+  - match: { $name.points_in_bytes: 0 }
+  - match: { $name.term_vectors_in_bytes: 0 }
+
+  # quantity
+  - set: { testindex.fields.quantity: quantity }
+  - gt: { $quantity.total_in_bytes: 0 }
+  - match: { $quantity.inverted_index.total_in_bytes: 0 }
+  - match: { $quantity.stored_fields_in_bytes: 0 }
+  - match: { $quantity.doc_values_in_bytes: 0 }
+  - gt: { $quantity.points_in_bytes: 0 }
+  - match: { $quantity.norms_in_bytes: 0 }
+  - match: { $quantity.term_vectors_in_bytes: 0 }
+
+  # _source
+  - set: { testindex.fields._source: source }
+  - gt: { $source.total_in_bytes: 0 }
+  - match: { $source.inverted_index.total_in_bytes: 0 }
+  - gt: { $source.stored_fields_in_bytes: 0 }
+  - match: { $source.doc_values_in_bytes: 0 }
+  - match: { $source.points_in_bytes: 0 }
+  - match: { $source.norms_in_bytes: 0 }
+  - match: { $source.term_vectors_in_bytes: 0 }
+
+  # _id
+  - set: { testindex.fields._id: id }
+  - gt: { $id.total_in_bytes: 0 }
+  - gt: { $id.inverted_index.total_in_bytes: 0 }
+  - gt: { $id.stored_fields_in_bytes: 0 }
+  - match: { $id.doc_values_in_bytes: 0 }
+  - match: { $id.points_in_bytes: 0 }
+  - match: { $id.norms_in_bytes: 0 }
+  - match: { $id.term_vectors_in_bytes: 0 }
+
+  # _seq_no
+  - set: { testindex.fields._seq_no: seqno }
+  - gt: { $seqno.total_in_bytes: 0 }
+  - match: { $seqno.inverted_index.total_in_bytes: 0 }
+  - match: { $seqno.stored_fields_in_bytes: 0 }
+  - gt: { $seqno.points_in_bytes: 0 }
+  - match: { $seqno.norms_in_bytes: 0 }
+  - match: { $seqno.term_vectors_in_bytes: 0 }

+ 111 - 0
server/src/internalClusterTest/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerIT.java

@@ -0,0 +1,111 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.apache.lucene.util.English;
+import org.elasticsearch.action.support.PlainActionFuture;
+import org.elasticsearch.cluster.metadata.IndexMetadata;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.test.ESIntegTestCase;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
+
+public class IndexDiskUsageAnalyzerIT extends ESIntegTestCase {
+
+    public void testSimple() throws Exception {
+        final XContentBuilder mapping = XContentFactory.jsonBuilder();
+        mapping.startObject();
+        {
+            mapping.startObject("_doc");
+            {
+                mapping.startObject("properties");
+                {
+                    mapping.startObject("english_text");
+                    mapping.field("type", "text");
+                    mapping.endObject();
+
+                    mapping.startObject("value");
+                    mapping.field("type", "long");
+                    mapping.endObject();
+                }
+                mapping.endObject();
+            }
+            mapping.endObject();
+        }
+        mapping.endObject();
+
+        final String index = "test-index";
+        client().admin().indices().prepareCreate(index)
+            .setMapping(mapping)
+            .setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, between(1, 5)))
+            .get();
+        ensureGreen(index);
+
+        int numDocs = randomIntBetween(10, 100);
+        for (int i = 0; i < numDocs; i++) {
+            int value = randomIntBetween(1, 1024);
+            final XContentBuilder doc = XContentFactory.jsonBuilder()
+                .startObject()
+                .field("english_text", English.intToEnglish(value))
+                .field("value", value)
+                .endObject();
+            client().prepareIndex(index)
+                .setId("id-" + i)
+                .setSource(doc)
+                .get();
+        }
+        PlainActionFuture<AnalyzeIndexDiskUsageResponse> future = PlainActionFuture.newFuture();
+        client().execute(AnalyzeIndexDiskUsageAction.INSTANCE,
+            new AnalyzeIndexDiskUsageRequest(new String[] {index}, AnalyzeIndexDiskUsageRequest.DEFAULT_INDICES_OPTIONS, true),
+            future);
+
+        AnalyzeIndexDiskUsageResponse resp = future.actionGet();
+        final IndexDiskUsageStats stats = resp.getStats().get(index);
+        logger.info("--> stats {}", stats);
+        assertNotNull(stats);
+        assertThat(stats.getIndexSizeInBytes(), greaterThan(100L));
+
+        final IndexDiskUsageStats.PerFieldDiskUsage englishField = stats.getFields().get("english_text");
+        assertThat(englishField.getInvertedIndexBytes(), greaterThan(0L));
+        assertThat(englishField.getStoredFieldBytes(), equalTo(0L));
+        assertThat(englishField.getNormsBytes(), greaterThan(0L));
+
+        final IndexDiskUsageStats.PerFieldDiskUsage valueField = stats.getFields().get("value");
+        assertThat(valueField.getInvertedIndexBytes(), equalTo(0L));
+        assertThat(valueField.getStoredFieldBytes(), equalTo(0L));
+        assertThat(valueField.getPointsBytes(), greaterThan(0L));
+        assertThat(valueField.getDocValuesBytes(), greaterThan(0L));
+
+        assertMetadataFields(stats);
+    }
+
+
+    void assertMetadataFields(IndexDiskUsageStats stats) {
+        final IndexDiskUsageStats.PerFieldDiskUsage sourceField = stats.getFields().get("_source");
+        assertThat(sourceField.getInvertedIndexBytes(), equalTo(0L));
+        assertThat(sourceField.getStoredFieldBytes(), greaterThan(0L));
+        assertThat(sourceField.getPointsBytes(), equalTo(0L));
+        assertThat(sourceField.getDocValuesBytes(), equalTo(0L));
+
+        final IndexDiskUsageStats.PerFieldDiskUsage idField = stats.getFields().get("_id");
+        assertThat(idField.getInvertedIndexBytes(), greaterThan(0L));
+        assertThat(idField.getStoredFieldBytes(), greaterThan(0L));
+        assertThat(idField.getPointsBytes(), equalTo(0L));
+        assertThat(idField.getDocValuesBytes(), equalTo(0L));
+
+        final IndexDiskUsageStats.PerFieldDiskUsage seqNoField = stats.getFields().get("_seq_no");
+        assertThat(seqNoField.getInvertedIndexBytes(), equalTo(0L));
+        assertThat(seqNoField.getStoredFieldBytes(), equalTo(0L));
+        assertThat(seqNoField.getPointsBytes(), greaterThan(0L));
+        assertThat(seqNoField.getDocValuesBytes(), greaterThan(0L));
+    }
+}

+ 6 - 0
server/src/main/java/org/elasticsearch/action/ActionModule.java

@@ -150,7 +150,9 @@ import org.elasticsearch.action.admin.indices.shards.IndicesShardStoresAction;
 import org.elasticsearch.action.admin.indices.shards.TransportIndicesShardStoresAction;
 import org.elasticsearch.action.admin.indices.shrink.ResizeAction;
 import org.elasticsearch.action.admin.indices.shrink.TransportResizeAction;
+import org.elasticsearch.action.admin.indices.diskusage.AnalyzeIndexDiskUsageAction;
 import org.elasticsearch.action.admin.indices.stats.IndicesStatsAction;
+import org.elasticsearch.action.admin.indices.diskusage.TransportAnalyzeIndexDiskUsageAction;
 import org.elasticsearch.action.admin.indices.stats.TransportIndicesStatsAction;
 import org.elasticsearch.action.admin.indices.template.delete.DeleteComponentTemplateAction;
 import org.elasticsearch.action.admin.indices.template.delete.DeleteComposableIndexTemplateAction;
@@ -300,6 +302,7 @@ import org.elasticsearch.rest.action.admin.cluster.dangling.RestImportDanglingIn
 import org.elasticsearch.rest.action.admin.cluster.dangling.RestListDanglingIndicesAction;
 import org.elasticsearch.rest.action.admin.indices.RestAddIndexBlockAction;
 import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;
+import org.elasticsearch.rest.action.admin.indices.RestAnalyzeIndexDiskUsageAction;
 import org.elasticsearch.rest.action.admin.indices.RestClearIndicesCacheAction;
 import org.elasticsearch.rest.action.admin.indices.RestCloseIndexAction;
 import org.elasticsearch.rest.action.admin.indices.RestCreateIndexAction;
@@ -580,6 +583,7 @@ public class ActionModule extends AbstractModule {
         actions.register(NodesReloadSecureSettingsAction.INSTANCE, TransportNodesReloadSecureSettingsAction.class);
         actions.register(AutoCreateAction.INSTANCE, AutoCreateAction.TransportAction.class);
         actions.register(ResolveIndexAction.INSTANCE, ResolveIndexAction.TransportAction.class);
+        actions.register(AnalyzeIndexDiskUsageAction.INSTANCE, TransportAnalyzeIndexDiskUsageAction.class);
 
         //Indexed scripts
         actions.register(PutStoredScriptAction.INSTANCE, TransportPutStoredScriptAction.class);
@@ -792,6 +796,8 @@ public class ActionModule extends AbstractModule {
         registerHandler.accept(new RestRepositoriesAction());
         registerHandler.accept(new RestSnapshotAction());
         registerHandler.accept(new RestTemplatesAction());
+        registerHandler.accept(new RestAnalyzeIndexDiskUsageAction());
+
         for (ActionPlugin plugin : actionPlugins) {
             for (RestHandler handler : plugin.getRestHandlers(settings, restController, clusterSettings, indexScopedSettings,
                     settingsFilter, indexNameExpressionResolver, nodesInCluster)) {

+ 56 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeDiskUsageShardRequest.java

@@ -0,0 +1,56 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.elasticsearch.action.support.broadcast.BroadcastShardRequest;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.tasks.CancellableTask;
+import org.elasticsearch.tasks.Task;
+import org.elasticsearch.tasks.TaskId;
+
+import java.io.IOException;
+import java.util.Map;
+
+final class AnalyzeDiskUsageShardRequest extends BroadcastShardRequest {
+    final boolean flush;
+
+    AnalyzeDiskUsageShardRequest(ShardId shardId, AnalyzeIndexDiskUsageRequest request) {
+        super(shardId, request);
+        this.flush = request.flush;
+    }
+
+
+    AnalyzeDiskUsageShardRequest(StreamInput in) throws IOException {
+        super(in);
+        this.flush = in.readBoolean();
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+        out.writeBoolean(flush);
+    }
+
+    @Override
+    public Task createTask(long id, String type, String action, TaskId parentTaskId, Map<String, String> headers) {
+        return new CancellableTask(id, type, action, "", parentTaskId, headers) {
+            @Override
+            public String getDescription() {
+                return AnalyzeDiskUsageShardRequest.this.getDescription();
+            }
+        };
+    }
+
+    @Override
+    public String getDescription() {
+        return "Analyze disk usage shard [" + shardId() + "], flush [" + flush + "]";
+    }
+}

+ 37 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeDiskUsageShardResponse.java

@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.elasticsearch.action.support.broadcast.BroadcastShardResponse;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.index.shard.ShardId;
+
+import java.io.IOException;
+import java.util.Objects;
+
+final class AnalyzeDiskUsageShardResponse extends BroadcastShardResponse {
+    final IndexDiskUsageStats stats;
+
+    AnalyzeDiskUsageShardResponse(StreamInput in) throws IOException {
+        super(in);
+        stats = new IndexDiskUsageStats(in);
+    }
+
+    AnalyzeDiskUsageShardResponse(ShardId shardId, IndexDiskUsageStats stats) {
+        super(shardId);
+        this.stats = Objects.requireNonNull(stats, "stats must be non null");
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+        stats.writeTo(out);
+    }
+}

+ 20 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeIndexDiskUsageAction.java

@@ -0,0 +1,20 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.elasticsearch.action.ActionType;
+
+public class AnalyzeIndexDiskUsageAction extends ActionType<AnalyzeIndexDiskUsageResponse> {
+    public static final AnalyzeIndexDiskUsageAction INSTANCE = new AnalyzeIndexDiskUsageAction();
+    public static final String NAME = "indices:admin/analyze_disk_usage";
+
+    public AnalyzeIndexDiskUsageAction() {
+        super(NAME, AnalyzeIndexDiskUsageResponse::new);
+    }
+}

+ 73 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeIndexDiskUsageRequest.java

@@ -0,0 +1,73 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.elasticsearch.action.ActionRequestValidationException;
+import org.elasticsearch.action.support.IndicesOptions;
+import org.elasticsearch.action.support.broadcast.BroadcastRequest;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.tasks.CancellableTask;
+import org.elasticsearch.tasks.Task;
+import org.elasticsearch.tasks.TaskId;
+
+import java.io.IOException;
+import java.util.Map;
+
+import static org.elasticsearch.action.ValidateActions.addValidationError;
+
+public class AnalyzeIndexDiskUsageRequest extends BroadcastRequest<AnalyzeIndexDiskUsageRequest> {
+    public static final IndicesOptions DEFAULT_INDICES_OPTIONS = IndicesOptions.fromOptions(false, false, false, true);
+    final boolean flush;
+
+    public AnalyzeIndexDiskUsageRequest(String[] indices, IndicesOptions indicesOptions, boolean flush) {
+        super(indices, indicesOptions);
+        this.flush = flush;
+    }
+
+    public AnalyzeIndexDiskUsageRequest(StreamInput in) throws IOException {
+        super(in);
+        this.flush = in.readBoolean();
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+        out.writeBoolean(flush);
+    }
+
+    @Override
+    public ActionRequestValidationException validate() {
+        ActionRequestValidationException validationError = super.validate();
+        if (indices.length == 0) {
+            validationError = addValidationError("indices must be specified for disk usage request", validationError);
+        }
+        return validationError;
+    }
+
+    @Override
+    public void setParentTask(String parentTaskNode, long parentTaskId) {
+        super.setParentTask(parentTaskNode, parentTaskId);
+    }
+
+    @Override
+    public Task createTask(long id, String type, String action, TaskId parentTaskId, Map<String, String> headers) {
+        return new CancellableTask(id, AnalyzeIndexDiskUsageAction.NAME, type, "", parentTaskId, headers) {
+            @Override
+            public String getDescription() {
+                return AnalyzeIndexDiskUsageRequest.this.getDescription();
+            }
+        };
+    }
+
+    @Override
+    public String getDescription() {
+        return "analyze disk usage indices [" + String.join(",", indices) + "], flush [" + flush + "]";
+    }
+}

+ 58 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/AnalyzeIndexDiskUsageResponse.java

@@ -0,0 +1,58 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.elasticsearch.action.support.DefaultShardOperationFailedException;
+import org.elasticsearch.action.support.broadcast.BroadcastResponse;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public final class AnalyzeIndexDiskUsageResponse extends BroadcastResponse {
+    private final Map<String, IndexDiskUsageStats> stats;
+
+    AnalyzeIndexDiskUsageResponse(int totalShards, int successfulShards, int failedShards,
+                                  List<DefaultShardOperationFailedException> shardFailures,
+                                  Map<String, IndexDiskUsageStats> stats) {
+        super(totalShards, successfulShards, failedShards, shardFailures);
+        this.stats = stats;
+    }
+
+    AnalyzeIndexDiskUsageResponse(StreamInput in) throws IOException {
+        super(in);
+        stats = in.readMap(StreamInput::readString, IndexDiskUsageStats::new);
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+        out.writeMap(stats, StreamOutput::writeString, (o, v) -> v.writeTo(o));
+    }
+
+    Map<String, IndexDiskUsageStats> getStats() {
+        return stats;
+    }
+
+    @Override
+    protected void addCustomXContentFields(XContentBuilder builder, Params params) throws IOException {
+        final List<Map.Entry<String, IndexDiskUsageStats>> entries = stats.entrySet().stream()
+            .sorted(Map.Entry.comparingByKey())
+            .collect(Collectors.toList());
+        for (Map.Entry<String, IndexDiskUsageStats> entry : entries) {
+            builder.startObject(entry.getKey());
+            entry.getValue().toXContent(builder, params);
+            builder.endObject();
+        }
+    }
+}

+ 738 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java

@@ -0,0 +1,738 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.codecs.PointsReader;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.TermVectorsReader;
+import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
+import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexCommit;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SegmentReader;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FilterDirectory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FutureArrays;
+import org.elasticsearch.common.CheckedSupplier;
+import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.common.lucene.FilterIndexCommit;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.core.CheckedConsumer;
+import org.elasticsearch.core.internal.io.IOUtils;
+import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.index.store.LuceneFilesExtensions;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * Analyze the disk usage of each field in the index.
+ */
+final class IndexDiskUsageAnalyzer {
+    private final Logger logger;
+    private final IndexCommit commit;
+    private final TrackingReadBytesDirectory directory;
+    private final CancellationChecker cancellationChecker;
+
+    private IndexDiskUsageAnalyzer(ShardId shardId, IndexCommit commit, Runnable checkForCancellation) {
+        this.logger = Loggers.getLogger(IndexDiskUsageAnalyzer.class, shardId);
+        this.directory = new TrackingReadBytesDirectory(commit.getDirectory());
+        this.commit = new FilterIndexCommit(commit) {
+            @Override
+            public Directory getDirectory() {
+                return directory;
+            }
+        };
+        this.cancellationChecker = new CancellationChecker(checkForCancellation);
+    }
+
+    static IndexDiskUsageStats analyze(ShardId shardId, IndexCommit commit, Runnable checkForCancellation) throws IOException {
+        final IndexDiskUsageAnalyzer analyzer = new IndexDiskUsageAnalyzer(shardId, commit, checkForCancellation);
+        final IndexDiskUsageStats stats = new IndexDiskUsageStats(getIndexSize(commit));
+        analyzer.doAnalyze(stats);
+        return stats;
+    }
+
+    void doAnalyze(IndexDiskUsageStats stats) throws IOException {
+        long startTimeInNanos;
+        final ExecutionTime executionTime = new ExecutionTime();
+        try (DirectoryReader directoryReader = DirectoryReader.open(commit)) {
+            directory.resetBytesRead();
+            for (LeafReaderContext leaf : directoryReader.leaves()) {
+                cancellationChecker.checkForCancellation();
+                final SegmentReader reader = Lucene.segmentReader(leaf.reader());
+
+                startTimeInNanos = System.nanoTime();
+                analyzeInvertedIndex(reader, stats);
+                executionTime.invertedIndexTimeInNanos += System.nanoTime() - startTimeInNanos;
+
+                startTimeInNanos = System.nanoTime();
+                analyzeStoredFields(reader, stats);
+                executionTime.storedFieldsTimeInNanos += System.nanoTime() - startTimeInNanos;
+
+                startTimeInNanos = System.nanoTime();
+                analyzeDocValues(reader, stats);
+                executionTime.docValuesTimeInNanos += System.nanoTime() - startTimeInNanos;
+
+                startTimeInNanos = System.nanoTime();
+                analyzePoints(reader, stats);
+                executionTime.pointsTimeInNanos += System.nanoTime() - startTimeInNanos;
+
+                startTimeInNanos = System.nanoTime();
+                analyzeNorms(reader, stats);
+                executionTime.normsTimeInNanos += System.nanoTime() - startTimeInNanos;
+
+                startTimeInNanos = System.nanoTime();
+                analyzeTermVectors(reader, stats);
+                executionTime.termVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;
+            }
+        }
+        logger.debug("analyzing the disk usage took {} stats: {}", executionTime, stats);
+    }
+
+    void analyzeStoredFields(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        final StoredFieldsReader storedFieldsReader = reader.getFieldsReader().getMergeInstance();
+        directory.resetBytesRead();
+        final TrackingSizeStoredFieldVisitor visitor = new TrackingSizeStoredFieldVisitor();
+        int docID = 0;
+        final int skipMask = 0x1FF; // 511
+        while (docID < reader.maxDoc()) {
+            cancellationChecker.logEvent();
+            storedFieldsReader.visitDocument(docID, visitor);
+            // As we only estimate the size of stored fields, we can trade off accuracy for the speed of the estimate.
+            // Here we only visit 1/11 of the documents instead of all of them. Ideally, we would visit 1 doc then skip 10 docs
+            // to avoid missing skewed documents. But since documents are stored in compressed chunks and a chunk can
+            // have up to 4096 docs, we need to skip a large number of docs to avoid loading/decompressing some chunks.
+            if ((docID & skipMask) == skipMask && docID < reader.maxDoc() - 512) {
+                docID = Math.toIntExact(Math.min(docID + 5120L, reader.maxDoc() - 512L)); // always visit both ends
+            } else {
+                docID++;
+            }
+        }
+        if (visitor.fields.isEmpty() == false) {
+            // Computing the compression ratio for each chunk would provide a better estimate for each field individually.
+            // But it's okay to do this over the entire segment because _source and _id are the only two stored fields in ES in most cases.
+            final long totalBytes = visitor.fields.values().stream().mapToLong(v -> v).sum();
+            final double ratio = (double) directory.getBytesRead() / (double) totalBytes;
+            final FieldInfos fieldInfos = reader.getFieldInfos();
+            for (Map.Entry<Integer, Long> field : visitor.fields.entrySet()) {
+                final String fieldName = fieldInfos.fieldInfo(field.getKey()).name;
+                final long fieldSize = (long) Math.ceil(field.getValue() * ratio);
+                stats.addStoredField(fieldName, fieldSize);
+            }
+        }
+    }
+
+    private static class TrackingSizeStoredFieldVisitor extends StoredFieldVisitor {
+        private final Map<Integer, Long> fields = new HashMap<>();
+
+        private void trackField(FieldInfo fieldInfo, int fieldLength) {
+            final int totalBytes = fieldLength + Long.BYTES; // a Long for bitsAndInfo
+            fields.compute(fieldInfo.number, (k, v) -> v == null ? totalBytes : v + totalBytes);
+        }
+
+        @Override
+        public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
+            trackField(fieldInfo, Integer.BYTES + value.length);
+        }
+
+        @Override
+        public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
+            trackField(fieldInfo, Integer.BYTES + value.length);
+        }
+
+        @Override
+        public void intField(FieldInfo fieldInfo, int value) throws IOException {
+            trackField(fieldInfo, Integer.BYTES);
+        }
+
+        @Override
+        public void longField(FieldInfo fieldInfo, long value) throws IOException {
+            trackField(fieldInfo, Long.BYTES);
+        }
+
+        @Override
+        public void floatField(FieldInfo fieldInfo, float value) throws IOException {
+            trackField(fieldInfo, Float.BYTES);
+        }
+
+        @Override
+        public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
+            trackField(fieldInfo, Double.BYTES);
+        }
+
+        @Override
+        public Status needsField(FieldInfo fieldInfo) throws IOException {
+            return Status.YES;
+        }
+    }
+
+    private <DV extends DocIdSetIterator> DV iterateDocValues(int maxDocs,
+                                                              CheckedSupplier<DV, IOException> dvReader,
+                                                              CheckedConsumer<DV, IOException> valueAccessor) throws IOException {
+        // As we track the min/max positions of read bytes, we just visit the first and last values of the docValues iterator.
+        // Here we use a binary-search-like approach to find the right-most doc that has a value.
+        DV dv = dvReader.get();
+        int docID;
+        if ((docID = dv.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+            valueAccessor.accept(dv);
+            long left = docID;
+            long right = 2L * (maxDocs - 1L) - left; // starts with the last index
+            while (left < maxDocs - 1L && left <= right) {
+                cancellationChecker.logEvent();
+                final int mid = Math.toIntExact((left + right) >>> 1);
+                if ((docID = dv.advance(mid)) != DocIdSetIterator.NO_MORE_DOCS) {
+                    valueAccessor.accept(dv);
+                    left = docID + 1;
+                } else {
+                    right = mid - 1;
+                    dv = dvReader.get();
+                }
+            }
+            assert dv.advance(Math.toIntExact(left + 1)) == DocIdSetIterator.NO_MORE_DOCS;
+        }
+        return dv;
+    }
+
+    void analyzeDocValues(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        if (reader.getDocValuesReader() == null) {
+            return;
+        }
+        final DocValuesProducer docValuesReader = reader.getDocValuesReader().getMergeInstance();
+        final int maxDocs = reader.maxDoc();
+        for (FieldInfo field : reader.getFieldInfos()) {
+            final DocValuesType dvType = field.getDocValuesType();
+            if (dvType == DocValuesType.NONE) {
+                continue;
+            }
+            cancellationChecker.checkForCancellation();
+            directory.resetBytesRead();
+            switch (dvType) {
+                case NUMERIC:
+                    iterateDocValues(maxDocs, () -> docValuesReader.getNumeric(field), NumericDocValues::longValue);
+                    break;
+                case SORTED_NUMERIC:
+                    iterateDocValues(maxDocs, () -> docValuesReader.getSortedNumeric(field), dv -> {
+                        for (int i = 0; i < dv.docValueCount(); i++) {
+                            cancellationChecker.logEvent();
+                            dv.nextValue();
+                        }
+                    });
+                    break;
+                case BINARY:
+                    iterateDocValues(maxDocs, () -> docValuesReader.getBinary(field), BinaryDocValues::binaryValue);
+                    break;
+                case SORTED:
+                    SortedDocValues sorted = iterateDocValues(maxDocs, () -> docValuesReader.getSorted(field), SortedDocValues::ordValue);
+                    sorted.lookupOrd(0);
+                    sorted.lookupOrd(sorted.getValueCount() - 1);
+                    break;
+                case SORTED_SET:
+                    SortedSetDocValues sortedSet = iterateDocValues(maxDocs, () -> docValuesReader.getSortedSet(field), dv -> {
+                        while (dv.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
+                            cancellationChecker.logEvent();
+                        }
+                    });
+                    sortedSet.lookupOrd(0);
+                    sortedSet.lookupOrd(sortedSet.getValueCount() - 1);
+                    break;
+                default:
+                    assert false : "Unknown docValues type [" + dvType + "]";
+                    throw new IllegalStateException("Unknown docValues type [" + dvType + "]");
+            }
+            stats.addDocValues(field.name, directory.getBytesRead());
+        }
+    }
+
+    private void readProximity(Terms terms, PostingsEnum postings) throws IOException {
+        if (terms.hasPositions()) {
+            for (int pos = 0; pos < postings.freq(); pos++) {
+                postings.nextPosition();
+                postings.startOffset();
+                postings.endOffset();
+                postings.getPayload();
+            }
+        }
+    }
+
+    private BlockTermState getBlockTermState(TermsEnum termsEnum, BytesRef term) throws IOException {
+        if (term != null && termsEnum.seekExact(term)) {
+            final TermState termState = termsEnum.termState();
+            if (termState instanceof Lucene84PostingsFormat.IntBlockTermState) {
+                final Lucene84PostingsFormat.IntBlockTermState blockTermState = (Lucene84PostingsFormat.IntBlockTermState) termState;
+                return new BlockTermState(blockTermState.docStartFP, blockTermState.posStartFP, blockTermState.payStartFP);
+            }
+            if (termState instanceof Lucene50PostingsFormat.IntBlockTermState) {
+                final Lucene50PostingsFormat.IntBlockTermState blockTermState = (Lucene50PostingsFormat.IntBlockTermState) termState;
+                return new BlockTermState(blockTermState.docStartFP, blockTermState.posStartFP, blockTermState.payStartFP);
+            }
+        }
+        return null;
+    }
+
+    private static class BlockTermState {
+        final long docStartFP;
+        final long posStartFP;
+        final long payloadFP;
+
+        BlockTermState(long docStartFP, long posStartFP, long payloadFP) {
+            this.docStartFP = docStartFP;
+            this.posStartFP = posStartFP;
+            this.payloadFP = payloadFP;
+        }
+
+        long distance(BlockTermState other) {
+            return this.docStartFP - other.docStartFP + this.posStartFP - other.posStartFP + this.payloadFP - other.payloadFP;
+        }
+    }
+
+    void analyzeInvertedIndex(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        FieldsProducer postingsReader = reader.getPostingsReader();
+        if (postingsReader == null) {
+            return;
+        }
+        postingsReader = postingsReader.getMergeInstance();
+        PostingsEnum postings = null;
+        for (FieldInfo field : reader.getFieldInfos()) {
+            if (field.getIndexOptions() == IndexOptions.NONE) {
+                continue;
+            }
+            cancellationChecker.checkForCancellation();
+            directory.resetBytesRead();
+            final Terms terms = postingsReader.terms(field.name);
+            if (terms == null) {
+                continue;
+            }
+            // It's expensive to look up every term and visit every document of the postings lists of all terms.
+            // As we track the min/max positions of read bytes, we just visit the two ends of a partition containing
+            // the data. We might miss some small parts of the data, but it's a good trade-off to speed up the process.
+            TermsEnum termsEnum = terms.iterator();
+            final BlockTermState minState = getBlockTermState(termsEnum, terms.getMin());
+            if (minState != null) {
+                final BlockTermState maxState = Objects.requireNonNull(
+                    getBlockTermState(termsEnum, terms.getMax()), "can't retrieve the block term state of the max term");
+                final long skippedBytes = maxState.distance(minState);
+                stats.addInvertedIndex(field.name, skippedBytes);
+                termsEnum.seekExact(terms.getMax());
+                postings = termsEnum.postings(postings, PostingsEnum.ALL);
+                if (postings.advance(termsEnum.docFreq() - 1) != DocIdSetIterator.NO_MORE_DOCS) {
+                    postings.freq();
+                    readProximity(terms, postings);
+                }
+                final long bytesRead = directory.getBytesRead();
+                int visitedTerms = 0;
+                final long totalTerms = terms.size();
+                termsEnum = terms.iterator();
+                // Iterate until we really access the first terms, but iterate all if the number of terms is small
+                while (termsEnum.next() != null) {
+                    cancellationChecker.logEvent();
+                    ++visitedTerms;
+                    if (totalTerms > 1000 && visitedTerms % 50 == 0 && directory.getBytesRead() > bytesRead) {
+                        break;
+                    }
+                }
+            } else {
+                // We aren't sure whether the optimization can be applied to implementations other than the BlockTree-based
+                // one. Hence, we just traverse the postings of every term in this case.
+                while (termsEnum.next() != null) {
+                    cancellationChecker.logEvent();
+                    termsEnum.docFreq();
+                    termsEnum.totalTermFreq();
+                    postings = termsEnum.postings(postings, PostingsEnum.ALL);
+                    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                        cancellationChecker.logEvent();
+                        postings.freq();
+                        readProximity(terms, postings);
+                    }
+                }
+            }
+            stats.addInvertedIndex(field.name, directory.getBytesRead());
+        }
+    }
+
+    void analyzePoints(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        PointsReader pointsReader = reader.getPointsReader();
+        if (pointsReader == null) {
+            return;
+        }
+        pointsReader = pointsReader.getMergeInstance();
+        for (FieldInfo field : reader.getFieldInfos()) {
+            cancellationChecker.checkForCancellation();
+            directory.resetBytesRead();
+            if (field.getPointDimensionCount() > 0) {
+                final PointValues values = pointsReader.getValues(field.name);
+                values.intersect(new PointsVisitor(values.getMinPackedValue(), values.getNumDimensions(), values.getBytesPerDimension()));
+                values.intersect(new PointsVisitor(values.getMaxPackedValue(), values.getNumDimensions(), values.getBytesPerDimension()));
+                stats.addPoints(field.name, directory.getBytesRead());
+            }
+        }
+    }
+
+    private class PointsVisitor implements PointValues.IntersectVisitor {
+        private final byte[] point;
+        private final int numDims;
+        private final int bytesPerDim;
+
+        PointsVisitor(byte[] point, int numDims, int bytesPerDim) {
+            this.point = point;
+            this.numDims = numDims;
+            this.bytesPerDim = bytesPerDim;
+        }
+
+        @Override
+        public void visit(int docID) throws IOException {
+            cancellationChecker.logEvent();
+        }
+
+        @Override
+        public void visit(int docID, byte[] packedValue) throws IOException {
+            cancellationChecker.logEvent();
+        }
+
+        @Override
+        public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+            for (int dim = 0; dim < numDims; dim++) {
+                int offset = dim * bytesPerDim;
+                if (FutureArrays.compareUnsigned(minPackedValue, offset, offset + bytesPerDim, point, offset, offset + bytesPerDim) > 0 ||
+                    FutureArrays.compareUnsigned(maxPackedValue, offset, offset + bytesPerDim, point, offset, offset + bytesPerDim) < 0) {
+                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
+                }
+            }
+            return PointValues.Relation.CELL_CROSSES_QUERY;
+        }
+    }
+
+    void analyzeNorms(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        if (reader.getNormsReader() == null) {
+            return;
+        }
+        final NormsProducer normsReader = reader.getNormsReader().getMergeInstance();
+        for (FieldInfo field : reader.getFieldInfos()) {
+            if (field.hasNorms()) {
+                cancellationChecker.checkForCancellation();
+                directory.resetBytesRead();
+                iterateDocValues(reader.maxDoc(), () -> normsReader.getNorms(field), norms -> {
+                    cancellationChecker.logEvent();
+                    norms.longValue();
+                });
+                stats.addNorms(field.name, directory.getBytesRead());
+            }
+        }
+    }
+
+    void analyzeTermVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        TermVectorsReader termVectorsReader = reader.getTermVectorsReader();
+        if (termVectorsReader == null) {
+            return;
+        }
+        termVectorsReader = termVectorsReader.getMergeInstance();
+        directory.resetBytesRead();
+        final TermVectorsVisitor visitor = new TermVectorsVisitor();
+        // TODO: Traverse 10-20% of documents
+        for (int docID = 0; docID < reader.numDocs(); docID++) {
+            cancellationChecker.logEvent();
+            final Fields vectors = termVectorsReader.get(docID);
+            if (vectors != null) {
+                for (String field : vectors) {
+                    cancellationChecker.logEvent();
+                    visitor.visitField(vectors, field);
+                }
+            }
+        }
+        if (visitor.fields.isEmpty() == false) {
+            final long totalBytes = visitor.fields.values().stream().mapToLong(v -> v).sum();
+            final double ratio = (double) (directory.getBytesRead()) / (double) (totalBytes);
+            for (Map.Entry<String, Long> field : visitor.fields.entrySet()) {
+                final long fieldBytes = (long) Math.ceil(field.getValue() * ratio);
+                stats.addTermVectors(field.getKey(), fieldBytes);
+            }
+        }
+    }
+
+    private class TermVectorsVisitor {
+        final Map<String, Long> fields = new HashMap<>();
+        private PostingsEnum docsAndPositions; // to reuse
+
+        void visitField(Fields vectors, String fieldName) throws IOException {
+            final Terms terms = vectors.terms(fieldName);
+            if (terms == null) {
+                return;
+            }
+            final boolean hasPositions = terms.hasPositions();
+            final boolean hasOffsets = terms.hasOffsets();
+            final boolean hasPayloads = terms.hasPayloads();
+            assert hasPayloads == false || hasPositions;
+            long fieldLength = 1; // flags
+            final TermsEnum termsEnum = terms.iterator();
+            BytesRef bytesRef;
+            while ((bytesRef = termsEnum.next()) != null) {
+                cancellationChecker.logEvent();
+                fieldLength += Integer.BYTES + bytesRef.length; // term
+                final int freq = (int) termsEnum.totalTermFreq();
+                fieldLength += Integer.BYTES; // freq
+                if (hasPositions || hasOffsets) {
+                    docsAndPositions = termsEnum.postings(docsAndPositions, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
+                    assert docsAndPositions != null;
+                    while (docsAndPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                        cancellationChecker.logEvent();
+                        assert docsAndPositions.freq() == freq;
+                        for (int posUpTo = 0; posUpTo < freq; posUpTo++) {
+                            final int pos = docsAndPositions.nextPosition();
+                            fieldLength += Integer.BYTES; // position
+                            docsAndPositions.startOffset();
+                            fieldLength += Integer.BYTES; // start offset
+                            docsAndPositions.endOffset();
+                            fieldLength += Integer.BYTES; // end offset
+                            final BytesRef payload = docsAndPositions.getPayload();
+                            if (payload != null) {
+                                fieldLength += Integer.BYTES + payload.length; // payload
+                            }
+                            assert hasPositions == false || pos >= 0;
+                        }
+                    }
+                }
+            }
+            final long finalLength = fieldLength;
+            fields.compute(fieldName, (k, v) -> v == null ? finalLength : v + finalLength);
+        }
+    }
+
+    private static class TrackingReadBytesDirectory extends FilterDirectory {
+        private final Map<String, BytesReadTracker> trackers = new HashMap<>();
+
+        TrackingReadBytesDirectory(Directory in) {
+            super(in);
+        }
+
+        long getBytesRead() {
+            return trackers.values().stream().mapToLong(BytesReadTracker::getBytesRead).sum();
+        }
+
+        void resetBytesRead() {
+            trackers.values().forEach(BytesReadTracker::resetBytesRead);
+        }
+
+        @Override
+        public IndexInput openInput(String name, IOContext context) throws IOException {
+            IndexInput in = super.openInput(name, context);
+            try {
+                final BytesReadTracker tracker = trackers.computeIfAbsent(name, k -> {
+                    if (LuceneFilesExtensions.fromFile(name) == LuceneFilesExtensions.CFS) {
+                        return new CompoundFileBytesReaderTracker();
+                    } else {
+                        return new BytesReadTracker();
+                    }
+                });
+                final TrackingReadBytesIndexInput wrapped = new TrackingReadBytesIndexInput(in, 0L, tracker);
+                in = null;
+                return wrapped;
+            } finally {
+                IOUtils.close(in);
+            }
+        }
+    }
+
+    private static class TrackingReadBytesIndexInput extends IndexInput {
+        final IndexInput in;
+        final BytesReadTracker bytesReadTracker;
+        final long fileOffset;
+
+        TrackingReadBytesIndexInput(IndexInput in, long fileOffset, BytesReadTracker bytesReadTracker) {
+            super(in.toString());
+            this.in = in;
+            this.fileOffset = fileOffset;
+            this.bytesReadTracker = bytesReadTracker;
+        }
+
+        @Override
+        public void close() throws IOException {
+            in.close();
+        }
+
+        @Override
+        public long getFilePointer() {
+            return in.getFilePointer();
+        }
+
+        @Override
+        public void seek(long pos) throws IOException {
+            in.seek(pos);
+        }
+
+        @Override
+        public long length() {
+            return in.length();
+        }
+
+        @Override
+        public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
+            final IndexInput slice = in.slice(sliceDescription, offset, length);
+            return new TrackingReadBytesIndexInput(slice, fileOffset + offset, bytesReadTracker.createSliceTracker(offset));
+        }
+
+        @Override
+        public IndexInput clone() {
+            return new TrackingReadBytesIndexInput(in.clone(), fileOffset, bytesReadTracker);
+        }
+
+        @Override
+        public byte readByte() throws IOException {
+            bytesReadTracker.trackPositions(fileOffset + getFilePointer(), 1);
+            return in.readByte();
+        }
+
+        @Override
+        public void readBytes(byte[] b, int offset, int len) throws IOException {
+            bytesReadTracker.trackPositions(fileOffset + getFilePointer(), len);
+            in.readBytes(b, offset, len);
+        }
+    }
+
+    /**
+     * A Lucene codec organizes data field by field for doc values, points, postings, and norms, and document by document
+     * for stored fields and term vectors. A BytesReadTracker can therefore simply record the minimum and maximum read
+     * positions, which lets us estimate a field's footprint by visiting only the two ends of each partition.
+     */
+    private static class BytesReadTracker {
+        private long minPosition = Long.MAX_VALUE;
+        private long maxPosition = Long.MIN_VALUE;
+
+        BytesReadTracker createSliceTracker(long offset) {
+            return this;
+        }
+
+        void trackPositions(long position, int length) {
+            minPosition = Math.min(minPosition, position);
+            maxPosition = Math.max(maxPosition, position + length - 1);
+        }
+
+
+        void resetBytesRead() {
+            minPosition = Long.MAX_VALUE;
+            maxPosition = Long.MIN_VALUE;
+        }
+
+        long getBytesRead() {
+            if (minPosition <= maxPosition) {
+                return maxPosition - minPosition + 1;
+            } else {
+                return 0L;
+            }
+        }
+    }
+
+    private static class CompoundFileBytesReaderTracker extends BytesReadTracker {
+        private final Map<Long, BytesReadTracker> slicedTrackers = new HashMap<>();
+
+        @Override
+        BytesReadTracker createSliceTracker(long offset) {
+            return slicedTrackers.computeIfAbsent(offset, k -> new BytesReadTracker());
+        }
+
+        @Override
+        void trackPositions(long position, int length) {
+            // reads are already tracked by the per-slice child trackers, except for the compound file's header and footer, which we can safely ignore.
+        }
+
+        @Override
+        void resetBytesRead() {
+            slicedTrackers.values().forEach(BytesReadTracker::resetBytesRead);
+        }
+
+        @Override
+        long getBytesRead() {
+            return slicedTrackers.values().stream().mapToLong(BytesReadTracker::getBytesRead).sum();
+        }
+    }
+
+    static long getIndexSize(IndexCommit commit) throws IOException {
+        long total = 0;
+        for (String file : commit.getFileNames()) {
+            total += commit.getDirectory().fileLength(file);
+        }
+        return total;
+    }
+
+    /**
+     * Periodically checks whether the task has been cancelled so that the analysis can abort quickly.
+     */
+    private static class CancellationChecker {
+        static final long THRESHOLD = 10_000;
+        private long iterations;
+        private final Runnable checkForCancellationRunner;
+
+        CancellationChecker(Runnable checkForCancellationRunner) {
+            this.checkForCancellationRunner = checkForCancellationRunner;
+        }
+
+        void logEvent() {
+            if (iterations == THRESHOLD) {
+                checkForCancellation();
+            } else {
+                iterations++;
+            }
+        }
+
+        void checkForCancellation() {
+            iterations = 0;
+            checkForCancellationRunner.run();
+        }
+    }
+
+    private static class ExecutionTime {
+        long invertedIndexTimeInNanos;
+        long storedFieldsTimeInNanos;
+        long docValuesTimeInNanos;
+        long pointsTimeInNanos;
+        long normsTimeInNanos;
+        long termVectorsTimeInNanos;
+
+        long totalInNanos() {
+            return invertedIndexTimeInNanos + storedFieldsTimeInNanos + docValuesTimeInNanos
+                + pointsTimeInNanos + normsTimeInNanos + termVectorsTimeInNanos;
+        }
+
+        @Override
+        public String toString() {
+            return "total: " + totalInNanos() / 1000_000 + "ms" +
+                ", inverted index: " + invertedIndexTimeInNanos / 1000_000 + "ms" +
+                ", stored fields: " + storedFieldsTimeInNanos / 1000_000 + "ms" +
+                ", doc values: " + docValuesTimeInNanos / 1000_000 + "ms" +
+                ", points: " + pointsTimeInNanos / 1000_000 + "ms" +
+                ", norms: " + normsTimeInNanos / 1000_000 + "ms" +
+                ", term vectors: " + termVectorsTimeInNanos / 1000_000 + "ms";
+        }
+    }
+}

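The analyzer above derives per-field sizes from TrackingReadBytesDirectory, whose trackers record only the minimum and maximum byte positions touched while a field is iterated. The following standalone sketch is illustrative only; the class and method names are invented and are not part of this change:

[source,java]
--------------------------------------------------
// Illustrative sketch of the min/max tracking idea behind BytesReadTracker above.
// The names here are hypothetical, not part of the PR.
final class MinMaxReadTracker {
    private long min = Long.MAX_VALUE;
    private long max = Long.MIN_VALUE;

    // Record that `length` bytes were read starting at `position`.
    void onRead(long position, int length) {
        min = Math.min(min, position);
        max = Math.max(max, position + length - 1);
    }

    // The estimated footprint is the span between the two ends that were touched.
    long estimatedBytes() {
        return min <= max ? max - min + 1 : 0L;
    }

    public static void main(String[] args) {
        MinMaxReadTracker tracker = new MinMaxReadTracker();
        tracker.onRead(1_000, 16);   // a read near the start of the field's data
        tracker.onRead(9_000, 128);  // a read near the end of the field's data
        System.out.println(tracker.estimatedBytes()); // prints 8128
    }
}
--------------------------------------------------

The estimate is exact only when the codec lays out each field's data contiguously, which is the assumption stated in the BytesReadTracker Javadoc.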
+ 265 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageStats.java

@@ -0,0 +1,265 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.xcontent.ToXContentFragment;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Collectors;
+
+/**
+ * The result of analyzing the disk usage of each field in a shard or index.
+ */
+public final class IndexDiskUsageStats implements ToXContentFragment, Writeable {
+    public static final String TOTAL = "total";
+    public static final String TOTAL_IN_BYTES = "total_in_bytes";
+    public static final String INVERTED_INDEX = "inverted_index";
+    public static final String STORED_FIELDS = "stored_fields";
+    public static final String STORED_FIELDS_IN_BYTES = "stored_fields_in_bytes";
+    public static final String DOC_VALUES = "doc_values";
+    public static final String DOC_VALUES_IN_BYTES = "doc_values_in_bytes";
+    public static final String POINTS = "points";
+    public static final String POINTS_IN_BYTES = "points_in_bytes";
+    public static final String NORMS = "norms";
+    public static final String NORMS_IN_BYTES = "norms_in_bytes";
+    public static final String TERM_VECTORS = "term_vectors";
+    public static final String TERM_VECTORS_IN_BYTES = "term_vectors_in_bytes";
+
+    public static final String STORE_SIZE = "store_size";
+    public static final String STORE_SIZE_IN_BYTES = "store_size_in_bytes";
+
+    private final Map<String, PerFieldDiskUsage> fields;
+    private long indexSizeInBytes;
+
+    public IndexDiskUsageStats(long indexSizeInBytes) {
+        fields = new HashMap<>();
+        this.indexSizeInBytes = indexSizeInBytes;
+    }
+
+    public IndexDiskUsageStats(StreamInput in) throws IOException {
+        this.fields = in.readMap(StreamInput::readString, PerFieldDiskUsage::new);
+        this.indexSizeInBytes = in.readVLong();
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        out.writeMap(fields, StreamOutput::writeString, (o, v) -> v.writeTo(o));
+        out.writeVLong(indexSizeInBytes);
+    }
+
+    PerFieldDiskUsage total() {
+        final PerFieldDiskUsage total = new PerFieldDiskUsage();
+        for (PerFieldDiskUsage value : fields.values()) {
+            total.add(value);
+        }
+        return total;
+    }
+
+    Map<String, PerFieldDiskUsage> getFields() {
+        return fields;
+    }
+
+    long getIndexSizeInBytes() {
+        return indexSizeInBytes;
+    }
+
+    private void checkByteSize(long bytes) {
+        if (bytes < 0) {
+            throw new IllegalArgumentException("Bytes must be non-negative; got " + bytes);
+        }
+    }
+
+    private PerFieldDiskUsage getOrAdd(String fieldName) {
+        Objects.requireNonNull(fieldName, "fieldName must be non-null");
+        return fields.computeIfAbsent(fieldName, k -> new PerFieldDiskUsage());
+    }
+
+    public void addInvertedIndex(String fieldName, long bytes) {
+        checkByteSize(bytes);
+        getOrAdd(fieldName).invertedIndexBytes += bytes;
+    }
+
+    public void addStoredField(String fieldName, long bytes) {
+        checkByteSize(bytes);
+        getOrAdd(fieldName).storedFieldBytes += bytes;
+    }
+
+    public void addDocValues(String fieldName, long bytes) {
+        checkByteSize(bytes);
+        getOrAdd(fieldName).docValuesBytes += bytes;
+    }
+
+    public void addPoints(String fieldName, long bytes) {
+        checkByteSize(bytes);
+        getOrAdd(fieldName).pointsBytes += bytes;
+    }
+
+    public void addNorms(String fieldName, long bytes) {
+        checkByteSize(bytes);
+        getOrAdd(fieldName).normsBytes += bytes;
+    }
+
+    public void addTermVectors(String fieldName, long bytes) {
+        checkByteSize(bytes);
+        getOrAdd(fieldName).termVectorsBytes += bytes;
+    }
+
+    public IndexDiskUsageStats add(IndexDiskUsageStats other) {
+        other.fields.forEach((k, v) -> getOrAdd(k).add(v));
+        this.indexSizeInBytes += other.indexSizeInBytes;
+        return this;
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        final PerFieldDiskUsage total = total();
+        builder.field(STORE_SIZE, new ByteSizeValue(indexSizeInBytes));
+        builder.field(STORE_SIZE_IN_BYTES, indexSizeInBytes);
+
+        // all fields
+        builder.startObject("all_fields");
+        total.toXContent(builder, params);
+        builder.endObject();
+
+        // per field
+        builder.startObject("fields");
+        {
+            final List<Map.Entry<String, PerFieldDiskUsage>> entries = fields.entrySet().stream()
+                .sorted(Map.Entry.comparingByKey()).collect(Collectors.toList());
+            for (Map.Entry<String, PerFieldDiskUsage> entry : entries) {
+                builder.startObject(entry.getKey());
+                entry.getValue().toXContent(builder, params);
+                builder.endObject();
+            }
+        }
+        builder.endObject();
+        return builder;
+    }
+
+    @Override
+    public String toString() {
+        return Strings.toString(this);
+    }
+
+    /**
+     * Disk usage stats for a single field
+     */
+    public static final class PerFieldDiskUsage implements ToXContentFragment, Writeable {
+        private long invertedIndexBytes;
+        private long storedFieldBytes;
+        private long docValuesBytes;
+        private long pointsBytes;
+        private long normsBytes;
+        private long termVectorsBytes;
+
+        private PerFieldDiskUsage() {
+
+        }
+
+        private PerFieldDiskUsage(StreamInput in) throws IOException {
+            invertedIndexBytes = in.readVLong();
+            storedFieldBytes = in.readVLong();
+            docValuesBytes = in.readVLong();
+            pointsBytes = in.readVLong();
+            normsBytes = in.readVLong();
+            termVectorsBytes = in.readVLong();
+        }
+
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            out.writeVLong(invertedIndexBytes);
+            out.writeVLong(storedFieldBytes);
+            out.writeVLong(docValuesBytes);
+            out.writeVLong(pointsBytes);
+            out.writeVLong(normsBytes);
+            out.writeVLong(termVectorsBytes);
+        }
+
+        private void add(PerFieldDiskUsage other) {
+            invertedIndexBytes += other.invertedIndexBytes;
+            storedFieldBytes += other.storedFieldBytes;
+            docValuesBytes += other.docValuesBytes;
+            pointsBytes += other.pointsBytes;
+            normsBytes += other.normsBytes;
+            termVectorsBytes += other.termVectorsBytes;
+        }
+
+        public long getInvertedIndexBytes() {
+            return invertedIndexBytes;
+        }
+
+        public long getStoredFieldBytes() {
+            return storedFieldBytes;
+        }
+
+        public long getDocValuesBytes() {
+            return docValuesBytes;
+        }
+
+        public long getPointsBytes() {
+            return pointsBytes;
+        }
+
+        public long getNormsBytes() {
+            return normsBytes;
+        }
+
+        public long getTermVectorsBytes() {
+            return termVectorsBytes;
+        }
+
+
+        long totalBytes() {
+            return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes;
+        }
+
+        @Override
+        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+            final long totalBytes = totalBytes();
+            builder.field(TOTAL, new ByteSizeValue(totalBytes));
+            builder.field(TOTAL_IN_BYTES, totalBytes);
+
+            builder.startObject(INVERTED_INDEX);
+            builder.field(TOTAL, new ByteSizeValue(invertedIndexBytes));
+            builder.field(TOTAL_IN_BYTES, invertedIndexBytes);
+            builder.endObject();
+
+            builder.field(STORED_FIELDS, new ByteSizeValue(storedFieldBytes));
+            builder.field(STORED_FIELDS_IN_BYTES, storedFieldBytes);
+
+            builder.field(DOC_VALUES, new ByteSizeValue(docValuesBytes));
+            builder.field(DOC_VALUES_IN_BYTES, docValuesBytes);
+
+            builder.field(POINTS, new ByteSizeValue(pointsBytes));
+            builder.field(POINTS_IN_BYTES, pointsBytes);
+
+            builder.field(NORMS, new ByteSizeValue(normsBytes));
+            builder.field(NORMS_IN_BYTES, normsBytes);
+
+            builder.field(TERM_VECTORS, new ByteSizeValue(termVectorsBytes));
+            builder.field(TERM_VECTORS_IN_BYTES, termVectorsBytes);
+            return builder;
+        }
+
+        @Override
+        public String toString() {
+            return Strings.toString(this);
+        }
+    }
+}

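As a point of reference, here is a minimal sketch of how the stats object defined above accumulates per-field byte counts and merges shard-level results into index-level results. The byte counts and the example class name are invented, and the sketch sits in the same package because getFields() and getIndexSizeInBytes() are package-private:

[source,java]
--------------------------------------------------
// Sketch only: exercising the IndexDiskUsageStats API defined above with made-up byte counts.
package org.elasticsearch.action.admin.indices.diskusage;

public class IndexDiskUsageStatsExample {
    public static void main(String[] args) {
        IndexDiskUsageStats shard0 = new IndexDiskUsageStats(10_000); // store size of shard 0
        shard0.addStoredField("message", 4_000);
        shard0.addInvertedIndex("message", 2_500);

        IndexDiskUsageStats shard1 = new IndexDiskUsageStats(8_000);  // store size of shard 1
        shard1.addStoredField("message", 3_200);
        shard1.addDocValues("@timestamp", 1_100);

        // Merging shard-level stats, as the transport action does when building the response.
        IndexDiskUsageStats index = shard0.add(shard1);
        System.out.println(index.getIndexSizeInBytes());                            // 18000
        System.out.println(index.getFields().get("message").getStoredFieldBytes()); // 7200
    }
}
--------------------------------------------------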
+ 145 - 0
server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/TransportAnalyzeIndexDiskUsageAction.java

@@ -0,0 +1,145 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.NoShardAvailableActionException;
+import org.elasticsearch.action.support.ActionFilters;
+import org.elasticsearch.action.support.DefaultShardOperationFailedException;
+import org.elasticsearch.action.support.broadcast.TransportBroadcastAction;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.block.ClusterBlockException;
+import org.elasticsearch.cluster.block.ClusterBlockLevel;
+import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
+import org.elasticsearch.cluster.routing.GroupShardsIterator;
+import org.elasticsearch.cluster.routing.ShardIterator;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.index.engine.Engine;
+import org.elasticsearch.index.shard.IndexShard;
+import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.indices.IndicesService;
+import org.elasticsearch.tasks.CancellableTask;
+import org.elasticsearch.tasks.Task;
+import org.elasticsearch.tasks.TaskCancelledException;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.transport.TransportService;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicReferenceArray;
+
+public class TransportAnalyzeIndexDiskUsageAction extends TransportBroadcastAction<
+    AnalyzeIndexDiskUsageRequest, AnalyzeIndexDiskUsageResponse,
+    AnalyzeDiskUsageShardRequest, AnalyzeDiskUsageShardResponse> {
+    private final IndicesService indicesService;
+
+    @Inject
+    public TransportAnalyzeIndexDiskUsageAction(ClusterService clusterService,
+                                                TransportService transportService,
+                                                IndicesService indexServices, ActionFilters actionFilters,
+                                                IndexNameExpressionResolver indexNameExpressionResolver) {
+        super(AnalyzeIndexDiskUsageAction.NAME, clusterService, transportService, actionFilters, indexNameExpressionResolver,
+            AnalyzeIndexDiskUsageRequest::new, AnalyzeDiskUsageShardRequest::new, ThreadPool.Names.ANALYZE);
+        this.indicesService = indexServices;
+    }
+
+    @Override
+    protected void doExecute(Task task, AnalyzeIndexDiskUsageRequest request, ActionListener<AnalyzeIndexDiskUsageResponse> listener) {
+        super.doExecute(task, request, listener);
+    }
+
+    @Override
+    protected AnalyzeDiskUsageShardRequest newShardRequest(int numShards, ShardRouting shard, AnalyzeIndexDiskUsageRequest request) {
+        return new AnalyzeDiskUsageShardRequest(shard.shardId(), request);
+    }
+
+    @Override
+    protected AnalyzeDiskUsageShardResponse readShardResponse(StreamInput in) throws IOException {
+        return new AnalyzeDiskUsageShardResponse(in);
+    }
+
+    @Override
+    protected AnalyzeDiskUsageShardResponse shardOperation(AnalyzeDiskUsageShardRequest request, Task task) throws IOException {
+        final ShardId shardId = request.shardId();
+        assert task instanceof CancellableTask : "AnalyzeDiskUsageShardRequest must create a cancellable task";
+        final CancellableTask cancellableTask = (CancellableTask) task;
+        final Runnable checkForCancellation = () -> {
+            if (cancellableTask.isCancelled()) {
+                final String reason = cancellableTask.getReasonCancelled();
+                throw new TaskCancelledException(reason != null ? reason : "Task was cancelled");
+            }
+        };
+        final IndexShard shard = indicesService.indexServiceSafe(shardId.getIndex()).getShard(shardId.id());
+        try (Engine.IndexCommitRef commitRef = shard.acquireLastIndexCommit(request.flush)) {
+            final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(shardId, commitRef.getIndexCommit(), checkForCancellation);
+            return new AnalyzeDiskUsageShardResponse(shardId, stats);
+        }
+    }
+
+    @Override
+    protected AnalyzeIndexDiskUsageResponse newResponse(AnalyzeIndexDiskUsageRequest request,
+                                                        AtomicReferenceArray<?> shardsResponses,
+                                                        ClusterState clusterState) {
+        int successfulShards = 0;
+        final List<DefaultShardOperationFailedException> shardFailures = new ArrayList<>();
+        final Map<String, IndexDiskUsageStats> combined = new HashMap<>();
+        for (int i = 0; i < shardsResponses.length(); i++) {
+            final Object r = shardsResponses.get(i);
+            if (r instanceof AnalyzeDiskUsageShardResponse) {
+                ++successfulShards;
+                AnalyzeDiskUsageShardResponse resp = (AnalyzeDiskUsageShardResponse) r;
+                combined.compute(resp.getIndex(), (k, v) -> v == null ? resp.stats : v.add(resp.stats));
+            } else if (r instanceof DefaultShardOperationFailedException) {
+                shardFailures.add((DefaultShardOperationFailedException) r);
+            } else {
+                assert false : "unknown response [" + r + "]";
+                throw new IllegalStateException("unknown response [" + r + "]");
+            }
+        }
+        return new AnalyzeIndexDiskUsageResponse(
+            shardsResponses.length(),
+            successfulShards,
+            shardFailures.size(),
+            shardFailures,
+            combined);
+    }
+
+    @Override
+    protected GroupShardsIterator<ShardIterator> shards(ClusterState clusterState,
+                                                        AnalyzeIndexDiskUsageRequest request,
+                                                        String[] concreteIndices) {
+        final GroupShardsIterator<ShardIterator> groups = clusterService
+            .operationRouting()
+            .searchShards(clusterState, concreteIndices, null, null);
+        for (ShardIterator group : groups) {
+            // fail fast if any shard group has no active shards
+            if (group.size() == 0) {
+                throw new NoShardAvailableActionException(group.shardId());
+            }
+        }
+        return groups;
+    }
+
+    @Override
+    protected ClusterBlockException checkGlobalBlock(ClusterState state, AnalyzeIndexDiskUsageRequest request) {
+        return state.blocks().globalBlockedException(ClusterBlockLevel.METADATA_READ);
+    }
+
+    @Override
+    protected ClusterBlockException checkRequestBlock(ClusterState state, AnalyzeIndexDiskUsageRequest request,
+                                                      String[] concreteIndices) {
+        return state.blocks().indicesBlockedException(ClusterBlockLevel.METADATA_READ, concreteIndices);
+    }
+}

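The shard operation above wraps the task's cancellation state in a Runnable, and the analyzer invokes it through its CancellationChecker so that the check runs only once every 10,000 logged events instead of on every document or term. A standalone sketch of that amortization pattern follows; the names are illustrative, not the classes added here:

[source,java]
--------------------------------------------------
// Illustrative sketch of an amortized cancellation check, mirroring CancellationChecker above.
// The class name and the interrupt-based check are hypothetical.
final class AmortizedCheck {
    private static final long THRESHOLD = 10_000;
    private long events;
    private final Runnable check;

    AmortizedCheck(Runnable check) {
        this.check = check;
    }

    // Called from tight loops; only every THRESHOLD-th call pays for the real check.
    void event() {
        if (++events >= THRESHOLD) {
            events = 0;
            check.run();
        }
    }

    public static void main(String[] args) {
        AmortizedCheck checker = new AmortizedCheck(() -> {
            if (Thread.currentThread().isInterrupted()) {
                throw new RuntimeException("cancelled");
            }
        });
        for (int i = 0; i < 1_000_000; i++) {
            checker.event(); // the interrupt flag is consulted 100 times, not 1,000,000 times
        }
    }
}
--------------------------------------------------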
+ 92 - 0
server/src/main/java/org/elasticsearch/common/lucene/FilterIndexCommit.java

@@ -0,0 +1,92 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.common.lucene;
+
+import org.apache.lucene.index.IndexCommit;
+import org.apache.lucene.store.Directory;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+
+public abstract class FilterIndexCommit extends IndexCommit {
+    protected final IndexCommit in;
+
+    public FilterIndexCommit(IndexCommit in) {
+        this.in = in;
+    }
+
+    public IndexCommit getIndexCommit() {
+        return in;
+    }
+
+    @Override
+    public String getSegmentsFileName() {
+        return in.getSegmentsFileName();
+    }
+
+    @Override
+    public Collection<String> getFileNames() throws IOException {
+        return in.getFileNames();
+    }
+
+    @Override
+    public Directory getDirectory() {
+        return in.getDirectory();
+    }
+
+    @Override
+    public void delete() {
+        in.delete();
+    }
+
+    @Override
+    public boolean isDeleted() {
+        return in.isDeleted();
+    }
+
+    @Override
+    public int getSegmentCount() {
+        return in.getSegmentCount();
+    }
+
+    @Override
+    public long getGeneration() {
+        return in.getGeneration();
+    }
+
+    @Override
+    public Map<String, String> getUserData() throws IOException {
+        return in.getUserData();
+    }
+
+    @Override
+    public String toString() {
+        return "FilterIndexCommit{" + "in=" + in + '}';
+    }
+}

+ 4 - 47
server/src/main/java/org/elasticsearch/index/engine/CombinedDeletionPolicy.java

@@ -13,14 +13,13 @@ import org.apache.logging.log4j.Logger;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.index.IndexDeletionPolicy;
 import org.apache.lucene.index.SegmentInfos;
-import org.apache.lucene.store.Directory;
+import org.elasticsearch.common.lucene.FilterIndexCommit;
 import org.elasticsearch.index.seqno.SequenceNumbers;
 import org.elasticsearch.index.translog.Translog;
 import org.elasticsearch.index.translog.TranslogDeletionPolicy;
 
 import java.io.IOException;
 import java.nio.file.Path;
-import java.util.Collection;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -144,7 +143,7 @@ public class CombinedDeletionPolicy extends IndexDeletionPolicy {
      * @return true if the snapshotting commit can be cleaned up.
      */
     synchronized boolean releaseCommit(final IndexCommit snapshotCommit) {
-        final IndexCommit releasingCommit = ((SnapshotIndexCommit) snapshotCommit).delegate;
+        final IndexCommit releasingCommit = ((SnapshotIndexCommit) snapshotCommit).getIndexCommit();
         assert snapshottedCommits.containsKey(releasingCommit) : "Release non-snapshotted commit;" +
             "snapshotted commits [" + snapshottedCommits + "], releasing commit [" + releasingCommit + "]";
         final int refCount = snapshottedCommits.addTo(releasingCommit, -1); // release refCount
@@ -222,56 +221,14 @@ public class CombinedDeletionPolicy extends IndexDeletionPolicy {
     /**
      * A wrapper of an index commit that prevents it from being deleted.
      */
-    private static class SnapshotIndexCommit extends IndexCommit {
-        private final IndexCommit delegate;
-
+    private static class SnapshotIndexCommit extends FilterIndexCommit {
         SnapshotIndexCommit(IndexCommit delegate) {
-            this.delegate = delegate;
-        }
-
-        @Override
-        public String getSegmentsFileName() {
-            return delegate.getSegmentsFileName();
-        }
-
-        @Override
-        public Collection<String> getFileNames() throws IOException {
-            return delegate.getFileNames();
-        }
-
-        @Override
-        public Directory getDirectory() {
-            return delegate.getDirectory();
+            super(delegate);
         }
 
         @Override
         public void delete() {
             throw new UnsupportedOperationException("A snapshot commit does not support deletion");
         }
-
-        @Override
-        public boolean isDeleted() {
-            return delegate.isDeleted();
-        }
-
-        @Override
-        public int getSegmentCount() {
-            return delegate.getSegmentCount();
-        }
-
-        @Override
-        public long getGeneration() {
-            return delegate.getGeneration();
-        }
-
-        @Override
-        public Map<String, String> getUserData() throws IOException {
-            return delegate.getUserData();
-        }
-
-        @Override
-        public String toString() {
-            return "SnapshotIndexCommit{" + delegate + "}";
-        }
     }
 }

+ 6 - 0
server/src/main/java/org/elasticsearch/index/store/LuceneFilesExtensions.java

@@ -8,6 +8,7 @@
 
 package org.elasticsearch.index.store;
 
+import org.apache.lucene.index.IndexFileNames;
 import org.elasticsearch.core.Nullable;
 
 import java.util.Collections;
@@ -143,4 +144,9 @@ public enum LuceneFilesExtensions {
         }
         return null;
     }
+
+    @Nullable
+    public static LuceneFilesExtensions fromFile(String fileName) {
+        return fromExtension(IndexFileNames.getExtension(fileName));
+    }
 }

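The new fromFile helper lets callers such as the analyzer and its tracking directory map a Lucene file name straight to an extension constant. A brief usage sketch; the file name and the example class are made up, and the constants are the ones the disk-usage code switches on:

[source,java]
--------------------------------------------------
import org.elasticsearch.index.store.LuceneFilesExtensions;

public class FromFileExample {
    public static void main(String[] args) {
        // Sketch only: resolving a hypothetical per-field postings file name to its extension.
        // fromFile strips the extension via IndexFileNames.getExtension and delegates to fromExtension.
        LuceneFilesExtensions ext = LuceneFilesExtensions.fromFile("_0_Lucene84_0.doc");
        System.out.println(ext == LuceneFilesExtensions.DOC); // true for this made-up file name
        // A file without a recognized extension resolves to null and is skipped by the callers.
    }
}
--------------------------------------------------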
+ 75 - 0
server/src/main/java/org/elasticsearch/rest/action/admin/indices/RestAnalyzeIndexDiskUsageAction.java

@@ -0,0 +1,75 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.rest.action.admin.indices;
+
+import org.elasticsearch.action.admin.indices.diskusage.AnalyzeIndexDiskUsageAction;
+import org.elasticsearch.action.admin.indices.diskusage.AnalyzeIndexDiskUsageRequest;
+import org.elasticsearch.action.support.IndicesOptions;
+import org.elasticsearch.client.node.NodeClient;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.core.Booleans;
+import org.elasticsearch.rest.BaseRestHandler;
+import org.elasticsearch.rest.RestHandler;
+import org.elasticsearch.rest.RestRequest;
+import org.elasticsearch.rest.action.RestCancellableNodeClient;
+import org.elasticsearch.rest.action.RestToXContentListener;
+
+import java.io.IOException;
+import java.util.List;
+
+import static org.elasticsearch.rest.RestRequest.Method.POST;
+
+public class RestAnalyzeIndexDiskUsageAction extends BaseRestHandler {
+
+    @Override
+    public List<RestHandler.Route> routes() {
+        return List.of(new RestHandler.Route(POST, "/{index}/_disk_usage"));
+    }
+
+    @Override
+    public String getName() {
+        return "analyze_index_disk_usage_action";
+    }
+
+    @Override
+    public BaseRestHandler.RestChannelConsumer prepareRequest(final RestRequest request, final NodeClient client) throws IOException {
+        if (Booleans.parseBoolean(request.param("run_expensive_tasks"), false) == false) {
+            throw new IllegalArgumentException(
+                "analyzing the disk usage of an index is expensive and resource-intensive, " +
+                    "the parameter [run_expensive_tasks] must be set to [true] in order for the task to be performed.");
+        }
+        final String[] indices = Strings.splitStringByCommaToArray(request.param("index"));
+        final IndicesOptions indicesOptions = IndicesOptions.fromRequest(request, AnalyzeIndexDiskUsageRequest.DEFAULT_INDICES_OPTIONS);
+        final boolean flush = request.paramAsBoolean("flush", true);
+        final AnalyzeIndexDiskUsageRequest analyzeRequest = new AnalyzeIndexDiskUsageRequest(indices, indicesOptions, flush);
+        return channel -> {
+            final RestCancellableNodeClient cancelClient = new RestCancellableNodeClient(client, request.getHttpChannel());
+            cancelClient.execute(AnalyzeIndexDiskUsageAction.INSTANCE, analyzeRequest, new RestToXContentListener<>(channel));
+        };
+    }
+}

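Beyond the REST endpoint, the analysis can also be driven through the transport action directly. A minimal client-side sketch, assuming a client and a logger are already in scope and mirroring the request the handler above builds:

[source,java]
--------------------------------------------------
// Sketch only: invoking the disk-usage analysis via the transport action. `client`
// (an org.elasticsearch.client.Client) and `logger` are assumed to be available.
AnalyzeIndexDiskUsageRequest request = new AnalyzeIndexDiskUsageRequest(
    new String[] { "my-index-000001" },                    // indices to analyze
    AnalyzeIndexDiskUsageRequest.DEFAULT_INDICES_OPTIONS,  // same defaults as the REST layer
    true);                                                 // flush before analyzing
client.execute(AnalyzeIndexDiskUsageAction.INSTANCE, request, ActionListener.wrap(
    response -> logger.info("disk usage: {}", Strings.toString(response)),
    e -> logger.warn("disk usage analysis failed", e)));
--------------------------------------------------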
+ 557 - 0
server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java

@@ -0,0 +1,557 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.indices.diskusage;
+
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
+import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
+import org.apache.lucene.codecs.lucene87.Lucene87Codec;
+import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.BinaryPoint;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexCommit;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentReader;
+import org.apache.lucene.search.suggest.document.Completion84PostingsFormat;
+import org.apache.lucene.search.suggest.document.CompletionPostingsFormat;
+import org.apache.lucene.search.suggest.document.SuggestField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.core.internal.io.IOUtils;
+import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.index.store.LuceneFilesExtensions;
+import org.elasticsearch.test.ESTestCase;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.not;
+import static org.hamcrest.Matchers.notNullValue;
+
+public class IndexDiskUsageAnalyzerTests extends ESTestCase {
+
+    public void testStoredFields() throws Exception {
+        try (Directory dir = newDirectory()) {
+            final CodecMode codec = randomFrom(CodecMode.values());
+            indexRandomly(dir, codec, between(100, 1000), doc -> {
+                final double ratio = randomDouble();
+                if (ratio <= 0.33) {
+                    doc.add(new StoredField("sf1", randomAlphaOfLength(5)));
+                }
+                if (ratio <= 0.67) {
+                    doc.add(new StoredField("sf2", randomAlphaOfLength(5)));
+                }
+                doc.add(new StoredField("sf3", randomAlphaOfLength(5)));
+            });
+            final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
+            final IndexDiskUsageStats perField = collectPerFieldStats(dir);
+            assertFieldStats("total", "stored field",
+                stats.total().getStoredFieldBytes(), perField.total().getStoredFieldBytes(), 0.01, 1024);
+
+            assertFieldStats("sf1", "stored field",
+                stats.getFields().get("sf1").getStoredFieldBytes(), stats.total().getStoredFieldBytes() / 6, 0.01, 512);
+
+            assertFieldStats("sf2", "stored field",
+                stats.getFields().get("sf2").getStoredFieldBytes(), stats.total().getStoredFieldBytes() / 3, 0.01, 512);
+
+            assertFieldStats("sf3", "stored field",
+                stats.getFields().get("sf3").getStoredFieldBytes(), stats.total().getStoredFieldBytes() / 2, 0.01, 512);
+        }
+    }
+
+    public void testTermVectors() throws Exception {
+        try (Directory dir = newDirectory()) {
+            final CodecMode codec = randomFrom(CodecMode.values());
+            indexRandomly(dir, codec, between(100, 1000), doc -> {
+                final FieldType fieldType = randomTermVectorsFieldType();
+                final double ratio = randomDouble();
+                if (ratio <= 0.25) {
+                    doc.add(new Field("v1", randomAlphaOfLength(5), fieldType));
+                }
+                if (ratio <= 0.50) {
+                    doc.add(new Field("v2", randomAlphaOfLength(5), fieldType));
+                }
+                doc.add(new Field("v3", randomAlphaOfLength(5), fieldType));
+            });
+            final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
+            final IndexDiskUsageStats perField = collectPerFieldStats(dir);
+            logger.info("--> stats {} per field {}", stats, perField);
+            assertFieldStats("total", "term vectors",
+                stats.total().getTermVectorsBytes(), perField.total().getTermVectorsBytes(), 0.01, 1024);
+            assertFieldStats("v1", "term vectors",
+                stats.getFields().get("v1").getTermVectorsBytes(), stats.total().getTermVectorsBytes() / 7, 0.01, 512);
+            assertFieldStats("v2", "term vectors",
+                stats.getFields().get("v2").getTermVectorsBytes(), stats.total().getTermVectorsBytes() * 2 / 7, 0.01, 512);
+            assertFieldStats("v3", "term vectors",
+                stats.getFields().get("v3").getTermVectorsBytes(), stats.total().getTermVectorsBytes() * 4 / 7, 0.01, 512);
+        }
+    }
+
+    public void testPoints() throws Exception {
+        try (Directory dir = newDirectory()) {
+            final CodecMode codec = randomFrom(CodecMode.values());
+            indexRandomly(dir, codec, between(100, 1000), doc -> {
+                final double ratio = randomDouble();
+                if (ratio <= 0.25) {
+                    doc.add(new BinaryPoint("pt1", randomAlphaOfLength(5).getBytes(StandardCharsets.UTF_8)));
+                }
+                if (ratio <= 0.50) {
+                    doc.add(new BinaryPoint("pt2", randomAlphaOfLength(5).getBytes(StandardCharsets.UTF_8)));
+                }
+                doc.add(new BinaryPoint("pt3", randomAlphaOfLength(5).getBytes(StandardCharsets.UTF_8)));
+            });
+            final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
+            final IndexDiskUsageStats perField = collectPerFieldStats(dir);
+            logger.info("--> stats {} per field {}", stats, perField);
+            assertFieldStats("total", "points",
+                stats.total().getPointsBytes(), perField.total().getPointsBytes(), 0.01, 1024);
+            assertFieldStats("pt1", "points",
+                stats.getFields().get("pt1").getPointsBytes(), stats.total().getPointsBytes() / 7, 0.01, 512);
+            assertFieldStats("pt2", "points",
+                stats.getFields().get("pt2").getPointsBytes(), stats.total().getPointsBytes() * 2 / 7, 0.01, 512);
+            assertFieldStats("pt3", "points",
+                stats.getFields().get("pt3").getPointsBytes(), stats.total().getPointsBytes() * 4 / 7, 0.01, 512);
+        }
+    }
+
+    public void testCompletionField() throws Exception {
+        IndexWriterConfig config = new IndexWriterConfig()
+            .setCommitOnClose(true)
+            .setUseCompoundFile(false)
+            .setCodec(new Lucene87Codec(Lucene87Codec.Mode.BEST_SPEED) {
+                @Override
+                public PostingsFormat getPostingsFormatForField(String field) {
+                    if (field.startsWith("suggest_")) {
+                        return new Completion84PostingsFormat(randomFrom(CompletionPostingsFormat.FSTLoadMode.values()));
+                    } else {
+                        return super.postingsFormat();
+                    }
+                }
+            });
+
+        try (Directory dir = newDirectory()) {
+            try (IndexWriter writer = new IndexWriter(dir, config)) {
+                int numDocs = randomIntBetween(100, 1000);
+                for (int i = 0; i < numDocs; i++) {
+                    final Document doc = new Document();
+                    if (randomDouble() < 0.5) {
+                        doc.add(new SuggestField("suggest_1", randomAlphaOfLength(10), randomIntBetween(1, 20)));
+                    }
+                    doc.add(new SuggestField("suggest_2", randomAlphaOfLength(10), randomIntBetween(1, 20)));
+                    writer.addDocument(doc);
+                }
+            }
+            final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
+            assertFieldStats("suggest_1", "inverted_index",
+                stats.getFields().get("suggest_1").getInvertedIndexBytes(),
+                stats.total().totalBytes() / 3, 0.05, 2048);
+
+            assertFieldStats("suggest_2", "inverted_index",
+                stats.getFields().get("suggest_2").getInvertedIndexBytes(),
+                stats.total().totalBytes() * 2 / 3, 0.05, 2048);
+
+            final IndexDiskUsageStats perField = collectPerFieldStats(dir);
+            assertFieldStats("suggest_1", "inverted_index",
+                stats.getFields().get("suggest_1").getInvertedIndexBytes(),
+                perField.getFields().get("suggest_1").getInvertedIndexBytes(), 0.05, 2048);
+
+            assertFieldStats("suggest_2", "inverted_index",
+                stats.getFields().get("suggest_2").getInvertedIndexBytes(),
+                perField.getFields().get("suggest_2").getInvertedIndexBytes(), 0.05, 2048);
+        }
+    }
+
+    public void testMixedFields() throws Exception {
+        try (Directory dir = newDirectory()) {
+            final CodecMode codec = randomFrom(CodecMode.values());
+            indexRandomly(dir, codec, between(100, 1000), IndexDiskUsageAnalyzerTests::addRandomFields);
+            final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
+            logger.info("--> stats {}", stats);
+            try (Directory perFieldDir = newDirectory()) {
+                rewriteIndexWithPerFieldCodec(dir, codec, perFieldDir);
+                final IndexDiskUsageStats perFieldStats = collectPerFieldStats(perFieldDir);
+                assertStats(stats, perFieldStats);
+                assertStats(IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(perFieldDir), () -> {}), perFieldStats);
+            }
+        }
+    }
+
+    enum CodecMode {
+        BEST_SPEED {
+            @Override
+            Lucene87Codec.Mode mode() {
+                return Lucene87Codec.Mode.BEST_SPEED;
+            }
+
+            @Override
+            DocValuesFormat dvFormat() {
+                return new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_SPEED);
+            }
+        },
+
+        BEST_COMPRESSION {
+            @Override
+            Lucene87Codec.Mode mode() {
+                return Lucene87Codec.Mode.BEST_COMPRESSION;
+            }
+
+            @Override
+            DocValuesFormat dvFormat() {
+                return new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);
+            }
+        };
+
+        abstract Lucene87Codec.Mode mode();
+
+        abstract DocValuesFormat dvFormat();
+    }
+
+    static void indexRandomly(Directory directory, CodecMode codecMode, int numDocs, Consumer<Document> addFields) throws IOException {
+        IndexWriterConfig config = new IndexWriterConfig()
+            .setCommitOnClose(true)
+            .setUseCompoundFile(randomBoolean())
+            .setCodec(new Lucene87Codec(codecMode.mode()));
+        try (IndexWriter writer = new IndexWriter(directory, config)) {
+            for (int i = 0; i < numDocs; i++) {
+                final Document doc = new Document();
+                addFields.accept(doc);
+                writer.addDocument(doc);
+            }
+        }
+    }
+
+    static void addRandomDocValuesField(Document doc) {
+        if (randomBoolean()) {
+            doc.add(new NumericDocValuesField("ndv", random().nextInt(1024)));
+        }
+        if (randomBoolean()) {
+            doc.add(new BinaryDocValuesField("bdv", new BytesRef(randomAlphaOfLength(3))));
+        }
+        if (randomBoolean()) {
+            doc.add(new SortedDocValuesField("sdv", new BytesRef(randomAlphaOfLength(3))));
+        }
+        int numValues = random().nextInt(5);
+        for (int i = 0; i < numValues; ++i) {
+            doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(randomAlphaOfLength(3))));
+        }
+        numValues = random().nextInt(5);
+        for (int i = 0; i < numValues; ++i) {
+            doc.add(new SortedNumericDocValuesField("sndv", random().nextInt(1024)));
+        }
+    }
+
+    static void addRandomPostings(Document doc) {
+        for (IndexOptions opts : IndexOptions.values()) {
+            if (opts == IndexOptions.NONE) {
+                continue;
+            }
+            FieldType ft = new FieldType();
+            ft.setIndexOptions(opts);
+            ft.freeze();
+            final int numFields = random().nextInt(5);
+            for (int j = 0; j < numFields; ++j) {
+                doc.add(new Field("f_" + opts, randomAlphaOfLength(5), ft));
+            }
+        }
+    }
+
+    static void addRandomStoredFields(Document doc, int numFields) {
+        final int numValues = random().nextInt(3);
+        for (int i = 0; i < numValues; ++i) {
+            final String field = "sf-" + between(1, numFields);
+            if (randomBoolean()) {
+                doc.add(new StoredField(field, randomAlphaOfLength(5)));
+            } else {
+                doc.add(new StoredField(field, randomLong()));
+            }
+        }
+    }
+
+    static void addRandomPoints(Document doc) {
+        final int numValues = random().nextInt(5);
+        for (int i = 0; i < numValues; i++) {
+            doc.add(new IntPoint("pt-" + randomIntBetween(1, 2), random().nextInt()));
+        }
+    }
+
+    static FieldType randomTermVectorsFieldType() {
+        FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
+        fieldType.setStoreTermVectors(true);
+        fieldType.setStoreTermVectorPositions(randomBoolean());
+        fieldType.setStoreTermVectorOffsets(randomBoolean());
+        // TODO: add payloads
+        fieldType.setStoreTermVectorPayloads(false);
+        return fieldType;
+    }
+
+    static void addRandomTermVectors(Document doc) {
+        int numFields = randomFrom(1, 3);
+        for (int f = 0; f < numFields; f++) {
+            doc.add(new Field("vector-" + f, randomAlphaOfLength(5), randomTermVectorsFieldType()));
+        }
+    }
+
+    static void addRandomFields(Document doc) {
+        if (randomBoolean()) {
+            addRandomDocValuesField(doc);
+        }
+        if (randomBoolean()) {
+            addRandomPostings(doc);
+        }
+        if (randomBoolean()) {
+            addRandomPoints(doc);
+        }
+        if (randomBoolean()) {
+            addRandomStoredFields(doc, between(1, 3));
+        }
+        if (randomBoolean()) {
+            addRandomTermVectors(doc);
+        }
+    }
+
+    static class FieldLookup {
+        private final Map<String, FieldInfo> dvSuffixes = new HashMap<>();
+        private final Map<String, FieldInfo> postingsSuffixes = new HashMap<>();
+
+        FieldLookup(FieldInfos fieldInfos) {
+            for (FieldInfo field : fieldInfos) {
+                Map<String, String> attributes = field.attributes();
+                if (attributes != null) {
+                    String postingsSuffix = attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY);
+                    if (postingsSuffix != null) {
+                        postingsSuffixes.put(postingsSuffix, field);
+                    }
+                    String dvSuffix = attributes.get(PerFieldDocValuesFormat.PER_FIELD_SUFFIX_KEY);
+                    if (dvSuffix != null) {
+                        dvSuffixes.put(dvSuffix, field);
+                    }
+                }
+            }
+        }
+
+        /**
+         * Returns the codec suffix from this file name, or null if there is no suffix.
+         */
+        private static String parseSuffix(String filename) {
+            if (filename.startsWith("_") == false) {
+                return null;
+            }
+            String[] parts = IndexFileNames.stripExtension(filename).substring(1).split("_");
+            // 4 cases:
+            // segment.ext
+            // segment_gen.ext
+            // segment_codec_suffix.ext
+            // segment_gen_codec_suffix.ext
+            if (parts.length == 3) {
+                return parts[2];
+            } else if (parts.length == 4) {
+                return parts[3];
+            } else {
+                return null;
+            }
+        }
+
+        String getDocValuesField(String fileName) {
+            final String suffix = parseSuffix(fileName);
+            final FieldInfo field = dvSuffixes.get(suffix);
+            assertThat("dvSuffix[" + dvSuffixes + "] fileName[" + fileName + "]", field, notNullValue());
+            return field.name;
+        }
+
+        String getPostingsField(String fileName) {
+            final String suffix = parseSuffix(fileName);
+            final FieldInfo field = postingsSuffixes.get(suffix);
+            assertThat("postingsSuffixes[" + postingsSuffixes + "] fileName[" + fileName + "]", field, notNullValue());
+            return field.name;
+        }
+    }
+
+    static void rewriteIndexWithPerFieldCodec(Directory source, CodecMode mode, Directory dst) throws IOException {
+        try (DirectoryReader reader = DirectoryReader.open(source)) {
+            IndexWriterConfig config = new IndexWriterConfig()
+                .setSoftDeletesField(Lucene.SOFT_DELETES_FIELD)
+                .setUseCompoundFile(randomBoolean())
+                .setCodec(new Lucene87Codec(mode.mode()) {
+                    @Override
+                    public PostingsFormat getPostingsFormatForField(String field) {
+                        return new Lucene84PostingsFormat();
+                    }
+
+                    @Override
+                    public DocValuesFormat getDocValuesFormatForField(String field) {
+                        return mode.dvFormat();
+                    }
+
+                    @Override
+                    public String toString() {
+                        return super.toString();
+                    }
+                })
+                .setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+            try (IndexWriter writer = new IndexWriter(dst, config)) {
+                for (LeafReaderContext leaf : reader.leaves()) {
+                    final SegmentReader segmentReader = Lucene.segmentReader(leaf.reader());
+                    writer.addIndexes(segmentReader);
+                }
+                writer.commit();
+            }
+        }
+    }
+
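+    /**
+     * Computes the expected per-field disk usage of the rewritten index by summing the length of every
+     * file produced by a per-field format, attributing each file to its field via {@link FieldLookup}.
+     */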
+    static IndexDiskUsageStats collectPerFieldStats(Directory directory) throws IOException {
+        try (DirectoryReader reader = DirectoryReader.open(directory)) {
+            final IndexDiskUsageStats stats = new IndexDiskUsageStats(IndexDiskUsageAnalyzer.getIndexSize(lastCommit(directory)));
+            for (LeafReaderContext leaf : reader.leaves()) {
+                collectPerFieldStats(Lucene.segmentReader(leaf.reader()), stats);
+            }
+            return stats;
+        }
+    }
+
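+    /**
+     * Resolves the files of a single segment, opening the compound reader when the segment uses a
+     * compound file, and attributes each file's length to a field (or an aggregate bucket) based on
+     * its extension.
+     */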
+    static void collectPerFieldStats(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        final SegmentInfo sis = reader.getSegmentInfo().info;
+        final String[] files;
+        final Directory directory;
+        if (sis.getUseCompoundFile()) {
+            directory = sis.getCodec().compoundFormat().getCompoundReader(reader.directory(), sis, IOContext.READ);
+            files = directory.listAll();
+        } else {
+            directory = reader.directory();
+            files = sis.files().toArray(new String[0]);
+        }
+        final FieldLookup fieldLookup = new FieldLookup(reader.getFieldInfos());
+        try {
+            for (String file : files) {
+                final LuceneFilesExtensions ext = LuceneFilesExtensions.fromFile(file);
+                if (ext == null) {
+                    continue;
+                }
+                final long bytes = directory.fileLength(file);
+                switch (ext) {
+                    case DVD:
+                    case DVM:
+                        stats.addDocValues(fieldLookup.getDocValuesField(file), bytes);
+                        break;
+                    case TIM:
+                    case TIP:
+                    case TMD:
+                    case DOC:
+                    case POS:
+                    case PAY:
+                        stats.addInvertedIndex(fieldLookup.getPostingsField(file), bytes);
+                        break;
+                    case KDI:
+                    case KDD:
+                    case KDM:
+                    case DIM:
+                        stats.addPoints("_all_points_fields", bytes);
+                        break;
+                    case FDT:
+                    case FDX:
+                    case FDM:
+                        // Stored fields, term vectors, and norms don't have a per-field codec, so their sizes are tracked under aggregate keys
+                        stats.addStoredField("_all_stored_fields", bytes);
+                        break;
+                    case TVX:
+                    case TVD:
+                        stats.addTermVectors("_all_vectors_fields", bytes);
+                        break;
+                    case NVD:
+                    case NVM:
+                        stats.addNorms("_all_norms_fields", bytes);
+                        break;
+                    default:
+                        break;
+                }
+            }
+        } finally {
+            if (directory != reader.directory()) {
+                IOUtils.close(directory);
+            }
+        }
+    }
+
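+    /**
+     * Compares the sizes reported by the disk usage analyzer against the expected per-field sizes,
+     * allowing a small margin between the analyzer's estimates and the actual file lengths.
+     */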
+    private static void assertStats(IndexDiskUsageStats actualStats, IndexDiskUsageStats perFieldStats) {
+        final List<String> fields = actualStats.getFields().keySet().stream().sorted().collect(Collectors.toList());
+        for (String field : fields) {
+            IndexDiskUsageStats.PerFieldDiskUsage actualField = actualStats.getFields().get(field);
+            IndexDiskUsageStats.PerFieldDiskUsage expectedField = perFieldStats.getFields().get(field);
+            if (expectedField == null) {
+                assertThat(actualField.getDocValuesBytes(), equalTo(0L));
+                assertThat(actualField.getInvertedIndexBytes(), equalTo(0L));
+                continue;
+            }
+            // Allow a difference of up to 2.5KB since up to 256 long values can be loaded into the table for numeric doc values
+            assertFieldStats(field, "doc values",
+                actualField.getDocValuesBytes(), expectedField.getDocValuesBytes(), 0.01, 2560);
+            assertFieldStats(field, "inverted index",
+                actualField.getInvertedIndexBytes(), expectedField.getInvertedIndexBytes(), 0.01, 1024);
+        }
+        // Per-field stats cannot be collected for stored fields, term vectors, points, and norms, so only the totals are compared
+        IndexDiskUsageStats.PerFieldDiskUsage actualTotal = actualStats.total();
+        IndexDiskUsageStats.PerFieldDiskUsage expectedTotal = perFieldStats.total();
+        assertFieldStats("total", "stored fields", actualTotal.getStoredFieldBytes(), expectedTotal.getStoredFieldBytes(), 0.01, 1024);
+        assertFieldStats("total", "points", actualTotal.getPointsBytes(), expectedTotal.getPointsBytes(), 0.01, 1024);
+        assertFieldStats("total", "term vectors", actualTotal.getTermVectorsBytes(), expectedTotal.getTermVectorsBytes(), 0.01, 1024);
+        assertFieldStats("total", "norms", actualTotal.getNormsBytes(), expectedTotal.getNormsBytes(), 0.01, 1024);
+    }
+
+    private static void assertFieldStats(String fieldName, String fieldType,
+                                         long actualBytes, long expectedBytes,
+                                         double allowErrorPercentage, long allowErrorBytes) {
+        long margin = allowErrorBytes;
+        if (allowErrorPercentage * actualBytes > allowErrorBytes) {
+            margin = (long) (allowErrorPercentage * actualBytes);
+        }
+        final boolean inRange = expectedBytes - margin <= actualBytes && actualBytes <= expectedBytes + margin;
+        if (inRange == false) {
+            throw new AssertionError(
+                "field=" + fieldName + " type=" + fieldType + " actual=" + actualBytes + " expected=" + expectedBytes);
+        }
+    }
+
+    private static IndexCommit lastCommit(Directory directory) throws IOException {
+        final List<IndexCommit> commits = DirectoryReader.listCommits(directory);
+        assertThat(commits, not(empty()));
+        return commits.get(commits.size() - 1);
+    }
+
+    private static ShardId testShardId() {
+        return new ShardId("test_index", "_na_", randomIntBetween(0, 3));
+    }
+}

+ 1 - 0
x-pack/plugin/security/qa/operator-privileges-tests/src/javaRestTest/java/org/elasticsearch/xpack/security/operator/Constants.java

@@ -358,6 +358,7 @@ public class Constants {
         "indices:admin/forcemerge",
         "indices:admin/freeze",
         "indices:admin/get",
+        "indices:admin/analyze_disk_usage",
         "indices:admin/ilm/explain",
         "indices:admin/ilm/remove_policy",
         "indices:admin/ilm/retry",