Browse Source

Track search and fetch failure stats (#113988) (#114041)

This PR tracks the total number of query and fetch failures, in addition
to the existing metrics for each shard, and exposes them through the
stats API.
Nhat Nguyen 1 year ago
parent
commit
eb2763ee16

+ 5 - 0
docs/changelog/113988.yaml

@@ -0,0 +1,5 @@
+pr: 113988
+summary: Track search and fetch failure stats
+area: Stats
+type: enhancement
+issues: []

+ 117 - 1
server/src/internalClusterTest/java/org/elasticsearch/search/stats/SearchStatsIT.java

@@ -13,23 +13,35 @@ import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
 import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse;
 import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.routing.GroupShardsIterator;
 import org.elasticsearch.cluster.routing.ShardIterator;
 import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.index.mapper.OnScriptError;
 import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.index.query.RangeQueryBuilder;
 import org.elasticsearch.index.search.stats.SearchStats.Stats;
 import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.plugins.ScriptPlugin;
+import org.elasticsearch.script.LongFieldScript;
 import org.elasticsearch.script.MockScriptPlugin;
 import org.elasticsearch.script.Script;
+import org.elasticsearch.script.ScriptContext;
+import org.elasticsearch.script.ScriptEngine;
 import org.elasticsearch.script.ScriptType;
 import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
+import org.elasticsearch.search.lookup.SearchLookup;
 import org.elasticsearch.search.lookup.Source;
 import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.json.JsonXContent;
 
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.function.Function;
@@ -53,7 +65,7 @@ public class SearchStatsIT extends ESIntegTestCase {
 
     @Override
     protected Collection<Class<? extends Plugin>> nodePlugins() {
-        return Collections.singleton(CustomScriptPlugin.class);
+        return List.of(CustomScriptPlugin.class, FailingFieldPlugin.class);
     }
 
     public static class CustomScriptPlugin extends MockScriptPlugin {
@@ -68,6 +80,50 @@ public class SearchStatsIT extends ESIntegTestCase {
         }
     }
 
+    public static class FailingFieldPlugin extends Plugin implements ScriptPlugin {
+
+        @Override
+        public ScriptEngine getScriptEngine(Settings settings, Collection<ScriptContext<?>> contexts) {
+            return new ScriptEngine() {
+                @Override
+                public String getType() {
+                    return "failing_field";
+                }
+
+                @Override
+                @SuppressWarnings("unchecked")
+                public <FactoryType> FactoryType compile(
+                    String name,
+                    String code,
+                    ScriptContext<FactoryType> context,
+                    Map<String, String> params
+                ) {
+                    return (FactoryType) new LongFieldScript.Factory() {
+                        @Override
+                        public LongFieldScript.LeafFactory newFactory(
+                            String fieldName,
+                            Map<String, Object> params,
+                            SearchLookup searchLookup,
+                            OnScriptError onScriptError
+                        ) {
+                            return ctx -> new LongFieldScript(fieldName, params, searchLookup, onScriptError, ctx) {
+                                @Override
+                                public void execute() {
+                                    throw new IllegalArgumentException("Accessing failing field");
+                                }
+                            };
+                        }
+                    };
+                }
+
+                @Override
+                public Set<ScriptContext<?>> getSupportedContexts() {
+                    return Set.of(LongFieldScript.CONTEXT);
+                }
+            };
+        }
+    }
+
     @Override
     protected int numberOfReplicas() {
         return 0;
@@ -244,4 +300,64 @@ public class SearchStatsIT extends ESIntegTestCase {
         GroupShardsIterator<?> allAssignedShardsGrouped = state.routingTable().allAssignedShardsGrouped(indices, true);
         return allAssignedShardsGrouped.size();
     }
+
+    public void testFailureStats() throws Exception {
+        String indexName = "test";
+        XContentBuilder mapping = JsonXContent.contentBuilder().startObject();
+        mapping.startObject("runtime");
+        {
+            mapping.startObject("fail_me");
+            {
+                mapping.field("type", "long");
+                mapping.startObject("script").field("source", "").field("lang", "failing_field").endObject();
+            }
+            mapping.endObject();
+        }
+        mapping.endObject();
+        mapping.endObject();
+        int numOfShards = between(1, 5);
+        client().admin()
+            .indices()
+            .prepareCreate(indexName)
+            .setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, numOfShards))
+            .setMapping(mapping)
+            .get();
+        int numDocs = between(20, 100);
+        for (int i = 1; i < numDocs; i++) {
+            index(indexName, Integer.toString(i), Map.of("position", i));
+        }
+        refresh(indexName);
+        int numQueries = between(1, 10);
+        long failedQueries = 0;
+        for (int q = 0; q < numQueries; q++) {
+            expectThrows(Exception.class, () -> {
+                client().prepareSearch(indexName)
+                    .setQuery(new RangeQueryBuilder("fail_me").gt(10))
+                    .setAllowPartialSearchResults(true)
+                    .get();
+            });
+            failedQueries += numOfShards;
+            var stats = client().admin().indices().prepareStats(indexName).all().get().getTotal().search.getTotal();
+            assertThat(stats.getQueryCount(), equalTo(0L));
+            assertThat(stats.getQueryFailure(), equalTo(failedQueries));
+            assertThat(stats.getFetchCount(), equalTo(0L));
+            assertThat(stats.getFetchFailure(), equalTo(0L));
+        }
+        int numFetches = between(1, 10);
+        for (int q = 0; q < numFetches; q++) {
+            expectThrows(Exception.class, () -> {
+                client().prepareSearch(indexName)
+                    .setQuery(new RangeQueryBuilder("position").gt(0))
+                    .setFetchSource(false)
+                    .addFetchField("fail_me")
+                    .setSize(1000)
+                    .get();
+            });
+            var stats = client().admin().indices().prepareStats(indexName).all().get().getTotal().search.getTotal();
+            assertThat(stats.getQueryCount(), equalTo((q + 1L) * numOfShards));
+            assertThat(stats.getQueryFailure(), equalTo(failedQueries));
+            assertThat(stats.getFetchCount(), equalTo(0L));
+            assertThat(stats.getFetchFailure(), equalTo((q + 1L) * numOfShards));
+        }
+    }
 }

+ 1 - 0
server/src/main/java/org/elasticsearch/TransportVersions.java

@@ -232,6 +232,7 @@ public class TransportVersions {
     public static final TransportVersion ESQL_CCS_EXECUTION_INFO = def(8_756_00_0);
     public static final TransportVersion REGEX_AND_RANGE_INTERVAL_QUERIES = def(8_757_00_0);
     public static final TransportVersion RRF_QUERY_REWRITE = def(8_758_00_0);
+    public static final TransportVersion SEARCH_FAILURE_STATS = def(8_759_00_0);
 
     /*
      * STOP! READ THIS FIRST! No, really,

+ 57 - 19
server/src/main/java/org/elasticsearch/index/search/stats/SearchStats.java

@@ -9,6 +9,7 @@
 
 package org.elasticsearch.index.search.stats;
 
+import org.elasticsearch.TransportVersions;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -45,6 +46,9 @@ public class SearchStats implements Writeable, ToXContentFragment {
         private long suggestTimeInMillis;
         private long suggestCurrent;
 
+        private long queryFailure;
+        private long fetchFailure;
+
         private Stats() {
             // for internal use, initializes all counts to 0
         }
@@ -53,9 +57,11 @@ public class SearchStats implements Writeable, ToXContentFragment {
             long queryCount,
             long queryTimeInMillis,
             long queryCurrent,
+            long queryFailure,
             long fetchCount,
             long fetchTimeInMillis,
             long fetchCurrent,
+            long fetchFailure,
             long scrollCount,
             long scrollTimeInMillis,
             long scrollCurrent,
@@ -66,10 +72,12 @@ public class SearchStats implements Writeable, ToXContentFragment {
             this.queryCount = queryCount;
             this.queryTimeInMillis = queryTimeInMillis;
             this.queryCurrent = queryCurrent;
+            this.queryFailure = queryFailure;
 
             this.fetchCount = fetchCount;
             this.fetchTimeInMillis = fetchTimeInMillis;
             this.fetchCurrent = fetchCurrent;
+            this.fetchFailure = fetchFailure;
 
             this.scrollCount = scrollCount;
             this.scrollTimeInMillis = scrollTimeInMillis;
@@ -96,16 +104,47 @@ public class SearchStats implements Writeable, ToXContentFragment {
             suggestCount = in.readVLong();
             suggestTimeInMillis = in.readVLong();
             suggestCurrent = in.readVLong();
+
+            if (in.getTransportVersion().onOrAfter(TransportVersions.SEARCH_FAILURE_STATS)) {
+                queryFailure = in.readVLong();
+                fetchFailure = in.readVLong();
+            }
+        }
+
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            out.writeVLong(queryCount);
+            out.writeVLong(queryTimeInMillis);
+            out.writeVLong(queryCurrent);
+
+            out.writeVLong(fetchCount);
+            out.writeVLong(fetchTimeInMillis);
+            out.writeVLong(fetchCurrent);
+
+            out.writeVLong(scrollCount);
+            out.writeVLong(scrollTimeInMillis);
+            out.writeVLong(scrollCurrent);
+
+            out.writeVLong(suggestCount);
+            out.writeVLong(suggestTimeInMillis);
+            out.writeVLong(suggestCurrent);
+
+            if (out.getTransportVersion().onOrAfter(TransportVersions.SEARCH_FAILURE_STATS)) {
+                out.writeVLong(queryFailure);
+                out.writeVLong(fetchFailure);
+            }
         }
 
         public void add(Stats stats) {
             queryCount += stats.queryCount;
             queryTimeInMillis += stats.queryTimeInMillis;
             queryCurrent += stats.queryCurrent;
+            queryFailure += stats.queryFailure;
 
             fetchCount += stats.fetchCount;
             fetchTimeInMillis += stats.fetchTimeInMillis;
             fetchCurrent += stats.fetchCurrent;
+            fetchFailure += stats.fetchFailure;
 
             scrollCount += stats.scrollCount;
             scrollTimeInMillis += stats.scrollTimeInMillis;
@@ -119,9 +158,11 @@ public class SearchStats implements Writeable, ToXContentFragment {
         public void addForClosingShard(Stats stats) {
             queryCount += stats.queryCount;
             queryTimeInMillis += stats.queryTimeInMillis;
+            queryFailure += stats.queryFailure;
 
             fetchCount += stats.fetchCount;
             fetchTimeInMillis += stats.fetchTimeInMillis;
+            fetchFailure += stats.fetchFailure;
 
             scrollCount += stats.scrollCount;
             scrollTimeInMillis += stats.scrollTimeInMillis;
@@ -148,6 +189,10 @@ public class SearchStats implements Writeable, ToXContentFragment {
             return queryCurrent;
         }
 
+        public long getQueryFailure() {
+            return queryFailure;
+        }
+
         public long getFetchCount() {
             return fetchCount;
         }
@@ -164,6 +209,10 @@ public class SearchStats implements Writeable, ToXContentFragment {
             return fetchCurrent;
         }
 
+        public long getFetchFailure() {
+            return fetchFailure;
+        }
+
         public long getScrollCount() {
             return scrollCount;
         }
@@ -200,34 +249,17 @@ public class SearchStats implements Writeable, ToXContentFragment {
             return new Stats(in);
         }
 
-        @Override
-        public void writeTo(StreamOutput out) throws IOException {
-            out.writeVLong(queryCount);
-            out.writeVLong(queryTimeInMillis);
-            out.writeVLong(queryCurrent);
-
-            out.writeVLong(fetchCount);
-            out.writeVLong(fetchTimeInMillis);
-            out.writeVLong(fetchCurrent);
-
-            out.writeVLong(scrollCount);
-            out.writeVLong(scrollTimeInMillis);
-            out.writeVLong(scrollCurrent);
-
-            out.writeVLong(suggestCount);
-            out.writeVLong(suggestTimeInMillis);
-            out.writeVLong(suggestCurrent);
-        }
-
         @Override
         public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
             builder.field(Fields.QUERY_TOTAL, queryCount);
             builder.humanReadableField(Fields.QUERY_TIME_IN_MILLIS, Fields.QUERY_TIME, getQueryTime());
             builder.field(Fields.QUERY_CURRENT, queryCurrent);
+            builder.field(Fields.QUERY_FAILURE, queryFailure);
 
             builder.field(Fields.FETCH_TOTAL, fetchCount);
             builder.humanReadableField(Fields.FETCH_TIME_IN_MILLIS, Fields.FETCH_TIME, getFetchTime());
             builder.field(Fields.FETCH_CURRENT, fetchCurrent);
+            builder.field(Fields.FETCH_FAILURE, fetchFailure);
 
             builder.field(Fields.SCROLL_TOTAL, scrollCount);
             builder.humanReadableField(Fields.SCROLL_TIME_IN_MILLIS, Fields.SCROLL_TIME, getScrollTime());
@@ -248,9 +280,11 @@ public class SearchStats implements Writeable, ToXContentFragment {
             return queryCount == that.queryCount
                 && queryTimeInMillis == that.queryTimeInMillis
                 && queryCurrent == that.queryCurrent
+                && queryFailure == that.queryFailure
                 && fetchCount == that.fetchCount
                 && fetchTimeInMillis == that.fetchTimeInMillis
                 && fetchCurrent == that.fetchCurrent
+                && fetchFailure == that.fetchFailure
                 && scrollCount == that.scrollCount
                 && scrollTimeInMillis == that.scrollTimeInMillis
                 && scrollCurrent == that.scrollCurrent
@@ -265,9 +299,11 @@ public class SearchStats implements Writeable, ToXContentFragment {
                 queryCount,
                 queryTimeInMillis,
                 queryCurrent,
+                queryFailure,
                 fetchCount,
                 fetchTimeInMillis,
                 fetchCurrent,
+                fetchFailure,
                 scrollCount,
                 scrollTimeInMillis,
                 scrollCurrent,
@@ -377,10 +413,12 @@ public class SearchStats implements Writeable, ToXContentFragment {
         static final String QUERY_TIME = "query_time";
         static final String QUERY_TIME_IN_MILLIS = "query_time_in_millis";
         static final String QUERY_CURRENT = "query_current";
+        static final String QUERY_FAILURE = "query_failure";
         static final String FETCH_TOTAL = "fetch_total";
         static final String FETCH_TIME = "fetch_time";
         static final String FETCH_TIME_IN_MILLIS = "fetch_time_in_millis";
         static final String FETCH_CURRENT = "fetch_current";
+        static final String FETCH_FAILURE = "fetch_failure";
         static final String SCROLL_TOTAL = "scroll_total";
         static final String SCROLL_TIME = "scroll_time";
         static final String SCROLL_TIME_IN_MILLIS = "scroll_time_in_millis";

+ 10 - 1
server/src/main/java/org/elasticsearch/index/search/stats/ShardSearchStats.java

@@ -73,6 +73,7 @@ public final class ShardSearchStats implements SearchOperationListener {
                 statsHolder.suggestCurrent.dec();
             } else {
                 statsHolder.queryCurrent.dec();
+                statsHolder.queryFailure.inc();
             }
         });
     }
@@ -97,7 +98,10 @@ public final class ShardSearchStats implements SearchOperationListener {
 
     @Override
     public void onFailedFetchPhase(SearchContext searchContext) {
-        computeStats(searchContext, statsHolder -> statsHolder.fetchCurrent.dec());
+        computeStats(searchContext, statsHolder -> {
+            statsHolder.fetchCurrent.dec();
+            statsHolder.fetchFailure.inc();
+        });
     }
 
     @Override
@@ -170,14 +174,19 @@ public final class ShardSearchStats implements SearchOperationListener {
         final CounterMetric scrollCurrent = new CounterMetric();
         final CounterMetric suggestCurrent = new CounterMetric();
 
+        final CounterMetric queryFailure = new CounterMetric();
+        final CounterMetric fetchFailure = new CounterMetric();
+
         SearchStats.Stats stats() {
             return new SearchStats.Stats(
                 queryMetric.count(),
                 TimeUnit.NANOSECONDS.toMillis(queryMetric.sum()),
                 queryCurrent.count(),
+                queryFailure.count(),
                 fetchMetric.count(),
                 TimeUnit.NANOSECONDS.toMillis(fetchMetric.sum()),
                 fetchCurrent.count(),
+                fetchFailure.count(),
                 scrollMetric.count(),
                 TimeUnit.MICROSECONDS.toMillis(scrollMetric.sum()),
                 scrollCurrent.count(),

+ 2 - 0
server/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java

@@ -604,6 +604,8 @@ public class NodeStatsTests extends ESTestCase {
         indicesCommonStats.getRequestCache().add(new RequestCacheStats(++iota, ++iota, ++iota, ++iota));
 
         final SearchStats.Stats searchStats = new SearchStats.Stats(
+            ++iota,
+            ++iota,
             ++iota,
             ++iota,
             ++iota,

+ 3 - 3
server/src/test/java/org/elasticsearch/index/search/stats/SearchStatsTests.java

@@ -22,9 +22,9 @@ public class SearchStatsTests extends ESTestCase {
         // let's create two dummy search stats with groups
         Map<String, Stats> groupStats1 = new HashMap<>();
         Map<String, Stats> groupStats2 = new HashMap<>();
-        groupStats2.put("group1", new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1));
-        SearchStats searchStats1 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats1);
-        SearchStats searchStats2 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats2);
+        groupStats2.put("group1", new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1));
+        SearchStats searchStats1 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats1);
+        SearchStats searchStats2 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats2);
 
         // adding these two search stats and checking group stats are correct
         searchStats1.add(searchStats2);

+ 1 - 1
x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/indices/IndexStatsMonitoringDocTests.java

@@ -392,7 +392,7 @@ public class IndexStatsMonitoringDocTests extends BaseFilteredMonitoringDocTestC
         final IndexingStats.Stats indexingStats = new IndexingStats.Stats(++iota, ++iota, no, no, no, no, no, no, false, ++iota, no, no);
         commonStats.getIndexing().add(new IndexingStats(indexingStats));
 
-        final SearchStats.Stats searchStats = new SearchStats.Stats(++iota, ++iota, no, no, no, no, no, no, no, no, no, no);
+        final SearchStats.Stats searchStats = new SearchStats.Stats(++iota, ++iota, no, no, no, no, no, no, no, no, no, no, no, no);
         commonStats.getSearch().add(new SearchStats(searchStats, no, null));
 
         final SegmentsStats segmentsStats = new SegmentsStats();

+ 1 - 1
x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/indices/IndicesStatsMonitoringDocTests.java

@@ -186,7 +186,7 @@ public class IndicesStatsMonitoringDocTests extends BaseFilteredMonitoringDocTes
         final IndexingStats.Stats indexingStats = new IndexingStats.Stats(3L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, true, 5L, 0, 0);
         commonStats.getIndexing().add(new IndexingStats(indexingStats));
 
-        final SearchStats.Stats searchStats = new SearchStats.Stats(6L, 7L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
+        final SearchStats.Stats searchStats = new SearchStats.Stats(6L, 7L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
         commonStats.getSearch().add(new SearchStats(searchStats, 0L, null));
 
         final BulkStats bulkStats = new BulkStats(0L, 0L, 0L, 0L, 0L);

+ 1 - 1
x-pack/plugin/monitoring/src/test/java/org/elasticsearch/xpack/monitoring/collector/node/NodeStatsMonitoringDocTests.java

@@ -352,7 +352,7 @@ public class NodeStatsMonitoringDocTests extends BaseFilteredMonitoringDocTestCa
         indicesCommonStats.getQueryCache().add(new QueryCacheStats(++iota, ++iota, ++iota, ++iota, no));
         indicesCommonStats.getRequestCache().add(new RequestCacheStats(++iota, ++iota, ++iota, ++iota));
 
-        final SearchStats.Stats searchStats = new SearchStats.Stats(++iota, ++iota, no, no, no, no, no, no, no, no, no, no);
+        final SearchStats.Stats searchStats = new SearchStats.Stats(++iota, ++iota, no, no, no, no, no, no, no, no, no, no, no, no);
         indicesCommonStats.getSearch().add(new SearchStats(searchStats, no, null));
 
         final SegmentsStats segmentsStats = new SegmentsStats();