Browse Source

Propagate status codes from shard failures appropriately (#118016) (#119205)

* Return 502 if the underlying error is `NodeNotConnectedException`

* Traverse through the cause stack trace and check for `NodeNotConnectedException` and undo `status()` override in `NodeDisconnectedException`

* Rewrite `while` condition

* Fix: precommit

* Let status codes propagate rather than walking the stacktrace explicitly

Walking the stacktrace explicitly and looking for a specific error (node
connection-related errors in this case) is a workaround rather than a
proper fix. Instead, let the status codes propagate all the way to the
top so that they can be reported as-is.

* Fix: unused import

* Fix null deref

* Do not map descendants of `ConnectTransportException` to `502`

* Fix: precommit

* Do not account for 4xx status codes

4xx status codes are not likely to appear along with 5xx status codes.
As a result, we do not need to account for them when looking at shard
failures' status codes.

* Remove unnecessary `switch` case

In reference to the previous commit: this case is no longer needed.

* Rewrite code comment

* Address review comments

* [CI] Auto commit changes from spotless

* Update docs/changelog/118016.yaml

---------

Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
Pawan Kartik 9 months ago
parent
commit
5e9f907000

+ 6 - 0
docs/changelog/118016.yaml

@@ -0,0 +1,6 @@
+pr: 118016
+summary: Propagate status codes from shard failures appropriately
+area: Search
+type: enhancement
+issues:
+ - 118482

+ 13 - 7
server/src/main/java/org/elasticsearch/action/search/SearchPhaseExecutionException.java

@@ -73,15 +73,21 @@ public class SearchPhaseExecutionException extends ElasticsearchException {
             // on coordinator node. so get the status from cause instead of returning SERVICE_UNAVAILABLE blindly
             return getCause() == null ? RestStatus.SERVICE_UNAVAILABLE : ExceptionsHelper.status(getCause());
         }
-        RestStatus status = shardFailures[0].status();
-        if (shardFailures.length > 1) {
-            for (int i = 1; i < shardFailures.length; i++) {
-                if (shardFailures[i].status().getStatus() >= RestStatus.INTERNAL_SERVER_ERROR.getStatus()) {
-                    status = shardFailures[i].status();
-                }
+        RestStatus status = null;
+        for (ShardSearchFailure shardFailure : shardFailures) {
+            RestStatus shardStatus = shardFailure.status();
+            int statusCode = shardStatus.getStatus();
+
+            // Return if it's an error that can be retried.
+            // These currently take precedence over other status code(s).
+            if (statusCode >= 502 && statusCode <= 504) {
+                return shardStatus;
+            } else if (statusCode >= 500) {
+                status = shardStatus;
             }
         }
-        return status;
+
+        return status == null ? shardFailures[0].status() : status;
     }
 
     public ShardSearchFailure[] shardFailures() {

+ 55 - 0
server/src/test/java/org/elasticsearch/action/search/SearchPhaseExecutionExceptionTests.java

@@ -22,6 +22,7 @@ import org.elasticsearch.indices.InvalidIndexTemplateException;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.SearchShardTarget;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.transport.NodeDisconnectedException;
 import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.XContent;
 import org.elasticsearch.xcontent.XContentParser;
@@ -31,6 +32,7 @@ import java.io.IOException;
 
 import static org.hamcrest.CoreMatchers.hasItem;
 import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
 
 public class SearchPhaseExecutionExceptionTests extends ESTestCase {
 
@@ -168,4 +170,57 @@ public class SearchPhaseExecutionExceptionTests extends ESTestCase {
 
         assertEquals(actual.status(), RestStatus.BAD_REQUEST);
     }
+
+    public void testOnlyWithCodesThatDoNotRequirePrecedence() {
+        int pickedIndex = randomIntBetween(0, 1);
+
+        // Pick one of these exceptions randomly.
+        var searchExceptions = new ElasticsearchException[] {
+            new ElasticsearchException("simulated"),
+            new NodeDisconnectedException(null, "unused message", "unused action", null) };
+
+        // Status codes that map to searchExceptions.
+        var expectedStatusCodes = new RestStatus[] { RestStatus.INTERNAL_SERVER_ERROR, RestStatus.BAD_GATEWAY };
+
+        ShardSearchFailure shardFailure1 = new ShardSearchFailure(
+            searchExceptions[pickedIndex],
+            new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
+        );
+
+        ShardSearchFailure shardFailure2 = new ShardSearchFailure(
+            searchExceptions[pickedIndex],
+            new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
+        );
+
+        SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
+            "search",
+            "all shards failed",
+            new ShardSearchFailure[] { shardFailure1, shardFailure2 }
+        );
+
+        assertThat(ex.status(), is(expectedStatusCodes[pickedIndex]));
+    }
+
+    public void testWithRetriableCodesThatTakePrecedence() {
+        // Maps to a 500.
+        ShardSearchFailure shardFailure1 = new ShardSearchFailure(
+            new ElasticsearchException("simulated"),
+            new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 1), null)
+        );
+
+        // Maps to a 502.
+        ShardSearchFailure shardFailure2 = new ShardSearchFailure(
+            new NodeDisconnectedException(null, "unused message", "unused action", null),
+            new SearchShardTarget("nodeID", new ShardId("someIndex", "someUUID", 2), null)
+        );
+
+        SearchPhaseExecutionException ex = new SearchPhaseExecutionException(
+            "search",
+            "all shards failed",
+            new ShardSearchFailure[] { shardFailure1, shardFailure2 }
+        );
+
+        // The 502 takes precedence over 500.
+        assertThat(ex.status(), is(RestStatus.BAD_GATEWAY));
+    }
 }