浏览代码

Fix false positive out of sync warning in synced-flush (#46576)

Synced-flush consists of three steps: (1) force-flush on every active
copy; (2) check for ongoing indexing operations; (3) seal copies if
there's no change since step 1. If some indexing operations have
completed on the primary but not yet on the replicas, then the Lucene
commits from step 1 on the replicas won't be the same as the primary's.
Step 2 would nevertheless pass if it's executed after all pending
operations are done. Once step 2 passes, we incorrectly emit the "out of
sync" warning message although nothing is wrong here.

Relates #28464
Relates #30244
Nhat Nguyen 6 年之前
父节点
当前提交
682dae0897

+ 2 - 2
server/src/main/java/org/elasticsearch/indices/flush/SyncedFlushService.java

@@ -387,9 +387,9 @@ public class SyncedFlushService implements IndexEventListener {
             if (preSyncedResponse.numDocs != numDocsOnPrimary &&
                 preSyncedResponse.numDocs != PreSyncedFlushResponse.UNKNOWN_NUM_DOCS &&
                 numDocsOnPrimary != PreSyncedFlushResponse.UNKNOWN_NUM_DOCS) {
-                logger.warn("{} can't to issue sync id [{}] for out of sync replica [{}] with num docs [{}]; num docs on primary [{}]",
+                logger.debug("{} can't issue sync id [{}] for replica [{}] with num docs [{}]; num docs on primary [{}]",
                     shardId, syncId, shard, preSyncedResponse.numDocs, numDocsOnPrimary);
-                results.put(shard, new ShardSyncedFlushResponse("out of sync replica; " +
+                results.put(shard, new ShardSyncedFlushResponse("ongoing indexing operations: " +
                     "num docs on replica [" + preSyncedResponse.numDocs + "]; num docs on primary [" + numDocsOnPrimary + "]"));
                 countDownAndSendResponseIfDone(syncId, shards, shardId, totalShards, listener, countDown, results);
                 continue;

+ 1 - 1
server/src/test/java/org/elasticsearch/indices/flush/FlushIT.java

@@ -309,7 +309,7 @@ public class FlushIT extends ESIntegTestCase {
         assertThat(partialResult.totalShards(), equalTo(numberOfReplicas + 1));
         assertThat(partialResult.successfulShards(), equalTo(numberOfReplicas));
         assertThat(partialResult.shardResponses().get(outOfSyncReplica.routingEntry()).failureReason, equalTo(
-            "out of sync replica; num docs on replica [" + (numDocs + extraDocs) + "]; num docs on primary [" + numDocs + "]"));
+            "ongoing indexing operations: num docs on replica [" + (numDocs + extraDocs) + "]; num docs on primary [" + numDocs + "]"));
         // Index extra documents to all shards - synced-flush should be ok.
         for (IndexShard indexShard : indexShards) {
             // Do reindex documents to the out of sync replica to avoid trigger merges