Kaynağa Gözat

Fix shard splitting for `nested` (#89351)

I broke shard splitting when `_routing` is required and you use `nested`
docs. The mapping would look like this:
```
"mappings": {
  "_routing": {
    "required": true
  },
  "properties": {
    "n": { "type": "nested" }
  }
}
```

If you attempt to split an index with a mapping like this it'll blow up
with an exception like this:
```
Caused by: [idx] org.elasticsearch.action.RoutingMissingException: routing is required for [idx]/[0]
	at org.elasticsearch.cluster.routing.IndexRouting$IdAndRoutingOnly.checkRoutingRequired(IndexRouting.java:181)
	at org.elasticsearch.cluster.routing.IndexRouting$IdAndRoutingOnly.getShard(IndexRouting.java:175)
```

This fixes the problem by entirely avoiding the branch of code. That
branch was trying to find any top level documents that don't have a
`_routing`. But we *know* that there aren't any top level documents
without a routing in this case - the routing is "required". ES wouldn't
have let you index any top level documents without the routing.

This also adds a small pile of REST layer tests for shard splitting that
hit various branches in this area. For extra paranoia.

Closes #88109
Nik Everett 3 yıl önce
ebeveyn
işleme
b327b17653

+ 6 - 0
docs/changelog/89351.yaml

@@ -0,0 +1,6 @@
+pr: 89351
+summary: Fix shard splitting for `nested`
+area: Indices APIs
+type: bug
+issues:
+ - 88109

+ 10 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.split/10_basic.yml

@@ -87,6 +87,11 @@ setup:
   - match: { _id:      "3"     }
   - match: { _source:  { foo: "hello world 3" } }
 
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }
+
 
 ---
 "Split from 1 to N":
@@ -177,6 +182,11 @@ setup:
   - match: { _id:      "3"     }
   - match: { _source:  { foo: "hello world 3" } }
 
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }
+
 ---
 "Create illegal split indices":
   # try to do an illegal split with number_of_routing_shards set

+ 304 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.split/40_routing_partition_size.yml

@@ -0,0 +1,304 @@
+more than 1:
+  - do:
+      indices.create:
+        index: source
+        wait_for_active_shards: 1
+        body:
+          settings:
+            index.number_of_shards: 2
+            index.number_of_replicas: 0
+            index.number_of_routing_shards: 4
+            index.routing_partition_size: 2
+          mappings:
+            _routing:
+              required: true
+
+  - do:
+      index:
+        index:   source
+        id:      1
+        routing: 1
+        body:    { "foo": "hello world" }
+
+  - do:
+      index:
+        index:   source
+        id:      2
+        routing: 2
+        body:    { "foo": "hello world 2" }
+
+  - do:
+      index:
+        index:   source
+        id:      3
+        routing: 3
+        body:    { "foo": "hello world 3" }
+
+  # make it read-only
+  - do:
+      indices.put_settings:
+        index: source
+        body:
+          index.blocks.write: true
+          index.number_of_replicas: 0
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+        index: source
+
+  # now we do the actual split
+  - do:
+      indices.split:
+        index: "source"
+        target: "target"
+        wait_for_active_shards: 1
+        master_timeout: 10s
+        body:
+          settings:
+            index.number_of_replicas: 0
+            index.number_of_shards: 4
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+
+  - do:
+      get:
+        index:   target
+        routing: 1
+        id:      1
+
+  - match: { _index:   target }
+  - match: { _id:      "1"     }
+  - match: { _source:  { foo: "hello world" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 2
+        id:      2
+
+  - match: { _index:   target }
+  - match: { _id:      "2"     }
+  - match: { _source:  { foo: "hello world 2" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 3
+        id:      3
+
+  - match: { _index:   target }
+  - match: { _id:      "3"     }
+  - match: { _source:  { foo: "hello world 3" } }
+
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }
+
+---
+exactly 1:
+  - do:
+      indices.create:
+        index: source
+        wait_for_active_shards: 1
+        body:
+          settings:
+            index.number_of_shards: 2
+            index.number_of_replicas: 0
+            index.number_of_routing_shards: 4
+            index.routing_partition_size: 1
+          mappings:
+            _routing:
+              required: true
+
+  - do:
+      index:
+        index:   source
+        id:      1
+        routing: 1
+        body:    { "foo": "hello world" }
+
+  - do:
+      index:
+        index:   source
+        id:      2
+        routing: 2
+        body:    { "foo": "hello world 2" }
+
+  - do:
+      index:
+        index:   source
+        id:      3
+        routing: 3
+        body:    { "foo": "hello world 3" }
+
+  # make it read-only
+  - do:
+      indices.put_settings:
+        index: source
+        body:
+          index.blocks.write: true
+          index.number_of_replicas: 0
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+        index: source
+
+  # now we do the actual split
+  - do:
+      indices.split:
+        index: "source"
+        target: "target"
+        wait_for_active_shards: 1
+        master_timeout: 10s
+        body:
+          settings:
+            index.number_of_replicas: 0
+            index.number_of_shards: 4
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+
+  - do:
+      get:
+        index:   target
+        routing: 1
+        id:      1
+
+  - match: { _index:   target }
+  - match: { _id:      "1"     }
+  - match: { _source:  { foo: "hello world" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 2
+        id:      2
+
+  - match: { _index:   target }
+  - match: { _id:      "2"     }
+  - match: { _source:  { foo: "hello world 2" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 3
+        id:      3
+
+  - match: { _index:   target }
+  - match: { _id:      "3"     }
+  - match: { _source:  { foo: "hello world 3" } }
+
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }
+
+---
+nested:
+  - do:
+      indices.create:
+        index: source
+        wait_for_active_shards: 1
+        body:
+          settings:
+            index.number_of_shards: 2
+            index.number_of_replicas: 0
+            index.number_of_routing_shards: 4
+            index.routing_partition_size: 2
+          mappings:
+            _routing:
+              required: true
+            properties:
+              n:
+                type: nested
+
+  - do:
+      index:
+        index:   source
+        id:      1
+        routing: 1
+        body:    { "foo": "hello world", "n": [{"foo": "goodbye world"}, {"foo": "more words"}] }
+
+  - do:
+      index:
+        index:   source
+        id:      2
+        routing: 2
+        body:    { "foo": "hello world 2" }
+
+  - do:
+      index:
+        index:   source
+        id:      3
+        routing: 3
+        body:    { "foo": "hello world 3" }
+
+  # make it read-only
+  - do:
+      indices.put_settings:
+        index: source
+        body:
+          index.blocks.write: true
+          index.number_of_replicas: 0
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+        index: source
+
+  # now we do the actual split
+  - do:
+      indices.split:
+        index: "source"
+        target: "target"
+        wait_for_active_shards: 1
+        master_timeout: 10s
+        body:
+          settings:
+            index.number_of_replicas: 0
+            index.number_of_shards: 4
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+
+  - do:
+      get:
+        index:   target
+        routing: 1
+        id:      1
+
+  - match: { _index:   target }
+  - match: { _id:      "1"     }
+  - match: { _source:  { "foo": "hello world", "n": [{"foo": "goodbye world"}, {"foo": "more words"}] } }
+
+  - do:
+      get:
+        index:   target
+        routing: 2
+        id:      2
+
+  - match: { _index:   target }
+  - match: { _id:      "2"     }
+  - match: { _source:  { foo: "hello world 2" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 3
+        id:      3
+
+  - match: { _index:   target }
+  - match: { _id:      "3"     }
+  - match: { _source:  { foo: "hello world 3" } }
+
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }

+ 205 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.split/50_routing_required.yml

@@ -0,0 +1,205 @@
+routing required:
+  - do:
+      indices.create:
+        index: source
+        wait_for_active_shards: 1
+        body:
+          settings:
+            index.number_of_shards: 2
+            index.number_of_replicas: 0
+            index.number_of_routing_shards: 4
+          mappings:
+            _routing:
+              required: true
+
+  - do:
+      index:
+        index:   source
+        id:      1
+        routing: 1
+        body:    { "foo": "hello world" }
+
+  - do:
+      index:
+        index:   source
+        id:      2
+        routing: 2
+        body:    { "foo": "hello world 2" }
+
+  - do:
+      index:
+        index:   source
+        id:      3
+        routing: 3
+        body:    { "foo": "hello world 3" }
+
+  # make it read-only
+  - do:
+      indices.put_settings:
+        index: source
+        body:
+          index.blocks.write: true
+          index.number_of_replicas: 0
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+        index: source
+
+  # now we do the actual split
+  - do:
+      indices.split:
+        index: "source"
+        target: "target"
+        wait_for_active_shards: 1
+        master_timeout: 10s
+        body:
+          settings:
+            index.number_of_replicas: 0
+            index.number_of_shards: 4
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+
+  - do:
+      get:
+        index:   target
+        routing: 1
+        id:      1
+
+  - match: { _index:   target }
+  - match: { _id:      "1"     }
+  - match: { _source:  { foo: "hello world" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 2
+        id:      2
+
+  - match: { _index:   target }
+  - match: { _id:      "2"     }
+  - match: { _source:  { foo: "hello world 2" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 3
+        id:      3
+
+  - match: { _index:   target }
+  - match: { _id:      "3"     }
+  - match: { _source:  { foo: "hello world 3" } }
+
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }
+
+---
+nested:
+  - skip:
+      version: " - 8.4.99"
+      reason: "fixed in 8.5.0"
+
+  - do:
+      indices.create:
+        index: source
+        wait_for_active_shards: 1
+        body:
+          settings:
+            index.number_of_shards: 2
+            index.number_of_replicas: 0
+            index.number_of_routing_shards: 4
+          mappings:
+            _routing:
+              required: true
+            properties:
+              n:
+                type: nested
+
+  - do:
+      index:
+        index:   source
+        id:      1
+        routing: 1
+        body:    { "foo": "hello world", "n": [{"foo": "goodbye world"}, {"foo": "more words"}] }
+
+  - do:
+      index:
+        index:   source
+        id:      2
+        routing: 2
+        body:    { "foo": "hello world 2" }
+
+  - do:
+      index:
+        index:   source
+        id:      3
+        routing: 3
+        body:    { "foo": "hello world 3" }
+
+  # make it read-only
+  - do:
+      indices.put_settings:
+        index: source
+        body:
+          index.blocks.write: true
+          index.number_of_replicas: 0
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+        index: source
+
+  # now we do the actual split
+  - do:
+      indices.split:
+        index: "source"
+        target: "target"
+        wait_for_active_shards: 1
+        master_timeout: 10s
+        body:
+          settings:
+            index.number_of_replicas: 0
+            index.number_of_shards: 4
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+
+  - do:
+      get:
+        index:   target
+        routing: 1
+        id:      1
+
+  - match: { _index:   target }
+  - match: { _id:      "1"     }
+  - match: { _source:  { "foo": "hello world", "n": [{"foo": "goodbye world"}, {"foo": "more words"}] } }
+
+  - do:
+      get:
+        index:   target
+        routing: 2
+        id:      2
+
+  - match: { _index:   target }
+  - match: { _id:      "2"     }
+  - match: { _source:  { foo: "hello world 2" } }
+
+  - do:
+      get:
+        index:   target
+        routing: 3
+        id:      3
+
+  - match: { _index:   target }
+  - match: { _id:      "3"     }
+  - match: { _source:  { foo: "hello world 3" } }
+
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }

+ 94 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.split/60_nested.yml

@@ -0,0 +1,94 @@
+---
+nested:
+  - do:
+      indices.create:
+        index: source
+        wait_for_active_shards: 1
+        body:
+          settings:
+            index.number_of_shards: 2
+            index.number_of_replicas: 0
+            index.number_of_routing_shards: 4
+          mappings:
+            properties:
+              n:
+                type: nested
+
+  - do:
+      index:
+        index:   source
+        id:      1
+        body:    { "foo": "hello world", "n": [{"foo": "goodbye world"}, {"foo": "more words"}] }
+
+  - do:
+      index:
+        index:   source
+        id:      2
+        body:    { "foo": "hello world 2" }
+
+  - do:
+      index:
+        index:   source
+        id:      3
+        body:    { "foo": "hello world 3" }
+
+  # make it read-only
+  - do:
+      indices.put_settings:
+        index: source
+        body:
+          index.blocks.write: true
+          index.number_of_replicas: 0
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+        index: source
+
+  # now we do the actual split
+  - do:
+      indices.split:
+        index: "source"
+        target: "target"
+        wait_for_active_shards: 1
+        master_timeout: 10s
+        body:
+          settings:
+            index.number_of_replicas: 0
+            index.number_of_shards: 4
+
+  - do:
+      cluster.health:
+        wait_for_status: green
+
+  - do:
+      get:
+        index:   target
+        id:      1
+
+  - match: { _index:   target }
+  - match: { _id:      "1"     }
+  - match: { _source:  { "foo": "hello world", "n": [{"foo": "goodbye world"}, {"foo": "more words"}] } }
+
+  - do:
+      get:
+        index:   target
+        id:      2
+
+  - match: { _index:   target }
+  - match: { _id:      "2"     }
+  - match: { _source:  { foo: "hello world 2" } }
+
+  - do:
+      get:
+        index:   target
+        id:      3
+
+  - match: { _index:   target }
+  - match: { _id:      "3"     }
+  - match: { _source:  { foo: "hello world 3" } }
+
+  - do:
+      search:
+        index: target
+  - match: { hits.total.value: 3 }

+ 15 - 4
server/src/main/java/org/elasticsearch/index/shard/ShardSplittingQuery.java

@@ -96,7 +96,7 @@ final class ShardSplittingQuery extends Query {
                     }
                     if (indexMetadata.isRoutingPartitionedIndex()) {
                         // this is the heaviest invariant. Here we have to visit all docs stored fields do extract _id and _routing
-                        // this this index is routing partitioned.
+                        // this index is routing partitioned.
                         Visitor visitor = new Visitor(leafReader);
                         TwoPhaseIterator twoPhaseIterator = parentBitSet == null
                             ? new RoutingPartitionedDocIdSetIterator(visitor)
@@ -122,10 +122,21 @@ final class ShardSplittingQuery extends Query {
                             return shardId == targetShardId;
                         }, leafReader, maybeWrapConsumer.apply(bitSet::set));
 
+                        // TODO have the IndexRouting build the query and pass routingRequired in
+                        boolean routingRequired = indexMetadata.mapping() == null ? false : indexMetadata.mapping().routingRequired();
                         // now if we have a mixed index where some docs have a _routing value and some don't we have to exclude the ones
-                        // with a routing value from the next iteration an delete / select based on the ID.
-                        if (terms.getDocCount() != leafReader.maxDoc()) {
-                            // this is a special case where some of the docs have no routing values this sucks but it's possible today
+                        // with a routing value from the next iteration and delete / select based on the ID.
+                        if (routingRequired == false && terms.getDocCount() != leafReader.maxDoc()) {
+                            /*
+                             * This is a special case where some docs don't have routing values.
+                             * It's annoying, but it's allowed to build an index where some documents
+                             * hve routing and others don't.
+                             *
+                             * Luckily, if the routing field is required in the mapping then we can
+                             * safely assume that all documents which are don't have a routing are
+                             * nested documents. And we pick those up later based on the assignment
+                             * of the document that contains them.
+                             */
                             FixedBitSet hasRoutingValue = new FixedBitSet(leafReader.maxDoc());
                             findSplitDocs(RoutingFieldMapper.NAME, ref -> false, leafReader, maybeWrapConsumer.apply(hasRoutingValue::set));
                             IntConsumer bitSetConsumer = maybeWrapConsumer.apply(bitSet::set);