
Enable `_terms_enum` on `ip` fields (#94322)

The _terms_enum API currently does not support ip fields, even though
type-ahead-like completion on such fields is useful for UI purposes.
This change adds the ability to query ip fields via the _terms_enum API by
leveraging the terms enumeration that is available when doc_values are
enabled on the field, which is the default. To make prefix filtering fast, we
internally build a prefix automaton from the user-supplied prefix and
intersect it with each shard's terms enumeration, similar to what we already
do for keyword fields.

Closes #89933
Christoph Büscher, 2 years ago
parent commit d8021360ff
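
To make the approach described in the commit message concrete, here is a minimal sketch (not part of the commit; the class name is hypothetical) that builds the prefix automaton for the prefix "20" and runs it against two encoded addresses, mirroring the assertions in the new tests further down. It assumes placement in the org.elasticsearch.index.mapper package, since buildIpPrefixAutomaton is package-private.

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.InetAddressPoint;
import org.apache.lucene.util.automaton.CompiledAutomaton;

import java.net.InetAddress;

public class IpPrefixAutomatonSketch {
    public static void main(String[] args) throws Exception {
        // compile the automaton for the user-supplied prefix "20"
        CompiledAutomaton automaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("20");

        // ip terms are stored as their 16-byte (IPv4-mapped) encoding, so the automaton
        // runs over the same binary representation that the doc_values terms use
        byte[] match = InetAddressPoint.encode(InetAddress.getByName("205.0.1.2"));
        byte[] noMatch = InetAddressPoint.encode(InetAddress.getByName("2.2.2.2"));

        System.out.println(automaton.runAutomaton.run(match, 0, match.length));     // true
        System.out.println(automaton.runAutomaton.run(noMatch, 0, noMatch.length)); // false
    }
}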

+ 6 - 0
docs/changelog/94322.yaml

@@ -0,0 +1,6 @@
+pr: 94322
+summary: Enable `_terms_enum` on `ip` fields
+area: Mapping
+type: enhancement
+issues:
+ - 89933

+ 2 - 2
docs/reference/search/terms-enum.asciidoc

@@ -6,8 +6,8 @@
 
 The terms enum API can be used to discover terms in the index that match
 a partial string. Supported field types are <<keyword-field-type,`keyword`>>,
-<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>
-and <<version, `version`>>. This is used for auto-complete:
+<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>,
+<<version, `version`>> and <<ip, `ip`>>. This is used for auto-complete:
 
 [source,console]
 --------------------------------------------------

+ 34 - 1
server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java

@@ -12,12 +12,16 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.InetAddressPoint;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.IndexOrDocValuesQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.PointRangeQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.logging.DeprecationCategory;
 import org.elasticsearch.common.logging.DeprecationLogger;
@@ -49,7 +53,11 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.function.BiFunction;
 
-/** A {@link FieldMapper} for ip addresses. */
+import static org.elasticsearch.index.mapper.IpPrefixAutomatonUtil.buildIpPrefixAutomaton;
+
+/**
+ * A {@link FieldMapper} for ip addresses.
+ */
 public class IpFieldMapper extends FieldMapper {
 
     private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(IpFieldMapper.class);
@@ -417,6 +425,31 @@ public class IpFieldMapper extends FieldMapper {
             return DocValueFormat.IP;
         }
 
+        @Override
+        public TermsEnum getTerms(IndexReader reader, String prefix, boolean caseInsensitive, String searchAfter) throws IOException {
+
+            Terms terms = null;
+            // terms_enum for ip only works if doc values are enabled
+            if (hasDocValues()) {
+                terms = SortedSetDocValuesTerms.getTerms(reader, name());
+            }
+            if (terms == null) {
+                // Field does not exist on this shard.
+                return null;
+            }
+            BytesRef searchBytes = searchAfter == null ? null : new BytesRef(InetAddressPoint.encode(InetAddress.getByName(searchAfter)));
+            CompiledAutomaton prefixAutomaton = buildIpPrefixAutomaton(prefix);
+
+            if (prefixAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.ALL) {
+                TermsEnum result = terms.iterator();
+                if (searchAfter != null) {
+                    result = new SearchAfterTermsEnum(result, searchBytes);
+                }
+                return result;
+            }
+            return terms.intersect(prefixAutomaton, searchBytes);
+        }
+
         /**
          * @return true if field has been marked as a dimension field
          */

+ 229 - 0
server/src/main/java/org/elasticsearch/index/mapper/IpPrefixAutomatonUtil.java

@@ -0,0 +1,229 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.IntStream;
+
+import static org.apache.lucene.util.automaton.Operations.concatenate;
+
+/**
+ * Utility methods for building an {@link Automaton} from a prefix string
+ * entered for an `ip` field.
+ */
+public class IpPrefixAutomatonUtil {
+
+    private static final Automaton EMPTY_AUTOMATON = Automata.makeEmpty();
+    private static final Automaton IPV4_PREFIX = Automata.makeBinary(new BytesRef(new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 }));
+
+    static final Map<Integer, Automaton> INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP = new HashMap<>();
+    static {
+        for (int c = 0; c <= 255; c++) {
+            Automaton a = Automata.makeChar(c);
+            if (c > 0 && c < 10) {
+                // all one digit prefixes expand to the two digit range, i.e. 1 -> [10..19]
+                a = Operations.union(a, Automata.makeCharRange(c * 10, c * 10 + 9));
+                // 1 and 2 additionally expand to three digit ranges
+                if (c == 1) {
+                    a = Operations.union(a, Automata.makeCharRange(100, 199));
+                }
+                if (c == 2) {
+                    a = Operations.union(a, Automata.makeCharRange(200, 255));
+                }
+            }
+            if (c >= 10 && c < 26) {
+                int min = c * 10;
+                int max = Math.min(c * 10 + 9, 255);
+                a = Operations.union(a, Automata.makeCharRange(min, max));
+            }
+            INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP.put(c, a);
+        }
+    }
+
+    /**
+     * Create a {@link CompiledAutomaton} from the ip Prefix.
+     * If the prefix is empty, the automaton returned will accept everything.
+     */
+    static CompiledAutomaton buildIpPrefixAutomaton(String ipPrefix) {
+        Automaton result;
+        if (ipPrefix.isEmpty() == false) {
+            Automaton ipv4Automaton = createIp4Automaton(ipPrefix);
+            if (ipv4Automaton != null) {
+                ipv4Automaton = concatenate(IPV4_PREFIX, ipv4Automaton);
+            }
+            Automaton ipv6Automaton = getIpv6Automaton(ipPrefix);
+            result = Operations.union(ipv4Automaton, ipv6Automaton);
+        } else {
+            result = Automata.makeAnyBinary();
+        }
+        result = MinimizationOperations.minimize(result, Integer.MAX_VALUE);
+        return new CompiledAutomaton(result, null, false, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, true);
+    }
+
+    private static Automaton getIpv6Automaton(String ipPrefix) {
+        Automaton ipv6Automaton = EMPTY_AUTOMATON;
+        List<String> ip6Groups = parseIp6Prefix(ipPrefix);
+        if (ip6Groups.isEmpty() == false) {
+            ipv6Automaton = Automata.makeString("");
+            int groupsAdded = 0;
+            for (String group : ip6Groups) {
+                if (group.contains(".")) {
+                    // try to parse this as ipv4 ending part, but only if we already have some ipv6 specific stuff in front
+                    if (groupsAdded > 0) {
+                        ipv6Automaton = concatenate(ipv6Automaton, createIp4Automaton(group));
+                        groupsAdded += 2; // this counts as two bytes, missing bytes are padded already
+                    } else {
+                        return EMPTY_AUTOMATON;
+                    }
+                } else if (group.endsWith(":")) {
+                    groupsAdded++;
+                    // full block
+                    if (group.length() > 1) {
+                        group = group.substring(0, group.length() - 1);
+                        ipv6Automaton = concatenate(ipv6Automaton, automatonFromIPv6Group(padWithZeros(group, 4 - group.length())));
+                    } else {
+                        // single colon denotes left out zeros
+                        ipv6Automaton = concatenate(ipv6Automaton, Operations.repeat(Automata.makeChar(0)));
+                    }
+                } else {
+                    groupsAdded++;
+                    // partial block, we need to create all possibilities of byte sequences this could match
+                    ipv6Automaton = concatenate(ipv6Automaton, automatonFromIPv6Group(group));
+                }
+            }
+            // fill up the remainder of the 16 address bytes with wildcard matches; each group added so far counts as two bytes
+            for (int i = 0; i < 16 - groupsAdded * 2; i++) {
+                ipv6Automaton = concatenate(ipv6Automaton, Operations.optional(Automata.makeCharRange(0, 255)));
+            }
+        }
+        return ipv6Automaton;
+    }
+
+    static Automaton automatonFromIPv6Group(String ipv6Group) {
+        assert ipv6Group.length() > 0 && ipv6Group.length() <= 4 : "expected a full ipv6 group or prefix";
+        Automaton result = Automata.makeString("");
+        for (int leadingZeros = 0; leadingZeros <= 4 - ipv6Group.length(); leadingZeros++) {
+            int bytesAdded = 0;
+            String padded = padWithZeros(ipv6Group, leadingZeros);
+            Automaton a = Automata.makeString("");
+            while (padded.length() >= 2) {
+                a = concatenate(a, Automata.makeChar(Integer.parseInt(padded.substring(0, 2), 16)));
+                padded = padded.substring(2);
+                bytesAdded++;
+            }
+            if (padded.length() == 1) {
+                int value = Integer.parseInt(padded, 16);
+                a = concatenate(a, Operations.union(Automata.makeChar(value), Automata.makeCharRange(value * 16, value * 16 + 15)));
+                bytesAdded++;
+            }
+            if (bytesAdded != 2) {
+                a = concatenate(a, Automata.makeCharRange(0, 255));
+            }
+            result = Operations.union(result, a);
+        }
+        return result;
+    }
+
+    private static Pattern IPV4_GROUP_MATCHER = Pattern.compile(
+        "^((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2})\\.)?" + "((?:0|[1-9][0-9]{0,2}))?$"
+    );
+
+    /**
+     * Creates an {@link Automaton} that accepts the byte representation of every ipv4
+     * address that starts with the given prefix. If the prefix is not a valid ipv4 prefix, an automaton
+     * that accepts the empty language is returned.
+     */
+    static Automaton createIp4Automaton(String prefix) {
+        Matcher ip4Matcher = IPV4_GROUP_MATCHER.matcher(prefix);
+        if (ip4Matcher.matches() == false) {
+            return EMPTY_AUTOMATON;
+        }
+        int prefixBytes = 0;
+        byte[] completeByteGroups = new byte[4];
+        int completeBytes = 0;
+        // scan the groups the prefix matches
+        Automaton incompleteGroupAutomaton = Automata.makeString("");
+        for (int g = 1; g <= 4; g++) {
+            String group = ip4Matcher.group(g);
+            // note that intermediate groups might not match anything and can be empty
+            if (group != null) {
+                if (group.endsWith(".")) {
+                    // complete group found
+                    int value = Integer.parseInt(group.substring(0, group.length() - 1));
+                    if (value < 0 || value > 255) {
+                        // invalid group value, the prefix cannot match any ipv4 address
+                        return EMPTY_AUTOMATON;
+                    } else {
+                        completeByteGroups[completeBytes] = (byte) value;
+                        completeBytes++;
+                        prefixBytes++;
+                    }
+                } else {
+                    // if present, this is the last group
+                    int numberPrefix = Integer.parseInt(group);
+                    if (numberPrefix < 255) {
+                        incompleteGroupAutomaton = INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP.get(numberPrefix);
+                        prefixBytes++;
+                    } else {
+                        // this cannot be a valid ipv4 group
+                        return EMPTY_AUTOMATON;
+                    }
+                }
+            }
+        }
+        return concatenate(
+            List.of(
+                Automata.makeBinary(new BytesRef(completeByteGroups, 0, completeBytes)),
+                incompleteGroupAutomaton,
+                Operations.repeat(Automata.makeCharRange(0, 255), 4 - prefixBytes, 4 - prefixBytes)
+            )
+        );
+    }
+
+    private static String padWithZeros(String input, int leadingZeros) {
+        return new StringBuilder("0".repeat(leadingZeros)).append(input).toString();
+    }
+
+    private static Pattern IP6_BLOCK_MATCHER = Pattern.compile(
+        "([a-f0-9]{0,4}:)|([a-f0-9]{1,4}$)" // the ipv6 specific notation
+            + "|((?:(?:0|[1-9][0-9]{0,2})\\.){1,3}(?:0|[1-9][0-9]{0,2})?$)" // the optional ipv4 part
+    );
+
+    static List<String> parseIp6Prefix(String ip6Prefix) {
+        Matcher ip6blockMatcher = IP6_BLOCK_MATCHER.matcher(ip6Prefix);
+        int position = 0;
+        List<String> groups = new ArrayList<>();
+        while (ip6blockMatcher.find(position)) {
+            if (ip6blockMatcher.start() == position) {
+                position = ip6blockMatcher.end();
+                IntStream.rangeClosed(1, 3).mapToObj(i -> ip6blockMatcher.group(i)).filter(s -> s != null).forEach(groups::add);
+            } else {
+                return Collections.emptyList();
+            }
+        }
+        if (position != ip6Prefix.length()) {
+            // no full match, return empty list
+            return Collections.emptyList();
+        }
+        return groups;
+    }
+}

+ 0 - 107
server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

@@ -21,8 +21,6 @@ import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiTerms;
-import org.apache.lucene.index.ReaderSlice;
-import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.sandbox.search.DocValuesTermsQuery;
@@ -68,11 +66,9 @@ import org.elasticsearch.xcontent.XContentBuilder;
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
@@ -558,109 +554,6 @@ public final class KeywordFieldMapper extends FieldMapper {
             return terms.intersect(automaton, searchBytes);
         }
 
-        /**
-         * A simple terms implementation for SortedSetDocValues that only provides access to {@link TermsEnum} via
-         * {@link #iterator} and {@link #intersect(CompiledAutomaton, BytesRef)} methods.
-         * We have this custom implementation based on {@link MultiTerms} instead of using
-         * {@link org.apache.lucene.index.MultiDocValues#getSortedSetValues(IndexReader, String)}
-         * because {@link org.apache.lucene.index.MultiDocValues} builds global ordinals up-front whereas
-         * {@link MultiTerms}, which exposes the terms enum via {@link org.apache.lucene.index.MultiTermsEnum},
-         * merges terms on the fly.
-         */
-        static class SortedSetDocValuesTerms extends Terms {
-
-            public static Terms getTerms(IndexReader r, String field) throws IOException {
-                final List<LeafReaderContext> leaves = r.leaves();
-                if (leaves.size() == 1) {
-                    SortedSetDocValues sortedSetDocValues = leaves.get(0).reader().getSortedSetDocValues(field);
-                    if (sortedSetDocValues == null) {
-                        return null;
-                    } else {
-                        return new SortedSetDocValuesTerms(sortedSetDocValues);
-                    }
-                }
-
-                final List<Terms> termsPerLeaf = new ArrayList<>(leaves.size());
-                final List<ReaderSlice> slicePerLeaf = new ArrayList<>(leaves.size());
-
-                for (int leafIdx = 0; leafIdx < leaves.size(); leafIdx++) {
-                    LeafReaderContext ctx = leaves.get(leafIdx);
-                    SortedSetDocValues sortedSetDocValues = ctx.reader().getSortedSetDocValues(field);
-                    if (sortedSetDocValues != null) {
-                        termsPerLeaf.add(new SortedSetDocValuesTerms(sortedSetDocValues));
-                        slicePerLeaf.add(new ReaderSlice(ctx.docBase, r.maxDoc(), leafIdx));
-                    }
-                }
-
-                if (termsPerLeaf.isEmpty()) {
-                    return null;
-                } else {
-                    return new MultiTerms(termsPerLeaf.toArray(EMPTY_ARRAY), slicePerLeaf.toArray(ReaderSlice.EMPTY_ARRAY));
-                }
-            }
-
-            private final SortedSetDocValues values;
-
-            SortedSetDocValuesTerms(SortedSetDocValues values) {
-                this.values = values;
-            }
-
-            @Override
-            public TermsEnum iterator() throws IOException {
-                return values.termsEnum();
-            }
-
-            @Override
-            public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
-                if (startTerm == null) {
-                    return values.intersect(compiled);
-                } else {
-                    return super.intersect(compiled, startTerm);
-                }
-            }
-
-            @Override
-            public long size() throws IOException {
-                throw new UnsupportedOperationException();
-            }
-
-            @Override
-            public long getSumTotalTermFreq() throws IOException {
-                throw new UnsupportedOperationException();
-            }
-
-            @Override
-            public long getSumDocFreq() throws IOException {
-                throw new UnsupportedOperationException();
-            }
-
-            @Override
-            public int getDocCount() throws IOException {
-                throw new UnsupportedOperationException();
-            }
-
-            @Override
-            public boolean hasFreqs() {
-                throw new UnsupportedOperationException();
-            }
-
-            @Override
-            public boolean hasOffsets() {
-                throw new UnsupportedOperationException();
-            }
-
-            @Override
-            public boolean hasPositions() {
-                throw new UnsupportedOperationException();
-            }
-
-            @Override
-            public boolean hasPayloads() {
-                throw new UnsupportedOperationException();
-            }
-
-        }
-
         @Override
         public String typeName() {
             return CONTENT_TYPE;

+ 126 - 0
server/src/main/java/org/elasticsearch/index/mapper/SortedSetDocValuesTerms.java

@@ -0,0 +1,126 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.MultiTerms;
+import org.apache.lucene.index.ReaderSlice;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A simple terms implementation for SortedSetDocValues that only provides access to {@link TermsEnum} via
+ * {@link #iterator} and {@link #intersect(CompiledAutomaton, BytesRef)} methods.
+ * We have this custom implementation based on {@link MultiTerms} instead of using
+ * {@link org.apache.lucene.index.MultiDocValues#getSortedSetValues(IndexReader, String)}
+ * because {@link org.apache.lucene.index.MultiDocValues} builds global ordinals up-front whereas
+ * {@link MultiTerms}, which exposes the terms enum via {@link org.apache.lucene.index.MultiTermsEnum},
+ * merges terms on the fly.
+ */
+class SortedSetDocValuesTerms extends Terms {
+
+    public static Terms getTerms(IndexReader r, String field) throws IOException {
+        final List<LeafReaderContext> leaves = r.leaves();
+        if (leaves.size() == 1) {
+            SortedSetDocValues sortedSetDocValues = leaves.get(0).reader().getSortedSetDocValues(field);
+            if (sortedSetDocValues == null) {
+                return null;
+            } else {
+                return new org.elasticsearch.index.mapper.SortedSetDocValuesTerms(sortedSetDocValues);
+            }
+        }
+
+        final List<Terms> termsPerLeaf = new ArrayList<>(leaves.size());
+        final List<ReaderSlice> slicePerLeaf = new ArrayList<>(leaves.size());
+
+        for (int leafIdx = 0; leafIdx < leaves.size(); leafIdx++) {
+            LeafReaderContext ctx = leaves.get(leafIdx);
+            SortedSetDocValues sortedSetDocValues = ctx.reader().getSortedSetDocValues(field);
+            if (sortedSetDocValues != null) {
+                termsPerLeaf.add(new org.elasticsearch.index.mapper.SortedSetDocValuesTerms(sortedSetDocValues));
+                slicePerLeaf.add(new ReaderSlice(ctx.docBase, r.maxDoc(), leafIdx));
+            }
+        }
+
+        if (termsPerLeaf.isEmpty()) {
+            return null;
+        } else {
+            return new MultiTerms(termsPerLeaf.toArray(EMPTY_ARRAY), slicePerLeaf.toArray(ReaderSlice.EMPTY_ARRAY));
+        }
+    }
+
+    private final SortedSetDocValues values;
+
+    SortedSetDocValuesTerms(SortedSetDocValues values) {
+        this.values = values;
+    }
+
+    @Override
+    public TermsEnum iterator() throws IOException {
+        return values.termsEnum();
+    }
+
+    @Override
+    public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
+        if (startTerm == null) {
+            return values.intersect(compiled);
+        } else {
+            return super.intersect(compiled, startTerm);
+        }
+    }
+
+    @Override
+    public long size() throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long getSumTotalTermFreq() throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int getDocCount() throws IOException {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasFreqs() {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasOffsets() {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasPositions() {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasPayloads() {
+        throw new UnsupportedOperationException();
+    }
+
+}

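
To show how the two new helpers fit together, here is a minimal sketch (not part of the commit; the class name is hypothetical) that indexes a few encoded ip addresses, wraps their doc_values in SortedSetDocValuesTerms, and intersects them with a prefix automaton, roughly what the new getTerms override does per shard. It assumes the org.elasticsearch.index.mapper package, because both helpers are package-private.

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.InetAddressPoint;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.network.NetworkAddress;

import java.net.InetAddress;

public class IpTermsEnumSketch {
    public static void main(String[] args) throws Exception {
        try (Directory dir = new ByteBuffersDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
            for (String ip : new String[] { "1.2.3.4", "10.2.3.4", "2.2.2.2" }) {
                Document doc = new Document();
                // ip doc_values store the 16-byte (IPv4-mapped) encoding of each address
                doc.add(new SortedSetDocValuesField("ip_addr", new BytesRef(InetAddressPoint.encode(InetAddress.getByName(ip)))));
                writer.addDocument(doc);
            }
            writer.commit();
            try (IndexReader reader = DirectoryReader.open(dir)) {
                Terms terms = SortedSetDocValuesTerms.getTerms(reader, "ip_addr");
                // intersect the doc_values terms with the automaton for the prefix "1"
                TermsEnum termsEnum = terms.intersect(IpPrefixAutomatonUtil.buildIpPrefixAutomaton("1"), null);
                for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
                    // prints 1.2.3.4 and 10.2.3.4; 2.2.2.2 does not start with "1"
                    System.out.println(NetworkAddress.format(InetAddressPoint.decode(BytesRef.deepCopyOf(term).bytes)));
                }
            }
        }
    }
}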
+ 222 - 0
server/src/test/java/org/elasticsearch/index/mapper/IpPrefixAutomatonUtilTests.java

@@ -0,0 +1,222 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.lucene.document.InetAddressPoint;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
+import org.elasticsearch.common.network.NetworkAddress;
+import org.elasticsearch.test.ESTestCase;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.Arrays;
+
+import static org.elasticsearch.index.mapper.IpPrefixAutomatonUtil.parseIp6Prefix;
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.is;
+
+public class IpPrefixAutomatonUtilTests extends ESTestCase {
+
+    public void testCreateIp4PrefixAutomaton() throws UnknownHostException {
+        InetAddress randomIp = randomIp(true);
+        String ipString = NetworkAddress.format(randomIp);
+
+        // get a random prefix, some emphasis on shorter ones, and compile a prefix automaton for it
+        String randomPrefix = ipString.substring(0, randomBoolean() ? randomIntBetween(1, 6) : randomIntBetween(1, ipString.length()));
+        CompiledAutomaton ip4Automaton = compileAutomaton(IpPrefixAutomatonUtil.createIp4Automaton(randomPrefix));
+
+        // check that the original ip is accepted
+        assertTrue(ip4Automaton.runAutomaton.run(randomIp.getAddress(), 0, randomIp.getAddress().length));
+
+        // check that another random ip that doesn't have the same prefix isn't accepted
+        byte[] nonMatchingIp = randomValueOtherThanMany(ipv4 -> {
+            try {
+                return NetworkAddress.format(InetAddress.getByAddress(ipv4)).startsWith(randomPrefix);
+            } catch (UnknownHostException e) {
+                throw new RuntimeException(e);
+            }
+        }, () -> randomByteArrayOfLength(4));
+        assertFalse(ip4Automaton.runAutomaton.run(nonMatchingIp, 0, nonMatchingIp.length));
+
+        // no bytes sequence longer than four bytes should be accepted
+        byte[] fiveBytes = Arrays.copyOf(randomIp.getAddress(), 5);
+        fiveBytes[4] = randomByte();
+        assertFalse(ip4Automaton.runAutomaton.run(fiveBytes, 0, 5));
+
+        // the empty prefix should create an automaton that accepts every four-byte address
+        CompiledAutomaton acceptAll = compileAutomaton(IpPrefixAutomatonUtil.createIp4Automaton(""));
+        assertTrue(acceptAll.runAutomaton.run(randomByteArrayOfLength(4), 0, 4));
+    }
+
+    public void testIncompleteDecimalGroupAutomaton() throws UnknownHostException {
+        for (int p = 0; p <= 255; p++) {
+            String prefix = String.valueOf(p);
+            Automaton automaton = IpPrefixAutomatonUtil.INCOMPLETE_IP4_GROUP_AUTOMATON_LOOKUP.get(Integer.parseInt(prefix));
+            CompiledAutomaton compiledAutomaton = compileAutomaton(automaton);
+            for (int i = 0; i < 256; i++) {
+                if (String.valueOf(i).startsWith(prefix)) {
+                    assertTrue(compiledAutomaton.runAutomaton.run(new byte[] { (byte) i }, 0, 1));
+                } else {
+                    assertFalse(compiledAutomaton.runAutomaton.run(new byte[] { (byte) i }, 0, 1));
+                }
+            }
+        }
+    }
+
+    public void testBuildPrefixAutomaton() throws UnknownHostException {
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("10");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("1.2.3.4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("10.2.3.4"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("2.2.3.4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1::1"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("10::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("100::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1000::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1000::1.2.3.4"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+        }
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("1");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("1.2.3.4"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("10.2.3.4"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("2.2.3.4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1af::1:2"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1f::1:2"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("::1:2"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1cce:e003:0:0:9279:d8d3:ffff:ffff"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+
+        }
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("1.");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("1.2.3.4"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("10.2.3.4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("2.2.3.4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+        }
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("1:2");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("1:2::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1:2a::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1:2ab::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1:2ab5::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("10:2::3:4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("::1:2:3:4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+        }
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("::1:2");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("::1:2"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("0:0:1:2::1"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1:2ab::1"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+        }
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("1::1.2");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("1::1.2.3.4"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1::1.3.2.4"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+            encode = InetAddressPoint.encode(InetAddress.getByName("1::1.22.3.4"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+        }
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("201.");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("c935:1902::643f:9e65:0:0"));
+            assertFalse(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+        }
+        {
+            CompiledAutomaton compiledAutomaton = IpPrefixAutomatonUtil.buildIpPrefixAutomaton("935");
+            byte[] encode = InetAddressPoint.encode(InetAddress.getByName("0935:1902::643f:9e65:0:0"));
+            assertTrue(compiledAutomaton.runAutomaton.run(encode, 0, encode.length));
+        }
+    }
+
+    public void testParseIp6Prefix() {
+        assertThat(parseIp6Prefix("123"), contains("123"));
+        assertThat(parseIp6Prefix("123:12"), contains("123:", "12"));
+        assertThat(parseIp6Prefix("123::12"), contains("123:", ":", "12"));
+        assertThat(parseIp6Prefix("123::12:00ab"), contains("123:", ":", "12:", "00ab"));
+        assertThat(parseIp6Prefix("123::12:00ah"), is(empty()));
+        assertThat(parseIp6Prefix("12345:"), is(empty()));
+        assertThat(
+            parseIp6Prefix("2001:0db8:85a3:08d3:1319:8a2e:0370:7344"),
+            contains("2001:", "0db8:", "85a3:", "08d3:", "1319:", "8a2e:", "0370:", "7344")
+        );
+        assertThat(parseIp6Prefix("2001:db8:0:8d3:0:8a2e:70:7344"), contains("2001:", "db8:", "0:", "8d3:", "0:", "8a2e:", "70:", "7344"));
+        assertThat(parseIp6Prefix("2001:db8::1428:57ab"), contains("2001:", "db8:", ":", "1428:", "57ab"));
+        assertThat(parseIp6Prefix("::ffff:7f00:1"), contains(":", ":", "ffff:", "7f00:", "1"));
+        assertThat(parseIp6Prefix("::ffff:127.0.0.1"), contains(":", ":", "ffff:", "127.0.0.1"));
+        assertThat(parseIp6Prefix("::127."), contains(":", ":", "127."));
+        assertThat(parseIp6Prefix("::127.1.2"), contains(":", ":", "127.1.2"));
+        assertThat(parseIp6Prefix("::127.1.1f"), is(empty()));
+        assertThat(parseIp6Prefix("::127.1234.1.3"), is(empty()));
+        assertThat(parseIp6Prefix("::127.1234.1:3"), is(empty()));
+    }
+
+    public void testAutomatonFromIPv6Group() throws UnknownHostException {
+        expectThrows(AssertionError.class, () -> IpPrefixAutomatonUtil.automatonFromIPv6Group(""));
+        expectThrows(AssertionError.class, () -> IpPrefixAutomatonUtil.automatonFromIPv6Group("12345"));
+
+        // start with a 4-char hex string, build an automaton for a random prefix of it, then assert it is accepted
+        byte[] bytes = randomByteArrayOfLength(2);
+        String randomHex = new String(Hex.encodeHex(bytes)).replaceAll("^0+", "");
+        String prefix = randomHex.substring(0, randomIntBetween(1, randomHex.length()));
+        Automaton automaton = IpPrefixAutomatonUtil.automatonFromIPv6Group(prefix);
+        CompiledAutomaton compiledAutomaton = compileAutomaton(automaton);
+        assertTrue(compiledAutomaton.runAutomaton.run(bytes, 0, bytes.length));
+
+        // create random 4-char hex that isn't prefixed by the current prefix and check it isn't accepted
+        byte[] badGroup = randomValueOtherThanMany(
+            b -> new String(Hex.encodeHex(b)).replaceAll("^0+", "").startsWith(prefix),
+            () -> randomByteArrayOfLength(2)
+        );
+        assertFalse(compiledAutomaton.runAutomaton.run(badGroup, 0, badGroup.length));
+    }
+
+    private static CompiledAutomaton compileAutomaton(Automaton automaton) {
+        automaton = MinimizationOperations.minimize(automaton, Integer.MAX_VALUE);
+        CompiledAutomaton compiledAutomaton = new CompiledAutomaton(
+            automaton,
+            null,
+            false,
+            Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
+            true
+        );
+        return compiledAutomaton;
+    }
+}

+ 183 - 0
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/termsenum/TermsEnumTests.java

@@ -0,0 +1,183 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.core.termsenum;
+
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.common.network.NetworkAddress;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+import org.elasticsearch.xcontent.XContentFactory;
+import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;
+import org.elasticsearch.xpack.core.XPackSettings;
+import org.elasticsearch.xpack.core.termsenum.action.TermsEnumAction;
+import org.elasticsearch.xpack.core.termsenum.action.TermsEnumRequest;
+import org.elasticsearch.xpack.core.termsenum.action.TermsEnumResponse;
+
+import java.net.InetAddress;
+import java.util.Collection;
+import java.util.List;
+
+import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder;
+import static org.hamcrest.Matchers.contains;
+
+public class TermsEnumTests extends ESSingleNodeTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        return List.of(LocalStateCompositeXPackPlugin.class);
+    }
+
+    @Override
+    protected Settings nodeSettings() {
+        return Settings.builder().put(XPackSettings.SECURITY_ENABLED.getKey(), "false").build();
+    }
+
+    public void testTermsEnumIPBasic() throws Exception {
+        String indexName = "test";
+        createIndex(indexName);
+
+        client().admin()
+            .indices()
+            .preparePutMapping(indexName)
+            .setSource(
+                XContentFactory.jsonBuilder()
+                    .startObject()
+                    .startObject("_doc")
+                    .startObject("properties")
+                    .startObject("ip_addr")
+                    .field("type", "ip")
+                    .endObject()
+                    .endObject()
+                    .endObject()
+                    .endObject()
+            )
+            .get();
+        ensureGreen();
+
+        client().prepareIndex(indexName).setId("1").setSource(jsonBuilder().startObject().field("ip_addr", "1.2.3.4").endObject()).get();
+        client().prepareIndex(indexName).setId("2").setSource(jsonBuilder().startObject().field("ip_addr", "205.0.1.2").endObject()).get();
+        client().prepareIndex(indexName).setId("3").setSource(jsonBuilder().startObject().field("ip_addr", "2.2.2.2").endObject()).get();
+        client().prepareIndex(indexName)
+            .setId("4")
+            .setSource(jsonBuilder().startObject().field("ip_addr", "2001:db8::1:0:0:1").endObject())
+            .get();
+        client().prepareIndex(indexName).setId("5").setSource(jsonBuilder().startObject().field("ip_addr", "13.3.3.3").endObject()).get();
+        client().admin().indices().prepareRefresh().get();
+
+        {
+            TermsEnumResponse response = client().execute(TermsEnumAction.INSTANCE, new TermsEnumRequest(indexName).field("ip_addr")).get();
+            List<String> terms = response.getTerms();
+            assertEquals(5, terms.size());
+            assertThat(terms, contains("1.2.3.4", "2.2.2.2", "13.3.3.3", "205.0.1.2", "2001:db8::1:0:0:1"));
+        }
+        {
+            TermsEnumResponse response = client().execute(
+                TermsEnumAction.INSTANCE,
+                new TermsEnumRequest(indexName).field("ip_addr").searchAfter("13.3.3.3")
+            ).get();
+            List<String> terms = response.getTerms();
+            assertEquals(2, terms.size());
+            assertThat(terms, contains("205.0.1.2", "2001:db8::1:0:0:1"));
+        }
+        {
+            TermsEnumResponse response = client().execute(
+                TermsEnumAction.INSTANCE,
+                new TermsEnumRequest(indexName).field("ip_addr").string("2")
+            ).get();
+            List<String> terms = response.getTerms();
+            assertEquals(3, terms.size());
+            assertThat(terms, contains("2.2.2.2", "205.0.1.2", "2001:db8::1:0:0:1"));
+        }
+        {
+            TermsEnumResponse response = client().execute(
+                TermsEnumAction.INSTANCE,
+                new TermsEnumRequest(indexName).field("ip_addr").string("20")
+            ).get();
+            List<String> terms = response.getTerms();
+            assertEquals(2, terms.size());
+            assertThat(terms, contains("205.0.1.2", "2001:db8::1:0:0:1"));
+        }
+    }
+
+    public void testTermsEnumIPRandomized() throws Exception {
+        String indexName = "test_random";
+        createIndex(indexName);
+        int numDocs = 500;
+
+        client().admin()
+            .indices()
+            .preparePutMapping(indexName)
+            .setSource(
+                XContentFactory.jsonBuilder()
+                    .startObject()
+                    .startObject("_doc")
+                    .startObject("properties")
+                    .startObject("ip_addr")
+                    .field("type", "ip")
+                    .endObject()
+                    .endObject()
+                    .endObject()
+                    .endObject()
+            )
+            .get();
+        ensureGreen();
+
+        // create random ip test data
+        InetAddress[] randomIps = new InetAddress[numDocs];
+        BulkRequestBuilder bulkRequestBuilder = client().prepareBulk(indexName);
+        for (int i = 0; i < numDocs; i++) {
+            randomIps[i] = randomIp(randomBoolean());
+            bulkRequestBuilder.add(
+                client().prepareIndex(indexName)
+                    .setSource(jsonBuilder().startObject().field("ip_addr", NetworkAddress.format(randomIps[i])).endObject())
+            );
+        }
+        bulkRequestBuilder.get();
+        client().admin().indices().prepareRefresh().get();
+
+        // test short random prefixes; max length 7 should include some separators but still fit short ipv4 addresses
+        for (int prefixLength = 1; prefixLength < 7; prefixLength++) {
+            String randomPrefix = NetworkAddress.format(randomIps[randomIntBetween(0, numDocs - 1)])
+                .substring(0, prefixLength)
+                .replaceAll("^0*", "");
+            int expectedResults = 0;
+            for (int i = 0; i < numDocs; i++) {
+                if (NetworkAddress.format(randomIps[i]).startsWith(randomPrefix)) {
+                    expectedResults++;
+                }
+            }
+            TermsEnumResponse response = client().execute(
+                TermsEnumAction.INSTANCE,
+                new TermsEnumRequest(indexName).field("ip_addr").string(randomPrefix).size(numDocs)
+            ).get();
+            List<String> terms = response.getTerms();
+            assertEquals(
+                "expected " + expectedResults + " for prefix " + randomPrefix + " but was " + terms.size() + ", " + terms,
+                expectedResults,
+                terms.size()
+            );
+
+            // test search after functionality
+            int searchAfterPosition = randomIntBetween(0, terms.size() - 1);
+            expectedResults = expectedResults - searchAfterPosition - 1;
+            response = client().execute(
+                TermsEnumAction.INSTANCE,
+                new TermsEnumRequest(indexName).field("ip_addr")
+                    .string(randomPrefix)
+                    .size(numDocs)
+                    .searchAfter(terms.get(searchAfterPosition))
+            ).get();
+            assertEquals(
+                "expected " + expectedResults + " for prefix " + randomPrefix + " but was " + response.getTerms().size() + ", " + terms,
+                expectedResults,
+                response.getTerms().size()
+            );
+        }
+    }
+}