Browse Source

Add LimitedOffsetsEnum to limit token offsets (#86110)

Apply `max_analyzed_offset` to highlighting when offsets are
recorded in the index.

Fixes #86109
luyuncheng 3 years ago
parent
commit
f641000d19

+ 6 - 0
docs/changelog/86110.yaml

@@ -0,0 +1,6 @@
+pr: 86110
+summary: Add LimitedOffsetsEnum to Limited offset token
+area: Search
+type: enhancement
+issues:
+ - 86109

+ 8 - 1
server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomFieldHighlighter.java

@@ -36,6 +36,7 @@ class CustomFieldHighlighter extends FieldHighlighter {
     private final Locale breakIteratorLocale;
     private final int noMatchSize;
     private String fieldValue;
+    private final Integer queryMaxAnalyzedOffset;
 
     CustomFieldHighlighter(
         String field,
@@ -46,11 +47,13 @@ class CustomFieldHighlighter extends FieldHighlighter {
         int maxPassages,
         int maxNoHighlightPassages,
         PassageFormatter passageFormatter,
-        int noMatchSize
+        int noMatchSize,
+        Integer queryMaxAnalyzedOffset
     ) {
         super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter);
         this.breakIteratorLocale = breakIteratorLocale;
         this.noMatchSize = noMatchSize;
+        this.queryMaxAnalyzedOffset = queryMaxAnalyzedOffset;
     }
 
     FieldOffsetStrategy getFieldOffsetStrategy() {
@@ -106,6 +109,10 @@ class CustomFieldHighlighter extends FieldHighlighter {
     @Override
     protected Passage[] highlightOffsetsEnums(OffsetsEnum off) throws IOException {
 
+        if (queryMaxAnalyzedOffset != null) {
+            off = new LimitedOffsetsEnum(off, queryMaxAnalyzedOffset);
+        }
+
         final int contentLength = this.breakIterator.getText().getEndIndex();
 
         if (off.nextPosition() == false) {

+ 2 - 1
server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighter.java

@@ -193,7 +193,8 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
             maxPassages,
             (noMatchSize > 0 ? 1 : 0),
             getFormatter(field),
-            noMatchSize
+            noMatchSize,
+            queryMaxAnalyzedOffset
         );
     }
 

+ 56 - 0
server/src/main/java/org/elasticsearch/lucene/search/uhighlight/LimitedOffsetsEnum.java

@@ -0,0 +1,56 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.lucene.search.uhighlight;
+
+import org.apache.lucene.search.uhighlight.OffsetsEnum;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
+
+/**
+ * An {@link OffsetsEnum} wrapper that ends iteration as soon as the wrapped enum moves to a
+ * position whose start offset is greater than the configured maximum. Every other call is
+ * forwarded to the wrapped enum unchanged.
+ */
+public class LimitedOffsetsEnum extends OffsetsEnum {
+    private final OffsetsEnum wrapped;
+    private final int startOffsetLimit;
+
+    /**
+     * @param delegate  the enum that positions are read from
+     * @param maxOffset the largest start offset (inclusive) that is still reported
+     */
+    public LimitedOffsetsEnum(OffsetsEnum delegate, int maxOffset) {
+        this.wrapped = delegate;
+        this.startOffsetLimit = maxOffset;
+    }
+
+    /**
+     * Advances the wrapped enum by one position.
+     *
+     * @return {@code true} only if the wrapped enum advanced and the start offset of the new
+     *         position does not exceed the limit
+     */
+    @Override
+    public boolean nextPosition() throws IOException {
+        // Short-circuit: startOffset() is only consulted after the wrapped enum has advanced.
+        return wrapped.nextPosition() && wrapped.startOffset() <= startOffsetLimit;
+    }
+
+    /** Forwards to the wrapped enum. */
+    @Override
+    public int freq() throws IOException {
+        return wrapped.freq();
+    }
+
+    /** Forwards to the wrapped enum. */
+    @Override
+    public BytesRef getTerm() throws IOException {
+        return wrapped.getTerm();
+    }
+
+    /** Forwards to the wrapped enum. */
+    @Override
+    public int startOffset() throws IOException {
+        return wrapped.startOffset();
+    }
+
+    /** Forwards to the wrapped enum. */
+    @Override
+    public int endOffset() throws IOException {
+        return wrapped.endOffset();
+    }
+}

+ 100 - 1
server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java

@@ -9,6 +9,7 @@
 package org.elasticsearch.lucene.search.uhighlight;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -40,6 +41,8 @@ import org.elasticsearch.test.ESTestCase;
 
 import java.text.BreakIterator;
 import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
 
 import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 import static org.hamcrest.CoreMatchers.equalTo;
@@ -82,6 +85,34 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
         String[] expectedPassages,
         int maxAnalyzedOffset,
         Integer queryMaxAnalyzedOffset
+    ) throws Exception {
+        assertHighlightOneDoc(
+            fieldName,
+            inputs,
+            analyzer,
+            query,
+            locale,
+            breakIterator,
+            noMatchSize,
+            expectedPassages,
+            maxAnalyzedOffset,
+            queryMaxAnalyzedOffset,
+            UnifiedHighlighter.OffsetSource.ANALYSIS
+        );
+    }
+
+    private void assertHighlightOneDoc(
+        String fieldName,
+        String[] inputs,
+        Analyzer analyzer,
+        Query query,
+        Locale locale,
+        BreakIterator breakIterator,
+        int noMatchSize,
+        String[] expectedPassages,
+        int maxAnalyzedOffset,
+        Integer queryMaxAnalyzedOffset,
+        UnifiedHighlighter.OffsetSource offsetSource
     ) throws Exception {
         try (Directory dir = newDirectory()) {
             IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
@@ -106,7 +137,7 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
                 CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
                     searcher,
                     analyzer,
-                    UnifiedHighlighter.OffsetSource.ANALYSIS,
+                    offsetSource,
                     new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()),
                     locale,
                     breakIterator,
@@ -394,4 +425,72 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
             10
         );
     }
+
+    /**
+     * "Fun" starts at offsets 8 and 20 in the input; with a query limit of 10 only the first
+     * occurrence is expected to be highlighted, for both offset sources.
+     */
+    public void testExceedMaxAnalyzedOffsetWithRepeatedWords() throws Exception {
+        final TermQuery funQuery = new TermQuery(new Term("text", "Fun"));
+        final Analyzer whitespace = new WhitespaceAnalyzer();
+        final UnifiedHighlighter.OffsetSource[] sources = {
+            UnifiedHighlighter.OffsetSource.ANALYSIS,
+            UnifiedHighlighter.OffsetSource.POSTINGS };
+
+        for (UnifiedHighlighter.OffsetSource source : sources) {
+            // Fresh input, expectation and break iterator per run, as each run is independent.
+            final String[] docs = { "Testing Fun Testing Fun" };
+            final String[] expected = { "Testing <b>Fun</b> Testing Fun" };
+            final BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.ROOT);
+            assertHighlightOneDoc("text", docs, whitespace, funQuery, Locale.ROOT, sentences, 0, expected, 29, 10, source);
+        }
+    }
+
+    /**
+     * Picks a random offset source and a random query limit in [7, 19], then checks the
+     * highlighted output against the expectation registered for the smallest key that is
+     * greater than or equal to that limit.
+     */
+    public void testExceedMaxAnalyzedOffsetRandomOffset() throws Exception {
+        final TermQuery funQuery = new TermQuery(new Term("text", "fun"));
+        final Analyzer whitespace = new WhitespaceAnalyzer();
+
+        // Drawn before the limit so the sequence of randomized calls stays the same.
+        final UnifiedHighlighter.OffsetSource source;
+        if (randomBoolean()) {
+            source = UnifiedHighlighter.OffsetSource.ANALYSIS;
+        } else {
+            source = UnifiedHighlighter.OffsetSource.POSTINGS;
+        }
+
+        // Keys are the end offsets of the four lowercase "fun" tokens in the input below.
+        final TreeMap<Integer, String> expectedByLimit = new TreeMap<>(
+            Map.of(
+                7,
+                "Fun <b>fun</b> fun fun fun",
+                11,
+                "Fun <b>fun</b> <b>fun</b> fun fun",
+                15,
+                "Fun <b>fun</b> <b>fun</b> <b>fun</b> fun",
+                19,
+                "Fun <b>fun</b> <b>fun</b> <b>fun</b> <b>fun</b>"
+            )
+        );
+
+        final Integer limit = between(7, 19);
+        final String[] expected = { expectedByLimit.ceilingEntry(limit).getValue() };
+        final String[] docs = { "Fun fun fun fun fun" };
+        final BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.ROOT);
+
+        assertHighlightOneDoc("text", docs, whitespace, funQuery, Locale.ROOT, sentences, 0, expected, 47, limit, source);
+    }
 }