|
|
@@ -19,16 +19,22 @@ package org.apache.lucene.queryparser;
|
|
|
|
|
|
import org.apache.lucene.analysis.Analyzer;
|
|
|
import org.apache.lucene.index.Term;
|
|
|
-import org.apache.lucene.search.*;
|
|
|
+import org.apache.lucene.search.BooleanClause;
|
|
|
+import org.apache.lucene.search.BooleanQuery;
|
|
|
+import org.apache.lucene.search.FuzzyQuery;
|
|
|
+import org.apache.lucene.search.MatchAllDocsQuery;
|
|
|
+import org.apache.lucene.search.PrefixQuery;
|
|
|
+import org.apache.lucene.search.Query;
|
|
|
import org.apache.lucene.util.QueryBuilder;
|
|
|
import org.apache.lucene.util.Version;
|
|
|
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
|
|
import org.elasticsearch.common.lucene.Lucene;
|
|
|
|
|
|
import java.util.Collections;
|
|
|
import java.util.Map;
|
|
|
|
|
|
/**
|
|
|
- * XSimpleQueryParser is used to parse human readable query syntax.
|
|
|
+ * SimpleQueryParser is used to parse human readable query syntax.
|
|
|
* <p>
|
|
|
* The main idea behind this parser is that a person should be able to type
|
|
|
* whatever they want to represent a query, and this parser will do its best
|
|
|
@@ -46,6 +52,8 @@ import java.util.Map;
|
|
|
* <li>'{@code -}' negates a single token: <tt>-token0</tt>
|
|
|
* <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
|
|
|
* <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
|
|
|
+ * <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
|
|
|
+ * <li>'{@code ~}N' at the end of phrases specifies near query: <tt>"term1 term2"~5</tt>
|
|
|
* <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
|
|
|
* </ul>
|
|
|
* <p>
|
|
|
@@ -114,6 +122,11 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
public static final int ESCAPE_OPERATOR = 1<<6;
|
|
|
/** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
|
|
|
public static final int WHITESPACE_OPERATOR = 1<<7;
|
|
|
+ /** Enables {@code FUZZY} operators: (~) on single terms */
|
|
|
+ public static final int FUZZY_OPERATOR = 1<<8;
|
|
|
+ /** Enables {@code NEAR} operators: (~) on phrases */
|
|
|
+ public static final int NEAR_OPERATOR = 1<<9;
|
|
|
+
|
|
|
|
|
|
private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;
|
|
|
|
|
|
@@ -269,6 +282,7 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
int start = ++state.index;
|
|
|
int copied = 0;
|
|
|
boolean escaped = false;
|
|
|
+ boolean hasSlop = false;
|
|
|
|
|
|
while (state.index < state.length) {
|
|
|
if (!escaped) {
|
|
|
@@ -282,10 +296,23 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
|
|
|
continue;
|
|
|
} else if (state.data[state.index] == '"') {
|
|
|
- // this should be the end of the phrase
|
|
|
- // all characters found will used for
|
|
|
- // creating the phrase query
|
|
|
- break;
|
|
|
+ // if there are still characters after the closing ", check for a
|
|
|
+ // tilde
|
|
|
+ if (state.length > (state.index + 1) &&
|
|
|
+ state.data[state.index+1] == '~' &&
|
|
|
+ (flags & NEAR_OPERATOR) != 0) {
|
|
|
+ state.index++;
|
|
|
+ // check for characters after the tilde
|
|
|
+ if (state.length > (state.index + 1)) {
|
|
|
+ hasSlop = true;
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ } else {
|
|
|
+ // this should be the end of the phrase
|
|
|
+ // all characters found will used for
|
|
|
+ // creating the phrase query
|
|
|
+ break;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -308,7 +335,12 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
// a complete phrase has been found and is parsed through
|
|
|
// through the analyzer from the given field
|
|
|
String phrase = new String(state.buffer, 0, copied);
|
|
|
- Query branch = newPhraseQuery(phrase);
|
|
|
+ Query branch;
|
|
|
+ if (hasSlop) {
|
|
|
+ branch = newPhraseQuery(phrase, parseFuzziness(state));
|
|
|
+ } else {
|
|
|
+ branch = newPhraseQuery(phrase, 0);
|
|
|
+ }
|
|
|
buildQueryTree(state, branch);
|
|
|
|
|
|
++state.index;
|
|
|
@@ -319,6 +351,7 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
int copied = 0;
|
|
|
boolean escaped = false;
|
|
|
boolean prefix = false;
|
|
|
+ boolean fuzzy = false;
|
|
|
|
|
|
while (state.index < state.length) {
|
|
|
if (!escaped) {
|
|
|
@@ -332,19 +365,14 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
++state.index;
|
|
|
|
|
|
continue;
|
|
|
- } else if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
|
|
|
- || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
|
|
|
- || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
|
|
|
- || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
|
|
|
- || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
|
|
|
- || ((state.data[state.index] == ' '
|
|
|
- || state.data[state.index] == '\t'
|
|
|
- || state.data[state.index] == '\n'
|
|
|
- || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
|
|
|
+ } else if (tokenFinished(state)) {
|
|
|
// this should be the end of the term
|
|
|
// all characters found will used for
|
|
|
// creating the term query
|
|
|
break;
|
|
|
+ } else if (copied > 0 && state.data[state.index] == '~' && (flags & FUZZY_OPERATOR) != 0) {
|
|
|
+ fuzzy = true;
|
|
|
+ break;
|
|
|
}
|
|
|
|
|
|
// wildcard tracks whether or not the last character
|
|
|
@@ -361,7 +389,17 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
if (copied > 0) {
|
|
|
final Query branch;
|
|
|
|
|
|
- if (prefix) {
|
|
|
+ if (fuzzy && (flags & FUZZY_OPERATOR) != 0) {
|
|
|
+ String token = new String(state.buffer, 0, copied);
|
|
|
+ int fuzziness = parseFuzziness(state);
|
|
|
+ // edit distance has a maximum, limit to the maximum supported
|
|
|
+ fuzziness = Math.min(fuzziness, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
|
|
|
+ if (fuzziness == 0) {
|
|
|
+ branch = newDefaultQuery(token);
|
|
|
+ } else {
|
|
|
+ branch = newFuzzyQuery(token, fuzziness);
|
|
|
+ }
|
|
|
+ } else if (prefix) {
|
|
|
// if a term is found with a closing '*' it is considered to be a prefix query
|
|
|
// and will have prefix added as an option
|
|
|
String token = new String(state.buffer, 0, copied - 1);
|
|
|
@@ -423,6 +461,60 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Helper parsing fuzziness from parsing state
|
|
|
+ * @return slop/edit distance, 0 in the case of non-parsing slop/edit string
|
|
|
+ */
|
|
|
+ private int parseFuzziness(State state) {
|
|
|
+ char slopText[] = new char[state.length];
|
|
|
+ int slopLength = 0;
|
|
|
+
|
|
|
+ if (state.data[state.index] == '~') {
|
|
|
+ while (state.index < state.length) {
|
|
|
+ state.index++;
|
|
|
+ // it's possible that the ~ was at the end, so check after incrementing
|
|
|
+ // to make sure we don't go out of bounds
|
|
|
+ if (state.index < state.length) {
|
|
|
+ if (tokenFinished(state)) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ slopText[slopLength] = state.data[state.index];
|
|
|
+ slopLength++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ int fuzziness = 0;
|
|
|
+ try {
|
|
|
+ fuzziness = Integer.parseInt(new String(slopText, 0, slopLength));
|
|
|
+ } catch (NumberFormatException e) {
|
|
|
+ // swallow number format exceptions parsing fuzziness
|
|
|
+ }
|
|
|
+ // negative -> 0
|
|
|
+ if (fuzziness < 0) {
|
|
|
+ fuzziness = 0;
|
|
|
+ }
|
|
|
+ return fuzziness;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Helper returning true if the state has reached the end of token.
|
|
|
+ */
|
|
|
+ private boolean tokenFinished(State state) {
|
|
|
+ if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
|
|
|
+ || (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
|
|
|
+ || (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
|
|
|
+ || (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
|
|
|
+ || (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
|
|
|
+ || ((state.data[state.index] == ' '
|
|
|
+ || state.data[state.index] == '\t'
|
|
|
+ || state.data[state.index] == '\n'
|
|
|
+ || state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* Factory method to generate a standard query (no phrase or prefix operators).
|
|
|
*/
|
|
|
@@ -439,12 +531,27 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Factory method to generate a phrase query.
|
|
|
+ * Factory method to generate a fuzzy query.
|
|
|
+ */
|
|
|
+ protected Query newFuzzyQuery(String text, int fuzziness) {
|
|
|
+ BooleanQuery bq = new BooleanQuery(true);
|
|
|
+ for (Map.Entry<String,Float> entry : weights.entrySet()) {
|
|
|
+ Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
|
|
|
+ if (q != null) {
|
|
|
+ q.setBoost(entry.getValue());
|
|
|
+ bq.add(q, BooleanClause.Occur.SHOULD);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return simplify(bq);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Factory method to generate a phrase query with slop.
|
|
|
*/
|
|
|
- protected Query newPhraseQuery(String text) {
|
|
|
+ protected Query newPhraseQuery(String text, int slop) {
|
|
|
BooleanQuery bq = new BooleanQuery(true);
|
|
|
for (Map.Entry<String,Float> entry : weights.entrySet()) {
|
|
|
- Query q = createPhraseQuery(entry.getKey(), text);
|
|
|
+ Query q = createPhraseQuery(entry.getKey(), text, slop);
|
|
|
if (q != null) {
|
|
|
q.setBoost(entry.getValue());
|
|
|
bq.add(q, BooleanClause.Occur.SHOULD);
|
|
|
@@ -518,4 +625,3 @@ public class XSimpleQueryParser extends QueryBuilder {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|