Browse Source

docs: add best practises for wildcard queries inside percolator queries

Martijn van Groningen 7 years ago
parent
commit
cef7bd2079
1 changed files with 280 additions and 0 deletions
  1. 280 0
      docs/reference/mapping/types/percolator.asciidoc

+ 280 - 0
docs/reference/mapping/types/percolator.asciidoc

@@ -423,6 +423,286 @@ This results in a response like this:
 --------------------------------------------------
 // TESTRESPONSE[s/"took": 6,/"took": "$body.took",/]
 
+[float]
+==== Optimizing wildcard queries
+
+Wildcard queries are more expensive than other queries for the percolator,
+especially if the wildcard expressions are large.
+
+In the case of `wildcard` queries with prefix wildcard expressions or just the `prefix` query,
+the `edge_ngram` token filter can be used to replace these queries with a regular `term`
+query on a field where the `edge_ngram` token filter is configured.
+
+Creating an index with custom analysis settings:
+
+[source,js]
+--------------------------------------------------
+PUT my_queries1
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "wildcard_prefix": { <1>
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "standard",
+            "lowercase",
+            "wildcard_edge_ngram"
+          ]
+        }
+      },
+      "filter": {
+        "wildcard_edge_ngram": { <2>
+          "type": "edge_ngram",
+          "min_gram": 1,
+          "max_gram": 32
+        }
+      }
+    }
+  },
+  "mappings": {
+    "query": {
+      "properties": {
+        "query": {
+          "type": "percolator"
+        },
+        "my_field": {
+          "type": "text",
+          "fields": {
+            "prefix": { <3>
+              "type": "text",
+              "analyzer": "wildcard_prefix",
+              "search_analyzer": "standard"
+            }
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+<1> The analyzer that generates the prefix tokens to be used at index time only.
+<2> Increase the `min_gram` and decrease `max_gram` settings based on your prefix search needs.
+<3> This multifield should be used to do the prefix search
+    with a `term` or `match` query instead of a `prefix` or `wildcard` query.
+
+
+Then instead of indexing the following query:
+
+[source,js]
+--------------------------------------------------
+{
+  "query": {
+    "wildcard": {
+      "my_field": "abc*"
+    }
+  }
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+this query below should be indexed:
+
+[source,js]
+--------------------------------------------------
+PUT /my_queries1/query/1?refresh
+{
+  "query": {
+    "term": {
+      "my_field.prefix": "abc"
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+This way the percolator can handle the second query more efficiently than the first query.
+
+The following search request will match with the previously indexed
+percolator query:
+
+[source,js]
+--------------------------------------------------
+GET /my_queries1/_search
+{
+  "query": {
+    "percolate": {
+      "field": "query",
+      "document": {
+        "my_field": "abcd"
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+[source,js]
+--------------------------------------------------
+{
+  "took": 6,
+  "timed_out": false,
+  "_shards": {
+    "total": 5,
+    "successful": 5,
+    "skipped": 0,
+    "failed": 0
+  },
+  "hits": {
+    "total": 1,
+    "max_score": 0.41501677,
+    "hits": [
+      {
+        "_index": "my_queries1",
+        "_type": "query",
+        "_id": "1",
+        "_score": 0.41501677,
+        "_source": {
+          "query": {
+            "term": {
+              "my_field.prefix": "abc"
+            }
+          }
+        },
+        "fields": {
+          "_percolator_document_slot": [
+            0
+          ]
+        }
+      }
+    ]
+  }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/"took": 6,/"took": "$body.took",/]
+
+The same technique can also be used to speed up suffix
+wildcard searches, by using the `reverse` token filter
+before the `edge_ngram` token filter.
+
+[source,js]
+--------------------------------------------------
+PUT my_queries2
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "wildcard_suffix": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "standard",
+            "lowercase",
+            "reverse",
+            "wildcard_edge_ngram"
+          ]
+        },
+        "wildcard_suffix_search_time": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "standard",
+            "lowercase",
+            "reverse"
+          ]
+        }
+      },
+      "filter": {
+        "wildcard_edge_ngram": {
+          "type": "edge_ngram",
+          "min_gram": 1,
+          "max_gram": 32
+        }
+      }
+    }
+  },
+  "mappings": {
+    "query": {
+      "properties": {
+        "query": {
+          "type": "percolator"
+        },
+        "my_field": {
+          "type": "text",
+          "fields": {
+            "suffix": {
+              "type": "text",
+              "analyzer": "wildcard_suffix",
+              "search_analyzer": "wildcard_suffix_search_time" <1>
+            }
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+<1> A custom analyzer is needed at search time too, because otherwise
+    the query terms are not reversed and would not match
+    the reversed suffix tokens.
+
+Then instead of indexing the following query:
+
+[source,js]
+--------------------------------------------------
+{
+  "query": {
+    "wildcard": {
+      "my_field": "*xyz"
+    }
+  }
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+the following query should be indexed:
+
+[source,js]
+--------------------------------------------------
+PUT /my_queries2/query/2?refresh
+{
+  "query": {
+    "match": { <1>
+      "my_field.suffix": "xyz"
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+<1> The `match` query should be used instead of the `term` query,
+    because text analysis needs to reverse the query terms.
+
+The following search request will match with the previously indexed
+percolator query:
+
+[source,js]
+--------------------------------------------------
+GET /my_queries2/_search
+{
+  "query": {
+    "percolate": {
+      "field": "query",
+      "document": {
+        "my_field": "wxyz"
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
 [float]
 ==== Dedicated Percolator Index