|  | @@ -4,59 +4,82 @@
 | 
	
		
			
				|  |  |  <titleabbrev>MinHash</titleabbrev>
 | 
	
		
			
				|  |  |  ++++
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -The `min_hash` token filter hashes each token of the token stream and divides
 | 
	
		
			
				|  |  | -the resulting hashes into buckets, keeping the lowest-valued hashes per
 | 
	
		
			
				|  |  | -bucket. It then returns these hashes as tokens.
 | 
	
		
			
				|  |  | +Uses the https://en.wikipedia.org/wiki/MinHash[MinHash] technique to produce a
 | 
	
		
			
				|  |  | +signature for a token stream. You can use MinHash signatures to estimate the
 | 
	
		
			
				|  |  | +similarity of documents. See <<analysis-minhash-tokenfilter-similarity-search>>.
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -The following are settings that can be set for a `min_hash` token filter.
 | 
	
		
			
				|  |  | +The `min_hash` filter performs the following operations on a token stream in
 | 
	
		
			
				|  |  | +order:
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -[cols="<,<", options="header",]
 | 
	
		
			
				|  |  | -|=======================================================================
 | 
	
		
			
				|  |  | -|Setting |Description
 | 
	
		
			
				|  |  | -|`hash_count` |The number of hashes to hash the token stream with. Defaults to `1`.
 | 
	
		
			
				|  |  | +. Hashes each token in the stream.
 | 
	
		
			
				|  |  | +. Assigns the hashes to buckets, keeping only the smallest hashes of each
 | 
	
		
			
				|  |  | +  bucket.
 | 
	
		
			
				|  |  | +. Outputs the smallest hash from each bucket as a token stream.
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -|`bucket_count` |The number of buckets to divide the minhashes into. Defaults to `512`.
 | 
	
		
			
				|  |  | +This filter uses Lucene's
 | 
	
		
			
				|  |  | +{lucene-analysis-docs}/minhash/MinHashFilter.html[MinHashFilter].
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -|`hash_set_size` |The number of minhashes to keep per bucket. Defaults to `1`.
 | 
	
		
			
				|  |  | +[[analysis-minhash-tokenfilter-configure-parms]]
 | 
	
		
			
				|  |  | +==== Configurable parameters
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -|`with_rotation` |Whether or not to fill empty buckets with the value of the first non-empty
 | 
	
		
			
				|  |  | -bucket to its circular right. Only takes effect if hash_set_size is equal to one.
 | 
	
		
			
				|  |  | -Defaults to `true` if bucket_count is greater than one, else `false`.
 | 
	
		
			
				|  |  | -|=======================================================================
 | 
	
		
			
				|  |  | +`bucket_count`::
 | 
	
		
			
				|  |  | +(Optional, integer)
 | 
	
		
			
				|  |  | +Number of buckets to which hashes are assigned. Defaults to `512`.
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -Some points to consider while setting up a `min_hash` filter:
 | 
	
		
			
				|  |  | +`hash_count`::
 | 
	
		
			
				|  |  | +(Optional, integer)
 | 
	
		
			
				|  |  | +Number of ways to hash each token in the stream. Defaults to `1`.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +`hash_set_size`::
 | 
	
		
			
				|  |  | +(Optional, integer)
 | 
	
		
			
				|  |  | +Number of hashes to keep from each bucket. Defaults to `1`.
 | 
	
		
			
				|  |  | ++
 | 
	
		
			
				|  |  | +Hashes are retained by ascending size, starting with the bucket's smallest hash
 | 
	
		
			
				|  |  | +first.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +`with_rotation`::
 | 
	
		
			
				|  |  | +(Optional, boolean)
 | 
	
		
			
				|  |  | +If `true`, the filter fills empty buckets with the value of the first non-empty
 | 
	
		
			
				|  |  | +bucket to its circular right if the `hash_set_size` is `1`. If the
 | 
	
		
			
				|  |  | +`bucket_count` argument is greater than `1`, this parameter defaults to `true`.
 | 
	
		
			
				|  |  | +Otherwise, this parameter defaults to `false`.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +[[analysis-minhash-tokenfilter-configuration-tips]]
 | 
	
		
			
				|  |  | +==== Tips for configuring the `min_hash` filter
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  * `min_hash` filter input tokens should typically be k-words shingles produced
 | 
	
		
			
				|  |  | -from <<analysis-shingle-tokenfilter,shingle token filter>>.  You should
 | 
	
		
			
				|  |  | +from <<analysis-shingle-tokenfilter,shingle token filter>>. You should
 | 
	
		
			
				|  |  |  choose `k` large enough so that the probability of any given shingle
 | 
	
		
			
				|  |  | -occurring in a  document is low. At the same time, as
 | 
	
		
			
				|  |  | +occurring in a document is low. At the same time, as
 | 
	
		
			
				|  |  |  internally each shingle is hashed into a 128-bit hash, you should choose
 | 
	
		
			
				|  |  |  `k` small enough so that all possible
 | 
	
		
			
				|  |  |  different k-words shingles can be hashed to a 128-bit hash with
 | 
	
		
			
				|  |  |  minimal collision.
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -* choosing the right settings for `hash_count`, `bucket_count` and
 | 
	
		
			
				|  |  | -`hash_set_size` needs some experimentation.
 | 
	
		
			
				|  |  | -** to improve the precision, you should increase `bucket_count` or
 | 
	
		
			
				|  |  | -`hash_set_size`. Higher values of `bucket_count` or `hash_set_size`
 | 
	
		
			
				|  |  | -will provide a higher guarantee that different tokens are
 | 
	
		
			
				|  |  | -indexed to different buckets.
 | 
	
		
			
				|  |  | -** to improve the recall,
 | 
	
		
			
				|  |  | -you should increase `hash_count` parameter. For example,
 | 
	
		
			
				|  |  | -setting `hash_count=2`, will make each token to be hashed in
 | 
	
		
			
				|  |  | -two different ways, thus increasing the number of potential
 | 
	
		
			
				|  |  | -candidates for search.
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -* the default settings makes the  `min_hash` filter to produce for
 | 
	
		
			
				|  |  | -each document 512 `min_hash` tokens, each is of size 16 bytes.
 | 
	
		
			
				|  |  | -Thus, each document's size will be increased by around 8Kb.
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -* `min_hash` filter is used to hash for Jaccard similarity. This means
 | 
	
		
			
				|  |  | +* We recommend you test different arguments for the `hash_count`, `bucket_count` and
 | 
	
		
			
				|  |  | +  `hash_set_size` parameters:
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +** To improve precision, increase the `bucket_count` or
 | 
	
		
			
				|  |  | +   `hash_set_size` arguments. Higher `bucket_count` and `hash_set_size` values
 | 
	
		
			
				|  |  | +   increase the likelihood that different tokens are indexed to different
 | 
	
		
			
				|  |  | +   buckets.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +** To improve the recall, increase the value of the `hash_count` argument. For
 | 
	
		
			
				|  |  | +   example, setting `hash_count` to `2` hashes each token in two different ways,
 | 
	
		
			
				|  |  | +   increasing the number of potential candidates for search.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +* By default, the `min_hash` filter produces 512 tokens for each document. Each
 | 
	
		
			
				|  |  | +token is 16 bytes in size. This means each document's size will be increased by
 | 
	
		
			
				|  |  | +around 8KB.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +* The `min_hash` filter is used for Jaccard similarity. This means
 | 
	
		
			
				|  |  |  that it doesn't matter how many times a document contains a certain token,
 | 
	
		
			
				|  |  |  only whether it contains it or not.
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -==== Theory
 | 
	
		
			
				|  |  | -MinHash token filter allows you to hash documents for similarity search.
 | 
	
		
			
				|  |  | +[[analysis-minhash-tokenfilter-similarity-search]]
 | 
	
		
			
				|  |  | +==== Using the `min_hash` token filter for similarity search
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +The `min_hash` token filter allows you to hash documents for similarity search.
 | 
	
		
			
				|  |  |  Similarity search, or nearest neighbor search, is a complex problem.
 | 
	
		
			
				|  |  |  A naive solution requires an exhaustive pairwise comparison between a query
 | 
	
		
			
				|  |  |  document and every document in an index. This is a prohibitive operation
 | 
	
	
		
			
				|  | @@ -88,18 +111,33 @@ document's tokens and chooses the minimum hash code among them.
 | 
	
		
			
				|  |  |  The minimum hash codes from all hash functions are combined
 | 
	
		
			
				|  |  |  to form a signature for the document.
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +[[analysis-minhash-tokenfilter-customize]]
 | 
	
		
			
				|  |  | +==== Customize and add to an analyzer
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +To customize the `min_hash` filter, duplicate it to create the basis for a new
 | 
	
		
			
				|  |  | +custom token filter. You can modify the filter using its configurable
 | 
	
		
			
				|  |  | +parameters.
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -==== Example of setting MinHash Token Filter in Elasticsearch
 | 
	
		
			
				|  |  | -Here is an example of setting up a `min_hash` filter:
 | 
	
		
			
				|  |  | +For example, the following <<indices-create-index,create index API>> request
 | 
	
		
			
				|  |  | +uses the following custom token filters to configure a new
 | 
	
		
			
				|  |  | +<<analysis-custom-analyzer,custom analyzer>>:
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -[source,js]
 | 
	
		
			
				|  |  | ---------------------------------------------------
 | 
	
		
			
				|  |  | -POST /index1
 | 
	
		
			
				|  |  | +* `my_shingle_filter`, a custom <<analysis-shingle-tokenfilter,`shingle`
 | 
	
		
			
				|  |  | +  filter>>. `my_shingle_filter` only outputs five-word shingles.
 | 
	
		
			
				|  |  | +* `my_minhash_filter`, a custom `min_hash` filter. `my_minhash_filter` hashes
 | 
	
		
			
				|  |  | +  each five-word shingle once. It then assigns the hashes to 512 buckets,
 | 
	
		
			
				|  |  | +  keeping only the smallest hash from each bucket.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +The request also assigns the custom analyzer to the `fingerprint` field mapping.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +[source,console]
 | 
	
		
			
				|  |  | +----
 | 
	
		
			
				|  |  | +PUT /my_index
 | 
	
		
			
				|  |  |  {
 | 
	
		
			
				|  |  |    "settings": {
 | 
	
		
			
				|  |  |      "analysis": {
 | 
	
		
			
				|  |  |        "filter": {
 | 
	
		
			
				|  |  | -        "my_shingle_filter": { <1>
 | 
	
		
			
				|  |  | +        "my_shingle_filter": {      <1>
 | 
	
		
			
				|  |  |            "type": "shingle",
 | 
	
		
			
				|  |  |            "min_shingle_size": 5,
 | 
	
		
			
				|  |  |            "max_shingle_size": 5,
 | 
	
	
		
			
				|  | @@ -107,10 +145,10 @@ POST /index1
 | 
	
		
			
				|  |  |          },
 | 
	
		
			
				|  |  |          "my_minhash_filter": {
 | 
	
		
			
				|  |  |            "type": "min_hash",
 | 
	
		
			
				|  |  | -          "hash_count": 1,   <2>
 | 
	
		
			
				|  |  | -          "bucket_count": 512, <3>
 | 
	
		
			
				|  |  | -          "hash_set_size": 1, <4>
 | 
	
		
			
				|  |  | -          "with_rotation": true <5>
 | 
	
		
			
				|  |  | +          "hash_count": 1,          <2>
 | 
	
		
			
				|  |  | +          "bucket_count": 512,      <3>
 | 
	
		
			
				|  |  | +          "hash_set_size": 1,       <4>
 | 
	
		
			
				|  |  | +          "with_rotation": true     <5>
 | 
	
		
			
				|  |  |          }
 | 
	
		
			
				|  |  |        },
 | 
	
		
			
				|  |  |        "analyzer": {
 | 
	
	
		
			
				|  | @@ -133,10 +171,10 @@ POST /index1
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  | ---------------------------------------------------
 | 
	
		
			
				|  |  | -// NOTCONSOLE
 | 
	
		
			
				|  |  | -<1> setting a shingle filter with 5-word shingles
 | 
	
		
			
				|  |  | -<2> setting min_hash filter to hash with 1 hash
 | 
	
		
			
				|  |  | -<3> setting min_hash filter to hash tokens into 512 buckets
 | 
	
		
			
				|  |  | -<4> setting min_hash filter to keep only a single smallest hash in each bucket
 | 
	
		
			
				|  |  | -<5> setting min_hash filter to fill empty buckets with values from neighboring buckets
 | 
	
		
			
				|  |  | +----
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +<1> Configures a custom shingle filter to output only five-word shingles.
 | 
	
		
			
				|  |  | +<2> Each five-word shingle in the stream is hashed once.
 | 
	
		
			
				|  |  | +<3> The hashes are assigned to 512 buckets.
 | 
	
		
			
				|  |  | +<4> Only the smallest hash in each bucket is retained.
 | 
	
		
			
				|  |  | +<5> The filter fills empty buckets with the values of neighboring buckets.
 |