@@ -70,3 +70,384 @@
     - match: { detail.tokenizer.name: _anonymous_tokenizer }
     - match: { detail.tokenizer.tokens.0.token: foo }
     - match: { detail.tokenizer.tokens.1.token: bar }
+
+---
+"thai_tokenizer":
+    - do:
+        indices.analyze:
+          body:
+            text: "ภาษาไทย"
+            explain: true
+            tokenizer:
+              type: thai
+    - length: { detail.tokenizer.tokens: 2 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: ภาษา }
+    - match: { detail.tokenizer.tokens.1.token: ไทย }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "ภาษาไทย"
+            explain: true
+            tokenizer: thai
+    - length: { detail.tokenizer.tokens: 2 }
+    - match: { detail.tokenizer.name: thai }
+    - match: { detail.tokenizer.tokens.0.token: ภาษา }
+    - match: { detail.tokenizer.tokens.1.token: ไทย }
+
+---
+"ngram":
+    - do:
+        indices.analyze:
+          body:
+            text: "foobar"
+            explain: true
+            tokenizer:
+              type: ngram
+              min_gram: 3
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 4 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: foo }
+    - match: { detail.tokenizer.tokens.1.token: oob }
+    - match: { detail.tokenizer.tokens.2.token: oba }
+    - match: { detail.tokenizer.tokens.3.token: bar }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foobar"
+            explain: true
+            tokenizer:
+              type: nGram
+              min_gram: 3
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 4 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: foo }
+    - match: { detail.tokenizer.tokens.1.token: oob }
+    - match: { detail.tokenizer.tokens.2.token: oba }
+    - match: { detail.tokenizer.tokens.3.token: bar }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: ngram
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: ngram }
+    - match: { detail.tokenizer.tokens.0.token: f }
+    - match: { detail.tokenizer.tokens.1.token: fo }
+    - match: { detail.tokenizer.tokens.2.token: o }
+    - match: { detail.tokenizer.tokens.3.token: oo }
+    - match: { detail.tokenizer.tokens.4.token: o }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: nGram
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: nGram }
+    - match: { detail.tokenizer.tokens.0.token: f }
+    - match: { detail.tokenizer.tokens.1.token: fo }
+    - match: { detail.tokenizer.tokens.2.token: o }
+    - match: { detail.tokenizer.tokens.3.token: oo }
+    - match: { detail.tokenizer.tokens.4.token: o }
+
+---
+"edge_ngram":
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer:
+              type: edge_ngram
+              min_gram: 1
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: f }
+    - match: { detail.tokenizer.tokens.1.token: fo }
+    - match: { detail.tokenizer.tokens.2.token: foo }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer:
+              type: edgeNGram
+              min_gram: 1
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: f }
+    - match: { detail.tokenizer.tokens.1.token: fo }
+    - match: { detail.tokenizer.tokens.2.token: foo }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: edge_ngram
+    - length: { detail.tokenizer.tokens: 2 }
+    - match: { detail.tokenizer.name: edge_ngram }
+    - match: { detail.tokenizer.tokens.0.token: f }
+    - match: { detail.tokenizer.tokens.1.token: fo }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: edgeNGram
+    - length: { detail.tokenizer.tokens: 2 }
+    - match: { detail.tokenizer.name: edgeNGram }
+    - match: { detail.tokenizer.tokens.0.token: f }
+    - match: { detail.tokenizer.tokens.1.token: fo }
+
+---
+"classic":
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer:
+              type: classic
+    - length: { detail.tokenizer.tokens: 4 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: Brown }
+    - match: { detail.tokenizer.tokens.1.token: Foxes }
+    - match: { detail.tokenizer.tokens.2.token: don't }
+    - match: { detail.tokenizer.tokens.3.token: jump }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer: classic
+    - length: { detail.tokenizer.tokens: 4 }
+    - match: { detail.tokenizer.name: classic }
+    - match: { detail.tokenizer.tokens.0.token: Brown }
+    - match: { detail.tokenizer.tokens.1.token: Foxes }
+    - match: { detail.tokenizer.tokens.2.token: don't }
+    - match: { detail.tokenizer.tokens.3.token: jump }
+
+---
+"letter":
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer:
+              type: letter
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: Brown }
+    - match: { detail.tokenizer.tokens.1.token: Foxes }
+    - match: { detail.tokenizer.tokens.2.token: don }
+    - match: { detail.tokenizer.tokens.3.token: t }
+    - match: { detail.tokenizer.tokens.4.token: jump }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer: letter
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: letter }
+    - match: { detail.tokenizer.tokens.0.token: Brown }
+    - match: { detail.tokenizer.tokens.1.token: Foxes }
+    - match: { detail.tokenizer.tokens.2.token: don }
+    - match: { detail.tokenizer.tokens.3.token: t }
+    - match: { detail.tokenizer.tokens.4.token: jump }
+
+---
+"lowercase":
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer:
+              type: lowercase
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: brown }
+    - match: { detail.tokenizer.tokens.1.token: foxes }
+    - match: { detail.tokenizer.tokens.2.token: don }
+    - match: { detail.tokenizer.tokens.3.token: t }
+    - match: { detail.tokenizer.tokens.4.token: jump }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer: lowercase
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: lowercase }
+    - match: { detail.tokenizer.tokens.0.token: brown }
+    - match: { detail.tokenizer.tokens.1.token: foxes }
+    - match: { detail.tokenizer.tokens.2.token: don }
+    - match: { detail.tokenizer.tokens.3.token: t }
+    - match: { detail.tokenizer.tokens.4.token: jump }
+
+---
+"path_hierarchy":
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer:
+              type: path_hierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: a }
+    - match: { detail.tokenizer.tokens.1.token: a/b }
+    - match: { detail.tokenizer.tokens.2.token: a/b/c }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer:
+              type: PathHierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: a }
+    - match: { detail.tokenizer.tokens.1.token: a/b }
+    - match: { detail.tokenizer.tokens.2.token: a/b/c }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer: path_hierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: path_hierarchy }
+    - match: { detail.tokenizer.tokens.0.token: a }
+    - match: { detail.tokenizer.tokens.1.token: a/b }
+    - match: { detail.tokenizer.tokens.2.token: a/b/c }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer: PathHierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: PathHierarchy }
+    - match: { detail.tokenizer.tokens.0.token: a }
+    - match: { detail.tokenizer.tokens.1.token: a/b }
+    - match: { detail.tokenizer.tokens.2.token: a/b/c }
+
+---
+"pattern":
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace by default"
+            explain: true
+            tokenizer:
+              type: pattern
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: split }
+    - match: { detail.tokenizer.tokens.1.token: by }
+    - match: { detail.tokenizer.tokens.2.token: whitespace }
+    - match: { detail.tokenizer.tokens.3.token: by }
+    - match: { detail.tokenizer.tokens.4.token: default }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace by default"
+            explain: true
+            tokenizer: pattern
+    - length: { detail.tokenizer.tokens: 5 }
+    - match: { detail.tokenizer.name: pattern }
+    - match: { detail.tokenizer.tokens.0.token: split }
+    - match: { detail.tokenizer.tokens.1.token: by }
+    - match: { detail.tokenizer.tokens.2.token: whitespace }
+    - match: { detail.tokenizer.tokens.3.token: by }
+    - match: { detail.tokenizer.tokens.4.token: default }
+
+---
+"uax_url_email":
+    - do:
+        indices.analyze:
+          body:
+            text: "Email me at john.smith@global-international.com"
+            explain: true
+            tokenizer:
+              type: uax_url_email
+    - length: { detail.tokenizer.tokens: 4 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: Email }
+    - match: { detail.tokenizer.tokens.1.token: me }
+    - match: { detail.tokenizer.tokens.2.token: at }
+    - match: { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Email me at john.smith@global-international.com"
+            explain: true
+            tokenizer: uax_url_email
+    - length: { detail.tokenizer.tokens: 4 }
+    - match: { detail.tokenizer.name: uax_url_email }
+    - match: { detail.tokenizer.tokens.0.token: Email }
+    - match: { detail.tokenizer.tokens.1.token: me }
+    - match: { detail.tokenizer.tokens.2.token: at }
+    - match: { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
+
+---
+"whitespace":
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace"
+            explain: true
+            tokenizer:
+              type: whitespace
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: split }
+    - match: { detail.tokenizer.tokens.1.token: by }
+    - match: { detail.tokenizer.tokens.2.token: whitespace }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace"
+            explain: true
+            tokenizer: whitespace
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: whitespace }
+    - match: { detail.tokenizer.tokens.0.token: split }
+    - match: { detail.tokenizer.tokens.1.token: by }
+    - match: { detail.tokenizer.tokens.2.token: whitespace }