// lang-analyzer.asciidoc
  1. [[analysis-lang-analyzer]]
  2. === Language Analyzers
  3. A set of analyzers aimed at analyzing specific language text. The
  4. following types are supported:
  5. <<arabic-analyzer,`arabic`>>,
  6. <<armenian-analyzer,`armenian`>>,
  7. <<basque-analyzer,`basque`>>,
  8. <<brazilian-analyzer,`brazilian`>>,
  9. <<bulgarian-analyzer,`bulgarian`>>,
  10. <<catalan-analyzer,`catalan`>>,
  11. <<cjk-analyzer,`cjk`>>,
  12. <<czech-analyzer,`czech`>>,
  13. <<danish-analyzer,`danish`>>,
  14. <<dutch-analyzer,`dutch`>>,
  15. <<english-analyzer,`english`>>,
  16. <<finnish-analyzer,`finnish`>>,
  17. <<french-analyzer,`french`>>,
  18. <<galician-analyzer,`galician`>>,
  19. <<german-analyzer,`german`>>,
  20. <<greek-analyzer,`greek`>>,
  21. <<hindi-analyzer,`hindi`>>,
  22. <<hungarian-analyzer,`hungarian`>>,
  23. <<indonesian-analyzer,`indonesian`>>,
  24. <<irish-analyzer,`irish`>>,
  25. <<italian-analyzer,`italian`>>,
  26. <<latvian-analyzer,`latvian`>>,
  27. <<norwegian-analyzer,`norwegian`>>,
  28. <<persian-analyzer,`persian`>>,
  29. <<portuguese-analyzer,`portuguese`>>,
  30. <<romanian-analyzer,`romanian`>>,
  31. <<russian-analyzer,`russian`>>,
  32. <<sorani-analyzer,`sorani`>>,
  33. <<spanish-analyzer,`spanish`>>,
  34. <<swedish-analyzer,`swedish`>>,
  35. <<turkish-analyzer,`turkish`>>,
  36. <<thai-analyzer,`thai`>>.
  37. ==== Configuring language analyzers
  38. ===== Stopwords
  39. All analyzers support setting custom `stopwords` either internally in
  40. the config, or by using an external stopwords file by setting
  41. `stopwords_path`. Check <<analysis-stop-analyzer,Stop Analyzer>> for
  42. more details.
  43. ===== Excluding words from stemming
  44. The `stem_exclusion` parameter allows you to specify an array
  45. of lowercase words that should not be stemmed. Internally, this
  46. functionality is implemented by adding the
  47. <<analysis-keyword-marker-tokenfilter,`keyword_marker` token filter>>
  48. with the `keywords` set to the value of the `stem_exclusion` parameter.
  49. The following analyzers support setting custom `stem_exclusion` list:
  50. `arabic`, `armenian`, `basque`, `bulgarian`, `catalan`, `czech`,
  51. `dutch`, `english`, `finnish`, `french`, `galician`, `german`,
  52. `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`, `norwegian`,
  53. `portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`.
  54. ==== Reimplementing language analyzers
  55. The built-in language analyzers can be reimplemented as `custom` analyzers
  56. (as described below) in order to customize their behaviour.
  57. NOTE: If you do not intend to exclude words from being stemmed (the
  58. equivalent of the `stem_exclusion` parameter above), then you should remove
  59. the `keyword_marker` token filter from the custom analyzer configuration.
  60. [[arabic-analyzer]]
  61. ===== `arabic` analyzer
  62. The `arabic` analyzer could be reimplemented as a `custom` analyzer as follows:
  63. [source,js]
  64. ----------------------------------------------------
  65. {
  66. "settings": {
  67. "analysis": {
  68. "filter": {
  69. "arabic_stop": {
  70. "type": "stop",
  71. "stopwords": "_arabic_" <1>
  72. },
  73. "arabic_keywords": {
  74. "type": "keyword_marker",
  75. "keywords": [] <2>
  76. },
  77. "arabic_stemmer": {
  78. "type": "stemmer",
  79. "language": "arabic"
  80. }
  81. },
  82. "analyzer": {
  83. "arabic": {
  84. "tokenizer": "standard",
  85. "filter": [
  86. "lowercase",
  87. "arabic_stop",
  88. "arabic_normalization",
  89. "arabic_keywords",
  90. "arabic_stemmer"
  91. ]
  92. }
  93. }
  94. }
  95. }
  96. }
  97. ----------------------------------------------------
  98. <1> The default stopwords can be overridden with the `stopwords`
  99. or `stopwords_path` parameters.
  100. <2> This filter should be removed unless there are words which should
  101. be excluded from stemming.
  102. [[armenian-analyzer]]
  103. ===== `armenian` analyzer
  104. The `armenian` analyzer could be reimplemented as a `custom` analyzer as follows:
  105. [source,js]
  106. ----------------------------------------------------
  107. {
  108. "settings": {
  109. "analysis": {
  110. "filter": {
  111. "armenian_stop": {
  112. "type": "stop",
  113. "stopwords": "_armenian_" <1>
  114. },
  115. "armenian_keywords": {
  116. "type": "keyword_marker",
  117. "keywords": [] <2>
  118. },
  119. "armenian_stemmer": {
  120. "type": "stemmer",
  121. "language": "armenian"
  122. }
  123. },
  124. "analyzer": {
  125. "armenian": {
  126. "tokenizer": "standard",
  127. "filter": [
  128. "lowercase",
  129. "armenian_stop",
  130. "armenian_keywords",
  131. "armenian_stemmer"
  132. ]
  133. }
  134. }
  135. }
  136. }
  137. }
  138. ----------------------------------------------------
  139. <1> The default stopwords can be overridden with the `stopwords`
  140. or `stopwords_path` parameters.
  141. <2> This filter should be removed unless there are words which should
  142. be excluded from stemming.
  143. [[basque-analyzer]]
  144. ===== `basque` analyzer
  145. The `basque` analyzer could be reimplemented as a `custom` analyzer as follows:
  146. [source,js]
  147. ----------------------------------------------------
  148. {
  149. "settings": {
  150. "analysis": {
  151. "filter": {
  152. "basque_stop": {
  153. "type": "stop",
  154. "stopwords": "_basque_" <1>
  155. },
  156. "basque_keywords": {
  157. "type": "keyword_marker",
  158. "keywords": [] <2>
  159. },
  160. "basque_stemmer": {
  161. "type": "stemmer",
  162. "language": "basque"
  163. }
  164. },
  165. "analyzer": {
  166. "basque": {
  167. "tokenizer": "standard",
  168. "filter": [
  169. "lowercase",
  170. "basque_stop",
  171. "basque_keywords",
  172. "basque_stemmer"
  173. ]
  174. }
  175. }
  176. }
  177. }
  178. }
  179. ----------------------------------------------------
  180. <1> The default stopwords can be overridden with the `stopwords`
  181. or `stopwords_path` parameters.
  182. <2> This filter should be removed unless there are words which should
  183. be excluded from stemming.
  184. [[brazilian-analyzer]]
  185. ===== `brazilian` analyzer
  186. The `brazilian` analyzer could be reimplemented as a `custom` analyzer as follows:
  187. [source,js]
  188. ----------------------------------------------------
  189. {
  190. "settings": {
  191. "analysis": {
  192. "filter": {
  193. "brazilian_stop": {
  194. "type": "stop",
  195. "stopwords": "_brazilian_" <1>
  196. },
  197. "brazilian_keywords": {
  198. "type": "keyword_marker",
  199. "keywords": [] <2>
  200. },
  201. "brazilian_stemmer": {
  202. "type": "stemmer",
  203. "language": "brazilian"
  204. }
  205. },
  206. "analyzer": {
  207. "brazilian": {
  208. "tokenizer": "standard",
  209. "filter": [
  210. "lowercase",
  211. "brazilian_stop",
  212. "brazilian_keywords",
  213. "brazilian_stemmer"
  214. ]
  215. }
  216. }
  217. }
  218. }
  219. }
  220. ----------------------------------------------------
  221. <1> The default stopwords can be overridden with the `stopwords`
  222. or `stopwords_path` parameters.
  223. <2> This filter should be removed unless there are words which should
  224. be excluded from stemming.
  225. [[bulgarian-analyzer]]
  226. ===== `bulgarian` analyzer
  227. The `bulgarian` analyzer could be reimplemented as a `custom` analyzer as follows:
  228. [source,js]
  229. ----------------------------------------------------
  230. {
  231. "settings": {
  232. "analysis": {
  233. "filter": {
  234. "bulgarian_stop": {
  235. "type": "stop",
  236. "stopwords": "_bulgarian_" <1>
  237. },
  238. "bulgarian_keywords": {
  239. "type": "keyword_marker",
  240. "keywords": [] <2>
  241. },
  242. "bulgarian_stemmer": {
  243. "type": "stemmer",
  244. "language": "bulgarian"
  245. }
  246. },
  247. "analyzer": {
  248. "bulgarian": {
  249. "tokenizer": "standard",
  250. "filter": [
  251. "lowercase",
  252. "bulgarian_stop",
  253. "bulgarian_keywords",
  254. "bulgarian_stemmer"
  255. ]
  256. }
  257. }
  258. }
  259. }
  260. }
  261. ----------------------------------------------------
  262. <1> The default stopwords can be overridden with the `stopwords`
  263. or `stopwords_path` parameters.
  264. <2> This filter should be removed unless there are words which should
  265. be excluded from stemming.
  266. [[catalan-analyzer]]
  267. ===== `catalan` analyzer
  268. The `catalan` analyzer could be reimplemented as a `custom` analyzer as follows:
  269. [source,js]
  270. ----------------------------------------------------
  271. {
  272. "settings": {
  273. "analysis": {
  274. "filter": {
  275. "catalan_elision": {
  276. "type": "elision",
  277. "articles": [ "d", "l", "m", "n", "s", "t"]
  278. },
  279. "catalan_stop": {
  280. "type": "stop",
  281. "stopwords": "_catalan_" <1>
  282. },
  283. "catalan_keywords": {
  284. "type": "keyword_marker",
  285. "keywords": [] <2>
  286. },
  287. "catalan_stemmer": {
  288. "type": "stemmer",
  289. "language": "catalan"
  290. }
  291. },
  292. "analyzer": {
  293. "catalan": {
  294. "tokenizer": "standard",
  295. "filter": [
  296. "catalan_elision",
  297. "lowercase",
  298. "catalan_stop",
  299. "catalan_keywords",
  300. "catalan_stemmer"
  301. ]
  302. }
  303. }
  304. }
  305. }
  306. }
  307. ----------------------------------------------------
  308. <1> The default stopwords can be overridden with the `stopwords`
  309. or `stopwords_path` parameters.
  310. <2> This filter should be removed unless there are words which should
  311. be excluded from stemming.
  312. [[cjk-analyzer]]
  313. ===== `cjk` analyzer
  314. The `cjk` analyzer could be reimplemented as a `custom` analyzer as follows:
  315. [source,js]
  316. ----------------------------------------------------
  317. {
  318. "settings": {
  319. "analysis": {
  320. "filter": {
  321. "english_stop": {
  322. "type": "stop",
  323. "stopwords": "_english_" <1>
  324. }
  325. },
  326. "analyzer": {
  327. "cjk": {
  328. "tokenizer": "standard",
  329. "filter": [
  330. "cjk_width",
  331. "lowercase",
  332. "cjk_bigram",
  333. "english_stop"
  334. ]
  335. }
  336. }
  337. }
  338. }
  339. }
  340. ----------------------------------------------------
  341. <1> The default stopwords can be overridden with the `stopwords`
  342. or `stopwords_path` parameters.
  343. [[czech-analyzer]]
  344. ===== `czech` analyzer
  345. The `czech` analyzer could be reimplemented as a `custom` analyzer as follows:
  346. [source,js]
  347. ----------------------------------------------------
  348. {
  349. "settings": {
  350. "analysis": {
  351. "filter": {
  352. "czech_stop": {
  353. "type": "stop",
  354. "stopwords": "_czech_" <1>
  355. },
  356. "czech_keywords": {
  357. "type": "keyword_marker",
  358. "keywords": [] <2>
  359. },
  360. "czech_stemmer": {
  361. "type": "stemmer",
  362. "language": "czech"
  363. }
  364. },
  365. "analyzer": {
  366. "czech": {
  367. "tokenizer": "standard",
  368. "filter": [
  369. "lowercase",
  370. "czech_stop",
  371. "czech_keywords",
  372. "czech_stemmer"
  373. ]
  374. }
  375. }
  376. }
  377. }
  378. }
  379. ----------------------------------------------------
  380. <1> The default stopwords can be overridden with the `stopwords`
  381. or `stopwords_path` parameters.
  382. <2> This filter should be removed unless there are words which should
  383. be excluded from stemming.
  384. [[danish-analyzer]]
  385. ===== `danish` analyzer
  386. The `danish` analyzer could be reimplemented as a `custom` analyzer as follows:
  387. [source,js]
  388. ----------------------------------------------------
  389. {
  390. "settings": {
  391. "analysis": {
  392. "filter": {
  393. "danish_stop": {
  394. "type": "stop",
  395. "stopwords": "_danish_" <1>
  396. },
  397. "danish_keywords": {
  398. "type": "keyword_marker",
  399. "keywords": [] <2>
  400. },
  401. "danish_stemmer": {
  402. "type": "stemmer",
  403. "language": "danish"
  404. }
  405. },
  406. "analyzer": {
  407. "danish": {
  408. "tokenizer": "standard",
  409. "filter": [
  410. "lowercase",
  411. "danish_stop",
  412. "danish_keywords",
  413. "danish_stemmer"
  414. ]
  415. }
  416. }
  417. }
  418. }
  419. }
  420. ----------------------------------------------------
  421. <1> The default stopwords can be overridden with the `stopwords`
  422. or `stopwords_path` parameters.
  423. <2> This filter should be removed unless there are words which should
  424. be excluded from stemming.
  425. [[dutch-analyzer]]
  426. ===== `dutch` analyzer
  427. The `dutch` analyzer could be reimplemented as a `custom` analyzer as follows:
  428. [source,js]
  429. ----------------------------------------------------
  430. {
  431. "settings": {
  432. "analysis": {
  433. "filter": {
  434. "dutch_stop": {
  435. "type": "stop",
  436. "stopwords": "_dutch_" <1>
  437. },
  438. "dutch_keywords": {
  439. "type": "keyword_marker",
  440. "keywords": [] <2>
  441. },
  442. "dutch_stemmer": {
  443. "type": "stemmer",
  444. "language": "dutch"
  445. },
  446. "dutch_override": {
  447. "type": "stemmer_override",
  448. "rules": [
  449. "fiets=>fiets",
  450. "bromfiets=>bromfiets",
  451. "ei=>eier",
  452. "kind=>kinder"
  453. ]
  454. }
  455. },
  456. "analyzer": {
  457. "dutch": {
  458. "tokenizer": "standard",
  459. "filter": [
  460. "lowercase",
  461. "dutch_stop",
  462. "dutch_keywords",
  463. "dutch_override",
  464. "dutch_stemmer"
  465. ]
  466. }
  467. }
  468. }
  469. }
  470. }
  471. ----------------------------------------------------
  472. <1> The default stopwords can be overridden with the `stopwords`
  473. or `stopwords_path` parameters.
  474. <2> This filter should be removed unless there are words which should
  475. be excluded from stemming.
  476. [[english-analyzer]]
  477. ===== `english` analyzer
  478. The `english` analyzer could be reimplemented as a `custom` analyzer as follows:
  479. [source,js]
  480. ----------------------------------------------------
  481. {
  482. "settings": {
  483. "analysis": {
  484. "filter": {
  485. "english_stop": {
  486. "type": "stop",
  487. "stopwords": "_english_" <1>
  488. },
  489. "english_keywords": {
  490. "type": "keyword_marker",
  491. "keywords": [] <2>
  492. },
  493. "english_stemmer": {
  494. "type": "stemmer",
  495. "language": "english"
  496. },
  497. "english_possessive_stemmer": {
  498. "type": "stemmer",
  499. "language": "possessive_english"
  500. }
  501. },
  502. "analyzer": {
  503. "english": {
  504. "tokenizer": "standard",
  505. "filter": [
  506. "english_possessive_stemmer",
  507. "lowercase",
  508. "english_stop",
  509. "english_keywords",
  510. "english_stemmer"
  511. ]
  512. }
  513. }
  514. }
  515. }
  516. }
  517. ----------------------------------------------------
  518. <1> The default stopwords can be overridden with the `stopwords`
  519. or `stopwords_path` parameters.
  520. <2> This filter should be removed unless there are words which should
  521. be excluded from stemming.
  522. [[finnish-analyzer]]
  523. ===== `finnish` analyzer
  524. The `finnish` analyzer could be reimplemented as a `custom` analyzer as follows:
  525. [source,js]
  526. ----------------------------------------------------
  527. {
  528. "settings": {
  529. "analysis": {
  530. "filter": {
  531. "finnish_stop": {
  532. "type": "stop",
  533. "stopwords": "_finnish_" <1>
  534. },
  535. "finnish_keywords": {
  536. "type": "keyword_marker",
  537. "keywords": [] <2>
  538. },
  539. "finnish_stemmer": {
  540. "type": "stemmer",
  541. "language": "finnish"
  542. }
  543. },
  544. "analyzer": {
  545. "finnish": {
  546. "tokenizer": "standard",
  547. "filter": [
  548. "lowercase",
  549. "finnish_stop",
  550. "finnish_keywords",
  551. "finnish_stemmer"
  552. ]
  553. }
  554. }
  555. }
  556. }
  557. }
  558. ----------------------------------------------------
  559. <1> The default stopwords can be overridden with the `stopwords`
  560. or `stopwords_path` parameters.
  561. <2> This filter should be removed unless there are words which should
  562. be excluded from stemming.
  563. [[french-analyzer]]
  564. ===== `french` analyzer
  565. The `french` analyzer could be reimplemented as a `custom` analyzer as follows:
  566. [source,js]
  567. ----------------------------------------------------
  568. {
  569. "settings": {
  570. "analysis": {
  571. "filter": {
  572. "french_elision": {
  573. "type": "elision",
  574. "articles": [ "l", "m", "t", "qu", "n", "s",
  575. "j", "d", "c", "jusqu", "quoiqu",
  576. "lorsqu", "puisqu"
  577. ]
  578. },
  579. "french_stop": {
  580. "type": "stop",
  581. "stopwords": "_french_" <1>
  582. },
  583. "french_keywords": {
  584. "type": "keyword_marker",
  585. "keywords": [] <2>
  586. },
  587. "french_stemmer": {
  588. "type": "stemmer",
  589. "language": "light_french"
  590. }
  591. },
  592. "analyzer": {
  593. "french": {
  594. "tokenizer": "standard",
  595. "filter": [
  596. "french_elision",
  597. "lowercase",
  598. "french_stop",
  599. "french_keywords",
  600. "french_stemmer"
  601. ]
  602. }
  603. }
  604. }
  605. }
  606. }
  607. ----------------------------------------------------
  608. <1> The default stopwords can be overridden with the `stopwords`
  609. or `stopwords_path` parameters.
  610. <2> This filter should be removed unless there are words which should
  611. be excluded from stemming.
  612. [[galician-analyzer]]
  613. ===== `galician` analyzer
  614. The `galician` analyzer could be reimplemented as a `custom` analyzer as follows:
  615. [source,js]
  616. ----------------------------------------------------
  617. {
  618. "settings": {
  619. "analysis": {
  620. "filter": {
  621. "galician_stop": {
  622. "type": "stop",
  623. "stopwords": "_galician_" <1>
  624. },
  625. "galician_keywords": {
  626. "type": "keyword_marker",
  627. "keywords": [] <2>
  628. },
  629. "galician_stemmer": {
  630. "type": "stemmer",
  631. "language": "galician"
  632. }
  633. },
  634. "analyzer": {
  635. "galician": {
  636. "tokenizer": "standard",
  637. "filter": [
  638. "lowercase",
  639. "galician_stop",
  640. "galician_keywords",
  641. "galician_stemmer"
  642. ]
  643. }
  644. }
  645. }
  646. }
  647. }
  648. ----------------------------------------------------
  649. <1> The default stopwords can be overridden with the `stopwords`
  650. or `stopwords_path` parameters.
  651. <2> This filter should be removed unless there are words which should
  652. be excluded from stemming.
  653. [[german-analyzer]]
  654. ===== `german` analyzer
  655. The `german` analyzer could be reimplemented as a `custom` analyzer as follows:
  656. [source,js]
  657. ----------------------------------------------------
  658. {
  659. "settings": {
  660. "analysis": {
  661. "filter": {
  662. "german_stop": {
  663. "type": "stop",
  664. "stopwords": "_german_" <1>
  665. },
  666. "german_keywords": {
  667. "type": "keyword_marker",
  668. "keywords": [] <2>
  669. },
  670. "german_stemmer": {
  671. "type": "stemmer",
  672. "language": "light_german"
  673. }
  674. },
  675. "analyzer": {
  676. "german": {
  677. "tokenizer": "standard",
  678. "filter": [
  679. "lowercase",
  680. "german_stop",
  681. "german_keywords",
  682. "german_normalization",
  683. "german_stemmer"
  684. ]
  685. }
  686. }
  687. }
  688. }
  689. }
  690. ----------------------------------------------------
  691. <1> The default stopwords can be overridden with the `stopwords`
  692. or `stopwords_path` parameters.
  693. <2> This filter should be removed unless there are words which should
  694. be excluded from stemming.
  695. [[greek-analyzer]]
  696. ===== `greek` analyzer
  697. The `greek` analyzer could be reimplemented as a `custom` analyzer as follows:
  698. [source,js]
  699. ----------------------------------------------------
  700. {
  701. "settings": {
  702. "analysis": {
  703. "filter": {
  704. "greek_stop": {
  705. "type": "stop",
  706. "stopwords": "_greek_" <1>
  707. },
  708. "greek_lowercase": {
  709. "type": "lowercase",
  710. "language": "greek"
  711. },
  712. "greek_keywords": {
  713. "type": "keyword_marker",
  714. "keywords": [] <2>
  715. },
  716. "greek_stemmer": {
  717. "type": "stemmer",
  718. "language": "greek"
  719. }
  720. },
  721. "analyzer": {
  722. "greek": {
  723. "tokenizer": "standard",
  724. "filter": [
  725. "greek_lowercase",
  726. "greek_stop",
  727. "greek_keywords",
  728. "greek_stemmer"
  729. ]
  730. }
  731. }
  732. }
  733. }
  734. }
  735. ----------------------------------------------------
  736. <1> The default stopwords can be overridden with the `stopwords`
  737. or `stopwords_path` parameters.
  738. <2> This filter should be removed unless there are words which should
  739. be excluded from stemming.
  740. [[hindi-analyzer]]
  741. ===== `hindi` analyzer
  742. The `hindi` analyzer could be reimplemented as a `custom` analyzer as follows:
  743. [source,js]
  744. ----------------------------------------------------
  745. {
  746. "settings": {
  747. "analysis": {
  748. "filter": {
  749. "hindi_stop": {
  750. "type": "stop",
  751. "stopwords": "_hindi_" <1>
  752. },
  753. "hindi_keywords": {
  754. "type": "keyword_marker",
  755. "keywords": [] <2>
  756. },
  757. "hindi_stemmer": {
  758. "type": "stemmer",
  759. "language": "hindi"
  760. }
  761. },
  762. "analyzer": {
  763. "hindi": {
  764. "tokenizer": "standard",
  765. "filter": [
  766. "lowercase",
  767. "indic_normalization",
  768. "hindi_normalization",
  769. "hindi_stop",
  770. "hindi_keywords",
  771. "hindi_stemmer"
  772. ]
  773. }
  774. }
  775. }
  776. }
  777. }
  778. ----------------------------------------------------
  779. <1> The default stopwords can be overridden with the `stopwords`
  780. or `stopwords_path` parameters.
  781. <2> This filter should be removed unless there are words which should
  782. be excluded from stemming.
  783. [[hungarian-analyzer]]
  784. ===== `hungarian` analyzer
  785. The `hungarian` analyzer could be reimplemented as a `custom` analyzer as follows:
  786. [source,js]
  787. ----------------------------------------------------
  788. {
  789. "settings": {
  790. "analysis": {
  791. "filter": {
  792. "hungarian_stop": {
  793. "type": "stop",
  794. "stopwords": "_hungarian_" <1>
  795. },
  796. "hungarian_keywords": {
  797. "type": "keyword_marker",
  798. "keywords": [] <2>
  799. },
  800. "hungarian_stemmer": {
  801. "type": "stemmer",
  802. "language": "hungarian"
  803. }
  804. },
  805. "analyzer": {
  806. "hungarian": {
  807. "tokenizer": "standard",
  808. "filter": [
  809. "lowercase",
  810. "hungarian_stop",
  811. "hungarian_keywords",
  812. "hungarian_stemmer"
  813. ]
  814. }
  815. }
  816. }
  817. }
  818. }
  819. ----------------------------------------------------
  820. <1> The default stopwords can be overridden with the `stopwords`
  821. or `stopwords_path` parameters.
  822. <2> This filter should be removed unless there are words which should
  823. be excluded from stemming.
  824. [[indonesian-analyzer]]
  825. ===== `indonesian` analyzer
  826. The `indonesian` analyzer could be reimplemented as a `custom` analyzer as follows:
  827. [source,js]
  828. ----------------------------------------------------
  829. {
  830. "settings": {
  831. "analysis": {
  832. "filter": {
  833. "indonesian_stop": {
  834. "type": "stop",
  835. "stopwords": "_indonesian_" <1>
  836. },
  837. "indonesian_keywords": {
  838. "type": "keyword_marker",
  839. "keywords": [] <2>
  840. },
  841. "indonesian_stemmer": {
  842. "type": "stemmer",
  843. "language": "indonesian"
  844. }
  845. },
  846. "analyzer": {
  847. "indonesian": {
  848. "tokenizer": "standard",
  849. "filter": [
  850. "lowercase",
  851. "indonesian_stop",
  852. "indonesian_keywords",
  853. "indonesian_stemmer"
  854. ]
  855. }
  856. }
  857. }
  858. }
  859. }
  860. ----------------------------------------------------
  861. <1> The default stopwords can be overridden with the `stopwords`
  862. or `stopwords_path` parameters.
  863. <2> This filter should be removed unless there are words which should
  864. be excluded from stemming.
  865. [[irish-analyzer]]
  866. ===== `irish` analyzer
  867. The `irish` analyzer could be reimplemented as a `custom` analyzer as follows:
  868. [source,js]
  869. ----------------------------------------------------
  870. {
  871. "settings": {
  872. "analysis": {
  873. "filter": {
  874. "irish_elision": {
  875. "type": "elision",
  876. "articles": [ "h", "n", "t" ]
  877. },
  878. "irish_stop": {
  879. "type": "stop",
  880. "stopwords": "_irish_" <1>
  881. },
  882. "irish_lowercase": {
  883. "type": "lowercase",
  884. "language": "irish"
  885. },
  886. "irish_keywords": {
  887. "type": "keyword_marker",
  888. "keywords": [] <2>
  889. },
  890. "irish_stemmer": {
  891. "type": "stemmer",
  892. "language": "irish"
  893. }
  894. },
  895. "analyzer": {
  896. "irish": {
  897. "tokenizer": "standard",
  898. "filter": [
  899. "irish_stop",
  900. "irish_elision",
  901. "irish_lowercase",
  902. "irish_keywords",
  903. "irish_stemmer"
  904. ]
  905. }
  906. }
  907. }
  908. }
  909. }
  910. ----------------------------------------------------
  911. <1> The default stopwords can be overridden with the `stopwords`
  912. or `stopwords_path` parameters.
  913. <2> This filter should be removed unless there are words which should
  914. be excluded from stemming.
  915. [[italian-analyzer]]
  916. ===== `italian` analyzer
  917. The `italian` analyzer could be reimplemented as a `custom` analyzer as follows:
  918. [source,js]
  919. ----------------------------------------------------
  920. {
  921. "settings": {
  922. "analysis": {
  923. "filter": {
  924. "italian_elision": {
  925. "type": "elision",
  926. "articles": [
  927. "c", "l", "all", "dall", "dell",
  928. "nell", "sull", "coll", "pell",
  929. "gl", "agl", "dagl", "degl", "negl",
  930. "sugl", "un", "m", "t", "s", "v", "d"
  931. ]
  932. },
  933. "italian_stop": {
  934. "type": "stop",
  935. "stopwords": "_italian_" <1>
  936. },
  937. "italian_keywords": {
  938. "type": "keyword_marker",
  939. "keywords": [] <2>
  940. },
  941. "italian_stemmer": {
  942. "type": "stemmer",
  943. "language": "light_italian"
  944. }
  945. },
  946. "analyzer": {
  947. "italian": {
  948. "tokenizer": "standard",
  949. "filter": [
  950. "italian_elision",
  951. "lowercase",
  952. "italian_stop",
  953. "italian_keywords",
  954. "italian_stemmer"
  955. ]
  956. }
  957. }
  958. }
  959. }
  960. }
  961. ----------------------------------------------------
  962. <1> The default stopwords can be overridden with the `stopwords`
  963. or `stopwords_path` parameters.
  964. <2> This filter should be removed unless there are words which should
  965. be excluded from stemming.
  966. [[latvian-analyzer]]
  967. ===== `latvian` analyzer
  968. The `latvian` analyzer could be reimplemented as a `custom` analyzer as follows:
  969. [source,js]
  970. ----------------------------------------------------
  971. {
  972. "settings": {
  973. "analysis": {
  974. "filter": {
  975. "latvian_stop": {
  976. "type": "stop",
  977. "stopwords": "_latvian_" <1>
  978. },
  979. "latvian_keywords": {
  980. "type": "keyword_marker",
  981. "keywords": [] <2>
  982. },
  983. "latvian_stemmer": {
  984. "type": "stemmer",
  985. "language": "latvian"
  986. }
  987. },
  988. "analyzer": {
  989. "latvian": {
  990. "tokenizer": "standard",
  991. "filter": [
  992. "lowercase",
  993. "latvian_stop",
  994. "latvian_keywords",
  995. "latvian_stemmer"
  996. ]
  997. }
  998. }
  999. }
  1000. }
  1001. }
  1002. ----------------------------------------------------
  1003. <1> The default stopwords can be overridden with the `stopwords`
  1004. or `stopwords_path` parameters.
  1005. <2> This filter should be removed unless there are words which should
  1006. be excluded from stemming.
  1007. [[norwegian-analyzer]]
  1008. ===== `norwegian` analyzer
  1009. The `norwegian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1010. [source,js]
  1011. ----------------------------------------------------
  1012. {
  1013. "settings": {
  1014. "analysis": {
  1015. "filter": {
  1016. "norwegian_stop": {
  1017. "type": "stop",
  1018. "stopwords": "_norwegian_" <1>
  1019. },
  1020. "norwegian_keywords": {
  1021. "type": "keyword_marker",
  1022. "keywords": [] <2>
  1023. },
  1024. "norwegian_stemmer": {
  1025. "type": "stemmer",
  1026. "language": "norwegian"
  1027. }
  1028. },
  1029. "analyzer": {
  1030. "norwegian": {
  1031. "tokenizer": "standard",
  1032. "filter": [
  1033. "lowercase",
  1034. "norwegian_stop",
  1035. "norwegian_keywords",
  1036. "norwegian_stemmer"
  1037. ]
  1038. }
  1039. }
  1040. }
  1041. }
  1042. }
  1043. ----------------------------------------------------
  1044. <1> The default stopwords can be overridden with the `stopwords`
  1045. or `stopwords_path` parameters.
  1046. <2> This filter should be removed unless there are words which should
  1047. be excluded from stemming.
  1048. [[persian-analyzer]]
  1049. ===== `persian` analyzer
  1050. The `persian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1051. [source,js]
  1052. ----------------------------------------------------
  1053. {
  1054. "settings": {
  1055. "analysis": {
  1056. "char_filter": {
  1057. "zero_width_spaces": {
  1058. "type": "mapping",
  1059. "mappings": [ "\\u200C=> "] <1>
  1060. }
  1061. },
  1062. "filter": {
  1063. "persian_stop": {
  1064. "type": "stop",
  1065. "stopwords": "_persian_" <2>
  1066. }
  1067. },
  1068. "analyzer": {
  1069. "persian": {
  1070. "tokenizer": "standard",
  1071. "char_filter": [ "zero_width_spaces" ],
  1072. "filter": [
  1073. "lowercase",
  1074. "arabic_normalization",
  1075. "persian_normalization",
  1076. "persian_stop"
  1077. ]
  1078. }
  1079. }
  1080. }
  1081. }
  1082. }
  1083. ----------------------------------------------------
  1084. <1> Replaces zero-width non-joiners (Unicode `U+200C`) with an ASCII space.
  1085. <2> The default stopwords can be overridden with the `stopwords`
  1086. or `stopwords_path` parameters.
  1087. [[portuguese-analyzer]]
  1088. ===== `portuguese` analyzer
  1089. The `portuguese` analyzer could be reimplemented as a `custom` analyzer as follows:
  1090. [source,js]
  1091. ----------------------------------------------------
  1092. {
  1093. "settings": {
  1094. "analysis": {
  1095. "filter": {
  1096. "portuguese_stop": {
  1097. "type": "stop",
  1098. "stopwords": "_portuguese_" <1>
  1099. },
  1100. "portuguese_keywords": {
  1101. "type": "keyword_marker",
  1102. "keywords": [] <2>
  1103. },
  1104. "portuguese_stemmer": {
  1105. "type": "stemmer",
  1106. "language": "light_portuguese"
  1107. }
  1108. },
  1109. "analyzer": {
  1110. "portuguese": {
  1111. "tokenizer": "standard",
  1112. "filter": [
  1113. "lowercase",
  1114. "portuguese_stop",
  1115. "portuguese_keywords",
  1116. "portuguese_stemmer"
  1117. ]
  1118. }
  1119. }
  1120. }
  1121. }
  1122. }
  1123. ----------------------------------------------------
  1124. <1> The default stopwords can be overridden with the `stopwords`
  1125. or `stopwords_path` parameters.
  1126. <2> This filter should be removed unless there are words which should
  1127. be excluded from stemming.
  1128. [[romanian-analyzer]]
  1129. ===== `romanian` analyzer
  1130. The `romanian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1131. [source,js]
  1132. ----------------------------------------------------
  1133. {
  1134. "settings": {
  1135. "analysis": {
  1136. "filter": {
  1137. "romanian_stop": {
  1138. "type": "stop",
  1139. "stopwords": "_romanian_" <1>
  1140. },
  1141. "romanian_keywords": {
  1142. "type": "keyword_marker",
  1143. "keywords": [] <2>
  1144. },
  1145. "romanian_stemmer": {
  1146. "type": "stemmer",
  1147. "language": "romanian"
  1148. }
  1149. },
  1150. "analyzer": {
  1151. "romanian": {
  1152. "tokenizer": "standard",
  1153. "filter": [
  1154. "lowercase",
  1155. "romanian_stop",
  1156. "romanian_keywords",
  1157. "romanian_stemmer"
  1158. ]
  1159. }
  1160. }
  1161. }
  1162. }
  1163. }
  1164. ----------------------------------------------------
  1165. <1> The default stopwords can be overridden with the `stopwords`
  1166. or `stopwords_path` parameters.
  1167. <2> This filter should be removed unless there are words which should
  1168. be excluded from stemming.
  1169. [[russian-analyzer]]
  1170. ===== `russian` analyzer
  1171. The `russian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1172. [source,js]
  1173. ----------------------------------------------------
  1174. {
  1175. "settings": {
  1176. "analysis": {
  1177. "filter": {
  1178. "russian_stop": {
  1179. "type": "stop",
  1180. "stopwords": "_russian_" <1>
  1181. },
  1182. "russian_keywords": {
  1183. "type": "keyword_marker",
  1184. "keywords": [] <2>
  1185. },
  1186. "russian_stemmer": {
  1187. "type": "stemmer",
  1188. "language": "russian"
  1189. }
  1190. },
  1191. "analyzer": {
  1192. "russian": {
  1193. "tokenizer": "standard",
  1194. "filter": [
  1195. "lowercase",
  1196. "russian_stop",
  1197. "russian_keywords",
  1198. "russian_stemmer"
  1199. ]
  1200. }
  1201. }
  1202. }
  1203. }
  1204. }
  1205. ----------------------------------------------------
  1206. <1> The default stopwords can be overridden with the `stopwords`
  1207. or `stopwords_path` parameters.
  1208. <2> This filter should be removed unless there are words which should
  1209. be excluded from stemming.
  1210. [[sorani-analyzer]]
  1211. ===== `sorani` analyzer
  1212. The `sorani` analyzer could be reimplemented as a `custom` analyzer as follows:
  1213. [source,js]
  1214. ----------------------------------------------------
  1215. {
  1216. "settings": {
  1217. "analysis": {
  1218. "filter": {
  1219. "sorani_stop": {
  1220. "type": "stop",
  1221. "stopwords": "_sorani_" <1>
  1222. },
  1223. "sorani_keywords": {
  1224. "type": "keyword_marker",
  1225. "keywords": [] <2>
  1226. },
  1227. "sorani_stemmer": {
  1228. "type": "stemmer",
  1229. "language": "sorani"
  1230. }
  1231. },
  1232. "analyzer": {
  1233. "sorani": {
  1234. "tokenizer": "standard",
  1235. "filter": [
  1236. "sorani_normalization",
  1237. "lowercase",
  1238. "sorani_stop",
  1239. "sorani_keywords",
  1240. "sorani_stemmer"
  1241. ]
  1242. }
  1243. }
  1244. }
  1245. }
  1246. }
  1247. ----------------------------------------------------
  1248. <1> The default stopwords can be overridden with the `stopwords`
  1249. or `stopwords_path` parameters.
  1250. <2> This filter should be removed unless there are words which should
  1251. be excluded from stemming.
  1252. [[spanish-analyzer]]
  1253. ===== `spanish` analyzer
  1254. The `spanish` analyzer could be reimplemented as a `custom` analyzer as follows:
  1255. [source,js]
  1256. ----------------------------------------------------
  1257. {
  1258. "settings": {
  1259. "analysis": {
  1260. "filter": {
  1261. "spanish_stop": {
  1262. "type": "stop",
  1263. "stopwords": "_spanish_" <1>
  1264. },
  1265. "spanish_keywords": {
  1266. "type": "keyword_marker",
  1267. "keywords": [] <2>
  1268. },
  1269. "spanish_stemmer": {
  1270. "type": "stemmer",
  1271. "language": "light_spanish"
  1272. }
  1273. },
  1274. "analyzer": {
  1275. "spanish": {
  1276. "tokenizer": "standard",
  1277. "filter": [
  1278. "lowercase",
  1279. "spanish_stop",
  1280. "spanish_keywords",
  1281. "spanish_stemmer"
  1282. ]
  1283. }
  1284. }
  1285. }
  1286. }
  1287. }
  1288. ----------------------------------------------------
  1289. <1> The default stopwords can be overridden with the `stopwords`
  1290. or `stopwords_path` parameters.
  1291. <2> This filter should be removed unless there are words which should
  1292. be excluded from stemming.
  1293. [[swedish-analyzer]]
  1294. ===== `swedish` analyzer
  1295. The `swedish` analyzer could be reimplemented as a `custom` analyzer as follows:
  1296. [source,js]
  1297. ----------------------------------------------------
  1298. {
  1299. "settings": {
  1300. "analysis": {
  1301. "filter": {
  1302. "swedish_stop": {
  1303. "type": "stop",
  1304. "stopwords": "_swedish_" <1>
  1305. },
  1306. "swedish_keywords": {
  1307. "type": "keyword_marker",
  1308. "keywords": [] <2>
  1309. },
  1310. "swedish_stemmer": {
  1311. "type": "stemmer",
  1312. "language": "swedish"
  1313. }
  1314. },
  1315. "analyzer": {
  1316. "swedish": {
  1317. "tokenizer": "standard",
  1318. "filter": [
  1319. "lowercase",
  1320. "swedish_stop",
  1321. "swedish_keywords",
  1322. "swedish_stemmer"
  1323. ]
  1324. }
  1325. }
  1326. }
  1327. }
  1328. }
  1329. ----------------------------------------------------
  1330. <1> The default stopwords can be overridden with the `stopwords`
  1331. or `stopwords_path` parameters.
  1332. <2> This filter should be removed unless there are words which should
  1333. be excluded from stemming.
  1334. [[turkish-analyzer]]
  1335. ===== `turkish` analyzer
  1336. The `turkish` analyzer could be reimplemented as a `custom` analyzer as follows:
  1337. [source,js]
  1338. ----------------------------------------------------
  1339. {
  1340. "settings": {
  1341. "analysis": {
  1342. "filter": {
  1343. "turkish_stop": {
  1344. "type": "stop",
  1345. "stopwords": "_turkish_" <1>
  1346. },
  1347. "turkish_lowercase": {
  1348. "type": "lowercase",
  1349. "language": "turkish"
  1350. },
  1351. "turkish_keywords": {
  1352. "type": "keyword_marker",
  1353. "keywords": [] <2>
  1354. },
  1355. "turkish_stemmer": {
  1356. "type": "stemmer",
  1357. "language": "turkish"
  1358. }
  1359. },
  1360. "analyzer": {
  1361. "turkish": {
  1362. "tokenizer": "standard",
  1363. "filter": [
  1364. "apostrophe",
  1365. "turkish_lowercase",
  1366. "turkish_stop",
  1367. "turkish_keywords",
  1368. "turkish_stemmer"
  1369. ]
  1370. }
  1371. }
  1372. }
  1373. }
  1374. }
  1375. ----------------------------------------------------
  1376. <1> The default stopwords can be overridden with the `stopwords`
  1377. or `stopwords_path` parameters.
  1378. <2> This filter should be removed unless there are words which should
  1379. be excluded from stemming.
  1380. [[thai-analyzer]]
  1381. ===== `thai` analyzer
  1382. The `thai` analyzer could be reimplemented as a `custom` analyzer as follows:
  1383. [source,js]
  1384. ----------------------------------------------------
  1385. {
  1386. "settings": {
  1387. "analysis": {
  1388. "filter": {
  1389. "thai_stop": {
  1390. "type": "stop",
  1391. "stopwords": "_thai_" <1>
  1392. }
  1393. },
  1394. "analyzer": {
  1395. "thai": {
  1396. "tokenizer": "thai",
  1397. "filter": [
  1398. "lowercase",
  1399. "thai_stop"
  1400. ]
  1401. }
  1402. }
  1403. }
  1404. }
  1405. }
  1406. ----------------------------------------------------
  1407. <1> The default stopwords can be overridden with the `stopwords`
  1408. or `stopwords_path` parameters.