// lang-analyzer.asciidoc
  1. [[analysis-lang-analyzer]]
  2. === Language Analyzers
  3. A set of analyzers aimed at analyzing specific language text. The
  4. following types are supported:
  5. <<arabic-analyzer,`arabic`>>,
  6. <<armenian-analyzer,`armenian`>>,
  7. <<basque-analyzer,`basque`>>,
  8. <<brazilian-analyzer,`brazilian`>>,
  9. <<bulgarian-analyzer,`bulgarian`>>,
  10. <<catalan-analyzer,`catalan`>>,
  11. <<cjk-analyzer,`cjk`>>,
  12. <<czech-analyzer,`czech`>>,
  13. <<danish-analyzer,`danish`>>,
  14. <<dutch-analyzer,`dutch`>>,
  15. <<english-analyzer,`english`>>,
  16. <<finnish-analyzer,`finnish`>>,
  17. <<french-analyzer,`french`>>,
  18. <<galician-analyzer,`galician`>>,
  19. <<german-analyzer,`german`>>,
  20. <<greek-analyzer,`greek`>>,
  21. <<hindi-analyzer,`hindi`>>,
  22. <<hungarian-analyzer,`hungarian`>>,
  23. <<indonesian-analyzer,`indonesian`>>,
  24. <<irish-analyzer,`irish`>>,
  25. <<italian-analyzer,`italian`>>,
  26. <<latvian-analyzer,`latvian`>>,
  27. <<lithuanian-analyzer,`lithuanian`>>,
  28. <<norwegian-analyzer,`norwegian`>>,
  29. <<persian-analyzer,`persian`>>,
  30. <<portuguese-analyzer,`portuguese`>>,
  31. <<romanian-analyzer,`romanian`>>,
  32. <<russian-analyzer,`russian`>>,
  33. <<sorani-analyzer,`sorani`>>,
  34. <<spanish-analyzer,`spanish`>>,
  35. <<swedish-analyzer,`swedish`>>,
  36. <<thai-analyzer,`thai`>>,
  37. <<turkish-analyzer,`turkish`>>.
  38. ==== Configuring language analyzers
  39. ===== Stopwords
  40. All analyzers support setting custom `stopwords` either internally in
  41. the config, or by using an external stopwords file by setting
  42. `stopwords_path`. Check <<analysis-stop-analyzer,Stop Analyzer>> for
  43. more details.
  44. ===== Excluding words from stemming
  45. The `stem_exclusion` parameter allows you to specify an array
  46. of lowercase words that should not be stemmed. Internally, this
  47. functionality is implemented by adding the
  48. <<analysis-keyword-marker-tokenfilter,`keyword_marker` token filter>>
  49. with the `keywords` set to the value of the `stem_exclusion` parameter.
  50. The following analyzers support setting a custom `stem_exclusion` list:
  51. `arabic`, `armenian`, `basque`, `bulgarian`, `catalan`,
  52. `czech`, `dutch`, `english`, `finnish`, `french`, `galician`,
  53. `german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
  54. `lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
  55. `spanish`, `swedish`, `turkish`.
  56. ==== Reimplementing language analyzers
  57. The built-in language analyzers can be reimplemented as `custom` analyzers
  58. (as described below) in order to customize their behaviour.
  59. NOTE: If you do not intend to exclude words from being stemmed (the
  60. equivalent of the `stem_exclusion` parameter above), then you should remove
  61. the `keyword_marker` token filter from the custom analyzer configuration.
  62. [[arabic-analyzer]]
  63. ===== `arabic` analyzer
  64. The `arabic` analyzer could be reimplemented as a `custom` analyzer as follows:
  65. [source,js]
  66. ----------------------------------------------------
  67. {
  68. "settings": {
  69. "analysis": {
  70. "filter": {
  71. "arabic_stop": {
  72. "type": "stop",
  73. "stopwords": "_arabic_" <1>
  74. },
  75. "arabic_keywords": {
  76. "type": "keyword_marker",
  77. "keywords": [] <2>
  78. },
  79. "arabic_stemmer": {
  80. "type": "stemmer",
  81. "language": "arabic"
  82. }
  83. },
  84. "analyzer": {
  85. "arabic": {
  86. "tokenizer": "standard",
  87. "filter": [
  88. "lowercase",
  89. "arabic_stop",
  90. "arabic_normalization",
  91. "arabic_keywords",
  92. "arabic_stemmer"
  93. ]
  94. }
  95. }
  96. }
  97. }
  98. }
  99. ----------------------------------------------------
  100. <1> The default stopwords can be overridden with the `stopwords`
  101. or `stopwords_path` parameters.
  102. <2> This filter should be removed unless there are words which should
  103. be excluded from stemming.
  104. [[armenian-analyzer]]
  105. ===== `armenian` analyzer
  106. The `armenian` analyzer could be reimplemented as a `custom` analyzer as follows:
  107. [source,js]
  108. ----------------------------------------------------
  109. {
  110. "settings": {
  111. "analysis": {
  112. "filter": {
  113. "armenian_stop": {
  114. "type": "stop",
  115. "stopwords": "_armenian_" <1>
  116. },
  117. "armenian_keywords": {
  118. "type": "keyword_marker",
  119. "keywords": [] <2>
  120. },
  121. "armenian_stemmer": {
  122. "type": "stemmer",
  123. "language": "armenian"
  124. }
  125. },
  126. "analyzer": {
  127. "armenian": {
  128. "tokenizer": "standard",
  129. "filter": [
  130. "lowercase",
  131. "armenian_stop",
  132. "armenian_keywords",
  133. "armenian_stemmer"
  134. ]
  135. }
  136. }
  137. }
  138. }
  139. }
  140. ----------------------------------------------------
  141. <1> The default stopwords can be overridden with the `stopwords`
  142. or `stopwords_path` parameters.
  143. <2> This filter should be removed unless there are words which should
  144. be excluded from stemming.
  145. [[basque-analyzer]]
  146. ===== `basque` analyzer
  147. The `basque` analyzer could be reimplemented as a `custom` analyzer as follows:
  148. [source,js]
  149. ----------------------------------------------------
  150. {
  151. "settings": {
  152. "analysis": {
  153. "filter": {
  154. "basque_stop": {
  155. "type": "stop",
  156. "stopwords": "_basque_" <1>
  157. },
  158. "basque_keywords": {
  159. "type": "keyword_marker",
  160. "keywords": [] <2>
  161. },
  162. "basque_stemmer": {
  163. "type": "stemmer",
  164. "language": "basque"
  165. }
  166. },
  167. "analyzer": {
  168. "basque": {
  169. "tokenizer": "standard",
  170. "filter": [
  171. "lowercase",
  172. "basque_stop",
  173. "basque_keywords",
  174. "basque_stemmer"
  175. ]
  176. }
  177. }
  178. }
  179. }
  180. }
  181. ----------------------------------------------------
  182. <1> The default stopwords can be overridden with the `stopwords`
  183. or `stopwords_path` parameters.
  184. <2> This filter should be removed unless there are words which should
  185. be excluded from stemming.
  186. [[brazilian-analyzer]]
  187. ===== `brazilian` analyzer
  188. The `brazilian` analyzer could be reimplemented as a `custom` analyzer as follows:
  189. [source,js]
  190. ----------------------------------------------------
  191. {
  192. "settings": {
  193. "analysis": {
  194. "filter": {
  195. "brazilian_stop": {
  196. "type": "stop",
  197. "stopwords": "_brazilian_" <1>
  198. },
  199. "brazilian_keywords": {
  200. "type": "keyword_marker",
  201. "keywords": [] <2>
  202. },
  203. "brazilian_stemmer": {
  204. "type": "stemmer",
  205. "language": "brazilian"
  206. }
  207. },
  208. "analyzer": {
  209. "brazilian": {
  210. "tokenizer": "standard",
  211. "filter": [
  212. "lowercase",
  213. "brazilian_stop",
  214. "brazilian_keywords",
  215. "brazilian_stemmer"
  216. ]
  217. }
  218. }
  219. }
  220. }
  221. }
  222. ----------------------------------------------------
  223. <1> The default stopwords can be overridden with the `stopwords`
  224. or `stopwords_path` parameters.
  225. <2> This filter should be removed unless there are words which should
  226. be excluded from stemming.
  227. [[bulgarian-analyzer]]
  228. ===== `bulgarian` analyzer
  229. The `bulgarian` analyzer could be reimplemented as a `custom` analyzer as follows:
  230. [source,js]
  231. ----------------------------------------------------
  232. {
  233. "settings": {
  234. "analysis": {
  235. "filter": {
  236. "bulgarian_stop": {
  237. "type": "stop",
  238. "stopwords": "_bulgarian_" <1>
  239. },
  240. "bulgarian_keywords": {
  241. "type": "keyword_marker",
  242. "keywords": [] <2>
  243. },
  244. "bulgarian_stemmer": {
  245. "type": "stemmer",
  246. "language": "bulgarian"
  247. }
  248. },
  249. "analyzer": {
  250. "bulgarian": {
  251. "tokenizer": "standard",
  252. "filter": [
  253. "lowercase",
  254. "bulgarian_stop",
  255. "bulgarian_keywords",
  256. "bulgarian_stemmer"
  257. ]
  258. }
  259. }
  260. }
  261. }
  262. }
  263. ----------------------------------------------------
  264. <1> The default stopwords can be overridden with the `stopwords`
  265. or `stopwords_path` parameters.
  266. <2> This filter should be removed unless there are words which should
  267. be excluded from stemming.
  268. [[catalan-analyzer]]
  269. ===== `catalan` analyzer
  270. The `catalan` analyzer could be reimplemented as a `custom` analyzer as follows:
  271. [source,js]
  272. ----------------------------------------------------
  273. {
  274. "settings": {
  275. "analysis": {
  276. "filter": {
  277. "catalan_elision": {
  278. "type": "elision",
  279. "articles": [ "d", "l", "m", "n", "s", "t"]
  280. },
  281. "catalan_stop": {
  282. "type": "stop",
  283. "stopwords": "_catalan_" <1>
  284. },
  285. "catalan_keywords": {
  286. "type": "keyword_marker",
  287. "keywords": [] <2>
  288. },
  289. "catalan_stemmer": {
  290. "type": "stemmer",
  291. "language": "catalan"
  292. }
  293. },
  294. "analyzer": {
  295. "catalan": {
  296. "tokenizer": "standard",
  297. "filter": [
  298. "catalan_elision",
  299. "lowercase",
  300. "catalan_stop",
  301. "catalan_keywords",
  302. "catalan_stemmer"
  303. ]
  304. }
  305. }
  306. }
  307. }
  308. }
  309. ----------------------------------------------------
  310. <1> The default stopwords can be overridden with the `stopwords`
  311. or `stopwords_path` parameters.
  312. <2> This filter should be removed unless there are words which should
  313. be excluded from stemming.
  314. [[cjk-analyzer]]
  315. ===== `cjk` analyzer
  316. The `cjk` analyzer could be reimplemented as a `custom` analyzer as follows:
  317. [source,js]
  318. ----------------------------------------------------
  319. {
  320. "settings": {
  321. "analysis": {
  322. "filter": {
  323. "english_stop": {
  324. "type": "stop",
  325. "stopwords": "_english_" <1>
  326. }
  327. },
  328. "analyzer": {
  329. "cjk": {
  330. "tokenizer": "standard",
  331. "filter": [
  332. "cjk_width",
  333. "lowercase",
  334. "cjk_bigram",
  335. "english_stop"
  336. ]
  337. }
  338. }
  339. }
  340. }
  341. }
  342. ----------------------------------------------------
  343. <1> The default stopwords can be overridden with the `stopwords`
  344. or `stopwords_path` parameters.
  345. [[czech-analyzer]]
  346. ===== `czech` analyzer
  347. The `czech` analyzer could be reimplemented as a `custom` analyzer as follows:
  348. [source,js]
  349. ----------------------------------------------------
  350. {
  351. "settings": {
  352. "analysis": {
  353. "filter": {
  354. "czech_stop": {
  355. "type": "stop",
  356. "stopwords": "_czech_" <1>
  357. },
  358. "czech_keywords": {
  359. "type": "keyword_marker",
  360. "keywords": [] <2>
  361. },
  362. "czech_stemmer": {
  363. "type": "stemmer",
  364. "language": "czech"
  365. }
  366. },
  367. "analyzer": {
  368. "czech": {
  369. "tokenizer": "standard",
  370. "filter": [
  371. "lowercase",
  372. "czech_stop",
  373. "czech_keywords",
  374. "czech_stemmer"
  375. ]
  376. }
  377. }
  378. }
  379. }
  380. }
  381. ----------------------------------------------------
  382. <1> The default stopwords can be overridden with the `stopwords`
  383. or `stopwords_path` parameters.
  384. <2> This filter should be removed unless there are words which should
  385. be excluded from stemming.
  386. [[danish-analyzer]]
  387. ===== `danish` analyzer
  388. The `danish` analyzer could be reimplemented as a `custom` analyzer as follows:
  389. [source,js]
  390. ----------------------------------------------------
  391. {
  392. "settings": {
  393. "analysis": {
  394. "filter": {
  395. "danish_stop": {
  396. "type": "stop",
  397. "stopwords": "_danish_" <1>
  398. },
  399. "danish_keywords": {
  400. "type": "keyword_marker",
  401. "keywords": [] <2>
  402. },
  403. "danish_stemmer": {
  404. "type": "stemmer",
  405. "language": "danish"
  406. }
  407. },
  408. "analyzer": {
  409. "danish": {
  410. "tokenizer": "standard",
  411. "filter": [
  412. "lowercase",
  413. "danish_stop",
  414. "danish_keywords",
  415. "danish_stemmer"
  416. ]
  417. }
  418. }
  419. }
  420. }
  421. }
  422. ----------------------------------------------------
  423. <1> The default stopwords can be overridden with the `stopwords`
  424. or `stopwords_path` parameters.
  425. <2> This filter should be removed unless there are words which should
  426. be excluded from stemming.
  427. [[dutch-analyzer]]
  428. ===== `dutch` analyzer
  429. The `dutch` analyzer could be reimplemented as a `custom` analyzer as follows:
  430. [source,js]
  431. ----------------------------------------------------
  432. {
  433. "settings": {
  434. "analysis": {
  435. "filter": {
  436. "dutch_stop": {
  437. "type": "stop",
  438. "stopwords": "_dutch_" <1>
  439. },
  440. "dutch_keywords": {
  441. "type": "keyword_marker",
  442. "keywords": [] <2>
  443. },
  444. "dutch_stemmer": {
  445. "type": "stemmer",
  446. "language": "dutch"
  447. },
  448. "dutch_override": {
  449. "type": "stemmer_override",
  450. "rules": [
  451. "fiets=>fiets",
  452. "bromfiets=>bromfiets",
  453. "ei=>eier",
  454. "kind=>kinder"
  455. ]
  456. }
  457. },
  458. "analyzer": {
  459. "dutch": {
  460. "tokenizer": "standard",
  461. "filter": [
  462. "lowercase",
  463. "dutch_stop",
  464. "dutch_keywords",
  465. "dutch_override",
  466. "dutch_stemmer"
  467. ]
  468. }
  469. }
  470. }
  471. }
  472. }
  473. ----------------------------------------------------
  474. <1> The default stopwords can be overridden with the `stopwords`
  475. or `stopwords_path` parameters.
  476. <2> This filter should be removed unless there are words which should
  477. be excluded from stemming.
  478. [[english-analyzer]]
  479. ===== `english` analyzer
  480. The `english` analyzer could be reimplemented as a `custom` analyzer as follows:
  481. [source,js]
  482. ----------------------------------------------------
  483. {
  484. "settings": {
  485. "analysis": {
  486. "filter": {
  487. "english_stop": {
  488. "type": "stop",
  489. "stopwords": "_english_" <1>
  490. },
  491. "english_keywords": {
  492. "type": "keyword_marker",
  493. "keywords": [] <2>
  494. },
  495. "english_stemmer": {
  496. "type": "stemmer",
  497. "language": "english"
  498. },
  499. "english_possessive_stemmer": {
  500. "type": "stemmer",
  501. "language": "possessive_english"
  502. }
  503. },
  504. "analyzer": {
  505. "english": {
  506. "tokenizer": "standard",
  507. "filter": [
  508. "english_possessive_stemmer",
  509. "lowercase",
  510. "english_stop",
  511. "english_keywords",
  512. "english_stemmer"
  513. ]
  514. }
  515. }
  516. }
  517. }
  518. }
  519. ----------------------------------------------------
  520. <1> The default stopwords can be overridden with the `stopwords`
  521. or `stopwords_path` parameters.
  522. <2> This filter should be removed unless there are words which should
  523. be excluded from stemming.
  524. [[finnish-analyzer]]
  525. ===== `finnish` analyzer
  526. The `finnish` analyzer could be reimplemented as a `custom` analyzer as follows:
  527. [source,js]
  528. ----------------------------------------------------
  529. {
  530. "settings": {
  531. "analysis": {
  532. "filter": {
  533. "finnish_stop": {
  534. "type": "stop",
  535. "stopwords": "_finnish_" <1>
  536. },
  537. "finnish_keywords": {
  538. "type": "keyword_marker",
  539. "keywords": [] <2>
  540. },
  541. "finnish_stemmer": {
  542. "type": "stemmer",
  543. "language": "finnish"
  544. }
  545. },
  546. "analyzer": {
  547. "finnish": {
  548. "tokenizer": "standard",
  549. "filter": [
  550. "lowercase",
  551. "finnish_stop",
  552. "finnish_keywords",
  553. "finnish_stemmer"
  554. ]
  555. }
  556. }
  557. }
  558. }
  559. }
  560. ----------------------------------------------------
  561. <1> The default stopwords can be overridden with the `stopwords`
  562. or `stopwords_path` parameters.
  563. <2> This filter should be removed unless there are words which should
  564. be excluded from stemming.
  565. [[french-analyzer]]
  566. ===== `french` analyzer
  567. The `french` analyzer could be reimplemented as a `custom` analyzer as follows:
  568. [source,js]
  569. ----------------------------------------------------
  570. {
  571. "settings": {
  572. "analysis": {
  573. "filter": {
  574. "french_elision": {
  575. "type": "elision",
  576. "articles_case": true,
  577. "articles": [
  578. "l", "m", "t", "qu", "n", "s",
  579. "j", "d", "c", "jusqu", "quoiqu",
  580. "lorsqu", "puisqu"
  581. ]
  582. },
  583. "french_stop": {
  584. "type": "stop",
  585. "stopwords": "_french_" <1>
  586. },
  587. "french_keywords": {
  588. "type": "keyword_marker",
  589. "keywords": [] <2>
  590. },
  591. "french_stemmer": {
  592. "type": "stemmer",
  593. "language": "light_french"
  594. }
  595. },
  596. "analyzer": {
  597. "french": {
  598. "tokenizer": "standard",
  599. "filter": [
  600. "french_elision",
  601. "lowercase",
  602. "french_stop",
  603. "french_keywords",
  604. "french_stemmer"
  605. ]
  606. }
  607. }
  608. }
  609. }
  610. }
  611. ----------------------------------------------------
  612. <1> The default stopwords can be overridden with the `stopwords`
  613. or `stopwords_path` parameters.
  614. <2> This filter should be removed unless there are words which should
  615. be excluded from stemming.
  616. [[galician-analyzer]]
  617. ===== `galician` analyzer
  618. The `galician` analyzer could be reimplemented as a `custom` analyzer as follows:
  619. [source,js]
  620. ----------------------------------------------------
  621. {
  622. "settings": {
  623. "analysis": {
  624. "filter": {
  625. "galician_stop": {
  626. "type": "stop",
  627. "stopwords": "_galician_" <1>
  628. },
  629. "galician_keywords": {
  630. "type": "keyword_marker",
  631. "keywords": [] <2>
  632. },
  633. "galician_stemmer": {
  634. "type": "stemmer",
  635. "language": "galician"
  636. }
  637. },
  638. "analyzer": {
  639. "galician": {
  640. "tokenizer": "standard",
  641. "filter": [
  642. "lowercase",
  643. "galician_stop",
  644. "galician_keywords",
  645. "galician_stemmer"
  646. ]
  647. }
  648. }
  649. }
  650. }
  651. }
  652. ----------------------------------------------------
  653. <1> The default stopwords can be overridden with the `stopwords`
  654. or `stopwords_path` parameters.
  655. <2> This filter should be removed unless there are words which should
  656. be excluded from stemming.
  657. [[german-analyzer]]
  658. ===== `german` analyzer
  659. The `german` analyzer could be reimplemented as a `custom` analyzer as follows:
  660. [source,js]
  661. ----------------------------------------------------
  662. {
  663. "settings": {
  664. "analysis": {
  665. "filter": {
  666. "german_stop": {
  667. "type": "stop",
  668. "stopwords": "_german_" <1>
  669. },
  670. "german_keywords": {
  671. "type": "keyword_marker",
  672. "keywords": [] <2>
  673. },
  674. "german_stemmer": {
  675. "type": "stemmer",
  676. "language": "light_german"
  677. }
  678. },
  679. "analyzer": {
  680. "german": {
  681. "tokenizer": "standard",
  682. "filter": [
  683. "lowercase",
  684. "german_stop",
  685. "german_keywords",
  686. "german_normalization",
  687. "german_stemmer"
  688. ]
  689. }
  690. }
  691. }
  692. }
  693. }
  694. ----------------------------------------------------
  695. <1> The default stopwords can be overridden with the `stopwords`
  696. or `stopwords_path` parameters.
  697. <2> This filter should be removed unless there are words which should
  698. be excluded from stemming.
  699. [[greek-analyzer]]
  700. ===== `greek` analyzer
  701. The `greek` analyzer could be reimplemented as a `custom` analyzer as follows:
  702. [source,js]
  703. ----------------------------------------------------
  704. {
  705. "settings": {
  706. "analysis": {
  707. "filter": {
  708. "greek_stop": {
  709. "type": "stop",
  710. "stopwords": "_greek_" <1>
  711. },
  712. "greek_lowercase": {
  713. "type": "lowercase",
  714. "language": "greek"
  715. },
  716. "greek_keywords": {
  717. "type": "keyword_marker",
  718. "keywords": [] <2>
  719. },
  720. "greek_stemmer": {
  721. "type": "stemmer",
  722. "language": "greek"
  723. }
  724. },
  725. "analyzer": {
  726. "greek": {
  727. "tokenizer": "standard",
  728. "filter": [
  729. "greek_lowercase",
  730. "greek_stop",
  731. "greek_keywords",
  732. "greek_stemmer"
  733. ]
  734. }
  735. }
  736. }
  737. }
  738. }
  739. ----------------------------------------------------
  740. <1> The default stopwords can be overridden with the `stopwords`
  741. or `stopwords_path` parameters.
  742. <2> This filter should be removed unless there are words which should
  743. be excluded from stemming.
  744. [[hindi-analyzer]]
  745. ===== `hindi` analyzer
  746. The `hindi` analyzer could be reimplemented as a `custom` analyzer as follows:
  747. [source,js]
  748. ----------------------------------------------------
  749. {
  750. "settings": {
  751. "analysis": {
  752. "filter": {
  753. "hindi_stop": {
  754. "type": "stop",
  755. "stopwords": "_hindi_" <1>
  756. },
  757. "hindi_keywords": {
  758. "type": "keyword_marker",
  759. "keywords": [] <2>
  760. },
  761. "hindi_stemmer": {
  762. "type": "stemmer",
  763. "language": "hindi"
  764. }
  765. },
  766. "analyzer": {
  767. "hindi": {
  768. "tokenizer": "standard",
  769. "filter": [
  770. "lowercase",
  771. "indic_normalization",
  772. "hindi_normalization",
  773. "hindi_stop",
  774. "hindi_keywords",
  775. "hindi_stemmer"
  776. ]
  777. }
  778. }
  779. }
  780. }
  781. }
  782. ----------------------------------------------------
  783. <1> The default stopwords can be overridden with the `stopwords`
  784. or `stopwords_path` parameters.
  785. <2> This filter should be removed unless there are words which should
  786. be excluded from stemming.
  787. [[hungarian-analyzer]]
  788. ===== `hungarian` analyzer
  789. The `hungarian` analyzer could be reimplemented as a `custom` analyzer as follows:
  790. [source,js]
  791. ----------------------------------------------------
  792. {
  793. "settings": {
  794. "analysis": {
  795. "filter": {
  796. "hungarian_stop": {
  797. "type": "stop",
  798. "stopwords": "_hungarian_" <1>
  799. },
  800. "hungarian_keywords": {
  801. "type": "keyword_marker",
  802. "keywords": [] <2>
  803. },
  804. "hungarian_stemmer": {
  805. "type": "stemmer",
  806. "language": "hungarian"
  807. }
  808. },
  809. "analyzer": {
  810. "hungarian": {
  811. "tokenizer": "standard",
  812. "filter": [
  813. "lowercase",
  814. "hungarian_stop",
  815. "hungarian_keywords",
  816. "hungarian_stemmer"
  817. ]
  818. }
  819. }
  820. }
  821. }
  822. }
  823. ----------------------------------------------------
  824. <1> The default stopwords can be overridden with the `stopwords`
  825. or `stopwords_path` parameters.
  826. <2> This filter should be removed unless there are words which should
  827. be excluded from stemming.
  828. [[indonesian-analyzer]]
  829. ===== `indonesian` analyzer
  830. The `indonesian` analyzer could be reimplemented as a `custom` analyzer as follows:
  831. [source,js]
  832. ----------------------------------------------------
  833. {
  834. "settings": {
  835. "analysis": {
  836. "filter": {
  837. "indonesian_stop": {
  838. "type": "stop",
  839. "stopwords": "_indonesian_" <1>
  840. },
  841. "indonesian_keywords": {
  842. "type": "keyword_marker",
  843. "keywords": [] <2>
  844. },
  845. "indonesian_stemmer": {
  846. "type": "stemmer",
  847. "language": "indonesian"
  848. }
  849. },
  850. "analyzer": {
  851. "indonesian": {
  852. "tokenizer": "standard",
  853. "filter": [
  854. "lowercase",
  855. "indonesian_stop",
  856. "indonesian_keywords",
  857. "indonesian_stemmer"
  858. ]
  859. }
  860. }
  861. }
  862. }
  863. }
  864. ----------------------------------------------------
  865. <1> The default stopwords can be overridden with the `stopwords`
  866. or `stopwords_path` parameters.
  867. <2> This filter should be removed unless there are words which should
  868. be excluded from stemming.
  869. [[irish-analyzer]]
  870. ===== `irish` analyzer
  871. The `irish` analyzer could be reimplemented as a `custom` analyzer as follows:
  872. [source,js]
  873. ----------------------------------------------------
  874. {
  875. "settings": {
  876. "analysis": {
  877. "filter": {
  878. "irish_elision": {
  879. "type": "elision",
  880. "articles": [ "h", "n", "t" ]
  881. },
  882. "irish_stop": {
  883. "type": "stop",
  884. "stopwords": "_irish_" <1>
  885. },
  886. "irish_lowercase": {
  887. "type": "lowercase",
  888. "language": "irish"
  889. },
  890. "irish_keywords": {
  891. "type": "keyword_marker",
  892. "keywords": [] <2>
  893. },
  894. "irish_stemmer": {
  895. "type": "stemmer",
  896. "language": "irish"
  897. }
  898. },
  899. "analyzer": {
  900. "irish": {
  901. "tokenizer": "standard",
  902. "filter": [
  903. "irish_stop",
  904. "irish_elision",
  905. "irish_lowercase",
  906. "irish_keywords",
  907. "irish_stemmer"
  908. ]
  909. }
  910. }
  911. }
  912. }
  913. }
  914. ----------------------------------------------------
  915. <1> The default stopwords can be overridden with the `stopwords`
  916. or `stopwords_path` parameters.
  917. <2> This filter should be removed unless there are words which should
  918. be excluded from stemming.
  919. [[italian-analyzer]]
  920. ===== `italian` analyzer
  921. The `italian` analyzer could be reimplemented as a `custom` analyzer as follows:
  922. [source,js]
  923. ----------------------------------------------------
  924. {
  925. "settings": {
  926. "analysis": {
  927. "filter": {
  928. "italian_elision": {
  929. "type": "elision",
  930. "articles": [
  931. "c", "l", "all", "dall", "dell",
  932. "nell", "sull", "coll", "pell",
  933. "gl", "agl", "dagl", "degl", "negl",
  934. "sugl", "un", "m", "t", "s", "v", "d"
  935. ]
  936. },
  937. "italian_stop": {
  938. "type": "stop",
  939. "stopwords": "_italian_" <1>
  940. },
  941. "italian_keywords": {
  942. "type": "keyword_marker",
  943. "keywords": [] <2>
  944. },
  945. "italian_stemmer": {
  946. "type": "stemmer",
  947. "language": "light_italian"
  948. }
  949. },
  950. "analyzer": {
  951. "italian": {
  952. "tokenizer": "standard",
  953. "filter": [
  954. "italian_elision",
  955. "lowercase",
  956. "italian_stop",
  957. "italian_keywords",
  958. "italian_stemmer"
  959. ]
  960. }
  961. }
  962. }
  963. }
  964. }
  965. ----------------------------------------------------
  966. <1> The default stopwords can be overridden with the `stopwords`
  967. or `stopwords_path` parameters.
  968. <2> This filter should be removed unless there are words which should
  969. be excluded from stemming.
  970. [[latvian-analyzer]]
  971. ===== `latvian` analyzer
  972. The `latvian` analyzer could be reimplemented as a `custom` analyzer as follows:
  973. [source,js]
  974. ----------------------------------------------------
  975. {
  976. "settings": {
  977. "analysis": {
  978. "filter": {
  979. "latvian_stop": {
  980. "type": "stop",
  981. "stopwords": "_latvian_" <1>
  982. },
  983. "latvian_keywords": {
  984. "type": "keyword_marker",
  985. "keywords": [] <2>
  986. },
  987. "latvian_stemmer": {
  988. "type": "stemmer",
  989. "language": "latvian"
  990. }
  991. },
  992. "analyzer": {
  993. "latvian": {
  994. "tokenizer": "standard",
  995. "filter": [
  996. "lowercase",
  997. "latvian_stop",
  998. "latvian_keywords",
  999. "latvian_stemmer"
  1000. ]
  1001. }
  1002. }
  1003. }
  1004. }
  1005. }
  1006. ----------------------------------------------------
  1007. <1> The default stopwords can be overridden with the `stopwords`
  1008. or `stopwords_path` parameters.
  1009. <2> This filter should be removed unless there are words which should
  1010. be excluded from stemming.
  1011. [[lithuanian-analyzer]]
  1012. ===== `lithuanian` analyzer
  1013. The `lithuanian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1014. [source,js]
  1015. ----------------------------------------------------
  1016. {
  1017. "settings": {
  1018. "analysis": {
  1019. "filter": {
  1020. "lithuanian_stop": {
  1021. "type": "stop",
  1022. "stopwords": "_lithuanian_" <1>
  1023. },
  1024. "lithuanian_keywords": {
  1025. "type": "keyword_marker",
  1026. "keywords": [] <2>
  1027. },
  1028. "lithuanian_stemmer": {
  1029. "type": "stemmer",
  1030. "language": "lithuanian"
  1031. }
  1032. },
  1033. "analyzer": {
  1034. "lithuanian": {
  1035. "tokenizer": "standard",
  1036. "filter": [
  1037. "lowercase",
  1038. "lithuanian_stop",
  1039. "lithuanian_keywords",
  1040. "lithuanian_stemmer"
  1041. ]
  1042. }
  1043. }
  1044. }
  1045. }
  1046. }
  1047. ----------------------------------------------------
  1048. <1> The default stopwords can be overridden with the `stopwords`
  1049. or `stopwords_path` parameters.
  1050. <2> This filter should be removed unless there are words which should
  1051. be excluded from stemming.
  1052. [[norwegian-analyzer]]
  1053. ===== `norwegian` analyzer
  1054. The `norwegian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1055. [source,js]
  1056. ----------------------------------------------------
  1057. {
  1058. "settings": {
  1059. "analysis": {
  1060. "filter": {
  1061. "norwegian_stop": {
  1062. "type": "stop",
  1063. "stopwords": "_norwegian_" <1>
  1064. },
  1065. "norwegian_keywords": {
  1066. "type": "keyword_marker",
  1067. "keywords": [] <2>
  1068. },
  1069. "norwegian_stemmer": {
  1070. "type": "stemmer",
  1071. "language": "norwegian"
  1072. }
  1073. },
  1074. "analyzer": {
  1075. "norwegian": {
  1076. "tokenizer": "standard",
  1077. "filter": [
  1078. "lowercase",
  1079. "norwegian_stop",
  1080. "norwegian_keywords",
  1081. "norwegian_stemmer"
  1082. ]
  1083. }
  1084. }
  1085. }
  1086. }
  1087. }
  1088. ----------------------------------------------------
  1089. <1> The default stopwords can be overridden with the `stopwords`
  1090. or `stopwords_path` parameters.
  1091. <2> This filter should be removed unless there are words which should
  1092. be excluded from stemming.
  1093. [[persian-analyzer]]
  1094. ===== `persian` analyzer
  1095. The `persian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1096. [source,js]
  1097. ----------------------------------------------------
  1098. {
  1099. "settings": {
  1100. "analysis": {
  1101. "char_filter": {
  1102. "zero_width_spaces": {
  1103. "type": "mapping",
1104. "mappings": [ "\\u200C=>\\u0020" ] <1>
  1105. }
  1106. },
  1107. "filter": {
  1108. "persian_stop": {
  1109. "type": "stop",
  1110. "stopwords": "_persian_" <2>
  1111. }
  1112. },
  1113. "analyzer": {
  1114. "persian": {
  1115. "tokenizer": "standard",
  1116. "char_filter": [ "zero_width_spaces" ],
  1117. "filter": [
  1118. "lowercase",
  1119. "arabic_normalization",
  1120. "persian_normalization",
  1121. "persian_stop"
  1122. ]
  1123. }
  1124. }
  1125. }
  1126. }
  1127. }
  1128. ----------------------------------------------------
  1129. <1> Replaces zero-width non-joiners with an ASCII space.
  1130. <2> The default stopwords can be overridden with the `stopwords`
  1131. or `stopwords_path` parameters.
  1132. [[portuguese-analyzer]]
  1133. ===== `portuguese` analyzer
  1134. The `portuguese` analyzer could be reimplemented as a `custom` analyzer as follows:
  1135. [source,js]
  1136. ----------------------------------------------------
  1137. {
  1138. "settings": {
  1139. "analysis": {
  1140. "filter": {
  1141. "portuguese_stop": {
  1142. "type": "stop",
  1143. "stopwords": "_portuguese_" <1>
  1144. },
  1145. "portuguese_keywords": {
  1146. "type": "keyword_marker",
  1147. "keywords": [] <2>
  1148. },
  1149. "portuguese_stemmer": {
  1150. "type": "stemmer",
  1151. "language": "light_portuguese"
  1152. }
  1153. },
  1154. "analyzer": {
  1155. "portuguese": {
  1156. "tokenizer": "standard",
  1157. "filter": [
  1158. "lowercase",
  1159. "portuguese_stop",
  1160. "portuguese_keywords",
  1161. "portuguese_stemmer"
  1162. ]
  1163. }
  1164. }
  1165. }
  1166. }
  1167. }
  1168. ----------------------------------------------------
  1169. <1> The default stopwords can be overridden with the `stopwords`
  1170. or `stopwords_path` parameters.
  1171. <2> This filter should be removed unless there are words which should
  1172. be excluded from stemming.
  1173. [[romanian-analyzer]]
  1174. ===== `romanian` analyzer
  1175. The `romanian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1176. [source,js]
  1177. ----------------------------------------------------
  1178. {
  1179. "settings": {
  1180. "analysis": {
  1181. "filter": {
  1182. "romanian_stop": {
  1183. "type": "stop",
  1184. "stopwords": "_romanian_" <1>
  1185. },
  1186. "romanian_keywords": {
  1187. "type": "keyword_marker",
  1188. "keywords": [] <2>
  1189. },
  1190. "romanian_stemmer": {
  1191. "type": "stemmer",
  1192. "language": "romanian"
  1193. }
  1194. },
  1195. "analyzer": {
  1196. "romanian": {
  1197. "tokenizer": "standard",
  1198. "filter": [
  1199. "lowercase",
  1200. "romanian_stop",
  1201. "romanian_keywords",
  1202. "romanian_stemmer"
  1203. ]
  1204. }
  1205. }
  1206. }
  1207. }
  1208. }
  1209. ----------------------------------------------------
  1210. <1> The default stopwords can be overridden with the `stopwords`
  1211. or `stopwords_path` parameters.
  1212. <2> This filter should be removed unless there are words which should
  1213. be excluded from stemming.
  1214. [[russian-analyzer]]
  1215. ===== `russian` analyzer
  1216. The `russian` analyzer could be reimplemented as a `custom` analyzer as follows:
  1217. [source,js]
  1218. ----------------------------------------------------
  1219. {
  1220. "settings": {
  1221. "analysis": {
  1222. "filter": {
  1223. "russian_stop": {
  1224. "type": "stop",
  1225. "stopwords": "_russian_" <1>
  1226. },
  1227. "russian_keywords": {
  1228. "type": "keyword_marker",
  1229. "keywords": [] <2>
  1230. },
  1231. "russian_stemmer": {
  1232. "type": "stemmer",
  1233. "language": "russian"
  1234. }
  1235. },
  1236. "analyzer": {
  1237. "russian": {
  1238. "tokenizer": "standard",
  1239. "filter": [
  1240. "lowercase",
  1241. "russian_stop",
  1242. "russian_keywords",
  1243. "russian_stemmer"
  1244. ]
  1245. }
  1246. }
  1247. }
  1248. }
  1249. }
  1250. ----------------------------------------------------
  1251. <1> The default stopwords can be overridden with the `stopwords`
  1252. or `stopwords_path` parameters.
  1253. <2> This filter should be removed unless there are words which should
  1254. be excluded from stemming.
  1255. [[sorani-analyzer]]
  1256. ===== `sorani` analyzer
  1257. The `sorani` analyzer could be reimplemented as a `custom` analyzer as follows:
  1258. [source,js]
  1259. ----------------------------------------------------
  1260. {
  1261. "settings": {
  1262. "analysis": {
  1263. "filter": {
  1264. "sorani_stop": {
  1265. "type": "stop",
  1266. "stopwords": "_sorani_" <1>
  1267. },
  1268. "sorani_keywords": {
  1269. "type": "keyword_marker",
  1270. "keywords": [] <2>
  1271. },
  1272. "sorani_stemmer": {
  1273. "type": "stemmer",
  1274. "language": "sorani"
  1275. }
  1276. },
  1277. "analyzer": {
  1278. "sorani": {
  1279. "tokenizer": "standard",
  1280. "filter": [
  1281. "sorani_normalization",
  1282. "lowercase",
  1283. "sorani_stop",
  1284. "sorani_keywords",
  1285. "sorani_stemmer"
  1286. ]
  1287. }
  1288. }
  1289. }
  1290. }
  1291. }
  1292. ----------------------------------------------------
  1293. <1> The default stopwords can be overridden with the `stopwords`
  1294. or `stopwords_path` parameters.
  1295. <2> This filter should be removed unless there are words which should
  1296. be excluded from stemming.
  1297. [[spanish-analyzer]]
  1298. ===== `spanish` analyzer
  1299. The `spanish` analyzer could be reimplemented as a `custom` analyzer as follows:
  1300. [source,js]
  1301. ----------------------------------------------------
  1302. {
  1303. "settings": {
  1304. "analysis": {
  1305. "filter": {
  1306. "spanish_stop": {
  1307. "type": "stop",
  1308. "stopwords": "_spanish_" <1>
  1309. },
  1310. "spanish_keywords": {
  1311. "type": "keyword_marker",
  1312. "keywords": [] <2>
  1313. },
  1314. "spanish_stemmer": {
  1315. "type": "stemmer",
  1316. "language": "light_spanish"
  1317. }
  1318. },
  1319. "analyzer": {
  1320. "spanish": {
  1321. "tokenizer": "standard",
  1322. "filter": [
  1323. "lowercase",
  1324. "spanish_stop",
  1325. "spanish_keywords",
  1326. "spanish_stemmer"
  1327. ]
  1328. }
  1329. }
  1330. }
  1331. }
  1332. }
  1333. ----------------------------------------------------
  1334. <1> The default stopwords can be overridden with the `stopwords`
  1335. or `stopwords_path` parameters.
  1336. <2> This filter should be removed unless there are words which should
  1337. be excluded from stemming.
  1338. [[swedish-analyzer]]
  1339. ===== `swedish` analyzer
  1340. The `swedish` analyzer could be reimplemented as a `custom` analyzer as follows:
  1341. [source,js]
  1342. ----------------------------------------------------
  1343. {
  1344. "settings": {
  1345. "analysis": {
  1346. "filter": {
  1347. "swedish_stop": {
  1348. "type": "stop",
  1349. "stopwords": "_swedish_" <1>
  1350. },
  1351. "swedish_keywords": {
  1352. "type": "keyword_marker",
  1353. "keywords": [] <2>
  1354. },
  1355. "swedish_stemmer": {
  1356. "type": "stemmer",
  1357. "language": "swedish"
  1358. }
  1359. },
  1360. "analyzer": {
  1361. "swedish": {
  1362. "tokenizer": "standard",
  1363. "filter": [
  1364. "lowercase",
  1365. "swedish_stop",
  1366. "swedish_keywords",
  1367. "swedish_stemmer"
  1368. ]
  1369. }
  1370. }
  1371. }
  1372. }
  1373. }
  1374. ----------------------------------------------------
  1375. <1> The default stopwords can be overridden with the `stopwords`
  1376. or `stopwords_path` parameters.
  1377. <2> This filter should be removed unless there are words which should
  1378. be excluded from stemming.
  1379. [[turkish-analyzer]]
  1380. ===== `turkish` analyzer
  1381. The `turkish` analyzer could be reimplemented as a `custom` analyzer as follows:
  1382. [source,js]
  1383. ----------------------------------------------------
  1384. {
  1385. "settings": {
  1386. "analysis": {
  1387. "filter": {
  1388. "turkish_stop": {
  1389. "type": "stop",
  1390. "stopwords": "_turkish_" <1>
  1391. },
  1392. "turkish_lowercase": {
  1393. "type": "lowercase",
  1394. "language": "turkish"
  1395. },
  1396. "turkish_keywords": {
  1397. "type": "keyword_marker",
  1398. "keywords": [] <2>
  1399. },
  1400. "turkish_stemmer": {
  1401. "type": "stemmer",
  1402. "language": "turkish"
  1403. }
  1404. },
  1405. "analyzer": {
  1406. "turkish": {
  1407. "tokenizer": "standard",
  1408. "filter": [
  1409. "apostrophe",
  1410. "turkish_lowercase",
  1411. "turkish_stop",
  1412. "turkish_keywords",
  1413. "turkish_stemmer"
  1414. ]
  1415. }
  1416. }
  1417. }
  1418. }
  1419. }
  1420. ----------------------------------------------------
  1421. <1> The default stopwords can be overridden with the `stopwords`
  1422. or `stopwords_path` parameters.
  1423. <2> This filter should be removed unless there are words which should
  1424. be excluded from stemming.
  1425. [[thai-analyzer]]
  1426. ===== `thai` analyzer
  1427. The `thai` analyzer could be reimplemented as a `custom` analyzer as follows:
  1428. [source,js]
  1429. ----------------------------------------------------
  1430. {
  1431. "settings": {
  1432. "analysis": {
  1433. "filter": {
  1434. "thai_stop": {
  1435. "type": "stop",
  1436. "stopwords": "_thai_" <1>
  1437. }
  1438. },
  1439. "analyzer": {
  1440. "thai": {
  1441. "tokenizer": "thai",
  1442. "filter": [
  1443. "lowercase",
  1444. "thai_stop"
  1445. ]
  1446. }
  1447. }
  1448. }
  1449. }
  1450. }
  1451. ----------------------------------------------------
  1452. <1> The default stopwords can be overridden with the `stopwords`
  1453. or `stopwords_path` parameters.