fts4langid.test 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. # 2012 March 01
  2. #
  3. # The author disclaims copyright to this source code. In place of
  4. # a legal notice, here is a blessing:
  5. #
  6. # May you do good and not evil.
  7. # May you find forgiveness for yourself and forgive others.
  8. # May you share freely, never taking more than you give.
  9. #
  10. #*************************************************************************
  11. # This file implements regression tests for SQLite library. The
  12. # focus of this script is testing the languageid=xxx FTS4 option.
  13. #
  14. set testdir [file dirname $argv0]
  15. source $testdir/tester.tcl
  16. set ::testprefix fts4content
  17. # If SQLITE_ENABLE_FTS3 is defined, omit this file.
  18. ifcapable !fts3 {
  19. finish_test
  20. return
  21. }
  22. set ::testprefix fts4langid
  23. #---------------------------------------------------------------------------
  24. # Test plan:
  25. #
  26. # 1.* - Warm-body tests created for specific purposes during development.
  27. # Passing these doesn't really prove much.
  28. #
  29. # 2.1.* - Test that FTS queries only ever return rows associated with
  30. # the requested language.
  31. #
  32. # 2.2.* - Same as 2.1.*, after an 'optimize' command.
  33. #
  34. # 2.3.* - Same as 2.1.*, after a 'rebuild' command.
  35. #
  36. # 3.* - Tests with content= tables. Both where there is a real
  37. # underlying content table and where there is not.
  38. #
  39. # 4.* - Test that if one is provided, the tokenizer xLanguage method
  40. # is called to configure the tokenizer before tokenizing query
  41. # or document text.
  42. #
  43. # 5.* - Test the fts4aux table when the associated FTS4 table contains
  44. # multiple languages.
  45. #
  46. do_execsql_test 1.1 {
  47. CREATE VIRTUAL TABLE t1 USING fts4(a, b, languageid=lang_id);
  48. }
  49. do_execsql_test 1.2 {
  50. SELECT sql FROM sqlite_master WHERE name = 't1_content';
  51. } {{CREATE TABLE 't1_content'(docid INTEGER PRIMARY KEY, 'c0a', 'c1b', langid)}}
  52. do_execsql_test 1.3 {SELECT docid FROM t1} {}
  53. do_execsql_test 1.4 {SELECT lang_id FROM t1} {}
  54. do_execsql_test 1.5 {INSERT INTO t1(a, b) VALUES('aaa', 'bbb')}
  55. do_execsql_test 1.6 {SELECT lang_id FROM t1 } {0}
  56. do_execsql_test 1.7 {INSERT INTO t1(a, b, lang_id) VALUES('aaa', 'bbb', 4)}
  57. do_execsql_test 1.8 {SELECT lang_id FROM t1 } {0 4}
  58. do_execsql_test 1.9 {INSERT INTO t1(a, b, lang_id) VALUES('aaa', 'bbb', 'xyz')}
  59. do_execsql_test 1.10 {SELECT lang_id FROM t1} {0 4 0}
  60. do_execsql_test 1.11 {
  61. CREATE VIRTUAL TABLE t2 USING fts4;
  62. INSERT INTO t2 VALUES('abc');
  63. }
  64. do_execsql_test 1.12 { SELECT rowid FROM t2 WHERE content MATCH 'abc' } 1
  65. do_execsql_test 1.13 {
  66. DROP TABLE t1;
  67. CREATE VIRTUAL TABLE t1 USING fts4(languageid=lang_id);
  68. INSERT INTO t1(content) VALUES('a b c');
  69. INSERT INTO t1(content, lang_id) VALUES('a b c', 1);
  70. }
  71. do_execsql_test 1.14 {
  72. SELECT rowid FROM t1 WHERE t1 MATCH 'b';
  73. } {1}
  74. do_execsql_test 1.15 {
  75. SELECT rowid FROM t1 WHERE t1 MATCH 'b' AND lang_id = 0;
  76. } {1}
  77. do_execsql_test 1.16 {
  78. SELECT rowid FROM t1 WHERE t1 MATCH 'b' AND lang_id = 1;
  79. } {2}
  80. do_catchsql_test 1.17 {
  81. INSERT INTO t1(content, lang_id) VALUES('123', -1);
  82. } {1 {constraint failed}}
  83. do_execsql_test 1.18 {
  84. DROP TABLE t1;
  85. CREATE VIRTUAL TABLE t1 USING fts4(languageid=lang_id);
  86. INSERT INTO t1(content, lang_id) VALUES('A', 13);
  87. INSERT INTO t1(content, lang_id) VALUES('B', 13);
  88. INSERT INTO t1(content, lang_id) VALUES('C', 13);
  89. INSERT INTO t1(content, lang_id) VALUES('D', 13);
  90. INSERT INTO t1(content, lang_id) VALUES('E', 13);
  91. INSERT INTO t1(content, lang_id) VALUES('F', 13);
  92. INSERT INTO t1(content, lang_id) VALUES('G', 13);
  93. INSERT INTO t1(content, lang_id) VALUES('H', 13);
  94. INSERT INTO t1(content, lang_id) VALUES('I', 13);
  95. INSERT INTO t1(content, lang_id) VALUES('J', 13);
  96. INSERT INTO t1(content, lang_id) VALUES('K', 13);
  97. INSERT INTO t1(content, lang_id) VALUES('L', 13);
  98. INSERT INTO t1(content, lang_id) VALUES('M', 13);
  99. INSERT INTO t1(content, lang_id) VALUES('N', 13);
  100. INSERT INTO t1(content, lang_id) VALUES('O', 13);
  101. INSERT INTO t1(content, lang_id) VALUES('P', 13);
  102. INSERT INTO t1(content, lang_id) VALUES('Q', 13);
  103. INSERT INTO t1(content, lang_id) VALUES('R', 13);
  104. INSERT INTO t1(content, lang_id) VALUES('S', 13);
  105. SELECT rowid FROM t1 WHERE t1 MATCH 'A';
  106. } {}
  107. #-------------------------------------------------------------------------
  108. # Test cases 2.*
  109. #
  110. proc build_multilingual_db_1 {db} {
  111. $db eval { CREATE VIRTUAL TABLE t2 USING fts4(x, y, languageid=l) }
  112. set xwords [list zero one two three four five six seven eight nine ten]
  113. set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
  114. for {set i 0} {$i < 1000} {incr i} {
  115. set iLangid [expr $i%9]
  116. set x ""
  117. set y ""
  118. set x [list]
  119. lappend x [lindex $xwords [expr ($i / 1000) % 10]]
  120. lappend x [lindex $xwords [expr ($i / 100) % 10]]
  121. lappend x [lindex $xwords [expr ($i / 10) % 10]]
  122. lappend x [lindex $xwords [expr ($i / 1) % 10]]
  123. set y [list]
  124. lappend y [lindex $ywords [expr ($i / 1000) % 10]]
  125. lappend y [lindex $ywords [expr ($i / 100) % 10]]
  126. lappend y [lindex $ywords [expr ($i / 10) % 10]]
  127. lappend y [lindex $ywords [expr ($i / 1) % 10]]
  128. $db eval { INSERT INTO t2(docid, x, y, l) VALUES($i, $x, $y, $iLangid) }
  129. }
  130. $db eval {
  131. CREATE TABLE data(x, y, l);
  132. INSERT INTO data(rowid, x, y, l) SELECT docid, x, y, l FROM t2;
  133. }
  134. }
  135. proc rowid_list_set_langid {langid} {
  136. set ::rowid_list_langid $langid
  137. }
  138. proc rowid_list {pattern} {
  139. set langid $::rowid_list_langid
  140. set res [list]
  141. db eval {SELECT rowid, x, y FROM data WHERE l = $langid ORDER BY rowid ASC} {
  142. if {[string match "*$pattern*" $x] || [string match "*$pattern*" $y]} {
  143. lappend res $rowid
  144. }
  145. }
  146. return $res
  147. }
  148. proc or_merge_list {list1 list2} {
  149. set res [list]
  150. set i1 0
  151. set i2 0
  152. set n1 [llength $list1]
  153. set n2 [llength $list2]
  154. while {$i1 < $n1 && $i2 < $n2} {
  155. set e1 [lindex $list1 $i1]
  156. set e2 [lindex $list2 $i2]
  157. if {$e1==$e2} {
  158. lappend res $e1
  159. incr i1
  160. incr i2
  161. } elseif {$e1 < $e2} {
  162. lappend res $e1
  163. incr i1
  164. } else {
  165. lappend res $e2
  166. incr i2
  167. }
  168. }
  169. concat $res [lrange $list1 $i1 end] [lrange $list2 $i2 end]
  170. }
  171. proc or_merge_lists {args} {
  172. set res [lindex $args 0]
  173. for {set i 1} {$i < [llength $args]} {incr i} {
  174. set res [or_merge_list $res [lindex $args $i]]
  175. }
  176. set res
  177. }
  178. proc and_merge_list {list1 list2} {
  179. foreach i $list2 { set a($i) 1 }
  180. set res [list]
  181. foreach i $list1 {
  182. if {[info exists a($i)]} {lappend res $i}
  183. }
  184. set res
  185. }
  186. proc and_merge_lists {args} {
  187. set res [lindex $args 0]
  188. for {set i 1} {$i < [llength $args]} {incr i} {
  189. set res [and_merge_list $res [lindex $args $i]]
  190. }
  191. set res
  192. }
  193. proc filter_list {list langid} {
  194. set res [list]
  195. foreach i $list {
  196. if {($i % 9) == $langid} {lappend res $i}
  197. }
  198. set res
  199. }
  200. do_test 2.0 {
  201. reset_db
  202. build_multilingual_db_1 db
  203. } {}
  204. proc do_test_query1 {tn query res_script} {
  205. for {set langid 0} {$langid < 10} {incr langid} {
  206. rowid_list_set_langid $langid
  207. set res [eval $res_script]
  208. set actual [
  209. execsql {SELECT docid FROM t2 WHERE t2 MATCH $query AND l = $langid}
  210. ]
  211. do_test $tn.$langid [list set {} $actual] $res
  212. }
  213. }
  214. # Run some queries.
  215. do_test_query1 2.1.1 {delta} { rowid_list delta }
  216. do_test_query1 2.1.2 {"zero one two"} { rowid_list "zero one two" }
  217. do_test_query1 2.1.3 {zero one two} {
  218. and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
  219. }
  220. do_test_query1 2.1.4 {"zero one" OR "one two"} {
  221. or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
  222. }
  223. # Now try the same tests as above, but after running the 'optimize'
  224. # command on the FTS table.
  225. #
  226. do_execsql_test 2.2 {
  227. INSERT INTO t2(t2) VALUES('optimize');
  228. SELECT count(*) FROM t2_segdir;
  229. } {9}
  230. do_test_query1 2.2.1 {delta} { rowid_list delta }
  231. do_test_query1 2.2.2 {"zero one two"} { rowid_list "zero one two" }
  232. do_test_query1 2.2.3 {zero one two} {
  233. and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
  234. }
  235. do_test_query1 2.2.4 {"zero one" OR "one two"} {
  236. or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
  237. }
  238. # And rebuild.
  239. #
  240. do_test 2.3 {
  241. reset_db
  242. build_multilingual_db_1 db
  243. execsql { INSERT INTO t2(t2) VALUES('rebuild') }
  244. } {}
  245. do_test_query1 2.3.1 {delta} { rowid_list delta }
  246. do_test_query1 2.3.2 {"zero one two"} { rowid_list "zero one two" }
  247. do_test_query1 2.3.3 {zero one two} {
  248. and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
  249. }
  250. do_test_query1 2.3.4 {"zero one" OR "one two"} {
  251. or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
  252. }
  253. #-------------------------------------------------------------------------
  254. # Test cases 3.*
  255. #
  256. do_test 3.0 {
  257. reset_db
  258. build_multilingual_db_1 db
  259. execsql {
  260. CREATE TABLE t3_data(l, x, y);
  261. INSERT INTO t3_data(rowid, l, x, y) SELECT docid, l, x, y FROM t2;
  262. DROP TABLE t2;
  263. }
  264. } {}
  265. do_execsql_test 3.1 {
  266. CREATE VIRTUAL TABLE t2 USING fts4(content=t3_data, languageid=l);
  267. INSERT INTO t2(t2) VALUES('rebuild');
  268. }
  269. do_test_query1 3.1.1 {delta} { rowid_list delta }
  270. do_test_query1 3.1.2 {"zero one two"} { rowid_list "zero one two" }
  271. do_test_query1 3.1.3 {zero one two} {
  272. and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
  273. }
  274. do_test_query1 3.1.4 {"zero one" OR "one two"} {
  275. or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
  276. }
  277. do_execsql_test 3.2.1 {
  278. DROP TABLE t2;
  279. CREATE VIRTUAL TABLE t2 USING fts4(x, y, languageid=l, content=nosuchtable);
  280. }
  281. do_execsql_test 3.2.2 {
  282. INSERT INTO t2(docid, x, y, l) SELECT rowid, x, y, l FROM t3_data;
  283. }
  284. do_execsql_test 3.2.3 {
  285. DROP TABLE t3_data;
  286. }
  287. do_test_query1 3.3.1 {delta} { rowid_list delta }
  288. do_test_query1 3.3.2 {"zero one two"} { rowid_list "zero one two" }
  289. do_test_query1 3.3.3 {zero one two} {
  290. and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
  291. }
  292. do_test_query1 3.3.4 {"zero one" OR "one two"} {
  293. or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
  294. }
  295. #-------------------------------------------------------------------------
  296. # Test cases 4.*
  297. #
  298. proc build_multilingual_db_2 {db} {
  299. $db eval {
  300. CREATE VIRTUAL TABLE t4 USING fts4(
  301. tokenize=testtokenizer,
  302. languageid=lid
  303. );
  304. }
  305. for {set i 0} {$i < 50} {incr i} {
  306. execsql {
  307. INSERT INTO t4(docid, content, lid) VALUES($i, 'The Quick Brown Fox', $i)
  308. }
  309. }
  310. }
  311. do_test 4.1.0 {
  312. reset_db
  313. set ptr [fts3_test_tokenizer]
  314. execsql { SELECT fts3_tokenizer('testtokenizer', $ptr) }
  315. build_multilingual_db_2 db
  316. } {}
  317. do_execsql_test 4.1.1 {
  318. SELECT docid FROM t4 WHERE t4 MATCH 'quick';
  319. } {0}
  320. do_execsql_test 4.1.2 {
  321. SELECT docid FROM t4 WHERE t4 MATCH 'quick' AND lid=1;
  322. } {}
  323. do_execsql_test 4.1.3 {
  324. SELECT docid FROM t4 WHERE t4 MATCH 'Quick' AND lid=1;
  325. } {1}
  326. for {set i 0} {$i < 50} {incr i} {
  327. do_execsql_test 4.1.4.$i {
  328. SELECT count(*) FROM t4 WHERE t4 MATCH 'fox' AND lid=$i;
  329. } [expr 0==($i%2)]
  330. }
  331. do_catchsql_test 4.1.5 {
  332. INSERT INTO t4(content, lid) VALUES('hello world', 101)
  333. } {1 {SQL logic error or missing database}}
  334. #-------------------------------------------------------------------------
  335. # Test cases 5.*
  336. #
  337. # The following test cases are designed to detect a 32-bit overflow bug
  338. # that existed at one point.
  339. #
  340. proc build_multilingual_db_3 {db} {
  341. $db eval {
  342. CREATE VIRTUAL TABLE t5 USING fts4(languageid=lid);
  343. }
  344. set languages [list 0 1 2 [expr 1<<30]]
  345. foreach lid $languages {
  346. execsql {
  347. INSERT INTO t5(docid, content, lid) VALUES(
  348. $lid, 'My language is ' || $lid, $lid
  349. )
  350. }
  351. }
  352. }
  353. do_test 5.1.0 {
  354. reset_db
  355. build_multilingual_db_3 db
  356. } {}
  357. do_execsql_test 5.1.1 {
  358. SELECT level FROM t5_segdir;
  359. } [list 0 1024 2048 [expr 1<<40]]
  360. do_execsql_test 5.1.2 {SELECT docid FROM t5 WHERE t5 MATCH 'language'} 0
  361. foreach langid [list 0 1 2 [expr 1<<30]] {
  362. do_execsql_test 5.2.$langid {
  363. SELECT docid FROM t5 WHERE t5 MATCH 'language' AND lid = $langid
  364. } $langid
  365. }
  366. set lid [expr 1<<30]
  367. do_execsql_test 5.3.1 {
  368. CREATE VIRTUAL TABLE t6 USING fts4(languageid=lid);
  369. INSERT INTO t6 VALUES('I belong to language 0!');
  370. }
  371. do_test 5.3.2 {
  372. for {set i 0} {$i < 20} {incr i} {
  373. execsql {
  374. INSERT INTO t6(content, lid) VALUES(
  375. 'I (row '||$i||') belong to langauge N!', $lid
  376. );
  377. }
  378. }
  379. execsql { SELECT docid FROM t6 WHERE t6 MATCH 'belong' }
  380. } {1}
  381. do_test 5.3.3 {
  382. execsql { SELECT docid FROM t6 WHERE t6 MATCH 'belong' AND lid=$lid}
  383. } {2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21}
  384. do_execsql_test 5.3.4 { INSERT INTO t6(t6) VALUES('optimize') } {}
  385. do_execsql_test 5.3.5 { SELECT docid FROM t6 WHERE t6 MATCH 'belong' } {1}
  386. do_execsql_test 5.3.6 {
  387. SELECT docid FROM t6 WHERE t6 MATCH 'belong' AND lid=$lid
  388. } {2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21}
  389. set lid [expr 1<<30]
  390. foreach lid [list 4 [expr 1<<30]] {
  391. do_execsql_test 5.4.$lid.1 {
  392. DELETE FROM t6;
  393. SELECT count(*) FROM t6_segdir;
  394. SELECT count(*) FROM t6_segments;
  395. } {0 0}
  396. do_execsql_test 5.4.$lid.2 {
  397. INSERT INTO t6(content, lid) VALUES('zero zero zero', $lid);
  398. INSERT INTO t6(content, lid) VALUES('zero zero one', $lid);
  399. INSERT INTO t6(content, lid) VALUES('zero one zero', $lid);
  400. INSERT INTO t6(content, lid) VALUES('zero one one', $lid);
  401. INSERT INTO t6(content, lid) VALUES('one zero zero', $lid);
  402. INSERT INTO t6(content, lid) VALUES('one zero one', $lid);
  403. INSERT INTO t6(content, lid) VALUES('one one zero', $lid);
  404. INSERT INTO t6(content, lid) VALUES('one one one', $lid);
  405. SELECT docid FROM t6 WHERE t6 MATCH '"zero zero"' AND lid=$lid;
  406. } {1 2 5}
  407. do_execsql_test 5.4.$lid.3 {
  408. SELECT count(*) FROM t6_segdir;
  409. SELECT count(*) FROM t6_segments;
  410. } {8 0}
  411. do_execsql_test 5.4.$lid.4 {
  412. INSERT INTO t6(t6) VALUES('merge=100,3');
  413. INSERT INTO t6(t6) VALUES('merge=100,3');
  414. SELECT docid FROM t6 WHERE t6 MATCH '"zero zero"' AND lid=$lid;
  415. } {1 2 5}
  416. do_execsql_test 5.4.$lid.5 {
  417. SELECT count(*) FROM t6_segdir;
  418. SELECT count(*) FROM t6_segments;
  419. } {4 4}
  420. }
  421. finish_test