fts3rnd.test 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. # 2009 December 03
  2. #
  3. # May you do good and not evil.
  4. # May you find forgiveness for yourself and forgive others.
  5. # May you share freely, never taking more than you give.
  6. #
  7. #***********************************************************************
  8. #
  9. # Brute force (random data) tests for FTS3.
  10. #
  11. #-------------------------------------------------------------------------
  12. #
  13. # The FTS3 tests implemented in this file focus on testing that FTS3
  14. # returns the correct set of documents for various types of full-text
  15. # query. This is done using pseudo-randomly generated data and queries.
  16. # The expected result of each query is calculated using Tcl code.
  17. #
  18. # 1. The database is initialized to contain a single table with three
  19. # columns. 100 rows are inserted into the table. Each of the three
  20. # values in each row is a document consisting of between 0 and 100
  21. # terms. Terms are selected from a vocabulary of $G(nVocab) terms.
  22. #
  23. # 2. The following is performed 100 times:
  24. #
  25. # a. A row is inserted into the database. The row contents are
  26. # generated as in step 1. The docid is a pseudo-randomly selected
  27. # value between 0 and 1000000.
  28. #
  29. # b. A psuedo-randomly selected row is updated. One of its columns is
  30. # set to contain a new document generated in the same way as the
  31. # documents in step 1.
  32. #
  33. # c. A psuedo-randomly selected row is deleted.
  34. #
  35. # d. For each of several types of fts3 queries, 10 SELECT queries
  36. # of the form:
  37. #
  38. # SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
  39. #
  40. # are evaluated. The results are compared to those calculated by
  41. # Tcl code in this file. The patterns used for the different query
  42. # types are:
  43. #
  44. # 1. query = <term>
  45. # 2. query = <prefix>
  46. # 3. query = "<term> <term>"
  47. # 4. query = "<term> <term> <term>"
  48. # 5. query = "<prefix> <prefix> <prefix>"
  49. # 6. query = <term> NEAR <term>
  50. # 7. query = <term> NEAR/11 <term> NEAR/11 <term>
  51. # 8. query = <term> OR <term>
  52. # 9. query = <term> NOT <term>
  53. # 10. query = <term> AND <term>
  54. # 11. query = <term> NEAR <term> OR <term> NEAR <term>
  55. # 12. query = <term> NEAR <term> NOT <term> NEAR <term>
  56. # 13. query = <term> NEAR <term> AND <term> NEAR <term>
  57. #
  58. # where <term> is a term psuedo-randomly selected from the vocabulary
  59. # and prefix is the first 2 characters of such a term followed by
  60. # a "*" character.
  61. #
  62. # Every second iteration, steps (a) through (d) above are performed
  63. # within a single transaction. This forces the queries in (d) to
  64. # read data from both the database and the in-memory hash table
  65. # that caches the full-text index entries created by steps (a), (b)
  66. # and (c) until the transaction is committed.
  67. #
  68. # The procedure above is run 5 times, using advisory fts3 node sizes of 50,
  69. # 500, 1000 and 2000 bytes.
  70. #
  71. # After the test using an advisory node-size of 50, an OOM test is run using
  72. # the database. This test is similar to step (d) above, except that it tests
  73. # the effects of transient and persistent OOM conditions encountered while
  74. # executing each query.
  75. #
  76. set testdir [file dirname $argv0]
  77. source $testdir/tester.tcl
  78. # If this build does not include FTS3, skip the tests in this file.
  79. #
  80. ifcapable !fts3 { finish_test ; return }
  81. source $testdir/fts3_common.tcl
  82. source $testdir/malloc_common.tcl
  83. set G(nVocab) 100
  84. set nVocab 100
  85. set lVocab [list]
  86. expr srand(0)
  87. # Generate a vocabulary of nVocab words. Each word is 3 characters long.
  88. #
  89. set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
  90. for {set i 0} {$i < $nVocab} {incr i} {
  91. set len [expr int(rand()*3)+2]
  92. set word [lindex $lChar [expr int(rand()*26)]]
  93. append word [lindex $lChar [expr int(rand()*26)]]
  94. if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
  95. if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
  96. lappend lVocab $word
  97. }
  98. proc random_term {} {
  99. lindex $::lVocab [expr {int(rand()*$::nVocab)}]
  100. }
  101. # Return a document consisting of $nWord arbitrarily selected terms
  102. # from the $::lVocab list.
  103. #
  104. proc generate_doc {nWord} {
  105. set doc [list]
  106. for {set i 0} {$i < $nWord} {incr i} {
  107. lappend doc [random_term]
  108. }
  109. return $doc
  110. }
  111. # Primitives to update the table.
  112. #
  113. unset -nocomplain t1
  114. proc insert_row {rowid} {
  115. set a [generate_doc [expr int((rand()*100))]]
  116. set b [generate_doc [expr int((rand()*100))]]
  117. set c [generate_doc [expr int((rand()*100))]]
  118. execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
  119. set ::t1($rowid) [list $a $b $c]
  120. }
  121. proc delete_row {rowid} {
  122. execsql { DELETE FROM t1 WHERE rowid = $rowid }
  123. catch {unset ::t1($rowid)}
  124. }
  125. proc update_row {rowid} {
  126. set cols {a b c}
  127. set iCol [expr int(rand()*3)]
  128. set doc [generate_doc [expr int((rand()*100))]]
  129. lset ::t1($rowid) $iCol $doc
  130. execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
  131. }
  132. proc simple_phrase {zPrefix} {
  133. set ret [list]
  134. set reg [string map {* {[^ ]*}} $zPrefix]
  135. set reg " $reg "
  136. foreach key [lsort -integer [array names ::t1]] {
  137. set value $::t1($key)
  138. set cnt [list]
  139. foreach col $value {
  140. if {[regexp $reg " $col "]} { lappend ret $key ; break }
  141. }
  142. }
  143. #lsort -uniq -integer $ret
  144. set ret
  145. }
  146. # This [proc] is used to test the FTS3 matchinfo() function.
  147. #
  148. proc simple_token_matchinfo {zToken bDesc} {
  149. set nDoc(0) 0
  150. set nDoc(1) 0
  151. set nDoc(2) 0
  152. set nHit(0) 0
  153. set nHit(1) 0
  154. set nHit(2) 0
  155. set dir -inc
  156. if {$bDesc} { set dir -dec }
  157. foreach key [array names ::t1] {
  158. set value $::t1($key)
  159. set a($key) [list]
  160. foreach i {0 1 2} col $value {
  161. set hit [llength [lsearch -all $col $zToken]]
  162. lappend a($key) $hit
  163. incr nHit($i) $hit
  164. if {$hit>0} { incr nDoc($i) }
  165. }
  166. }
  167. set ret [list]
  168. foreach docid [lsort -integer $dir [array names a]] {
  169. if { [lindex [lsort -integer $a($docid)] end] } {
  170. set matchinfo [list 1 3]
  171. foreach i {0 1 2} hit $a($docid) {
  172. lappend matchinfo $hit $nHit($i) $nDoc($i)
  173. }
  174. lappend ret $docid $matchinfo
  175. }
  176. }
  177. set ret
  178. }
  179. proc simple_near {termlist nNear} {
  180. set ret [list]
  181. foreach {key value} [array get ::t1] {
  182. foreach v $value {
  183. set l [lsearch -exact -all $v [lindex $termlist 0]]
  184. foreach T [lrange $termlist 1 end] {
  185. set l2 [list]
  186. foreach i $l {
  187. set iStart [expr $i - $nNear - 1]
  188. set iEnd [expr $i + $nNear + 1]
  189. if {$iStart < 0} {set iStart 0}
  190. foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
  191. incr i2 $iStart
  192. if {$i2 != $i} { lappend l2 $i2 }
  193. }
  194. }
  195. set l [lsort -uniq -integer $l2]
  196. }
  197. if {[llength $l]} {
  198. #puts "MATCH($key): $v"
  199. lappend ret $key
  200. }
  201. }
  202. }
  203. lsort -unique -integer $ret
  204. }
  205. # The following three procs:
  206. #
  207. # setup_not A B
  208. # setup_or A B
  209. # setup_and A B
  210. #
  211. # each take two arguments. Both arguments must be lists of integer values
  212. # sorted by value. The return value is the list produced by evaluating
  213. # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
  214. # AND.
  215. #
  216. proc setop_not {A B} {
  217. foreach b $B { set n($b) {} }
  218. set ret [list]
  219. foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
  220. return $ret
  221. }
  222. proc setop_or {A B} {
  223. lsort -integer -uniq [concat $A $B]
  224. }
  225. proc setop_and {A B} {
  226. foreach b $B { set n($b) {} }
  227. set ret [list]
  228. foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
  229. return $ret
  230. }
  231. proc mit {blob} {
  232. set scan(littleEndian) i*
  233. set scan(bigEndian) I*
  234. binary scan $blob $scan($::tcl_platform(byteOrder)) r
  235. return $r
  236. }
  237. db func mit mit
  238. set sqlite_fts3_enable_parentheses 1
  239. proc do_orderbydocid_test {tn sql res} {
  240. uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
  241. uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
  242. [lsort -int -dec $res]
  243. ]
  244. }
  245. set NUM_TRIALS 100
  246. foreach {nodesize order} {
  247. 50 DESC
  248. 50 ASC
  249. 500 ASC
  250. 1000 DESC
  251. 2000 ASC
  252. } {
  253. catch { array unset ::t1 }
  254. set testname "$nodesize/$order"
  255. # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
  256. #
  257. db transaction {
  258. catchsql { DROP TABLE t1 }
  259. execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
  260. execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
  261. for {set i 0} {$i < 100} {incr i} { insert_row $i }
  262. }
  263. for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
  264. catchsql COMMIT
  265. set DO_MALLOC_TEST 0
  266. set nRep 10
  267. if {$iTest==100 && $nodesize==50} {
  268. set DO_MALLOC_TEST 1
  269. set nRep 2
  270. }
  271. set ::testprefix fts3rnd-1.$testname.$iTest
  272. # Delete one row, update one row and insert one row.
  273. #
  274. set rows [array names ::t1]
  275. set nRow [llength $rows]
  276. set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
  277. set iDelete $iUpdate
  278. while {$iDelete == $iUpdate} {
  279. set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
  280. }
  281. set iInsert $iUpdate
  282. while {[info exists ::t1($iInsert)]} {
  283. set iInsert [expr {int(rand()*1000000)}]
  284. }
  285. execsql BEGIN
  286. insert_row $iInsert
  287. update_row $iUpdate
  288. delete_row $iDelete
  289. if {0==($iTest%2)} { execsql COMMIT }
  290. if {0==($iTest%2)} {
  291. #do_test 0 { fts3_integrity_check t1 } ok
  292. }
  293. # Pick 10 terms from the vocabulary. Check that the results of querying
  294. # the database for the set of documents containing each of these terms
  295. # is the same as the result obtained by scanning the contents of the Tcl
  296. # array for each term.
  297. #
  298. for {set i 0} {$i < 10} {incr i} {
  299. set term [random_term]
  300. do_select_test 1.$i.asc {
  301. SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
  302. ORDER BY docid ASC
  303. } [simple_token_matchinfo $term 0]
  304. do_select_test 1.$i.desc {
  305. SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
  306. ORDER BY docid DESC
  307. } [simple_token_matchinfo $term 1]
  308. }
  309. # This time, use the first two characters of each term as a term prefix
  310. # to query for. Test that querying the Tcl array produces the same results
  311. # as querying the FTS3 table for the prefix.
  312. #
  313. for {set i 0} {$i < $nRep} {incr i} {
  314. set prefix [string range [random_term] 0 end-1]
  315. set match "${prefix}*"
  316. do_orderbydocid_test 2.$i {
  317. SELECT docid FROM t1 WHERE t1 MATCH $match
  318. } [simple_phrase $match]
  319. }
  320. # Similar to the above, except for phrase queries.
  321. #
  322. for {set i 0} {$i < $nRep} {incr i} {
  323. set term [list [random_term] [random_term]]
  324. set match "\"$term\""
  325. do_orderbydocid_test 3.$i {
  326. SELECT docid FROM t1 WHERE t1 MATCH $match
  327. } [simple_phrase $term]
  328. }
  329. # Three word phrases.
  330. #
  331. for {set i 0} {$i < $nRep} {incr i} {
  332. set term [list [random_term] [random_term] [random_term]]
  333. set match "\"$term\""
  334. do_orderbydocid_test 4.$i {
  335. SELECT docid FROM t1 WHERE t1 MATCH $match
  336. } [simple_phrase $term]
  337. }
  338. # Three word phrases made up of term-prefixes.
  339. #
  340. for {set i 0} {$i < $nRep} {incr i} {
  341. set query "[string range [random_term] 0 end-1]* "
  342. append query "[string range [random_term] 0 end-1]* "
  343. append query "[string range [random_term] 0 end-1]*"
  344. set match "\"$query\""
  345. do_orderbydocid_test 5.$i {
  346. SELECT docid FROM t1 WHERE t1 MATCH $match
  347. } [simple_phrase $query]
  348. }
  349. # A NEAR query with terms as the arguments:
  350. #
  351. # ... MATCH '$term1 NEAR $term2' ...
  352. #
  353. for {set i 0} {$i < $nRep} {incr i} {
  354. set terms [list [random_term] [random_term]]
  355. set match [join $terms " NEAR "]
  356. do_orderbydocid_test 6.$i {
  357. SELECT docid FROM t1 WHERE t1 MATCH $match
  358. } [simple_near $terms 10]
  359. }
  360. # A 3-way NEAR query with terms as the arguments.
  361. #
  362. for {set i 0} {$i < $nRep} {incr i} {
  363. set terms [list [random_term] [random_term] [random_term]]
  364. set nNear 11
  365. set match [join $terms " NEAR/$nNear "]
  366. do_orderbydocid_test 7.$i {
  367. SELECT docid FROM t1 WHERE t1 MATCH $match
  368. } [simple_near $terms $nNear]
  369. }
  370. # Set operations on simple term queries.
  371. #
  372. foreach {tn op proc} {
  373. 8 OR setop_or
  374. 9 NOT setop_not
  375. 10 AND setop_and
  376. } {
  377. for {set i 0} {$i < $nRep} {incr i} {
  378. set term1 [random_term]
  379. set term2 [random_term]
  380. set match "$term1 $op $term2"
  381. do_orderbydocid_test $tn.$i {
  382. SELECT docid FROM t1 WHERE t1 MATCH $match
  383. } [$proc [simple_phrase $term1] [simple_phrase $term2]]
  384. }
  385. }
  386. # Set operations on NEAR queries.
  387. #
  388. foreach {tn op proc} {
  389. 11 OR setop_or
  390. 12 NOT setop_not
  391. 13 AND setop_and
  392. } {
  393. for {set i 0} {$i < $nRep} {incr i} {
  394. set term1 [random_term]
  395. set term2 [random_term]
  396. set term3 [random_term]
  397. set term4 [random_term]
  398. set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
  399. do_orderbydocid_test $tn.$i {
  400. SELECT docid FROM t1 WHERE t1 MATCH $match
  401. } [$proc \
  402. [simple_near [list $term1 $term2] 10] \
  403. [simple_near [list $term3 $term4] 10]
  404. ]
  405. }
  406. }
  407. catchsql COMMIT
  408. }
  409. }
  410. finish_test