123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460 |
- # 2009 December 03
- #
- # May you do good and not evil.
- # May you find forgiveness for yourself and forgive others.
- # May you share freely, never taking more than you give.
- #
- #***********************************************************************
- #
- # Brute force (random data) tests for FTS3.
- #
- #-------------------------------------------------------------------------
- #
- # The FTS3 tests implemented in this file focus on testing that FTS3
- # returns the correct set of documents for various types of full-text
- # query. This is done using pseudo-randomly generated data and queries.
- # The expected result of each query is calculated using Tcl code.
- #
- # 1. The database is initialized to contain a single table with three
- # columns. 100 rows are inserted into the table. Each of the three
- # values in each row is a document consisting of between 0 and 100
- # terms. Terms are selected from a vocabulary of $G(nVocab) terms.
- #
- # 2. The following is performed 100 times:
- #
- # a. A row is inserted into the database. The row contents are
- # generated as in step 1. The docid is a pseudo-randomly selected
- # value between 0 and 1000000.
- #
- # b. A psuedo-randomly selected row is updated. One of its columns is
- # set to contain a new document generated in the same way as the
- # documents in step 1.
- #
- # c. A psuedo-randomly selected row is deleted.
- #
- # d. For each of several types of fts3 queries, 10 SELECT queries
- # of the form:
- #
- # SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
- #
- # are evaluated. The results are compared to those calculated by
- # Tcl code in this file. The patterns used for the different query
- # types are:
- #
- # 1. query = <term>
- # 2. query = <prefix>
- # 3. query = "<term> <term>"
- # 4. query = "<term> <term> <term>"
- # 5. query = "<prefix> <prefix> <prefix>"
- # 6. query = <term> NEAR <term>
- # 7. query = <term> NEAR/11 <term> NEAR/11 <term>
- # 8. query = <term> OR <term>
- # 9. query = <term> NOT <term>
- # 10. query = <term> AND <term>
- # 11. query = <term> NEAR <term> OR <term> NEAR <term>
- # 12. query = <term> NEAR <term> NOT <term> NEAR <term>
- # 13. query = <term> NEAR <term> AND <term> NEAR <term>
- #
- # where <term> is a term psuedo-randomly selected from the vocabulary
- # and prefix is the first 2 characters of such a term followed by
- # a "*" character.
- #
- # Every second iteration, steps (a) through (d) above are performed
- # within a single transaction. This forces the queries in (d) to
- # read data from both the database and the in-memory hash table
- # that caches the full-text index entries created by steps (a), (b)
- # and (c) until the transaction is committed.
- #
- # The procedure above is run 5 times, using advisory fts3 node sizes of 50,
- # 500, 1000 and 2000 bytes.
- #
- # After the test using an advisory node-size of 50, an OOM test is run using
- # the database. This test is similar to step (d) above, except that it tests
- # the effects of transient and persistent OOM conditions encountered while
- # executing each query.
- #
- set testdir [file dirname $argv0]
- source $testdir/tester.tcl
- # If this build does not include FTS3, skip the tests in this file.
- #
- ifcapable !fts3 { finish_test ; return }
- source $testdir/fts3_common.tcl
- source $testdir/malloc_common.tcl
- set G(nVocab) 100
- set nVocab 100
- set lVocab [list]
- expr srand(0)
- # Generate a vocabulary of nVocab words. Each word is 3 characters long.
- #
- set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
- for {set i 0} {$i < $nVocab} {incr i} {
- set len [expr int(rand()*3)+2]
- set word [lindex $lChar [expr int(rand()*26)]]
- append word [lindex $lChar [expr int(rand()*26)]]
- if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
- if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
- lappend lVocab $word
- }
- proc random_term {} {
- lindex $::lVocab [expr {int(rand()*$::nVocab)}]
- }
- # Return a document consisting of $nWord arbitrarily selected terms
- # from the $::lVocab list.
- #
- proc generate_doc {nWord} {
- set doc [list]
- for {set i 0} {$i < $nWord} {incr i} {
- lappend doc [random_term]
- }
- return $doc
- }
- # Primitives to update the table.
- #
- unset -nocomplain t1
- proc insert_row {rowid} {
- set a [generate_doc [expr int((rand()*100))]]
- set b [generate_doc [expr int((rand()*100))]]
- set c [generate_doc [expr int((rand()*100))]]
- execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
- set ::t1($rowid) [list $a $b $c]
- }
- proc delete_row {rowid} {
- execsql { DELETE FROM t1 WHERE rowid = $rowid }
- catch {unset ::t1($rowid)}
- }
- proc update_row {rowid} {
- set cols {a b c}
- set iCol [expr int(rand()*3)]
- set doc [generate_doc [expr int((rand()*100))]]
- lset ::t1($rowid) $iCol $doc
- execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
- }
- proc simple_phrase {zPrefix} {
- set ret [list]
- set reg [string map {* {[^ ]*}} $zPrefix]
- set reg " $reg "
- foreach key [lsort -integer [array names ::t1]] {
- set value $::t1($key)
- set cnt [list]
- foreach col $value {
- if {[regexp $reg " $col "]} { lappend ret $key ; break }
- }
- }
- #lsort -uniq -integer $ret
- set ret
- }
- # This [proc] is used to test the FTS3 matchinfo() function.
- #
- proc simple_token_matchinfo {zToken bDesc} {
- set nDoc(0) 0
- set nDoc(1) 0
- set nDoc(2) 0
- set nHit(0) 0
- set nHit(1) 0
- set nHit(2) 0
- set dir -inc
- if {$bDesc} { set dir -dec }
- foreach key [array names ::t1] {
- set value $::t1($key)
- set a($key) [list]
- foreach i {0 1 2} col $value {
- set hit [llength [lsearch -all $col $zToken]]
- lappend a($key) $hit
- incr nHit($i) $hit
- if {$hit>0} { incr nDoc($i) }
- }
- }
- set ret [list]
- foreach docid [lsort -integer $dir [array names a]] {
- if { [lindex [lsort -integer $a($docid)] end] } {
- set matchinfo [list 1 3]
- foreach i {0 1 2} hit $a($docid) {
- lappend matchinfo $hit $nHit($i) $nDoc($i)
- }
- lappend ret $docid $matchinfo
- }
- }
- set ret
- }
- proc simple_near {termlist nNear} {
- set ret [list]
- foreach {key value} [array get ::t1] {
- foreach v $value {
- set l [lsearch -exact -all $v [lindex $termlist 0]]
- foreach T [lrange $termlist 1 end] {
- set l2 [list]
- foreach i $l {
- set iStart [expr $i - $nNear - 1]
- set iEnd [expr $i + $nNear + 1]
- if {$iStart < 0} {set iStart 0}
- foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
- incr i2 $iStart
- if {$i2 != $i} { lappend l2 $i2 }
- }
- }
- set l [lsort -uniq -integer $l2]
- }
- if {[llength $l]} {
- #puts "MATCH($key): $v"
- lappend ret $key
- }
- }
- }
- lsort -unique -integer $ret
- }
- # The following three procs:
- #
- # setup_not A B
- # setup_or A B
- # setup_and A B
- #
- # each take two arguments. Both arguments must be lists of integer values
- # sorted by value. The return value is the list produced by evaluating
- # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
- # AND.
- #
- proc setop_not {A B} {
- foreach b $B { set n($b) {} }
- set ret [list]
- foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
- return $ret
- }
- proc setop_or {A B} {
- lsort -integer -uniq [concat $A $B]
- }
- proc setop_and {A B} {
- foreach b $B { set n($b) {} }
- set ret [list]
- foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
- return $ret
- }
- proc mit {blob} {
- set scan(littleEndian) i*
- set scan(bigEndian) I*
- binary scan $blob $scan($::tcl_platform(byteOrder)) r
- return $r
- }
- db func mit mit
- set sqlite_fts3_enable_parentheses 1
- proc do_orderbydocid_test {tn sql res} {
- uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
- uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
- [lsort -int -dec $res]
- ]
- }
- set NUM_TRIALS 100
- foreach {nodesize order} {
- 50 DESC
- 50 ASC
- 500 ASC
- 1000 DESC
- 2000 ASC
- } {
- catch { array unset ::t1 }
- set testname "$nodesize/$order"
- # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
- #
- db transaction {
- catchsql { DROP TABLE t1 }
- execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
- execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
- for {set i 0} {$i < 100} {incr i} { insert_row $i }
- }
-
- for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
- catchsql COMMIT
- set DO_MALLOC_TEST 0
- set nRep 10
- if {$iTest==100 && $nodesize==50} {
- set DO_MALLOC_TEST 1
- set nRep 2
- }
- set ::testprefix fts3rnd-1.$testname.$iTest
-
- # Delete one row, update one row and insert one row.
- #
- set rows [array names ::t1]
- set nRow [llength $rows]
- set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
- set iDelete $iUpdate
- while {$iDelete == $iUpdate} {
- set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
- }
- set iInsert $iUpdate
- while {[info exists ::t1($iInsert)]} {
- set iInsert [expr {int(rand()*1000000)}]
- }
- execsql BEGIN
- insert_row $iInsert
- update_row $iUpdate
- delete_row $iDelete
- if {0==($iTest%2)} { execsql COMMIT }
- if {0==($iTest%2)} {
- #do_test 0 { fts3_integrity_check t1 } ok
- }
- # Pick 10 terms from the vocabulary. Check that the results of querying
- # the database for the set of documents containing each of these terms
- # is the same as the result obtained by scanning the contents of the Tcl
- # array for each term.
- #
- for {set i 0} {$i < 10} {incr i} {
- set term [random_term]
- do_select_test 1.$i.asc {
- SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
- ORDER BY docid ASC
- } [simple_token_matchinfo $term 0]
- do_select_test 1.$i.desc {
- SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
- ORDER BY docid DESC
- } [simple_token_matchinfo $term 1]
- }
- # This time, use the first two characters of each term as a term prefix
- # to query for. Test that querying the Tcl array produces the same results
- # as querying the FTS3 table for the prefix.
- #
- for {set i 0} {$i < $nRep} {incr i} {
- set prefix [string range [random_term] 0 end-1]
- set match "${prefix}*"
- do_orderbydocid_test 2.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [simple_phrase $match]
- }
- # Similar to the above, except for phrase queries.
- #
- for {set i 0} {$i < $nRep} {incr i} {
- set term [list [random_term] [random_term]]
- set match "\"$term\""
- do_orderbydocid_test 3.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [simple_phrase $term]
- }
- # Three word phrases.
- #
- for {set i 0} {$i < $nRep} {incr i} {
- set term [list [random_term] [random_term] [random_term]]
- set match "\"$term\""
- do_orderbydocid_test 4.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [simple_phrase $term]
- }
- # Three word phrases made up of term-prefixes.
- #
- for {set i 0} {$i < $nRep} {incr i} {
- set query "[string range [random_term] 0 end-1]* "
- append query "[string range [random_term] 0 end-1]* "
- append query "[string range [random_term] 0 end-1]*"
- set match "\"$query\""
- do_orderbydocid_test 5.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [simple_phrase $query]
- }
- # A NEAR query with terms as the arguments:
- #
- # ... MATCH '$term1 NEAR $term2' ...
- #
- for {set i 0} {$i < $nRep} {incr i} {
- set terms [list [random_term] [random_term]]
- set match [join $terms " NEAR "]
- do_orderbydocid_test 6.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [simple_near $terms 10]
- }
- # A 3-way NEAR query with terms as the arguments.
- #
- for {set i 0} {$i < $nRep} {incr i} {
- set terms [list [random_term] [random_term] [random_term]]
- set nNear 11
- set match [join $terms " NEAR/$nNear "]
- do_orderbydocid_test 7.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [simple_near $terms $nNear]
- }
-
- # Set operations on simple term queries.
- #
- foreach {tn op proc} {
- 8 OR setop_or
- 9 NOT setop_not
- 10 AND setop_and
- } {
- for {set i 0} {$i < $nRep} {incr i} {
- set term1 [random_term]
- set term2 [random_term]
- set match "$term1 $op $term2"
- do_orderbydocid_test $tn.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [$proc [simple_phrase $term1] [simple_phrase $term2]]
- }
- }
-
- # Set operations on NEAR queries.
- #
- foreach {tn op proc} {
- 11 OR setop_or
- 12 NOT setop_not
- 13 AND setop_and
- } {
- for {set i 0} {$i < $nRep} {incr i} {
- set term1 [random_term]
- set term2 [random_term]
- set term3 [random_term]
- set term4 [random_term]
- set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
- do_orderbydocid_test $tn.$i {
- SELECT docid FROM t1 WHERE t1 MATCH $match
- } [$proc \
- [simple_near [list $term1 $term2] 10] \
- [simple_near [list $term3 $term4] 10]
- ]
- }
- }
- catchsql COMMIT
- }
- }
- finish_test
|