123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- # 2009 November 04
- #
- # The author disclaims copyright to this source code. In place of
- # a legal notice, here is a blessing:
- #
- # May you do good and not evil.
- # May you find forgiveness for yourself and forgive others.
- # May you share freely, never taking more than you give.
- #
- #***********************************************************************
- #
- # This file contains common code used the fts3 tests. At one point
- # equivalent functionality was implemented in C code. But it is easier
- # to use Tcl.
- #
- #-------------------------------------------------------------------------
- # INSTRUCTIONS
- #
- # The following commands are available:
- #
- # fts3_build_db_1 N
- # Using database handle [db] create an FTS4 table named t1 and populate
- # it with N rows of data. N must be less than 10,000. Refer to the
- # header comments above the proc implementation below for details.
- #
- # fts3_build_db_2 N
- # Using database handle [db] create an FTS4 table named t2 and populate
- # it with N rows of data. N must be less than 100,000. Refer to the
- # header comments above the proc implementation below for details.
- #
- # fts3_integrity_check TBL
- # TBL must be an FTS table in the database currently opened by handle
- # [db]. This proc loads and tokenizes all documents within the table,
- # then checks that the current contents of the FTS index matches the
- # results.
- #
- # fts3_terms TBL WHERE
- # Todo.
- #
- # fts3_doclist TBL TERM WHERE
- # Todo.
- #
- #
- #
- #-------------------------------------------------------------------------
- # USAGE: fts3_build_db_1 SWITCHES N
- #
- # Build a sample FTS table in the database opened by database connection
- # [db]. The name of the new table is "t1".
- #
- proc fts3_build_db_1 {args} {
- set default(-module) fts4
- set nArg [llength $args]
- if {($nArg%2)==0} {
- error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
- }
- set n [lindex $args [expr $nArg-1]]
- array set opts [array get default]
- array set opts [lrange $args 0 [expr $nArg-2]]
- foreach k [array names opts] {
- if {0==[info exists default($k)]} { error "unknown option: $k" }
- }
- if {$n > 10000} {error "n must be <= 10000"}
- db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)"
- set xwords [list zero one two three four five six seven eight nine ten]
- set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
- for {set i 0} {$i < $n} {incr i} {
- set x ""
- set y ""
- set x [list]
- lappend x [lindex $xwords [expr ($i / 1000) % 10]]
- lappend x [lindex $xwords [expr ($i / 100) % 10]]
- lappend x [lindex $xwords [expr ($i / 10) % 10]]
- lappend x [lindex $xwords [expr ($i / 1) % 10]]
- set y [list]
- lappend y [lindex $ywords [expr ($i / 1000) % 10]]
- lappend y [lindex $ywords [expr ($i / 100) % 10]]
- lappend y [lindex $ywords [expr ($i / 10) % 10]]
- lappend y [lindex $ywords [expr ($i / 1) % 10]]
- db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) }
- }
- }
- #-------------------------------------------------------------------------
- # USAGE: fts3_build_db_2 N ARGS
- #
- # Build a sample FTS table in the database opened by database connection
- # [db]. The name of the new table is "t2".
- #
- proc fts3_build_db_2 {args} {
- set default(-module) fts4
- set default(-extra) ""
- set nArg [llength $args]
- if {($nArg%2)==0} {
- error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
- }
- set n [lindex $args [expr $nArg-1]]
- array set opts [array get default]
- array set opts [lrange $args 0 [expr $nArg-2]]
- foreach k [array names opts] {
- if {0==[info exists default($k)]} { error "unknown option: $k" }
- }
- if {$n > 100000} {error "n must be <= 100000"}
- set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content"
- if {$opts(-extra) != ""} {
- append sql ", " $opts(-extra)
- }
- append sql ")"
- db eval $sql
- set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""]
- for {set i 0} {$i < $n} {incr i} {
- set word ""
- set nChar [llength $chars]
- append word [lindex $chars [expr {($i / 1) % $nChar}]]
- append word [lindex $chars [expr {($i / $nChar) % $nChar}]]
- append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]]
- db eval { INSERT INTO t2(docid, content) VALUES($i, $word) }
- }
- }
- #-------------------------------------------------------------------------
- # USAGE: fts3_integrity_check TBL
- #
- # This proc is used to verify that the full-text index is consistent with
- # the contents of the fts3 table. In other words, it checks that the
- # data in the %_contents table matches that in the %_segdir and %_segments
- # tables.
- #
- # This is not an efficient procedure. It uses a lot of memory and a lot
- # of CPU. But it is better than not checking at all.
- #
- # The procedure is:
- #
- # 1) Read the entire full-text index from the %_segdir and %_segments
- # tables into memory. For each entry in the index, the following is
- # done:
- #
- # set C($iDocid,$iCol,$iPosition) $zTerm
- #
- # 2) Iterate through each column of each row of the %_content table.
- # Tokenize all documents, and check that for each token there is
- # a corresponding entry in the $C array. After checking a token,
- # [unset] the $C array entry.
- #
- # 3) Check that array $C is now empty.
- #
- #
- proc fts3_integrity_check {tbl} {
- fts3_read2 $tbl 1 A
- foreach zTerm [array names A] {
- #puts $zTerm
- foreach doclist $A($zTerm) {
- set docid 0
- while {[string length $doclist]>0} {
- set iCol 0
- set iPos 0
- set lPos [list]
- set lCol [list]
- # First varint of a doclist-entry is the docid. Delta-compressed
- # with respect to the docid of the previous entry.
- #
- incr docid [gobble_varint doclist]
- if {[info exists D($zTerm,$docid)]} {
- while {[set iDelta [gobble_varint doclist]] != 0} {}
- continue
- }
- set D($zTerm,$docid) 1
- # Gobble varints until the 0x00 that terminates the doclist-entry
- # is found.
- while {[set iDelta [gobble_varint doclist]] > 0} {
- if {$iDelta == 1} {
- set iCol [gobble_varint doclist]
- set iPos 0
- } else {
- incr iPos $iDelta
- incr iPos -2
- set C($docid,$iCol,$iPos) $zTerm
- }
- }
- }
- }
- }
- foreach key [array names C] {
- #puts "$key -> $C($key)"
- }
- db eval "SELECT * FROM ${tbl}_content" E {
- set iCol 0
- set iDoc $E(docid)
- foreach col [lrange $E(*) 1 end] {
- set c $E($col)
- set sql {SELECT fts3_tokenizer_test('simple', $c)}
- foreach {pos term dummy} [db one $sql] {
- if {![info exists C($iDoc,$iCol,$pos)]} {
- set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
- lappend errors $es
- } else {
- if {[string compare $C($iDoc,$iCol,$pos) $term]} {
- set es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
- append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
- lappend errors $es
- }
- unset C($iDoc,$iCol,$pos)
- }
- }
- incr iCol
- }
- }
- foreach c [array names C] {
- lappend errors "Bad index entry: $c -> $C($c)"
- }
- if {[info exists errors]} { return [join $errors "\n"] }
- return "ok"
- }
- # USAGE: fts3_terms TBL WHERE
- #
- # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
- # SQL expression that will be used as the WHERE clause when scanning
- # the %_segdir table. As in the following query:
- #
- # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
- #
- # This function returns a list of all terms present in the segments
- # selected by the statement above.
- #
- proc fts3_terms {tbl where} {
- fts3_read $tbl $where a
- return [lsort [array names a]]
- }
- # USAGE: fts3_doclist TBL TERM WHERE
- #
- # Argument TBL must be the name of an FTS3 table. TERM is a term that may
- # or may not be present in the table. Argument WHERE is used to select a
- # subset of the b-tree segments in the associated full-text index as
- # described above for [fts3_terms].
- #
- # This function returns the results of merging the doclists associated
- # with TERM in the selected segments. Each doclist is an element of the
- # returned list. Each doclist is formatted as follows:
- #
- # [$docid ?$col[$off1 $off2...]?...]
- #
- # The formatting is odd for a Tcl command in order to be compatible with
- # the original C-language implementation. If argument WHERE is "1", then
- # any empty doclists are omitted from the returned list.
- #
- proc fts3_doclist {tbl term where} {
- fts3_read $tbl $where a
- foreach doclist $a($term) {
- set docid 0
- while {[string length $doclist]>0} {
- set iCol 0
- set iPos 0
- set lPos [list]
- set lCol [list]
- incr docid [gobble_varint doclist]
-
- while {[set iDelta [gobble_varint doclist]] > 0} {
- if {$iDelta == 1} {
- lappend lCol [list $iCol $lPos]
- set iPos 0
- set lPos [list]
- set iCol [gobble_varint doclist]
- } else {
- incr iPos $iDelta
- incr iPos -2
- lappend lPos $iPos
- }
- }
-
- if {[llength $lPos]>0} {
- lappend lCol [list $iCol $lPos]
- }
-
- if {$where != "1" || [llength $lCol]>0} {
- set ret($docid) $lCol
- } else {
- unset -nocomplain ret($docid)
- }
- }
- }
- set lDoc [list]
- foreach docid [lsort -integer [array names ret]] {
- set lCol [list]
- set cols ""
- foreach col $ret($docid) {
- foreach {iCol lPos} $col {}
- append cols " $iCol\[[join $lPos { }]\]"
- }
- lappend lDoc "\[${docid}${cols}\]"
- }
- join $lDoc " "
- }
- ###########################################################################
- proc gobble_varint {varname} {
- upvar $varname blob
- set n [read_fts3varint $blob ret]
- set blob [string range $blob $n end]
- return $ret
- }
- proc gobble_string {varname nLength} {
- upvar $varname blob
- set ret [string range $blob 0 [expr $nLength-1]]
- set blob [string range $blob $nLength end]
- return $ret
- }
- # The argument is a blob of data representing an FTS3 segment leaf.
- # Return a list consisting of alternating terms (strings) and doclists
- # (blobs of data).
- #
- proc fts3_readleaf {blob} {
- set zPrev ""
- set terms [list]
- while {[string length $blob] > 0} {
- set nPrefix [gobble_varint blob]
- set nSuffix [gobble_varint blob]
- set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
- append zTerm [gobble_string blob $nSuffix]
- set nDoclist [gobble_varint blob]
- set doclist [gobble_string blob $nDoclist]
- lappend terms $zTerm $doclist
- set zPrev $zTerm
- }
- return $terms
- }
- proc fts3_read2 {tbl where varname} {
- upvar $varname a
- array unset a
- db eval " SELECT start_block, leaves_end_block, root
- FROM ${tbl}_segdir WHERE $where
- ORDER BY level ASC, idx DESC
- " {
- set c 0
- binary scan $root c c
- if {$c==0} {
- foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
- } else {
- db eval " SELECT block
- FROM ${tbl}_segments
- WHERE blockid>=$start_block AND blockid<=$leaves_end_block
- ORDER BY blockid
- " {
- foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
- }
- }
- }
- }
- proc fts3_read {tbl where varname} {
- upvar $varname a
- array unset a
- db eval " SELECT start_block, leaves_end_block, root
- FROM ${tbl}_segdir WHERE $where
- ORDER BY level DESC, idx ASC
- " {
- if {$start_block == 0} {
- foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
- } else {
- db eval " SELECT block
- FROM ${tbl}_segments
- WHERE blockid>=$start_block AND blockid<$leaves_end_block
- ORDER BY blockid
- " {
- foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
- }
- }
- }
- }
- ##########################################################################
|