1
0

fts3_common.tcl 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. # 2009 November 04
  2. #
  3. # The author disclaims copyright to this source code. In place of
  4. # a legal notice, here is a blessing:
  5. #
  6. # May you do good and not evil.
  7. # May you find forgiveness for yourself and forgive others.
  8. # May you share freely, never taking more than you give.
  9. #
  10. #***********************************************************************
  11. #
  12. # This file contains common code used the fts3 tests. At one point
  13. # equivalent functionality was implemented in C code. But it is easier
  14. # to use Tcl.
  15. #
  16. #-------------------------------------------------------------------------
  17. # INSTRUCTIONS
  18. #
  19. # The following commands are available:
  20. #
  21. # fts3_build_db_1 N
  22. # Using database handle [db] create an FTS4 table named t1 and populate
  23. # it with N rows of data. N must be less than 10,000. Refer to the
  24. # header comments above the proc implementation below for details.
  25. #
  26. # fts3_build_db_2 N
  27. # Using database handle [db] create an FTS4 table named t2 and populate
  28. # it with N rows of data. N must be less than 100,000. Refer to the
  29. # header comments above the proc implementation below for details.
  30. #
  31. # fts3_integrity_check TBL
  32. # TBL must be an FTS table in the database currently opened by handle
  33. # [db]. This proc loads and tokenizes all documents within the table,
  34. # then checks that the current contents of the FTS index matches the
  35. # results.
  36. #
  37. # fts3_terms TBL WHERE
  38. # Todo.
  39. #
  40. # fts3_doclist TBL TERM WHERE
  41. # Todo.
  42. #
  43. #
  44. #
  45. #-------------------------------------------------------------------------
  46. # USAGE: fts3_build_db_1 SWITCHES N
  47. #
  48. # Build a sample FTS table in the database opened by database connection
  49. # [db]. The name of the new table is "t1".
  50. #
  51. proc fts3_build_db_1 {args} {
  52. set default(-module) fts4
  53. set nArg [llength $args]
  54. if {($nArg%2)==0} {
  55. error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
  56. }
  57. set n [lindex $args [expr $nArg-1]]
  58. array set opts [array get default]
  59. array set opts [lrange $args 0 [expr $nArg-2]]
  60. foreach k [array names opts] {
  61. if {0==[info exists default($k)]} { error "unknown option: $k" }
  62. }
  63. if {$n > 10000} {error "n must be <= 10000"}
  64. db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)"
  65. set xwords [list zero one two three four five six seven eight nine ten]
  66. set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
  67. for {set i 0} {$i < $n} {incr i} {
  68. set x ""
  69. set y ""
  70. set x [list]
  71. lappend x [lindex $xwords [expr ($i / 1000) % 10]]
  72. lappend x [lindex $xwords [expr ($i / 100) % 10]]
  73. lappend x [lindex $xwords [expr ($i / 10) % 10]]
  74. lappend x [lindex $xwords [expr ($i / 1) % 10]]
  75. set y [list]
  76. lappend y [lindex $ywords [expr ($i / 1000) % 10]]
  77. lappend y [lindex $ywords [expr ($i / 100) % 10]]
  78. lappend y [lindex $ywords [expr ($i / 10) % 10]]
  79. lappend y [lindex $ywords [expr ($i / 1) % 10]]
  80. db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) }
  81. }
  82. }
  83. #-------------------------------------------------------------------------
  84. # USAGE: fts3_build_db_2 N ARGS
  85. #
  86. # Build a sample FTS table in the database opened by database connection
  87. # [db]. The name of the new table is "t2".
  88. #
  89. proc fts3_build_db_2 {args} {
  90. set default(-module) fts4
  91. set default(-extra) ""
  92. set nArg [llength $args]
  93. if {($nArg%2)==0} {
  94. error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
  95. }
  96. set n [lindex $args [expr $nArg-1]]
  97. array set opts [array get default]
  98. array set opts [lrange $args 0 [expr $nArg-2]]
  99. foreach k [array names opts] {
  100. if {0==[info exists default($k)]} { error "unknown option: $k" }
  101. }
  102. if {$n > 100000} {error "n must be <= 100000"}
  103. set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content"
  104. if {$opts(-extra) != ""} {
  105. append sql ", " $opts(-extra)
  106. }
  107. append sql ")"
  108. db eval $sql
  109. set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""]
  110. for {set i 0} {$i < $n} {incr i} {
  111. set word ""
  112. set nChar [llength $chars]
  113. append word [lindex $chars [expr {($i / 1) % $nChar}]]
  114. append word [lindex $chars [expr {($i / $nChar) % $nChar}]]
  115. append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]]
  116. db eval { INSERT INTO t2(docid, content) VALUES($i, $word) }
  117. }
  118. }
  119. #-------------------------------------------------------------------------
  120. # USAGE: fts3_integrity_check TBL
  121. #
  122. # This proc is used to verify that the full-text index is consistent with
  123. # the contents of the fts3 table. In other words, it checks that the
  124. # data in the %_contents table matches that in the %_segdir and %_segments
  125. # tables.
  126. #
  127. # This is not an efficient procedure. It uses a lot of memory and a lot
  128. # of CPU. But it is better than not checking at all.
  129. #
  130. # The procedure is:
  131. #
  132. # 1) Read the entire full-text index from the %_segdir and %_segments
  133. # tables into memory. For each entry in the index, the following is
  134. # done:
  135. #
  136. # set C($iDocid,$iCol,$iPosition) $zTerm
  137. #
  138. # 2) Iterate through each column of each row of the %_content table.
  139. # Tokenize all documents, and check that for each token there is
  140. # a corresponding entry in the $C array. After checking a token,
  141. # [unset] the $C array entry.
  142. #
  143. # 3) Check that array $C is now empty.
  144. #
  145. #
  146. proc fts3_integrity_check {tbl} {
  147. fts3_read2 $tbl 1 A
  148. foreach zTerm [array names A] {
  149. #puts $zTerm
  150. foreach doclist $A($zTerm) {
  151. set docid 0
  152. while {[string length $doclist]>0} {
  153. set iCol 0
  154. set iPos 0
  155. set lPos [list]
  156. set lCol [list]
  157. # First varint of a doclist-entry is the docid. Delta-compressed
  158. # with respect to the docid of the previous entry.
  159. #
  160. incr docid [gobble_varint doclist]
  161. if {[info exists D($zTerm,$docid)]} {
  162. while {[set iDelta [gobble_varint doclist]] != 0} {}
  163. continue
  164. }
  165. set D($zTerm,$docid) 1
  166. # Gobble varints until the 0x00 that terminates the doclist-entry
  167. # is found.
  168. while {[set iDelta [gobble_varint doclist]] > 0} {
  169. if {$iDelta == 1} {
  170. set iCol [gobble_varint doclist]
  171. set iPos 0
  172. } else {
  173. incr iPos $iDelta
  174. incr iPos -2
  175. set C($docid,$iCol,$iPos) $zTerm
  176. }
  177. }
  178. }
  179. }
  180. }
  181. foreach key [array names C] {
  182. #puts "$key -> $C($key)"
  183. }
  184. db eval "SELECT * FROM ${tbl}_content" E {
  185. set iCol 0
  186. set iDoc $E(docid)
  187. foreach col [lrange $E(*) 1 end] {
  188. set c $E($col)
  189. set sql {SELECT fts3_tokenizer_test('simple', $c)}
  190. foreach {pos term dummy} [db one $sql] {
  191. if {![info exists C($iDoc,$iCol,$pos)]} {
  192. set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
  193. lappend errors $es
  194. } else {
  195. if {[string compare $C($iDoc,$iCol,$pos) $term]} {
  196. set es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
  197. append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
  198. lappend errors $es
  199. }
  200. unset C($iDoc,$iCol,$pos)
  201. }
  202. }
  203. incr iCol
  204. }
  205. }
  206. foreach c [array names C] {
  207. lappend errors "Bad index entry: $c -> $C($c)"
  208. }
  209. if {[info exists errors]} { return [join $errors "\n"] }
  210. return "ok"
  211. }
  212. # USAGE: fts3_terms TBL WHERE
  213. #
  214. # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
  215. # SQL expression that will be used as the WHERE clause when scanning
  216. # the %_segdir table. As in the following query:
  217. #
  218. # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
  219. #
  220. # This function returns a list of all terms present in the segments
  221. # selected by the statement above.
  222. #
  223. proc fts3_terms {tbl where} {
  224. fts3_read $tbl $where a
  225. return [lsort [array names a]]
  226. }
  227. # USAGE: fts3_doclist TBL TERM WHERE
  228. #
  229. # Argument TBL must be the name of an FTS3 table. TERM is a term that may
  230. # or may not be present in the table. Argument WHERE is used to select a
  231. # subset of the b-tree segments in the associated full-text index as
  232. # described above for [fts3_terms].
  233. #
  234. # This function returns the results of merging the doclists associated
  235. # with TERM in the selected segments. Each doclist is an element of the
  236. # returned list. Each doclist is formatted as follows:
  237. #
  238. # [$docid ?$col[$off1 $off2...]?...]
  239. #
  240. # The formatting is odd for a Tcl command in order to be compatible with
  241. # the original C-language implementation. If argument WHERE is "1", then
  242. # any empty doclists are omitted from the returned list.
  243. #
  244. proc fts3_doclist {tbl term where} {
  245. fts3_read $tbl $where a
  246. foreach doclist $a($term) {
  247. set docid 0
  248. while {[string length $doclist]>0} {
  249. set iCol 0
  250. set iPos 0
  251. set lPos [list]
  252. set lCol [list]
  253. incr docid [gobble_varint doclist]
  254. while {[set iDelta [gobble_varint doclist]] > 0} {
  255. if {$iDelta == 1} {
  256. lappend lCol [list $iCol $lPos]
  257. set iPos 0
  258. set lPos [list]
  259. set iCol [gobble_varint doclist]
  260. } else {
  261. incr iPos $iDelta
  262. incr iPos -2
  263. lappend lPos $iPos
  264. }
  265. }
  266. if {[llength $lPos]>0} {
  267. lappend lCol [list $iCol $lPos]
  268. }
  269. if {$where != "1" || [llength $lCol]>0} {
  270. set ret($docid) $lCol
  271. } else {
  272. unset -nocomplain ret($docid)
  273. }
  274. }
  275. }
  276. set lDoc [list]
  277. foreach docid [lsort -integer [array names ret]] {
  278. set lCol [list]
  279. set cols ""
  280. foreach col $ret($docid) {
  281. foreach {iCol lPos} $col {}
  282. append cols " $iCol\[[join $lPos { }]\]"
  283. }
  284. lappend lDoc "\[${docid}${cols}\]"
  285. }
  286. join $lDoc " "
  287. }
  288. ###########################################################################
  289. proc gobble_varint {varname} {
  290. upvar $varname blob
  291. set n [read_fts3varint $blob ret]
  292. set blob [string range $blob $n end]
  293. return $ret
  294. }
  295. proc gobble_string {varname nLength} {
  296. upvar $varname blob
  297. set ret [string range $blob 0 [expr $nLength-1]]
  298. set blob [string range $blob $nLength end]
  299. return $ret
  300. }
  301. # The argument is a blob of data representing an FTS3 segment leaf.
  302. # Return a list consisting of alternating terms (strings) and doclists
  303. # (blobs of data).
  304. #
  305. proc fts3_readleaf {blob} {
  306. set zPrev ""
  307. set terms [list]
  308. while {[string length $blob] > 0} {
  309. set nPrefix [gobble_varint blob]
  310. set nSuffix [gobble_varint blob]
  311. set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
  312. append zTerm [gobble_string blob $nSuffix]
  313. set nDoclist [gobble_varint blob]
  314. set doclist [gobble_string blob $nDoclist]
  315. lappend terms $zTerm $doclist
  316. set zPrev $zTerm
  317. }
  318. return $terms
  319. }
  320. proc fts3_read2 {tbl where varname} {
  321. upvar $varname a
  322. array unset a
  323. db eval " SELECT start_block, leaves_end_block, root
  324. FROM ${tbl}_segdir WHERE $where
  325. ORDER BY level ASC, idx DESC
  326. " {
  327. set c 0
  328. binary scan $root c c
  329. if {$c==0} {
  330. foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
  331. } else {
  332. db eval " SELECT block
  333. FROM ${tbl}_segments
  334. WHERE blockid>=$start_block AND blockid<=$leaves_end_block
  335. ORDER BY blockid
  336. " {
  337. foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
  338. }
  339. }
  340. }
  341. }
  342. proc fts3_read {tbl where varname} {
  343. upvar $varname a
  344. array unset a
  345. db eval " SELECT start_block, leaves_end_block, root
  346. FROM ${tbl}_segdir WHERE $where
  347. ORDER BY level DESC, idx ASC
  348. " {
  349. if {$start_block == 0} {
  350. foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
  351. } else {
  352. db eval " SELECT block
  353. FROM ${tbl}_segments
  354. WHERE blockid>=$start_block AND blockid<$leaves_end_block
  355. ORDER BY blockid
  356. " {
  357. foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
  358. }
  359. }
  360. }
  361. }
  362. ##########################################################################