fts2_tokenizer.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. /*
  2. ** 2007 June 22
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** This is part of an SQLite module implementing full-text search.
  14. ** This particular file implements the generic tokenizer interface.
  15. */
  16. /*
  17. ** The code in this file is only compiled if:
  18. **
  19. ** * The FTS2 module is being built as an extension
  20. ** (in which case SQLITE_CORE is not defined), or
  21. **
  22. ** * The FTS2 module is being built into the core of
  23. ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
  24. */
  25. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
  26. #include "sqlite3.h"
  27. #include "sqlite3ext.h"
  28. SQLITE_EXTENSION_INIT3
  29. #include "fts2_hash.h"
  30. #include "fts2_tokenizer.h"
  31. #include <assert.h>
  32. /*
  33. ** Implementation of the SQL scalar function for accessing the underlying
  34. ** hash table. This function may be called as follows:
  35. **
  36. ** SELECT <function-name>(<key-name>);
  37. ** SELECT <function-name>(<key-name>, <pointer>);
  38. **
  39. ** where <function-name> is the name passed as the second argument
  40. ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
  41. **
  42. ** If the <pointer> argument is specified, it must be a blob value
  43. ** containing a pointer to be stored as the hash data corresponding
  44. ** to the string <key-name>. If <pointer> is not specified, then
  45. ** the string <key-name> must already exist in the has table. Otherwise,
  46. ** an error is returned.
  47. **
  48. ** Whether or not the <pointer> argument is specified, the value returned
  49. ** is a blob containing the pointer stored as the hash data corresponding
  50. ** to string <key-name> (after the hash-table is updated, if applicable).
  51. */
  52. static void scalarFunc(
  53. sqlite3_context *context,
  54. int argc,
  55. sqlite3_value **argv
  56. ){
  57. fts2Hash *pHash;
  58. void *pPtr = 0;
  59. const unsigned char *zName;
  60. int nName;
  61. assert( argc==1 || argc==2 );
  62. pHash = (fts2Hash *)sqlite3_user_data(context);
  63. zName = sqlite3_value_text(argv[0]);
  64. nName = sqlite3_value_bytes(argv[0])+1;
  65. if( argc==2 ){
  66. void *pOld;
  67. int n = sqlite3_value_bytes(argv[1]);
  68. if( n!=sizeof(pPtr) ){
  69. sqlite3_result_error(context, "argument type mismatch", -1);
  70. return;
  71. }
  72. pPtr = *(void **)sqlite3_value_blob(argv[1]);
  73. pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
  74. if( pOld==pPtr ){
  75. sqlite3_result_error(context, "out of memory", -1);
  76. return;
  77. }
  78. }else{
  79. pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
  80. if( !pPtr ){
  81. char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
  82. sqlite3_result_error(context, zErr, -1);
  83. sqlite3_free(zErr);
  84. return;
  85. }
  86. }
  87. sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
  88. }
  89. #ifdef SQLITE_TEST
  90. #include <tcl.h>
  91. #include <string.h>
  92. /*
  93. ** Implementation of a special SQL scalar function for testing tokenizers
  94. ** designed to be used in concert with the Tcl testing framework. This
  95. ** function must be called with two arguments:
  96. **
  97. ** SELECT <function-name>(<key-name>, <input-string>);
  98. ** SELECT <function-name>(<key-name>, <pointer>);
  99. **
  100. ** where <function-name> is the name passed as the second argument
  101. ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
  102. ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
  103. **
  104. ** The return value is a string that may be interpreted as a Tcl
  105. ** list. For each token in the <input-string>, three elements are
  106. ** added to the returned list. The first is the token position, the
  107. ** second is the token text (folded, stemmed, etc.) and the third is the
  108. ** substring of <input-string> associated with the token. For example,
  109. ** using the built-in "simple" tokenizer:
  110. **
  111. ** SELECT fts_tokenizer_test('simple', 'I don't see how');
  112. **
  113. ** will return the string:
  114. **
  115. ** "{0 i I 1 dont don't 2 see see 3 how how}"
  116. **
  117. */
  118. static void testFunc(
  119. sqlite3_context *context,
  120. int argc,
  121. sqlite3_value **argv
  122. ){
  123. fts2Hash *pHash;
  124. sqlite3_tokenizer_module *p;
  125. sqlite3_tokenizer *pTokenizer = 0;
  126. sqlite3_tokenizer_cursor *pCsr = 0;
  127. const char *zErr = 0;
  128. const char *zName;
  129. int nName;
  130. const char *zInput;
  131. int nInput;
  132. const char *zArg = 0;
  133. const char *zToken;
  134. int nToken;
  135. int iStart;
  136. int iEnd;
  137. int iPos;
  138. Tcl_Obj *pRet;
  139. assert( argc==2 || argc==3 );
  140. nName = sqlite3_value_bytes(argv[0]);
  141. zName = (const char *)sqlite3_value_text(argv[0]);
  142. nInput = sqlite3_value_bytes(argv[argc-1]);
  143. zInput = (const char *)sqlite3_value_text(argv[argc-1]);
  144. if( argc==3 ){
  145. zArg = (const char *)sqlite3_value_text(argv[1]);
  146. }
  147. pHash = (fts2Hash *)sqlite3_user_data(context);
  148. p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
  149. if( !p ){
  150. char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
  151. sqlite3_result_error(context, zErr, -1);
  152. sqlite3_free(zErr);
  153. return;
  154. }
  155. pRet = Tcl_NewObj();
  156. Tcl_IncrRefCount(pRet);
  157. if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
  158. zErr = "error in xCreate()";
  159. goto finish;
  160. }
  161. pTokenizer->pModule = p;
  162. if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
  163. zErr = "error in xOpen()";
  164. goto finish;
  165. }
  166. pCsr->pTokenizer = pTokenizer;
  167. while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
  168. Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
  169. Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  170. zToken = &zInput[iStart];
  171. nToken = iEnd-iStart;
  172. Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  173. }
  174. if( SQLITE_OK!=p->xClose(pCsr) ){
  175. zErr = "error in xClose()";
  176. goto finish;
  177. }
  178. if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
  179. zErr = "error in xDestroy()";
  180. goto finish;
  181. }
  182. finish:
  183. if( zErr ){
  184. sqlite3_result_error(context, zErr, -1);
  185. }else{
  186. sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
  187. }
  188. Tcl_DecrRefCount(pRet);
  189. }
  190. static
  191. int registerTokenizer(
  192. sqlite3 *db,
  193. char *zName,
  194. const sqlite3_tokenizer_module *p
  195. ){
  196. int rc;
  197. sqlite3_stmt *pStmt;
  198. const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
  199. rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  200. if( rc!=SQLITE_OK ){
  201. return rc;
  202. }
  203. sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  204. sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  205. sqlite3_step(pStmt);
  206. return sqlite3_finalize(pStmt);
  207. }
  208. static
  209. int queryFts2Tokenizer(
  210. sqlite3 *db,
  211. char *zName,
  212. const sqlite3_tokenizer_module **pp
  213. ){
  214. int rc;
  215. sqlite3_stmt *pStmt;
  216. const char zSql[] = "SELECT fts2_tokenizer(?)";
  217. *pp = 0;
  218. rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  219. if( rc!=SQLITE_OK ){
  220. return rc;
  221. }
  222. sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  223. if( SQLITE_ROW==sqlite3_step(pStmt) ){
  224. if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
  225. memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
  226. }
  227. }
  228. return sqlite3_finalize(pStmt);
  229. }
  230. void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
  231. /*
  232. ** Implementation of the scalar function fts2_tokenizer_internal_test().
  233. ** This function is used for testing only, it is not included in the
  234. ** build unless SQLITE_TEST is defined.
  235. **
  236. ** The purpose of this is to test that the fts2_tokenizer() function
  237. ** can be used as designed by the C-code in the queryFts2Tokenizer and
  238. ** registerTokenizer() functions above. These two functions are repeated
  239. ** in the README.tokenizer file as an example, so it is important to
  240. ** test them.
  241. **
  242. ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
  243. ** function with no arguments. An assert() will fail if a problem is
  244. ** detected. i.e.:
  245. **
  246. ** SELECT fts2_tokenizer_internal_test();
  247. **
  248. */
  249. static void intTestFunc(
  250. sqlite3_context *context,
  251. int argc,
  252. sqlite3_value **argv
  253. ){
  254. int rc;
  255. const sqlite3_tokenizer_module *p1;
  256. const sqlite3_tokenizer_module *p2;
  257. sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
  258. /* Test the query function */
  259. sqlite3Fts2SimpleTokenizerModule(&p1);
  260. rc = queryFts2Tokenizer(db, "simple", &p2);
  261. assert( rc==SQLITE_OK );
  262. assert( p1==p2 );
  263. rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
  264. assert( rc==SQLITE_ERROR );
  265. assert( p2==0 );
  266. assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
  267. /* Test the storage function */
  268. rc = registerTokenizer(db, "nosuchtokenizer", p1);
  269. assert( rc==SQLITE_OK );
  270. rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
  271. assert( rc==SQLITE_OK );
  272. assert( p2==p1 );
  273. sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
  274. }
  275. #endif
  276. /*
  277. ** Set up SQL objects in database db used to access the contents of
  278. ** the hash table pointed to by argument pHash. The hash table must
  279. ** been initialized to use string keys, and to take a private copy
  280. ** of the key when a value is inserted. i.e. by a call similar to:
  281. **
  282. ** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
  283. **
  284. ** This function adds a scalar function (see header comment above
  285. ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
  286. ** defined at compilation time, a temporary virtual table (see header
  287. ** comment above struct HashTableVtab) to the database schema. Both
  288. ** provide read/write access to the contents of *pHash.
  289. **
  290. ** The third argument to this function, zName, is used as the name
  291. ** of both the scalar and, if created, the virtual table.
  292. */
  293. int sqlite3Fts2InitHashTable(
  294. sqlite3 *db,
  295. fts2Hash *pHash,
  296. const char *zName
  297. ){
  298. int rc = SQLITE_OK;
  299. void *p = (void *)pHash;
  300. const int any = SQLITE_ANY;
  301. char *zTest = 0;
  302. char *zTest2 = 0;
  303. #ifdef SQLITE_TEST
  304. void *pdb = (void *)db;
  305. zTest = sqlite3_mprintf("%s_test", zName);
  306. zTest2 = sqlite3_mprintf("%s_internal_test", zName);
  307. if( !zTest || !zTest2 ){
  308. rc = SQLITE_NOMEM;
  309. }
  310. #endif
  311. if( rc!=SQLITE_OK
  312. || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
  313. || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
  314. #ifdef SQLITE_TEST
  315. || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
  316. || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
  317. || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
  318. #endif
  319. );
  320. sqlite3_free(zTest);
  321. sqlite3_free(zTest2);
  322. return rc;
  323. }
  324. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */