/*
** 2011 Jun 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file is not part of the production FTS code. It is only used for
** testing. It contains a Tcl command that can be used to test if a document
** matches an FTS NEAR expression.
**
** As of March 2012, it also contains a version 1 tokenizer used for testing
** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
*/

#include <tcl.h>
#include <string.h>
#include <assert.h>

#if defined(SQLITE_TEST)
#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)

/* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
#include "fts3Int.h"

#define NM_MAX_TOKEN 12

typedef struct NearPhrase NearPhrase;
typedef struct NearDocument NearDocument;
typedef struct NearToken NearToken;

struct NearDocument {
  int nToken;                     /* Number of tokens in the document */
  NearToken *aToken;              /* Token array */
};

struct NearToken {
  int n;                          /* Length of token in bytes */
  const char *z;                  /* Pointer to token string */
};

struct NearPhrase {
  int nNear;                      /* Preceding NEAR value */
  int nToken;                     /* Number of tokens in this phrase */
  NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
};

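/*
** Return non-zero if the first p->nToken entries of aToken[] match phrase p.
** A phrase token that ends in '*' is treated as a prefix match against the
** corresponding document token; all other phrase tokens must match exactly.
*/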
static int nm_phrase_match(
  NearPhrase *p,
  NearToken *aToken
){
  int ii;

  for(ii=0; ii<p->nToken; ii++){
    NearToken *pToken = &p->aToken[ii];
    if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
      if( aToken[ii].n<(pToken->n-1) ) return 0;
      if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
    }else{
      if( aToken[ii].n!=pToken->n ) return 0;
      if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
    }
  }

  return 1;
}

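/*
** Phrase aPhrase[iPhrase] has been found at token offset iPos of document
** pDoc. Check that every remaining phrase in the chain (moving forwards
** through aPhrase[] if iDir==+1, backwards if iDir==-1) can be matched
** within the window implied by the intervening nNear values. Return 1 if
** the chain can be satisfied, or 0 otherwise.
*/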
static int nm_near_chain(
  int iDir,                       /* Direction to iterate through aPhrase[] */
  NearDocument *pDoc,             /* Document to match against */
  int iPos,                       /* Position at which iPhrase was found */
  int nPhrase,                    /* Size of phrase array */
  NearPhrase *aPhrase,            /* Phrase array */
  int iPhrase                     /* Index of phrase found */
){
  int iStart;
  int iStop;
  int ii;
  int nNear;
  int iPhrase2;
  NearPhrase *p;
  NearPhrase *pPrev;

  assert( iDir==1 || iDir==-1 );

  if( iDir==1 ){
    if( (iPhrase+1)==nPhrase ) return 1;
    nNear = aPhrase[iPhrase+1].nNear;
  }else{
    if( iPhrase==0 ) return 1;
    nNear = aPhrase[iPhrase].nNear;
  }
  pPrev = &aPhrase[iPhrase];
  iPhrase2 = iPhrase+iDir;
  p = &aPhrase[iPhrase2];

  iStart = iPos - nNear - p->nToken;
  iStop = iPos + nNear + pPrev->nToken;

  if( iStart<0 ) iStart = 0;
  if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;

  for(ii=iStart; ii<=iStop; ii++){
    if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
      if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
    }
  }

  return 0;
}

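/*
** Return the number of occurrences of phrase aPhrase[iPhrase] in document
** pDoc for which the complete NEAR expression (all nPhrase phrases, with
** their configured NEAR distances) can also be satisfied.
*/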
static int nm_match_count(
  NearDocument *pDoc,             /* Document to match against */
  int nPhrase,                    /* Size of phrase array */
  NearPhrase *aPhrase,            /* Phrase array */
  int iPhrase                     /* Index of phrase to count matches for */
){
  int nOcc = 0;
  int ii;
  NearPhrase *p = &aPhrase[iPhrase];

  for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
    if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
      /* Test forward NEAR chain (i>iPhrase) */
      if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;

      /* Test reverse NEAR chain (i<iPhrase) */
      if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;

      /* This is a real match. Increment the counter. */
      nOcc++;
    }
  }

  return nOcc;
}

/*
** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
*/
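/*
** A sketch of the expected argument layout, inferred from the parsing code
** below (the concrete tokens and the variable name are illustrative only):
**
**   DOCUMENT is a Tcl list of document tokens. EXPR is a Tcl list that
**   alternates phrases (themselves lists of tokens) and NEAR distances,
**   e.g.
**
**     fts3_near_match {one two three four} {{one two} 1 {four}} \
**         -phrasecountvar counts
**
**   The command returns 1 if the document matches the NEAR expression and
**   0 otherwise. If -phrasecountvar is passed, a list containing the number
**   of matching occurrences of each phrase is stored in the named variable.
*/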
static int fts3_near_match_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  int nTotal = 0;
  int rc;
  int ii;
  int nPhrase;
  NearPhrase *aPhrase = 0;
  NearDocument doc = {0, 0};
  Tcl_Obj **apDocToken;
  Tcl_Obj *pRet;
  Tcl_Obj *pPhrasecount = 0;

  Tcl_Obj **apExprToken;
  int nExprToken;

  UNUSED_PARAMETER(clientData);

  /* Must have 3 or more arguments. */
  if( objc<3 || (objc%2)==0 ){
    Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
    rc = TCL_ERROR;
    goto near_match_out;
  }

  for(ii=3; ii<objc; ii+=2){
    enum NM_enum { NM_PHRASECOUNTS };
    struct TestnmSubcmd {
      char *zName;
      enum NM_enum eOpt;
    } aOpt[] = {
      { "-phrasecountvar", NM_PHRASECOUNTS },
      { 0, 0 }
    };
    int iOpt;
    if( Tcl_GetIndexFromObjStruct(
        interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
    ){
      return TCL_ERROR;
    }

    switch( aOpt[iOpt].eOpt ){
      case NM_PHRASECOUNTS:
        pPhrasecount = objv[ii+1];
        break;
    }
  }

  rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
  if( rc!=TCL_OK ) goto near_match_out;
  doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
  for(ii=0; ii<doc.nToken; ii++){
    doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
  }

  rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
  if( rc!=TCL_OK ) goto near_match_out;
  nPhrase = (nExprToken + 1) / 2;
  aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
  memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
  for(ii=0; ii<nPhrase; ii++){
    Tcl_Obj *pPhrase = apExprToken[ii*2];
    Tcl_Obj **apToken;
    int nToken;
    int jj;

    rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
    if( rc!=TCL_OK ) goto near_match_out;
    if( nToken>NM_MAX_TOKEN ){
      Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
      rc = TCL_ERROR;
      goto near_match_out;
    }
    for(jj=0; jj<nToken; jj++){
      NearToken *pT = &aPhrase[ii].aToken[jj];
      pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
    }
    aPhrase[ii].nToken = nToken;
  }
  for(ii=1; ii<nPhrase; ii++){
    Tcl_Obj *pNear = apExprToken[2*ii-1];
    int nNear;
    rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
    if( rc!=TCL_OK ) goto near_match_out;
    aPhrase[ii].nNear = nNear;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  for(ii=0; ii<nPhrase; ii++){
    int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
    Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
    nTotal += nOcc;
  }
  if( pPhrasecount ){
    Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
  }
  Tcl_DecrRefCount(pRet);
  Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));

 near_match_out:
  ckfree((char *)aPhrase);
  ckfree((char *)doc.aToken);
  return rc;
}

/*
** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
**
** Normally, FTS uses hard-coded values to determine the minimum doclist
** size eligible for incremental loading, and the size of the chunks loaded
** when a doclist is incrementally loaded. This command allows the built-in
** values to be overridden for testing purposes.
**
** If present, the first argument is the chunksize in bytes to load doclists
** in. The second argument is the minimum doclist size in bytes to use
** incremental loading with.
**
** Whether or not the arguments are present, this command returns a list of
** two integers - the initial chunksize and threshold when the command is
** invoked. This can be used to restore the default behavior after running
** tests. For example:
**
**   # Override incr-load settings for testing:
**   set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
**
**   .... run tests ....
**
**   # Restore initial incr-load settings:
**   eval fts3_configure_incr_load $cfg
*/
static int fts3_configure_incr_load_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
#ifdef SQLITE_ENABLE_FTS3
  extern int test_fts3_node_chunksize;
  extern int test_fts3_node_chunk_threshold;
  Tcl_Obj *pRet;

  if( objc!=1 && objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
    return TCL_ERROR;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);
  Tcl_ListObjAppendElement(
      interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
  Tcl_ListObjAppendElement(
      interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));

  if( objc==3 ){
    int iArg1;
    int iArg2;
    if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
     || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
    ){
      Tcl_DecrRefCount(pRet);
      return TCL_ERROR;
    }
    test_fts3_node_chunksize = iArg1;
    test_fts3_node_chunk_threshold = iArg2;
  }

  Tcl_SetObjResult(interp, pRet);
  Tcl_DecrRefCount(pRet);
#endif
  UNUSED_PARAMETER(clientData);
  return TCL_OK;
}

#ifdef SQLITE_ENABLE_FTS3
/**************************************************************************
** Beginning of test tokenizer code.
**
** For language 0, this tokenizer is similar to the default 'simple'
** tokenizer. For other languages L, the following:
**
**   * Odd numbered languages are case-sensitive. Even numbered
**     languages are not.
**
**   * Language ids 100 or greater are considered an error.
**
** The implementation assumes that the input contains only ASCII characters
** (i.e. those that may be encoded in UTF-8 using a single byte).
*/
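/*
** A sketch of how a test script might exercise this tokenizer, assuming
** that the two-argument fts3_tokenizer() SQL function is enabled in the
** build under test (the names "testtokenizer", "t1" and the db handle are
** illustrative only):
**
**   set ptr [fts3_test_tokenizer]
**   db eval { SELECT fts3_tokenizer('testtokenizer', $ptr) }
**   db eval {
**     CREATE VIRTUAL TABLE t1 USING fts4(
**       x, tokenize=testtokenizer, languageid=lid
**     );
**   }
*/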
typedef struct test_tokenizer {
  sqlite3_tokenizer base;
} test_tokenizer;

typedef struct test_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *aInput;          /* Input being tokenized */
  int nInput;                  /* Size of the input in bytes */
  int iInput;                  /* Current offset in aInput */
  int iToken;                  /* Index of next token to be returned */
  char *aBuffer;               /* Buffer containing current token */
  int nBuffer;                 /* Number of bytes allocated at aBuffer */
  int iLangid;                 /* Configured language id */
} test_tokenizer_cursor;

static int testTokenizerCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  test_tokenizer *pNew;
  UNUSED_PARAMETER(argc);
  UNUSED_PARAMETER(argv);

  pNew = sqlite3_malloc(sizeof(test_tokenizer));
  if( !pNew ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(test_tokenizer));

  *ppTokenizer = (sqlite3_tokenizer *)pNew;
  return SQLITE_OK;
}

static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
  test_tokenizer *p = (test_tokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}

static int testTokenizerOpen(
  sqlite3_tokenizer *pTokenizer,       /* The tokenizer */
  const char *pInput, int nBytes,      /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor  /* OUT: Tokenization cursor */
){
  int rc = SQLITE_OK;                  /* Return code */
  test_tokenizer_cursor *pCsr;         /* New cursor object */

  UNUSED_PARAMETER(pTokenizer);

  pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
  if( pCsr==0 ){
    rc = SQLITE_NOMEM;
  }else{
    memset(pCsr, 0, sizeof(test_tokenizer_cursor));
    pCsr->aInput = pInput;
    if( nBytes<0 ){
      pCsr->nInput = (int)strlen(pInput);
    }else{
      pCsr->nInput = nBytes;
    }
  }

  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return rc;
}

static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  sqlite3_free(pCsr->aBuffer);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

static int testIsTokenChar(char c){
  return (c>='a' && c<='z') || (c>='A' && c<='Z');
}

static int testTolower(char c){
  char ret = c;
  if( ret>='A' && ret<='Z' ) ret = ret - ('A'-'a');
  return ret;
}

static int testTokenizerNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by testTokenizerOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  int rc = SQLITE_OK;
  const char *p;
  const char *pEnd;

  p = &pCsr->aInput[pCsr->iInput];
  pEnd = &pCsr->aInput[pCsr->nInput];

  /* Skip past any delimiter (non-token) characters */
  assert( p<=pEnd );
  while( p<pEnd && testIsTokenChar(*p)==0 ) p++;

  if( p==pEnd ){
    rc = SQLITE_DONE;
  }else{
    /* Advance to the end of the token */
    const char *pToken = p;
    int nToken;
    while( p<pEnd && testIsTokenChar(*p) ) p++;
    nToken = (int)(p-pToken);

    /* Copy the token into the buffer */
    if( nToken>pCsr->nBuffer ){
      sqlite3_free(pCsr->aBuffer);
      pCsr->aBuffer = sqlite3_malloc(nToken);
    }
    if( pCsr->aBuffer==0 ){
      rc = SQLITE_NOMEM;
    }else{
      int i;
      if( pCsr->iLangid & 0x00000001 ){
        /* Odd language ids are case-sensitive: copy the token verbatim */
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
      }else{
        /* Even language ids fold the token to lower case */
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]);
      }
      pCsr->iToken++;
      pCsr->iInput = (int)(p - pCsr->aInput);

      *ppToken = pCsr->aBuffer;
      *pnBytes = nToken;
      *piStartOffset = (int)(pToken - pCsr->aInput);
      *piEndOffset = (int)(p - pCsr->aInput);
      *piPosition = pCsr->iToken;
    }
  }

  return rc;
}

static int testTokenizerLanguage(
  sqlite3_tokenizer_cursor *pCursor,
  int iLangid
){
  int rc = SQLITE_OK;
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  pCsr->iLangid = iLangid;
  if( pCsr->iLangid>=100 ){
    rc = SQLITE_ERROR;
  }
  return rc;
}
#endif

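/*
** Tclcmd: fts3_test_tokenizer
**
** Return a blob containing a pointer to the sqlite3_tokenizer_module
** structure for the test tokenizer defined above.
*/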
static int fts3_test_tokenizer_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
#ifdef SQLITE_ENABLE_FTS3
  static const sqlite3_tokenizer_module testTokenizerModule = {
    1,
    testTokenizerCreate,
    testTokenizerDestroy,
    testTokenizerOpen,
    testTokenizerClose,
    testTokenizerNext,
    testTokenizerLanguage
  };
  const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
  if( objc!=1 ){
    Tcl_WrongNumArgs(interp, 1, objv, "");
    return TCL_ERROR;
  }
  Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
    (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
  ));
#endif
  UNUSED_PARAMETER(clientData);
  return TCL_OK;
}

/*
** End of tokenizer code.
**************************************************************************/

int Sqlitetestfts3_Init(Tcl_Interp *interp){
  Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
  Tcl_CreateObjCommand(interp,
      "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
  );
  Tcl_CreateObjCommand(
      interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
  );
  return TCL_OK;
}
#endif                  /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
#endif                  /* ifdef SQLITE_TEST */