fts3_snippet.c 50 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521
  1. /*
  2. ** 2009 Oct 23
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. */
  13. #include "fts3Int.h"
  14. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
  15. #include <string.h>
  16. #include <assert.h>
  17. /*
  18. ** Characters that may appear in the second argument to matchinfo().
  19. */
  20. #define FTS3_MATCHINFO_NPHRASE 'p' /* 1 value */
  21. #define FTS3_MATCHINFO_NCOL 'c' /* 1 value */
  22. #define FTS3_MATCHINFO_NDOC 'n' /* 1 value */
  23. #define FTS3_MATCHINFO_AVGLENGTH 'a' /* nCol values */
  24. #define FTS3_MATCHINFO_LENGTH 'l' /* nCol values */
  25. #define FTS3_MATCHINFO_LCS 's' /* nCol values */
  26. #define FTS3_MATCHINFO_HITS 'x' /* 3*nCol*nPhrase values */
  27. /*
  28. ** The default value for the second argument to matchinfo().
  29. */
  30. #define FTS3_MATCHINFO_DEFAULT "pcx"
  31. /*
  32. ** Used as an fts3ExprIterate() context when loading phrase doclists to
  33. ** Fts3Expr.aDoclist[]/nDoclist.
  34. */
  35. typedef struct LoadDoclistCtx LoadDoclistCtx;
  36. struct LoadDoclistCtx {
  37. Fts3Cursor *pCsr; /* FTS3 Cursor */
  38. int nPhrase; /* Number of phrases seen so far */
  39. int nToken; /* Number of tokens seen so far */
  40. };
  41. /*
  42. ** The following types are used as part of the implementation of the
  43. ** fts3BestSnippet() routine.
  44. */
  45. typedef struct SnippetIter SnippetIter;
  46. typedef struct SnippetPhrase SnippetPhrase;
  47. typedef struct SnippetFragment SnippetFragment;
  48. struct SnippetIter {
  49. Fts3Cursor *pCsr; /* Cursor snippet is being generated from */
  50. int iCol; /* Extract snippet from this column */
  51. int nSnippet; /* Requested snippet length (in tokens) */
  52. int nPhrase; /* Number of phrases in query */
  53. SnippetPhrase *aPhrase; /* Array of size nPhrase */
  54. int iCurrent; /* First token of current snippet */
  55. };
  56. struct SnippetPhrase {
  57. int nToken; /* Number of tokens in phrase */
  58. char *pList; /* Pointer to start of phrase position list */
  59. int iHead; /* Next value in position list */
  60. char *pHead; /* Position list data following iHead */
  61. int iTail; /* Next value in trailing position list */
  62. char *pTail; /* Position list data following iTail */
  63. };
  64. struct SnippetFragment {
  65. int iCol; /* Column snippet is extracted from */
  66. int iPos; /* Index of first token in snippet */
  67. u64 covered; /* Mask of query phrases covered */
  68. u64 hlmask; /* Mask of snippet terms to highlight */
  69. };
  70. /*
  71. ** This type is used as an fts3ExprIterate() context object while
  72. ** accumulating the data returned by the matchinfo() function.
  73. */
  74. typedef struct MatchInfo MatchInfo;
  75. struct MatchInfo {
  76. Fts3Cursor *pCursor; /* FTS3 Cursor */
  77. int nCol; /* Number of columns in table */
  78. int nPhrase; /* Number of matchable phrases in query */
  79. sqlite3_int64 nDoc; /* Number of docs in database */
  80. u32 *aMatchinfo; /* Pre-allocated buffer */
  81. };
  82. /*
  83. ** The snippet() and offsets() functions both return text values. An instance
  84. ** of the following structure is used to accumulate those values while the
  85. ** functions are running. See fts3StringAppend() for details.
  86. */
  87. typedef struct StrBuffer StrBuffer;
  88. struct StrBuffer {
  89. char *z; /* Pointer to buffer containing string */
  90. int n; /* Length of z in bytes (excl. nul-term) */
  91. int nAlloc; /* Allocated size of buffer z in bytes */
  92. };
  93. /*
  94. ** This function is used to help iterate through a position-list. A position
  95. ** list is a list of unique integers, sorted from smallest to largest. Each
  96. ** element of the list is represented by an FTS3 varint that takes the value
  97. ** of the difference between the current element and the previous one plus
  98. ** two. For example, to store the position-list:
  99. **
  100. ** 4 9 113
  101. **
  102. ** the three varints:
  103. **
  104. ** 6 7 106
  105. **
  106. ** are encoded.
  107. **
  108. ** When this function is called, *pp points to the start of an element of
  109. ** the list. *piPos contains the value of the previous entry in the list.
  110. ** After it returns, *piPos contains the value of the next element of the
  111. ** list and *pp is advanced to the following varint.
  112. */
  113. static void fts3GetDeltaPosition(char **pp, int *piPos){
  114. int iVal;
  115. *pp += sqlite3Fts3GetVarint32(*pp, &iVal);
  116. *piPos += (iVal-2);
  117. }
  118. /*
  119. ** Helper function for fts3ExprIterate() (see below).
  120. */
  121. static int fts3ExprIterate2(
  122. Fts3Expr *pExpr, /* Expression to iterate phrases of */
  123. int *piPhrase, /* Pointer to phrase counter */
  124. int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
  125. void *pCtx /* Second argument to pass to callback */
  126. ){
  127. int rc; /* Return code */
  128. int eType = pExpr->eType; /* Type of expression node pExpr */
  129. if( eType!=FTSQUERY_PHRASE ){
  130. assert( pExpr->pLeft && pExpr->pRight );
  131. rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx);
  132. if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){
  133. rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx);
  134. }
  135. }else{
  136. rc = x(pExpr, *piPhrase, pCtx);
  137. (*piPhrase)++;
  138. }
  139. return rc;
  140. }
  141. /*
  142. ** Iterate through all phrase nodes in an FTS3 query, except those that
  143. ** are part of a sub-tree that is the right-hand-side of a NOT operator.
  144. ** For each phrase node found, the supplied callback function is invoked.
  145. **
  146. ** If the callback function returns anything other than SQLITE_OK,
  147. ** the iteration is abandoned and the error code returned immediately.
  148. ** Otherwise, SQLITE_OK is returned after a callback has been made for
  149. ** all eligible phrase nodes.
  150. */
  151. static int fts3ExprIterate(
  152. Fts3Expr *pExpr, /* Expression to iterate phrases of */
  153. int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
  154. void *pCtx /* Second argument to pass to callback */
  155. ){
  156. int iPhrase = 0; /* Variable used as the phrase counter */
  157. return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx);
  158. }
  159. /*
  160. ** This is an fts3ExprIterate() callback used while loading the doclists
  161. ** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also
  162. ** fts3ExprLoadDoclists().
  163. */
  164. static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
  165. int rc = SQLITE_OK;
  166. Fts3Phrase *pPhrase = pExpr->pPhrase;
  167. LoadDoclistCtx *p = (LoadDoclistCtx *)ctx;
  168. UNUSED_PARAMETER(iPhrase);
  169. p->nPhrase++;
  170. p->nToken += pPhrase->nToken;
  171. return rc;
  172. }
  173. /*
  174. ** Load the doclists for each phrase in the query associated with FTS3 cursor
  175. ** pCsr.
  176. **
  177. ** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable
  178. ** phrases in the expression (all phrases except those directly or
  179. ** indirectly descended from the right-hand-side of a NOT operator). If
  180. ** pnToken is not NULL, then it is set to the number of tokens in all
  181. ** matchable phrases of the expression.
  182. */
  183. static int fts3ExprLoadDoclists(
  184. Fts3Cursor *pCsr, /* Fts3 cursor for current query */
  185. int *pnPhrase, /* OUT: Number of phrases in query */
  186. int *pnToken /* OUT: Number of tokens in query */
  187. ){
  188. int rc; /* Return Code */
  189. LoadDoclistCtx sCtx = {0,0,0}; /* Context for fts3ExprIterate() */
  190. sCtx.pCsr = pCsr;
  191. rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx);
  192. if( pnPhrase ) *pnPhrase = sCtx.nPhrase;
  193. if( pnToken ) *pnToken = sCtx.nToken;
  194. return rc;
  195. }
  196. static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
  197. (*(int *)ctx)++;
  198. UNUSED_PARAMETER(pExpr);
  199. UNUSED_PARAMETER(iPhrase);
  200. return SQLITE_OK;
  201. }
  202. static int fts3ExprPhraseCount(Fts3Expr *pExpr){
  203. int nPhrase = 0;
  204. (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nPhrase);
  205. return nPhrase;
  206. }
  207. /*
  208. ** Advance the position list iterator specified by the first two
  209. ** arguments so that it points to the first element with a value greater
  210. ** than or equal to parameter iNext.
  211. */
  212. static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){
  213. char *pIter = *ppIter;
  214. if( pIter ){
  215. int iIter = *piIter;
  216. while( iIter<iNext ){
  217. if( 0==(*pIter & 0xFE) ){
  218. iIter = -1;
  219. pIter = 0;
  220. break;
  221. }
  222. fts3GetDeltaPosition(&pIter, &iIter);
  223. }
  224. *piIter = iIter;
  225. *ppIter = pIter;
  226. }
  227. }
  228. /*
  229. ** Advance the snippet iterator to the next candidate snippet.
  230. */
  231. static int fts3SnippetNextCandidate(SnippetIter *pIter){
  232. int i; /* Loop counter */
  233. if( pIter->iCurrent<0 ){
  234. /* The SnippetIter object has just been initialized. The first snippet
  235. ** candidate always starts at offset 0 (even if this candidate has a
  236. ** score of 0.0).
  237. */
  238. pIter->iCurrent = 0;
  239. /* Advance the 'head' iterator of each phrase to the first offset that
  240. ** is greater than or equal to (iNext+nSnippet).
  241. */
  242. for(i=0; i<pIter->nPhrase; i++){
  243. SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  244. fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet);
  245. }
  246. }else{
  247. int iStart;
  248. int iEnd = 0x7FFFFFFF;
  249. for(i=0; i<pIter->nPhrase; i++){
  250. SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  251. if( pPhrase->pHead && pPhrase->iHead<iEnd ){
  252. iEnd = pPhrase->iHead;
  253. }
  254. }
  255. if( iEnd==0x7FFFFFFF ){
  256. return 1;
  257. }
  258. pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1;
  259. for(i=0; i<pIter->nPhrase; i++){
  260. SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  261. fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1);
  262. fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart);
  263. }
  264. }
  265. return 0;
  266. }
  267. /*
  268. ** Retrieve information about the current candidate snippet of snippet
  269. ** iterator pIter.
  270. */
  271. static void fts3SnippetDetails(
  272. SnippetIter *pIter, /* Snippet iterator */
  273. u64 mCovered, /* Bitmask of phrases already covered */
  274. int *piToken, /* OUT: First token of proposed snippet */
  275. int *piScore, /* OUT: "Score" for this snippet */
  276. u64 *pmCover, /* OUT: Bitmask of phrases covered */
  277. u64 *pmHighlight /* OUT: Bitmask of terms to highlight */
  278. ){
  279. int iStart = pIter->iCurrent; /* First token of snippet */
  280. int iScore = 0; /* Score of this snippet */
  281. int i; /* Loop counter */
  282. u64 mCover = 0; /* Mask of phrases covered by this snippet */
  283. u64 mHighlight = 0; /* Mask of tokens to highlight in snippet */
  284. for(i=0; i<pIter->nPhrase; i++){
  285. SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  286. if( pPhrase->pTail ){
  287. char *pCsr = pPhrase->pTail;
  288. int iCsr = pPhrase->iTail;
  289. while( iCsr<(iStart+pIter->nSnippet) ){
  290. int j;
  291. u64 mPhrase = (u64)1 << i;
  292. u64 mPos = (u64)1 << (iCsr - iStart);
  293. assert( iCsr>=iStart );
  294. if( (mCover|mCovered)&mPhrase ){
  295. iScore++;
  296. }else{
  297. iScore += 1000;
  298. }
  299. mCover |= mPhrase;
  300. for(j=0; j<pPhrase->nToken; j++){
  301. mHighlight |= (mPos>>j);
  302. }
  303. if( 0==(*pCsr & 0x0FE) ) break;
  304. fts3GetDeltaPosition(&pCsr, &iCsr);
  305. }
  306. }
  307. }
  308. /* Set the output variables before returning. */
  309. *piToken = iStart;
  310. *piScore = iScore;
  311. *pmCover = mCover;
  312. *pmHighlight = mHighlight;
  313. }
  314. /*
  315. ** This function is an fts3ExprIterate() callback used by fts3BestSnippet().
  316. ** Each invocation populates an element of the SnippetIter.aPhrase[] array.
  317. */
  318. static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){
  319. SnippetIter *p = (SnippetIter *)ctx;
  320. SnippetPhrase *pPhrase = &p->aPhrase[iPhrase];
  321. char *pCsr;
  322. int rc;
  323. pPhrase->nToken = pExpr->pPhrase->nToken;
  324. rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pCsr);
  325. assert( rc==SQLITE_OK || pCsr==0 );
  326. if( pCsr ){
  327. int iFirst = 0;
  328. pPhrase->pList = pCsr;
  329. fts3GetDeltaPosition(&pCsr, &iFirst);
  330. assert( iFirst>=0 );
  331. pPhrase->pHead = pCsr;
  332. pPhrase->pTail = pCsr;
  333. pPhrase->iHead = iFirst;
  334. pPhrase->iTail = iFirst;
  335. }else{
  336. assert( rc!=SQLITE_OK || (
  337. pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0
  338. ));
  339. }
  340. return rc;
  341. }
  342. /*
  343. ** Select the fragment of text consisting of nFragment contiguous tokens
  344. ** from column iCol that represent the "best" snippet. The best snippet
  345. ** is the snippet with the highest score, where scores are calculated
  346. ** by adding:
  347. **
  348. ** (a) +1 point for each occurrence of a matchable phrase in the snippet.
  349. **
  350. ** (b) +1000 points for the first occurrence of each matchable phrase in
  351. ** the snippet for which the corresponding mCovered bit is not set.
  352. **
  353. ** The selected snippet parameters are stored in structure *pFragment before
  354. ** returning. The score of the selected snippet is stored in *piScore
  355. ** before returning.
  356. */
  357. static int fts3BestSnippet(
  358. int nSnippet, /* Desired snippet length */
  359. Fts3Cursor *pCsr, /* Cursor to create snippet for */
  360. int iCol, /* Index of column to create snippet from */
  361. u64 mCovered, /* Mask of phrases already covered */
  362. u64 *pmSeen, /* IN/OUT: Mask of phrases seen */
  363. SnippetFragment *pFragment, /* OUT: Best snippet found */
  364. int *piScore /* OUT: Score of snippet pFragment */
  365. ){
  366. int rc; /* Return Code */
  367. int nList; /* Number of phrases in expression */
  368. SnippetIter sIter; /* Iterates through snippet candidates */
  369. int nByte; /* Number of bytes of space to allocate */
  370. int iBestScore = -1; /* Best snippet score found so far */
  371. int i; /* Loop counter */
  372. memset(&sIter, 0, sizeof(sIter));
  373. /* Iterate through the phrases in the expression to count them. The same
  374. ** callback makes sure the doclists are loaded for each phrase.
  375. */
  376. rc = fts3ExprLoadDoclists(pCsr, &nList, 0);
  377. if( rc!=SQLITE_OK ){
  378. return rc;
  379. }
  380. /* Now that it is known how many phrases there are, allocate and zero
  381. ** the required space using malloc().
  382. */
  383. nByte = sizeof(SnippetPhrase) * nList;
  384. sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
  385. if( !sIter.aPhrase ){
  386. return SQLITE_NOMEM;
  387. }
  388. memset(sIter.aPhrase, 0, nByte);
  389. /* Initialize the contents of the SnippetIter object. Then iterate through
  390. ** the set of phrases in the expression to populate the aPhrase[] array.
  391. */
  392. sIter.pCsr = pCsr;
  393. sIter.iCol = iCol;
  394. sIter.nSnippet = nSnippet;
  395. sIter.nPhrase = nList;
  396. sIter.iCurrent = -1;
  397. (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);
  398. /* Set the *pmSeen output variable. */
  399. for(i=0; i<nList; i++){
  400. if( sIter.aPhrase[i].pHead ){
  401. *pmSeen |= (u64)1 << i;
  402. }
  403. }
  404. /* Loop through all candidate snippets. Store the best snippet in
  405. ** *pFragment. Store its associated 'score' in iBestScore.
  406. */
  407. pFragment->iCol = iCol;
  408. while( !fts3SnippetNextCandidate(&sIter) ){
  409. int iPos;
  410. int iScore;
  411. u64 mCover;
  412. u64 mHighlight;
  413. fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight);
  414. assert( iScore>=0 );
  415. if( iScore>iBestScore ){
  416. pFragment->iPos = iPos;
  417. pFragment->hlmask = mHighlight;
  418. pFragment->covered = mCover;
  419. iBestScore = iScore;
  420. }
  421. }
  422. sqlite3_free(sIter.aPhrase);
  423. *piScore = iBestScore;
  424. return SQLITE_OK;
  425. }
  426. /*
  427. ** Append a string to the string-buffer passed as the first argument.
  428. **
  429. ** If nAppend is negative, then the length of the string zAppend is
  430. ** determined using strlen().
  431. */
  432. static int fts3StringAppend(
  433. StrBuffer *pStr, /* Buffer to append to */
  434. const char *zAppend, /* Pointer to data to append to buffer */
  435. int nAppend /* Size of zAppend in bytes (or -1) */
  436. ){
  437. if( nAppend<0 ){
  438. nAppend = (int)strlen(zAppend);
  439. }
  440. /* If there is insufficient space allocated at StrBuffer.z, use realloc()
  441. ** to grow the buffer until so that it is big enough to accomadate the
  442. ** appended data.
  443. */
  444. if( pStr->n+nAppend+1>=pStr->nAlloc ){
  445. int nAlloc = pStr->nAlloc+nAppend+100;
  446. char *zNew = sqlite3_realloc(pStr->z, nAlloc);
  447. if( !zNew ){
  448. return SQLITE_NOMEM;
  449. }
  450. pStr->z = zNew;
  451. pStr->nAlloc = nAlloc;
  452. }
  453. assert( pStr->z!=0 && (pStr->nAlloc >= pStr->n+nAppend+1) );
  454. /* Append the data to the string buffer. */
  455. memcpy(&pStr->z[pStr->n], zAppend, nAppend);
  456. pStr->n += nAppend;
  457. pStr->z[pStr->n] = '\0';
  458. return SQLITE_OK;
  459. }
  460. /*
  461. ** The fts3BestSnippet() function often selects snippets that end with a
  462. ** query term. That is, the final term of the snippet is always a term
  463. ** that requires highlighting. For example, if 'X' is a highlighted term
  464. ** and '.' is a non-highlighted term, BestSnippet() may select:
  465. **
  466. ** ........X.....X
  467. **
  468. ** This function "shifts" the beginning of the snippet forward in the
  469. ** document so that there are approximately the same number of
  470. ** non-highlighted terms to the right of the final highlighted term as there
  471. ** are to the left of the first highlighted term. For example, to this:
  472. **
  473. ** ....X.....X....
  474. **
  475. ** This is done as part of extracting the snippet text, not when selecting
  476. ** the snippet. Snippet selection is done based on doclists only, so there
  477. ** is no way for fts3BestSnippet() to know whether or not the document
  478. ** actually contains terms that follow the final highlighted term.
  479. */
  480. static int fts3SnippetShift(
  481. Fts3Table *pTab, /* FTS3 table snippet comes from */
  482. int iLangid, /* Language id to use in tokenizing */
  483. int nSnippet, /* Number of tokens desired for snippet */
  484. const char *zDoc, /* Document text to extract snippet from */
  485. int nDoc, /* Size of buffer zDoc in bytes */
  486. int *piPos, /* IN/OUT: First token of snippet */
  487. u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */
  488. ){
  489. u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */
  490. if( hlmask ){
  491. int nLeft; /* Tokens to the left of first highlight */
  492. int nRight; /* Tokens to the right of last highlight */
  493. int nDesired; /* Ideal number of tokens to shift forward */
  494. for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++);
  495. for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++);
  496. nDesired = (nLeft-nRight)/2;
  497. /* Ideally, the start of the snippet should be pushed forward in the
  498. ** document nDesired tokens. This block checks if there are actually
  499. ** nDesired tokens to the right of the snippet. If so, *piPos and
  500. ** *pHlMask are updated to shift the snippet nDesired tokens to the
  501. ** right. Otherwise, the snippet is shifted by the number of tokens
  502. ** available.
  503. */
  504. if( nDesired>0 ){
  505. int nShift; /* Number of tokens to shift snippet by */
  506. int iCurrent = 0; /* Token counter */
  507. int rc; /* Return Code */
  508. sqlite3_tokenizer_module *pMod;
  509. sqlite3_tokenizer_cursor *pC;
  510. pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  511. /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
  512. ** or more tokens in zDoc/nDoc.
  513. */
  514. rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC);
  515. if( rc!=SQLITE_OK ){
  516. return rc;
  517. }
  518. while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
  519. const char *ZDUMMY; int DUMMY1 = 0, DUMMY2 = 0, DUMMY3 = 0;
  520. rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
  521. }
  522. pMod->xClose(pC);
  523. if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }
  524. nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet;
  525. assert( nShift<=nDesired );
  526. if( nShift>0 ){
  527. *piPos += nShift;
  528. *pHlmask = hlmask >> nShift;
  529. }
  530. }
  531. }
  532. return SQLITE_OK;
  533. }
  534. /*
  535. ** Extract the snippet text for fragment pFragment from cursor pCsr and
  536. ** append it to string buffer pOut.
  537. */
  538. static int fts3SnippetText(
  539. Fts3Cursor *pCsr, /* FTS3 Cursor */
  540. SnippetFragment *pFragment, /* Snippet to extract */
  541. int iFragment, /* Fragment number */
  542. int isLast, /* True for final fragment in snippet */
  543. int nSnippet, /* Number of tokens in extracted snippet */
  544. const char *zOpen, /* String inserted before highlighted term */
  545. const char *zClose, /* String inserted after highlighted term */
  546. const char *zEllipsis, /* String inserted between snippets */
  547. StrBuffer *pOut /* Write output here */
  548. ){
  549. Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  550. int rc; /* Return code */
  551. const char *zDoc; /* Document text to extract snippet from */
  552. int nDoc; /* Size of zDoc in bytes */
  553. int iCurrent = 0; /* Current token number of document */
  554. int iEnd = 0; /* Byte offset of end of current token */
  555. int isShiftDone = 0; /* True after snippet is shifted */
  556. int iPos = pFragment->iPos; /* First token of snippet */
  557. u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */
  558. int iCol = pFragment->iCol+1; /* Query column to extract text from */
  559. sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
  560. sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */
  561. zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol);
  562. if( zDoc==0 ){
  563. if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){
  564. return SQLITE_NOMEM;
  565. }
  566. return SQLITE_OK;
  567. }
  568. nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);
  569. /* Open a token cursor on the document. */
  570. pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  571. rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC);
  572. if( rc!=SQLITE_OK ){
  573. return rc;
  574. }
  575. while( rc==SQLITE_OK ){
  576. const char *ZDUMMY; /* Dummy argument used with tokenizer */
  577. int DUMMY1 = -1; /* Dummy argument used with tokenizer */
  578. int iBegin = 0; /* Offset in zDoc of start of token */
  579. int iFin = 0; /* Offset in zDoc of end of token */
  580. int isHighlight = 0; /* True for highlighted terms */
  581. /* Variable DUMMY1 is initialized to a negative value above. Elsewhere
  582. ** in the FTS code the variable that the third argument to xNext points to
  583. ** is initialized to zero before the first (*but not necessarily
  584. ** subsequent*) call to xNext(). This is done for a particular application
  585. ** that needs to know whether or not the tokenizer is being used for
  586. ** snippet generation or for some other purpose.
  587. **
  588. ** Extreme care is required when writing code to depend on this
  589. ** initialization. It is not a documented part of the tokenizer interface.
  590. ** If a tokenizer is used directly by any code outside of FTS, this
  591. ** convention might not be respected. */
  592. rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);
  593. if( rc!=SQLITE_OK ){
  594. if( rc==SQLITE_DONE ){
  595. /* Special case - the last token of the snippet is also the last token
  596. ** of the column. Append any punctuation that occurred between the end
  597. ** of the previous token and the end of the document to the output.
  598. ** Then break out of the loop. */
  599. rc = fts3StringAppend(pOut, &zDoc[iEnd], -1);
  600. }
  601. break;
  602. }
  603. if( iCurrent<iPos ){ continue; }
  604. if( !isShiftDone ){
  605. int n = nDoc - iBegin;
  606. rc = fts3SnippetShift(
  607. pTab, pCsr->iLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask
  608. );
  609. isShiftDone = 1;
  610. /* Now that the shift has been done, check if the initial "..." are
  611. ** required. They are required if (a) this is not the first fragment,
  612. ** or (b) this fragment does not begin at position 0 of its column.
  613. */
  614. if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){
  615. rc = fts3StringAppend(pOut, zEllipsis, -1);
  616. }
  617. if( rc!=SQLITE_OK || iCurrent<iPos ) continue;
  618. }
  619. if( iCurrent>=(iPos+nSnippet) ){
  620. if( isLast ){
  621. rc = fts3StringAppend(pOut, zEllipsis, -1);
  622. }
  623. break;
  624. }
  625. /* Set isHighlight to true if this term should be highlighted. */
  626. isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0;
  627. if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd);
  628. if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1);
  629. if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin);
  630. if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1);
  631. iEnd = iFin;
  632. }
  633. pMod->xClose(pC);
  634. return rc;
  635. }
  636. /*
  637. ** This function is used to count the entries in a column-list (a
  638. ** delta-encoded list of term offsets within a single column of a single
  639. ** row). When this function is called, *ppCollist should point to the
  640. ** beginning of the first varint in the column-list (the varint that
  641. ** contains the position of the first matching term in the column data).
  642. ** Before returning, *ppCollist is set to point to the first byte after
  643. ** the last varint in the column-list (either the 0x00 signifying the end
  644. ** of the position-list, or the 0x01 that precedes the column number of
  645. ** the next column in the position-list).
  646. **
  647. ** The number of elements in the column-list is returned.
  648. */
  649. static int fts3ColumnlistCount(char **ppCollist){
  650. char *pEnd = *ppCollist;
  651. char c = 0;
  652. int nEntry = 0;
  653. /* A column-list is terminated by either a 0x01 or 0x00. */
  654. while( 0xFE & (*pEnd | c) ){
  655. c = *pEnd++ & 0x80;
  656. if( !c ) nEntry++;
  657. }
  658. *ppCollist = pEnd;
  659. return nEntry;
  660. }
  661. /*
  662. ** fts3ExprIterate() callback used to collect the "global" matchinfo stats
  663. ** for a single query.
  664. **
  665. ** fts3ExprIterate() callback to load the 'global' elements of a
  666. ** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements
  667. ** of the matchinfo array that are constant for all rows returned by the
  668. ** current query.
  669. **
  670. ** Argument pCtx is actually a pointer to a struct of type MatchInfo. This
  671. ** function populates Matchinfo.aMatchinfo[] as follows:
  672. **
  673. ** for(iCol=0; iCol<nCol; iCol++){
  674. ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 1] = X;
  675. ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 2] = Y;
  676. ** }
  677. **
  678. ** where X is the number of matches for phrase iPhrase is column iCol of all
  679. ** rows of the table. Y is the number of rows for which column iCol contains
  680. ** at least one instance of phrase iPhrase.
  681. **
  682. ** If the phrase pExpr consists entirely of deferred tokens, then all X and
  683. ** Y values are set to nDoc, where nDoc is the number of documents in the
  684. ** file system. This is done because the full-text index doclist is required
  685. ** to calculate these values properly, and the full-text index doclist is
  686. ** not available for deferred tokens.
  687. */
  688. static int fts3ExprGlobalHitsCb(
  689. Fts3Expr *pExpr, /* Phrase expression node */
  690. int iPhrase, /* Phrase number (numbered from zero) */
  691. void *pCtx /* Pointer to MatchInfo structure */
  692. ){
  693. MatchInfo *p = (MatchInfo *)pCtx;
  694. return sqlite3Fts3EvalPhraseStats(
  695. p->pCursor, pExpr, &p->aMatchinfo[3*iPhrase*p->nCol]
  696. );
  697. }
  698. /*
  699. ** fts3ExprIterate() callback used to collect the "local" part of the
  700. ** FTS3_MATCHINFO_HITS array. The local stats are those elements of the
  701. ** array that are different for each row returned by the query.
  702. */
  703. static int fts3ExprLocalHitsCb(
  704. Fts3Expr *pExpr, /* Phrase expression node */
  705. int iPhrase, /* Phrase number */
  706. void *pCtx /* Pointer to MatchInfo structure */
  707. ){
  708. int rc = SQLITE_OK;
  709. MatchInfo *p = (MatchInfo *)pCtx;
  710. int iStart = iPhrase * p->nCol * 3;
  711. int i;
  712. for(i=0; i<p->nCol && rc==SQLITE_OK; i++){
  713. char *pCsr;
  714. rc = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i, &pCsr);
  715. if( pCsr ){
  716. p->aMatchinfo[iStart+i*3] = fts3ColumnlistCount(&pCsr);
  717. }else{
  718. p->aMatchinfo[iStart+i*3] = 0;
  719. }
  720. }
  721. return rc;
  722. }
  723. static int fts3MatchinfoCheck(
  724. Fts3Table *pTab,
  725. char cArg,
  726. char **pzErr
  727. ){
  728. if( (cArg==FTS3_MATCHINFO_NPHRASE)
  729. || (cArg==FTS3_MATCHINFO_NCOL)
  730. || (cArg==FTS3_MATCHINFO_NDOC && pTab->bFts4)
  731. || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bFts4)
  732. || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize)
  733. || (cArg==FTS3_MATCHINFO_LCS)
  734. || (cArg==FTS3_MATCHINFO_HITS)
  735. ){
  736. return SQLITE_OK;
  737. }
  738. *pzErr = sqlite3_mprintf("unrecognized matchinfo request: %c", cArg);
  739. return SQLITE_ERROR;
  740. }
  741. static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){
  742. int nVal; /* Number of integers output by cArg */
  743. switch( cArg ){
  744. case FTS3_MATCHINFO_NDOC:
  745. case FTS3_MATCHINFO_NPHRASE:
  746. case FTS3_MATCHINFO_NCOL:
  747. nVal = 1;
  748. break;
  749. case FTS3_MATCHINFO_AVGLENGTH:
  750. case FTS3_MATCHINFO_LENGTH:
  751. case FTS3_MATCHINFO_LCS:
  752. nVal = pInfo->nCol;
  753. break;
  754. default:
  755. assert( cArg==FTS3_MATCHINFO_HITS );
  756. nVal = pInfo->nCol * pInfo->nPhrase * 3;
  757. break;
  758. }
  759. return nVal;
  760. }
  761. static int fts3MatchinfoSelectDoctotal(
  762. Fts3Table *pTab,
  763. sqlite3_stmt **ppStmt,
  764. sqlite3_int64 *pnDoc,
  765. const char **paLen
  766. ){
  767. sqlite3_stmt *pStmt;
  768. const char *a;
  769. sqlite3_int64 nDoc;
  770. if( !*ppStmt ){
  771. int rc = sqlite3Fts3SelectDoctotal(pTab, ppStmt);
  772. if( rc!=SQLITE_OK ) return rc;
  773. }
  774. pStmt = *ppStmt;
  775. assert( sqlite3_data_count(pStmt)==1 );
  776. a = sqlite3_column_blob(pStmt, 0);
  777. a += sqlite3Fts3GetVarint(a, &nDoc);
  778. if( nDoc==0 ) return FTS_CORRUPT_VTAB;
  779. *pnDoc = (u32)nDoc;
  780. if( paLen ) *paLen = a;
  781. return SQLITE_OK;
  782. }
  783. /*
  784. ** An instance of the following structure is used to store state while
  785. ** iterating through a multi-column position-list corresponding to the
  786. ** hits for a single phrase on a single row in order to calculate the
  787. ** values for a matchinfo() FTS3_MATCHINFO_LCS request.
  788. */
  789. typedef struct LcsIterator LcsIterator;
  790. struct LcsIterator {
  791. Fts3Expr *pExpr; /* Pointer to phrase expression */
  792. int iPosOffset; /* Tokens count up to end of this phrase */
  793. char *pRead; /* Cursor used to iterate through aDoclist */
  794. int iPos; /* Current position */
  795. };
  796. /*
  797. ** If LcsIterator.iCol is set to the following value, the iterator has
  798. ** finished iterating through all offsets for all columns.
  799. */
  800. #define LCS_ITERATOR_FINISHED 0x7FFFFFFF;
  801. static int fts3MatchinfoLcsCb(
  802. Fts3Expr *pExpr, /* Phrase expression node */
  803. int iPhrase, /* Phrase number (numbered from zero) */
  804. void *pCtx /* Pointer to MatchInfo structure */
  805. ){
  806. LcsIterator *aIter = (LcsIterator *)pCtx;
  807. aIter[iPhrase].pExpr = pExpr;
  808. return SQLITE_OK;
  809. }
  810. /*
  811. ** Advance the iterator passed as an argument to the next position. Return
  812. ** 1 if the iterator is at EOF or if it now points to the start of the
  813. ** position list for the next column.
  814. */
  815. static int fts3LcsIteratorAdvance(LcsIterator *pIter){
  816. char *pRead = pIter->pRead;
  817. sqlite3_int64 iRead;
  818. int rc = 0;
  819. pRead += sqlite3Fts3GetVarint(pRead, &iRead);
  820. if( iRead==0 || iRead==1 ){
  821. pRead = 0;
  822. rc = 1;
  823. }else{
  824. pIter->iPos += (int)(iRead-2);
  825. }
  826. pIter->pRead = pRead;
  827. return rc;
  828. }
  829. /*
  830. ** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag.
  831. **
  832. ** If the call is successful, the longest-common-substring lengths for each
  833. ** column are written into the first nCol elements of the pInfo->aMatchinfo[]
  834. ** array before returning. SQLITE_OK is returned in this case.
  835. **
  836. ** Otherwise, if an error occurs, an SQLite error code is returned and the
  837. ** data written to the first nCol elements of pInfo->aMatchinfo[] is
  838. ** undefined.
  839. */
  840. static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){
  841. LcsIterator *aIter;
  842. int i;
  843. int iCol;
  844. int nToken = 0;
  845. /* Allocate and populate the array of LcsIterator objects. The array
  846. ** contains one element for each matchable phrase in the query.
  847. **/
  848. aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase);
  849. if( !aIter ) return SQLITE_NOMEM;
  850. memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase);
  851. (void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter);
  852. for(i=0; i<pInfo->nPhrase; i++){
  853. LcsIterator *pIter = &aIter[i];
  854. nToken -= pIter->pExpr->pPhrase->nToken;
  855. pIter->iPosOffset = nToken;
  856. }
  857. for(iCol=0; iCol<pInfo->nCol; iCol++){
  858. int nLcs = 0; /* LCS value for this column */
  859. int nLive = 0; /* Number of iterators in aIter not at EOF */
  860. for(i=0; i<pInfo->nPhrase; i++){
  861. int rc;
  862. LcsIterator *pIt = &aIter[i];
  863. rc = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol, &pIt->pRead);
  864. if( rc!=SQLITE_OK ) return rc;
  865. if( pIt->pRead ){
  866. pIt->iPos = pIt->iPosOffset;
  867. fts3LcsIteratorAdvance(&aIter[i]);
  868. nLive++;
  869. }
  870. }
  871. while( nLive>0 ){
  872. LcsIterator *pAdv = 0; /* The iterator to advance by one position */
  873. int nThisLcs = 0; /* LCS for the current iterator positions */
  874. for(i=0; i<pInfo->nPhrase; i++){
  875. LcsIterator *pIter = &aIter[i];
  876. if( pIter->pRead==0 ){
  877. /* This iterator is already at EOF for this column. */
  878. nThisLcs = 0;
  879. }else{
  880. if( pAdv==0 || pIter->iPos<pAdv->iPos ){
  881. pAdv = pIter;
  882. }
  883. if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){
  884. nThisLcs++;
  885. }else{
  886. nThisLcs = 1;
  887. }
  888. if( nThisLcs>nLcs ) nLcs = nThisLcs;
  889. }
  890. }
  891. if( fts3LcsIteratorAdvance(pAdv) ) nLive--;
  892. }
  893. pInfo->aMatchinfo[iCol] = nLcs;
  894. }
  895. sqlite3_free(aIter);
  896. return SQLITE_OK;
  897. }
  898. /*
  899. ** Populate the buffer pInfo->aMatchinfo[] with an array of integers to
  900. ** be returned by the matchinfo() function. Argument zArg contains the
  901. ** format string passed as the second argument to matchinfo (or the
  902. ** default value "pcx" if no second argument was specified). The format
  903. ** string has already been validated and the pInfo->aMatchinfo[] array
  904. ** is guaranteed to be large enough for the output.
  905. **
  906. ** If bGlobal is true, then populate all fields of the matchinfo() output.
  907. ** If it is false, then assume that those fields that do not change between
  908. ** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS)
  909. ** have already been populated.
  910. **
  911. ** Return SQLITE_OK if successful, or an SQLite error code if an error
  912. ** occurs. If a value other than SQLITE_OK is returned, the state the
  913. ** pInfo->aMatchinfo[] buffer is left in is undefined.
  914. */
  915. static int fts3MatchinfoValues(
  916. Fts3Cursor *pCsr, /* FTS3 cursor object */
  917. int bGlobal, /* True to grab the global stats */
  918. MatchInfo *pInfo, /* Matchinfo context object */
  919. const char *zArg /* Matchinfo format string */
  920. ){
  921. int rc = SQLITE_OK;
  922. int i;
  923. Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  924. sqlite3_stmt *pSelect = 0;
  925. for(i=0; rc==SQLITE_OK && zArg[i]; i++){
  926. switch( zArg[i] ){
  927. case FTS3_MATCHINFO_NPHRASE:
  928. if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase;
  929. break;
  930. case FTS3_MATCHINFO_NCOL:
  931. if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol;
  932. break;
  933. case FTS3_MATCHINFO_NDOC:
  934. if( bGlobal ){
  935. sqlite3_int64 nDoc = 0;
  936. rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0);
  937. pInfo->aMatchinfo[0] = (u32)nDoc;
  938. }
  939. break;
  940. case FTS3_MATCHINFO_AVGLENGTH:
  941. if( bGlobal ){
  942. sqlite3_int64 nDoc; /* Number of rows in table */
  943. const char *a; /* Aggregate column length array */
  944. rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a);
  945. if( rc==SQLITE_OK ){
  946. int iCol;
  947. for(iCol=0; iCol<pInfo->nCol; iCol++){
  948. u32 iVal;
  949. sqlite3_int64 nToken;
  950. a += sqlite3Fts3GetVarint(a, &nToken);
  951. iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc);
  952. pInfo->aMatchinfo[iCol] = iVal;
  953. }
  954. }
  955. }
  956. break;
  957. case FTS3_MATCHINFO_LENGTH: {
  958. sqlite3_stmt *pSelectDocsize = 0;
  959. rc = sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize);
  960. if( rc==SQLITE_OK ){
  961. int iCol;
  962. const char *a = sqlite3_column_blob(pSelectDocsize, 0);
  963. for(iCol=0; iCol<pInfo->nCol; iCol++){
  964. sqlite3_int64 nToken;
  965. a += sqlite3Fts3GetVarint(a, &nToken);
  966. pInfo->aMatchinfo[iCol] = (u32)nToken;
  967. }
  968. }
  969. sqlite3_reset(pSelectDocsize);
  970. break;
  971. }
  972. case FTS3_MATCHINFO_LCS:
  973. rc = fts3ExprLoadDoclists(pCsr, 0, 0);
  974. if( rc==SQLITE_OK ){
  975. rc = fts3MatchinfoLcs(pCsr, pInfo);
  976. }
  977. break;
  978. default: {
  979. Fts3Expr *pExpr;
  980. assert( zArg[i]==FTS3_MATCHINFO_HITS );
  981. pExpr = pCsr->pExpr;
  982. rc = fts3ExprLoadDoclists(pCsr, 0, 0);
  983. if( rc!=SQLITE_OK ) break;
  984. if( bGlobal ){
  985. if( pCsr->pDeferred ){
  986. rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0);
  987. if( rc!=SQLITE_OK ) break;
  988. }
  989. rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo);
  990. if( rc!=SQLITE_OK ) break;
  991. }
  992. (void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo);
  993. break;
  994. }
  995. }
  996. pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]);
  997. }
  998. sqlite3_reset(pSelect);
  999. return rc;
  1000. }
  1001. /*
  1002. ** Populate pCsr->aMatchinfo[] with data for the current row. The
  1003. ** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32).
  1004. */
  1005. static int fts3GetMatchinfo(
  1006. Fts3Cursor *pCsr, /* FTS3 Cursor object */
  1007. const char *zArg /* Second argument to matchinfo() function */
  1008. ){
  1009. MatchInfo sInfo;
  1010. Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  1011. int rc = SQLITE_OK;
  1012. int bGlobal = 0; /* Collect 'global' stats as well as local */
  1013. memset(&sInfo, 0, sizeof(MatchInfo));
  1014. sInfo.pCursor = pCsr;
  1015. sInfo.nCol = pTab->nColumn;
  1016. /* If there is cached matchinfo() data, but the format string for the
  1017. ** cache does not match the format string for this request, discard
  1018. ** the cached data. */
  1019. if( pCsr->zMatchinfo && strcmp(pCsr->zMatchinfo, zArg) ){
  1020. assert( pCsr->aMatchinfo );
  1021. sqlite3_free(pCsr->aMatchinfo);
  1022. pCsr->zMatchinfo = 0;
  1023. pCsr->aMatchinfo = 0;
  1024. }
  1025. /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
  1026. ** matchinfo function has been called for this query. In this case
  1027. ** allocate the array used to accumulate the matchinfo data and
  1028. ** initialize those elements that are constant for every row.
  1029. */
  1030. if( pCsr->aMatchinfo==0 ){
  1031. int nMatchinfo = 0; /* Number of u32 elements in match-info */
  1032. int nArg; /* Bytes in zArg */
  1033. int i; /* Used to iterate through zArg */
  1034. /* Determine the number of phrases in the query */
  1035. pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr);
  1036. sInfo.nPhrase = pCsr->nPhrase;
  1037. /* Determine the number of integers in the buffer returned by this call. */
  1038. for(i=0; zArg[i]; i++){
  1039. nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]);
  1040. }
  1041. /* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo. */
  1042. nArg = (int)strlen(zArg);
  1043. pCsr->aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo + nArg + 1);
  1044. if( !pCsr->aMatchinfo ) return SQLITE_NOMEM;
  1045. pCsr->zMatchinfo = (char *)&pCsr->aMatchinfo[nMatchinfo];
  1046. pCsr->nMatchinfo = nMatchinfo;
  1047. memcpy(pCsr->zMatchinfo, zArg, nArg+1);
  1048. memset(pCsr->aMatchinfo, 0, sizeof(u32)*nMatchinfo);
  1049. pCsr->isMatchinfoNeeded = 1;
  1050. bGlobal = 1;
  1051. }
  1052. sInfo.aMatchinfo = pCsr->aMatchinfo;
  1053. sInfo.nPhrase = pCsr->nPhrase;
  1054. if( pCsr->isMatchinfoNeeded ){
  1055. rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg);
  1056. pCsr->isMatchinfoNeeded = 0;
  1057. }
  1058. return rc;
  1059. }
  1060. /*
  1061. ** Implementation of snippet() function.
  1062. */
  1063. void sqlite3Fts3Snippet(
  1064. sqlite3_context *pCtx, /* SQLite function call context */
  1065. Fts3Cursor *pCsr, /* Cursor object */
  1066. const char *zStart, /* Snippet start text - "<b>" */
  1067. const char *zEnd, /* Snippet end text - "</b>" */
  1068. const char *zEllipsis, /* Snippet ellipsis text - "<b>...</b>" */
  1069. int iCol, /* Extract snippet from this column */
  1070. int nToken /* Approximate number of tokens in snippet */
  1071. ){
  1072. Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  1073. int rc = SQLITE_OK;
  1074. int i;
  1075. StrBuffer res = {0, 0, 0};
  1076. /* The returned text includes up to four fragments of text extracted from
  1077. ** the data in the current row. The first iteration of the for(...) loop
  1078. ** below attempts to locate a single fragment of text nToken tokens in
  1079. ** size that contains at least one instance of all phrases in the query
  1080. ** expression that appear in the current row. If such a fragment of text
  1081. ** cannot be found, the second iteration of the loop attempts to locate
  1082. ** a pair of fragments, and so on.
  1083. */
  1084. int nSnippet = 0; /* Number of fragments in this snippet */
  1085. SnippetFragment aSnippet[4]; /* Maximum of 4 fragments per snippet */
  1086. int nFToken = -1; /* Number of tokens in each fragment */
  1087. if( !pCsr->pExpr ){
  1088. sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
  1089. return;
  1090. }
  1091. for(nSnippet=1; 1; nSnippet++){
  1092. int iSnip; /* Loop counter 0..nSnippet-1 */
  1093. u64 mCovered = 0; /* Bitmask of phrases covered by snippet */
  1094. u64 mSeen = 0; /* Bitmask of phrases seen by BestSnippet() */
  1095. if( nToken>=0 ){
  1096. nFToken = (nToken+nSnippet-1) / nSnippet;
  1097. }else{
  1098. nFToken = -1 * nToken;
  1099. }
  1100. for(iSnip=0; iSnip<nSnippet; iSnip++){
  1101. int iBestScore = -1; /* Best score of columns checked so far */
  1102. int iRead; /* Used to iterate through columns */
  1103. SnippetFragment *pFragment = &aSnippet[iSnip];
  1104. memset(pFragment, 0, sizeof(*pFragment));
  1105. /* Loop through all columns of the table being considered for snippets.
  1106. ** If the iCol argument to this function was negative, this means all
  1107. ** columns of the FTS3 table. Otherwise, only column iCol is considered.
  1108. */
  1109. for(iRead=0; iRead<pTab->nColumn; iRead++){
  1110. SnippetFragment sF = {0, 0, 0, 0};
  1111. int iS;
  1112. if( iCol>=0 && iRead!=iCol ) continue;
  1113. /* Find the best snippet of nFToken tokens in column iRead. */
  1114. rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
  1115. if( rc!=SQLITE_OK ){
  1116. goto snippet_out;
  1117. }
  1118. if( iS>iBestScore ){
  1119. *pFragment = sF;
  1120. iBestScore = iS;
  1121. }
  1122. }
  1123. mCovered |= pFragment->covered;
  1124. }
  1125. /* If all query phrases seen by fts3BestSnippet() are present in at least
  1126. ** one of the nSnippet snippet fragments, break out of the loop.
  1127. */
  1128. assert( (mCovered&mSeen)==mCovered );
  1129. if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
  1130. }
  1131. assert( nFToken>0 );
  1132. for(i=0; i<nSnippet && rc==SQLITE_OK; i++){
  1133. rc = fts3SnippetText(pCsr, &aSnippet[i],
  1134. i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res
  1135. );
  1136. }
  1137. snippet_out:
  1138. sqlite3Fts3SegmentsClose(pTab);
  1139. if( rc!=SQLITE_OK ){
  1140. sqlite3_result_error_code(pCtx, rc);
  1141. sqlite3_free(res.z);
  1142. }else{
  1143. sqlite3_result_text(pCtx, res.z, -1, sqlite3_free);
  1144. }
  1145. }
  1146. typedef struct TermOffset TermOffset;
  1147. typedef struct TermOffsetCtx TermOffsetCtx;
  1148. struct TermOffset {
  1149. char *pList; /* Position-list */
  1150. int iPos; /* Position just read from pList */
  1151. int iOff; /* Offset of this term from read positions */
  1152. };
  1153. struct TermOffsetCtx {
  1154. Fts3Cursor *pCsr;
  1155. int iCol; /* Column of table to populate aTerm for */
  1156. int iTerm;
  1157. sqlite3_int64 iDocid;
  1158. TermOffset *aTerm;
  1159. };
  1160. /*
  1161. ** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets().
  1162. */
  1163. static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){
  1164. TermOffsetCtx *p = (TermOffsetCtx *)ctx;
  1165. int nTerm; /* Number of tokens in phrase */
  1166. int iTerm; /* For looping through nTerm phrase terms */
  1167. char *pList; /* Pointer to position list for phrase */
  1168. int iPos = 0; /* First position in position-list */
  1169. int rc;
  1170. UNUSED_PARAMETER(iPhrase);
  1171. rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pList);
  1172. nTerm = pExpr->pPhrase->nToken;
  1173. if( pList ){
  1174. fts3GetDeltaPosition(&pList, &iPos);
  1175. assert( iPos>=0 );
  1176. }
  1177. for(iTerm=0; iTerm<nTerm; iTerm++){
  1178. TermOffset *pT = &p->aTerm[p->iTerm++];
  1179. pT->iOff = nTerm-iTerm-1;
  1180. pT->pList = pList;
  1181. pT->iPos = iPos;
  1182. }
  1183. return rc;
  1184. }
  1185. /*
  1186. ** Implementation of offsets() function.
  1187. */
  1188. void sqlite3Fts3Offsets(
  1189. sqlite3_context *pCtx, /* SQLite function call context */
  1190. Fts3Cursor *pCsr /* Cursor object */
  1191. ){
  1192. Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  1193. sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule;
  1194. int rc; /* Return Code */
  1195. int nToken; /* Number of tokens in query */
  1196. int iCol; /* Column currently being processed */
  1197. StrBuffer res = {0, 0, 0}; /* Result string */
  1198. TermOffsetCtx sCtx; /* Context for fts3ExprTermOffsetInit() */
  1199. if( !pCsr->pExpr ){
  1200. sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
  1201. return;
  1202. }
  1203. memset(&sCtx, 0, sizeof(sCtx));
  1204. assert( pCsr->isRequireSeek==0 );
  1205. /* Count the number of terms in the query */
  1206. rc = fts3ExprLoadDoclists(pCsr, 0, &nToken);
  1207. if( rc!=SQLITE_OK ) goto offsets_out;
  1208. /* Allocate the array of TermOffset iterators. */
  1209. sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken);
  1210. if( 0==sCtx.aTerm ){
  1211. rc = SQLITE_NOMEM;
  1212. goto offsets_out;
  1213. }
  1214. sCtx.iDocid = pCsr->iPrevId;
  1215. sCtx.pCsr = pCsr;
  1216. /* Loop through the table columns, appending offset information to
  1217. ** string-buffer res for each column.
  1218. */
  1219. for(iCol=0; iCol<pTab->nColumn; iCol++){
  1220. sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */
  1221. const char *ZDUMMY; /* Dummy argument used with xNext() */
  1222. int NDUMMY = 0; /* Dummy argument used with xNext() */
  1223. int iStart = 0;
  1224. int iEnd = 0;
  1225. int iCurrent = 0;
  1226. const char *zDoc;
  1227. int nDoc;
  1228. /* Initialize the contents of sCtx.aTerm[] for column iCol. There is
  1229. ** no way that this operation can fail, so the return code from
  1230. ** fts3ExprIterate() can be discarded.
  1231. */
  1232. sCtx.iCol = iCol;
  1233. sCtx.iTerm = 0;
  1234. (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx);
  1235. /* Retreive the text stored in column iCol. If an SQL NULL is stored
  1236. ** in column iCol, jump immediately to the next iteration of the loop.
  1237. ** If an OOM occurs while retrieving the data (this can happen if SQLite
  1238. ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM
  1239. ** to the caller.
  1240. */
  1241. zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);
  1242. nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
  1243. if( zDoc==0 ){
  1244. if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){
  1245. continue;
  1246. }
  1247. rc = SQLITE_NOMEM;
  1248. goto offsets_out;
  1249. }
  1250. /* Initialize a tokenizer iterator to iterate through column iCol. */
  1251. rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid,
  1252. zDoc, nDoc, &pC
  1253. );
  1254. if( rc!=SQLITE_OK ) goto offsets_out;
  1255. rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
  1256. while( rc==SQLITE_OK ){
  1257. int i; /* Used to loop through terms */
  1258. int iMinPos = 0x7FFFFFFF; /* Position of next token */
  1259. TermOffset *pTerm = 0; /* TermOffset associated with next token */
  1260. for(i=0; i<nToken; i++){
  1261. TermOffset *pT = &sCtx.aTerm[i];
  1262. if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){
  1263. iMinPos = pT->iPos-pT->iOff;
  1264. pTerm = pT;
  1265. }
  1266. }
  1267. if( !pTerm ){
  1268. /* All offsets for this column have been gathered. */
  1269. rc = SQLITE_DONE;
  1270. }else{
  1271. assert( iCurrent<=iMinPos );
  1272. if( 0==(0xFE&*pTerm->pList) ){
  1273. pTerm->pList = 0;
  1274. }else{
  1275. fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos);
  1276. }
  1277. while( rc==SQLITE_OK && iCurrent<iMinPos ){
  1278. rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
  1279. }
  1280. if( rc==SQLITE_OK ){
  1281. char aBuffer[64];
  1282. sqlite3_snprintf(sizeof(aBuffer), aBuffer,
  1283. "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart
  1284. );
  1285. rc = fts3StringAppend(&res, aBuffer, -1);
  1286. }else if( rc==SQLITE_DONE && pTab->zContentTbl==0 ){
  1287. rc = FTS_CORRUPT_VTAB;
  1288. }
  1289. }
  1290. }
  1291. if( rc==SQLITE_DONE ){
  1292. rc = SQLITE_OK;
  1293. }
  1294. pMod->xClose(pC);
  1295. if( rc!=SQLITE_OK ) goto offsets_out;
  1296. }
  1297. offsets_out:
  1298. sqlite3_free(sCtx.aTerm);
  1299. assert( rc!=SQLITE_DONE );
  1300. sqlite3Fts3SegmentsClose(pTab);
  1301. if( rc!=SQLITE_OK ){
  1302. sqlite3_result_error_code(pCtx, rc);
  1303. sqlite3_free(res.z);
  1304. }else{
  1305. sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free);
  1306. }
  1307. return;
  1308. }
  1309. /*
  1310. ** Implementation of matchinfo() function.
  1311. */
  1312. void sqlite3Fts3Matchinfo(
  1313. sqlite3_context *pContext, /* Function call context */
  1314. Fts3Cursor *pCsr, /* FTS3 table cursor */
  1315. const char *zArg /* Second arg to matchinfo() function */
  1316. ){
  1317. Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  1318. int rc;
  1319. int i;
  1320. const char *zFormat;
  1321. if( zArg ){
  1322. for(i=0; zArg[i]; i++){
  1323. char *zErr = 0;
  1324. if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){
  1325. sqlite3_result_error(pContext, zErr, -1);
  1326. sqlite3_free(zErr);
  1327. return;
  1328. }
  1329. }
  1330. zFormat = zArg;
  1331. }else{
  1332. zFormat = FTS3_MATCHINFO_DEFAULT;
  1333. }
  1334. if( !pCsr->pExpr ){
  1335. sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC);
  1336. return;
  1337. }
  1338. /* Retrieve matchinfo() data. */
  1339. rc = fts3GetMatchinfo(pCsr, zFormat);
  1340. sqlite3Fts3SegmentsClose(pTab);
  1341. if( rc!=SQLITE_OK ){
  1342. sqlite3_result_error_code(pContext, rc);
  1343. }else{
  1344. int n = pCsr->nMatchinfo * sizeof(u32);
  1345. sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT);
  1346. }
  1347. }
  1348. #endif