fts1.c 100 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348
  1. /* fts1 has a design flaw which can lead to database corruption (see
  2. ** below). It is recommended not to use it any longer, instead use
  3. ** fts3 (or higher). If you believe that your use of fts1 is safe,
  4. ** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
  5. */
  6. #if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)) \
  7. && !defined(SQLITE_ENABLE_BROKEN_FTS1)
  8. #error fts1 has a design flaw and has been deprecated.
  9. #endif
  10. /* The flaw is that fts1 uses the content table's unaliased rowid as
  11. ** the unique docid. fts1 embeds the rowid in the index it builds,
  12. ** and expects the rowid to not change. The SQLite VACUUM operation
  13. ** will renumber such rowids, thereby breaking fts1. If you are using
  14. ** fts1 in a system which has disabled VACUUM, then you can continue
  15. ** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
  16. ** VACUUM, though systems using auto_vacuum are unlikely to invoke
  17. ** VACUUM.
  18. **
  19. ** fts1 should be safe even across VACUUM if you only insert documents
  20. ** and never delete.
  21. */
  22. /* The author disclaims copyright to this source code.
  23. *
  24. * This is an SQLite module implementing full-text search.
  25. */
  26. /*
  27. ** The code in this file is only compiled if:
  28. **
  29. ** * The FTS1 module is being built as an extension
  30. ** (in which case SQLITE_CORE is not defined), or
  31. **
  32. ** * The FTS1 module is being built into the core of
  33. ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
  34. */
  35. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
  36. #if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
  37. # define SQLITE_CORE 1
  38. #endif
  39. #include <assert.h>
  40. #include <stdlib.h>
  41. #include <stdio.h>
  42. #include <string.h>
  43. #include <ctype.h>
  44. #include "fts1.h"
  45. #include "fts1_hash.h"
  46. #include "fts1_tokenizer.h"
  47. #include "sqlite3.h"
  48. #include "sqlite3ext.h"
  49. SQLITE_EXTENSION_INIT1
  50. #if 0
  51. # define TRACE(A) printf A; fflush(stdout)
  52. #else
  53. # define TRACE(A)
  54. #endif
  55. /* utility functions */
  56. typedef struct StringBuffer {
  57. int len; /* length, not including null terminator */
  58. int alloced; /* Space allocated for s[] */
  59. char *s; /* Content of the string */
  60. } StringBuffer;
  61. static void initStringBuffer(StringBuffer *sb){
  62. sb->len = 0;
  63. sb->alloced = 100;
  64. sb->s = malloc(100);
  65. sb->s[0] = '\0';
  66. }
  67. static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  68. if( sb->len + nFrom >= sb->alloced ){
  69. sb->alloced = sb->len + nFrom + 100;
  70. sb->s = realloc(sb->s, sb->alloced+1);
  71. if( sb->s==0 ){
  72. initStringBuffer(sb);
  73. return;
  74. }
  75. }
  76. memcpy(sb->s + sb->len, zFrom, nFrom);
  77. sb->len += nFrom;
  78. sb->s[sb->len] = 0;
  79. }
  80. static void append(StringBuffer *sb, const char *zFrom){
  81. nappend(sb, zFrom, strlen(zFrom));
  82. }
  83. /* We encode variable-length integers in little-endian order using seven bits
  84. * per byte as follows:
  85. **
  86. ** KEY:
  87. ** A = 0xxxxxxx 7 bits of data and one flag bit
  88. ** B = 1xxxxxxx 7 bits of data and one flag bit
  89. **
  90. ** 7 bits - A
  91. ** 14 bits - BA
  92. ** 21 bits - BBA
  93. ** and so on.
  94. */
  95. /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
  96. #define VARINT_MAX 10
  97. /* Write a 64-bit variable-length integer to memory starting at p[0].
  98. * The length of data written will be between 1 and VARINT_MAX bytes.
  99. * The number of bytes written is returned. */
  100. static int putVarint(char *p, sqlite_int64 v){
  101. unsigned char *q = (unsigned char *) p;
  102. sqlite_uint64 vu = v;
  103. do{
  104. *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
  105. vu >>= 7;
  106. }while( vu!=0 );
  107. q[-1] &= 0x7f; /* turn off high bit in final byte */
  108. assert( q - (unsigned char *)p <= VARINT_MAX );
  109. return (int) (q - (unsigned char *)p);
  110. }
  111. /* Read a 64-bit variable-length integer from memory starting at p[0].
  112. * Return the number of bytes read, or 0 on error.
  113. * The value is stored in *v. */
  114. static int getVarint(const char *p, sqlite_int64 *v){
  115. const unsigned char *q = (const unsigned char *) p;
  116. sqlite_uint64 x = 0, y = 1;
  117. while( (*q & 0x80) == 0x80 ){
  118. x += y * (*q++ & 0x7f);
  119. y <<= 7;
  120. if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
  121. assert( 0 );
  122. return 0;
  123. }
  124. }
  125. x += y * (*q++);
  126. *v = (sqlite_int64) x;
  127. return (int) (q - (unsigned char *)p);
  128. }
  129. static int getVarint32(const char *p, int *pi){
  130. sqlite_int64 i;
  131. int ret = getVarint(p, &i);
  132. *pi = (int) i;
  133. assert( *pi==i );
  134. return ret;
  135. }
  136. /*** Document lists ***
  137. *
  138. * A document list holds a sorted list of varint-encoded document IDs.
  139. *
  140. * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
  141. *
  142. * array {
  143. * varint docid;
  144. * array {
  145. * varint position; (delta from previous position plus POS_BASE)
  146. * varint startOffset; (delta from previous startOffset)
  147. * varint endOffset; (delta from startOffset)
  148. * }
  149. * }
  150. *
  151. * Here, array { X } means zero or more occurrences of X, adjacent in memory.
  152. *
  153. * A position list may hold positions for text in multiple columns. A position
  154. * POS_COLUMN is followed by a varint containing the index of the column for
  155. * following positions in the list. Any positions appearing before any
  156. * occurrences of POS_COLUMN are for column 0.
  157. *
  158. * A doclist with type DL_POSITIONS is like the above, but holds only docids
  159. * and positions without offset information.
  160. *
  161. * A doclist with type DL_DOCIDS is like the above, but holds only docids
  162. * without positions or offset information.
  163. *
  164. * On disk, every document list has positions and offsets, so we don't bother
  165. * to serialize a doclist's type.
  166. *
  167. * We don't yet delta-encode document IDs; doing so will probably be a
  168. * modest win.
  169. *
  170. * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
  171. * After the first offset, estimate the next offset by using the
  172. * current token position and the previous token position and offset,
  173. * offset to handle some variance. So the estimate would be
  174. * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
  175. * as normal. Offsets more than 64 chars from the estimate are
  176. * encoded as the delta to the previous start offset + 128. An
  177. * additional tiny increment can be gained by using the end offset of
  178. * the previous token to make the estimate a tiny bit more precise.
  179. */
  180. /* It is not safe to call isspace(), tolower(), or isalnum() on
  181. ** hi-bit-set characters. This is the same solution used in the
  182. ** tokenizer.
  183. */
  184. /* TODO(shess) The snippet-generation code should be using the
  185. ** tokenizer-generated tokens rather than doing its own local
  186. ** tokenization.
  187. */
  188. /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
  189. static int safe_isspace(char c){
  190. return (c&0x80)==0 ? isspace(c) : 0;
  191. }
  192. static int safe_tolower(char c){
  193. return (c&0x80)==0 ? tolower(c) : c;
  194. }
  195. static int safe_isalnum(char c){
  196. return (c&0x80)==0 ? isalnum(c) : 0;
  197. }
  198. typedef enum DocListType {
  199. DL_DOCIDS, /* docids only */
  200. DL_POSITIONS, /* docids + positions */
  201. DL_POSITIONS_OFFSETS /* docids + positions + offsets */
  202. } DocListType;
  203. /*
  204. ** By default, only positions and not offsets are stored in the doclists.
  205. ** To change this so that offsets are stored too, compile with
  206. **
  207. ** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
  208. **
  209. */
  210. #ifndef DL_DEFAULT
  211. # define DL_DEFAULT DL_POSITIONS
  212. #endif
  213. typedef struct DocList {
  214. char *pData;
  215. int nData;
  216. DocListType iType;
  217. int iLastColumn; /* the last column written */
  218. int iLastPos; /* the last position written */
  219. int iLastOffset; /* the last start offset written */
  220. } DocList;
  221. enum {
  222. POS_END = 0, /* end of this position list */
  223. POS_COLUMN, /* followed by new column number */
  224. POS_BASE
  225. };
  226. /* Initialize a new DocList to hold the given data. */
  227. static void docListInit(DocList *d, DocListType iType,
  228. const char *pData, int nData){
  229. d->nData = nData;
  230. if( nData>0 ){
  231. d->pData = malloc(nData);
  232. memcpy(d->pData, pData, nData);
  233. } else {
  234. d->pData = NULL;
  235. }
  236. d->iType = iType;
  237. d->iLastColumn = 0;
  238. d->iLastPos = d->iLastOffset = 0;
  239. }
  240. /* Create a new dynamically-allocated DocList. */
  241. static DocList *docListNew(DocListType iType){
  242. DocList *d = (DocList *) malloc(sizeof(DocList));
  243. docListInit(d, iType, 0, 0);
  244. return d;
  245. }
  246. static void docListDestroy(DocList *d){
  247. free(d->pData);
  248. #ifndef NDEBUG
  249. memset(d, 0x55, sizeof(*d));
  250. #endif
  251. }
  252. static void docListDelete(DocList *d){
  253. docListDestroy(d);
  254. free(d);
  255. }
  256. static char *docListEnd(DocList *d){
  257. return d->pData + d->nData;
  258. }
  259. /* Append a varint to a DocList's data. */
  260. static void appendVarint(DocList *d, sqlite_int64 i){
  261. char c[VARINT_MAX];
  262. int n = putVarint(c, i);
  263. d->pData = realloc(d->pData, d->nData + n);
  264. memcpy(d->pData + d->nData, c, n);
  265. d->nData += n;
  266. }
  267. static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
  268. appendVarint(d, iDocid);
  269. if( d->iType>=DL_POSITIONS ){
  270. appendVarint(d, POS_END); /* initially empty position list */
  271. d->iLastColumn = 0;
  272. d->iLastPos = d->iLastOffset = 0;
  273. }
  274. }
  275. /* helper function for docListAddPos and docListAddPosOffset */
  276. static void addPos(DocList *d, int iColumn, int iPos){
  277. assert( d->nData>0 );
  278. --d->nData; /* remove previous terminator */
  279. if( iColumn!=d->iLastColumn ){
  280. assert( iColumn>d->iLastColumn );
  281. appendVarint(d, POS_COLUMN);
  282. appendVarint(d, iColumn);
  283. d->iLastColumn = iColumn;
  284. d->iLastPos = d->iLastOffset = 0;
  285. }
  286. assert( iPos>=d->iLastPos );
  287. appendVarint(d, iPos-d->iLastPos+POS_BASE);
  288. d->iLastPos = iPos;
  289. }
  290. /* Add a position to the last position list in a doclist. */
  291. static void docListAddPos(DocList *d, int iColumn, int iPos){
  292. assert( d->iType==DL_POSITIONS );
  293. addPos(d, iColumn, iPos);
  294. appendVarint(d, POS_END); /* add new terminator */
  295. }
  296. /*
  297. ** Add a position and starting and ending offsets to a doclist.
  298. **
  299. ** If the doclist is setup to handle only positions, then insert
  300. ** the position only and ignore the offsets.
  301. */
  302. static void docListAddPosOffset(
  303. DocList *d, /* Doclist under construction */
  304. int iColumn, /* Column the inserted term is part of */
  305. int iPos, /* Position of the inserted term */
  306. int iStartOffset, /* Starting offset of inserted term */
  307. int iEndOffset /* Ending offset of inserted term */
  308. ){
  309. assert( d->iType>=DL_POSITIONS );
  310. addPos(d, iColumn, iPos);
  311. if( d->iType==DL_POSITIONS_OFFSETS ){
  312. assert( iStartOffset>=d->iLastOffset );
  313. appendVarint(d, iStartOffset-d->iLastOffset);
  314. d->iLastOffset = iStartOffset;
  315. assert( iEndOffset>=iStartOffset );
  316. appendVarint(d, iEndOffset-iStartOffset);
  317. }
  318. appendVarint(d, POS_END); /* add new terminator */
  319. }
  320. /*
  321. ** A DocListReader object is a cursor into a doclist. Initialize
  322. ** the cursor to the beginning of the doclist by calling readerInit().
  323. ** Then use routines
  324. **
  325. ** peekDocid()
  326. ** readDocid()
  327. ** readPosition()
  328. ** skipPositionList()
  329. ** and so forth...
  330. **
  331. ** to read information out of the doclist. When we reach the end
  332. ** of the doclist, atEnd() returns TRUE.
  333. */
  334. typedef struct DocListReader {
  335. DocList *pDoclist; /* The document list we are stepping through */
  336. char *p; /* Pointer to next unread byte in the doclist */
  337. int iLastColumn;
  338. int iLastPos; /* the last position read, or -1 when not in a position list */
  339. } DocListReader;
  340. /*
  341. ** Initialize the DocListReader r to point to the beginning of pDoclist.
  342. */
  343. static void readerInit(DocListReader *r, DocList *pDoclist){
  344. r->pDoclist = pDoclist;
  345. if( pDoclist!=NULL ){
  346. r->p = pDoclist->pData;
  347. }
  348. r->iLastColumn = -1;
  349. r->iLastPos = -1;
  350. }
  351. /*
  352. ** Return TRUE if we have reached then end of pReader and there is
  353. ** nothing else left to read.
  354. */
  355. static int atEnd(DocListReader *pReader){
  356. return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
  357. }
  358. /* Peek at the next docid without advancing the read pointer.
  359. */
  360. static sqlite_int64 peekDocid(DocListReader *pReader){
  361. sqlite_int64 ret;
  362. assert( !atEnd(pReader) );
  363. assert( pReader->iLastPos==-1 );
  364. getVarint(pReader->p, &ret);
  365. return ret;
  366. }
  367. /* Read the next docid. See also nextDocid().
  368. */
  369. static sqlite_int64 readDocid(DocListReader *pReader){
  370. sqlite_int64 ret;
  371. assert( !atEnd(pReader) );
  372. assert( pReader->iLastPos==-1 );
  373. pReader->p += getVarint(pReader->p, &ret);
  374. if( pReader->pDoclist->iType>=DL_POSITIONS ){
  375. pReader->iLastColumn = 0;
  376. pReader->iLastPos = 0;
  377. }
  378. return ret;
  379. }
  380. /* Read the next position and column index from a position list.
  381. * Returns the position, or -1 at the end of the list. */
  382. static int readPosition(DocListReader *pReader, int *iColumn){
  383. int i;
  384. int iType = pReader->pDoclist->iType;
  385. if( pReader->iLastPos==-1 ){
  386. return -1;
  387. }
  388. assert( !atEnd(pReader) );
  389. if( iType<DL_POSITIONS ){
  390. return -1;
  391. }
  392. pReader->p += getVarint32(pReader->p, &i);
  393. if( i==POS_END ){
  394. pReader->iLastColumn = pReader->iLastPos = -1;
  395. *iColumn = -1;
  396. return -1;
  397. }
  398. if( i==POS_COLUMN ){
  399. pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
  400. pReader->iLastPos = 0;
  401. pReader->p += getVarint32(pReader->p, &i);
  402. assert( i>=POS_BASE );
  403. }
  404. pReader->iLastPos += ((int) i)-POS_BASE;
  405. if( iType>=DL_POSITIONS_OFFSETS ){
  406. /* Skip over offsets, ignoring them for now. */
  407. int iStart, iEnd;
  408. pReader->p += getVarint32(pReader->p, &iStart);
  409. pReader->p += getVarint32(pReader->p, &iEnd);
  410. }
  411. *iColumn = pReader->iLastColumn;
  412. return pReader->iLastPos;
  413. }
  414. /* Skip past the end of a position list. */
  415. static void skipPositionList(DocListReader *pReader){
  416. DocList *p = pReader->pDoclist;
  417. if( p && p->iType>=DL_POSITIONS ){
  418. int iColumn;
  419. while( readPosition(pReader, &iColumn)!=-1 ){}
  420. }
  421. }
  422. /* Skip over a docid, including its position list if the doclist has
  423. * positions. */
  424. static void skipDocument(DocListReader *pReader){
  425. readDocid(pReader);
  426. skipPositionList(pReader);
  427. }
  428. /* Skip past all docids which are less than [iDocid]. Returns 1 if a docid
  429. * matching [iDocid] was found. */
  430. static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
  431. sqlite_int64 d = 0;
  432. while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
  433. skipDocument(pReader);
  434. }
  435. return !atEnd(pReader) && d==iDocid;
  436. }
  437. /* Return the first document in a document list.
  438. */
  439. static sqlite_int64 firstDocid(DocList *d){
  440. DocListReader r;
  441. readerInit(&r, d);
  442. return readDocid(&r);
  443. }
  444. #ifdef SQLITE_DEBUG
  445. /*
  446. ** This routine is used for debugging purpose only.
  447. **
  448. ** Write the content of a doclist to standard output.
  449. */
  450. static void printDoclist(DocList *p){
  451. DocListReader r;
  452. const char *zSep = "";
  453. readerInit(&r, p);
  454. while( !atEnd(&r) ){
  455. sqlite_int64 docid = readDocid(&r);
  456. if( docid==0 ){
  457. skipPositionList(&r);
  458. continue;
  459. }
  460. printf("%s%lld", zSep, docid);
  461. zSep = ",";
  462. if( p->iType>=DL_POSITIONS ){
  463. int iPos, iCol;
  464. const char *zDiv = "";
  465. printf("(");
  466. while( (iPos = readPosition(&r, &iCol))>=0 ){
  467. printf("%s%d:%d", zDiv, iCol, iPos);
  468. zDiv = ":";
  469. }
  470. printf(")");
  471. }
  472. }
  473. printf("\n");
  474. fflush(stdout);
  475. }
  476. #endif /* SQLITE_DEBUG */
  477. /* Trim the given doclist to contain only positions in column
  478. * [iRestrictColumn]. */
  479. static void docListRestrictColumn(DocList *in, int iRestrictColumn){
  480. DocListReader r;
  481. DocList out;
  482. assert( in->iType>=DL_POSITIONS );
  483. readerInit(&r, in);
  484. docListInit(&out, DL_POSITIONS, NULL, 0);
  485. while( !atEnd(&r) ){
  486. sqlite_int64 iDocid = readDocid(&r);
  487. int iPos, iColumn;
  488. docListAddDocid(&out, iDocid);
  489. while( (iPos = readPosition(&r, &iColumn)) != -1 ){
  490. if( iColumn==iRestrictColumn ){
  491. docListAddPos(&out, iColumn, iPos);
  492. }
  493. }
  494. }
  495. docListDestroy(in);
  496. *in = out;
  497. }
  498. /* Trim the given doclist by discarding any docids without any remaining
  499. * positions. */
  500. static void docListDiscardEmpty(DocList *in) {
  501. DocListReader r;
  502. DocList out;
  503. /* TODO: It would be nice to implement this operation in place; that
  504. * could save a significant amount of memory in queries with long doclists. */
  505. assert( in->iType>=DL_POSITIONS );
  506. readerInit(&r, in);
  507. docListInit(&out, DL_POSITIONS, NULL, 0);
  508. while( !atEnd(&r) ){
  509. sqlite_int64 iDocid = readDocid(&r);
  510. int match = 0;
  511. int iPos, iColumn;
  512. while( (iPos = readPosition(&r, &iColumn)) != -1 ){
  513. if( !match ){
  514. docListAddDocid(&out, iDocid);
  515. match = 1;
  516. }
  517. docListAddPos(&out, iColumn, iPos);
  518. }
  519. }
  520. docListDestroy(in);
  521. *in = out;
  522. }
  523. /* Helper function for docListUpdate() and docListAccumulate().
  524. ** Splices a doclist element into the doclist represented by r,
  525. ** leaving r pointing after the newly spliced element.
  526. */
  527. static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
  528. const char *pSource, int nSource){
  529. DocList *d = r->pDoclist;
  530. char *pTarget;
  531. int nTarget, found;
  532. found = skipToDocid(r, iDocid);
  533. /* Describe slice in d to place pSource/nSource. */
  534. pTarget = r->p;
  535. if( found ){
  536. skipDocument(r);
  537. nTarget = r->p-pTarget;
  538. }else{
  539. nTarget = 0;
  540. }
  541. /* The sense of the following is that there are three possibilities.
  542. ** If nTarget==nSource, we should not move any memory nor realloc.
  543. ** If nTarget>nSource, trim target and realloc.
  544. ** If nTarget<nSource, realloc then expand target.
  545. */
  546. if( nTarget>nSource ){
  547. memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
  548. }
  549. if( nTarget!=nSource ){
  550. int iDoclist = pTarget-d->pData;
  551. d->pData = realloc(d->pData, d->nData+nSource-nTarget);
  552. pTarget = d->pData+iDoclist;
  553. }
  554. if( nTarget<nSource ){
  555. memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
  556. }
  557. memcpy(pTarget, pSource, nSource);
  558. d->nData += nSource-nTarget;
  559. r->p = pTarget+nSource;
  560. }
  561. /* Insert/update pUpdate into the doclist. */
  562. static void docListUpdate(DocList *d, DocList *pUpdate){
  563. DocListReader reader;
  564. assert( d!=NULL && pUpdate!=NULL );
  565. assert( d->iType==pUpdate->iType);
  566. readerInit(&reader, d);
  567. docListSpliceElement(&reader, firstDocid(pUpdate),
  568. pUpdate->pData, pUpdate->nData);
  569. }
  570. /* Propagate elements from pUpdate to pAcc, overwriting elements with
  571. ** matching docids.
  572. */
  573. static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
  574. DocListReader accReader, updateReader;
  575. /* Handle edge cases where one doclist is empty. */
  576. assert( pAcc!=NULL );
  577. if( pUpdate==NULL || pUpdate->nData==0 ) return;
  578. if( pAcc->nData==0 ){
  579. pAcc->pData = malloc(pUpdate->nData);
  580. memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
  581. pAcc->nData = pUpdate->nData;
  582. return;
  583. }
  584. readerInit(&accReader, pAcc);
  585. readerInit(&updateReader, pUpdate);
  586. while( !atEnd(&updateReader) ){
  587. char *pSource = updateReader.p;
  588. sqlite_int64 iDocid = readDocid(&updateReader);
  589. skipPositionList(&updateReader);
  590. docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
  591. }
  592. }
  593. /*
  594. ** Read the next docid off of pIn. Return 0 if we reach the end.
  595. *
  596. * TODO: This assumes that docids are never 0, but they may actually be 0 since
  597. * users can choose docids when inserting into a full-text table. Fix this.
  598. */
  599. static sqlite_int64 nextDocid(DocListReader *pIn){
  600. skipPositionList(pIn);
  601. return atEnd(pIn) ? 0 : readDocid(pIn);
  602. }
  603. /*
  604. ** pLeft and pRight are two DocListReaders that are pointing to
  605. ** positions lists of the same document: iDocid.
  606. **
  607. ** If there are no instances in pLeft or pRight where the position
  608. ** of pLeft is one less than the position of pRight, then this
  609. ** routine adds nothing to pOut.
  610. **
  611. ** If there are one or more instances where positions from pLeft
  612. ** are exactly one less than positions from pRight, then add a new
  613. ** document record to pOut. If pOut wants to hold positions, then
  614. ** include the positions from pRight that are one more than a
  615. ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
  616. **
  617. ** pLeft and pRight are left pointing at the next document record.
  618. */
  619. static void mergePosList(
  620. DocListReader *pLeft, /* Left position list */
  621. DocListReader *pRight, /* Right position list */
  622. sqlite_int64 iDocid, /* The docid from pLeft and pRight */
  623. DocList *pOut /* Write the merged document record here */
  624. ){
  625. int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
  626. int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
  627. int match = 0;
  628. /* Loop until we've reached the end of both position lists. */
  629. while( iLeftPos!=-1 && iRightPos!=-1 ){
  630. if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
  631. if( !match ){
  632. docListAddDocid(pOut, iDocid);
  633. match = 1;
  634. }
  635. if( pOut->iType>=DL_POSITIONS ){
  636. docListAddPos(pOut, iRightCol, iRightPos);
  637. }
  638. iLeftPos = readPosition(pLeft, &iLeftCol);
  639. iRightPos = readPosition(pRight, &iRightCol);
  640. }else if( iRightCol<iLeftCol ||
  641. (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
  642. iRightPos = readPosition(pRight, &iRightCol);
  643. }else{
  644. iLeftPos = readPosition(pLeft, &iLeftCol);
  645. }
  646. }
  647. if( iLeftPos>=0 ) skipPositionList(pLeft);
  648. if( iRightPos>=0 ) skipPositionList(pRight);
  649. }
  650. /* We have two doclists: pLeft and pRight.
  651. ** Write the phrase intersection of these two doclists into pOut.
  652. **
  653. ** A phrase intersection means that two documents only match
  654. ** if pLeft.iPos+1==pRight.iPos.
  655. **
  656. ** The output pOut may or may not contain positions. If pOut
  657. ** does contain positions, they are the positions of pRight.
  658. */
  659. static void docListPhraseMerge(
  660. DocList *pLeft, /* Doclist resulting from the words on the left */
  661. DocList *pRight, /* Doclist for the next word to the right */
  662. DocList *pOut /* Write the combined doclist here */
  663. ){
  664. DocListReader left, right;
  665. sqlite_int64 docidLeft, docidRight;
  666. readerInit(&left, pLeft);
  667. readerInit(&right, pRight);
  668. docidLeft = nextDocid(&left);
  669. docidRight = nextDocid(&right);
  670. while( docidLeft>0 && docidRight>0 ){
  671. if( docidLeft<docidRight ){
  672. docidLeft = nextDocid(&left);
  673. }else if( docidRight<docidLeft ){
  674. docidRight = nextDocid(&right);
  675. }else{
  676. mergePosList(&left, &right, docidLeft, pOut);
  677. docidLeft = nextDocid(&left);
  678. docidRight = nextDocid(&right);
  679. }
  680. }
  681. }
  682. /* We have two doclists: pLeft and pRight.
  683. ** Write the intersection of these two doclists into pOut.
  684. ** Only docids are matched. Position information is ignored.
  685. **
  686. ** The output pOut never holds positions.
  687. */
  688. static void docListAndMerge(
  689. DocList *pLeft, /* Doclist resulting from the words on the left */
  690. DocList *pRight, /* Doclist for the next word to the right */
  691. DocList *pOut /* Write the combined doclist here */
  692. ){
  693. DocListReader left, right;
  694. sqlite_int64 docidLeft, docidRight;
  695. assert( pOut->iType<DL_POSITIONS );
  696. readerInit(&left, pLeft);
  697. readerInit(&right, pRight);
  698. docidLeft = nextDocid(&left);
  699. docidRight = nextDocid(&right);
  700. while( docidLeft>0 && docidRight>0 ){
  701. if( docidLeft<docidRight ){
  702. docidLeft = nextDocid(&left);
  703. }else if( docidRight<docidLeft ){
  704. docidRight = nextDocid(&right);
  705. }else{
  706. docListAddDocid(pOut, docidLeft);
  707. docidLeft = nextDocid(&left);
  708. docidRight = nextDocid(&right);
  709. }
  710. }
  711. }
  712. /* We have two doclists: pLeft and pRight.
  713. ** Write the union of these two doclists into pOut.
  714. ** Only docids are matched. Position information is ignored.
  715. **
  716. ** The output pOut never holds positions.
  717. */
  718. static void docListOrMerge(
  719. DocList *pLeft, /* Doclist resulting from the words on the left */
  720. DocList *pRight, /* Doclist for the next word to the right */
  721. DocList *pOut /* Write the combined doclist here */
  722. ){
  723. DocListReader left, right;
  724. sqlite_int64 docidLeft, docidRight, priorLeft;
  725. readerInit(&left, pLeft);
  726. readerInit(&right, pRight);
  727. docidLeft = nextDocid(&left);
  728. docidRight = nextDocid(&right);
  729. while( docidLeft>0 && docidRight>0 ){
  730. if( docidLeft<=docidRight ){
  731. docListAddDocid(pOut, docidLeft);
  732. }else{
  733. docListAddDocid(pOut, docidRight);
  734. }
  735. priorLeft = docidLeft;
  736. if( docidLeft<=docidRight ){
  737. docidLeft = nextDocid(&left);
  738. }
  739. if( docidRight>0 && docidRight<=priorLeft ){
  740. docidRight = nextDocid(&right);
  741. }
  742. }
  743. while( docidLeft>0 ){
  744. docListAddDocid(pOut, docidLeft);
  745. docidLeft = nextDocid(&left);
  746. }
  747. while( docidRight>0 ){
  748. docListAddDocid(pOut, docidRight);
  749. docidRight = nextDocid(&right);
  750. }
  751. }
  752. /* We have two doclists: pLeft and pRight.
  753. ** Write into pOut all documents that occur in pLeft but not
  754. ** in pRight.
  755. **
  756. ** Only docids are matched. Position information is ignored.
  757. **
  758. ** The output pOut never holds positions.
  759. */
  760. static void docListExceptMerge(
  761. DocList *pLeft, /* Doclist resulting from the words on the left */
  762. DocList *pRight, /* Doclist for the next word to the right */
  763. DocList *pOut /* Write the combined doclist here */
  764. ){
  765. DocListReader left, right;
  766. sqlite_int64 docidLeft, docidRight, priorLeft;
  767. readerInit(&left, pLeft);
  768. readerInit(&right, pRight);
  769. docidLeft = nextDocid(&left);
  770. docidRight = nextDocid(&right);
  771. while( docidLeft>0 && docidRight>0 ){
  772. priorLeft = docidLeft;
  773. if( docidLeft<docidRight ){
  774. docListAddDocid(pOut, docidLeft);
  775. }
  776. if( docidLeft<=docidRight ){
  777. docidLeft = nextDocid(&left);
  778. }
  779. if( docidRight>0 && docidRight<=priorLeft ){
  780. docidRight = nextDocid(&right);
  781. }
  782. }
  783. while( docidLeft>0 ){
  784. docListAddDocid(pOut, docidLeft);
  785. docidLeft = nextDocid(&left);
  786. }
  787. }
  788. static char *string_dup_n(const char *s, int n){
  789. char *str = malloc(n + 1);
  790. memcpy(str, s, n);
  791. str[n] = '\0';
  792. return str;
  793. }
  794. /* Duplicate a string; the caller must free() the returned string.
  795. * (We don't use strdup() since it is not part of the standard C library and
  796. * may not be available everywhere.) */
  797. static char *string_dup(const char *s){
  798. return string_dup_n(s, strlen(s));
  799. }
  800. /* Format a string, replacing each occurrence of the % character with
  801. * zDb.zName. This may be more convenient than sqlite_mprintf()
  802. * when one string is used repeatedly in a format string.
  803. * The caller must free() the returned string. */
  804. static char *string_format(const char *zFormat,
  805. const char *zDb, const char *zName){
  806. const char *p;
  807. size_t len = 0;
  808. size_t nDb = strlen(zDb);
  809. size_t nName = strlen(zName);
  810. size_t nFullTableName = nDb+1+nName;
  811. char *result;
  812. char *r;
  813. /* first compute length needed */
  814. for(p = zFormat ; *p ; ++p){
  815. len += (*p=='%' ? nFullTableName : 1);
  816. }
  817. len += 1; /* for null terminator */
  818. r = result = malloc(len);
  819. for(p = zFormat; *p; ++p){
  820. if( *p=='%' ){
  821. memcpy(r, zDb, nDb);
  822. r += nDb;
  823. *r++ = '.';
  824. memcpy(r, zName, nName);
  825. r += nName;
  826. } else {
  827. *r++ = *p;
  828. }
  829. }
  830. *r++ = '\0';
  831. assert( r == result + len );
  832. return result;
  833. }
  834. static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
  835. const char *zFormat){
  836. char *zCommand = string_format(zFormat, zDb, zName);
  837. int rc;
  838. TRACE(("FTS1 sql: %s\n", zCommand));
  839. rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
  840. free(zCommand);
  841. return rc;
  842. }
  843. static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
  844. sqlite3_stmt **ppStmt, const char *zFormat){
  845. char *zCommand = string_format(zFormat, zDb, zName);
  846. int rc;
  847. TRACE(("FTS1 prepare: %s\n", zCommand));
  848. rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
  849. free(zCommand);
  850. return rc;
  851. }
  852. /* end utility functions */
  853. /* Forward reference */
  854. typedef struct fulltext_vtab fulltext_vtab;
  855. /* A single term in a query is represented by an instances of
  856. ** the following structure.
  857. */
  858. typedef struct QueryTerm {
  859. short int nPhrase; /* How many following terms are part of the same phrase */
  860. short int iPhrase; /* This is the i-th term of a phrase. */
  861. short int iColumn; /* Column of the index that must match this term */
  862. signed char isOr; /* this term is preceded by "OR" */
  863. signed char isNot; /* this term is preceded by "-" */
  864. char *pTerm; /* text of the term. '\000' terminated. malloced */
  865. int nTerm; /* Number of bytes in pTerm[] */
  866. } QueryTerm;
  867. /* A query string is parsed into a Query structure.
  868. *
  869. * We could, in theory, allow query strings to be complicated
  870. * nested expressions with precedence determined by parentheses.
  871. * But none of the major search engines do this. (Perhaps the
  872. * feeling is that an parenthesized expression is two complex of
  873. * an idea for the average user to grasp.) Taking our lead from
  874. * the major search engines, we will allow queries to be a list
  875. * of terms (with an implied AND operator) or phrases in double-quotes,
  876. * with a single optional "-" before each non-phrase term to designate
  877. * negation and an optional OR connector.
  878. *
  879. * OR binds more tightly than the implied AND, which is what the
  880. * major search engines seem to do. So, for example:
  881. *
  882. * [one two OR three] ==> one AND (two OR three)
  883. * [one OR two three] ==> (one OR two) AND three
  884. *
  885. * A "-" before a term matches all entries that lack that term.
  886. * The "-" must occur immediately before the term with in intervening
  887. * space. This is how the search engines do it.
  888. *
  889. * A NOT term cannot be the right-hand operand of an OR. If this
  890. * occurs in the query string, the NOT is ignored:
  891. *
  892. * [one OR -two] ==> one OR two
  893. *
  894. */
  895. typedef struct Query {
  896. fulltext_vtab *pFts; /* The full text index */
  897. int nTerms; /* Number of terms in the query */
  898. QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
  899. int nextIsOr; /* Set the isOr flag on the next inserted term */
  900. int nextColumn; /* Next word parsed must be in this column */
  901. int dfltColumn; /* The default column */
  902. } Query;
  903. /*
  904. ** An instance of the following structure keeps track of generated
  905. ** matching-word offset information and snippets.
  906. */
  907. typedef struct Snippet {
  908. int nMatch; /* Total number of matches */
  909. int nAlloc; /* Space allocated for aMatch[] */
  910. struct snippetMatch { /* One entry for each matching term */
  911. char snStatus; /* Status flag for use while constructing snippets */
  912. short int iCol; /* The column that contains the match */
  913. short int iTerm; /* The index in Query.pTerms[] of the matching term */
  914. short int nByte; /* Number of bytes in the term */
  915. int iStart; /* The offset to the first character of the term */
  916. } *aMatch; /* Points to space obtained from malloc */
  917. char *zOffset; /* Text rendering of aMatch[] */
  918. int nOffset; /* strlen(zOffset) */
  919. char *zSnippet; /* Snippet text */
  920. int nSnippet; /* strlen(zSnippet) */
  921. } Snippet;
  922. typedef enum QueryType {
  923. QUERY_GENERIC, /* table scan */
  924. QUERY_ROWID, /* lookup by rowid */
  925. QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
  926. } QueryType;
  927. /* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
  928. ** before we start aggregating into larger segments. Lower CHUNK_MAX
  929. ** means that for a given input we have more individual segments per
  930. ** term, which means more rows in the table and a bigger index (due to
  931. ** both more rows and bigger rowids). But it also reduces the average
  932. ** cost of adding new elements to the segment 0 doclist, and it seems
  933. ** to reduce the number of pages read and written during inserts. 256
  934. ** was chosen by measuring insertion times for a certain input (first
  935. ** 10k documents of Enron corpus), though including query performance
  936. ** in the decision may argue for a larger value.
  937. */
  938. #define CHUNK_MAX 256
  939. typedef enum fulltext_statement {
  940. CONTENT_INSERT_STMT,
  941. CONTENT_SELECT_STMT,
  942. CONTENT_UPDATE_STMT,
  943. CONTENT_DELETE_STMT,
  944. TERM_SELECT_STMT,
  945. TERM_SELECT_ALL_STMT,
  946. TERM_INSERT_STMT,
  947. TERM_UPDATE_STMT,
  948. TERM_DELETE_STMT,
  949. MAX_STMT /* Always at end! */
  950. } fulltext_statement;
  951. /* These must exactly match the enum above. */
  952. /* TODO(adam): Is there some risk that a statement (in particular,
  953. ** pTermSelectStmt) will be used in two cursors at once, e.g. if a
  954. ** query joins a virtual table to itself? If so perhaps we should
  955. ** move some of these to the cursor object.
  956. */
  957. static const char *const fulltext_zStatement[MAX_STMT] = {
  958. /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
  959. /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  960. /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
  961. /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
  962. /* TERM_SELECT */
  963. "select rowid, doclist from %_term where term = ? and segment = ?",
  964. /* TERM_SELECT_ALL */
  965. "select doclist from %_term where term = ? order by segment",
  966. /* TERM_INSERT */
  967. "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
  968. /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
  969. /* TERM_DELETE */ "delete from %_term where rowid = ?",
  970. };
  971. /*
  972. ** A connection to a fulltext index is an instance of the following
  973. ** structure. The xCreate and xConnect methods create an instance
  974. ** of this structure and xDestroy and xDisconnect free that instance.
  975. ** All other methods receive a pointer to the structure as one of their
  976. ** arguments.
  977. */
  978. struct fulltext_vtab {
  979. sqlite3_vtab base; /* Base class used by SQLite core */
  980. sqlite3 *db; /* The database connection */
  981. const char *zDb; /* logical database name */
  982. const char *zName; /* virtual table name */
  983. int nColumn; /* number of columns in virtual table */
  984. char **azColumn; /* column names. malloced */
  985. char **azContentColumn; /* column names in content table; malloced */
  986. sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
  987. /* Precompiled statements which we keep as long as the table is
  988. ** open.
  989. */
  990. sqlite3_stmt *pFulltextStatements[MAX_STMT];
  991. };
  992. /*
  993. ** When the core wants to do a query, it create a cursor using a
  994. ** call to xOpen. This structure is an instance of a cursor. It
  995. ** is destroyed by xClose.
  996. */
  997. typedef struct fulltext_cursor {
  998. sqlite3_vtab_cursor base; /* Base class used by SQLite core */
  999. QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
  1000. sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
  1001. int eof; /* True if at End Of Results */
  1002. Query q; /* Parsed query string */
  1003. Snippet snippet; /* Cached snippet for the current row */
  1004. int iColumn; /* Column being searched */
  1005. DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */
  1006. } fulltext_cursor;
  1007. static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
  1008. return (fulltext_vtab *) c->base.pVtab;
  1009. }
  1010. static const sqlite3_module fulltextModule; /* forward declaration */
  1011. /* Append a list of strings separated by commas to a StringBuffer. */
  1012. static void appendList(StringBuffer *sb, int nString, char **azString){
  1013. int i;
  1014. for(i=0; i<nString; ++i){
  1015. if( i>0 ) append(sb, ", ");
  1016. append(sb, azString[i]);
  1017. }
  1018. }
  1019. /* Return a dynamically generated statement of the form
  1020. * insert into %_content (rowid, ...) values (?, ...)
  1021. */
  1022. static const char *contentInsertStatement(fulltext_vtab *v){
  1023. StringBuffer sb;
  1024. int i;
  1025. initStringBuffer(&sb);
  1026. append(&sb, "insert into %_content (rowid, ");
  1027. appendList(&sb, v->nColumn, v->azContentColumn);
  1028. append(&sb, ") values (?");
  1029. for(i=0; i<v->nColumn; ++i)
  1030. append(&sb, ", ?");
  1031. append(&sb, ")");
  1032. return sb.s;
  1033. }
  1034. /* Return a dynamically generated statement of the form
  1035. * update %_content set [col_0] = ?, [col_1] = ?, ...
  1036. * where rowid = ?
  1037. */
  1038. static const char *contentUpdateStatement(fulltext_vtab *v){
  1039. StringBuffer sb;
  1040. int i;
  1041. initStringBuffer(&sb);
  1042. append(&sb, "update %_content set ");
  1043. for(i=0; i<v->nColumn; ++i) {
  1044. if( i>0 ){
  1045. append(&sb, ", ");
  1046. }
  1047. append(&sb, v->azContentColumn[i]);
  1048. append(&sb, " = ?");
  1049. }
  1050. append(&sb, " where rowid = ?");
  1051. return sb.s;
  1052. }
  1053. /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
  1054. ** If the indicated statement has never been prepared, it is prepared
  1055. ** and cached, otherwise the cached version is reset.
  1056. */
  1057. static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
  1058. sqlite3_stmt **ppStmt){
  1059. assert( iStmt<MAX_STMT );
  1060. if( v->pFulltextStatements[iStmt]==NULL ){
  1061. const char *zStmt;
  1062. int rc;
  1063. switch( iStmt ){
  1064. case CONTENT_INSERT_STMT:
  1065. zStmt = contentInsertStatement(v); break;
  1066. case CONTENT_UPDATE_STMT:
  1067. zStmt = contentUpdateStatement(v); break;
  1068. default:
  1069. zStmt = fulltext_zStatement[iStmt];
  1070. }
  1071. rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
  1072. zStmt);
  1073. if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
  1074. if( rc!=SQLITE_OK ) return rc;
  1075. } else {
  1076. int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
  1077. if( rc!=SQLITE_OK ) return rc;
  1078. }
  1079. *ppStmt = v->pFulltextStatements[iStmt];
  1080. return SQLITE_OK;
  1081. }
  1082. /* Step the indicated statement, handling errors SQLITE_BUSY (by
  1083. ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
  1084. ** bindings to the new statement).
  1085. ** TODO(adam): We should extend this function so that it can work with
  1086. ** statements declared locally, not only globally cached statements.
  1087. */
  1088. static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
  1089. sqlite3_stmt **ppStmt){
  1090. int rc;
  1091. sqlite3_stmt *s = *ppStmt;
  1092. assert( iStmt<MAX_STMT );
  1093. assert( s==v->pFulltextStatements[iStmt] );
  1094. while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
  1095. if( rc==SQLITE_BUSY ) continue;
  1096. if( rc!=SQLITE_ERROR ) return rc;
  1097. /* If an SQLITE_SCHEMA error has occurred, then finalizing this
  1098. * statement is going to delete the fulltext_vtab structure. If
  1099. * the statement just executed is in the pFulltextStatements[]
  1100. * array, it will be finalized twice. So remove it before
  1101. * calling sqlite3_finalize().
  1102. */
  1103. v->pFulltextStatements[iStmt] = NULL;
  1104. rc = sqlite3_finalize(s);
  1105. break;
  1106. }
  1107. return rc;
  1108. err:
  1109. sqlite3_finalize(s);
  1110. return rc;
  1111. }
  1112. /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
  1113. ** Useful for statements like UPDATE, where we expect no results.
  1114. */
  1115. static int sql_single_step_statement(fulltext_vtab *v,
  1116. fulltext_statement iStmt,
  1117. sqlite3_stmt **ppStmt){
  1118. int rc = sql_step_statement(v, iStmt, ppStmt);
  1119. return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
  1120. }
  1121. /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
  1122. static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
  1123. sqlite3_value **pValues){
  1124. sqlite3_stmt *s;
  1125. int i;
  1126. int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  1127. if( rc!=SQLITE_OK ) return rc;
  1128. rc = sqlite3_bind_value(s, 1, rowid);
  1129. if( rc!=SQLITE_OK ) return rc;
  1130. for(i=0; i<v->nColumn; ++i){
  1131. rc = sqlite3_bind_value(s, 2+i, pValues[i]);
  1132. if( rc!=SQLITE_OK ) return rc;
  1133. }
  1134. return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
  1135. }
  1136. /* update %_content set col0 = pValues[0], col1 = pValues[1], ...
  1137. * where rowid = [iRowid] */
  1138. static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
  1139. sqlite_int64 iRowid){
  1140. sqlite3_stmt *s;
  1141. int i;
  1142. int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
  1143. if( rc!=SQLITE_OK ) return rc;
  1144. for(i=0; i<v->nColumn; ++i){
  1145. rc = sqlite3_bind_value(s, 1+i, pValues[i]);
  1146. if( rc!=SQLITE_OK ) return rc;
  1147. }
  1148. rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
  1149. if( rc!=SQLITE_OK ) return rc;
  1150. return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
  1151. }
  1152. static void freeStringArray(int nString, const char **pString){
  1153. int i;
  1154. for (i=0 ; i < nString ; ++i) {
  1155. if( pString[i]!=NULL ) free((void *) pString[i]);
  1156. }
  1157. free((void *) pString);
  1158. }
  1159. /* select * from %_content where rowid = [iRow]
  1160. * The caller must delete the returned array and all strings in it.
  1161. * null fields will be NULL in the returned array.
  1162. *
  1163. * TODO: Perhaps we should return pointer/length strings here for consistency
  1164. * with other code which uses pointer/length. */
  1165. static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
  1166. const char ***pValues){
  1167. sqlite3_stmt *s;
  1168. const char **values;
  1169. int i;
  1170. int rc;
  1171. *pValues = NULL;
  1172. rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
  1173. if( rc!=SQLITE_OK ) return rc;
  1174. rc = sqlite3_bind_int64(s, 1, iRow);
  1175. if( rc!=SQLITE_OK ) return rc;
  1176. rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
  1177. if( rc!=SQLITE_ROW ) return rc;
  1178. values = (const char **) malloc(v->nColumn * sizeof(const char *));
  1179. for(i=0; i<v->nColumn; ++i){
  1180. if( sqlite3_column_type(s, i)==SQLITE_NULL ){
  1181. values[i] = NULL;
  1182. }else{
  1183. values[i] = string_dup((char*)sqlite3_column_text(s, i));
  1184. }
  1185. }
  1186. /* We expect only one row. We must execute another sqlite3_step()
  1187. * to complete the iteration; otherwise the table will remain locked. */
  1188. rc = sqlite3_step(s);
  1189. if( rc==SQLITE_DONE ){
  1190. *pValues = values;
  1191. return SQLITE_OK;
  1192. }
  1193. freeStringArray(v->nColumn, values);
  1194. return rc;
  1195. }
  1196. /* delete from %_content where rowid = [iRow ] */
  1197. static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
  1198. sqlite3_stmt *s;
  1199. int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
  1200. if( rc!=SQLITE_OK ) return rc;
  1201. rc = sqlite3_bind_int64(s, 1, iRow);
  1202. if( rc!=SQLITE_OK ) return rc;
  1203. return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
  1204. }
  1205. /* select rowid, doclist from %_term
  1206. * where term = [pTerm] and segment = [iSegment]
  1207. * If found, returns SQLITE_ROW; the caller must free the
  1208. * returned doclist. If no rows found, returns SQLITE_DONE. */
  1209. static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
  1210. int iSegment,
  1211. sqlite_int64 *rowid, DocList *out){
  1212. sqlite3_stmt *s;
  1213. int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
  1214. if( rc!=SQLITE_OK ) return rc;
  1215. rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
  1216. if( rc!=SQLITE_OK ) return rc;
  1217. rc = sqlite3_bind_int(s, 2, iSegment);
  1218. if( rc!=SQLITE_OK ) return rc;
  1219. rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
  1220. if( rc!=SQLITE_ROW ) return rc;
  1221. *rowid = sqlite3_column_int64(s, 0);
  1222. docListInit(out, DL_DEFAULT,
  1223. sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
  1224. /* We expect only one row. We must execute another sqlite3_step()
  1225. * to complete the iteration; otherwise the table will remain locked. */
  1226. rc = sqlite3_step(s);
  1227. return rc==SQLITE_DONE ? SQLITE_ROW : rc;
  1228. }
  1229. /* Load the segment doclists for term pTerm and merge them in
  1230. ** appropriate order into out. Returns SQLITE_OK if successful. If
  1231. ** there are no segments for pTerm, successfully returns an empty
  1232. ** doclist in out.
  1233. **
  1234. ** Each document consists of 1 or more "columns". The number of
  1235. ** columns is v->nColumn. If iColumn==v->nColumn, then return
  1236. ** position information about all columns. If iColumn<v->nColumn,
  1237. ** then only return position information about the iColumn-th column
  1238. ** (where the first column is 0).
  1239. */
  1240. static int term_select_all(
  1241. fulltext_vtab *v, /* The fulltext index we are querying against */
  1242. int iColumn, /* If <nColumn, only look at the iColumn-th column */
  1243. const char *pTerm, /* The term whose posting lists we want */
  1244. int nTerm, /* Number of bytes in pTerm */
  1245. DocList *out /* Write the resulting doclist here */
  1246. ){
  1247. DocList doclist;
  1248. sqlite3_stmt *s;
  1249. int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
  1250. if( rc!=SQLITE_OK ) return rc;
  1251. rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
  1252. if( rc!=SQLITE_OK ) return rc;
  1253. docListInit(&doclist, DL_DEFAULT, 0, 0);
  1254. /* TODO(shess) Handle schema and busy errors. */
  1255. while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
  1256. DocList old;
  1257. /* TODO(shess) If we processed doclists from oldest to newest, we
  1258. ** could skip the malloc() involved with the following call. For
  1259. ** now, I'd rather keep this logic similar to index_insert_term().
  1260. ** We could additionally drop elements when we see deletes, but
  1261. ** that would require a distinct version of docListAccumulate().
  1262. */
  1263. docListInit(&old, DL_DEFAULT,
  1264. sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
  1265. if( iColumn<v->nColumn ){ /* querying a single column */
  1266. docListRestrictColumn(&old, iColumn);
  1267. }
  1268. /* doclist contains the newer data, so write it over old. Then
  1269. ** steal accumulated result for doclist.
  1270. */
  1271. docListAccumulate(&old, &doclist);
  1272. docListDestroy(&doclist);
  1273. doclist = old;
  1274. }
  1275. if( rc!=SQLITE_DONE ){
  1276. docListDestroy(&doclist);
  1277. return rc;
  1278. }
  1279. docListDiscardEmpty(&doclist);
  1280. *out = doclist;
  1281. return SQLITE_OK;
  1282. }
  1283. /* insert into %_term (rowid, term, segment, doclist)
  1284. values ([piRowid], [pTerm], [iSegment], [doclist])
  1285. ** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
  1286. **
  1287. ** NOTE(shess) piRowid is IN, with values of "space of int64" plus
  1288. ** null, it is not used to pass data back to the caller.
  1289. */
  1290. static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
  1291. const char *pTerm, int nTerm,
  1292. int iSegment, DocList *doclist){
  1293. sqlite3_stmt *s;
  1294. int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
  1295. if( rc!=SQLITE_OK ) return rc;
  1296. if( piRowid==NULL ){
  1297. rc = sqlite3_bind_null(s, 1);
  1298. }else{
  1299. rc = sqlite3_bind_int64(s, 1, *piRowid);
  1300. }
  1301. if( rc!=SQLITE_OK ) return rc;
  1302. rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
  1303. if( rc!=SQLITE_OK ) return rc;
  1304. rc = sqlite3_bind_int(s, 3, iSegment);
  1305. if( rc!=SQLITE_OK ) return rc;
  1306. rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
  1307. if( rc!=SQLITE_OK ) return rc;
  1308. return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
  1309. }
  1310. /* update %_term set doclist = [doclist] where rowid = [rowid] */
  1311. static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
  1312. DocList *doclist){
  1313. sqlite3_stmt *s;
  1314. int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
  1315. if( rc!=SQLITE_OK ) return rc;
  1316. rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
  1317. if( rc!=SQLITE_OK ) return rc;
  1318. rc = sqlite3_bind_int64(s, 2, rowid);
  1319. if( rc!=SQLITE_OK ) return rc;
  1320. return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
  1321. }
  1322. static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
  1323. sqlite3_stmt *s;
  1324. int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
  1325. if( rc!=SQLITE_OK ) return rc;
  1326. rc = sqlite3_bind_int64(s, 1, rowid);
  1327. if( rc!=SQLITE_OK ) return rc;
  1328. return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
  1329. }
  1330. /*
  1331. ** Free the memory used to contain a fulltext_vtab structure.
  1332. */
  1333. static void fulltext_vtab_destroy(fulltext_vtab *v){
  1334. int iStmt, i;
  1335. TRACE(("FTS1 Destroy %p\n", v));
  1336. for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
  1337. if( v->pFulltextStatements[iStmt]!=NULL ){
  1338. sqlite3_finalize(v->pFulltextStatements[iStmt]);
  1339. v->pFulltextStatements[iStmt] = NULL;
  1340. }
  1341. }
  1342. if( v->pTokenizer!=NULL ){
  1343. v->pTokenizer->pModule->xDestroy(v->pTokenizer);
  1344. v->pTokenizer = NULL;
  1345. }
  1346. free(v->azColumn);
  1347. for(i = 0; i < v->nColumn; ++i) {
  1348. sqlite3_free(v->azContentColumn[i]);
  1349. }
  1350. free(v->azContentColumn);
  1351. free(v);
  1352. }
  1353. /*
  1354. ** Token types for parsing the arguments to xConnect or xCreate.
  1355. */
  1356. #define TOKEN_EOF 0 /* End of file */
  1357. #define TOKEN_SPACE 1 /* Any kind of whitespace */
  1358. #define TOKEN_ID 2 /* An identifier */
  1359. #define TOKEN_STRING 3 /* A string literal */
  1360. #define TOKEN_PUNCT 4 /* A single punctuation character */
  1361. /*
  1362. ** If X is a character that can be used in an identifier then
  1363. ** IdChar(X) will be true. Otherwise it is false.
  1364. **
  1365. ** For ASCII, any character with the high-order bit set is
  1366. ** allowed in an identifier. For 7-bit characters,
  1367. ** sqlite3IsIdChar[X] must be 1.
  1368. **
  1369. ** Ticket #1066. the SQL standard does not allow '$' in the
  1370. ** middle of identfiers. But many SQL implementations do.
  1371. ** SQLite will allow '$' in identifiers for compatibility.
  1372. ** But the feature is undocumented.
  1373. */
  1374. static const char isIdChar[] = {
  1375. /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
  1376. 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
  1377. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
  1378. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
  1379. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
  1380. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
  1381. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
  1382. };
  1383. #define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
  1384. /*
  1385. ** Return the length of the token that begins at z[0].
  1386. ** Store the token type in *tokenType before returning.
  1387. */
  1388. static int getToken(const char *z, int *tokenType){
  1389. int i, c;
  1390. switch( *z ){
  1391. case 0: {
  1392. *tokenType = TOKEN_EOF;
  1393. return 0;
  1394. }
  1395. case ' ': case '\t': case '\n': case '\f': case '\r': {
  1396. for(i=1; safe_isspace(z[i]); i++){}
  1397. *tokenType = TOKEN_SPACE;
  1398. return i;
  1399. }
  1400. case '`':
  1401. case '\'':
  1402. case '"': {
  1403. int delim = z[0];
  1404. for(i=1; (c=z[i])!=0; i++){
  1405. if( c==delim ){
  1406. if( z[i+1]==delim ){
  1407. i++;
  1408. }else{
  1409. break;
  1410. }
  1411. }
  1412. }
  1413. *tokenType = TOKEN_STRING;
  1414. return i + (c!=0);
  1415. }
  1416. case '[': {
  1417. for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
  1418. *tokenType = TOKEN_ID;
  1419. return i;
  1420. }
  1421. default: {
  1422. if( !IdChar(*z) ){
  1423. break;
  1424. }
  1425. for(i=1; IdChar(z[i]); i++){}
  1426. *tokenType = TOKEN_ID;
  1427. return i;
  1428. }
  1429. }
  1430. *tokenType = TOKEN_PUNCT;
  1431. return 1;
  1432. }
  1433. /*
  1434. ** A token extracted from a string is an instance of the following
  1435. ** structure.
  1436. */
  1437. typedef struct Token {
  1438. const char *z; /* Pointer to token text. Not '\000' terminated */
  1439. short int n; /* Length of the token text in bytes. */
  1440. } Token;
  1441. /*
  1442. ** Given a input string (which is really one of the argv[] parameters
  1443. ** passed into xConnect or xCreate) split the string up into tokens.
  1444. ** Return an array of pointers to '\000' terminated strings, one string
  1445. ** for each non-whitespace token.
  1446. **
  1447. ** The returned array is terminated by a single NULL pointer.
  1448. **
  1449. ** Space to hold the returned array is obtained from a single
  1450. ** malloc and should be freed by passing the return value to free().
  1451. ** The individual strings within the token list are all a part of
  1452. ** the single memory allocation and will all be freed at once.
  1453. */
  1454. static char **tokenizeString(const char *z, int *pnToken){
  1455. int nToken = 0;
  1456. Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
  1457. int n = 1;
  1458. int e, i;
  1459. int totalSize = 0;
  1460. char **azToken;
  1461. char *zCopy;
  1462. while( n>0 ){
  1463. n = getToken(z, &e);
  1464. if( e!=TOKEN_SPACE ){
  1465. aToken[nToken].z = z;
  1466. aToken[nToken].n = n;
  1467. nToken++;
  1468. totalSize += n+1;
  1469. }
  1470. z += n;
  1471. }
  1472. azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
  1473. zCopy = (char*)&azToken[nToken];
  1474. nToken--;
  1475. for(i=0; i<nToken; i++){
  1476. azToken[i] = zCopy;
  1477. n = aToken[i].n;
  1478. memcpy(zCopy, aToken[i].z, n);
  1479. zCopy[n] = 0;
  1480. zCopy += n+1;
  1481. }
  1482. azToken[nToken] = 0;
  1483. free(aToken);
  1484. *pnToken = nToken;
  1485. return azToken;
  1486. }
  1487. /*
  1488. ** Convert an SQL-style quoted string into a normal string by removing
  1489. ** the quote characters. The conversion is done in-place. If the
  1490. ** input does not begin with a quote character, then this routine
  1491. ** is a no-op.
  1492. **
  1493. ** Examples:
  1494. **
  1495. ** "abc" becomes abc
  1496. ** 'xyz' becomes xyz
  1497. ** [pqr] becomes pqr
  1498. ** `mno` becomes mno
  1499. */
  1500. static void dequoteString(char *z){
  1501. int quote;
  1502. int i, j;
  1503. if( z==0 ) return;
  1504. quote = z[0];
  1505. switch( quote ){
  1506. case '\'': break;
  1507. case '"': break;
  1508. case '`': break; /* For MySQL compatibility */
  1509. case '[': quote = ']'; break; /* For MS SqlServer compatibility */
  1510. default: return;
  1511. }
  1512. for(i=1, j=0; z[i]; i++){
  1513. if( z[i]==quote ){
  1514. if( z[i+1]==quote ){
  1515. z[j++] = quote;
  1516. i++;
  1517. }else{
  1518. z[j++] = 0;
  1519. break;
  1520. }
  1521. }else{
  1522. z[j++] = z[i];
  1523. }
  1524. }
  1525. }
  1526. /*
  1527. ** The input azIn is a NULL-terminated list of tokens. Remove the first
  1528. ** token and all punctuation tokens. Remove the quotes from
  1529. ** around string literal tokens.
  1530. **
  1531. ** Example:
  1532. **
  1533. ** input: tokenize chinese ( 'simplifed' , 'mixed' )
  1534. ** output: chinese simplifed mixed
  1535. **
  1536. ** Another example:
  1537. **
  1538. ** input: delimiters ( '[' , ']' , '...' )
  1539. ** output: [ ] ...
  1540. */
  1541. static void tokenListToIdList(char **azIn){
  1542. int i, j;
  1543. if( azIn ){
  1544. for(i=0, j=-1; azIn[i]; i++){
  1545. if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
  1546. dequoteString(azIn[i]);
  1547. if( j>=0 ){
  1548. azIn[j] = azIn[i];
  1549. }
  1550. j++;
  1551. }
  1552. }
  1553. azIn[j] = 0;
  1554. }
  1555. }
  1556. /*
  1557. ** Find the first alphanumeric token in the string zIn. Null-terminate
  1558. ** this token. Remove any quotation marks. And return a pointer to
  1559. ** the result.
  1560. */
  1561. static char *firstToken(char *zIn, char **pzTail){
  1562. int n, ttype;
  1563. while(1){
  1564. n = getToken(zIn, &ttype);
  1565. if( ttype==TOKEN_SPACE ){
  1566. zIn += n;
  1567. }else if( ttype==TOKEN_EOF ){
  1568. *pzTail = zIn;
  1569. return 0;
  1570. }else{
  1571. zIn[n] = 0;
  1572. *pzTail = &zIn[1];
  1573. dequoteString(zIn);
  1574. return zIn;
  1575. }
  1576. }
  1577. /*NOTREACHED*/
  1578. }
  1579. /* Return true if...
  1580. **
  1581. ** * s begins with the string t, ignoring case
  1582. ** * s is longer than t
  1583. ** * The first character of s beyond t is not a alphanumeric
  1584. **
  1585. ** Ignore leading space in *s.
  1586. **
  1587. ** To put it another way, return true if the first token of
  1588. ** s[] is t[].
  1589. */
  1590. static int startsWith(const char *s, const char *t){
  1591. while( safe_isspace(*s) ){ s++; }
  1592. while( *t ){
  1593. if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
  1594. }
  1595. return *s!='_' && !safe_isalnum(*s);
  1596. }
  1597. /*
  1598. ** An instance of this structure defines the "spec" of a
  1599. ** full text index. This structure is populated by parseSpec
  1600. ** and use by fulltextConnect and fulltextCreate.
  1601. */
  1602. typedef struct TableSpec {
  1603. const char *zDb; /* Logical database name */
  1604. const char *zName; /* Name of the full-text index */
  1605. int nColumn; /* Number of columns to be indexed */
  1606. char **azColumn; /* Original names of columns to be indexed */
  1607. char **azContentColumn; /* Column names for %_content */
  1608. char **azTokenizer; /* Name of tokenizer and its arguments */
  1609. } TableSpec;
  1610. /*
  1611. ** Reclaim all of the memory used by a TableSpec
  1612. */
  1613. static void clearTableSpec(TableSpec *p) {
  1614. free(p->azColumn);
  1615. free(p->azContentColumn);
  1616. free(p->azTokenizer);
  1617. }
  1618. /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
  1619. *
  1620. * CREATE VIRTUAL TABLE email
  1621. * USING fts1(subject, body, tokenize mytokenizer(myarg))
  1622. *
  1623. * We return parsed information in a TableSpec structure.
  1624. *
  1625. */
  1626. static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
  1627. char**pzErr){
  1628. int i, n;
  1629. char *z, *zDummy;
  1630. char **azArg;
  1631. const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
  1632. assert( argc>=3 );
  1633. /* Current interface:
  1634. ** argv[0] - module name
  1635. ** argv[1] - database name
  1636. ** argv[2] - table name
  1637. ** argv[3..] - columns, optionally followed by tokenizer specification
  1638. ** and snippet delimiters specification.
  1639. */
  1640. /* Make a copy of the complete argv[][] array in a single allocation.
  1641. ** The argv[][] array is read-only and transient. We can write to the
  1642. ** copy in order to modify things and the copy is persistent.
  1643. */
  1644. memset(pSpec, 0, sizeof(*pSpec));
  1645. for(i=n=0; i<argc; i++){
  1646. n += strlen(argv[i]) + 1;
  1647. }
  1648. azArg = malloc( sizeof(char*)*argc + n );
  1649. if( azArg==0 ){
  1650. return SQLITE_NOMEM;
  1651. }
  1652. z = (char*)&azArg[argc];
  1653. for(i=0; i<argc; i++){
  1654. azArg[i] = z;
  1655. strcpy(z, argv[i]);
  1656. z += strlen(z)+1;
  1657. }
  1658. /* Identify the column names and the tokenizer and delimiter arguments
  1659. ** in the argv[][] array.
  1660. */
  1661. pSpec->zDb = azArg[1];
  1662. pSpec->zName = azArg[2];
  1663. pSpec->nColumn = 0;
  1664. pSpec->azColumn = azArg;
  1665. zTokenizer = "tokenize simple";
  1666. for(i=3; i<argc; ++i){
  1667. if( startsWith(azArg[i],"tokenize") ){
  1668. zTokenizer = azArg[i];
  1669. }else{
  1670. z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
  1671. pSpec->nColumn++;
  1672. }
  1673. }
  1674. if( pSpec->nColumn==0 ){
  1675. azArg[0] = "content";
  1676. pSpec->nColumn = 1;
  1677. }
  1678. /*
  1679. ** Construct the list of content column names.
  1680. **
  1681. ** Each content column name will be of the form cNNAAAA
  1682. ** where NN is the column number and AAAA is the sanitized
  1683. ** column name. "sanitized" means that special characters are
  1684. ** converted to "_". The cNN prefix guarantees that all column
  1685. ** names are unique.
  1686. **
  1687. ** The AAAA suffix is not strictly necessary. It is included
  1688. ** for the convenience of people who might examine the generated
  1689. ** %_content table and wonder what the columns are used for.
  1690. */
  1691. pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
  1692. if( pSpec->azContentColumn==0 ){
  1693. clearTableSpec(pSpec);
  1694. return SQLITE_NOMEM;
  1695. }
  1696. for(i=0; i<pSpec->nColumn; i++){
  1697. char *p;
  1698. pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
  1699. for (p = pSpec->azContentColumn[i]; *p ; ++p) {
  1700. if( !safe_isalnum(*p) ) *p = '_';
  1701. }
  1702. }
  1703. /*
  1704. ** Parse the tokenizer specification string.
  1705. */
  1706. pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  1707. tokenListToIdList(pSpec->azTokenizer);
  1708. return SQLITE_OK;
  1709. }
  1710. /*
  1711. ** Generate a CREATE TABLE statement that describes the schema of
  1712. ** the virtual table. Return a pointer to this schema string.
  1713. **
  1714. ** Space is obtained from sqlite3_mprintf() and should be freed
  1715. ** using sqlite3_free().
  1716. */
  1717. static char *fulltextSchema(
  1718. int nColumn, /* Number of columns */
  1719. const char *const* azColumn, /* List of columns */
  1720. const char *zTableName /* Name of the table */
  1721. ){
  1722. int i;
  1723. char *zSchema, *zNext;
  1724. const char *zSep = "(";
  1725. zSchema = sqlite3_mprintf("CREATE TABLE x");
  1726. for(i=0; i<nColumn; i++){
  1727. zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
  1728. sqlite3_free(zSchema);
  1729. zSchema = zNext;
  1730. zSep = ",";
  1731. }
  1732. zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
  1733. sqlite3_free(zSchema);
  1734. return zNext;
  1735. }
  1736. /*
  1737. ** Build a new sqlite3_vtab structure that will describe the
  1738. ** fulltext index defined by spec.
  1739. */
  1740. static int constructVtab(
  1741. sqlite3 *db, /* The SQLite database connection */
  1742. TableSpec *spec, /* Parsed spec information from parseSpec() */
  1743. sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
  1744. char **pzErr /* Write any error message here */
  1745. ){
  1746. int rc;
  1747. int n;
  1748. fulltext_vtab *v = 0;
  1749. const sqlite3_tokenizer_module *m = NULL;
  1750. char *schema;
  1751. v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
  1752. if( v==0 ) return SQLITE_NOMEM;
  1753. memset(v, 0, sizeof(*v));
  1754. /* sqlite will initialize v->base */
  1755. v->db = db;
  1756. v->zDb = spec->zDb; /* Freed when azColumn is freed */
  1757. v->zName = spec->zName; /* Freed when azColumn is freed */
  1758. v->nColumn = spec->nColumn;
  1759. v->azContentColumn = spec->azContentColumn;
  1760. spec->azContentColumn = 0;
  1761. v->azColumn = spec->azColumn;
  1762. spec->azColumn = 0;
  1763. if( spec->azTokenizer==0 ){
  1764. return SQLITE_NOMEM;
  1765. }
  1766. /* TODO(shess) For now, add new tokenizers as else if clauses. */
  1767. if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
  1768. sqlite3Fts1SimpleTokenizerModule(&m);
  1769. }else if( startsWith(spec->azTokenizer[0], "porter") ){
  1770. sqlite3Fts1PorterTokenizerModule(&m);
  1771. }else{
  1772. *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
  1773. rc = SQLITE_ERROR;
  1774. goto err;
  1775. }
  1776. for(n=0; spec->azTokenizer[n]; n++){}
  1777. if( n ){
  1778. rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
  1779. &v->pTokenizer);
  1780. }else{
  1781. rc = m->xCreate(0, 0, &v->pTokenizer);
  1782. }
  1783. if( rc!=SQLITE_OK ) goto err;
  1784. v->pTokenizer->pModule = m;
  1785. /* TODO: verify the existence of backing tables foo_content, foo_term */
  1786. schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
  1787. spec->zName);
  1788. rc = sqlite3_declare_vtab(db, schema);
  1789. sqlite3_free(schema);
  1790. if( rc!=SQLITE_OK ) goto err;
  1791. memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
  1792. *ppVTab = &v->base;
  1793. TRACE(("FTS1 Connect %p\n", v));
  1794. return rc;
  1795. err:
  1796. fulltext_vtab_destroy(v);
  1797. return rc;
  1798. }
  1799. static int fulltextConnect(
  1800. sqlite3 *db,
  1801. void *pAux,
  1802. int argc, const char *const*argv,
  1803. sqlite3_vtab **ppVTab,
  1804. char **pzErr
  1805. ){
  1806. TableSpec spec;
  1807. int rc = parseSpec(&spec, argc, argv, pzErr);
  1808. if( rc!=SQLITE_OK ) return rc;
  1809. rc = constructVtab(db, &spec, ppVTab, pzErr);
  1810. clearTableSpec(&spec);
  1811. return rc;
  1812. }
  1813. /* The %_content table holds the text of each document, with
  1814. ** the rowid used as the docid.
  1815. **
  1816. ** The %_term table maps each term to a document list blob
  1817. ** containing elements sorted by ascending docid, each element
  1818. ** encoded as:
  1819. **
  1820. ** docid varint-encoded
  1821. ** token elements:
  1822. ** position+1 varint-encoded as delta from previous position
  1823. ** start offset varint-encoded as delta from previous start offset
  1824. ** end offset varint-encoded as delta from start offset
  1825. **
  1826. ** The sentinel position of 0 indicates the end of the token list.
  1827. **
  1828. ** Additionally, doclist blobs are chunked into multiple segments,
  1829. ** using segment to order the segments. New elements are added to
  1830. ** the segment at segment 0, until it exceeds CHUNK_MAX. Then
  1831. ** segment 0 is deleted, and the doclist is inserted at segment 1.
  1832. ** If there is already a doclist at segment 1, the segment 0 doclist
  1833. ** is merged with it, the segment 1 doclist is deleted, and the
  1834. ** merged doclist is inserted at segment 2, repeating those
  1835. ** operations until an insert succeeds.
  1836. **
  1837. ** Since this structure doesn't allow us to update elements in place
  1838. ** in case of deletion or update, these are simply written to
  1839. ** segment 0 (with an empty token list in case of deletion), with
  1840. ** docListAccumulate() taking care to retain lower-segment
  1841. ** information in preference to higher-segment information.
  1842. */
  1843. /* TODO(shess) Provide a VACUUM type operation which both removes
  1844. ** deleted elements which are no longer necessary, and duplicated
  1845. ** elements. I suspect this will probably not be necessary in
  1846. ** practice, though.
  1847. */
  1848. static int fulltextCreate(sqlite3 *db, void *pAux,
  1849. int argc, const char * const *argv,
  1850. sqlite3_vtab **ppVTab, char **pzErr){
  1851. int rc;
  1852. TableSpec spec;
  1853. StringBuffer schema;
  1854. TRACE(("FTS1 Create\n"));
  1855. rc = parseSpec(&spec, argc, argv, pzErr);
  1856. if( rc!=SQLITE_OK ) return rc;
  1857. initStringBuffer(&schema);
  1858. append(&schema, "CREATE TABLE %_content(");
  1859. appendList(&schema, spec.nColumn, spec.azContentColumn);
  1860. append(&schema, ")");
  1861. rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
  1862. free(schema.s);
  1863. if( rc!=SQLITE_OK ) goto out;
  1864. rc = sql_exec(db, spec.zDb, spec.zName,
  1865. "create table %_term(term text, segment integer, doclist blob, "
  1866. "primary key(term, segment));");
  1867. if( rc!=SQLITE_OK ) goto out;
  1868. rc = constructVtab(db, &spec, ppVTab, pzErr);
  1869. out:
  1870. clearTableSpec(&spec);
  1871. return rc;
  1872. }
  1873. /* Decide how to handle an SQL query. */
  1874. static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
  1875. int i;
  1876. TRACE(("FTS1 BestIndex\n"));
  1877. for(i=0; i<pInfo->nConstraint; ++i){
  1878. const struct sqlite3_index_constraint *pConstraint;
  1879. pConstraint = &pInfo->aConstraint[i];
  1880. if( pConstraint->usable ) {
  1881. if( pConstraint->iColumn==-1 &&
  1882. pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
  1883. pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
  1884. TRACE(("FTS1 QUERY_ROWID\n"));
  1885. } else if( pConstraint->iColumn>=0 &&
  1886. pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
  1887. /* full-text search */
  1888. pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
  1889. TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
  1890. } else continue;
  1891. pInfo->aConstraintUsage[i].argvIndex = 1;
  1892. pInfo->aConstraintUsage[i].omit = 1;
  1893. /* An arbitrary value for now.
  1894. * TODO: Perhaps rowid matches should be considered cheaper than
  1895. * full-text searches. */
  1896. pInfo->estimatedCost = 1.0;
  1897. return SQLITE_OK;
  1898. }
  1899. }
  1900. pInfo->idxNum = QUERY_GENERIC;
  1901. return SQLITE_OK;
  1902. }
  1903. static int fulltextDisconnect(sqlite3_vtab *pVTab){
  1904. TRACE(("FTS1 Disconnect %p\n", pVTab));
  1905. fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  1906. return SQLITE_OK;
  1907. }
  1908. static int fulltextDestroy(sqlite3_vtab *pVTab){
  1909. fulltext_vtab *v = (fulltext_vtab *)pVTab;
  1910. int rc;
  1911. TRACE(("FTS1 Destroy %p\n", pVTab));
  1912. rc = sql_exec(v->db, v->zDb, v->zName,
  1913. "drop table if exists %_content;"
  1914. "drop table if exists %_term;"
  1915. );
  1916. if( rc!=SQLITE_OK ) return rc;
  1917. fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  1918. return SQLITE_OK;
  1919. }
  1920. static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
  1921. fulltext_cursor *c;
  1922. c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
  1923. /* sqlite will initialize c->base */
  1924. *ppCursor = &c->base;
  1925. TRACE(("FTS1 Open %p: %p\n", pVTab, c));
  1926. return SQLITE_OK;
  1927. }
  1928. /* Free all of the dynamically allocated memory held by *q
  1929. */
  1930. static void queryClear(Query *q){
  1931. int i;
  1932. for(i = 0; i < q->nTerms; ++i){
  1933. free(q->pTerms[i].pTerm);
  1934. }
  1935. free(q->pTerms);
  1936. memset(q, 0, sizeof(*q));
  1937. }
  1938. /* Free all of the dynamically allocated memory held by the
  1939. ** Snippet
  1940. */
  1941. static void snippetClear(Snippet *p){
  1942. free(p->aMatch);
  1943. free(p->zOffset);
  1944. free(p->zSnippet);
  1945. memset(p, 0, sizeof(*p));
  1946. }
  1947. /*
  1948. ** Append a single entry to the p->aMatch[] log.
  1949. */
  1950. static void snippetAppendMatch(
  1951. Snippet *p, /* Append the entry to this snippet */
  1952. int iCol, int iTerm, /* The column and query term */
  1953. int iStart, int nByte /* Offset and size of the match */
  1954. ){
  1955. int i;
  1956. struct snippetMatch *pMatch;
  1957. if( p->nMatch+1>=p->nAlloc ){
  1958. p->nAlloc = p->nAlloc*2 + 10;
  1959. p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
  1960. if( p->aMatch==0 ){
  1961. p->nMatch = 0;
  1962. p->nAlloc = 0;
  1963. return;
  1964. }
  1965. }
  1966. i = p->nMatch++;
  1967. pMatch = &p->aMatch[i];
  1968. pMatch->iCol = iCol;
  1969. pMatch->iTerm = iTerm;
  1970. pMatch->iStart = iStart;
  1971. pMatch->nByte = nByte;
  1972. }
  1973. /*
  1974. ** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
  1975. */
  1976. #define FTS1_ROTOR_SZ (32)
  1977. #define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
  1978. /*
  1979. ** Add entries to pSnippet->aMatch[] for every match that occurs against
  1980. ** document zDoc[0..nDoc-1] which is stored in column iColumn.
  1981. */
  1982. static void snippetOffsetsOfColumn(
  1983. Query *pQuery,
  1984. Snippet *pSnippet,
  1985. int iColumn,
  1986. const char *zDoc,
  1987. int nDoc
  1988. ){
  1989. const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
  1990. sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
  1991. sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
  1992. fulltext_vtab *pVtab; /* The full text index */
  1993. int nColumn; /* Number of columns in the index */
  1994. const QueryTerm *aTerm; /* Query string terms */
  1995. int nTerm; /* Number of query string terms */
  1996. int i, j; /* Loop counters */
  1997. int rc; /* Return code */
  1998. unsigned int match, prevMatch; /* Phrase search bitmasks */
  1999. const char *zToken; /* Next token from the tokenizer */
  2000. int nToken; /* Size of zToken */
  2001. int iBegin, iEnd, iPos; /* Offsets of beginning and end */
  2002. /* The following variables keep a circular buffer of the last
  2003. ** few tokens */
  2004. unsigned int iRotor = 0; /* Index of current token */
  2005. int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */
  2006. int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */
  2007. pVtab = pQuery->pFts;
  2008. nColumn = pVtab->nColumn;
  2009. pTokenizer = pVtab->pTokenizer;
  2010. pTModule = pTokenizer->pModule;
  2011. rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  2012. if( rc ) return;
  2013. pTCursor->pTokenizer = pTokenizer;
  2014. aTerm = pQuery->pTerms;
  2015. nTerm = pQuery->nTerms;
  2016. if( nTerm>=FTS1_ROTOR_SZ ){
  2017. nTerm = FTS1_ROTOR_SZ - 1;
  2018. }
  2019. prevMatch = 0;
  2020. while(1){
  2021. rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
  2022. if( rc ) break;
  2023. iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
  2024. iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
  2025. match = 0;
  2026. for(i=0; i<nTerm; i++){
  2027. int iCol;
  2028. iCol = aTerm[i].iColumn;
  2029. if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
  2030. if( aTerm[i].nTerm!=nToken ) continue;
  2031. if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
  2032. if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
  2033. match |= 1<<i;
  2034. if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
  2035. for(j=aTerm[i].iPhrase-1; j>=0; j--){
  2036. int k = (iRotor-j) & FTS1_ROTOR_MASK;
  2037. snippetAppendMatch(pSnippet, iColumn, i-j,
  2038. iRotorBegin[k], iRotorLen[k]);
  2039. }
  2040. }
  2041. }
  2042. prevMatch = match<<1;
  2043. iRotor++;
  2044. }
  2045. pTModule->xClose(pTCursor);
  2046. }
  2047. /*
  2048. ** Compute all offsets for the current row of the query.
  2049. ** If the offsets have already been computed, this routine is a no-op.
  2050. */
  2051. static void snippetAllOffsets(fulltext_cursor *p){
  2052. int nColumn;
  2053. int iColumn, i;
  2054. int iFirst, iLast;
  2055. fulltext_vtab *pFts;
  2056. if( p->snippet.nMatch ) return;
  2057. if( p->q.nTerms==0 ) return;
  2058. pFts = p->q.pFts;
  2059. nColumn = pFts->nColumn;
  2060. iColumn = p->iCursorType - QUERY_FULLTEXT;
  2061. if( iColumn<0 || iColumn>=nColumn ){
  2062. iFirst = 0;
  2063. iLast = nColumn-1;
  2064. }else{
  2065. iFirst = iColumn;
  2066. iLast = iColumn;
  2067. }
  2068. for(i=iFirst; i<=iLast; i++){
  2069. const char *zDoc;
  2070. int nDoc;
  2071. zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
  2072. nDoc = sqlite3_column_bytes(p->pStmt, i+1);
  2073. snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
  2074. }
  2075. }
  2076. /*
  2077. ** Convert the information in the aMatch[] array of the snippet
  2078. ** into the string zOffset[0..nOffset-1].
  2079. */
  2080. static void snippetOffsetText(Snippet *p){
  2081. int i;
  2082. int cnt = 0;
  2083. StringBuffer sb;
  2084. char zBuf[200];
  2085. if( p->zOffset ) return;
  2086. initStringBuffer(&sb);
  2087. for(i=0; i<p->nMatch; i++){
  2088. struct snippetMatch *pMatch = &p->aMatch[i];
  2089. zBuf[0] = ' ';
  2090. sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
  2091. pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
  2092. append(&sb, zBuf);
  2093. cnt++;
  2094. }
  2095. p->zOffset = sb.s;
  2096. p->nOffset = sb.len;
  2097. }
  2098. /*
  2099. ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
  2100. ** of matching words some of which might be in zDoc. zDoc is column
  2101. ** number iCol.
  2102. **
  2103. ** iBreak is suggested spot in zDoc where we could begin or end an
  2104. ** excerpt. Return a value similar to iBreak but possibly adjusted
  2105. ** to be a little left or right so that the break point is better.
  2106. */
  2107. static int wordBoundary(
  2108. int iBreak, /* The suggested break point */
  2109. const char *zDoc, /* Document text */
  2110. int nDoc, /* Number of bytes in zDoc[] */
  2111. struct snippetMatch *aMatch, /* Matching words */
  2112. int nMatch, /* Number of entries in aMatch[] */
  2113. int iCol /* The column number for zDoc[] */
  2114. ){
  2115. int i;
  2116. if( iBreak<=10 ){
  2117. return 0;
  2118. }
  2119. if( iBreak>=nDoc-10 ){
  2120. return nDoc;
  2121. }
  2122. for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  2123. while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  2124. if( i<nMatch ){
  2125. if( aMatch[i].iStart<iBreak+10 ){
  2126. return aMatch[i].iStart;
  2127. }
  2128. if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
  2129. return aMatch[i-1].iStart;
  2130. }
  2131. }
  2132. for(i=1; i<=10; i++){
  2133. if( safe_isspace(zDoc[iBreak-i]) ){
  2134. return iBreak - i + 1;
  2135. }
  2136. if( safe_isspace(zDoc[iBreak+i]) ){
  2137. return iBreak + i + 1;
  2138. }
  2139. }
  2140. return iBreak;
  2141. }
  2142. /*
  2143. ** If the StringBuffer does not end in white space, add a single
  2144. ** space character to the end.
  2145. */
  2146. static void appendWhiteSpace(StringBuffer *p){
  2147. if( p->len==0 ) return;
  2148. if( safe_isspace(p->s[p->len-1]) ) return;
  2149. append(p, " ");
  2150. }
  2151. /*
  2152. ** Remove white space from teh end of the StringBuffer
  2153. */
  2154. static void trimWhiteSpace(StringBuffer *p){
  2155. while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
  2156. p->len--;
  2157. }
  2158. }
  2159. /*
  2160. ** Allowed values for Snippet.aMatch[].snStatus
  2161. */
  2162. #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
  2163. #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
  2164. /*
  2165. ** Generate the text of a snippet.
  2166. */
  2167. static void snippetText(
  2168. fulltext_cursor *pCursor, /* The cursor we need the snippet for */
  2169. const char *zStartMark, /* Markup to appear before each match */
  2170. const char *zEndMark, /* Markup to appear after each match */
  2171. const char *zEllipsis /* Ellipsis mark */
  2172. ){
  2173. int i, j;
  2174. struct snippetMatch *aMatch;
  2175. int nMatch;
  2176. int nDesired;
  2177. StringBuffer sb;
  2178. int tailCol;
  2179. int tailOffset;
  2180. int iCol;
  2181. int nDoc;
  2182. const char *zDoc;
  2183. int iStart, iEnd;
  2184. int tailEllipsis = 0;
  2185. int iMatch;
  2186. free(pCursor->snippet.zSnippet);
  2187. pCursor->snippet.zSnippet = 0;
  2188. aMatch = pCursor->snippet.aMatch;
  2189. nMatch = pCursor->snippet.nMatch;
  2190. initStringBuffer(&sb);
  2191. for(i=0; i<nMatch; i++){
  2192. aMatch[i].snStatus = SNIPPET_IGNORE;
  2193. }
  2194. nDesired = 0;
  2195. for(i=0; i<pCursor->q.nTerms; i++){
  2196. for(j=0; j<nMatch; j++){
  2197. if( aMatch[j].iTerm==i ){
  2198. aMatch[j].snStatus = SNIPPET_DESIRED;
  2199. nDesired++;
  2200. break;
  2201. }
  2202. }
  2203. }
  2204. iMatch = 0;
  2205. tailCol = -1;
  2206. tailOffset = 0;
  2207. for(i=0; i<nMatch && nDesired>0; i++){
  2208. if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
  2209. nDesired--;
  2210. iCol = aMatch[i].iCol;
  2211. zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
  2212. nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
  2213. iStart = aMatch[i].iStart - 40;
  2214. iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
  2215. if( iStart<=10 ){
  2216. iStart = 0;
  2217. }
  2218. if( iCol==tailCol && iStart<=tailOffset+20 ){
  2219. iStart = tailOffset;
  2220. }
  2221. if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
  2222. trimWhiteSpace(&sb);
  2223. appendWhiteSpace(&sb);
  2224. append(&sb, zEllipsis);
  2225. appendWhiteSpace(&sb);
  2226. }
  2227. iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
  2228. iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
  2229. if( iEnd>=nDoc-10 ){
  2230. iEnd = nDoc;
  2231. tailEllipsis = 0;
  2232. }else{
  2233. tailEllipsis = 1;
  2234. }
  2235. while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
  2236. while( iStart<iEnd ){
  2237. while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
  2238. && aMatch[iMatch].iCol<=iCol ){
  2239. iMatch++;
  2240. }
  2241. if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
  2242. && aMatch[iMatch].iCol==iCol ){
  2243. nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
  2244. iStart = aMatch[iMatch].iStart;
  2245. append(&sb, zStartMark);
  2246. nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
  2247. append(&sb, zEndMark);
  2248. iStart += aMatch[iMatch].nByte;
  2249. for(j=iMatch+1; j<nMatch; j++){
  2250. if( aMatch[j].iTerm==aMatch[iMatch].iTerm
  2251. && aMatch[j].snStatus==SNIPPET_DESIRED ){
  2252. nDesired--;
  2253. aMatch[j].snStatus = SNIPPET_IGNORE;
  2254. }
  2255. }
  2256. }else{
  2257. nappend(&sb, &zDoc[iStart], iEnd - iStart);
  2258. iStart = iEnd;
  2259. }
  2260. }
  2261. tailCol = iCol;
  2262. tailOffset = iEnd;
  2263. }
  2264. trimWhiteSpace(&sb);
  2265. if( tailEllipsis ){
  2266. appendWhiteSpace(&sb);
  2267. append(&sb, zEllipsis);
  2268. }
  2269. pCursor->snippet.zSnippet = sb.s;
  2270. pCursor->snippet.nSnippet = sb.len;
  2271. }
  2272. /*
  2273. ** Close the cursor. For additional information see the documentation
  2274. ** on the xClose method of the virtual table interface.
  2275. */
  2276. static int fulltextClose(sqlite3_vtab_cursor *pCursor){
  2277. fulltext_cursor *c = (fulltext_cursor *) pCursor;
  2278. TRACE(("FTS1 Close %p\n", c));
  2279. sqlite3_finalize(c->pStmt);
  2280. queryClear(&c->q);
  2281. snippetClear(&c->snippet);
  2282. if( c->result.pDoclist!=NULL ){
  2283. docListDelete(c->result.pDoclist);
  2284. }
  2285. free(c);
  2286. return SQLITE_OK;
  2287. }
  2288. static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  2289. fulltext_cursor *c = (fulltext_cursor *) pCursor;
  2290. sqlite_int64 iDocid;
  2291. int rc;
  2292. TRACE(("FTS1 Next %p\n", pCursor));
  2293. snippetClear(&c->snippet);
  2294. if( c->iCursorType < QUERY_FULLTEXT ){
  2295. /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
  2296. rc = sqlite3_step(c->pStmt);
  2297. switch( rc ){
  2298. case SQLITE_ROW:
  2299. c->eof = 0;
  2300. return SQLITE_OK;
  2301. case SQLITE_DONE:
  2302. c->eof = 1;
  2303. return SQLITE_OK;
  2304. default:
  2305. c->eof = 1;
  2306. return rc;
  2307. }
  2308. } else { /* full-text query */
  2309. rc = sqlite3_reset(c->pStmt);
  2310. if( rc!=SQLITE_OK ) return rc;
  2311. iDocid = nextDocid(&c->result);
  2312. if( iDocid==0 ){
  2313. c->eof = 1;
  2314. return SQLITE_OK;
  2315. }
  2316. rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
  2317. if( rc!=SQLITE_OK ) return rc;
  2318. /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
  2319. rc = sqlite3_step(c->pStmt);
  2320. if( rc==SQLITE_ROW ){ /* the case we expect */
  2321. c->eof = 0;
  2322. return SQLITE_OK;
  2323. }
  2324. /* an error occurred; abort */
  2325. return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  2326. }
  2327. }
  2328. /* Return a DocList corresponding to the query term *pTerm. If *pTerm
  2329. ** is the first term of a phrase query, go ahead and evaluate the phrase
  2330. ** query and return the doclist for the entire phrase query.
  2331. **
  2332. ** The result is stored in pTerm->doclist.
  2333. */
  2334. static int docListOfTerm(
  2335. fulltext_vtab *v, /* The full text index */
  2336. int iColumn, /* column to restrict to. No restrition if >=nColumn */
  2337. QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
  2338. DocList **ppResult /* Write the result here */
  2339. ){
  2340. DocList *pLeft, *pRight, *pNew;
  2341. int i, rc;
  2342. pLeft = docListNew(DL_POSITIONS);
  2343. rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
  2344. if( rc ){
  2345. docListDelete(pLeft);
  2346. return rc;
  2347. }
  2348. for(i=1; i<=pQTerm->nPhrase; i++){
  2349. pRight = docListNew(DL_POSITIONS);
  2350. rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
  2351. if( rc ){
  2352. docListDelete(pLeft);
  2353. return rc;
  2354. }
  2355. pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
  2356. docListPhraseMerge(pLeft, pRight, pNew);
  2357. docListDelete(pLeft);
  2358. docListDelete(pRight);
  2359. pLeft = pNew;
  2360. }
  2361. *ppResult = pLeft;
  2362. return SQLITE_OK;
  2363. }
  2364. /* Add a new term pTerm[0..nTerm-1] to the query *q.
  2365. */
  2366. static void queryAdd(Query *q, const char *pTerm, int nTerm){
  2367. QueryTerm *t;
  2368. ++q->nTerms;
  2369. q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
  2370. if( q->pTerms==0 ){
  2371. q->nTerms = 0;
  2372. return;
  2373. }
  2374. t = &q->pTerms[q->nTerms - 1];
  2375. memset(t, 0, sizeof(*t));
  2376. t->pTerm = malloc(nTerm+1);
  2377. memcpy(t->pTerm, pTerm, nTerm);
  2378. t->pTerm[nTerm] = 0;
  2379. t->nTerm = nTerm;
  2380. t->isOr = q->nextIsOr;
  2381. q->nextIsOr = 0;
  2382. t->iColumn = q->nextColumn;
  2383. q->nextColumn = q->dfltColumn;
  2384. }
  2385. /*
  2386. ** Check to see if the string zToken[0...nToken-1] matches any
  2387. ** column name in the virtual table. If it does,
  2388. ** return the zero-indexed column number. If not, return -1.
  2389. */
  2390. static int checkColumnSpecifier(
  2391. fulltext_vtab *pVtab, /* The virtual table */
  2392. const char *zToken, /* Text of the token */
  2393. int nToken /* Number of characters in the token */
  2394. ){
  2395. int i;
  2396. for(i=0; i<pVtab->nColumn; i++){
  2397. if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
  2398. && pVtab->azColumn[i][nToken]==0 ){
  2399. return i;
  2400. }
  2401. }
  2402. return -1;
  2403. }
  2404. /*
  2405. ** Parse the text at pSegment[0..nSegment-1]. Add additional terms
  2406. ** to the query being assemblied in pQuery.
  2407. **
  2408. ** inPhrase is true if pSegment[0..nSegement-1] is contained within
  2409. ** double-quotes. If inPhrase is true, then the first term
  2410. ** is marked with the number of terms in the phrase less one and
  2411. ** OR and "-" syntax is ignored. If inPhrase is false, then every
  2412. ** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
  2413. */
  2414. static int tokenizeSegment(
  2415. sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
  2416. const char *pSegment, int nSegment, /* Query expression being parsed */
  2417. int inPhrase, /* True if within "..." */
  2418. Query *pQuery /* Append results here */
  2419. ){
  2420. const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  2421. sqlite3_tokenizer_cursor *pCursor;
  2422. int firstIndex = pQuery->nTerms;
  2423. int iCol;
  2424. int nTerm = 1;
  2425. int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  2426. if( rc!=SQLITE_OK ) return rc;
  2427. pCursor->pTokenizer = pTokenizer;
  2428. while( 1 ){
  2429. const char *pToken;
  2430. int nToken, iBegin, iEnd, iPos;
  2431. rc = pModule->xNext(pCursor,
  2432. &pToken, &nToken,
  2433. &iBegin, &iEnd, &iPos);
  2434. if( rc!=SQLITE_OK ) break;
  2435. if( !inPhrase &&
  2436. pSegment[iEnd]==':' &&
  2437. (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
  2438. pQuery->nextColumn = iCol;
  2439. continue;
  2440. }
  2441. if( !inPhrase && pQuery->nTerms>0 && nToken==2
  2442. && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
  2443. pQuery->nextIsOr = 1;
  2444. continue;
  2445. }
  2446. queryAdd(pQuery, pToken, nToken);
  2447. if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
  2448. pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
  2449. }
  2450. pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
  2451. if( inPhrase ){
  2452. nTerm++;
  2453. }
  2454. }
  2455. if( inPhrase && pQuery->nTerms>firstIndex ){
  2456. pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  2457. }
  2458. return pModule->xClose(pCursor);
  2459. }
  2460. /* Parse a query string, yielding a Query object pQuery.
  2461. **
  2462. ** The calling function will need to queryClear() to clean up
  2463. ** the dynamically allocated memory held by pQuery.
  2464. */
  2465. static int parseQuery(
  2466. fulltext_vtab *v, /* The fulltext index */
  2467. const char *zInput, /* Input text of the query string */
  2468. int nInput, /* Size of the input text */
  2469. int dfltColumn, /* Default column of the index to match against */
  2470. Query *pQuery /* Write the parse results here. */
  2471. ){
  2472. int iInput, inPhrase = 0;
  2473. if( zInput==0 ) nInput = 0;
  2474. if( nInput<0 ) nInput = strlen(zInput);
  2475. pQuery->nTerms = 0;
  2476. pQuery->pTerms = NULL;
  2477. pQuery->nextIsOr = 0;
  2478. pQuery->nextColumn = dfltColumn;
  2479. pQuery->dfltColumn = dfltColumn;
  2480. pQuery->pFts = v;
  2481. for(iInput=0; iInput<nInput; ++iInput){
  2482. int i;
  2483. for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
  2484. if( i>iInput ){
  2485. tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
  2486. pQuery);
  2487. }
  2488. iInput = i;
  2489. if( i<nInput ){
  2490. assert( zInput[i]=='"' );
  2491. inPhrase = !inPhrase;
  2492. }
  2493. }
  2494. if( inPhrase ){
  2495. /* unmatched quote */
  2496. queryClear(pQuery);
  2497. return SQLITE_ERROR;
  2498. }
  2499. return SQLITE_OK;
  2500. }
  2501. /* Perform a full-text query using the search expression in
  2502. ** zInput[0..nInput-1]. Return a list of matching documents
  2503. ** in pResult.
  2504. **
  2505. ** Queries must match column iColumn. Or if iColumn>=nColumn
  2506. ** they are allowed to match against any column.
  2507. */
  2508. static int fulltextQuery(
  2509. fulltext_vtab *v, /* The full text index */
  2510. int iColumn, /* Match against this column by default */
  2511. const char *zInput, /* The query string */
  2512. int nInput, /* Number of bytes in zInput[] */
  2513. DocList **pResult, /* Write the result doclist here */
  2514. Query *pQuery /* Put parsed query string here */
  2515. ){
  2516. int i, iNext, rc;
  2517. DocList *pLeft = NULL;
  2518. DocList *pRight, *pNew, *pOr;
  2519. int nNot = 0;
  2520. QueryTerm *aTerm;
  2521. rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  2522. if( rc!=SQLITE_OK ) return rc;
  2523. /* Merge AND terms. */
  2524. aTerm = pQuery->pTerms;
  2525. for(i = 0; i<pQuery->nTerms; i=iNext){
  2526. if( aTerm[i].isNot ){
  2527. /* Handle all NOT terms in a separate pass */
  2528. nNot++;
  2529. iNext = i + aTerm[i].nPhrase+1;
  2530. continue;
  2531. }
  2532. iNext = i + aTerm[i].nPhrase + 1;
  2533. rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
  2534. if( rc ){
  2535. queryClear(pQuery);
  2536. return rc;
  2537. }
  2538. while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
  2539. rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
  2540. iNext += aTerm[iNext].nPhrase + 1;
  2541. if( rc ){
  2542. queryClear(pQuery);
  2543. return rc;
  2544. }
  2545. pNew = docListNew(DL_DOCIDS);
  2546. docListOrMerge(pRight, pOr, pNew);
  2547. docListDelete(pRight);
  2548. docListDelete(pOr);
  2549. pRight = pNew;
  2550. }
  2551. if( pLeft==0 ){
  2552. pLeft = pRight;
  2553. }else{
  2554. pNew = docListNew(DL_DOCIDS);
  2555. docListAndMerge(pLeft, pRight, pNew);
  2556. docListDelete(pRight);
  2557. docListDelete(pLeft);
  2558. pLeft = pNew;
  2559. }
  2560. }
  2561. if( nNot && pLeft==0 ){
  2562. /* We do not yet know how to handle a query of only NOT terms */
  2563. return SQLITE_ERROR;
  2564. }
  2565. /* Do the EXCEPT terms */
  2566. for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
  2567. if( !aTerm[i].isNot ) continue;
  2568. rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
  2569. if( rc ){
  2570. queryClear(pQuery);
  2571. docListDelete(pLeft);
  2572. return rc;
  2573. }
  2574. pNew = docListNew(DL_DOCIDS);
  2575. docListExceptMerge(pLeft, pRight, pNew);
  2576. docListDelete(pRight);
  2577. docListDelete(pLeft);
  2578. pLeft = pNew;
  2579. }
  2580. *pResult = pLeft;
  2581. return rc;
  2582. }
  2583. /*
  2584. ** This is the xFilter interface for the virtual table. See
  2585. ** the virtual table xFilter method documentation for additional
  2586. ** information.
  2587. **
  2588. ** If idxNum==QUERY_GENERIC then do a full table scan against
  2589. ** the %_content table.
  2590. **
  2591. ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
  2592. ** in the %_content table.
  2593. **
  2594. ** If idxNum>=QUERY_FULLTEXT then use the full text index. The
  2595. ** column on the left-hand side of the MATCH operator is column
  2596. ** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
  2597. ** side of the MATCH operator.
  2598. */
  2599. /* TODO(shess) Upgrade the cursor initialization and destruction to
  2600. ** account for fulltextFilter() being called multiple times on the
  2601. ** same cursor. The current solution is very fragile. Apply fix to
  2602. ** fts2 as appropriate.
  2603. */
  2604. static int fulltextFilter(
  2605. sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
  2606. int idxNum, const char *idxStr, /* Which indexing scheme to use */
  2607. int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
  2608. ){
  2609. fulltext_cursor *c = (fulltext_cursor *) pCursor;
  2610. fulltext_vtab *v = cursor_vtab(c);
  2611. int rc;
  2612. char *zSql;
  2613. TRACE(("FTS1 Filter %p\n",pCursor));
  2614. zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
  2615. idxNum==QUERY_GENERIC ? "" : "where rowid=?");
  2616. sqlite3_finalize(c->pStmt);
  2617. rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
  2618. sqlite3_free(zSql);
  2619. if( rc!=SQLITE_OK ) return rc;
  2620. c->iCursorType = idxNum;
  2621. switch( idxNum ){
  2622. case QUERY_GENERIC:
  2623. break;
  2624. case QUERY_ROWID:
  2625. rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
  2626. if( rc!=SQLITE_OK ) return rc;
  2627. break;
  2628. default: /* full-text search */
  2629. {
  2630. const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
  2631. DocList *pResult;
  2632. assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
  2633. assert( argc==1 );
  2634. queryClear(&c->q);
  2635. rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
  2636. if( rc!=SQLITE_OK ) return rc;
  2637. if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
  2638. readerInit(&c->result, pResult);
  2639. break;
  2640. }
  2641. }
  2642. return fulltextNext(pCursor);
  2643. }
  2644. /* This is the xEof method of the virtual table. The SQLite core
  2645. ** calls this routine to find out if it has reached the end of
  2646. ** a query's results set.
  2647. */
  2648. static int fulltextEof(sqlite3_vtab_cursor *pCursor){
  2649. fulltext_cursor *c = (fulltext_cursor *) pCursor;
  2650. return c->eof;
  2651. }
  2652. /* This is the xColumn method of the virtual table. The SQLite
  2653. ** core calls this method during a query when it needs the value
  2654. ** of a column from the virtual table. This method needs to use
  2655. ** one of the sqlite3_result_*() routines to store the requested
  2656. ** value back in the pContext.
  2657. */
  2658. static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
  2659. sqlite3_context *pContext, int idxCol){
  2660. fulltext_cursor *c = (fulltext_cursor *) pCursor;
  2661. fulltext_vtab *v = cursor_vtab(c);
  2662. if( idxCol<v->nColumn ){
  2663. sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
  2664. sqlite3_result_value(pContext, pVal);
  2665. }else if( idxCol==v->nColumn ){
  2666. /* The extra column whose name is the same as the table.
  2667. ** Return a blob which is a pointer to the cursor
  2668. */
  2669. sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
  2670. }
  2671. return SQLITE_OK;
  2672. }
  2673. /* This is the xRowid method. The SQLite core calls this routine to
  2674. ** retrive the rowid for the current row of the result set. The
  2675. ** rowid should be written to *pRowid.
  2676. */
  2677. static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
  2678. fulltext_cursor *c = (fulltext_cursor *) pCursor;
  2679. *pRowid = sqlite3_column_int64(c->pStmt, 0);
  2680. return SQLITE_OK;
  2681. }
  2682. /* Add all terms in [zText] to the given hash table. If [iColumn] > 0,
  2683. * we also store positions and offsets in the hash table using the given
  2684. * column number. */
  2685. static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
  2686. const char *zText, int iColumn){
  2687. sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  2688. sqlite3_tokenizer_cursor *pCursor;
  2689. const char *pToken;
  2690. int nTokenBytes;
  2691. int iStartOffset, iEndOffset, iPosition;
  2692. int rc;
  2693. rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  2694. if( rc!=SQLITE_OK ) return rc;
  2695. pCursor->pTokenizer = pTokenizer;
  2696. while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
  2697. &pToken, &nTokenBytes,
  2698. &iStartOffset, &iEndOffset,
  2699. &iPosition) ){
  2700. DocList *p;
  2701. /* Positions can't be negative; we use -1 as a terminator internally. */
  2702. if( iPosition<0 ){
  2703. pTokenizer->pModule->xClose(pCursor);
  2704. return SQLITE_ERROR;
  2705. }
  2706. p = fts1HashFind(terms, pToken, nTokenBytes);
  2707. if( p==NULL ){
  2708. p = docListNew(DL_DEFAULT);
  2709. docListAddDocid(p, iDocid);
  2710. fts1HashInsert(terms, pToken, nTokenBytes, p);
  2711. }
  2712. if( iColumn>=0 ){
  2713. docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
  2714. }
  2715. }
  2716. /* TODO(shess) Check return? Should this be able to cause errors at
  2717. ** this point? Actually, same question about sqlite3_finalize(),
  2718. ** though one could argue that failure there means that the data is
  2719. ** not durable. *ponder*
  2720. */
  2721. pTokenizer->pModule->xClose(pCursor);
  2722. return rc;
  2723. }
  2724. /* Update the %_terms table to map the term [pTerm] to the given rowid. */
  2725. static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
  2726. DocList *d){
  2727. sqlite_int64 iIndexRow;
  2728. DocList doclist;
  2729. int iSegment = 0, rc;
  2730. rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
  2731. if( rc==SQLITE_DONE ){
  2732. docListInit(&doclist, DL_DEFAULT, 0, 0);
  2733. docListUpdate(&doclist, d);
  2734. /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
  2735. rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
  2736. goto err;
  2737. }
  2738. if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
  2739. docListUpdate(&doclist, d);
  2740. if( doclist.nData<=CHUNK_MAX ){
  2741. rc = term_update(v, iIndexRow, &doclist);
  2742. goto err;
  2743. }
  2744. /* Doclist doesn't fit, delete what's there, and accumulate
  2745. ** forward.
  2746. */
  2747. rc = term_delete(v, iIndexRow);
  2748. if( rc!=SQLITE_OK ) goto err;
  2749. /* Try to insert the doclist into a higher segment bucket. On
  2750. ** failure, accumulate existing doclist with the doclist from that
  2751. ** bucket, and put results in the next bucket.
  2752. */
  2753. iSegment++;
  2754. while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
  2755. &doclist))!=SQLITE_OK ){
  2756. sqlite_int64 iSegmentRow;
  2757. DocList old;
  2758. int rc2;
  2759. /* Retain old error in case the term_insert() error was really an
  2760. ** error rather than a bounced insert.
  2761. */
  2762. rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
  2763. if( rc2!=SQLITE_ROW ) goto err;
  2764. rc = term_delete(v, iSegmentRow);
  2765. if( rc!=SQLITE_OK ) goto err;
  2766. /* Reusing lowest-number deleted row keeps the index smaller. */
  2767. if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;
  2768. /* doclist contains the newer data, so accumulate it over old.
  2769. ** Then steal accumulated data for doclist.
  2770. */
  2771. docListAccumulate(&old, &doclist);
  2772. docListDestroy(&doclist);
  2773. doclist = old;
  2774. iSegment++;
  2775. }
  2776. err:
  2777. docListDestroy(&doclist);
  2778. return rc;
  2779. }
  2780. /* Add doclists for all terms in [pValues] to the hash table [terms]. */
  2781. static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid,
  2782. sqlite3_value **pValues){
  2783. int i;
  2784. for(i = 0; i < v->nColumn ; ++i){
  2785. char *zText = (char*)sqlite3_value_text(pValues[i]);
  2786. int rc = buildTerms(v, terms, iRowid, zText, i);
  2787. if( rc!=SQLITE_OK ) return rc;
  2788. }
  2789. return SQLITE_OK;
  2790. }
  2791. /* Add empty doclists for all terms in the given row's content to the hash
  2792. * table [pTerms]. */
  2793. static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
  2794. const char **pValues;
  2795. int i;
  2796. int rc = content_select(v, iRowid, &pValues);
  2797. if( rc!=SQLITE_OK ) return rc;
  2798. for(i = 0 ; i < v->nColumn; ++i) {
  2799. rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
  2800. if( rc!=SQLITE_OK ) break;
  2801. }
  2802. freeStringArray(v->nColumn, pValues);
  2803. return SQLITE_OK;
  2804. }
  2805. /* Insert a row into the %_content table; set *piRowid to be the ID of the
  2806. * new row. Fill [pTerms] with new doclists for the %_term table. */
  2807. static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
  2808. sqlite3_value **pValues,
  2809. sqlite_int64 *piRowid, fts1Hash *pTerms){
  2810. int rc;
  2811. rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
  2812. if( rc!=SQLITE_OK ) return rc;
  2813. *piRowid = sqlite3_last_insert_rowid(v->db);
  2814. return insertTerms(v, pTerms, *piRowid, pValues);
  2815. }
  2816. /* Delete a row from the %_content table; fill [pTerms] with empty doclists
  2817. * to be written to the %_term table. */
  2818. static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
  2819. int rc = deleteTerms(v, pTerms, iRow);
  2820. if( rc!=SQLITE_OK ) return rc;
  2821. return content_delete(v, iRow); /* execute an SQL DELETE */
  2822. }
  2823. /* Update a row in the %_content table; fill [pTerms] with new doclists for the
  2824. * %_term table. */
  2825. static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
  2826. sqlite3_value **pValues, fts1Hash *pTerms){
  2827. /* Generate an empty doclist for each term that previously appeared in this
  2828. * row. */
  2829. int rc = deleteTerms(v, pTerms, iRow);
  2830. if( rc!=SQLITE_OK ) return rc;
  2831. rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
  2832. if( rc!=SQLITE_OK ) return rc;
  2833. /* Now add positions for terms which appear in the updated row. */
  2834. return insertTerms(v, pTerms, iRow, pValues);
  2835. }
  2836. /* This function implements the xUpdate callback; it is the top-level entry
  2837. * point for inserting, deleting or updating a row in a full-text table. */
  2838. static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
  2839. sqlite_int64 *pRowid){
  2840. fulltext_vtab *v = (fulltext_vtab *) pVtab;
  2841. fts1Hash terms; /* maps term string -> PosList */
  2842. int rc;
  2843. fts1HashElem *e;
  2844. TRACE(("FTS1 Update %p\n", pVtab));
  2845. fts1HashInit(&terms, FTS1_HASH_STRING, 1);
  2846. if( nArg<2 ){
  2847. rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
  2848. } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
  2849. /* An update:
  2850. * ppArg[0] = old rowid
  2851. * ppArg[1] = new rowid
  2852. * ppArg[2..2+v->nColumn-1] = values
  2853. * ppArg[2+v->nColumn] = value for magic column (we ignore this)
  2854. */
  2855. sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
  2856. if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
  2857. sqlite3_value_int64(ppArg[1]) != rowid ){
  2858. rc = SQLITE_ERROR; /* we don't allow changing the rowid */
  2859. } else {
  2860. assert( nArg==2+v->nColumn+1);
  2861. rc = index_update(v, rowid, &ppArg[2], &terms);
  2862. }
  2863. } else {
  2864. /* An insert:
  2865. * ppArg[1] = requested rowid
  2866. * ppArg[2..2+v->nColumn-1] = values
  2867. * ppArg[2+v->nColumn] = value for magic column (we ignore this)
  2868. */
  2869. assert( nArg==2+v->nColumn+1);
  2870. rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
  2871. }
  2872. if( rc==SQLITE_OK ){
  2873. /* Write updated doclists to disk. */
  2874. for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
  2875. DocList *p = fts1HashData(e);
  2876. rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
  2877. if( rc!=SQLITE_OK ) break;
  2878. }
  2879. }
  2880. /* clean up */
  2881. for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
  2882. DocList *p = fts1HashData(e);
  2883. docListDelete(p);
  2884. }
  2885. fts1HashClear(&terms);
  2886. return rc;
  2887. }
  2888. /*
  2889. ** Implementation of the snippet() function for FTS1
  2890. */
  2891. static void snippetFunc(
  2892. sqlite3_context *pContext,
  2893. int argc,
  2894. sqlite3_value **argv
  2895. ){
  2896. fulltext_cursor *pCursor;
  2897. if( argc<1 ) return;
  2898. if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
  2899. sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
  2900. sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
  2901. }else{
  2902. const char *zStart = "<b>";
  2903. const char *zEnd = "</b>";
  2904. const char *zEllipsis = "<b>...</b>";
  2905. memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
  2906. if( argc>=2 ){
  2907. zStart = (const char*)sqlite3_value_text(argv[1]);
  2908. if( argc>=3 ){
  2909. zEnd = (const char*)sqlite3_value_text(argv[2]);
  2910. if( argc>=4 ){
  2911. zEllipsis = (const char*)sqlite3_value_text(argv[3]);
  2912. }
  2913. }
  2914. }
  2915. snippetAllOffsets(pCursor);
  2916. snippetText(pCursor, zStart, zEnd, zEllipsis);
  2917. sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
  2918. pCursor->snippet.nSnippet, SQLITE_STATIC);
  2919. }
  2920. }
  2921. /*
  2922. ** Implementation of the offsets() function for FTS1
  2923. */
  2924. static void snippetOffsetsFunc(
  2925. sqlite3_context *pContext,
  2926. int argc,
  2927. sqlite3_value **argv
  2928. ){
  2929. fulltext_cursor *pCursor;
  2930. if( argc<1 ) return;
  2931. if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
  2932. sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
  2933. sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
  2934. }else{
  2935. memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
  2936. snippetAllOffsets(pCursor);
  2937. snippetOffsetText(&pCursor->snippet);
  2938. sqlite3_result_text(pContext,
  2939. pCursor->snippet.zOffset, pCursor->snippet.nOffset,
  2940. SQLITE_STATIC);
  2941. }
  2942. }
  2943. /*
  2944. ** This routine implements the xFindFunction method for the FTS1
  2945. ** virtual table.
  2946. */
  2947. static int fulltextFindFunction(
  2948. sqlite3_vtab *pVtab,
  2949. int nArg,
  2950. const char *zName,
  2951. void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
  2952. void **ppArg
  2953. ){
  2954. if( strcmp(zName,"snippet")==0 ){
  2955. *pxFunc = snippetFunc;
  2956. return 1;
  2957. }else if( strcmp(zName,"offsets")==0 ){
  2958. *pxFunc = snippetOffsetsFunc;
  2959. return 1;
  2960. }
  2961. return 0;
  2962. }
  2963. /*
  2964. ** Rename an fts1 table.
  2965. */
  2966. static int fulltextRename(
  2967. sqlite3_vtab *pVtab,
  2968. const char *zName
  2969. ){
  2970. fulltext_vtab *p = (fulltext_vtab *)pVtab;
  2971. int rc = SQLITE_NOMEM;
  2972. char *zSql = sqlite3_mprintf(
  2973. "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
  2974. "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
  2975. , p->zDb, p->zName, zName
  2976. , p->zDb, p->zName, zName
  2977. );
  2978. if( zSql ){
  2979. rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
  2980. sqlite3_free(zSql);
  2981. }
  2982. return rc;
  2983. }
  2984. static const sqlite3_module fulltextModule = {
  2985. /* iVersion */ 0,
  2986. /* xCreate */ fulltextCreate,
  2987. /* xConnect */ fulltextConnect,
  2988. /* xBestIndex */ fulltextBestIndex,
  2989. /* xDisconnect */ fulltextDisconnect,
  2990. /* xDestroy */ fulltextDestroy,
  2991. /* xOpen */ fulltextOpen,
  2992. /* xClose */ fulltextClose,
  2993. /* xFilter */ fulltextFilter,
  2994. /* xNext */ fulltextNext,
  2995. /* xEof */ fulltextEof,
  2996. /* xColumn */ fulltextColumn,
  2997. /* xRowid */ fulltextRowid,
  2998. /* xUpdate */ fulltextUpdate,
  2999. /* xBegin */ 0,
  3000. /* xSync */ 0,
  3001. /* xCommit */ 0,
  3002. /* xRollback */ 0,
  3003. /* xFindFunction */ fulltextFindFunction,
  3004. /* xRename */ fulltextRename,
  3005. };
  3006. int sqlite3Fts1Init(sqlite3 *db){
  3007. sqlite3_overload_function(db, "snippet", -1);
  3008. sqlite3_overload_function(db, "offsets", -1);
  3009. return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
  3010. }
  3011. #if !SQLITE_CORE
  3012. #ifdef _WIN32
  3013. __declspec(dllexport)
  3014. #endif
  3015. int sqlite3_fts1_init(sqlite3 *db, char **pzErrMsg,
  3016. const sqlite3_api_routines *pApi){
  3017. SQLITE_EXTENSION_INIT2(pApi)
  3018. return sqlite3Fts1Init(db);
  3019. }
  3020. #endif
  3021. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */