fts3_tokenizer1.c 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. /*
  2. ** 2006 Oct 10
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** Implementation of the "simple" full-text-search tokenizer.
  14. */
  15. /*
  16. ** The code in this file is only compiled if:
  17. **
  18. ** * The FTS3 module is being built as an extension
  19. ** (in which case SQLITE_CORE is not defined), or
  20. **
  21. ** * The FTS3 module is being built into the core of
  22. ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
  23. */
  24. #include "fts3Int.h"
  25. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
  26. #include <assert.h>
  27. #include <stdlib.h>
  28. #include <stdio.h>
  29. #include <string.h>
  30. #include "fts3_tokenizer.h"
/*
** A "simple" tokenizer instance.
**
** The generic sqlite3_tokenizer header is the first member; the rest of
** this file converts between sqlite3_tokenizer* and simple_tokenizer*
** with plain pointer casts, which relies on that placement.
*/
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;      /* generic tokenizer header; keep as first member */
  char delim[128];             /* delim[b] is non-zero if ASCII byte b (b<0x80)
                               ** is a token delimiter */
} simple_tokenizer;
/*
** Cursor used to step through the tokens of a single input string.
**
** The generic sqlite3_tokenizer_cursor header is the first member; the
** routines in this file cast between the generic and the concrete cursor
** type, which relies on that placement.
*/
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;  /* generic cursor header; keep as first member */
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input, in bytes */
  int iOffset;                 /* current byte position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token (grown on demand) */
  int nTokenAllocated;         /* space allocated to the pToken buffer, in bytes */
} simple_tokenizer_cursor;
  44. static int simpleDelim(simple_tokenizer *t, unsigned char c){
  45. return c<0x80 && t->delim[c];
  46. }
  47. static int fts3_isalnum(int x){
  48. return (x>='0' && x<='9') || (x>='A' && x<='Z') || (x>='a' && x<='z');
  49. }
  50. /*
  51. ** Create a new tokenizer instance.
  52. */
  53. static int simpleCreate(
  54. int argc, const char * const *argv,
  55. sqlite3_tokenizer **ppTokenizer
  56. ){
  57. simple_tokenizer *t;
  58. t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
  59. if( t==NULL ) return SQLITE_NOMEM;
  60. memset(t, 0, sizeof(*t));
  61. /* TODO(shess) Delimiters need to remain the same from run to run,
  62. ** else we need to reindex. One solution would be a meta-table to
  63. ** track such information in the database, then we'd only want this
  64. ** information on the initial create.
  65. */
  66. if( argc>1 ){
  67. int i, n = (int)strlen(argv[1]);
  68. for(i=0; i<n; i++){
  69. unsigned char ch = argv[1][i];
  70. /* We explicitly don't support UTF-8 delimiters for now. */
  71. if( ch>=0x80 ){
  72. sqlite3_free(t);
  73. return SQLITE_ERROR;
  74. }
  75. t->delim[ch] = 1;
  76. }
  77. } else {
  78. /* Mark non-alphanumeric ASCII characters as delimiters */
  79. int i;
  80. for(i=1; i<0x80; i++){
  81. t->delim[i] = !fts3_isalnum(i) ? -1 : 0;
  82. }
  83. }
  84. *ppTokenizer = &t->base;
  85. return SQLITE_OK;
  86. }
  87. /*
  88. ** Destroy a tokenizer
  89. */
  90. static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  91. sqlite3_free(pTokenizer);
  92. return SQLITE_OK;
  93. }
  94. /*
  95. ** Prepare to begin tokenizing a particular string. The input
  96. ** string to be tokenized is pInput[0..nBytes-1]. A cursor
  97. ** used to incrementally tokenize this string is returned in
  98. ** *ppCursor.
  99. */
  100. static int simpleOpen(
  101. sqlite3_tokenizer *pTokenizer, /* The tokenizer */
  102. const char *pInput, int nBytes, /* String to be tokenized */
  103. sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
  104. ){
  105. simple_tokenizer_cursor *c;
  106. UNUSED_PARAMETER(pTokenizer);
  107. c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
  108. if( c==NULL ) return SQLITE_NOMEM;
  109. c->pInput = pInput;
  110. if( pInput==0 ){
  111. c->nBytes = 0;
  112. }else if( nBytes<0 ){
  113. c->nBytes = (int)strlen(pInput);
  114. }else{
  115. c->nBytes = nBytes;
  116. }
  117. c->iOffset = 0; /* start tokenizing at the beginning */
  118. c->iToken = 0;
  119. c->pToken = NULL; /* no space allocated, yet. */
  120. c->nTokenAllocated = 0;
  121. *ppCursor = &c->base;
  122. return SQLITE_OK;
  123. }
  124. /*
  125. ** Close a tokenization cursor previously opened by a call to
  126. ** simpleOpen() above.
  127. */
  128. static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  129. simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  130. sqlite3_free(c->pToken);
  131. sqlite3_free(c);
  132. return SQLITE_OK;
  133. }
  134. /*
  135. ** Extract the next token from a tokenization cursor. The cursor must
  136. ** have been opened by a prior call to simpleOpen().
  137. */
  138. static int simpleNext(
  139. sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
  140. const char **ppToken, /* OUT: *ppToken is the token text */
  141. int *pnBytes, /* OUT: Number of bytes in token */
  142. int *piStartOffset, /* OUT: Starting offset of token */
  143. int *piEndOffset, /* OUT: Ending offset of token */
  144. int *piPosition /* OUT: Position integer of token */
  145. ){
  146. simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  147. simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  148. unsigned char *p = (unsigned char *)c->pInput;
  149. while( c->iOffset<c->nBytes ){
  150. int iStartOffset;
  151. /* Scan past delimiter characters */
  152. while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
  153. c->iOffset++;
  154. }
  155. /* Count non-delimiter characters. */
  156. iStartOffset = c->iOffset;
  157. while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
  158. c->iOffset++;
  159. }
  160. if( c->iOffset>iStartOffset ){
  161. int i, n = c->iOffset-iStartOffset;
  162. if( n>c->nTokenAllocated ){
  163. char *pNew;
  164. c->nTokenAllocated = n+20;
  165. pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
  166. if( !pNew ) return SQLITE_NOMEM;
  167. c->pToken = pNew;
  168. }
  169. for(i=0; i<n; i++){
  170. /* TODO(shess) This needs expansion to handle UTF-8
  171. ** case-insensitivity.
  172. */
  173. unsigned char ch = p[iStartOffset+i];
  174. c->pToken[i] = (char)((ch>='A' && ch<='Z') ? ch-'A'+'a' : ch);
  175. }
  176. *ppToken = c->pToken;
  177. *pnBytes = n;
  178. *piStartOffset = iStartOffset;
  179. *piEndOffset = c->iOffset;
  180. *piPosition = c->iToken++;
  181. return SQLITE_OK;
  182. }
  183. }
  184. return SQLITE_DONE;
  185. }
/*
** The set of routines that implement the simple tokenizer.
**
** The initializer is positional, so the entries must stay in the order
** in which sqlite3_tokenizer_module declares its members (declared in a
** header that is not part of this file).
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,               /* NOTE(review): presumably the interface version number
                   ** -- confirm against fts3_tokenizer.h */
  simpleCreate,    /* create a tokenizer instance */
  simpleDestroy,   /* destroy a tokenizer instance */
  simpleOpen,      /* open a cursor on an input string */
  simpleClose,     /* close a cursor */
  simpleNext,      /* fetch the next token from a cursor */
  0,               /* NOTE(review): trailing slot left unset; presumably an
                   ** optional method -- confirm against fts3_tokenizer.h */
};
  198. /*
  199. ** Allocate a new simple tokenizer. Return a pointer to the new
  200. ** tokenizer in *ppModule
  201. */
  202. void sqlite3Fts3SimpleTokenizerModule(
  203. sqlite3_tokenizer_module const**ppModule
  204. ){
  205. *ppModule = &simpleTokenizerModule;
  206. }
  207. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */