fts2_tokenizer1.c 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. /*
  2. ** 2006 Oct 10
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** Implementation of the "simple" full-text-search tokenizer.
  14. */
  15. /*
  16. ** The code in this file is only compiled if:
  17. **
  18. ** * The FTS2 module is being built as an extension
  19. ** (in which case SQLITE_CORE is not defined), or
  20. **
  21. ** * The FTS2 module is being built into the core of
  22. ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
  23. */
  24. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
  25. #include <assert.h>
  26. #include <stdlib.h>
  27. #include <stdio.h>
  28. #include <string.h>
  29. #include "sqlite3.h"
  30. #include "sqlite3ext.h"
  31. SQLITE_EXTENSION_INIT3
  32. #include "fts2_tokenizer.h"
  33. typedef struct simple_tokenizer {
  34. sqlite3_tokenizer base;
  35. char delim[128]; /* flag ASCII delimiters */
  36. } simple_tokenizer;
  37. typedef struct simple_tokenizer_cursor {
  38. sqlite3_tokenizer_cursor base;
  39. const char *pInput; /* input we are tokenizing */
  40. int nBytes; /* size of the input */
  41. int iOffset; /* current position in pInput */
  42. int iToken; /* index of next token to be returned */
  43. char *pToken; /* storage for current token */
  44. int nTokenAllocated; /* space allocated to zToken buffer */
  45. } simple_tokenizer_cursor;
  46. /* Forward declaration */
  47. static const sqlite3_tokenizer_module simpleTokenizerModule;
  48. static int simpleDelim(simple_tokenizer *t, unsigned char c){
  49. return c<0x80 && t->delim[c];
  50. }
  51. /*
  52. ** Create a new tokenizer instance.
  53. */
  54. static int simpleCreate(
  55. int argc, const char * const *argv,
  56. sqlite3_tokenizer **ppTokenizer
  57. ){
  58. simple_tokenizer *t;
  59. t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
  60. if( t==NULL ) return SQLITE_NOMEM;
  61. memset(t, 0, sizeof(*t));
  62. /* TODO(shess) Delimiters need to remain the same from run to run,
  63. ** else we need to reindex. One solution would be a meta-table to
  64. ** track such information in the database, then we'd only want this
  65. ** information on the initial create.
  66. */
  67. if( argc>1 ){
  68. int i, n = strlen(argv[1]);
  69. for(i=0; i<n; i++){
  70. unsigned char ch = argv[1][i];
  71. /* We explicitly don't support UTF-8 delimiters for now. */
  72. if( ch>=0x80 ){
  73. sqlite3_free(t);
  74. return SQLITE_ERROR;
  75. }
  76. t->delim[ch] = 1;
  77. }
  78. } else {
  79. /* Mark non-alphanumeric ASCII characters as delimiters */
  80. int i;
  81. for(i=1; i<0x80; i++){
  82. t->delim[i] = !((i>='0' && i<='9') || (i>='A' && i<='Z') ||
  83. (i>='a' && i<='z'));
  84. }
  85. }
  86. *ppTokenizer = &t->base;
  87. return SQLITE_OK;
  88. }
  89. /*
  90. ** Destroy a tokenizer
  91. */
  92. static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  93. sqlite3_free(pTokenizer);
  94. return SQLITE_OK;
  95. }
  96. /*
  97. ** Prepare to begin tokenizing a particular string. The input
  98. ** string to be tokenized is pInput[0..nBytes-1]. A cursor
  99. ** used to incrementally tokenize this string is returned in
  100. ** *ppCursor.
  101. */
  102. static int simpleOpen(
  103. sqlite3_tokenizer *pTokenizer, /* The tokenizer */
  104. const char *pInput, int nBytes, /* String to be tokenized */
  105. sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
  106. ){
  107. simple_tokenizer_cursor *c;
  108. c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
  109. if( c==NULL ) return SQLITE_NOMEM;
  110. c->pInput = pInput;
  111. if( pInput==0 ){
  112. c->nBytes = 0;
  113. }else if( nBytes<0 ){
  114. c->nBytes = (int)strlen(pInput);
  115. }else{
  116. c->nBytes = nBytes;
  117. }
  118. c->iOffset = 0; /* start tokenizing at the beginning */
  119. c->iToken = 0;
  120. c->pToken = NULL; /* no space allocated, yet. */
  121. c->nTokenAllocated = 0;
  122. *ppCursor = &c->base;
  123. return SQLITE_OK;
  124. }
  125. /*
  126. ** Close a tokenization cursor previously opened by a call to
  127. ** simpleOpen() above.
  128. */
  129. static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  130. simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  131. sqlite3_free(c->pToken);
  132. sqlite3_free(c);
  133. return SQLITE_OK;
  134. }
  135. /*
  136. ** Extract the next token from a tokenization cursor. The cursor must
  137. ** have been opened by a prior call to simpleOpen().
  138. */
  139. static int simpleNext(
  140. sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
  141. const char **ppToken, /* OUT: *ppToken is the token text */
  142. int *pnBytes, /* OUT: Number of bytes in token */
  143. int *piStartOffset, /* OUT: Starting offset of token */
  144. int *piEndOffset, /* OUT: Ending offset of token */
  145. int *piPosition /* OUT: Position integer of token */
  146. ){
  147. simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  148. simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  149. unsigned char *p = (unsigned char *)c->pInput;
  150. while( c->iOffset<c->nBytes ){
  151. int iStartOffset;
  152. /* Scan past delimiter characters */
  153. while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
  154. c->iOffset++;
  155. }
  156. /* Count non-delimiter characters. */
  157. iStartOffset = c->iOffset;
  158. while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
  159. c->iOffset++;
  160. }
  161. if( c->iOffset>iStartOffset ){
  162. int i, n = c->iOffset-iStartOffset;
  163. if( n>c->nTokenAllocated ){
  164. c->nTokenAllocated = n+20;
  165. c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated);
  166. if( c->pToken==NULL ) return SQLITE_NOMEM;
  167. }
  168. for(i=0; i<n; i++){
  169. /* TODO(shess) This needs expansion to handle UTF-8
  170. ** case-insensitivity.
  171. */
  172. unsigned char ch = p[iStartOffset+i];
  173. c->pToken[i] = (ch>='A' && ch<='Z') ? (ch - 'A' + 'a') : ch;
  174. }
  175. *ppToken = c->pToken;
  176. *pnBytes = n;
  177. *piStartOffset = iStartOffset;
  178. *piEndOffset = c->iOffset;
  179. *piPosition = c->iToken++;
  180. return SQLITE_OK;
  181. }
  182. }
  183. return SQLITE_DONE;
  184. }
  185. /*
  186. ** The set of routines that implement the simple tokenizer
  187. */
  188. static const sqlite3_tokenizer_module simpleTokenizerModule = {
  189. 0,
  190. simpleCreate,
  191. simpleDestroy,
  192. simpleOpen,
  193. simpleClose,
  194. simpleNext,
  195. };
  196. /*
  197. ** Allocate a new simple tokenizer. Return a pointer to the new
  198. ** tokenizer in *ppModule
  199. */
  200. void sqlite3Fts2SimpleTokenizerModule(
  201. sqlite3_tokenizer_module const**ppModule
  202. ){
  203. *ppModule = &simpleTokenizerModule;
  204. }
  205. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */