1
0

fts1_tokenizer1.c 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /*
  2. ** The author disclaims copyright to this source code.
  3. **
  4. *************************************************************************
  5. ** Implementation of the "simple" full-text-search tokenizer.
  6. */
  7. /*
  8. ** The code in this file is only compiled if:
  9. **
  10. ** * The FTS1 module is being built as an extension
  11. ** (in which case SQLITE_CORE is not defined), or
  12. **
  13. ** * The FTS1 module is being built into the core of
  14. ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
  15. */
  16. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
  17. #include <assert.h>
  18. #include <stdlib.h>
  19. #include <stdio.h>
  20. #include <string.h>
  21. #include <ctype.h>
  22. #include "fts1_tokenizer.h"
  23. typedef struct simple_tokenizer {
  24. sqlite3_tokenizer base;
  25. char delim[128]; /* flag ASCII delimiters */
  26. } simple_tokenizer;
  27. typedef struct simple_tokenizer_cursor {
  28. sqlite3_tokenizer_cursor base;
  29. const char *pInput; /* input we are tokenizing */
  30. int nBytes; /* size of the input */
  31. int iOffset; /* current position in pInput */
  32. int iToken; /* index of next token to be returned */
  33. char *pToken; /* storage for current token */
  34. int nTokenAllocated; /* space allocated to zToken buffer */
  35. } simple_tokenizer_cursor;
  36. /* Forward declaration */
  37. static const sqlite3_tokenizer_module simpleTokenizerModule;
  38. static int isDelim(simple_tokenizer *t, unsigned char c){
  39. return c<0x80 && t->delim[c];
  40. }
  41. /*
  42. ** Create a new tokenizer instance.
  43. */
  44. static int simpleCreate(
  45. int argc, const char * const *argv,
  46. sqlite3_tokenizer **ppTokenizer
  47. ){
  48. simple_tokenizer *t;
  49. t = (simple_tokenizer *) calloc(sizeof(*t), 1);
  50. if( t==NULL ) return SQLITE_NOMEM;
  51. /* TODO(shess) Delimiters need to remain the same from run to run,
  52. ** else we need to reindex. One solution would be a meta-table to
  53. ** track such information in the database, then we'd only want this
  54. ** information on the initial create.
  55. */
  56. if( argc>1 ){
  57. int i, n = strlen(argv[1]);
  58. for(i=0; i<n; i++){
  59. unsigned char ch = argv[1][i];
  60. /* We explicitly don't support UTF-8 delimiters for now. */
  61. if( ch>=0x80 ){
  62. free(t);
  63. return SQLITE_ERROR;
  64. }
  65. t->delim[ch] = 1;
  66. }
  67. } else {
  68. /* Mark non-alphanumeric ASCII characters as delimiters */
  69. int i;
  70. for(i=1; i<0x80; i++){
  71. t->delim[i] = !isalnum(i);
  72. }
  73. }
  74. *ppTokenizer = &t->base;
  75. return SQLITE_OK;
  76. }
  77. /*
  78. ** Destroy a tokenizer
  79. */
  80. static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  81. free(pTokenizer);
  82. return SQLITE_OK;
  83. }
  84. /*
  85. ** Prepare to begin tokenizing a particular string. The input
  86. ** string to be tokenized is pInput[0..nBytes-1]. A cursor
  87. ** used to incrementally tokenize this string is returned in
  88. ** *ppCursor.
  89. */
  90. static int simpleOpen(
  91. sqlite3_tokenizer *pTokenizer, /* The tokenizer */
  92. const char *pInput, int nBytes, /* String to be tokenized */
  93. sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
  94. ){
  95. simple_tokenizer_cursor *c;
  96. c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
  97. if( c==NULL ) return SQLITE_NOMEM;
  98. c->pInput = pInput;
  99. if( pInput==0 ){
  100. c->nBytes = 0;
  101. }else if( nBytes<0 ){
  102. c->nBytes = (int)strlen(pInput);
  103. }else{
  104. c->nBytes = nBytes;
  105. }
  106. c->iOffset = 0; /* start tokenizing at the beginning */
  107. c->iToken = 0;
  108. c->pToken = NULL; /* no space allocated, yet. */
  109. c->nTokenAllocated = 0;
  110. *ppCursor = &c->base;
  111. return SQLITE_OK;
  112. }
  113. /*
  114. ** Close a tokenization cursor previously opened by a call to
  115. ** simpleOpen() above.
  116. */
  117. static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  118. simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  119. free(c->pToken);
  120. free(c);
  121. return SQLITE_OK;
  122. }
  123. /*
  124. ** Extract the next token from a tokenization cursor. The cursor must
  125. ** have been opened by a prior call to simpleOpen().
  126. */
  127. static int simpleNext(
  128. sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
  129. const char **ppToken, /* OUT: *ppToken is the token text */
  130. int *pnBytes, /* OUT: Number of bytes in token */
  131. int *piStartOffset, /* OUT: Starting offset of token */
  132. int *piEndOffset, /* OUT: Ending offset of token */
  133. int *piPosition /* OUT: Position integer of token */
  134. ){
  135. simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  136. simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  137. unsigned char *p = (unsigned char *)c->pInput;
  138. while( c->iOffset<c->nBytes ){
  139. int iStartOffset;
  140. /* Scan past delimiter characters */
  141. while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
  142. c->iOffset++;
  143. }
  144. /* Count non-delimiter characters. */
  145. iStartOffset = c->iOffset;
  146. while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
  147. c->iOffset++;
  148. }
  149. if( c->iOffset>iStartOffset ){
  150. int i, n = c->iOffset-iStartOffset;
  151. if( n>c->nTokenAllocated ){
  152. c->nTokenAllocated = n+20;
  153. c->pToken = realloc(c->pToken, c->nTokenAllocated);
  154. if( c->pToken==NULL ) return SQLITE_NOMEM;
  155. }
  156. for(i=0; i<n; i++){
  157. /* TODO(shess) This needs expansion to handle UTF-8
  158. ** case-insensitivity.
  159. */
  160. unsigned char ch = p[iStartOffset+i];
  161. c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
  162. }
  163. *ppToken = c->pToken;
  164. *pnBytes = n;
  165. *piStartOffset = iStartOffset;
  166. *piEndOffset = c->iOffset;
  167. *piPosition = c->iToken++;
  168. return SQLITE_OK;
  169. }
  170. }
  171. return SQLITE_DONE;
  172. }
  173. /*
  174. ** The set of routines that implement the simple tokenizer
  175. */
  176. static const sqlite3_tokenizer_module simpleTokenizerModule = {
  177. 0,
  178. simpleCreate,
  179. simpleDestroy,
  180. simpleOpen,
  181. simpleClose,
  182. simpleNext,
  183. };
  184. /*
  185. ** Allocate a new simple tokenizer. Return a pointer to the new
  186. ** tokenizer in *ppModule
  187. */
  188. void sqlite3Fts1SimpleTokenizerModule(
  189. sqlite3_tokenizer_module const**ppModule
  190. ){
  191. *ppModule = &simpleTokenizerModule;
  192. }
  193. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */