1
0

fts2_icu.c 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. /*
  2. ** 2007 June 22
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. *************************************************************************
  12. ** This file implements a tokenizer for fts2 based on the ICU library.
  13. **
  14. ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
  15. */
  16. #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
  17. #ifdef SQLITE_ENABLE_ICU
  18. #include <assert.h>
  19. #include <string.h>
  20. #include "fts2_tokenizer.h"
  21. #include <unicode/ubrk.h>
  22. #include <unicode/ucol.h>
  23. #include <unicode/ustring.h>
  24. #include <unicode/utf16.h>
  25. typedef struct IcuTokenizer IcuTokenizer;
  26. typedef struct IcuCursor IcuCursor;
  27. struct IcuTokenizer {
  28. sqlite3_tokenizer base;
  29. char *zLocale;
  30. };
  31. struct IcuCursor {
  32. sqlite3_tokenizer_cursor base;
  33. UBreakIterator *pIter; /* ICU break-iterator object */
  34. int nChar; /* Number of UChar elements in pInput */
  35. UChar *aChar; /* Copy of input using utf-16 encoding */
  36. int *aOffset; /* Offsets of each character in utf-8 input */
  37. int nBuffer;
  38. char *zBuffer;
  39. int iToken;
  40. };
  41. /*
  42. ** Create a new tokenizer instance.
  43. */
  44. static int icuCreate(
  45. int argc, /* Number of entries in argv[] */
  46. const char * const *argv, /* Tokenizer creation arguments */
  47. sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
  48. ){
  49. IcuTokenizer *p;
  50. int n = 0;
  51. if( argc>0 ){
  52. n = strlen(argv[0])+1;
  53. }
  54. p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
  55. if( !p ){
  56. return SQLITE_NOMEM;
  57. }
  58. memset(p, 0, sizeof(IcuTokenizer));
  59. if( n ){
  60. p->zLocale = (char *)&p[1];
  61. memcpy(p->zLocale, argv[0], n);
  62. }
  63. *ppTokenizer = (sqlite3_tokenizer *)p;
  64. return SQLITE_OK;
  65. }
  66. /*
  67. ** Destroy a tokenizer
  68. */
  69. static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  70. IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  71. sqlite3_free(p);
  72. return SQLITE_OK;
  73. }
  74. /*
  75. ** Prepare to begin tokenizing a particular string. The input
  76. ** string to be tokenized is pInput[0..nBytes-1]. A cursor
  77. ** used to incrementally tokenize this string is returned in
  78. ** *ppCursor.
  79. */
  80. static int icuOpen(
  81. sqlite3_tokenizer *pTokenizer, /* The tokenizer */
  82. const char *zInput, /* Input string */
  83. int nInput, /* Length of zInput in bytes */
  84. sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
  85. ){
  86. IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  87. IcuCursor *pCsr;
  88. const int32_t opt = U_FOLD_CASE_DEFAULT;
  89. UErrorCode status = U_ZERO_ERROR;
  90. int nChar;
  91. UChar32 c;
  92. int iInput = 0;
  93. int iOut = 0;
  94. *ppCursor = 0;
  95. if( nInput<0 ){
  96. nInput = strlen(zInput);
  97. }
  98. nChar = nInput+1;
  99. pCsr = (IcuCursor *)sqlite3_malloc(
  100. sizeof(IcuCursor) + /* IcuCursor */
  101. ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */
  102. (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
  103. );
  104. if( !pCsr ){
  105. return SQLITE_NOMEM;
  106. }
  107. memset(pCsr, 0, sizeof(IcuCursor));
  108. pCsr->aChar = (UChar *)&pCsr[1];
  109. pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
  110. pCsr->aOffset[iOut] = iInput;
  111. U8_NEXT(zInput, iInput, nInput, c);
  112. while( c>0 ){
  113. int isError = 0;
  114. c = u_foldCase(c, opt);
  115. U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
  116. if( isError ){
  117. sqlite3_free(pCsr);
  118. return SQLITE_ERROR;
  119. }
  120. pCsr->aOffset[iOut] = iInput;
  121. if( iInput<nInput ){
  122. U8_NEXT(zInput, iInput, nInput, c);
  123. }else{
  124. c = 0;
  125. }
  126. }
  127. pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  128. if( !U_SUCCESS(status) ){
  129. sqlite3_free(pCsr);
  130. return SQLITE_ERROR;
  131. }
  132. pCsr->nChar = iOut;
  133. ubrk_first(pCsr->pIter);
  134. *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  135. return SQLITE_OK;
  136. }
  137. /*
  138. ** Close a tokenization cursor previously opened by a call to icuOpen().
  139. */
  140. static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  141. IcuCursor *pCsr = (IcuCursor *)pCursor;
  142. ubrk_close(pCsr->pIter);
  143. sqlite3_free(pCsr->zBuffer);
  144. sqlite3_free(pCsr);
  145. return SQLITE_OK;
  146. }
  147. /*
  148. ** Extract the next token from a tokenization cursor.
  149. */
  150. static int icuNext(
  151. sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
  152. const char **ppToken, /* OUT: *ppToken is the token text */
  153. int *pnBytes, /* OUT: Number of bytes in token */
  154. int *piStartOffset, /* OUT: Starting offset of token */
  155. int *piEndOffset, /* OUT: Ending offset of token */
  156. int *piPosition /* OUT: Position integer of token */
  157. ){
  158. IcuCursor *pCsr = (IcuCursor *)pCursor;
  159. int iStart = 0;
  160. int iEnd = 0;
  161. int nByte = 0;
  162. while( iStart==iEnd ){
  163. UChar32 c;
  164. iStart = ubrk_current(pCsr->pIter);
  165. iEnd = ubrk_next(pCsr->pIter);
  166. if( iEnd==UBRK_DONE ){
  167. return SQLITE_DONE;
  168. }
  169. while( iStart<iEnd ){
  170. int iWhite = iStart;
  171. U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
  172. if( u_isspace(c) ){
  173. iStart = iWhite;
  174. }else{
  175. break;
  176. }
  177. }
  178. assert(iStart<=iEnd);
  179. }
  180. do {
  181. UErrorCode status = U_ZERO_ERROR;
  182. if( nByte ){
  183. char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
  184. if( !zNew ){
  185. return SQLITE_NOMEM;
  186. }
  187. pCsr->zBuffer = zNew;
  188. pCsr->nBuffer = nByte;
  189. }
  190. u_strToUTF8(
  191. pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
  192. &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
  193. &status /* Output success/failure */
  194. );
  195. } while( nByte>pCsr->nBuffer );
  196. *ppToken = pCsr->zBuffer;
  197. *pnBytes = nByte;
  198. *piStartOffset = pCsr->aOffset[iStart];
  199. *piEndOffset = pCsr->aOffset[iEnd];
  200. *piPosition = pCsr->iToken++;
  201. return SQLITE_OK;
  202. }
  203. /*
  204. ** The set of routines that implement the simple tokenizer
  205. */
  206. static const sqlite3_tokenizer_module icuTokenizerModule = {
  207. 0, /* iVersion */
  208. icuCreate, /* xCreate */
  209. icuDestroy, /* xCreate */
  210. icuOpen, /* xOpen */
  211. icuClose, /* xClose */
  212. icuNext, /* xNext */
  213. };
  214. /*
  215. ** Set *ppModule to point at the implementation of the ICU tokenizer.
  216. */
  217. void sqlite3Fts2IcuTokenizerModule(
  218. sqlite3_tokenizer_module const**ppModule
  219. ){
  220. *ppModule = &icuTokenizerModule;
  221. }
  222. #endif /* defined(SQLITE_ENABLE_ICU) */
  223. #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */