1
0

regexp.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760
  1. /*
  2. ** 2012-11-13
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** The code in this file implements a compact but reasonably
  14. ** efficient regular-expression matcher for posix extended regular
  15. ** expressions against UTF8 text.
  16. **
  17. ** This file is an SQLite extension. It registers a single function
  18. ** named "regexp(A,B)" where A is the regular expression and B is the
  19. ** string to be matched. By registering this function, SQLite will also
  20. ** then implement the "B regexp A" operator. Note that with the function
  21. ** the regular expression comes first, but with the operator it comes
  22. ** second.
  23. **
  24. ** The following regular expression syntax is supported:
  25. **
  26. ** X* zero or more occurrences of X
  27. ** X+ one or more occurrences of X
  28. ** X? zero or one occurrences of X
  29. ** X{p,q} between p and q occurrences of X
  30. ** (X) match X
  31. ** X|Y X or Y
  32. ** ^X X occurring at the beginning of the string
  33. ** X$ X occurring at the end of the string
  34. ** . Match any single character
  35. ** \c Character c where c is one of \{}()[]|*+?.
  36. ** \c C-language escapes for c in afnrtv. ex: \t or \n
  37. ** \uXXXX Where XXXX is exactly 4 hex digits, unicode value XXXX
  38. ** \xXX Where XX is exactly 2 hex digits, unicode value XX
  39. ** [abc] Any single character from the set abc
  40. ** [^abc] Any single character not in the set abc
  41. ** [a-z] Any single character in the range a-z
  42. ** [^a-z] Any single character not in the range a-z
  43. ** \b Word boundary
  44. ** \w Word character. [A-Za-z0-9_]
  45. ** \W Non-word character
  46. ** \d Digit
  47. ** \D Non-digit
  48. ** \s Whitespace character
  49. ** \S Non-whitespace character
  50. **
  51. ** A nondeterministic finite automaton (NFA) is used for matching, so the
  52. ** performance is bounded by O(N*M) where N is the size of the regular
  53. ** expression and M is the size of the input string. The matcher never
  54. ** exhibits exponential behavior. Note that the X{p,q} operator expands
  55. ** to p copies of X following by q-p copies of X? and that the size of the
  56. ** regular expression in the O(N*M) performance bound is computed after
  57. ** this expansion.
  58. */
  59. #include <string.h>
  60. #include <stdlib.h>
  61. #include "sqlite3ext.h"
  62. SQLITE_EXTENSION_INIT1
  63. /*
  64. ** The following #defines change the names of some functions implemented in
  65. ** this file to prevent name collisions with C-library functions of the
  66. ** same name.
  67. */
  68. #define re_match sqlite3re_match
  69. #define re_compile sqlite3re_compile
  70. #define re_free sqlite3re_free
  71. /* The end-of-input character */
  72. #define RE_EOF 0 /* End of input */
  73. /* The NFA is implemented as sequence of opcodes taken from the following
  74. ** set. Each opcode has a single integer argument.
  75. */
  76. #define RE_OP_MATCH 1 /* Match the one character in the argument */
  77. #define RE_OP_ANY 2 /* Match any one character. (Implements ".") */
  78. #define RE_OP_ANYSTAR 3 /* Special optimized version of .* */
  79. #define RE_OP_FORK 4 /* Continue to both next and opcode at iArg */
  80. #define RE_OP_GOTO 5 /* Jump to opcode at iArg */
  81. #define RE_OP_ACCEPT 6 /* Halt and indicate a successful match */
  82. #define RE_OP_CC_INC 7 /* Beginning of a [...] character class */
  83. #define RE_OP_CC_EXC 8 /* Beginning of a [^...] character class */
  84. #define RE_OP_CC_VALUE 9 /* Single value in a character class */
  85. #define RE_OP_CC_RANGE 10 /* Range of values in a character class */
  86. #define RE_OP_WORD 11 /* Perl word character [A-Za-z0-9_] */
  87. #define RE_OP_NOTWORD 12 /* Not a perl word character */
  88. #define RE_OP_DIGIT 13 /* digit: [0-9] */
  89. #define RE_OP_NOTDIGIT 14 /* Not a digit */
  90. #define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */
  91. #define RE_OP_NOTSPACE 16 /* Not a digit */
  92. #define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */
  93. /* Each opcode is a "state" in the NFA */
  94. typedef unsigned short ReStateNumber;
  95. /* Because this is an NFA and not a DFA, multiple states can be active at
  96. ** once. An instance of the following object records all active states in
  97. ** the NFA. The implementation is optimized for the common case where the
  98. ** number of actives states is small.
  99. */
  100. typedef struct ReStateSet {
  101. unsigned nState; /* Number of current states */
  102. ReStateNumber *aState; /* Current states */
  103. } ReStateSet;
  104. /* An input string read one character at a time.
  105. */
  106. typedef struct ReInput ReInput;
  107. struct ReInput {
  108. const unsigned char *z; /* All text */
  109. int i; /* Next byte to read */
  110. int mx; /* EOF when i>=mx */
  111. };
  112. /* A compiled NFA (or an NFA that is in the process of being compiled) is
  113. ** an instance of the following object.
  114. */
  115. typedef struct ReCompiled ReCompiled;
  116. struct ReCompiled {
  117. ReInput sIn; /* Regular expression text */
  118. const char *zErr; /* Error message to return */
  119. char *aOp; /* Operators for the virtual machine */
  120. int *aArg; /* Arguments to each operator */
  121. unsigned (*xNextChar)(ReInput*); /* Next character function */
  122. unsigned char zInit[12]; /* Initial text to match */
  123. int nInit; /* Number of characters in zInit */
  124. unsigned nState; /* Number of entries in aOp[] and aArg[] */
  125. unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */
  126. };
  127. /* Add a state to the given state set if it is not already there */
  128. static void re_add_state(ReStateSet *pSet, int newState){
  129. unsigned i;
  130. for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
  131. pSet->aState[pSet->nState++] = newState;
  132. }
  133. /* Extract the next unicode character from *pzIn and return it. Advance
  134. ** *pzIn to the first byte past the end of the character returned. To
  135. ** be clear: this routine converts utf8 to unicode. This routine is
  136. ** optimized for the common case where the next character is a single byte.
  137. */
  138. static unsigned re_next_char(ReInput *p){
  139. unsigned c;
  140. if( p->i>=p->mx ) return 0;
  141. c = p->z[p->i++];
  142. if( c>=0x80 ){
  143. if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
  144. c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
  145. if( c<0x80 ) c = 0xfffd;
  146. }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
  147. && (p->z[p->i+1]&0xc0)==0x80 ){
  148. c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
  149. p->i += 2;
  150. if( c<=0x3ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
  151. }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
  152. && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
  153. c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
  154. | (p->z[p->i+2]&0x3f);
  155. p->i += 3;
  156. if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
  157. }else{
  158. c = 0xfffd;
  159. }
  160. }
  161. return c;
  162. }
  163. static unsigned re_next_char_nocase(ReInput *p){
  164. unsigned c = re_next_char(p);
  165. if( c>='A' && c<='Z' ) c += 'a' - 'A';
  166. return c;
  167. }
  168. /* Return true if c is a perl "word" character: [A-Za-z0-9_] */
  169. static int re_word_char(int c){
  170. return (c>='0' && c<='9') || (c>='a' && c<='z')
  171. || (c>='A' && c<='Z') || c=='_';
  172. }
  173. /* Return true if c is a "digit" character: [0-9] */
  174. static int re_digit_char(int c){
  175. return (c>='0' && c<='9');
  176. }
  177. /* Return true if c is a perl "space" character: [ \t\r\n\v\f] */
  178. static int re_space_char(int c){
  179. return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
  180. }
  181. /* Run a compiled regular expression on the zero-terminated input
  182. ** string zIn[]. Return true on a match and false if there is no match.
  183. */
  184. static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
  185. ReStateSet aStateSet[2], *pThis, *pNext;
  186. ReStateNumber aSpace[100];
  187. ReStateNumber *pToFree;
  188. unsigned int i = 0;
  189. unsigned int iSwap = 0;
  190. int c = RE_EOF+1;
  191. int cPrev = 0;
  192. int rc = 0;
  193. ReInput in;
  194. in.z = zIn;
  195. in.i = 0;
  196. in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
  197. /* Look for the initial prefix match, if there is one. */
  198. if( pRe->nInit ){
  199. unsigned char x = pRe->zInit[0];
  200. while( in.i+pRe->nInit<=in.mx
  201. && (zIn[in.i]!=x ||
  202. strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
  203. ){
  204. in.i++;
  205. }
  206. if( in.i+pRe->nInit>in.mx ) return 0;
  207. }
  208. if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
  209. pToFree = 0;
  210. aStateSet[0].aState = aSpace;
  211. }else{
  212. pToFree = sqlite3_malloc( sizeof(ReStateNumber)*2*pRe->nState );
  213. if( pToFree==0 ) return -1;
  214. aStateSet[0].aState = pToFree;
  215. }
  216. aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
  217. pNext = &aStateSet[1];
  218. pNext->nState = 0;
  219. re_add_state(pNext, 0);
  220. while( c!=RE_EOF && pNext->nState>0 ){
  221. cPrev = c;
  222. c = pRe->xNextChar(&in);
  223. pThis = pNext;
  224. pNext = &aStateSet[iSwap];
  225. iSwap = 1 - iSwap;
  226. pNext->nState = 0;
  227. for(i=0; i<pThis->nState; i++){
  228. int x = pThis->aState[i];
  229. switch( pRe->aOp[x] ){
  230. case RE_OP_MATCH: {
  231. if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
  232. break;
  233. }
  234. case RE_OP_ANY: {
  235. re_add_state(pNext, x+1);
  236. break;
  237. }
  238. case RE_OP_WORD: {
  239. if( re_word_char(c) ) re_add_state(pNext, x+1);
  240. break;
  241. }
  242. case RE_OP_NOTWORD: {
  243. if( !re_word_char(c) ) re_add_state(pNext, x+1);
  244. break;
  245. }
  246. case RE_OP_DIGIT: {
  247. if( re_digit_char(c) ) re_add_state(pNext, x+1);
  248. break;
  249. }
  250. case RE_OP_NOTDIGIT: {
  251. if( !re_digit_char(c) ) re_add_state(pNext, x+1);
  252. break;
  253. }
  254. case RE_OP_SPACE: {
  255. if( re_space_char(c) ) re_add_state(pNext, x+1);
  256. break;
  257. }
  258. case RE_OP_NOTSPACE: {
  259. if( !re_space_char(c) ) re_add_state(pNext, x+1);
  260. break;
  261. }
  262. case RE_OP_BOUNDARY: {
  263. if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
  264. break;
  265. }
  266. case RE_OP_ANYSTAR: {
  267. re_add_state(pNext, x);
  268. re_add_state(pThis, x+1);
  269. break;
  270. }
  271. case RE_OP_FORK: {
  272. re_add_state(pThis, x+pRe->aArg[x]);
  273. re_add_state(pThis, x+1);
  274. break;
  275. }
  276. case RE_OP_GOTO: {
  277. re_add_state(pThis, x+pRe->aArg[x]);
  278. break;
  279. }
  280. case RE_OP_ACCEPT: {
  281. rc = 1;
  282. goto re_match_end;
  283. }
  284. case RE_OP_CC_INC:
  285. case RE_OP_CC_EXC: {
  286. int j = 1;
  287. int n = pRe->aArg[x];
  288. int hit = 0;
  289. for(j=1; j>0 && j<n; j++){
  290. if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
  291. if( pRe->aArg[x+j]==c ){
  292. hit = 1;
  293. j = -1;
  294. }
  295. }else{
  296. if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
  297. hit = 1;
  298. j = -1;
  299. }else{
  300. j++;
  301. }
  302. }
  303. }
  304. if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
  305. if( hit ) re_add_state(pNext, x+n);
  306. break;
  307. }
  308. }
  309. }
  310. }
  311. for(i=0; i<pNext->nState; i++){
  312. if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
  313. }
  314. re_match_end:
  315. sqlite3_free(pToFree);
  316. return rc;
  317. }
  318. /* Resize the opcode and argument arrays for an RE under construction.
  319. */
  320. static int re_resize(ReCompiled *p, int N){
  321. char *aOp;
  322. int *aArg;
  323. aOp = sqlite3_realloc(p->aOp, N*sizeof(p->aOp[0]));
  324. if( aOp==0 ) return 1;
  325. p->aOp = aOp;
  326. aArg = sqlite3_realloc(p->aArg, N*sizeof(p->aArg[0]));
  327. if( aArg==0 ) return 1;
  328. p->aArg = aArg;
  329. p->nAlloc = N;
  330. return 0;
  331. }
  332. /* Insert a new opcode and argument into an RE under construction. The
  333. ** insertion point is just prior to existing opcode iBefore.
  334. */
  335. static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
  336. int i;
  337. if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
  338. for(i=p->nState; i>iBefore; i--){
  339. p->aOp[i] = p->aOp[i-1];
  340. p->aArg[i] = p->aArg[i-1];
  341. }
  342. p->nState++;
  343. p->aOp[iBefore] = op;
  344. p->aArg[iBefore] = arg;
  345. return iBefore;
  346. }
  347. /* Append a new opcode and argument to the end of the RE under construction.
  348. */
  349. static int re_append(ReCompiled *p, int op, int arg){
  350. return re_insert(p, p->nState, op, arg);
  351. }
  352. /* Make a copy of N opcodes starting at iStart onto the end of the RE
  353. ** under construction.
  354. */
  355. static void re_copy(ReCompiled *p, int iStart, int N){
  356. if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
  357. memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
  358. memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
  359. p->nState += N;
  360. }
  361. /* Return true if c is a hexadecimal digit character: [0-9a-fA-F]
  362. ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c). If
  363. ** c is not a hex digit *pV is unchanged.
  364. */
  365. static int re_hex(int c, int *pV){
  366. if( c>='0' && c<='9' ){
  367. c -= '0';
  368. }else if( c>='a' && c<='f' ){
  369. c -= 'a' - 10;
  370. }else if( c>='A' && c<='F' ){
  371. c -= 'A' - 10;
  372. }else{
  373. return 0;
  374. }
  375. *pV = (*pV)*16 + (c & 0xff);
  376. return 1;
  377. }
  378. /* A backslash character has been seen, read the next character and
  379. ** return its interpretation.
  380. */
  381. static unsigned re_esc_char(ReCompiled *p){
  382. static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
  383. static const char zTrans[] = "\a\f\n\r\t\v";
  384. int i, v = 0;
  385. char c;
  386. if( p->sIn.i>=p->sIn.mx ) return 0;
  387. c = p->sIn.z[p->sIn.i];
  388. if( c=='u' && p->sIn.i+4<p->sIn.mx ){
  389. const unsigned char *zIn = p->sIn.z + p->sIn.i;
  390. if( re_hex(zIn[1],&v)
  391. && re_hex(zIn[2],&v)
  392. && re_hex(zIn[3],&v)
  393. && re_hex(zIn[4],&v)
  394. ){
  395. p->sIn.i += 5;
  396. return v;
  397. }
  398. }
  399. if( c=='x' && p->sIn.i+2<p->sIn.mx ){
  400. const unsigned char *zIn = p->sIn.z + p->sIn.i;
  401. if( re_hex(zIn[1],&v)
  402. && re_hex(zIn[2],&v)
  403. ){
  404. p->sIn.i += 3;
  405. return v;
  406. }
  407. }
  408. for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
  409. if( zEsc[i] ){
  410. if( i<6 ) c = zTrans[i];
  411. p->sIn.i++;
  412. }else{
  413. p->zErr = "unknown \\ escape";
  414. }
  415. return c;
  416. }
  417. /* Forward declaration */
  418. static const char *re_subcompile_string(ReCompiled*);
  419. /* Peek at the next byte of input */
  420. static unsigned char rePeek(ReCompiled *p){
  421. return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
  422. }
  423. /* Compile RE text into a sequence of opcodes. Continue up to the
  424. ** first unmatched ")" character, then return. If an error is found,
  425. ** return a pointer to the error message string.
  426. */
  427. static const char *re_subcompile_re(ReCompiled *p){
  428. const char *zErr;
  429. int iStart, iEnd, iGoto;
  430. iStart = p->nState;
  431. zErr = re_subcompile_string(p);
  432. if( zErr ) return zErr;
  433. while( rePeek(p)=='|' ){
  434. iEnd = p->nState;
  435. re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
  436. iGoto = re_append(p, RE_OP_GOTO, 0);
  437. p->sIn.i++;
  438. zErr = re_subcompile_string(p);
  439. if( zErr ) return zErr;
  440. p->aArg[iGoto] = p->nState - iGoto;
  441. }
  442. return 0;
  443. }
  444. /* Compile an element of regular expression text (anything that can be
  445. ** an operand to the "|" operator). Return NULL on success or a pointer
  446. ** to the error message if there is a problem.
  447. */
  448. static const char *re_subcompile_string(ReCompiled *p){
  449. int iPrev = -1;
  450. int iStart;
  451. unsigned c;
  452. const char *zErr;
  453. while( (c = p->xNextChar(&p->sIn))!=0 ){
  454. iStart = p->nState;
  455. switch( c ){
  456. case '|':
  457. case '$':
  458. case ')': {
  459. p->sIn.i--;
  460. return 0;
  461. }
  462. case '(': {
  463. zErr = re_subcompile_re(p);
  464. if( zErr ) return zErr;
  465. if( rePeek(p)!=')' ) return "unmatched '('";
  466. p->sIn.i++;
  467. break;
  468. }
  469. case '.': {
  470. if( rePeek(p)=='*' ){
  471. re_append(p, RE_OP_ANYSTAR, 0);
  472. p->sIn.i++;
  473. }else{
  474. re_append(p, RE_OP_ANY, 0);
  475. }
  476. break;
  477. }
  478. case '*': {
  479. if( iPrev<0 ) return "'*' without operand";
  480. re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
  481. re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
  482. break;
  483. }
  484. case '+': {
  485. if( iPrev<0 ) return "'+' without operand";
  486. re_append(p, RE_OP_FORK, iPrev - p->nState);
  487. break;
  488. }
  489. case '?': {
  490. if( iPrev<0 ) return "'?' without operand";
  491. re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
  492. break;
  493. }
  494. case '{': {
  495. int m = 0, n = 0;
  496. int sz, j;
  497. if( iPrev<0 ) return "'{m,n}' without operand";
  498. while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }
  499. n = m;
  500. if( c==',' ){
  501. p->sIn.i++;
  502. n = 0;
  503. while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }
  504. }
  505. if( c!='}' ) return "unmatched '{'";
  506. if( n>0 && n<m ) return "n less than m in '{m,n}'";
  507. p->sIn.i++;
  508. sz = p->nState - iPrev;
  509. if( m==0 ){
  510. if( n==0 ) return "both m and n are zero in '{m,n}'";
  511. re_insert(p, iPrev, RE_OP_FORK, sz+1);
  512. n--;
  513. }else{
  514. for(j=1; j<m; j++) re_copy(p, iPrev, sz);
  515. }
  516. for(j=m; j<n; j++){
  517. re_append(p, RE_OP_FORK, sz+1);
  518. re_copy(p, iPrev, sz);
  519. }
  520. if( n==0 && m>0 ){
  521. re_append(p, RE_OP_FORK, -sz);
  522. }
  523. break;
  524. }
  525. case '[': {
  526. int iFirst = p->nState;
  527. if( rePeek(p)=='^' ){
  528. re_append(p, RE_OP_CC_EXC, 0);
  529. p->sIn.i++;
  530. }else{
  531. re_append(p, RE_OP_CC_INC, 0);
  532. }
  533. while( (c = p->xNextChar(&p->sIn))!=0 ){
  534. if( c=='[' && rePeek(p)==':' ){
  535. return "POSIX character classes not supported";
  536. }
  537. if( c=='\\' ) c = re_esc_char(p);
  538. if( rePeek(p)=='-' ){
  539. re_append(p, RE_OP_CC_RANGE, c);
  540. p->sIn.i++;
  541. c = p->xNextChar(&p->sIn);
  542. if( c=='\\' ) c = re_esc_char(p);
  543. re_append(p, RE_OP_CC_RANGE, c);
  544. }else{
  545. re_append(p, RE_OP_CC_VALUE, c);
  546. }
  547. if( rePeek(p)==']' ){ p->sIn.i++; break; }
  548. }
  549. if( c==0 ) return "unclosed '['";
  550. p->aArg[iFirst] = p->nState - iFirst;
  551. break;
  552. }
  553. case '\\': {
  554. int specialOp = 0;
  555. switch( rePeek(p) ){
  556. case 'b': specialOp = RE_OP_BOUNDARY; break;
  557. case 'd': specialOp = RE_OP_DIGIT; break;
  558. case 'D': specialOp = RE_OP_NOTDIGIT; break;
  559. case 's': specialOp = RE_OP_SPACE; break;
  560. case 'S': specialOp = RE_OP_NOTSPACE; break;
  561. case 'w': specialOp = RE_OP_WORD; break;
  562. case 'W': specialOp = RE_OP_NOTWORD; break;
  563. }
  564. if( specialOp ){
  565. p->sIn.i++;
  566. re_append(p, specialOp, 0);
  567. }else{
  568. c = re_esc_char(p);
  569. re_append(p, RE_OP_MATCH, c);
  570. }
  571. break;
  572. }
  573. default: {
  574. re_append(p, RE_OP_MATCH, c);
  575. break;
  576. }
  577. }
  578. iPrev = iStart;
  579. }
  580. return 0;
  581. }
  582. /* Free and reclaim all the memory used by a previously compiled
  583. ** regular expression. Applications should invoke this routine once
  584. ** for every call to re_compile() to avoid memory leaks.
  585. */
  586. void re_free(ReCompiled *pRe){
  587. if( pRe ){
  588. sqlite3_free(pRe->aOp);
  589. sqlite3_free(pRe->aArg);
  590. sqlite3_free(pRe);
  591. }
  592. }
  593. /*
  594. ** Compile a textual regular expression in zIn[] into a compiled regular
  595. ** expression suitable for us by re_match() and return a pointer to the
  596. ** compiled regular expression in *ppRe. Return NULL on success or an
  597. ** error message if something goes wrong.
  598. */
  599. const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
  600. ReCompiled *pRe;
  601. const char *zErr;
  602. int i, j;
  603. *ppRe = 0;
  604. pRe = sqlite3_malloc( sizeof(*pRe) );
  605. if( pRe==0 ){
  606. return "out of memory";
  607. }
  608. memset(pRe, 0, sizeof(*pRe));
  609. pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
  610. if( re_resize(pRe, 30) ){
  611. re_free(pRe);
  612. return "out of memory";
  613. }
  614. if( zIn[0]=='^' ){
  615. zIn++;
  616. }else{
  617. re_append(pRe, RE_OP_ANYSTAR, 0);
  618. }
  619. pRe->sIn.z = (unsigned char*)zIn;
  620. pRe->sIn.i = 0;
  621. pRe->sIn.mx = (int)strlen(zIn);
  622. zErr = re_subcompile_re(pRe);
  623. if( zErr ){
  624. re_free(pRe);
  625. return zErr;
  626. }
  627. if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){
  628. re_append(pRe, RE_OP_MATCH, RE_EOF);
  629. re_append(pRe, RE_OP_ACCEPT, 0);
  630. *ppRe = pRe;
  631. }else if( pRe->sIn.i>=pRe->sIn.mx ){
  632. re_append(pRe, RE_OP_ACCEPT, 0);
  633. *ppRe = pRe;
  634. }else{
  635. re_free(pRe);
  636. return "unrecognized character";
  637. }
  638. /* The following is a performance optimization. If the regex begins with
  639. ** ".*" (if the input regex lacks an initial "^") and afterwards there are
  640. ** one or more matching characters, enter those matching characters into
  641. ** zInit[]. The re_match() routine can then search ahead in the input
  642. ** string looking for the initial match without having to run the whole
  643. ** regex engine over the string. Do not worry able trying to match
  644. ** unicode characters beyond plane 0 - those are very rare and this is
  645. ** just an optimization. */
  646. if( pRe->aOp[0]==RE_OP_ANYSTAR ){
  647. for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
  648. unsigned x = pRe->aArg[i];
  649. if( x<=127 ){
  650. pRe->zInit[j++] = x;
  651. }else if( x<=0xfff ){
  652. pRe->zInit[j++] = 0xc0 | (x>>6);
  653. pRe->zInit[j++] = 0x80 | (x&0x3f);
  654. }else if( x<=0xffff ){
  655. pRe->zInit[j++] = 0xd0 | (x>>12);
  656. pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
  657. pRe->zInit[j++] = 0x80 | (x&0x3f);
  658. }else{
  659. break;
  660. }
  661. }
  662. if( j>0 && pRe->zInit[j-1]==0 ) j--;
  663. pRe->nInit = j;
  664. }
  665. return pRe->zErr;
  666. }
  667. /*
  668. ** Implementation of the regexp() SQL function. This function implements
  669. ** the build-in REGEXP operator. The first argument to the function is the
  670. ** pattern and the second argument is the string. So, the SQL statements:
  671. **
  672. ** A REGEXP B
  673. **
  674. ** is implemented as regexp(B,A).
  675. */
  676. static void re_sql_func(
  677. sqlite3_context *context,
  678. int argc,
  679. sqlite3_value **argv
  680. ){
  681. ReCompiled *pRe; /* Compiled regular expression */
  682. const char *zPattern; /* The regular expression */
  683. const unsigned char *zStr;/* String being searched */
  684. const char *zErr; /* Compile error message */
  685. int setAux = 0; /* True to invoke sqlite3_set_auxdata() */
  686. pRe = sqlite3_get_auxdata(context, 0);
  687. if( pRe==0 ){
  688. zPattern = (const char*)sqlite3_value_text(argv[0]);
  689. if( zPattern==0 ) return;
  690. zErr = re_compile(&pRe, zPattern, 0);
  691. if( zErr ){
  692. re_free(pRe);
  693. sqlite3_result_error(context, zErr, -1);
  694. return;
  695. }
  696. if( pRe==0 ){
  697. sqlite3_result_error_nomem(context);
  698. return;
  699. }
  700. setAux = 1;
  701. }
  702. zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
  703. if( zStr!=0 ){
  704. sqlite3_result_int(context, re_match(pRe, zStr, -1));
  705. }
  706. if( setAux ){
  707. sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
  708. }
  709. }
  710. /*
  711. ** Invoke this routine to register the regexp() function with the
  712. ** SQLite database connection.
  713. */
  714. #ifdef _WIN32
  715. __declspec(dllexport)
  716. #endif
  717. int sqlite3_regexp_init(
  718. sqlite3 *db,
  719. char **pzErrMsg,
  720. const sqlite3_api_routines *pApi
  721. ){
  722. int rc = SQLITE_OK;
  723. SQLITE_EXTENSION_INIT2(pApi);
  724. rc = sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,
  725. re_sql_func, 0, 0);
  726. return rc;
  727. }