OLD | NEW |
| (Empty) |
1 /* | |
2 ** 2012-11-13 | |
3 ** | |
4 ** The author disclaims copyright to this source code. In place of | |
5 ** a legal notice, here is a blessing: | |
6 ** | |
7 ** May you do good and not evil. | |
8 ** May you find forgiveness for yourself and forgive others. | |
9 ** May you share freely, never taking more than you give. | |
10 ** | |
11 ****************************************************************************** | |
12 ** | |
13 ** The code in this file implements a compact but reasonably | |
14 ** efficient regular-expression matcher for posix extended regular | |
15 ** expressions against UTF8 text. | |
16 ** | |
17 ** This file is an SQLite extension. It registers a single function | |
18 ** named "regexp(A,B)" where A is the regular expression and B is the | |
19 ** string to be matched. By registering this function, SQLite will also | |
20 ** then implement the "B regexp A" operator. Note that with the function | |
21 ** the regular expression comes first, but with the operator it comes | |
22 ** second. | |
23 ** | |
24 ** The following regular expression syntax is supported: | |
25 ** | |
26 ** X* zero or more occurrences of X | |
27 ** X+ one or more occurrences of X | |
28 ** X? zero or one occurrences of X | |
29 ** X{p,q} between p and q occurrences of X | |
30 ** (X) match X | |
31 ** X|Y X or Y | |
32 ** ^X X occurring at the beginning of the string | |
33 ** X$ X occurring at the end of the string | |
34 ** . Match any single character | |
35 ** \c Character c where c is one of \{}()[]|*+?. | |
36 ** \c C-language escapes for c in afnrtv. ex: \t or \n | |
37 ** \uXXXX Where XXXX is exactly 4 hex digits, unicode value XXXX | |
38 ** \xXX Where XX is exactly 2 hex digits, unicode value XX | |
39 ** [abc] Any single character from the set abc | |
40 ** [^abc] Any single character not in the set abc | |
41 ** [a-z] Any single character in the range a-z | |
42 ** [^a-z] Any single character not in the range a-z | |
43 ** \b Word boundary | |
44 ** \w Word character. [A-Za-z0-9_] | |
45 ** \W Non-word character | |
46 ** \d Digit | |
47 ** \D Non-digit | |
48 ** \s Whitespace character | |
49 ** \S Non-whitespace character | |
50 ** | |
51 ** A nondeterministic finite automaton (NFA) is used for matching, so the | |
52 ** performance is bounded by O(N*M) where N is the size of the regular | |
53 ** expression and M is the size of the input string. The matcher never | |
54 ** exhibits exponential behavior. Note that the X{p,q} operator expands | |
55 ** to p copies of X following by q-p copies of X? and that the size of the | |
56 ** regular expression in the O(N*M) performance bound is computed after | |
57 ** this expansion. | |
58 */ | |
59 #include <string.h> | |
60 #include <stdlib.h> | |
61 #include "sqlite3ext.h" | |
62 SQLITE_EXTENSION_INIT1 | |
63 | |
64 /* | |
65 ** The following #defines change the names of some functions implemented in | |
66 ** this file to prevent name collisions with C-library functions of the | |
67 ** same name. | |
68 */ | |
69 #define re_match sqlite3re_match | |
70 #define re_compile sqlite3re_compile | |
71 #define re_free sqlite3re_free | |
72 | |
73 /* The end-of-input character */ | |
74 #define RE_EOF 0 /* End of input */ | |
75 | |
76 /* The NFA is implemented as sequence of opcodes taken from the following | |
77 ** set. Each opcode has a single integer argument. | |
78 */ | |
79 #define RE_OP_MATCH 1 /* Match the one character in the argument */ | |
80 #define RE_OP_ANY 2 /* Match any one character. (Implements ".") */ | |
81 #define RE_OP_ANYSTAR 3 /* Special optimized version of .* */ | |
82 #define RE_OP_FORK 4 /* Continue to both next and opcode at iArg */ | |
83 #define RE_OP_GOTO 5 /* Jump to opcode at iArg */ | |
84 #define RE_OP_ACCEPT 6 /* Halt and indicate a successful match */ | |
85 #define RE_OP_CC_INC 7 /* Beginning of a [...] character class */ | |
86 #define RE_OP_CC_EXC 8 /* Beginning of a [^...] character class */ | |
87 #define RE_OP_CC_VALUE 9 /* Single value in a character class */ | |
88 #define RE_OP_CC_RANGE 10 /* Range of values in a character class */ | |
89 #define RE_OP_WORD 11 /* Perl word character [A-Za-z0-9_] */ | |
90 #define RE_OP_NOTWORD 12 /* Not a perl word character */ | |
91 #define RE_OP_DIGIT 13 /* digit: [0-9] */ | |
92 #define RE_OP_NOTDIGIT 14 /* Not a digit */ | |
93 #define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */ | |
94 #define RE_OP_NOTSPACE 16 /* Not a digit */ | |
95 #define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */ | |
96 | |
97 /* Each opcode is a "state" in the NFA */ | |
98 typedef unsigned short ReStateNumber; | |
99 | |
100 /* Because this is an NFA and not a DFA, multiple states can be active at | |
101 ** once. An instance of the following object records all active states in | |
102 ** the NFA. The implementation is optimized for the common case where the | |
103 ** number of actives states is small. | |
104 */ | |
105 typedef struct ReStateSet { | |
106 unsigned nState; /* Number of current states */ | |
107 ReStateNumber *aState; /* Current states */ | |
108 } ReStateSet; | |
109 | |
110 /* An input string read one character at a time. | |
111 */ | |
112 typedef struct ReInput ReInput; | |
113 struct ReInput { | |
114 const unsigned char *z; /* All text */ | |
115 int i; /* Next byte to read */ | |
116 int mx; /* EOF when i>=mx */ | |
117 }; | |
118 | |
119 /* A compiled NFA (or an NFA that is in the process of being compiled) is | |
120 ** an instance of the following object. | |
121 */ | |
122 typedef struct ReCompiled ReCompiled; | |
123 struct ReCompiled { | |
124 ReInput sIn; /* Regular expression text */ | |
125 const char *zErr; /* Error message to return */ | |
126 char *aOp; /* Operators for the virtual machine */ | |
127 int *aArg; /* Arguments to each operator */ | |
128 unsigned (*xNextChar)(ReInput*); /* Next character function */ | |
129 unsigned char zInit[12]; /* Initial text to match */ | |
130 int nInit; /* Number of characters in zInit */ | |
131 unsigned nState; /* Number of entries in aOp[] and aArg[] */ | |
132 unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */ | |
133 }; | |
134 | |
135 /* Add a state to the given state set if it is not already there */ | |
136 static void re_add_state(ReStateSet *pSet, int newState){ | |
137 unsigned i; | |
138 for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return; | |
139 pSet->aState[pSet->nState++] = newState; | |
140 } | |
141 | |
142 /* Extract the next unicode character from *pzIn and return it. Advance | |
143 ** *pzIn to the first byte past the end of the character returned. To | |
144 ** be clear: this routine converts utf8 to unicode. This routine is | |
145 ** optimized for the common case where the next character is a single byte. | |
146 */ | |
147 static unsigned re_next_char(ReInput *p){ | |
148 unsigned c; | |
149 if( p->i>=p->mx ) return 0; | |
150 c = p->z[p->i++]; | |
151 if( c>=0x80 ){ | |
152 if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){ | |
153 c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f); | |
154 if( c<0x80 ) c = 0xfffd; | |
155 }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80 | |
156 && (p->z[p->i+1]&0xc0)==0x80 ){ | |
157 c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f); | |
158 p->i += 2; | |
159 if( c<=0x3ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd; | |
160 }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80 | |
161 && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){ | |
162 c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6) | |
163 | (p->z[p->i+2]&0x3f); | |
164 p->i += 3; | |
165 if( c<=0xffff || c>0x10ffff ) c = 0xfffd; | |
166 }else{ | |
167 c = 0xfffd; | |
168 } | |
169 } | |
170 return c; | |
171 } | |
172 static unsigned re_next_char_nocase(ReInput *p){ | |
173 unsigned c = re_next_char(p); | |
174 if( c>='A' && c<='Z' ) c += 'a' - 'A'; | |
175 return c; | |
176 } | |
177 | |
178 /* Return true if c is a perl "word" character: [A-Za-z0-9_] */ | |
179 static int re_word_char(int c){ | |
180 return (c>='0' && c<='9') || (c>='a' && c<='z') | |
181 || (c>='A' && c<='Z') || c=='_'; | |
182 } | |
183 | |
184 /* Return true if c is a "digit" character: [0-9] */ | |
185 static int re_digit_char(int c){ | |
186 return (c>='0' && c<='9'); | |
187 } | |
188 | |
189 /* Return true if c is a perl "space" character: [ \t\r\n\v\f] */ | |
190 static int re_space_char(int c){ | |
191 return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f'; | |
192 } | |
193 | |
194 /* Run a compiled regular expression on the zero-terminated input | |
195 ** string zIn[]. Return true on a match and false if there is no match. | |
196 */ | |
197 static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){ | |
198 ReStateSet aStateSet[2], *pThis, *pNext; | |
199 ReStateNumber aSpace[100]; | |
200 ReStateNumber *pToFree; | |
201 unsigned int i = 0; | |
202 unsigned int iSwap = 0; | |
203 int c = RE_EOF+1; | |
204 int cPrev = 0; | |
205 int rc = 0; | |
206 ReInput in; | |
207 | |
208 in.z = zIn; | |
209 in.i = 0; | |
210 in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn); | |
211 | |
212 /* Look for the initial prefix match, if there is one. */ | |
213 if( pRe->nInit ){ | |
214 unsigned char x = pRe->zInit[0]; | |
215 while( in.i+pRe->nInit<=in.mx | |
216 && (zIn[in.i]!=x || | |
217 strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0) | |
218 ){ | |
219 in.i++; | |
220 } | |
221 if( in.i+pRe->nInit>in.mx ) return 0; | |
222 } | |
223 | |
224 if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){ | |
225 pToFree = 0; | |
226 aStateSet[0].aState = aSpace; | |
227 }else{ | |
228 pToFree = sqlite3_malloc( sizeof(ReStateNumber)*2*pRe->nState ); | |
229 if( pToFree==0 ) return -1; | |
230 aStateSet[0].aState = pToFree; | |
231 } | |
232 aStateSet[1].aState = &aStateSet[0].aState[pRe->nState]; | |
233 pNext = &aStateSet[1]; | |
234 pNext->nState = 0; | |
235 re_add_state(pNext, 0); | |
236 while( c!=RE_EOF && pNext->nState>0 ){ | |
237 cPrev = c; | |
238 c = pRe->xNextChar(&in); | |
239 pThis = pNext; | |
240 pNext = &aStateSet[iSwap]; | |
241 iSwap = 1 - iSwap; | |
242 pNext->nState = 0; | |
243 for(i=0; i<pThis->nState; i++){ | |
244 int x = pThis->aState[i]; | |
245 switch( pRe->aOp[x] ){ | |
246 case RE_OP_MATCH: { | |
247 if( pRe->aArg[x]==c ) re_add_state(pNext, x+1); | |
248 break; | |
249 } | |
250 case RE_OP_ANY: { | |
251 re_add_state(pNext, x+1); | |
252 break; | |
253 } | |
254 case RE_OP_WORD: { | |
255 if( re_word_char(c) ) re_add_state(pNext, x+1); | |
256 break; | |
257 } | |
258 case RE_OP_NOTWORD: { | |
259 if( !re_word_char(c) ) re_add_state(pNext, x+1); | |
260 break; | |
261 } | |
262 case RE_OP_DIGIT: { | |
263 if( re_digit_char(c) ) re_add_state(pNext, x+1); | |
264 break; | |
265 } | |
266 case RE_OP_NOTDIGIT: { | |
267 if( !re_digit_char(c) ) re_add_state(pNext, x+1); | |
268 break; | |
269 } | |
270 case RE_OP_SPACE: { | |
271 if( re_space_char(c) ) re_add_state(pNext, x+1); | |
272 break; | |
273 } | |
274 case RE_OP_NOTSPACE: { | |
275 if( !re_space_char(c) ) re_add_state(pNext, x+1); | |
276 break; | |
277 } | |
278 case RE_OP_BOUNDARY: { | |
279 if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1); | |
280 break; | |
281 } | |
282 case RE_OP_ANYSTAR: { | |
283 re_add_state(pNext, x); | |
284 re_add_state(pThis, x+1); | |
285 break; | |
286 } | |
287 case RE_OP_FORK: { | |
288 re_add_state(pThis, x+pRe->aArg[x]); | |
289 re_add_state(pThis, x+1); | |
290 break; | |
291 } | |
292 case RE_OP_GOTO: { | |
293 re_add_state(pThis, x+pRe->aArg[x]); | |
294 break; | |
295 } | |
296 case RE_OP_ACCEPT: { | |
297 rc = 1; | |
298 goto re_match_end; | |
299 } | |
300 case RE_OP_CC_INC: | |
301 case RE_OP_CC_EXC: { | |
302 int j = 1; | |
303 int n = pRe->aArg[x]; | |
304 int hit = 0; | |
305 for(j=1; j>0 && j<n; j++){ | |
306 if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){ | |
307 if( pRe->aArg[x+j]==c ){ | |
308 hit = 1; | |
309 j = -1; | |
310 } | |
311 }else{ | |
312 if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){ | |
313 hit = 1; | |
314 j = -1; | |
315 }else{ | |
316 j++; | |
317 } | |
318 } | |
319 } | |
320 if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit; | |
321 if( hit ) re_add_state(pNext, x+n); | |
322 break; | |
323 } | |
324 } | |
325 } | |
326 } | |
327 for(i=0; i<pNext->nState; i++){ | |
328 if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; } | |
329 } | |
330 re_match_end: | |
331 sqlite3_free(pToFree); | |
332 return rc; | |
333 } | |
334 | |
335 /* Resize the opcode and argument arrays for an RE under construction. | |
336 */ | |
337 static int re_resize(ReCompiled *p, int N){ | |
338 char *aOp; | |
339 int *aArg; | |
340 aOp = sqlite3_realloc(p->aOp, N*sizeof(p->aOp[0])); | |
341 if( aOp==0 ) return 1; | |
342 p->aOp = aOp; | |
343 aArg = sqlite3_realloc(p->aArg, N*sizeof(p->aArg[0])); | |
344 if( aArg==0 ) return 1; | |
345 p->aArg = aArg; | |
346 p->nAlloc = N; | |
347 return 0; | |
348 } | |
349 | |
350 /* Insert a new opcode and argument into an RE under construction. The | |
351 ** insertion point is just prior to existing opcode iBefore. | |
352 */ | |
353 static int re_insert(ReCompiled *p, int iBefore, int op, int arg){ | |
354 int i; | |
355 if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0; | |
356 for(i=p->nState; i>iBefore; i--){ | |
357 p->aOp[i] = p->aOp[i-1]; | |
358 p->aArg[i] = p->aArg[i-1]; | |
359 } | |
360 p->nState++; | |
361 p->aOp[iBefore] = op; | |
362 p->aArg[iBefore] = arg; | |
363 return iBefore; | |
364 } | |
365 | |
366 /* Append a new opcode and argument to the end of the RE under construction. | |
367 */ | |
368 static int re_append(ReCompiled *p, int op, int arg){ | |
369 return re_insert(p, p->nState, op, arg); | |
370 } | |
371 | |
372 /* Make a copy of N opcodes starting at iStart onto the end of the RE | |
373 ** under construction. | |
374 */ | |
375 static void re_copy(ReCompiled *p, int iStart, int N){ | |
376 if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return; | |
377 memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0])); | |
378 memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0])); | |
379 p->nState += N; | |
380 } | |
381 | |
382 /* Return true if c is a hexadecimal digit character: [0-9a-fA-F] | |
383 ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c). If | |
384 ** c is not a hex digit *pV is unchanged. | |
385 */ | |
386 static int re_hex(int c, int *pV){ | |
387 if( c>='0' && c<='9' ){ | |
388 c -= '0'; | |
389 }else if( c>='a' && c<='f' ){ | |
390 c -= 'a' - 10; | |
391 }else if( c>='A' && c<='F' ){ | |
392 c -= 'A' - 10; | |
393 }else{ | |
394 return 0; | |
395 } | |
396 *pV = (*pV)*16 + (c & 0xff); | |
397 return 1; | |
398 } | |
399 | |
400 /* A backslash character has been seen, read the next character and | |
401 ** return its interpretation. | |
402 */ | |
403 static unsigned re_esc_char(ReCompiled *p){ | |
404 static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]"; | |
405 static const char zTrans[] = "\a\f\n\r\t\v"; | |
406 int i, v = 0; | |
407 char c; | |
408 if( p->sIn.i>=p->sIn.mx ) return 0; | |
409 c = p->sIn.z[p->sIn.i]; | |
410 if( c=='u' && p->sIn.i+4<p->sIn.mx ){ | |
411 const unsigned char *zIn = p->sIn.z + p->sIn.i; | |
412 if( re_hex(zIn[1],&v) | |
413 && re_hex(zIn[2],&v) | |
414 && re_hex(zIn[3],&v) | |
415 && re_hex(zIn[4],&v) | |
416 ){ | |
417 p->sIn.i += 5; | |
418 return v; | |
419 } | |
420 } | |
421 if( c=='x' && p->sIn.i+2<p->sIn.mx ){ | |
422 const unsigned char *zIn = p->sIn.z + p->sIn.i; | |
423 if( re_hex(zIn[1],&v) | |
424 && re_hex(zIn[2],&v) | |
425 ){ | |
426 p->sIn.i += 3; | |
427 return v; | |
428 } | |
429 } | |
430 for(i=0; zEsc[i] && zEsc[i]!=c; i++){} | |
431 if( zEsc[i] ){ | |
432 if( i<6 ) c = zTrans[i]; | |
433 p->sIn.i++; | |
434 }else{ | |
435 p->zErr = "unknown \\ escape"; | |
436 } | |
437 return c; | |
438 } | |
439 | |
440 /* Forward declaration */ | |
441 static const char *re_subcompile_string(ReCompiled*); | |
442 | |
443 /* Peek at the next byte of input */ | |
444 static unsigned char rePeek(ReCompiled *p){ | |
445 return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0; | |
446 } | |
447 | |
448 /* Compile RE text into a sequence of opcodes. Continue up to the | |
449 ** first unmatched ")" character, then return. If an error is found, | |
450 ** return a pointer to the error message string. | |
451 */ | |
452 static const char *re_subcompile_re(ReCompiled *p){ | |
453 const char *zErr; | |
454 int iStart, iEnd, iGoto; | |
455 iStart = p->nState; | |
456 zErr = re_subcompile_string(p); | |
457 if( zErr ) return zErr; | |
458 while( rePeek(p)=='|' ){ | |
459 iEnd = p->nState; | |
460 re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart); | |
461 iGoto = re_append(p, RE_OP_GOTO, 0); | |
462 p->sIn.i++; | |
463 zErr = re_subcompile_string(p); | |
464 if( zErr ) return zErr; | |
465 p->aArg[iGoto] = p->nState - iGoto; | |
466 } | |
467 return 0; | |
468 } | |
469 | |
470 /* Compile an element of regular expression text (anything that can be | |
471 ** an operand to the "|" operator). Return NULL on success or a pointer | |
472 ** to the error message if there is a problem. | |
473 */ | |
474 static const char *re_subcompile_string(ReCompiled *p){ | |
475 int iPrev = -1; | |
476 int iStart; | |
477 unsigned c; | |
478 const char *zErr; | |
479 while( (c = p->xNextChar(&p->sIn))!=0 ){ | |
480 iStart = p->nState; | |
481 switch( c ){ | |
482 case '|': | |
483 case '$': | |
484 case ')': { | |
485 p->sIn.i--; | |
486 return 0; | |
487 } | |
488 case '(': { | |
489 zErr = re_subcompile_re(p); | |
490 if( zErr ) return zErr; | |
491 if( rePeek(p)!=')' ) return "unmatched '('"; | |
492 p->sIn.i++; | |
493 break; | |
494 } | |
495 case '.': { | |
496 if( rePeek(p)=='*' ){ | |
497 re_append(p, RE_OP_ANYSTAR, 0); | |
498 p->sIn.i++; | |
499 }else{ | |
500 re_append(p, RE_OP_ANY, 0); | |
501 } | |
502 break; | |
503 } | |
504 case '*': { | |
505 if( iPrev<0 ) return "'*' without operand"; | |
506 re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1); | |
507 re_append(p, RE_OP_FORK, iPrev - p->nState + 1); | |
508 break; | |
509 } | |
510 case '+': { | |
511 if( iPrev<0 ) return "'+' without operand"; | |
512 re_append(p, RE_OP_FORK, iPrev - p->nState); | |
513 break; | |
514 } | |
515 case '?': { | |
516 if( iPrev<0 ) return "'?' without operand"; | |
517 re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1); | |
518 break; | |
519 } | |
520 case '{': { | |
521 int m = 0, n = 0; | |
522 int sz, j; | |
523 if( iPrev<0 ) return "'{m,n}' without operand"; | |
524 while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; } | |
525 n = m; | |
526 if( c==',' ){ | |
527 p->sIn.i++; | |
528 n = 0; | |
529 while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; } | |
530 } | |
531 if( c!='}' ) return "unmatched '{'"; | |
532 if( n>0 && n<m ) return "n less than m in '{m,n}'"; | |
533 p->sIn.i++; | |
534 sz = p->nState - iPrev; | |
535 if( m==0 ){ | |
536 if( n==0 ) return "both m and n are zero in '{m,n}'"; | |
537 re_insert(p, iPrev, RE_OP_FORK, sz+1); | |
538 n--; | |
539 }else{ | |
540 for(j=1; j<m; j++) re_copy(p, iPrev, sz); | |
541 } | |
542 for(j=m; j<n; j++){ | |
543 re_append(p, RE_OP_FORK, sz+1); | |
544 re_copy(p, iPrev, sz); | |
545 } | |
546 if( n==0 && m>0 ){ | |
547 re_append(p, RE_OP_FORK, -sz); | |
548 } | |
549 break; | |
550 } | |
551 case '[': { | |
552 int iFirst = p->nState; | |
553 if( rePeek(p)=='^' ){ | |
554 re_append(p, RE_OP_CC_EXC, 0); | |
555 p->sIn.i++; | |
556 }else{ | |
557 re_append(p, RE_OP_CC_INC, 0); | |
558 } | |
559 while( (c = p->xNextChar(&p->sIn))!=0 ){ | |
560 if( c=='[' && rePeek(p)==':' ){ | |
561 return "POSIX character classes not supported"; | |
562 } | |
563 if( c=='\\' ) c = re_esc_char(p); | |
564 if( rePeek(p)=='-' ){ | |
565 re_append(p, RE_OP_CC_RANGE, c); | |
566 p->sIn.i++; | |
567 c = p->xNextChar(&p->sIn); | |
568 if( c=='\\' ) c = re_esc_char(p); | |
569 re_append(p, RE_OP_CC_RANGE, c); | |
570 }else{ | |
571 re_append(p, RE_OP_CC_VALUE, c); | |
572 } | |
573 if( rePeek(p)==']' ){ p->sIn.i++; break; } | |
574 } | |
575 if( c==0 ) return "unclosed '['"; | |
576 p->aArg[iFirst] = p->nState - iFirst; | |
577 break; | |
578 } | |
579 case '\\': { | |
580 int specialOp = 0; | |
581 switch( rePeek(p) ){ | |
582 case 'b': specialOp = RE_OP_BOUNDARY; break; | |
583 case 'd': specialOp = RE_OP_DIGIT; break; | |
584 case 'D': specialOp = RE_OP_NOTDIGIT; break; | |
585 case 's': specialOp = RE_OP_SPACE; break; | |
586 case 'S': specialOp = RE_OP_NOTSPACE; break; | |
587 case 'w': specialOp = RE_OP_WORD; break; | |
588 case 'W': specialOp = RE_OP_NOTWORD; break; | |
589 } | |
590 if( specialOp ){ | |
591 p->sIn.i++; | |
592 re_append(p, specialOp, 0); | |
593 }else{ | |
594 c = re_esc_char(p); | |
595 re_append(p, RE_OP_MATCH, c); | |
596 } | |
597 break; | |
598 } | |
599 default: { | |
600 re_append(p, RE_OP_MATCH, c); | |
601 break; | |
602 } | |
603 } | |
604 iPrev = iStart; | |
605 } | |
606 return 0; | |
607 } | |
608 | |
609 /* Free and reclaim all the memory used by a previously compiled | |
610 ** regular expression. Applications should invoke this routine once | |
611 ** for every call to re_compile() to avoid memory leaks. | |
612 */ | |
613 void re_free(ReCompiled *pRe){ | |
614 if( pRe ){ | |
615 sqlite3_free(pRe->aOp); | |
616 sqlite3_free(pRe->aArg); | |
617 sqlite3_free(pRe); | |
618 } | |
619 } | |
620 | |
621 /* | |
622 ** Compile a textual regular expression in zIn[] into a compiled regular | |
623 ** expression suitable for us by re_match() and return a pointer to the | |
624 ** compiled regular expression in *ppRe. Return NULL on success or an | |
625 ** error message if something goes wrong. | |
626 */ | |
627 const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){ | |
628 ReCompiled *pRe; | |
629 const char *zErr; | |
630 int i, j; | |
631 | |
632 *ppRe = 0; | |
633 pRe = sqlite3_malloc( sizeof(*pRe) ); | |
634 if( pRe==0 ){ | |
635 return "out of memory"; | |
636 } | |
637 memset(pRe, 0, sizeof(*pRe)); | |
638 pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char; | |
639 if( re_resize(pRe, 30) ){ | |
640 re_free(pRe); | |
641 return "out of memory"; | |
642 } | |
643 if( zIn[0]=='^' ){ | |
644 zIn++; | |
645 }else{ | |
646 re_append(pRe, RE_OP_ANYSTAR, 0); | |
647 } | |
648 pRe->sIn.z = (unsigned char*)zIn; | |
649 pRe->sIn.i = 0; | |
650 pRe->sIn.mx = (int)strlen(zIn); | |
651 zErr = re_subcompile_re(pRe); | |
652 if( zErr ){ | |
653 re_free(pRe); | |
654 return zErr; | |
655 } | |
656 if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){ | |
657 re_append(pRe, RE_OP_MATCH, RE_EOF); | |
658 re_append(pRe, RE_OP_ACCEPT, 0); | |
659 *ppRe = pRe; | |
660 }else if( pRe->sIn.i>=pRe->sIn.mx ){ | |
661 re_append(pRe, RE_OP_ACCEPT, 0); | |
662 *ppRe = pRe; | |
663 }else{ | |
664 re_free(pRe); | |
665 return "unrecognized character"; | |
666 } | |
667 | |
668 /* The following is a performance optimization. If the regex begins with | |
669 ** ".*" (if the input regex lacks an initial "^") and afterwards there are | |
670 ** one or more matching characters, enter those matching characters into | |
671 ** zInit[]. The re_match() routine can then search ahead in the input | |
672 ** string looking for the initial match without having to run the whole | |
673 ** regex engine over the string. Do not worry able trying to match | |
674 ** unicode characters beyond plane 0 - those are very rare and this is | |
675 ** just an optimization. */ | |
676 if( pRe->aOp[0]==RE_OP_ANYSTAR ){ | |
677 for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){ | |
678 unsigned x = pRe->aArg[i]; | |
679 if( x<=127 ){ | |
680 pRe->zInit[j++] = x; | |
681 }else if( x<=0xfff ){ | |
682 pRe->zInit[j++] = 0xc0 | (x>>6); | |
683 pRe->zInit[j++] = 0x80 | (x&0x3f); | |
684 }else if( x<=0xffff ){ | |
685 pRe->zInit[j++] = 0xd0 | (x>>12); | |
686 pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f); | |
687 pRe->zInit[j++] = 0x80 | (x&0x3f); | |
688 }else{ | |
689 break; | |
690 } | |
691 } | |
692 if( j>0 && pRe->zInit[j-1]==0 ) j--; | |
693 pRe->nInit = j; | |
694 } | |
695 return pRe->zErr; | |
696 } | |
697 | |
698 /* | |
699 ** Implementation of the regexp() SQL function. This function implements | |
700 ** the build-in REGEXP operator. The first argument to the function is the | |
701 ** pattern and the second argument is the string. So, the SQL statements: | |
702 ** | |
703 ** A REGEXP B | |
704 ** | |
705 ** is implemented as regexp(B,A). | |
706 */ | |
707 static void re_sql_func( | |
708 sqlite3_context *context, | |
709 int argc, | |
710 sqlite3_value **argv | |
711 ){ | |
712 ReCompiled *pRe; /* Compiled regular expression */ | |
713 const char *zPattern; /* The regular expression */ | |
714 const unsigned char *zStr;/* String being searched */ | |
715 const char *zErr; /* Compile error message */ | |
716 int setAux = 0; /* True to invoke sqlite3_set_auxdata() */ | |
717 | |
718 pRe = sqlite3_get_auxdata(context, 0); | |
719 if( pRe==0 ){ | |
720 zPattern = (const char*)sqlite3_value_text(argv[0]); | |
721 if( zPattern==0 ) return; | |
722 zErr = re_compile(&pRe, zPattern, 0); | |
723 if( zErr ){ | |
724 re_free(pRe); | |
725 sqlite3_result_error(context, zErr, -1); | |
726 return; | |
727 } | |
728 if( pRe==0 ){ | |
729 sqlite3_result_error_nomem(context); | |
730 return; | |
731 } | |
732 setAux = 1; | |
733 } | |
734 zStr = (const unsigned char*)sqlite3_value_text(argv[1]); | |
735 if( zStr!=0 ){ | |
736 sqlite3_result_int(context, re_match(pRe, zStr, -1)); | |
737 } | |
738 if( setAux ){ | |
739 sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free); | |
740 } | |
741 } | |
742 | |
743 /* | |
744 ** Invoke this routine to register the regexp() function with the | |
745 ** SQLite database connection. | |
746 */ | |
747 #ifdef _WIN32 | |
748 __declspec(dllexport) | |
749 #endif | |
750 int sqlite3_regexp_init( | |
751 sqlite3 *db, | |
752 char **pzErrMsg, | |
753 const sqlite3_api_routines *pApi | |
754 ){ | |
755 int rc = SQLITE_OK; | |
756 SQLITE_EXTENSION_INIT2(pApi); | |
757 rc = sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0, | |
758 re_sql_func, 0, 0); | |
759 return rc; | |
760 } | |
OLD | NEW |