third_party/sqlite/src/ext/misc/regexp.c - Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch

Side by Side Diff: third_party/sqlite/src/ext/misc/regexp.c

Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 ** 2012-11-13

	3 **

	4 ** The author disclaims copyright to this source code. In place of

	5 ** a legal notice, here is a blessing:

	6 **

	7 ** May you do good and not evil.

	8 ** May you find forgiveness for yourself and forgive others.

	9 ** May you share freely, never taking more than you give.

	10 **

	11 ******************************************************************************

	12 **

	13 ** The code in this file implements a compact but reasonably

	14 ** efficient regular-expression matcher for posix extended regular

	15 ** expressions against UTF8 text.

	16 **

	17 ** This file is an SQLite extension. It registers a single function

	18 ** named "regexp(A,B)" where A is the regular expression and B is the

	19 ** string to be matched. By registering this function, SQLite will also

	20 ** then implement the "B regexp A" operator. Note that with the function

	21 ** the regular expression comes first, but with the operator it comes

	22 ** second.

	23 **

	24 ** The following regular expression syntax is supported:

	25 **

	26 ** X* zero or more occurrences of X

	27 ** X+ one or more occurrences of X

	28 ** X? zero or one occurrences of X

	29 ** X{p,q} between p and q occurrences of X

	30 ** (X) match X

	31 ** X\|Y X or Y

	32 ** ^X X occurring at the beginning of the string

	33 ** X$ X occurring at the end of the string

	34 ** . Match any single character

	35 ** \c Character c where c is one of \{}()[]\|*+?.

	36 ** \c C-language escapes for c in afnrtv. ex: \t or \n

	37 ** \uXXXX Where XXXX is exactly 4 hex digits, unicode value XXXX

	38 ** \xXX Where XX is exactly 2 hex digits, unicode value XX

	39 ** [abc] Any single character from the set abc

	40 ** [^abc] Any single character not in the set abc

	41 ** [a-z] Any single character in the range a-z

	42 ** [^a-z] Any single character not in the range a-z

	43 ** \b Word boundary

	44 ** \w Word character. [A-Za-z0-9_]

	45 ** \W Non-word character

	46 ** \d Digit

	47 ** \D Non-digit

	48 ** \s Whitespace character

	49 ** \S Non-whitespace character

	50 **

	51 ** A nondeterministic finite automaton (NFA) is used for matching, so the

	52 ** performance is bounded by O(N*M) where N is the size of the regular

	53 ** expression and M is the size of the input string. The matcher never

	54 ** exhibits exponential behavior. Note that the X{p,q} operator expands

	55 ** to p copies of X following by q-p copies of X? and that the size of the

	56 ** regular expression in the O(N*M) performance bound is computed after

	57 ** this expansion.

	58 */

	59 #include <string.h>

	60 #include <stdlib.h>

	61 #include "sqlite3ext.h"

	62 SQLITE_EXTENSION_INIT1

	63

	64 /*

	65 ** The following #defines change the names of some functions implemented in

	66 ** this file to prevent name collisions with C-library functions of the

	67 ** same name.

	68 */

	69 #define re_match sqlite3re_match

	70 #define re_compile sqlite3re_compile

	71 #define re_free sqlite3re_free

	72

	73 /* The end-of-input character */

	74 #define RE_EOF 0 /* End of input */

	75

	76 /* The NFA is implemented as sequence of opcodes taken from the following

	77 ** set. Each opcode has a single integer argument.

	78 */

	79 #define RE_OP_MATCH 1 /* Match the one character in the argument */

	80 #define RE_OP_ANY 2 /* Match any one character. (Implements ".") */

	81 #define RE_OP_ANYSTAR 3 /* Special optimized version of .* */

	82 #define RE_OP_FORK 4 /* Continue to both next and opcode at iArg */

	83 #define RE_OP_GOTO 5 /* Jump to opcode at iArg */

	84 #define RE_OP_ACCEPT 6 /* Halt and indicate a successful match */

	85 #define RE_OP_CC_INC 7 /* Beginning of a [...] character class */

	86 #define RE_OP_CC_EXC 8 /* Beginning of a [^...] character class */

	87 #define RE_OP_CC_VALUE 9 /* Single value in a character class */

	88 #define RE_OP_CC_RANGE 10 /* Range of values in a character class */

	89 #define RE_OP_WORD 11 /* Perl word character [A-Za-z0-9_] */

	90 #define RE_OP_NOTWORD 12 /* Not a perl word character */

	91 #define RE_OP_DIGIT 13 /* digit: [0-9] */

	92 #define RE_OP_NOTDIGIT 14 /* Not a digit */

	93 #define RE_OP_SPACE 15 /* space: [ \t\n\r\v\f] */

	94 #define RE_OP_NOTSPACE 16 /* Not a digit */

	95 #define RE_OP_BOUNDARY 17 /* Boundary between word and non-word */

	96

	97 /* Each opcode is a "state" in the NFA */

	98 typedef unsigned short ReStateNumber;

	99

	100 /* Because this is an NFA and not a DFA, multiple states can be active at

	101 ** once. An instance of the following object records all active states in

	102 ** the NFA. The implementation is optimized for the common case where the

	103 ** number of actives states is small.

	104 */

	105 typedef struct ReStateSet {

	106 unsigned nState; /* Number of current states */

	107 ReStateNumber aState; / Current states */

	108 } ReStateSet;

	109

	110 /* An input string read one character at a time.

	111 */

	112 typedef struct ReInput ReInput;

	113 struct ReInput {

	114 const unsigned char z; / All text */

	115 int i; /* Next byte to read */

	116 int mx; /* EOF when i>=mx */

	117 };

	118

	119 /* A compiled NFA (or an NFA that is in the process of being compiled) is

	120 ** an instance of the following object.

	121 */

	122 typedef struct ReCompiled ReCompiled;

	123 struct ReCompiled {

	124 ReInput sIn; /* Regular expression text */

	125 const char zErr; / Error message to return */

	126 char aOp; / Operators for the virtual machine */

	127 int aArg; / Arguments to each operator */

	128 unsigned (xNextChar)(ReInput); /* Next character function */

	129 unsigned char zInit[12]; /* Initial text to match */

	130 int nInit; /* Number of characters in zInit */

	131 unsigned nState; /* Number of entries in aOp[] and aArg[] */

	132 unsigned nAlloc; /* Slots allocated for aOp[] and aArg[] */

	133 };

	134

	135 /* Add a state to the given state set if it is not already there */

	136 static void re_add_state(ReStateSet *pSet, int newState){

	137 unsigned i;

	138 for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;

	139 pSet->aState[pSet->nState++] = newState;

	140 }

	141

	142 /* Extract the next unicode character from *pzIn and return it. Advance

	143 ** *pzIn to the first byte past the end of the character returned. To

	144 ** be clear: this routine converts utf8 to unicode. This routine is

	145 ** optimized for the common case where the next character is a single byte.

	146 */

	147 static unsigned re_next_char(ReInput *p){

	148 unsigned c;

	149 if( p->i>=p->mx ) return 0;

	150 c = p->z[p->i++];

	151 if( c>=0x80 ){

	152 if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){

	153 c = (c&0x1f)<<6 \| (p->z[p->i++]&0x3f);

	154 if( c<0x80 ) c = 0xfffd;

	155 }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80

	156 && (p->z[p->i+1]&0xc0)==0x80 ){

	157 c = (c&0x0f)<<12 \| ((p->z[p->i]&0x3f)<<6) \| (p->z[p->i+1]&0x3f);

	158 p->i += 2;

	159 if( c<=0x3ff \|\| (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;

	160 }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80

	161 && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){

	162 c = (c&0x07)<<18 \| ((p->z[p->i]&0x3f)<<12) \| ((p->z[p->i+1]&0x3f)<<6)

	163 \| (p->z[p->i+2]&0x3f);

	164 p->i += 3;

	165 if( c<=0xffff \|\| c>0x10ffff ) c = 0xfffd;

	166 }else{

	167 c = 0xfffd;

	168 }

	169 }

	170 return c;

	171 }

	172 static unsigned re_next_char_nocase(ReInput *p){

	173 unsigned c = re_next_char(p);

	174 if( c>='A' && c<='Z' ) c += 'a' - 'A';

	175 return c;

	176 }

	177

	178 /* Return true if c is a perl "word" character: [A-Za-z0-9_] */

	179 static int re_word_char(int c){

	180 return (c>='0' && c<='9') \|\| (c>='a' && c<='z')

	181 \|\| (c>='A' && c<='Z') \|\| c=='_';

	182 }

	183

	184 /* Return true if c is a "digit" character: [0-9] */

	185 static int re_digit_char(int c){

	186 return (c>='0' && c<='9');

	187 }

	188

	189 /* Return true if c is a perl "space" character: [ \t\r\n\v\f] */

	190 static int re_space_char(int c){

	191 return c==' ' \|\| c=='\t' \|\| c=='\n' \|\| c=='\r' \|\| c=='\v' \|\| c=='\f';

	192 }

	193

	194 /* Run a compiled regular expression on the zero-terminated input

	195 ** string zIn[]. Return true on a match and false if there is no match.

	196 */

	197 static int re_match(ReCompiled pRe, const unsigned char zIn, int nIn){

	198 ReStateSet aStateSet[2], pThis, pNext;

	199 ReStateNumber aSpace[100];

	200 ReStateNumber *pToFree;

	201 unsigned int i = 0;

	202 unsigned int iSwap = 0;

	203 int c = RE_EOF+1;

	204 int cPrev = 0;

	205 int rc = 0;

	206 ReInput in;

	207

	208 in.z = zIn;

	209 in.i = 0;

	210 in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);

	211

	212 /* Look for the initial prefix match, if there is one. */

	213 if( pRe->nInit ){

	214 unsigned char x = pRe->zInit[0];

	215 while( in.i+pRe->nInit<=in.mx

	216 && (zIn[in.i]!=x \|\|

	217 strncmp((const char)zIn+in.i, (const char)pRe->zInit, pRe->nInit)!=0)

	218 ){

	219 in.i++;

	220 }

	221 if( in.i+pRe->nInit>in.mx ) return 0;

	222 }

	223

	224 if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){

	225 pToFree = 0;

	226 aStateSet[0].aState = aSpace;

	227 }else{

	228 pToFree = sqlite3_malloc( sizeof(ReStateNumber)2pRe->nState );

	229 if( pToFree==0 ) return -1;

	230 aStateSet[0].aState = pToFree;

	231 }

	232 aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];

	233 pNext = &aStateSet[1];

	234 pNext->nState = 0;

	235 re_add_state(pNext, 0);

	236 while( c!=RE_EOF && pNext->nState>0 ){

	237 cPrev = c;

	238 c = pRe->xNextChar(&in);

	239 pThis = pNext;

	240 pNext = &aStateSet[iSwap];

	241 iSwap = 1 - iSwap;

	242 pNext->nState = 0;

	243 for(i=0; i<pThis->nState; i++){

	244 int x = pThis->aState[i];

	245 switch( pRe->aOp[x] ){

	246 case RE_OP_MATCH: {

	247 if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);

	248 break;

	249 }

	250 case RE_OP_ANY: {

	251 re_add_state(pNext, x+1);

	252 break;

	253 }

	254 case RE_OP_WORD: {

	255 if( re_word_char(c) ) re_add_state(pNext, x+1);

	256 break;

	257 }

	258 case RE_OP_NOTWORD: {

	259 if( !re_word_char(c) ) re_add_state(pNext, x+1);

	260 break;

	261 }

	262 case RE_OP_DIGIT: {

	263 if( re_digit_char(c) ) re_add_state(pNext, x+1);

	264 break;

	265 }

	266 case RE_OP_NOTDIGIT: {

	267 if( !re_digit_char(c) ) re_add_state(pNext, x+1);

	268 break;

	269 }

	270 case RE_OP_SPACE: {

	271 if( re_space_char(c) ) re_add_state(pNext, x+1);

	272 break;

	273 }

	274 case RE_OP_NOTSPACE: {

	275 if( !re_space_char(c) ) re_add_state(pNext, x+1);

	276 break;

	277 }

	278 case RE_OP_BOUNDARY: {

	279 if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);

	280 break;

	281 }

	282 case RE_OP_ANYSTAR: {

	283 re_add_state(pNext, x);

	284 re_add_state(pThis, x+1);

	285 break;

	286 }

	287 case RE_OP_FORK: {

	288 re_add_state(pThis, x+pRe->aArg[x]);

	289 re_add_state(pThis, x+1);

	290 break;

	291 }

	292 case RE_OP_GOTO: {

	293 re_add_state(pThis, x+pRe->aArg[x]);

	294 break;

	295 }

	296 case RE_OP_ACCEPT: {

	297 rc = 1;

	298 goto re_match_end;

	299 }

	300 case RE_OP_CC_INC:

	301 case RE_OP_CC_EXC: {

	302 int j = 1;

	303 int n = pRe->aArg[x];

	304 int hit = 0;

	305 for(j=1; j>0 && j<n; j++){

	306 if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){

	307 if( pRe->aArg[x+j]==c ){

	308 hit = 1;

	309 j = -1;

	310 }

	311 }else{

	312 if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){

	313 hit = 1;

	314 j = -1;

	315 }else{

	316 j++;

	317 }

	318 }

	319 }

	320 if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;

	321 if( hit ) re_add_state(pNext, x+n);

	322 break;

	323 }

	324 }

	325 }

	326 }

	327 for(i=0; i<pNext->nState; i++){

	328 if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }

	329 }

	330 re_match_end:

	331 sqlite3_free(pToFree);

	332 return rc;

	333 }

	334

	335 /* Resize the opcode and argument arrays for an RE under construction.

	336 */

	337 static int re_resize(ReCompiled *p, int N){

	338 char *aOp;

	339 int *aArg;

	340 aOp = sqlite3_realloc(p->aOp, N*sizeof(p->aOp[0]));

	341 if( aOp==0 ) return 1;

	342 p->aOp = aOp;

	343 aArg = sqlite3_realloc(p->aArg, N*sizeof(p->aArg[0]));

	344 if( aArg==0 ) return 1;

	345 p->aArg = aArg;

	346 p->nAlloc = N;

	347 return 0;

	348 }

	349

	350 /* Insert a new opcode and argument into an RE under construction. The

	351 ** insertion point is just prior to existing opcode iBefore.

	352 */

	353 static int re_insert(ReCompiled *p, int iBefore, int op, int arg){

	354 int i;

	355 if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;

	356 for(i=p->nState; i>iBefore; i--){

	357 p->aOp[i] = p->aOp[i-1];

	358 p->aArg[i] = p->aArg[i-1];

	359 }

	360 p->nState++;

	361 p->aOp[iBefore] = op;

	362 p->aArg[iBefore] = arg;

	363 return iBefore;

	364 }

	365

	366 /* Append a new opcode and argument to the end of the RE under construction.

	367 */

	368 static int re_append(ReCompiled *p, int op, int arg){

	369 return re_insert(p, p->nState, op, arg);

	370 }

	371

	372 /* Make a copy of N opcodes starting at iStart onto the end of the RE

	373 ** under construction.

	374 */

	375 static void re_copy(ReCompiled *p, int iStart, int N){

	376 if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;

	377 memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));

	378 memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));

	379 p->nState += N;

	380 }

	381

	382 /* Return true if c is a hexadecimal digit character: [0-9a-fA-F]

	383 ** If c is a hex digit, also set pV = (pV)*16 + valueof(c). If

	384 ** c is not a hex digit *pV is unchanged.

	385 */

	386 static int re_hex(int c, int *pV){

	387 if( c>='0' && c<='9' ){

	388 c -= '0';

	389 }else if( c>='a' && c<='f' ){

	390 c -= 'a' - 10;

	391 }else if( c>='A' && c<='F' ){

	392 c -= 'A' - 10;

	393 }else{

	394 return 0;

	395 }

	396 pV = (pV)*16 + (c & 0xff);

	397 return 1;

	398 }

	399

	400 /* A backslash character has been seen, read the next character and

	401 ** return its interpretation.

	402 */

	403 static unsigned re_esc_char(ReCompiled *p){

	404 static const char zEsc[] = "afnrtv\\()*.+?[$^{\|}]";

	405 static const char zTrans[] = "\a\f\n\r\t\v";

	406 int i, v = 0;

	407 char c;

	408 if( p->sIn.i>=p->sIn.mx ) return 0;

	409 c = p->sIn.z[p->sIn.i];

	410 if( c=='u' && p->sIn.i+4<p->sIn.mx ){

	411 const unsigned char *zIn = p->sIn.z + p->sIn.i;

	412 if( re_hex(zIn[1],&v)

	413 && re_hex(zIn[2],&v)

	414 && re_hex(zIn[3],&v)

	415 && re_hex(zIn[4],&v)

	416 ){

	417 p->sIn.i += 5;

	418 return v;

	419 }

	420 }

	421 if( c=='x' && p->sIn.i+2<p->sIn.mx ){

	422 const unsigned char *zIn = p->sIn.z + p->sIn.i;

	423 if( re_hex(zIn[1],&v)

	424 && re_hex(zIn[2],&v)

	425 ){

	426 p->sIn.i += 3;

	427 return v;

	428 }

	429 }

	430 for(i=0; zEsc[i] && zEsc[i]!=c; i++){}

	431 if( zEsc[i] ){

	432 if( i<6 ) c = zTrans[i];

	433 p->sIn.i++;

	434 }else{

	435 p->zErr = "unknown \\ escape";

	436 }

	437 return c;

	438 }

	439

	440 /* Forward declaration */

	441 static const char re_subcompile_string(ReCompiled);

	442

	443 /* Peek at the next byte of input */

	444 static unsigned char rePeek(ReCompiled *p){

	445 return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;

	446 }

	447

	448 /* Compile RE text into a sequence of opcodes. Continue up to the

	449 ** first unmatched ")" character, then return. If an error is found,

	450 ** return a pointer to the error message string.

	451 */

	452 static const char re_subcompile_re(ReCompiled p){

	453 const char *zErr;

	454 int iStart, iEnd, iGoto;

	455 iStart = p->nState;

	456 zErr = re_subcompile_string(p);

	457 if( zErr ) return zErr;

	458 while( rePeek(p)=='\|' ){

	459 iEnd = p->nState;

	460 re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);

	461 iGoto = re_append(p, RE_OP_GOTO, 0);

	462 p->sIn.i++;

	463 zErr = re_subcompile_string(p);

	464 if( zErr ) return zErr;

	465 p->aArg[iGoto] = p->nState - iGoto;

	466 }

	467 return 0;

	468 }

	469

	470 /* Compile an element of regular expression text (anything that can be

	471 ** an operand to the "\|" operator). Return NULL on success or a pointer

	472 ** to the error message if there is a problem.

	473 */

	474 static const char re_subcompile_string(ReCompiled p){

	475 int iPrev = -1;

	476 int iStart;

	477 unsigned c;

	478 const char *zErr;

	479 while( (c = p->xNextChar(&p->sIn))!=0 ){

	480 iStart = p->nState;

	481 switch( c ){

	482 case '\|':

	483 case '$':

	484 case ')': {

	485 p->sIn.i--;

	486 return 0;

	487 }

	488 case '(': {

	489 zErr = re_subcompile_re(p);

	490 if( zErr ) return zErr;

	491 if( rePeek(p)!=')' ) return "unmatched '('";

	492 p->sIn.i++;

	493 break;

	494 }

	495 case '.': {

	496 if( rePeek(p)=='*' ){

	497 re_append(p, RE_OP_ANYSTAR, 0);

	498 p->sIn.i++;

	499 }else{

	500 re_append(p, RE_OP_ANY, 0);

	501 }

	502 break;

	503 }

	504 case '*': {

	505 if( iPrev<0 ) return "'*' without operand";

	506 re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);

	507 re_append(p, RE_OP_FORK, iPrev - p->nState + 1);

	508 break;

	509 }

	510 case '+': {

	511 if( iPrev<0 ) return "'+' without operand";

	512 re_append(p, RE_OP_FORK, iPrev - p->nState);

	513 break;

	514 }

	515 case '?': {

	516 if( iPrev<0 ) return "'?' without operand";

	517 re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);

	518 break;

	519 }

	520 case '{': {

	521 int m = 0, n = 0;

	522 int sz, j;

	523 if( iPrev<0 ) return "'{m,n}' without operand";

	524 while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }

	525 n = m;

	526 if( c==',' ){

	527 p->sIn.i++;

	528 n = 0;

	529 while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }

	530 }

	531 if( c!='}' ) return "unmatched '{'";

	532 if( n>0 && n<m ) return "n less than m in '{m,n}'";

	533 p->sIn.i++;

	534 sz = p->nState - iPrev;

	535 if( m==0 ){

	536 if( n==0 ) return "both m and n are zero in '{m,n}'";

	537 re_insert(p, iPrev, RE_OP_FORK, sz+1);

	538 n--;

	539 }else{

	540 for(j=1; j<m; j++) re_copy(p, iPrev, sz);

	541 }

	542 for(j=m; j<n; j++){

	543 re_append(p, RE_OP_FORK, sz+1);

	544 re_copy(p, iPrev, sz);

	545 }

	546 if( n==0 && m>0 ){

	547 re_append(p, RE_OP_FORK, -sz);

	548 }

	549 break;

	550 }

	551 case '[': {

	552 int iFirst = p->nState;

	553 if( rePeek(p)=='^' ){

	554 re_append(p, RE_OP_CC_EXC, 0);

	555 p->sIn.i++;

	556 }else{

	557 re_append(p, RE_OP_CC_INC, 0);

	558 }

	559 while( (c = p->xNextChar(&p->sIn))!=0 ){

	560 if( c=='[' && rePeek(p)==':' ){

	561 return "POSIX character classes not supported";

	562 }

	563 if( c=='\\' ) c = re_esc_char(p);

	564 if( rePeek(p)=='-' ){

	565 re_append(p, RE_OP_CC_RANGE, c);

	566 p->sIn.i++;

	567 c = p->xNextChar(&p->sIn);

	568 if( c=='\\' ) c = re_esc_char(p);

	569 re_append(p, RE_OP_CC_RANGE, c);

	570 }else{

	571 re_append(p, RE_OP_CC_VALUE, c);

	572 }

	573 if( rePeek(p)==']' ){ p->sIn.i++; break; }

	574 }

	575 if( c==0 ) return "unclosed '['";

	576 p->aArg[iFirst] = p->nState - iFirst;

	577 break;

	578 }

	579 case '\\': {

	580 int specialOp = 0;

	581 switch( rePeek(p) ){

	582 case 'b': specialOp = RE_OP_BOUNDARY; break;

	583 case 'd': specialOp = RE_OP_DIGIT; break;

	584 case 'D': specialOp = RE_OP_NOTDIGIT; break;

	585 case 's': specialOp = RE_OP_SPACE; break;

	586 case 'S': specialOp = RE_OP_NOTSPACE; break;

	587 case 'w': specialOp = RE_OP_WORD; break;

	588 case 'W': specialOp = RE_OP_NOTWORD; break;

	589 }

	590 if( specialOp ){

	591 p->sIn.i++;

	592 re_append(p, specialOp, 0);

	593 }else{

	594 c = re_esc_char(p);

	595 re_append(p, RE_OP_MATCH, c);

	596 }

	597 break;

	598 }

	599 default: {

	600 re_append(p, RE_OP_MATCH, c);

	601 break;

	602 }

	603 }

	604 iPrev = iStart;

	605 }

	606 return 0;

	607 }

	608

	609 /* Free and reclaim all the memory used by a previously compiled

	610 ** regular expression. Applications should invoke this routine once

	611 ** for every call to re_compile() to avoid memory leaks.

	612 */

	613 void re_free(ReCompiled *pRe){

	614 if( pRe ){

	615 sqlite3_free(pRe->aOp);

	616 sqlite3_free(pRe->aArg);

	617 sqlite3_free(pRe);

	618 }

	619 }

	620

	621 /*

	622 ** Compile a textual regular expression in zIn[] into a compiled regular

	623 ** expression suitable for us by re_match() and return a pointer to the

	624 ** compiled regular expression in *ppRe. Return NULL on success or an

	625 ** error message if something goes wrong.

	626 */

	627 const char re_compile(ReCompiled ppRe, const char zIn, int noCase){

	628 ReCompiled *pRe;

	629 const char *zErr;

	630 int i, j;

	631

	632 *ppRe = 0;

	633 pRe = sqlite3_malloc( sizeof(*pRe) );

	634 if( pRe==0 ){

	635 return "out of memory";

	636 }

	637 memset(pRe, 0, sizeof(*pRe));

	638 pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;

	639 if( re_resize(pRe, 30) ){

	640 re_free(pRe);

	641 return "out of memory";

	642 }

	643 if( zIn[0]=='^' ){

	644 zIn++;

	645 }else{

	646 re_append(pRe, RE_OP_ANYSTAR, 0);

	647 }

	648 pRe->sIn.z = (unsigned char*)zIn;

	649 pRe->sIn.i = 0;

	650 pRe->sIn.mx = (int)strlen(zIn);

	651 zErr = re_subcompile_re(pRe);

	652 if( zErr ){

	653 re_free(pRe);

	654 return zErr;

	655 }

	656 if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){

	657 re_append(pRe, RE_OP_MATCH, RE_EOF);

	658 re_append(pRe, RE_OP_ACCEPT, 0);

	659 *ppRe = pRe;

	660 }else if( pRe->sIn.i>=pRe->sIn.mx ){

	661 re_append(pRe, RE_OP_ACCEPT, 0);

	662 *ppRe = pRe;

	663 }else{

	664 re_free(pRe);

	665 return "unrecognized character";

	666 }

	667

	668 /* The following is a performance optimization. If the regex begins with

	669 ** ".*" (if the input regex lacks an initial "^") and afterwards there are

	670 ** one or more matching characters, enter those matching characters into

	671 ** zInit[]. The re_match() routine can then search ahead in the input

	672 ** string looking for the initial match without having to run the whole

	673 ** regex engine over the string. Do not worry able trying to match

	674 ** unicode characters beyond plane 0 - those are very rare and this is

	675 ** just an optimization. */

	676 if( pRe->aOp[0]==RE_OP_ANYSTAR ){

	677 for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){

	678 unsigned x = pRe->aArg[i];

	679 if( x<=127 ){

	680 pRe->zInit[j++] = x;

	681 }else if( x<=0xfff ){

	682 pRe->zInit[j++] = 0xc0 \| (x>>6);

	683 pRe->zInit[j++] = 0x80 \| (x&0x3f);

	684 }else if( x<=0xffff ){

	685 pRe->zInit[j++] = 0xd0 \| (x>>12);

	686 pRe->zInit[j++] = 0x80 \| ((x>>6)&0x3f);

	687 pRe->zInit[j++] = 0x80 \| (x&0x3f);

	688 }else{

	689 break;

	690 }

	691 }

	692 if( j>0 && pRe->zInit[j-1]==0 ) j--;

	693 pRe->nInit = j;

	694 }

	695 return pRe->zErr;

	696 }

	697

	698 /*

	699 ** Implementation of the regexp() SQL function. This function implements

	700 ** the build-in REGEXP operator. The first argument to the function is the

	701 ** pattern and the second argument is the string. So, the SQL statements:

	702 **

	703 ** A REGEXP B

	704 **

	705 ** is implemented as regexp(B,A).

	706 */

	707 static void re_sql_func(

	708 sqlite3_context *context,

	709 int argc,

	710 sqlite3_value **argv

	711 ){

	712 ReCompiled pRe; / Compiled regular expression */

	713 const char zPattern; / The regular expression */

	714 const unsigned char zStr;/ String being searched */

	715 const char zErr; / Compile error message */

	716 int setAux = 0; /* True to invoke sqlite3_set_auxdata() */

	717

	718 pRe = sqlite3_get_auxdata(context, 0);

	719 if( pRe==0 ){

	720 zPattern = (const char*)sqlite3_value_text(argv[0]);

	721 if( zPattern==0 ) return;

	722 zErr = re_compile(&pRe, zPattern, 0);

	723 if( zErr ){

	724 re_free(pRe);

	725 sqlite3_result_error(context, zErr, -1);

	726 return;

	727 }

	728 if( pRe==0 ){

	729 sqlite3_result_error_nomem(context);

	730 return;

	731 }

	732 setAux = 1;

	733 }

	734 zStr = (const unsigned char*)sqlite3_value_text(argv[1]);

	735 if( zStr!=0 ){

	736 sqlite3_result_int(context, re_match(pRe, zStr, -1));

	737 }

	738 if( setAux ){

	739 sqlite3_set_auxdata(context, 0, pRe, (void()(void))re_free);

	740 }

	741 }

	742

	743 /*

	744 ** Invoke this routine to register the regexp() function with the

	745 ** SQLite database connection.

	746 */

	747 #ifdef _WIN32

	748 __declspec(dllexport)

	749 #endif

	750 int sqlite3_regexp_init(

	751 sqlite3 *db,

	752 char **pzErrMsg,

	753 const sqlite3_api_routines *pApi

	754 ){

	755 int rc = SQLITE_OK;

	756 SQLITE_EXTENSION_INIT2(pApi);

	757 rc = sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,

	758 re_sql_func, 0, 0);

	759 return rc;

	760 }

OLD	NEW

« no previous file with comments | « third_party/sqlite/src/ext/misc/percentile.c ('k') | third_party/sqlite/src/ext/misc/rot13.c » ('j') | no next file with comments »