Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(520)

Side by Side Diff: runtime/third_party/jscre/pcre_internal.h

Issue 1071713003: - Remove JSCRE from the runtime. (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/
Patch Set: Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
6
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This header contains definitions that are shared between the different
41 modules, but which are not relevant to the exported API. This includes some
42 functions whose names all begin with "_pcre_". */
43
44 #ifndef THIRD_PARTY_JSCRE_PCRE_INTERNAL_H_
45 #define THIRD_PARTY_JSCRE_PCRE_INTERNAL_H_
46
47 #if defined(_WIN32)
48 typedef unsigned __int32 uint32_t;
49 #else
50 #include <inttypes.h>
51 #include <stdint.h>
52 #endif
53
54 #include <stdarg.h>
55 #include <stddef.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <sys/types.h>
59
60 /* Bit definitions for entries in the pcre_ctypes table. */
61
62 #define ctype_space 0x01
63 #define ctype_xdigit 0x08
64 #define ctype_word 0x10 /* alphameric or '_' */
65
66 /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
67 of bits for a class map. Some classes are built by combining these tables. */
68
69 #define cbit_space 0 /* \s */
70 #define cbit_digit 32 /* \d */
71 #define cbit_word 64 /* \w */
72 #define cbit_length 96 /* Length of the cbits table */
73
74 /* Offsets of the various tables from the base tables pointer, and
75 total length. */
76
77 #define lcc_offset 0
78 #define fcc_offset 128
79 #define cbits_offset 256
80 #define ctypes_offset (cbits_offset + cbit_length)
81 #define tables_length (ctypes_offset + 128)
82
83 #ifndef DFTABLES
84
// TODO(xxx): Hook this up to something that checks assertions.
/* ASSERT(x) aborts the process when x evaluates to false. The body is wrapped
   in do { ... } while (0) so that the macro expands to exactly one statement:
   a bare "if" would silently capture the "else" of an enclosing if/else that
   uses ASSERT as its unbraced body. */
#define ASSERT(x) do { if (!(x)) abort(); } while (0)
/* Marks a code path that must never execute; aborts unconditionally. */
#define ASSERT_NOT_REACHED() abort()
88
89 #ifdef WIN32
90 #pragma warning(disable: 4232)
91 #pragma warning(disable: 4244)
92 #endif
93
94 #include "./pcre.h"
95
96 /* The value of LINK_SIZE determines the number of bytes used to store links as
97 offsets within the compiled regex. The default is 2, which allows for compiled
98 patterns up to 64K long. */
99
100 #define LINK_SIZE 2
101
102 /* Define DEBUG to get debugging output on stdout. */
103
104 #if 0
105 #define DEBUG
106 #endif
107
108 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
109 inline, and there are *still* stupid compilers about that don't like indented
110 pre-processor statements, or at least there were when I first wrote this. After
111 all, it had only been about 10 years then... */
112
113 #ifdef DEBUG
114 #define DPRINTF(p) printf p
115 #else
116 #define DPRINTF(p) /*nothing*/
117 #endif
118
119 namespace dart { namespace jscre {
120
121 /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
122 in big-endian order) by default. These are used, for example, to link from the
123 start of a subpattern to its alternatives and its end. The use of 2 bytes per
124 offset limits the size of the compiled regex to around 64K, which is big enough
125 for almost everybody. However, I received a request for an even bigger limit.
126 For this reason, and also to make the code easier to maintain, the storing and
127 loading of offsets from the byte string is now handled by the functions that are
128 defined here. */
129
130 /* PCRE uses some other 2-byte quantities that do not change when the size of
131 offsets changes. These are used for repeat counts and for other things such as
132 capturing parenthesis numbers in back references. */
133
134 static inline void put2ByteValue(unsigned char* opcodePtr, int value) {
135 ASSERT(value >= 0 && value <= 0xFFFF);
136 opcodePtr[0] = value >> 8;
137 opcodePtr[1] = value;
138 }
139
/* Reads the 2-byte big-endian quantity stored at opcodePtr[0..1] and returns
   it as an int in the range [0, 0xFFFF]. */
static inline int get2ByteValue(const unsigned char* opcodePtr) {
  const int highByte = opcodePtr[0];
  const int lowByte = opcodePtr[1];
  return (highByte << 8) | lowByte;
}
143
144 static inline void put2ByteValueAndAdvance(unsigned char*& opcodePtr,
145 int value) {
146 put2ByteValue(opcodePtr, value);
147 opcodePtr += 2;
148 }
149
150 static inline void putLinkValueAllowZero(unsigned char* opcodePtr,
151 int value) {
152 put2ByteValue(opcodePtr, value);
153 }
154
155 static inline int getLinkValueAllowZero(const unsigned char* opcodePtr) {
156 return get2ByteValue(opcodePtr);
157 }
158
159 #define MAX_PATTERN_SIZE (1 << 16)
160
161 static inline void putLinkValue(unsigned char* opcodePtr, int value) {
162 ASSERT(value);
163 putLinkValueAllowZero(opcodePtr, value);
164 }
165
166 static inline int getLinkValue(const unsigned char* opcodePtr) {
167 int value = getLinkValueAllowZero(opcodePtr);
168 ASSERT(value);
169 return value;
170 }
171
172 static inline void putLinkValueAndAdvance(unsigned char*& opcodePtr,
173 int value) {
174 putLinkValue(opcodePtr, value);
175 opcodePtr += LINK_SIZE;
176 }
177
178 static inline void putLinkValueAllowZeroAndAdvance(unsigned char*& opcodePtr,
179 int value) {
180 putLinkValueAllowZero(opcodePtr, value);
181 opcodePtr += LINK_SIZE;
182 }
183
// FIXME: These are really more of a "compiled regexp state"
// than "regexp options"
/* Bit flags; each enumerator occupies a distinct bit so they can be OR-ed
   together. */
enum RegExpOptions {
  /* first_byte is set */
  UseFirstByteOptimizationOption = 1 << 30,
  /* req_byte is set */
  UseRequiredByteOptimizationOption = 1 << 29,
  /* start after \n for multiline */
  UseMultiLineFirstByteOptimizationOption = 1 << 28,
  /* can't use partial with this regex */
  IsAnchoredOption = 1 << 25,
  IgnoreCaseOption = 1 << 0,
  MatchAcrossMultipleLinesOption = 1 << 1
};
195
196 /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
197 variable-length repeat, or anything other than literal characters. */
198
199 #define REQ_IGNORE_CASE 0x0100 /* indicates should ignore case */
200 #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
201
202 /* Miscellaneous definitions */
203
204 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
205 contain UTF-8 characters with values greater than 255. */
206
207 #define XCL_NOT 0x01 /* Flag: this is a negative class */
208 #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
209
210 #define XCL_END 0 /* Marks end of individual items */
211 #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
212 #define XCL_RANGE 2 /* A range (two multibyte chars) follows */
213
214 /* These are escaped items that aren't just an encoding of a particular data
215 value such as \n. They must have non-zero values, as check_escape() returns
216 their negation. Also, they must appear in the same order as in the opcode
217 definitions below, up to ESC_w. The final one must be
218 ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
219 tests in the code for an escape > ESC_b and <= ESC_w to
220 detect the types that may be repeated. These are the types that consume
221 characters. If any new escapes are put in between that don't consume a
222 character, that code will have to change. */
223
/* Escape identifiers. Values start at 1 and follow the same order as the
   opcodes OP_NOT_WORD_BOUNDARY .. OP_WORDCHAR; ESC_REF must stay last. */
enum {
  ESC_B = 1,  /* pairs with OP_NOT_WORD_BOUNDARY */
  ESC_b,      /* pairs with OP_WORD_BOUNDARY */
  ESC_D,      /* pairs with OP_NOT_DIGIT */
  ESC_d,      /* pairs with OP_DIGIT */
  ESC_S,      /* pairs with OP_NOT_WHITESPACE */
  ESC_s,      /* pairs with OP_WHITESPACE */
  ESC_W,      /* pairs with OP_NOT_WORDCHAR */
  ESC_w,      /* pairs with OP_WORDCHAR */
  ESC_REF     /* back reference; later values stand for numbered references */
};
225
226 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
227 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
228 OP_EOD must correspond in order to the list of escapes immediately above.
229 Note that whenever this list is updated, the two macro definitions that follow
230 must also be updated to match. */
231
/* X-macro listing every opcode exactly once, in numeric order. Invoking
   FOR_EACH_OPCODE(m) expands to m(END) m(NOT_WORD_BOUNDARY) ... m(BRA). */
#define FOR_EACH_OPCODE(macro) \
  macro(END) \
  \
  /* Same order as the ESC_* escape identifiers. */ \
  macro(NOT_WORD_BOUNDARY) \
  macro(WORD_BOUNDARY) \
  macro(NOT_DIGIT) \
  macro(DIGIT) \
  macro(NOT_WHITESPACE) \
  macro(WHITESPACE) \
  macro(NOT_WORDCHAR) \
  macro(WORDCHAR) \
  \
  macro(NOT_NEWLINE) \
  \
  macro(CIRC) \
  macro(DOLL) \
  macro(BOL) \
  macro(EOL) \
  macro(CHAR) \
  macro(CHAR_IGNORING_CASE) \
  macro(ASCII_CHAR) \
  macro(ASCII_LETTER_IGNORING_CASE) \
  macro(NOT) \
  \
  /* Repeat opcodes. */ \
  macro(STAR) \
  macro(MINSTAR) \
  macro(PLUS) \
  macro(MINPLUS) \
  macro(QUERY) \
  macro(MINQUERY) \
  macro(UPTO) \
  macro(MINUPTO) \
  macro(EXACT) \
  \
  /* NOT-prefixed repeat opcodes. */ \
  macro(NOTSTAR) \
  macro(NOTMINSTAR) \
  macro(NOTPLUS) \
  macro(NOTMINPLUS) \
  macro(NOTQUERY) \
  macro(NOTMINQUERY) \
  macro(NOTUPTO) \
  macro(NOTMINUPTO) \
  macro(NOTEXACT) \
  \
  /* TYPE-prefixed repeat opcodes. */ \
  macro(TYPESTAR) \
  macro(TYPEMINSTAR) \
  macro(TYPEPLUS) \
  macro(TYPEMINPLUS) \
  macro(TYPEQUERY) \
  macro(TYPEMINQUERY) \
  macro(TYPEUPTO) \
  macro(TYPEMINUPTO) \
  macro(TYPEEXACT) \
  \
  /* CR-prefixed repeat opcodes. */ \
  macro(CRSTAR) \
  macro(CRMINSTAR) \
  macro(CRPLUS) \
  macro(CRMINPLUS) \
  macro(CRQUERY) \
  macro(CRMINQUERY) \
  macro(CRRANGE) \
  macro(CRMINRANGE) \
  \
  macro(CLASS) \
  macro(NCLASS) \
  macro(XCLASS) \
  \
  macro(REF) \
  \
  macro(ALT) \
  macro(KET) \
  macro(KETRMAX) \
  macro(KETRMIN) \
  \
  macro(ASSERT) \
  macro(ASSERT_NOT) \
  \
  macro(BRAZERO) \
  macro(BRAMINZERO) \
  macro(BRANUMBER) \
  /* BRA must remain last: every value >= OP_BRA denotes a bracket. */ \
  macro(BRA)

/* Turns each opcode name into an OP_-prefixed enumerator; OP_END is 0 and the
   rest count upwards in list order. */
#define OPCODE_ENUM_VALUE(opcode) OP_##opcode,
enum { FOR_EACH_OPCODE(OPCODE_ENUM_VALUE) };
316
317 /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
318 study.c that all opcodes are less than 128 in value. This makes handling UTF-8
319 character sequences easier. */
320
321 /* The highest extraction number before we have to start using additional
322 bytes. (Originally PCRE didn't have support for extraction counts higher than
323 this number.) The value is limited by the number of opcodes left after OP_BRA,
324 i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
325 opcodes. */
326
327 /* FIXME: Note that OP_BRA + 100 is > 128, so the two comments above
328 are in conflict! */
329
330 #define EXTRACT_BASIC_MAX 100
331
332 /* The index of names and the
333 code vector run on as long as necessary after the end. We store an explicit
334 offset to the name table so that if a regex is compiled on one host, saved, and
335 then run on another where the size of pointers is different, all might still
336 be well. For the case of compiled-on-4 and run-on-8, we include an extra
337 pointer that is always NULL.
338 */
339
/* Fixed-size record describing a compiled regular expression. */
struct JSRegExp {
  /* Bit flags; NOTE(review): presumably a combination of RegExpOptions
     values -- confirm against the compiler. */
  uint32_t options;

  /* NOTE(review): presumably the highest bracket number and the highest
     back-reference number used by the pattern -- confirm. */
  uint16_t top_bracket;
  uint16_t top_backref;

  /* Meaningful when the matching Use*ByteOptimizationOption flag says the
     field is set; REQ_IGNORE_CASE / REQ_VARY may be added to these values. */
  uint16_t first_byte;
  uint16_t req_byte;
};
349
350 /* Internal shared data tables. These are tables that are used by more than one
351 of the exported public functions. They have to be "external" in the C sense,
352 but are not part of the PCRE public API. The data for these tables is in the
353 pcre_tables.c module. */
354
355 #define kjs_pcre_utf8_table1_size 6
356
357 extern const int kjs_pcre_utf8_table1[6];
358 extern const int kjs_pcre_utf8_table2[6];
359 extern const int kjs_pcre_utf8_table3[6];
360 extern const unsigned char kjs_pcre_utf8_table4[0x40];
361
362 extern const unsigned char kjs_pcre_default_tables[tables_length];
363
364 static inline unsigned char toLowerCase(unsigned char c) {
365 static const unsigned char* lowerCaseChars =
366 kjs_pcre_default_tables + lcc_offset;
367 return lowerCaseChars[c];
368 }
369
370 static inline unsigned char flipCase(unsigned char c) {
371 static const unsigned char* flippedCaseChars =
372 kjs_pcre_default_tables + fcc_offset;
373 return flippedCaseChars[c];
374 }
375
376 static inline unsigned char classBitmapForChar(unsigned char c) {
377 static const unsigned char* charClassBitmaps =
378 kjs_pcre_default_tables + cbits_offset;
379 return charClassBitmaps[c];
380 }
381
382 static inline unsigned char charTypeForChar(unsigned char c) {
383 const unsigned char* charTypeMap = kjs_pcre_default_tables + ctypes_offset;
384 return charTypeMap[c];
385 }
386
387 static inline bool isWordChar(UChar c) {
388 return c < 128 && (charTypeForChar(c) & ctype_word);
389 }
390
391 static inline bool isSpaceChar(UChar c) {
392 return (c < 128 && (charTypeForChar(c) & ctype_space));
393 }
394
395 static inline bool isNewline(UChar nl) {
396 return (nl == 0xA || nl == 0xD || nl == 0x2028 || nl == 0x2029);
397 }
398
399 static inline bool isBracketStartOpcode(unsigned char opcode) {
400 if (opcode >= OP_BRA)
401 return true;
402 switch (opcode) {
403 case OP_ASSERT:
404 case OP_ASSERT_NOT:
405 return true;
406 default:
407 return false;
408 }
409 }
410
411 static inline void advanceToEndOfBracket(const unsigned char*& opcodePtr) {
412 ASSERT(isBracketStartOpcode(*opcodePtr) || *opcodePtr == OP_ALT);
413 do
414 opcodePtr += getLinkValue(opcodePtr + 1);
415 while (*opcodePtr == OP_ALT);
416 }
417
418 /* Internal shared functions. These are functions that are used in more
419 than one of the source files. They have to have external linkage,
420 but are not part of the public API and so not exported from the library. */
421
422 extern int kjs_pcre_ucp_othercase(unsigned);
423 extern bool kjs_pcre_xclass(int, const unsigned char*);
424
425 } } // namespace dart::jscre
426 #endif
427
428 #endif // THIRD_PARTY_JSCRE_PCRE_INTERNAL_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698