icu46/source/common/util.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/util.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (c) 2001-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * Date Name Description

	7 * 11/19/2001 aliu Creation.

	8 **********************************************************************

	9 */

	10

	11 #include "util.h"

	12 #include "unicode/unimatch.h"

	13 #include "unicode/uniset.h"

	14

	15 // Define UChar constants using hex for EBCDIC compatibility

	16

	17 static const UChar BACKSLASH = 0x005C; /\/

	18 static const UChar UPPER_U = 0x0055; /U/

	19 static const UChar LOWER_U = 0x0075; /u/

	20 static const UChar APOSTROPHE = 0x0027; // '\''

	21 static const UChar SPACE = 0x0020; // ' '

	22

	23 // "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"

	24 static const UChar DIGITS[] = {

	25 48,49,50,51,52,53,54,55,56,57,

	26 65,66,67,68,69,70,71,72,73,74,

	27 75,76,77,78,79,80,81,82,83,84,

	28 85,86,87,88,89,90

	29 };

	30

	31 U_NAMESPACE_BEGIN

	32

	33 UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,

	34 int32_t radix, int32_t minDigits) {

	35 if (radix < 2 \|\| radix > 36) {

	36 // Bogus radix

	37 return result.append((UChar)63/?/);

	38 }

	39 // Handle negatives

	40 if (n < 0) {

	41 n = -n;

	42 result.append((UChar)45/-/);

	43 }

	44 // First determine the number of digits

	45 int32_t nn = n;

	46 int32_t r = 1;

	47 while (nn >= radix) {

	48 nn /= radix;

	49 r *= radix;

	50 --minDigits;

	51 }

	52 // Now generate the digits

	53 while (--minDigits > 0) {

	54 result.append(DIGITS[0]);

	55 }

	56 while (r > 0) {

	57 int32_t digit = n / r;

	58 result.append(DIGITS[digit]);

	59 n -= digit * r;

	60 r /= radix;

	61 }

	62 return result;

	63 }

	64

	65 /**

	66 * Return true if the character is NOT printable ASCII.

	67 */

	68 UBool ICU_Utility::isUnprintable(UChar32 c) {

	69 return !(c >= 0x20 && c <= 0x7E);

	70 }

	71

	72 /**

	73 * Escape unprintable characters using \uxxxx notation for U+0000 to

	74 * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is

	75 * printable ASCII, then do nothing and return FALSE. Otherwise,

	76 * append the escaped notation and return TRUE.

	77 */

	78 UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {

	79 if (isUnprintable(c)) {

	80 result.append(BACKSLASH);

	81 if (c & ~0xFFFF) {

	82 result.append(UPPER_U);

	83 result.append(DIGITS[0xF&(c>>28)]);

	84 result.append(DIGITS[0xF&(c>>24)]);

	85 result.append(DIGITS[0xF&(c>>20)]);

	86 result.append(DIGITS[0xF&(c>>16)]);

	87 } else {

	88 result.append(LOWER_U);

	89 }

	90 result.append(DIGITS[0xF&(c>>12)]);

	91 result.append(DIGITS[0xF&(c>>8)]);

	92 result.append(DIGITS[0xF&(c>>4)]);

	93 result.append(DIGITS[0xF&c]);

	94 return TRUE;

	95 }

	96 return FALSE;

	97 }

	98

	99 /**

	100 * Returns the index of a character, ignoring quoted text.

	101 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be

	102 * found by a search for 'h'.

	103 */

	104 // FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.

	105 /*

	106 int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,

	107 int32_t start, int32_t limit,

	108 UChar charToFind) {

	109 for (int32_t i=start; i<limit; ++i) {

	110 UChar c = text.charAt(i);

	111 if (c == BACKSLASH) {

	112 ++i;

	113 } else if (c == APOSTROPHE) {

	114 while (++i < limit

	115 && text.charAt(i) != APOSTROPHE) {}

	116 } else if (c == charToFind) {

	117 return i;

	118 }

	119 }

	120 return -1;

	121 }

	122 */

	123

	124 /**

	125 * Skip over a sequence of zero or more white space characters at pos.

	126 * @param advance if true, advance pos to the first non-white-space

	127 * character at or after pos, or str.length(), if there is none.

	128 * Otherwise leave pos unchanged.

	129 * @return the index of the first non-white-space character at or

	130 * after pos, or str.length(), if there is none.

	131 */

	132 int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,

	133 UBool advance) {

	134 int32_t p = pos;

	135 while (p < str.length()) {

	136 UChar32 c = str.char32At(p);

	137 if (!uprv_isRuleWhiteSpace(c)) {

	138 break;

	139 }

	140 p += UTF_CHAR_LENGTH(c);

	141 }

	142 if (advance) {

	143 pos = p;

	144 }

	145 return p;

	146 }

	147

	148 /**

	149 * Skip over whitespace in a Replaceable. Whitespace is defined by

	150 * uprv_isRuleWhiteSpace(). Skipping may be done in the forward or

	151 * reverse direction. In either case, the leftmost index will be

	152 * inclusive, and the rightmost index will be exclusive. That is,

	153 * given a range defined as [start, limit), the call

	154 * skipWhitespace(text, start, limit) will advance start past leading

	155 * whitespace, whereas the call skipWhitespace(text, limit, start),

	156 * will back up limit past trailing whitespace.

	157 * @param text the text to be analyzed

	158 * @param pos either the start or limit of a range of 'text', to skip

	159 * leading or trailing whitespace, respectively

	160 * @param stop either the limit or start of a range of 'text', to skip

	161 * leading or trailing whitespace, respectively

	162 * @return the new start or limit, depending on what was passed in to

	163 * 'pos'

	164 */

	165 //?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.

	166 //?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,

	167 //? int32_t pos, int32_t stop) {

	168 //? UChar32 c;

	169 //? UBool isForward = (stop >= pos);

	170 //?

	171 //? if (!isForward) {

	172 //? --pos; // pos is a limit, so back up by one

	173 //? }

	174 //?

	175 //? while (pos != stop &&

	176 //? uprv_isRuleWhiteSpace(c = text.char32At(pos))) {

	177 //? if (isForward) {

	178 //? pos += UTF_CHAR_LENGTH(c);

	179 //? } else {

	180 //? pos -= UTF_CHAR_LENGTH(c);

	181 //? }

	182 //? }

	183 //?

	184 //? if (!isForward) {

	185 //? ++pos; // make pos back into a limit

	186 //? }

	187 //?

	188 //? return pos;

	189 //?}

	190

	191 /**

	192 * Parse a single non-whitespace character 'ch', optionally

	193 * preceded by whitespace.

	194 * @param id the string to be parsed

	195 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the

	196 * offset of the first character to be parsed. On output, pos[0]

	197 * is the index after the last parsed character. If the parse

	198 * fails, pos[0] will be unchanged.

	199 * @param ch the non-whitespace character to be parsed.

	200 * @return true if 'ch' is seen preceded by zero or more

	201 * whitespace characters.

	202 */

	203 UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {

	204 int32_t start = pos;

	205 skipWhitespace(id, pos, TRUE);

	206 if (pos == id.length() \|\|

	207 id.charAt(pos) != ch) {

	208 pos = start;

	209 return FALSE;

	210 }

	211 ++pos;

	212 return TRUE;

	213 }

	214

	215 /**

	216 * Parse a pattern string within the given Replaceable and a parsing

	217 * pattern. Characters are matched literally and case-sensitively

	218 * except for the following special characters:

	219 *

	220 * ~ zero or more uprv_isRuleWhiteSpace chars

	221 *

	222 * If end of pattern is reached with all matches along the way,

	223 * pos is advanced to the first unparsed index and returned.

	224 * Otherwise -1 is returned.

	225 * @param pat pattern that controls parsing

	226 * @param text text to be parsed, starting at index

	227 * @param index offset to first character to parse

	228 * @param limit offset after last character to parse

	229 * @return index after last parsed character, or -1 on parse failure.

	230 */

	231 int32_t ICU_Utility::parsePattern(const UnicodeString& pat,

	232 const Replaceable& text,

	233 int32_t index,

	234 int32_t limit) {

	235 int32_t ipat = 0;

	236

	237 // empty pattern matches immediately

	238 if (ipat == pat.length()) {

	239 return index;

	240 }

	241

	242 UChar32 cpat = pat.char32At(ipat);

	243

	244 while (index < limit) {

	245 UChar32 c = text.char32At(index);

	246

	247 // parse \s*

	248 if (cpat == 126 /~/) {

	249 if (uprv_isRuleWhiteSpace(c)) {

	250 index += UTF_CHAR_LENGTH(c);

	251 continue;

	252 } else {

	253 if (++ipat == pat.length()) {

	254 return index; // success; c unparsed

	255 }

	256 // fall thru; process c again with next cpat

	257 }

	258 }

	259

	260 // parse literal

	261 else if (c == cpat) {

	262 index += UTF_CHAR_LENGTH(c);

	263 ipat += UTF_CHAR_LENGTH(cpat);

	264 if (ipat == pat.length()) {

	265 return index; // success; c parsed

	266 }

	267 // fall thru; get next cpat

	268 }

	269

	270 // match failure of literal

	271 else {

	272 return -1;

	273 }

	274

	275 cpat = pat.char32At(ipat);

	276 }

	277

	278 return -1; // text ended before end of pat

	279 }

	280

	281 /**

	282 * Append a character to a rule that is being built up. To flush

	283 * the quoteBuf to rule, make one final call with isLiteral == TRUE.

	284 * If there is no final character, pass in (UChar32)-1 as c.

	285 * @param rule the string to append the character to

	286 * @param c the character to append, or (UChar32)-1 if none.

	287 * @param isLiteral if true, then the given character should not be

	288 * quoted or escaped. Usually this means it is a syntactic element

	289 * such as > or $

	290 * @param escapeUnprintable if true, then unprintable characters

	291 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will

	292 * appear outside of quotes.

	293 * @param quoteBuf a buffer which is used to build up quoted

	294 * substrings. The caller should initially supply an empty buffer,

	295 * and thereafter should not modify the buffer. The buffer should be

	296 * cleared out by, at the end, calling this method with a literal

	297 * character.

	298 */

	299 void ICU_Utility::appendToRule(UnicodeString& rule,

	300 UChar32 c,

	301 UBool isLiteral,

	302 UBool escapeUnprintable,

	303 UnicodeString& quoteBuf) {

	304 // If we are escaping unprintables, then escape them outside

	305 // quotes. \u and \U are not recognized within quotes. The same

	306 // logic applies to literals, but literals are never escaped.

	307 if (isLiteral \|\|

	308 (escapeUnprintable && ICU_Utility::isUnprintable(c))) {

	309 if (quoteBuf.length() > 0) {

	310 // We prefer backslash APOSTROPHE to double APOSTROPHE

	311 // (more readable, less similar to ") so if there are

	312 // double APOSTROPHEs at the ends, we pull them outside

	313 // of the quote.

	314

	315 // If the first thing in the quoteBuf is APOSTROPHE

	316 // (doubled) then pull it out.

	317 while (quoteBuf.length() >= 2 &&

	318 quoteBuf.charAt(0) == APOSTROPHE &&

	319 quoteBuf.charAt(1) == APOSTROPHE) {

	320 rule.append(BACKSLASH).append(APOSTROPHE);

	321 quoteBuf.remove(0, 2);

	322 }

	323 // If the last thing in the quoteBuf is APOSTROPHE

	324 // (doubled) then remove and count it and add it after.

	325 int32_t trailingCount = 0;

	326 while (quoteBuf.length() >= 2 &&

	327 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&

	328 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {

	329 quoteBuf.truncate(quoteBuf.length()-2);

	330 ++trailingCount;

	331 }

	332 if (quoteBuf.length() > 0) {

	333 rule.append(APOSTROPHE);

	334 rule.append(quoteBuf);

	335 rule.append(APOSTROPHE);

	336 quoteBuf.truncate(0);

	337 }

	338 while (trailingCount-- > 0) {

	339 rule.append(BACKSLASH).append(APOSTROPHE);

	340 }

	341 }

	342 if (c != (UChar32)-1) {

	343 /* Since spaces are ignored during parsing, they are

	344 * emitted only for readability. We emit one here

	345 * only if there isn't already one at the end of the

	346 * rule.

	347 */

	348 if (c == SPACE) {

	349 int32_t len = rule.length();

	350 if (len > 0 && rule.charAt(len-1) != c) {

	351 rule.append(c);

	352 }

	353 } else if (!escapeUnprintable \|\| !ICU_Utility::escapeUnprintable(rul e, c)) {

	354 rule.append(c);

	355 }

	356 }

	357 }

	358

	359 // Escape ' and '\' and don't begin a quote just for them

	360 else if (quoteBuf.length() == 0 &&

	361 (c == APOSTROPHE \|\| c == BACKSLASH)) {

	362 rule.append(BACKSLASH);

	363 rule.append(c);

	364 }

	365

	366 // Specials (printable ascii that isn't [0-9a-zA-Z]) and

	367 // whitespace need quoting. Also append stuff to quotes if we are

	368 // building up a quoted substring already.

	369 else if (quoteBuf.length() > 0 \|\|

	370 (c >= 0x0021 && c <= 0x007E &&

	371 !((c >= 0x0030/'0'/ && c <= 0x0039/'9'/) \|\|

	372 (c >= 0x0041/'A'/ && c <= 0x005A/'Z'/) \|\|

	373 (c >= 0x0061/'a'/ && c <= 0x007A/'z'/))) \|\|

	374 uprv_isRuleWhiteSpace(c)) {

	375 quoteBuf.append(c);

	376 // Double ' within a quote

	377 if (c == APOSTROPHE) {

	378 quoteBuf.append(c);

	379 }

	380 }

	381

	382 // Otherwise just append

	383 else {

	384 rule.append(c);

	385 }

	386 }

	387

	388 void ICU_Utility::appendToRule(UnicodeString& rule,

	389 const UnicodeString& text,

	390 UBool isLiteral,

	391 UBool escapeUnprintable,

	392 UnicodeString& quoteBuf) {

	393 for (int32_t i=0; i<text.length(); ++i) {

	394 appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);

	395 }

	396 }

	397

	398 /**

	399 * Given a matcher reference, which may be null, append its

	400 * pattern as a literal to the given rule.

	401 */

	402 void ICU_Utility::appendToRule(UnicodeString& rule,

	403 const UnicodeMatcher* matcher,

	404 UBool escapeUnprintable,

	405 UnicodeString& quoteBuf) {

	406 if (matcher != NULL) {

	407 UnicodeString pat;

	408 appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),

	409 TRUE, escapeUnprintable, quoteBuf);

	410 }

	411 }

	412

	413 U_NAMESPACE_END

	414

	415 U_CAPI UBool U_EXPORT2

	416 uprv_isRuleWhiteSpace(UChar32 c) {

	417 /* "white space" in the sense of ICU rule parsers

	418 This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.

	419 See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports /tr31/

	420 U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029

	421 Equivalent to test for Pattern_White_Space Unicode property.

	422 */

	423 return (c >= 0x0009 && c <= 0x2029 &&

	424 (c <= 0x000D \|\| c == 0x0020 \|\| c == 0x0085 \|\|

	425 c == 0x200E \|\| c == 0x200F \|\| c >= 0x2028));

	426 }

	427

	428 U_CAPI U_NAMESPACE_QUALIFIER UnicodeSet* U_EXPORT2

	429 uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {

	430 if(U_FAILURE(*ec)) {

	431 return NULL;

	432 }

	433 // create a set with the Pattern_White_Space characters,

	434 // without a pattern for fewer code dependencies

	435 U_NAMESPACE_QUALIFIER UnicodeSet *set=new U_NAMESPACE_QUALIFIER UnicodeSet(9 , 0xd);

	436 // Check for new failure.

	437 if (set == NULL) {

	438 *ec = U_MEMORY_ALLOCATION_ERROR;

	439 return NULL;

	440 }

	441 set->UnicodeSet::add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029) ;

	442 return set;

	443 }

	444

	445 //eof

OLD	NEW

« no previous file with comments | « icu46/source/common/util.h ('k') | icu46/source/common/util_props.cpp » ('j') | no next file with comments »