icu46/source/common/util.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/common/util.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/common/util.cpp

===================================================================

--- icu46/source/common/util.cpp (revision 0)

+++ icu46/source/common/util.cpp (revision 0)

@@ -0,0 +1,445 @@

+/*

+**********************************************************************

+* Date Name Description

+* 11/19/2001 aliu Creation.

+**********************************************************************

+*/

+#include "util.h"

+#include "unicode/unimatch.h"

+#include "unicode/uniset.h"

+// File-local UChar constants. They are written as hexadecimal code points
+// instead of character literals so that they remain correct on EBCDIC hosts.
+static const UChar BACKSLASH  = 0x005C; /* backslash */
+static const UChar UPPER_U    = 0x0055; /* U */
+static const UChar LOWER_U    = 0x0075; /* u */
+static const UChar APOSTROPHE = 0x0027; /* apostrophe */
+static const UChar SPACE      = 0x0020; /* space */
+
+// Digit table for radices up to 36: "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+static const UChar DIGITS[] = {
+    /* 0-9 */
+    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039,
+    /* A-J */
+    0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A,
+    /* K-T */
+    0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054,
+    /* U-Z */
+    0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A
+};

+U_NAMESPACE_BEGIN

+/**
+ * Append the textual form of n, written in the given radix, to result.
+ *
+ * @param result    the string to append to
+ * @param n         the value to format; a negative value is written as '-'
+ *                  followed by its magnitude
+ * @param radix     the base, 2..36; for any other value a single '?' is
+ *                  appended and nothing else is done
+ * @param minDigits the minimum number of digits to emit, not counting the
+ *                  sign; shorter values are left-padded with '0'
+ * @return result
+ */
+UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
+                                         int32_t radix, int32_t minDigits) {
+    if (radix < 2 || radix > 36) {
+        // Bogus radix
+        return result.append((UChar)63/*?*/);
+    }
+    // Work on the magnitude as an unsigned value. Negating n as an int32_t
+    // overflows for INT32_MIN, which would leave a negative value and index
+    // DIGITS out of bounds below; unsigned negation is well defined and
+    // yields 2^31 for that input.
+    uint32_t magnitude = (uint32_t)n;
+    if (n < 0) {
+        magnitude = 0u - magnitude;
+        result.append((UChar)45/*-*/);
+    }
+    const uint32_t base = (uint32_t)radix;
+    // Find the place value of the most significant digit. Every digit beyond
+    // the first uses up one of the requested minimum digits.
+    uint32_t place = 1;
+    for (uint32_t rest = magnitude; rest >= base; rest /= base) {
+        place *= base;
+        --minDigits;
+    }
+    // Left-pad with zeros until minDigits is satisfied
+    while (--minDigits > 0) {
+        result.append(DIGITS[0]);
+    }
+    // Emit the digits, most significant first
+    while (place > 0) {
+        const uint32_t digit = magnitude / place;
+        result.append(DIGITS[digit]);
+        magnitude -= digit * place;
+        place /= base;
+    }
+    return result;
+}

+/**
+ * Tell whether c lies outside the printable ASCII range U+0020..U+007E.
+ * @param c the code point to test
+ * @return true if c is below U+0020 or above U+007E, false otherwise
+ */
+UBool ICU_Utility::isUnprintable(UChar32 c) {
+    return c < 0x20 || c > 0x7E;
+}

+/**
+ * Append an escape sequence for c to result if c is not printable ASCII.
+ * Code points with no bits set above the low 16 are written as \uxxxx
+ * (four hex digits); all others are written as \Uxxxxxxxx (eight hex
+ * digits). Printable ASCII is left alone.
+ * @param result the string to append the escape to
+ * @param c      the code point to examine
+ * @return TRUE if an escape was appended, FALSE if c is printable ASCII
+ *         and result was not touched
+ */
+UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
+    if (!isUnprintable(c)) {
+        return FALSE;
+    }
+    result.append(BACKSLASH);
+    // Pick the escape letter and the bit offset of the first nibble to emit
+    int32_t shift;
+    if (c & ~0xFFFF) {
+        result.append(UPPER_U);
+        shift = 28;
+    } else {
+        result.append(LOWER_U);
+        shift = 12;
+    }
+    // Emit one hex digit per nibble, most significant first
+    for (; shift >= 0; shift -= 4) {
+        result.append(DIGITS[0xF&(c>>shift)]);
+    }
+    return TRUE;
+}

+/**

+ * Returns the index of a character, ignoring quoted text.

+ * For example, in the string "abc'hide'h", the 'h' in "hide" will not be

+ * found by a search for 'h'.

+ */

+// FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.

+/*

+int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,

+ int32_t start, int32_t limit,

+ UChar charToFind) {

+ for (int32_t i=start; i<limit; ++i) {

+ UChar c = text.charAt(i);

+ if (c == BACKSLASH) {

+ ++i;

+ } else if (c == APOSTROPHE) {

+ while (++i < limit

+ && text.charAt(i) != APOSTROPHE) {}

+ } else if (c == charToFind) {

+ return i;

+ }

+ return -1;

+*/

+/**
+ * Find the first character at or after pos that is not rule white space
+ * (as defined by uprv_isRuleWhiteSpace()).
+ * @param str     the string to scan
+ * @param pos     the index at which to start scanning; updated only when
+ *                advance is true
+ * @param advance if true, pos is set to the returned index; otherwise pos
+ *                is left unchanged
+ * @return the index of the first non-white-space character at or after
+ *         pos, or str.length() if there is none
+ */
+int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
+                                    UBool advance) {
+    const int32_t end = str.length();
+    int32_t cursor = pos;
+    while (cursor < end) {
+        const UChar32 cp = str.char32At(cursor);
+        if (uprv_isRuleWhiteSpace(cp)) {
+            // step over the whole code point, which may be a surrogate pair
+            cursor += UTF_CHAR_LENGTH(cp);
+        } else {
+            break;
+        }
+    }
+    if (advance) {
+        pos = cursor;
+    }
+    return cursor;
+}

+/**

+ * Skip over whitespace in a Replaceable. Whitespace is defined by

+ * uprv_isRuleWhiteSpace(). Skipping may be done in the forward or

+ * reverse direction. In either case, the leftmost index will be

+ * inclusive, and the rightmost index will be exclusive. That is,

+ * given a range defined as [start, limit), the call

+ * skipWhitespace(text, start, limit) will advance start past leading

+ * whitespace, whereas the call skipWhitespace(text, limit, start),

+ * will back up limit past trailing whitespace.

+ * @param text the text to be analyzed

+ * @param pos either the start or limit of a range of 'text', to skip

+ * leading or trailing whitespace, respectively

+ * @param stop either the limit or start of a range of 'text', to skip

+ * leading or trailing whitespace, respectively

+ * @return the new start or limit, depending on what was passed in to

+ * 'pos'

+ */

+//?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.

+//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,

+//? int32_t pos, int32_t stop) {

+//? UChar32 c;

+//? UBool isForward = (stop >= pos);

+//?

+//? if (!isForward) {

+//? --pos; // pos is a limit, so back up by one

+//? }

+//?

+//? while (pos != stop &&

+//? uprv_isRuleWhiteSpace(c = text.char32At(pos))) {

+//? if (isForward) {

+//? pos += UTF_CHAR_LENGTH(c);

+//? } else {

+//? pos -= UTF_CHAR_LENGTH(c);

+//? }

+//?

+//? if (!isForward) {

+//? ++pos; // make pos back into a limit

+//? }

+//?

+//? return pos;

+//?}

+/**
+ * Parse the single non-whitespace character ch, allowing any amount of
+ * rule white space in front of it.
+ * @param id  the string to be parsed
+ * @param pos INPUT-OUTPUT parameter. On input, the offset at which to
+ *            start parsing. On success, the index just past ch. On
+ *            failure, pos is unchanged.
+ * @param ch  the non-whitespace character to be parsed
+ * @return TRUE if ch was found after zero or more white space
+ *         characters, FALSE otherwise
+ */
+UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, UChar ch) {
+    // Scan with a local cursor so pos is only written on success
+    int32_t cursor = pos;
+    skipWhitespace(id, cursor, TRUE);
+    if (cursor != id.length() && id.charAt(cursor) == ch) {
+        pos = cursor + 1;
+        return TRUE;
+    }
+    return FALSE;
+}

+/**

+ * Parse a pattern string within the given Replaceable and a parsing

+ * pattern. Characters are matched literally and case-sensitively

+ * except for the following special characters:

+ *

+ * ~ zero or more uprv_isRuleWhiteSpace chars

+ *

+ * If end of pattern is reached with all matches along the way,

+ * pos is advanced to the first unparsed index and returned.

+ * Otherwise -1 is returned.

+ *

+ * Note that -1 is also returned whenever index reaches limit before the

+ * whole pattern has been consumed, even if the only pattern character

+ * still outstanding is a '~'. An empty pattern always succeeds and

+ * returns index unchanged.

+ * @param pat pattern that controls parsing

+ * @param text text to be parsed, starting at index

+ * @param index offset to first character to parse

+ * @param limit offset after last character to parse

+ * @return index after last parsed character, or -1 on parse failure.

+ */

+int32_t ICU_Utility::parsePattern(const UnicodeString& pat,

+ const Replaceable& text,

+ int32_t index,

+ int32_t limit) {

+ // ipat is the offset in pat of the pattern code point being matched

+ int32_t ipat = 0;

+ // empty pattern matches immediately

+ if (ipat == pat.length()) {

+ return index;

+ }

+ // cpat is the pattern code point at ipat; it is refreshed at the bottom

+ // of each loop iteration that does not 'continue' or return

+ UChar32 cpat = pat.char32At(ipat);

+ while (index < limit) {

+ UChar32 c = text.char32At(index);

+ // parse \s*

+ if (cpat == 126 /*~*/) {

+ if (uprv_isRuleWhiteSpace(c)) {

+ // consume the white space and stay on the same '~' so that any

+ // further white space is consumed as well

+ index += UTF_CHAR_LENGTH(c);

+ continue;

+ } else {

+ // c ends the white space run: step past the '~' (one code unit)

+ if (++ipat == pat.length()) {

+ return index; // success; c unparsed

+ }

+ // fall thru; process c again with next cpat

+ // (index is not advanced here, so the next iteration re-reads c)

+ }

+ // NOTE(review): the brace closing the '~' branch is not visible at this

+ // point -- presumably lost when this listing was extracted; confirm

+ // against the upstream file.

+ // parse literal

+ else if (c == cpat) {

+ // advance both text and pattern by the width of the matched code point

+ index += UTF_CHAR_LENGTH(c);

+ ipat += UTF_CHAR_LENGTH(cpat);

+ if (ipat == pat.length()) {

+ return index; // success; c parsed

+ }

+ // fall thru; get next cpat

+ }

+ // match failure of literal

+ else {

+ return -1;

+ }

+ cpat = pat.char32At(ipat);

+ }

+ return -1; // text ended before end of pat

+/**

+ * Append a character to a rule that is being built up. To flush

+ * the quoteBuf to rule, make one final call with isLiteral == TRUE.

+ * If there is no final character, pass in (UChar32)-1 as c.

+ *

+ * Characters that need quoting are collected in quoteBuf rather than

+ * written to rule straight away; quoteBuf is written out, wrapped in

+ * apostrophes, the next time a literal (or an unprintable character that

+ * is to be escaped) arrives.

+ * @param rule the string to append the character to

+ * @param c the character to append, or (UChar32)-1 if none.

+ * @param isLiteral if true, then the given character should not be

+ * quoted or escaped. Usually this means it is a syntactic element

+ * such as > or $

+ * @param escapeUnprintable if true, then unprintable characters

+ * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will

+ * appear outside of quotes.

+ * @param quoteBuf a buffer which is used to build up quoted

+ * substrings. The caller should initially supply an empty buffer,

+ * and thereafter should not modify the buffer. The buffer should be

+ * cleared out by, at the end, calling this method with a literal

+ * character.

+ */

+void ICU_Utility::appendToRule(UnicodeString& rule,

+ UChar32 c,

+ UBool isLiteral,

+ UBool escapeUnprintable,

+ UnicodeString& quoteBuf) {

+ // NOTE(review): several closing braces of this function are not visible

+ // in this listing -- presumably lost in extraction; confirm the nesting

+ // against the upstream file before relying on it.

+ // If we are escaping unprintables, then escape them outside

+ // quotes. \u and \U are not recognized within quotes. The same

+ // logic applies to literals, but literals are never escaped.

+ if (isLiteral ||

+ (escapeUnprintable && ICU_Utility::isUnprintable(c))) {

+ // Flush any pending quoted text before emitting c

+ if (quoteBuf.length() > 0) {

+ // We prefer backslash APOSTROPHE to double APOSTROPHE

+ // (more readable, less similar to ") so if there are

+ // double APOSTROPHEs at the ends, we pull them outside

+ // of the quote.

+ // If the first thing in the quoteBuf is APOSTROPHE

+ // (doubled) then pull it out.

+ while (quoteBuf.length() >= 2 &&

+ quoteBuf.charAt(0) == APOSTROPHE &&

+ quoteBuf.charAt(1) == APOSTROPHE) {

+ rule.append(BACKSLASH).append(APOSTROPHE);

+ quoteBuf.remove(0, 2);

+ }

+ // If the last thing in the quoteBuf is APOSTROPHE

+ // (doubled) then remove and count it and add it after.

+ int32_t trailingCount = 0;

+ while (quoteBuf.length() >= 2 &&

+ quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&

+ quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {

+ quoteBuf.truncate(quoteBuf.length()-2);

+ ++trailingCount;

+ }

+ // Whatever is left is written as one quoted run and the buffer

+ // is emptied

+ if (quoteBuf.length() > 0) {

+ rule.append(APOSTROPHE);

+ rule.append(quoteBuf);

+ rule.append(APOSTROPHE);

+ quoteBuf.truncate(0);

+ }

+ // Re-emit each trailing doubled apostrophe as backslash-apostrophe

+ while (trailingCount-- > 0) {

+ rule.append(BACKSLASH).append(APOSTROPHE);

+ }

+ // (UChar32)-1 means "no character": nothing further is appended

+ if (c != (UChar32)-1) {

+ /* Since spaces are ignored during parsing, they are

+ * emitted only for readability. We emit one here

+ * only if there isn't already one at the end of the

+ * rule.

+ */

+ if (c == SPACE) {

+ int32_t len = rule.length();

+ // an empty rule gets no leading space either

+ if (len > 0 && rule.charAt(len-1) != c) {

+ rule.append(c);

+ }

+ // ICU_Utility::escapeUnprintable() appends the escape itself and

+ // returns TRUE when c is unprintable; c is appended as-is only when

+ // escaping is off or c turned out to be printable

+ } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {

+ rule.append(c);

+ }

+ // Escape ' and '\' and don't begin a quote just for them

+ else if (quoteBuf.length() == 0 &&

+ (c == APOSTROPHE || c == BACKSLASH)) {

+ rule.append(BACKSLASH);

+ rule.append(c);

+ }

+ // Specials (printable ascii that isn't [0-9a-zA-Z]) and

+ // whitespace need quoting. Also append stuff to quotes if we are

+ // building up a quoted substring already.

+ else if (quoteBuf.length() > 0 ||

+ (c >= 0x0021 && c <= 0x007E &&

+ !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||

+ (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||

+ (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||

+ uprv_isRuleWhiteSpace(c)) {

+ quoteBuf.append(c);

+ // Double ' within a quote

+ if (c == APOSTROPHE) {

+ quoteBuf.append(c);

+ }

+ // Otherwise just append

+ else {

+ rule.append(c);

+ }

+/**
+ * Append every UTF-16 code unit of text to a rule that is being built up,
+ * by calling the single-character appendToRule() once per code unit with
+ * the same isLiteral, escapeUnprintable and quoteBuf arguments.
+ * @param rule the string to append to
+ * @param text the text whose code units are appended in order
+ * @param isLiteral passed through unchanged for each code unit
+ * @param escapeUnprintable passed through unchanged for each code unit
+ * @param quoteBuf the quoting buffer shared across all the calls
+ */
+void ICU_Utility::appendToRule(UnicodeString& rule,
+                               const UnicodeString& text,
+                               UBool isLiteral,
+                               UBool escapeUnprintable,
+                               UnicodeString& quoteBuf) {
+    int32_t offset = 0;
+    // the length is re-read on every pass, exactly as a plain for loop would
+    while (offset < text.length()) {
+        const UChar32 unit = text.charAt(offset);
+        appendToRule(rule, unit, isLiteral, escapeUnprintable, quoteBuf);
+        ++offset;
+    }
+}

+/**
+ * Append the pattern of the given matcher, as a literal, to a rule that
+ * is being built up. A null matcher is accepted and appends nothing.
+ * @param rule the string to append to
+ * @param matcher the matcher whose toPattern() output is appended; may
+ * be NULL
+ * @param escapeUnprintable passed to toPattern() and to the string
+ * overload of appendToRule()
+ * @param quoteBuf the quoting buffer passed on to appendToRule()
+ */
+void ICU_Utility::appendToRule(UnicodeString& rule,
+                               const UnicodeMatcher* matcher,
+                               UBool escapeUnprintable,
+                               UnicodeString& quoteBuf) {
+    if (matcher == NULL) {
+        return;
+    }
+    UnicodeString scratch;
+    const UnicodeString& pattern = matcher->toPattern(scratch, escapeUnprintable);
+    appendToRule(rule, pattern, TRUE, escapeUnprintable, quoteBuf);
+}

+U_NAMESPACE_END

+U_CAPI UBool U_EXPORT2
+uprv_isRuleWhiteSpace(UChar32 c) {
+    /* "white space" in the sense of ICU rule parsers.
+       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES:
+       U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
+       See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
+       Equivalent to test for Pattern_White_Space Unicode property.
+    */
+    // Quick rejection of everything outside the overall span of the list
+    if (c < 0x0009 || c > 0x2029) {
+        return FALSE;
+    }
+    return c <= 0x000D ||
+           c == 0x0020 ||
+           c == 0x0085 ||
+           (c >= 0x200E && c <= 0x200F) ||
+           c >= 0x2028;
+}

+/**
+ * Create a UnicodeSet holding the rule white space characters
+ * U+0009..U+000D, U+0020, U+0085, U+200E..U+200F and U+2028..U+2029.
+ * @param ec in/out error code; if it already indicates failure nothing is
+ *           created, and it is set to U_MEMORY_ALLOCATION_ERROR if the set
+ *           cannot be allocated
+ * @return the new set, allocated with new (presumably to be deleted by
+ *         the caller -- confirm against the declaration), or NULL on error
+ */
+U_CAPI U_NAMESPACE_QUALIFIER UnicodeSet* U_EXPORT2
+uprv_openRuleWhiteSpaceSet(UErrorCode* ec) {
+    if (U_FAILURE(*ec)) {
+        return NULL;
+    }
+    // Build the set from explicit ranges instead of a pattern string,
+    // for fewer code dependencies
+    U_NAMESPACE_QUALIFIER UnicodeSet *wsSet =
+        new U_NAMESPACE_QUALIFIER UnicodeSet(9, 0xd);
+    if (wsSet != NULL) {
+        wsSet->UnicodeSet::add(0x20)
+            .add(0x85)
+            .add(0x200e, 0x200f)
+            .add(0x2028, 0x2029);
+    } else {
+        // new failed
+        *ec = U_MEMORY_ALLOCATION_ERROR;
+    }
+    return wsSet;
+}

+//eof

Property changes on: icu46/source/common/util.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/common/util.h ('k') | icu46/source/common/util_props.cpp » ('j') | no next file with comments »