OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * Copyright (C) 1997-2008, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ****************************************************************************** |
| 6 * file name: nfrule.cpp |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * Modification history |
| 12 * Date Name Comments |
| 13 * 10/11/2001 Doug Ported from ICU4J |
| 14 */ |
| 15 |
| 16 #include "nfrule.h" |
| 17 |
| 18 #if U_HAVE_RBNF |
| 19 |
| 20 #include "unicode/rbnf.h" |
| 21 #include "unicode/tblcoll.h" |
| 22 #include "unicode/coleitr.h" |
| 23 #include "unicode/uchar.h" |
| 24 #include "nfrs.h" |
| 25 #include "nfrlist.h" |
| 26 #include "nfsubs.h" |
| 27 |
| 28 #include "util.h" |
| 29 |
| 30 U_NAMESPACE_BEGIN |
| 31 |
| 32 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) |
| 33 : baseValue((int32_t)0) |
| 34 , radix(0) |
| 35 , exponent(0) |
| 36 , ruleText() |
| 37 , sub1(NULL) |
| 38 , sub2(NULL) |
| 39 , formatter(_rbnf) |
| 40 { |
| 41 } |
| 42 |
| 43 NFRule::~NFRule() |
| 44 { |
| 45 delete sub1; |
| 46 delete sub2; |
| 47 } |
| 48 |
| 49 static const UChar gLeftBracket = 0x005b; |
| 50 static const UChar gRightBracket = 0x005d; |
| 51 static const UChar gColon = 0x003a; |
| 52 static const UChar gZero = 0x0030; |
| 53 static const UChar gNine = 0x0039; |
| 54 static const UChar gSpace = 0x0020; |
| 55 static const UChar gSlash = 0x002f; |
| 56 static const UChar gGreaterThan = 0x003e; |
| 57 static const UChar gLessThan = 0x003c; |
| 58 static const UChar gComma = 0x002c; |
| 59 static const UChar gDot = 0x002e; |
| 60 static const UChar gTick = 0x0027; |
| 61 //static const UChar gMinus = 0x002d; |
| 62 static const UChar gSemicolon = 0x003b; |
| 63 |
| 64 static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ |
| 65 static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x"
*/ |
| 66 static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0"
*/ |
| 67 static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x"
*/ |
| 68 |
| 69 static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ |
| 70 static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ |
| 71 static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ |
| 72 static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ |
| 73 static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ |
| 74 static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ |
| 75 static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ |
| 76 static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ |
| 77 static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ |
| 78 static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ |
| 79 static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ |
| 80 static const UChar gEmptyString[] = {0}; /* "" */ |
| 81 static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>"
*/ |
| 82 |
| 83 static const UChar * const tokenStrings[] = { |
| 84 gLessLess, gLessPercent, gLessHash, gLessZero, |
| 85 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, |
| 86 gEqualPercent, gEqualHash, gEqualZero, NULL |
| 87 }; |
| 88 |
| 89 void |
| 90 NFRule::makeRules(UnicodeString& description, |
| 91 const NFRuleSet *ruleSet, |
| 92 const NFRule *predecessor, |
| 93 const RuleBasedNumberFormat *rbnf, |
| 94 NFRuleList& rules, |
| 95 UErrorCode& status) |
| 96 { |
| 97 // we know we're making at least one rule, so go ahead and |
| 98 // new it up and initialize its basevalue and divisor |
| 99 // (this also strips the rule descriptor, if any, off the |
| 100 // descripton string) |
| 101 NFRule* rule1 = new NFRule(rbnf); |
| 102 /* test for NULL */ |
| 103 if (rule1 == 0) { |
| 104 status = U_MEMORY_ALLOCATION_ERROR; |
| 105 return; |
| 106 } |
| 107 rule1->parseRuleDescriptor(description, status); |
| 108 |
| 109 // check the description to see whether there's text enclosed |
| 110 // in brackets |
| 111 int32_t brack1 = description.indexOf(gLeftBracket); |
| 112 int32_t brack2 = description.indexOf(gRightBracket); |
| 113 |
| 114 // if the description doesn't contain a matched pair of brackets, |
| 115 // or if it's of a type that doesn't recognize bracketed text, |
| 116 // then leave the description alone, initialize the rule's |
| 117 // rule text and substitutions, and return that rule |
| 118 if (brack1 == -1 || brack2 == -1 || brack1 > brack2 |
| 119 || rule1->getType() == kProperFractionRule |
| 120 || rule1->getType() == kNegativeNumberRule) { |
| 121 rule1->ruleText = description; |
| 122 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
| 123 rules.add(rule1); |
| 124 } else { |
| 125 // if the description does contain a matched pair of brackets, |
| 126 // then it's really shorthand for two rules (with one exception) |
| 127 NFRule* rule2 = NULL; |
| 128 UnicodeString sbuf; |
| 129 |
| 130 // we'll actually only split the rule into two rules if its |
| 131 // base value is an even multiple of its divisor (or it's one |
| 132 // of the special rules) |
| 133 if ((rule1->baseValue > 0 |
| 134 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) ==
0) |
| 135 || rule1->getType() == kImproperFractionRule |
| 136 || rule1->getType() == kMasterRule) { |
| 137 |
| 138 // if it passes that test, new up the second rule. If the |
| 139 // rule set both rules will belong to is a fraction rule |
| 140 // set, they both have the same base value; otherwise, |
| 141 // increment the original rule's base value ("rule1" actually |
| 142 // goes SECOND in the rule set's rule list) |
| 143 rule2 = new NFRule(rbnf); |
| 144 /* test for NULL */ |
| 145 if (rule2 == 0) { |
| 146 status = U_MEMORY_ALLOCATION_ERROR; |
| 147 return; |
| 148 } |
| 149 if (rule1->baseValue >= 0) { |
| 150 rule2->baseValue = rule1->baseValue; |
| 151 if (!ruleSet->isFractionRuleSet()) { |
| 152 ++rule1->baseValue; |
| 153 } |
| 154 } |
| 155 |
| 156 // if the description began with "x.x" and contains bracketed |
| 157 // text, it describes both the improper fraction rule and |
| 158 // the proper fraction rule |
| 159 else if (rule1->getType() == kImproperFractionRule) { |
| 160 rule2->setType(kProperFractionRule); |
| 161 } |
| 162 |
| 163 // if the description began with "x.0" and contains bracketed |
| 164 // text, it describes both the master rule and the |
| 165 // improper fraction rule |
| 166 else if (rule1->getType() == kMasterRule) { |
| 167 rule2->baseValue = rule1->baseValue; |
| 168 rule1->setType(kImproperFractionRule); |
| 169 } |
| 170 |
| 171 // both rules have the same radix and exponent (i.e., the |
| 172 // same divisor) |
| 173 rule2->radix = rule1->radix; |
| 174 rule2->exponent = rule1->exponent; |
| 175 |
| 176 // rule2's rule text omits the stuff in brackets: initalize |
| 177 // its rule text and substitutions accordingly |
| 178 sbuf.append(description, 0, brack1); |
| 179 if (brack2 + 1 < description.length()) { |
| 180 sbuf.append(description, brack2 + 1, description.length() - brac
k2 - 1); |
| 181 } |
| 182 rule2->ruleText.setTo(sbuf); |
| 183 rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
| 184 } |
| 185 |
| 186 // rule1's text includes the text in the brackets but omits |
| 187 // the brackets themselves: initialize _its_ rule text and |
| 188 // substitutions accordingly |
| 189 sbuf.setTo(description, 0, brack1); |
| 190 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); |
| 191 if (brack2 + 1 < description.length()) { |
| 192 sbuf.append(description, brack2 + 1, description.length() - brack2 -
1); |
| 193 } |
| 194 rule1->ruleText.setTo(sbuf); |
| 195 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
| 196 |
| 197 // if we only have one rule, return it; if we have two, return |
| 198 // a two-element array containing them (notice that rule2 goes |
| 199 // BEFORE rule1 in the list: in all cases, rule2 OMITS the |
| 200 // material in the brackets and rule1 INCLUDES the material |
| 201 // in the brackets) |
| 202 if (rule2 != NULL) { |
| 203 rules.add(rule2); |
| 204 } |
| 205 rules.add(rule1); |
| 206 } |
| 207 } |
| 208 |
| 209 /** |
| 210 * This function parses the rule's rule descriptor (i.e., the base |
| 211 * value and/or other tokens that precede the rule's rule text |
| 212 * in the description) and sets the rule's base value, radix, and |
| 213 * exponent according to the descriptor. (If the description doesn't |
| 214 * include a rule descriptor, then this function sets everything to |
| 215 * default values and the rule set sets the rule's real base value). |
| 216 * @param description The rule's description |
| 217 * @return If "description" included a rule descriptor, this is |
| 218 * "description" with the descriptor and any trailing whitespace |
| 219 * stripped off. Otherwise; it's "descriptor" unchangd. |
| 220 */ |
| 221 void |
| 222 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) |
| 223 { |
| 224 // the description consists of a rule descriptor and a rule body, |
| 225 // separated by a colon. The rule descriptor is optional. If |
| 226 // it's omitted, just set the base value to 0. |
| 227 int32_t p = description.indexOf(gColon); |
| 228 if (p == -1) { |
| 229 setBaseValue((int32_t)0, status); |
| 230 } else { |
| 231 // copy the descriptor out into its own string and strip it, |
| 232 // along with any trailing whitespace, out of the original |
| 233 // description |
| 234 UnicodeString descriptor; |
| 235 descriptor.setTo(description, 0, p); |
| 236 |
| 237 ++p; |
| 238 while (p < description.length() && uprv_isRuleWhiteSpace(description.cha
rAt(p))) { |
| 239 ++p; |
| 240 } |
| 241 description.removeBetween(0, p); |
| 242 |
| 243 // check first to see if the rule descriptor matches the token |
| 244 // for one of the special rules. If it does, set the base |
| 245 // value to the correct identfier value |
| 246 if (descriptor == gMinusX) { |
| 247 setType(kNegativeNumberRule); |
| 248 } |
| 249 else if (descriptor == gXDotX) { |
| 250 setType(kImproperFractionRule); |
| 251 } |
| 252 else if (descriptor == gZeroDotX) { |
| 253 setType(kProperFractionRule); |
| 254 } |
| 255 else if (descriptor == gXDotZero) { |
| 256 setType(kMasterRule); |
| 257 } |
| 258 |
| 259 // if the rule descriptor begins with a digit, it's a descriptor |
| 260 // for a normal rule |
| 261 // since we don't have Long.parseLong, and this isn't much work anyway, |
| 262 // just build up the value as we encounter the digits. |
| 263 else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine)
{ |
| 264 int64_t val = 0; |
| 265 p = 0; |
| 266 UChar c = gSpace; |
| 267 |
| 268 // begin parsing the descriptor: copy digits |
| 269 // into "tempValue", skip periods, commas, and spaces, |
| 270 // stop on a slash or > sign (or at the end of the string), |
| 271 // and throw an exception on any other character |
| 272 int64_t ll_10 = 10; |
| 273 while (p < descriptor.length()) { |
| 274 c = descriptor.charAt(p); |
| 275 if (c >= gZero && c <= gNine) { |
| 276 val = val * ll_10 + (int32_t)(c - gZero); |
| 277 } |
| 278 else if (c == gSlash || c == gGreaterThan) { |
| 279 break; |
| 280 } |
| 281 else if (uprv_isRuleWhiteSpace(c) || c == gComma || c == gDot) { |
| 282 } |
| 283 else { |
| 284 // throw new IllegalArgumentException("Illegal character in
rule descriptor"); |
| 285 status = U_PARSE_ERROR; |
| 286 return; |
| 287 } |
| 288 ++p; |
| 289 } |
| 290 |
| 291 // we have the base value, so set it |
| 292 setBaseValue(val, status); |
| 293 |
| 294 // if we stopped the previous loop on a slash, we're |
| 295 // now parsing the rule's radix. Again, accumulate digits |
| 296 // in tempValue, skip punctuation, stop on a > mark, and |
| 297 // throw an exception on anything else |
| 298 if (c == gSlash) { |
| 299 val = 0; |
| 300 ++p; |
| 301 int64_t ll_10 = 10; |
| 302 while (p < descriptor.length()) { |
| 303 c = descriptor.charAt(p); |
| 304 if (c >= gZero && c <= gNine) { |
| 305 val = val * ll_10 + (int32_t)(c - gZero); |
| 306 } |
| 307 else if (c == gGreaterThan) { |
| 308 break; |
| 309 } |
| 310 else if (uprv_isRuleWhiteSpace(c) || c == gComma || c == gDo
t) { |
| 311 } |
| 312 else { |
| 313 // throw new IllegalArgumentException("Illegal character
is rule descriptor"); |
| 314 status = U_PARSE_ERROR; |
| 315 return; |
| 316 } |
| 317 ++p; |
| 318 } |
| 319 |
| 320 // tempValue now contain's the rule's radix. Set it |
| 321 // accordingly, and recalculate the rule's exponent |
| 322 radix = (int32_t)val; |
| 323 if (radix == 0) { |
| 324 // throw new IllegalArgumentException("Rule can't have radix
of 0"); |
| 325 status = U_PARSE_ERROR; |
| 326 } |
| 327 |
| 328 exponent = expectedExponent(); |
| 329 } |
| 330 |
| 331 // if we stopped the previous loop on a > sign, then continue |
| 332 // for as long as we still see > signs. For each one, |
| 333 // decrement the exponent (unless the exponent is already 0). |
| 334 // If we see another character before reaching the end of |
| 335 // the descriptor, that's also a syntax error. |
| 336 if (c == gGreaterThan) { |
| 337 while (p < descriptor.length()) { |
| 338 c = descriptor.charAt(p); |
| 339 if (c == gGreaterThan && exponent > 0) { |
| 340 --exponent; |
| 341 } else { |
| 342 // throw new IllegalArgumentException("Illegal character
in rule descriptor"); |
| 343 status = U_PARSE_ERROR; |
| 344 return; |
| 345 } |
| 346 ++p; |
| 347 } |
| 348 } |
| 349 } |
| 350 } |
| 351 |
| 352 // finally, if the rule body begins with an apostrophe, strip it off |
| 353 // (this is generally used to put whitespace at the beginning of |
| 354 // a rule's rule text) |
| 355 if (description.length() > 0 && description.charAt(0) == gTick) { |
| 356 description.removeBetween(0, 1); |
| 357 } |
| 358 |
| 359 // return the description with all the stuff we've just waded through |
| 360 // stripped off the front. It now contains just the rule body. |
| 361 // return description; |
| 362 } |
| 363 |
| 364 /** |
| 365 * Searches the rule's rule text for the substitution tokens, |
| 366 * creates the substitutions, and removes the substitution tokens |
| 367 * from the rule's rule text. |
| 368 * @param owner The rule set containing this rule |
| 369 * @param predecessor The rule preseding this one in "owners" rule list |
| 370 * @param ownersOwner The RuleBasedFormat that owns this rule |
| 371 */ |
| 372 void |
| 373 NFRule::extractSubstitutions(const NFRuleSet* ruleSet, |
| 374 const NFRule* predecessor, |
| 375 const RuleBasedNumberFormat* rbnf, |
| 376 UErrorCode& status) |
| 377 { |
| 378 if (U_SUCCESS(status)) { |
| 379 sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
| 380 sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
| 381 } |
| 382 } |
| 383 |
| 384 /** |
| 385 * Searches the rule's rule text for the first substitution token, |
| 386 * creates a substitution based on it, and removes the token from |
| 387 * the rule's rule text. |
| 388 * @param owner The rule set containing this rule |
| 389 * @param predecessor The rule preceding this one in the rule set's |
| 390 * rule list |
| 391 * @param ownersOwner The RuleBasedNumberFormat that owns this rule |
| 392 * @return The newly-created substitution. This is never null; if |
| 393 * the rule text doesn't contain any substitution tokens, this will |
| 394 * be a NullSubstitution. |
| 395 */ |
| 396 NFSubstitution * |
| 397 NFRule::extractSubstitution(const NFRuleSet* ruleSet, |
| 398 const NFRule* predecessor, |
| 399 const RuleBasedNumberFormat* rbnf, |
| 400 UErrorCode& status) |
| 401 { |
| 402 NFSubstitution* result = NULL; |
| 403 |
| 404 // search the rule's rule text for the first two characters of |
| 405 // a substitution token |
| 406 int32_t subStart = indexOfAny(tokenStrings); |
| 407 int32_t subEnd = subStart; |
| 408 |
| 409 // if we didn't find one, create a null substitution positioned |
| 410 // at the end of the rule text |
| 411 if (subStart == -1) { |
| 412 return NFSubstitution::makeSubstitution(ruleText.length(), this, predece
ssor, |
| 413 ruleSet, rbnf, gEmptyString, status); |
| 414 } |
| 415 |
| 416 // special-case the ">>>" token, since searching for the > at the |
| 417 // end will actually find the > in the middle |
| 418 if (ruleText.indexOf(gGreaterGreaterGreater) == subStart) { |
| 419 subEnd = subStart + 2; |
| 420 |
| 421 // otherwise the substitution token ends with the same character |
| 422 // it began with |
| 423 } else { |
| 424 UChar c = ruleText.charAt(subStart); |
| 425 subEnd = ruleText.indexOf(c, subStart + 1); |
| 426 // special case for '<%foo<<' |
| 427 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 &&
ruleText.charAt(subEnd+1) == c) { |
| 428 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '
==' in the middle |
| 429 // occurs because of the juxtaposition of two different rules. The
check for '<' is a hack |
| 430 // to get around this. Having the duplicate at the front would caus
e problems with |
| 431 // rules like "<<%" to format, say, percents... |
| 432 ++subEnd; |
| 433 } |
| 434 } |
| 435 |
| 436 // if we don't find the end of the token (i.e., if we're on a single, |
| 437 // unmatched token character), create a null substitution positioned |
| 438 // at the end of the rule |
| 439 if (subEnd == -1) { |
| 440 return NFSubstitution::makeSubstitution(ruleText.length(), this, predece
ssor, |
| 441 ruleSet, rbnf, gEmptyString, status); |
| 442 } |
| 443 |
| 444 // if we get here, we have a real substitution token (or at least |
| 445 // some text bounded by substitution token characters). Use |
| 446 // makeSubstitution() to create the right kind of substitution |
| 447 UnicodeString subToken; |
| 448 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); |
| 449 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleS
et, |
| 450 rbnf, subToken, status); |
| 451 |
| 452 // remove the substitution from the rule text |
| 453 ruleText.removeBetween(subStart, subEnd+1); |
| 454 |
| 455 return result; |
| 456 } |
| 457 |
| 458 /** |
| 459 * Sets the rule's base value, and causes the radix and exponent |
| 460 * to be recalculated. This is used during construction when we |
| 461 * don't know the rule's base value until after it's been |
| 462 * constructed. It should be used at any other time. |
| 463 * @param The new base value for the rule. |
| 464 */ |
| 465 void |
| 466 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) |
| 467 { |
| 468 // set the base value |
| 469 baseValue = newBaseValue; |
| 470 |
| 471 // if this isn't a special rule, recalculate the radix and exponent |
| 472 // (the radix always defaults to 10; if it's supposed to be something |
| 473 // else, it's cleaned up by the caller and the exponent is |
| 474 // recalculated again-- the only function that does this is |
| 475 // NFRule.parseRuleDescriptor() ) |
| 476 if (baseValue >= 1) { |
| 477 radix = 10; |
| 478 exponent = expectedExponent(); |
| 479 |
| 480 // this function gets called on a fully-constructed rule whose |
| 481 // description didn't specify a base value. This means it |
| 482 // has substitutions, and some substitutions hold on to copies |
| 483 // of the rule's divisor. Fix their copies of the divisor. |
| 484 if (sub1 != NULL) { |
| 485 sub1->setDivisor(radix, exponent, status); |
| 486 } |
| 487 if (sub2 != NULL) { |
| 488 sub2->setDivisor(radix, exponent, status); |
| 489 } |
| 490 |
| 491 // if this is a special rule, its radix and exponent are basically |
| 492 // ignored. Set them to "safe" default values |
| 493 } else { |
| 494 radix = 10; |
| 495 exponent = 0; |
| 496 } |
| 497 } |
| 498 |
| 499 /** |
| 500 * This calculates the rule's exponent based on its radix and base |
| 501 * value. This will be the highest power the radix can be raised to |
| 502 * and still produce a result less than or equal to the base value. |
| 503 */ |
| 504 int16_t |
| 505 NFRule::expectedExponent() const |
| 506 { |
| 507 // since the log of 0, or the log base 0 of something, causes an |
| 508 // error, declare the exponent in these cases to be 0 (we also |
| 509 // deal with the special-rule identifiers here) |
| 510 if (radix == 0 || baseValue < 1) { |
| 511 return 0; |
| 512 } |
| 513 |
| 514 // we get rounding error in some cases-- for example, log 1000 / log 10 |
| 515 // gives us 1.9999999996 instead of 2. The extra logic here is to take |
| 516 // that into account |
| 517 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((doubl
e)radix)); |
| 518 int64_t temp = util64_pow(radix, tempResult + 1); |
| 519 if (temp <= baseValue) { |
| 520 tempResult += 1; |
| 521 } |
| 522 return tempResult; |
| 523 } |
| 524 |
| 525 /** |
| 526 * Searches the rule's rule text for any of the specified strings. |
| 527 * @param strings An array of strings to search the rule's rule |
| 528 * text for |
| 529 * @return The index of the first match in the rule's rule text |
| 530 * (i.e., the first substring in the rule's rule text that matches |
| 531 * _any_ of the strings in "strings"). If none of the strings in |
| 532 * "strings" is found in the rule's rule text, returns -1. |
| 533 */ |
| 534 int32_t |
| 535 NFRule::indexOfAny(const UChar* const strings[]) const |
| 536 { |
| 537 int result = -1; |
| 538 for (int i = 0; strings[i]; i++) { |
| 539 int32_t pos = ruleText.indexOf(*strings[i]); |
| 540 if (pos != -1 && (result == -1 || pos < result)) { |
| 541 result = pos; |
| 542 } |
| 543 } |
| 544 return result; |
| 545 } |
| 546 |
| 547 //----------------------------------------------------------------------- |
| 548 // boilerplate |
| 549 //----------------------------------------------------------------------- |
| 550 |
| 551 /** |
| 552 * Tests two rules for equality. |
| 553 * @param that The rule to compare this one against |
| 554 * @return True is the two rules are functionally equivalent |
| 555 */ |
| 556 UBool |
| 557 NFRule::operator==(const NFRule& rhs) const |
| 558 { |
| 559 return baseValue == rhs.baseValue |
| 560 && radix == rhs.radix |
| 561 && exponent == rhs.exponent |
| 562 && ruleText == rhs.ruleText |
| 563 && *sub1 == *rhs.sub1 |
| 564 && *sub2 == *rhs.sub2; |
| 565 } |
| 566 |
| 567 /** |
| 568 * Returns a textual representation of the rule. This won't |
| 569 * necessarily be the same as the description that this rule |
| 570 * was created with, but it will produce the same result. |
| 571 * @return A textual description of the rule |
| 572 */ |
| 573 static void util_append64(UnicodeString& result, int64_t n) |
| 574 { |
| 575 UChar buffer[256]; |
| 576 int32_t len = util64_tou(n, buffer, sizeof(buffer)); |
| 577 UnicodeString temp(buffer, len); |
| 578 result.append(temp); |
| 579 } |
| 580 |
| 581 void |
| 582 NFRule::_appendRuleText(UnicodeString& result) const |
| 583 { |
| 584 switch (getType()) { |
| 585 case kNegativeNumberRule: result.append(gMinusX); break; |
| 586 case kImproperFractionRule: result.append(gXDotX); break; |
| 587 case kProperFractionRule: result.append(gZeroDotX); break; |
| 588 case kMasterRule: result.append(gXDotZero); break; |
| 589 default: |
| 590 // for a normal rule, write out its base value, and if the radix is |
| 591 // something other than 10, write out the radix (with the preceding |
| 592 // slash, of course). Then calculate the expected exponent and if |
| 593 // if isn't the same as the actual exponent, write an appropriate |
| 594 // number of > signs. Finally, terminate the whole thing with |
| 595 // a colon. |
| 596 util_append64(result, baseValue); |
| 597 if (radix != 10) { |
| 598 result.append(gSlash); |
| 599 util_append64(result, radix); |
| 600 } |
| 601 int numCarets = expectedExponent() - exponent; |
| 602 for (int i = 0; i < numCarets; i++) { |
| 603 result.append(gGreaterThan); |
| 604 } |
| 605 break; |
| 606 } |
| 607 result.append(gColon); |
| 608 result.append(gSpace); |
| 609 |
| 610 // if the rule text begins with a space, write an apostrophe |
| 611 // (whitespace after the rule descriptor is ignored; the |
| 612 // apostrophe is used to make the whitespace significant) |
| 613 if (ruleText.startsWith(gSpace) && sub1->getPos() != 0) { |
| 614 result.append(gTick); |
| 615 } |
| 616 |
| 617 // now, write the rule's rule text, inserting appropriate |
| 618 // substitution tokens in the appropriate places |
| 619 UnicodeString ruleTextCopy; |
| 620 ruleTextCopy.setTo(ruleText); |
| 621 |
| 622 UnicodeString temp; |
| 623 sub2->toString(temp); |
| 624 ruleTextCopy.insert(sub2->getPos(), temp); |
| 625 sub1->toString(temp); |
| 626 ruleTextCopy.insert(sub1->getPos(), temp); |
| 627 |
| 628 result.append(ruleTextCopy); |
| 629 |
| 630 // and finally, top the whole thing off with a semicolon and |
| 631 // return the result |
| 632 result.append(gSemicolon); |
| 633 } |
| 634 |
| 635 //----------------------------------------------------------------------- |
| 636 // formatting |
| 637 //----------------------------------------------------------------------- |
| 638 |
| 639 /** |
| 640 * Formats the number, and inserts the resulting text into |
| 641 * toInsertInto. |
| 642 * @param number The number being formatted |
| 643 * @param toInsertInto The string where the resultant text should |
| 644 * be inserted |
| 645 * @param pos The position in toInsertInto where the resultant text |
| 646 * should be inserted |
| 647 */ |
| 648 void |
| 649 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos) const |
| 650 { |
| 651 // first, insert the rule's rule text into toInsertInto at the |
| 652 // specified position, then insert the results of the substitutions |
| 653 // into the right places in toInsertInto (notice we do the |
| 654 // substitutions in reverse order so that the offsets don't get |
| 655 // messed up) |
| 656 toInsertInto.insert(pos, ruleText); |
| 657 sub2->doSubstitution(number, toInsertInto, pos); |
| 658 sub1->doSubstitution(number, toInsertInto, pos); |
| 659 } |
| 660 |
| 661 /** |
| 662 * Formats the number, and inserts the resulting text into |
| 663 * toInsertInto. |
| 664 * @param number The number being formatted |
| 665 * @param toInsertInto The string where the resultant text should |
| 666 * be inserted |
| 667 * @param pos The position in toInsertInto where the resultant text |
| 668 * should be inserted |
| 669 */ |
| 670 void |
| 671 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const |
| 672 { |
| 673 // first, insert the rule's rule text into toInsertInto at the |
| 674 // specified position, then insert the results of the substitutions |
| 675 // into the right places in toInsertInto |
| 676 // [again, we have two copies of this routine that do the same thing |
| 677 // so that we don't sacrifice precision in a long by casting it |
| 678 // to a double] |
| 679 toInsertInto.insert(pos, ruleText); |
| 680 sub2->doSubstitution(number, toInsertInto, pos); |
| 681 sub1->doSubstitution(number, toInsertInto, pos); |
| 682 } |
| 683 |
| 684 /** |
| 685 * Used by the owning rule set to determine whether to invoke the |
| 686 * rollback rule (i.e., whether this rule or the one that precedes |
| 687 * it in the rule set's list should be used to format the number) |
| 688 * @param The number being formatted |
| 689 * @return True if the rule set should use the rule that precedes |
| 690 * this one in its list; false if it should use this rule |
| 691 */ |
| 692 UBool |
| 693 NFRule::shouldRollBack(double number) const |
| 694 { |
| 695 // we roll back if the rule contains a modulus substitution, |
| 696 // the number being formatted is an even multiple of the rule's |
| 697 // divisor, and the rule's base value is NOT an even multiple |
| 698 // of its divisor |
| 699 // In other words, if the original description had |
| 700 // 100: << hundred[ >>]; |
| 701 // that expands into |
| 702 // 100: << hundred; |
| 703 // 101: << hundred >>; |
| 704 // internally. But when we're formatting 200, if we use the rule |
| 705 // at 101, which would normally apply, we get "two hundred zero". |
| 706 // To prevent this, we roll back and use the rule at 100 instead. |
| 707 // This is the logic that makes this happen: the rule at 101 has |
| 708 // a modulus substitution, its base value isn't an even multiple |
| 709 // of 100, and the value we're trying to format _is_ an even |
| 710 // multiple of 100. This is called the "rollback rule." |
| 711 if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { |
| 712 int64_t re = util64_pow(radix, exponent); |
| 713 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; |
| 714 } |
| 715 return FALSE; |
| 716 } |
| 717 |
| 718 //----------------------------------------------------------------------- |
| 719 // parsing |
| 720 //----------------------------------------------------------------------- |
| 721 |
| 722 /** |
| 723 * Attempts to parse the string with this rule. |
| 724 * @param text The string being parsed |
| 725 * @param parsePosition On entry, the value is ignored and assumed to |
| 726 * be 0. On exit, this has been updated with the position of the first |
| 727 * character not consumed by matching the text against this rule |
| 728 * (if this rule doesn't match the text at all, the parse position |
| 729 * is left unchanged (presumably at 0) and the function returns |
| 730 * new Long(0)). |
| 731 * @param isFractionRule True if this rule is contained within a |
| 732 * fraction rule set. This is only used if the rule has no |
| 733 * substitutions. |
| 734 * @return If this rule matched the text, this is the rule's base value |
| 735 * combined appropriately with the results of parsing the substitutions. |
| 736 * If nothing matched, this is new Long(0) and the parse position is |
| 737 * left unchanged. The result will be an instance of Long if the |
| 738 * result is an integer and Double otherwise. The result is never null. |
| 739 */ |
| 740 #ifdef RBNF_DEBUG |
| 741 #include <stdio.h> |
| 742 |
| 743 static void dumpUS(FILE* f, const UnicodeString& us) { |
| 744 int len = us.length(); |
| 745 char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1]; |
| 746 if (buf != NULL) { |
| 747 us.extract(0, len, buf); |
| 748 buf[len] = 0; |
| 749 fprintf(f, "%s", buf); |
| 750 uprv_free(buf); //delete[] buf; |
| 751 } |
| 752 } |
| 753 #endif |
| 754 |
| 755 UBool |
| 756 NFRule::doParse(const UnicodeString& text, |
| 757 ParsePosition& parsePosition, |
| 758 UBool isFractionRule, |
| 759 double upperBound, |
| 760 Formattable& resVal) const |
| 761 { |
| 762 // internally we operate on a copy of the string being parsed |
| 763 // (because we're going to change it) and use our own ParsePosition |
| 764 ParsePosition pp; |
| 765 UnicodeString workText(text); |
| 766 |
| 767 // check to see whether the text before the first substitution |
| 768 // matches the text at the beginning of the string being |
| 769 // parsed. If it does, strip that off the front of workText; |
| 770 // otherwise, dump out with a mismatch |
| 771 UnicodeString prefix; |
| 772 prefix.setTo(ruleText, 0, sub1->getPos()); |
| 773 |
| 774 #ifdef RBNF_DEBUG |
| 775 fprintf(stderr, "doParse %x ", this); |
| 776 { |
| 777 UnicodeString rt; |
| 778 _appendRuleText(rt); |
| 779 dumpUS(stderr, rt); |
| 780 } |
| 781 |
| 782 fprintf(stderr, " text: '", this); |
| 783 dumpUS(stderr, text); |
| 784 fprintf(stderr, "' prefix: '"); |
| 785 dumpUS(stderr, prefix); |
| 786 #endif |
| 787 stripPrefix(workText, prefix, pp); |
| 788 int32_t prefixLength = text.length() - workText.length(); |
| 789 |
| 790 #ifdef RBNF_DEBUG |
| 791 fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), s
ub1->getPos()); |
| 792 #endif |
| 793 |
| 794 if (pp.getIndex() == 0 && sub1->getPos() != 0) { |
| 795 // commented out because ParsePosition doesn't have error index in 1.1.x |
| 796 // restored for ICU4C port |
| 797 parsePosition.setErrorIndex(pp.getErrorIndex()); |
| 798 resVal.setLong(0); |
| 799 return TRUE; |
| 800 } |
| 801 |
| 802 // this is the fun part. The basic guts of the rule-matching |
| 803 // logic is matchToDelimiter(), which is called twice. The first |
| 804 // time it searches the input string for the rule text BETWEEN |
| 805 // the substitutions and tries to match the intervening text |
| 806 // in the input string with the first substitution. If that |
| 807 // succeeds, it then calls it again, this time to look for the |
| 808 // rule text after the second substitution and to match the |
| 809 // intervening input text against the second substitution. |
| 810 // |
| 811 // For example, say we have a rule that looks like this: |
| 812 // first << middle >> last; |
| 813 // and input text that looks like this: |
| 814 // first one middle two last |
| 815 // First we use stripPrefix() to match "first " in both places and |
| 816 // strip it off the front, leaving |
| 817 // one middle two last |
| 818 // Then we use matchToDelimiter() to match " middle " and try to |
| 819 // match "one" against a substitution. If it's successful, we now |
| 820 // have |
| 821 // two last |
| 822 // We use matchToDelimiter() a second time to match " last" and |
| 823 // try to match "two" against a substitution. If "two" matches |
| 824 // the substitution, we have a successful parse. |
| 825 // |
| 826 // Since it's possible in many cases to find multiple instances |
| 827 // of each of these pieces of rule text in the input string, |
| 828 // we need to try all the possible combinations of these |
| 829 // locations. This prevents us from prematurely declaring a mismatch, |
| 830 // and makes sure we match as much input text as we can. |
| 831 int highWaterMark = 0; |
| 832 double result = 0; |
| 833 int start = 0; |
| 834 double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue); |
| 835 |
| 836 UnicodeString temp; |
| 837 do { |
| 838 // our partial parse result starts out as this rule's base |
| 839 // value. If it finds a successful match, matchToDelimiter() |
| 840 // will compose this in some way with what it gets back from |
| 841 // the substitution, giving us a new partial parse result |
| 842 pp.setIndex(0); |
| 843 |
| 844 temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos()); |
| 845 double partialResult = matchToDelimiter(workText, start, tempBaseValue, |
| 846 temp, pp, sub1, |
| 847 upperBound); |
| 848 |
| 849 // if we got a successful match (or were trying to match a |
| 850 // null substitution), pp is now pointing at the first unmatched |
| 851 // character. Take note of that, and try matchToDelimiter() |
| 852 // on the input text again |
| 853 if (pp.getIndex() != 0 || sub1->isNullSubstitution()) { |
| 854 start = pp.getIndex(); |
| 855 |
| 856 UnicodeString workText2; |
| 857 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getI
ndex()); |
| 858 ParsePosition pp2; |
| 859 |
| 860 // the second matchToDelimiter() will compose our previous |
| 861 // partial result with whatever it gets back from its |
| 862 // substitution if there's a successful match, giving us |
| 863 // a real result |
| 864 temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPo
s()); |
| 865 partialResult = matchToDelimiter(workText2, 0, partialResult, |
| 866 temp, pp2, sub2, |
| 867 upperBound); |
| 868 |
| 869 // if we got a successful match on this second |
| 870 // matchToDelimiter() call, update the high-water mark |
| 871 // and result (if necessary) |
| 872 if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) { |
| 873 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMar
k) { |
| 874 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex(
); |
| 875 result = partialResult; |
| 876 } |
| 877 } |
| 878 // commented out because ParsePosition doesn't have error index in 1
.1.x |
| 879 // restored for ICU4C port |
| 880 else { |
| 881 int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getInde
x(); |
| 882 if (temp> parsePosition.getErrorIndex()) { |
| 883 parsePosition.setErrorIndex(temp); |
| 884 } |
| 885 } |
| 886 } |
| 887 // commented out because ParsePosition doesn't have error index in 1.1.x |
| 888 // restored for ICU4C port |
| 889 else { |
| 890 int32_t temp = sub1->getPos() + pp.getErrorIndex(); |
| 891 if (temp > parsePosition.getErrorIndex()) { |
| 892 parsePosition.setErrorIndex(temp); |
| 893 } |
| 894 } |
| 895 // keep trying to match things until the outer matchToDelimiter() |
| 896 // call fails to make a match (each time, it picks up where it |
| 897 // left off the previous time) |
| 898 } while (sub1->getPos() != sub2->getPos() |
| 899 && pp.getIndex() > 0 |
| 900 && pp.getIndex() < workText.length() |
| 901 && pp.getIndex() != start); |
| 902 |
| 903 // update the caller's ParsePosition with our high-water mark |
| 904 // (i.e., it now points at the first character this function |
| 905 // didn't match-- the ParsePosition is therefore unchanged if |
| 906 // we didn't match anything) |
| 907 parsePosition.setIndex(highWaterMark); |
| 908 // commented out because ParsePosition doesn't have error index in 1.1.x |
| 909 // restored for ICU4C port |
| 910 if (highWaterMark > 0) { |
| 911 parsePosition.setErrorIndex(0); |
| 912 } |
| 913 |
| 914 // this is a hack for one unusual condition: Normally, whether this |
| 915 // rule belong to a fraction rule set or not is handled by its |
| 916 // substitutions. But if that rule HAS NO substitutions, then |
| 917 // we have to account for it here. By definition, if the matching |
| 918 // rule in a fraction rule set has no substitutions, its numerator |
| 919 // is 1, and so the result is the reciprocal of its base value. |
| 920 if (isFractionRule && |
| 921 highWaterMark > 0 && |
| 922 sub1->isNullSubstitution()) { |
| 923 result = 1 / result; |
| 924 } |
| 925 |
| 926 resVal.setDouble(result); |
| 927 return TRUE; // ??? do we need to worry if it is a long or a double? |
| 928 } |
| 929 |
| 930 /** |
| 931 * This function is used by parse() to match the text being parsed |
| 932 * against a possible prefix string. This function |
| 933 * matches characters from the beginning of the string being parsed |
| 934 * to characters from the prospective prefix. If they match, pp is |
| 935 * updated to the first character not matched, and the result is |
| 936 * the unparsed part of the string. If they don't match, the whole |
| 937 * string is returned, and pp is left unchanged. |
| 938 * @param text The string being parsed |
| 939 * @param prefix The text to match against |
| 940 * @param pp On entry, ignored and assumed to be 0. On exit, points |
| 941 * to the first unmatched character (assuming the whole prefix matched), |
| 942 * or is unchanged (if the whole prefix didn't match). |
| 943 * @return If things match, this is the unparsed part of "text"; |
| 944 * if they didn't match, this is "text". |
| 945 */ |
| 946 void |
| 947 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosit
ion& pp) const |
| 948 { |
| 949 // if the prefix text is empty, dump out without doing anything |
| 950 if (prefix.length() != 0) { |
| 951 UErrorCode status = U_ZERO_ERROR; |
| 952 // use prefixLength() to match the beginning of |
| 953 // "text" against "prefix". This function returns the |
| 954 // number of characters from "text" that matched (or 0 if |
| 955 // we didn't match the whole prefix) |
| 956 int32_t pfl = prefixLength(text, prefix, status); |
| 957 if (U_FAILURE(status)) { // Memory allocation error. |
| 958 return; |
| 959 } |
| 960 if (pfl != 0) { |
| 961 // if we got a successful match, update the parse position |
| 962 // and strip the prefix off of "text" |
| 963 pp.setIndex(pp.getIndex() + pfl); |
| 964 text.remove(0, pfl); |
| 965 } |
| 966 } |
| 967 } |
| 968 |
| 969 /** |
| 970 * Used by parse() to match a substitution and any following text. |
| 971 * "text" is searched for instances of "delimiter". For each instance |
| 972 * of delimiter, the intervening text is tested to see whether it |
| 973 * matches the substitution. The longest match wins. |
| 974 * @param text The string being parsed |
| 975 * @param startPos The position in "text" where we should start looking |
| 976 * for "delimiter". |
| 977 * @param baseValue A partial parse result (often the rule's base value), |
| 978 * which is combined with the result from matching the substitution |
| 979 * @param delimiter The string to search "text" for. |
| 980 * @param pp Ignored and presumed to be 0 on entry. If there's a match, |
| 981 * on exit this will point to the first unmatched character. |
| 982 * @param sub If we find "delimiter" in "text", this substitution is used |
| 983 * to match the text between the beginning of the string and the |
| 984 * position of "delimiter." (If "delimiter" is the empty string, then |
| 985 * this function just matches against this substitution and updates |
| 986 * everything accordingly.) |
| 987 * @param upperBound When matching the substitution, it will only |
| 988 * consider rules with base values lower than this value. |
| 989 * @return If there's a match, this is the result of composing |
| 990 * baseValue with the result of matching the substitution. Otherwise, |
| 991 * this is new Long(0). It's never null. If the result is an integer, |
| 992 * this will be an instance of Long; otherwise, it's an instance of |
| 993 * Double. |
| 994 * |
| 995 * !!! note {dlf} in point of fact, in the java code the caller always converts |
| 996 * the result to a double, so we might as well return one. |
| 997 */ |
| 998 double |
| 999 NFRule::matchToDelimiter(const UnicodeString& text, |
| 1000 int32_t startPos, |
| 1001 double _baseValue, |
| 1002 const UnicodeString& delimiter, |
| 1003 ParsePosition& pp, |
| 1004 const NFSubstitution* sub, |
| 1005 double upperBound) const |
| 1006 { |
| 1007 UErrorCode status = U_ZERO_ERROR; |
| 1008 // if "delimiter" contains real (i.e., non-ignorable) text, search |
| 1009 // it for "delimiter" beginning at "start". If that succeeds, then |
| 1010 // use "sub"'s doParse() method to match the text before the |
| 1011 // instance of "delimiter" we just found. |
| 1012 if (!allIgnorable(delimiter, status)) { |
| 1013 if (U_FAILURE(status)) { //Memory allocation error. |
| 1014 return 0; |
| 1015 } |
| 1016 ParsePosition tempPP; |
| 1017 Formattable result; |
| 1018 |
| 1019 // use findText() to search for "delimiter". It returns a two- |
| 1020 // element array: element 0 is the position of the match, and |
| 1021 // element 1 is the number of characters that matched |
| 1022 // "delimiter". |
| 1023 int32_t dLen; |
| 1024 int32_t dPos = findText(text, delimiter, startPos, &dLen); |
| 1025 |
| 1026 // if findText() succeeded, isolate the text preceding the |
| 1027 // match, and use "sub" to match that text |
| 1028 while (dPos >= 0) { |
| 1029 UnicodeString subText; |
| 1030 subText.setTo(text, 0, dPos); |
| 1031 if (subText.length() > 0) { |
| 1032 UBool success = sub->doParse(subText, tempPP, _baseValue, upperB
ound, |
| 1033 #if UCONFIG_NO_COLLATION |
| 1034 FALSE, |
| 1035 #else |
| 1036 formatter->isLenient(), |
| 1037 #endif |
| 1038 result); |
| 1039 |
| 1040 // if the substitution could match all the text up to |
| 1041 // where we found "delimiter", then this function has |
| 1042 // a successful match. Bump the caller's parse position |
| 1043 // to point to the first character after the text |
| 1044 // that matches "delimiter", and return the result |
| 1045 // we got from parsing the substitution. |
| 1046 if (success && tempPP.getIndex() == dPos) { |
| 1047 pp.setIndex(dPos + dLen); |
| 1048 return result.getDouble(); |
| 1049 } |
| 1050 // commented out because ParsePosition doesn't have error index
in 1.1.x |
| 1051 // restored for ICU4C port |
| 1052 else { |
| 1053 if (tempPP.getErrorIndex() > 0) { |
| 1054 pp.setErrorIndex(tempPP.getErrorIndex()); |
| 1055 } else { |
| 1056 pp.setErrorIndex(tempPP.getIndex()); |
| 1057 } |
| 1058 } |
| 1059 } |
| 1060 |
| 1061 // if we didn't match the substitution, search for another |
| 1062 // copy of "delimiter" in "text" and repeat the loop if |
| 1063 // we find it |
| 1064 tempPP.setIndex(0); |
| 1065 dPos = findText(text, delimiter, dPos + dLen, &dLen); |
| 1066 } |
| 1067 // if we make it here, this was an unsuccessful match, and we |
| 1068 // leave pp unchanged and return 0 |
| 1069 pp.setIndex(0); |
| 1070 return 0; |
| 1071 |
| 1072 // if "delimiter" is empty, or consists only of ignorable characters |
| 1073 // (i.e., is semantically empty), thwe we obviously can't search |
| 1074 // for "delimiter". Instead, just use "sub" to parse as much of |
| 1075 // "text" as possible. |
| 1076 } else { |
| 1077 ParsePosition tempPP; |
| 1078 Formattable result; |
| 1079 |
| 1080 // try to match the whole string against the substitution |
| 1081 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, |
| 1082 #if UCONFIG_NO_COLLATION |
| 1083 FALSE, |
| 1084 #else |
| 1085 formatter->isLenient(), |
| 1086 #endif |
| 1087 result); |
| 1088 if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { |
| 1089 // if there's a successful match (or it's a null |
| 1090 // substitution), update pp to point to the first |
| 1091 // character we didn't match, and pass the result from |
| 1092 // sub.doParse() on through to the caller |
| 1093 pp.setIndex(tempPP.getIndex()); |
| 1094 return result.getDouble(); |
| 1095 } |
| 1096 // commented out because ParsePosition doesn't have error index in 1.1.x |
| 1097 // restored for ICU4C port |
| 1098 else { |
| 1099 pp.setErrorIndex(tempPP.getErrorIndex()); |
| 1100 } |
| 1101 |
| 1102 // and if we get to here, then nothing matched, so we return |
| 1103 // 0 and leave pp alone |
| 1104 return 0; |
| 1105 } |
| 1106 } |
| 1107 |
| 1108 /** |
| 1109 * Used by stripPrefix() to match characters. If lenient parse mode |
| 1110 * is off, this just calls startsWith(). If lenient parse mode is on, |
| 1111 * this function uses CollationElementIterators to match characters in |
| 1112 * the strings (only primary-order differences are significant in |
| 1113 * determining whether there's a match). |
| 1114 * @param str The string being tested |
| 1115 * @param prefix The text we're hoping to see at the beginning |
| 1116 * of "str" |
| 1117 * @return If "prefix" is found at the beginning of "str", this |
| 1118 * is the number of characters in "str" that were matched (this |
| 1119 * isn't necessarily the same as the length of "prefix" when matching |
| 1120 * text with a collator). If there's no match, this is 0. |
| 1121 */ |
| 1122 int32_t |
| 1123 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErr
orCode& status) const |
| 1124 { |
| 1125 // if we're looking for an empty prefix, it obviously matches |
| 1126 // zero characters. Just go ahead and return 0. |
| 1127 if (prefix.length() == 0) { |
| 1128 return 0; |
| 1129 } |
| 1130 |
| 1131 #if !UCONFIG_NO_COLLATION |
| 1132 // go through all this grief if we're in lenient-parse mode |
| 1133 if (formatter->isLenient()) { |
| 1134 // get the formatter's collator and use it to create two |
| 1135 // collation element iterators, one over the target string |
| 1136 // and another over the prefix (right now, we'll throw an |
| 1137 // exception if the collator we get back from the formatter |
| 1138 // isn't a RuleBasedCollator, because RuleBasedCollator defines |
| 1139 // the CollationElementIterator protocol. Hopefully, this |
| 1140 // will change someday.) |
| 1141 RuleBasedCollator* collator = (RuleBasedCollator*)formatter->getCollator
(); |
| 1142 CollationElementIterator* strIter = collator->createCollationElementIter
ator(str); |
| 1143 CollationElementIterator* prefixIter = collator->createCollationElementI
terator(prefix); |
| 1144 // Check for memory allocation error. |
| 1145 if (collator == NULL || strIter == NULL || prefixIter == NULL) { |
| 1146 delete collator; |
| 1147 delete strIter; |
| 1148 delete prefixIter; |
| 1149 status = U_MEMORY_ALLOCATION_ERROR; |
| 1150 return 0; |
| 1151 } |
| 1152 |
| 1153 UErrorCode err = U_ZERO_ERROR; |
| 1154 |
| 1155 // The original code was problematic. Consider this match: |
| 1156 // prefix = "fifty-" |
| 1157 // string = " fifty-7" |
| 1158 // The intent is to match string up to the '7', by matching 'fifty-' at
position 1 |
| 1159 // in the string. Unfortunately, we were getting a match, and then comp
uting where |
| 1160 // the match terminated by rematching the string. The rematch code was
using as an |
| 1161 // initial guess the substring of string between 0 and prefix.length. B
ecause of |
| 1162 // the leading space and trailing hyphen (both ignorable) this was succe
eding, leaving |
| 1163 // the position before the hyphen in the string. Recursing down, we the
n parsed the |
| 1164 // remaining string '-7' as numeric. The resulting number turned out as
43 (50 - 7). |
| 1165 // This was not pretty, especially since the string "fifty-7" parsed jus
t fine. |
| 1166 // |
| 1167 // We have newer APIs now, so we can use calls on the iterator to determ
ine what we |
| 1168 // matched up to. If we terminate because we hit the last element in th
e string, |
| 1169 // our match terminates at this length. If we terminate because we hit
the last element |
| 1170 // in the target, our match terminates at one before the element iterato
r position. |
| 1171 |
| 1172 // match collation elements between the strings |
| 1173 int32_t oStr = strIter->next(err); |
| 1174 int32_t oPrefix = prefixIter->next(err); |
| 1175 |
| 1176 while (oPrefix != CollationElementIterator::NULLORDER) { |
| 1177 // skip over ignorable characters in the target string |
| 1178 while (CollationElementIterator::primaryOrder(oStr) == 0 |
| 1179 && oStr != CollationElementIterator::NULLORDER) { |
| 1180 oStr = strIter->next(err); |
| 1181 } |
| 1182 |
| 1183 // skip over ignorable characters in the prefix |
| 1184 while (CollationElementIterator::primaryOrder(oPrefix) == 0 |
| 1185 && oPrefix != CollationElementIterator::NULLORDER) { |
| 1186 oPrefix = prefixIter->next(err); |
| 1187 } |
| 1188 |
| 1189 // dlf: move this above following test, if we consume the |
| 1190 // entire target, aren't we ok even if the source was also |
| 1191 // entirely consumed? |
| 1192 |
| 1193 // if skipping over ignorables brought to the end of |
| 1194 // the prefix, we DID match: drop out of the loop |
| 1195 if (oPrefix == CollationElementIterator::NULLORDER) { |
| 1196 break; |
| 1197 } |
| 1198 |
| 1199 // if skipping over ignorables brought us to the end |
| 1200 // of the target string, we didn't match and return 0 |
| 1201 if (oStr == CollationElementIterator::NULLORDER) { |
| 1202 delete prefixIter; |
| 1203 delete strIter; |
| 1204 return 0; |
| 1205 } |
| 1206 |
| 1207 // match collation elements from the two strings |
| 1208 // (considering only primary differences). If we |
| 1209 // get a mismatch, dump out and return 0 |
| 1210 if (CollationElementIterator::primaryOrder(oStr) |
| 1211 != CollationElementIterator::primaryOrder(oPrefix)) { |
| 1212 delete prefixIter; |
| 1213 delete strIter; |
| 1214 return 0; |
| 1215 |
| 1216 // otherwise, advance to the next character in each string |
| 1217 // and loop (we drop out of the loop when we exhaust |
| 1218 // collation elements in the prefix) |
| 1219 } else { |
| 1220 oStr = strIter->next(err); |
| 1221 oPrefix = prefixIter->next(err); |
| 1222 } |
| 1223 } |
| 1224 |
| 1225 int32_t result = strIter->getOffset(); |
| 1226 if (oStr != CollationElementIterator::NULLORDER) { |
| 1227 --result; // back over character that we don't want to consume; |
| 1228 } |
| 1229 |
| 1230 #ifdef RBNF_DEBUG |
| 1231 fprintf(stderr, "prefix length: %d\n", result); |
| 1232 #endif |
| 1233 delete prefixIter; |
| 1234 delete strIter; |
| 1235 |
| 1236 return result; |
| 1237 #if 0 |
| 1238 //---------------------------------------------------------------- |
| 1239 // JDK 1.2-specific API call |
| 1240 // return strIter.getOffset(); |
| 1241 //---------------------------------------------------------------- |
| 1242 // JDK 1.1 HACK (take out for 1.2-specific code) |
| 1243 |
| 1244 // if we make it to here, we have a successful match. Now we |
| 1245 // have to find out HOW MANY characters from the target string |
| 1246 // matched the prefix (there isn't necessarily a one-to-one |
| 1247 // mapping between collation elements and characters). |
| 1248 // In JDK 1.2, there's a simple getOffset() call we can use. |
| 1249 // In JDK 1.1, on the other hand, we have to go through some |
| 1250 // ugly contortions. First, use the collator to compare the |
| 1251 // same number of characters from the prefix and target string. |
| 1252 // If they're equal, we're done. |
| 1253 collator->setStrength(Collator::PRIMARY); |
| 1254 if (str.length() >= prefix.length()) { |
| 1255 UnicodeString temp; |
| 1256 temp.setTo(str, 0, prefix.length()); |
| 1257 if (collator->equals(temp, prefix)) { |
| 1258 #ifdef RBNF_DEBUG |
| 1259 fprintf(stderr, "returning: %d\n", prefix.length()); |
| 1260 #endif |
| 1261 return prefix.length(); |
| 1262 } |
| 1263 } |
| 1264 |
| 1265 // if they're not equal, then we have to compare successively |
| 1266 // larger and larger substrings of the target string until we |
| 1267 // get to one that matches the prefix. At that point, we know |
| 1268 // how many characters matched the prefix, and we can return. |
| 1269 int32_t p = 1; |
| 1270 while (p <= str.length()) { |
| 1271 UnicodeString temp; |
| 1272 temp.setTo(str, 0, p); |
| 1273 if (collator->equals(temp, prefix)) { |
| 1274 return p; |
| 1275 } else { |
| 1276 ++p; |
| 1277 } |
| 1278 } |
| 1279 |
| 1280 // SHOULD NEVER GET HERE!!! |
| 1281 return 0; |
| 1282 //---------------------------------------------------------------- |
| 1283 #endif |
| 1284 |
| 1285 // If lenient parsing is turned off, forget all that crap above. |
| 1286 // Just use String.startsWith() and be done with it. |
| 1287 } else |
| 1288 #endif |
| 1289 { |
| 1290 if (str.startsWith(prefix)) { |
| 1291 return prefix.length(); |
| 1292 } else { |
| 1293 return 0; |
| 1294 } |
| 1295 } |
| 1296 } |
| 1297 |
| 1298 /** |
| 1299 * Searches a string for another string. If lenient parsing is off, |
| 1300 * this just calls indexOf(). If lenient parsing is on, this function |
| 1301 * uses CollationElementIterator to match characters, and only |
| 1302 * primary-order differences are significant in determining whether |
| 1303 * there's a match. |
| 1304 * @param str The string to search |
| 1305 * @param key The string to search "str" for |
| 1306 * @param startingAt The index into "str" where the search is to |
| 1307 * begin |
| 1308 * @return A two-element array of ints. Element 0 is the position |
| 1309 * of the match, or -1 if there was no match. Element 1 is the |
| 1310 * number of characters in "str" that matched (which isn't necessarily |
| 1311 * the same as the length of "key") |
| 1312 */ |
| 1313 int32_t |
| 1314 NFRule::findText(const UnicodeString& str, |
| 1315 const UnicodeString& key, |
| 1316 int32_t startingAt, |
| 1317 int32_t* length) const |
| 1318 { |
| 1319 #if !UCONFIG_NO_COLLATION |
| 1320 // if lenient parsing is turned off, this is easy: just call |
| 1321 // String.indexOf() and we're done |
| 1322 if (!formatter->isLenient()) { |
| 1323 *length = key.length(); |
| 1324 return str.indexOf(key, startingAt); |
| 1325 |
| 1326 // but if lenient parsing is turned ON, we've got some work |
| 1327 // ahead of us |
| 1328 } else |
| 1329 #endif |
| 1330 { |
| 1331 //---------------------------------------------------------------- |
| 1332 // JDK 1.1 HACK (take out of 1.2-specific code) |
| 1333 |
| 1334 // in JDK 1.2, CollationElementIterator provides us with an |
| 1335 // API to map between character offsets and collation elements |
| 1336 // and we can do this by marching through the string comparing |
| 1337 // collation elements. We can't do that in JDK 1.1. Insted, |
| 1338 // we have to go through this horrible slow mess: |
| 1339 int32_t p = startingAt; |
| 1340 int32_t keyLen = 0; |
| 1341 |
| 1342 // basically just isolate smaller and smaller substrings of |
| 1343 // the target string (each running to the end of the string, |
| 1344 // and with the first one running from startingAt to the end) |
| 1345 // and then use prefixLength() to see if the search key is at |
| 1346 // the beginning of each substring. This is excruciatingly |
| 1347 // slow, but it will locate the key and tell use how long the |
| 1348 // matching text was. |
| 1349 UnicodeString temp; |
| 1350 UErrorCode status = U_ZERO_ERROR; |
| 1351 while (p < str.length() && keyLen == 0) { |
| 1352 temp.setTo(str, p, str.length() - p); |
| 1353 keyLen = prefixLength(temp, key, status); |
| 1354 if (U_FAILURE(status)) { |
| 1355 break; |
| 1356 } |
| 1357 if (keyLen != 0) { |
| 1358 *length = keyLen; |
| 1359 return p; |
| 1360 } |
| 1361 ++p; |
| 1362 } |
| 1363 // if we make it to here, we didn't find it. Return -1 for the |
| 1364 // location. The length should be ignored, but set it to 0, |
| 1365 // which should be "safe" |
| 1366 *length = 0; |
| 1367 return -1; |
| 1368 |
| 1369 //---------------------------------------------------------------- |
| 1370 // JDK 1.2 version of this routine |
| 1371 //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator(
); |
| 1372 // |
| 1373 //CollationElementIterator strIter = collator.getCollationElementIterato
r(str); |
| 1374 //CollationElementIterator keyIter = collator.getCollationElementIterato
r(key); |
| 1375 // |
| 1376 //int keyStart = -1; |
| 1377 // |
| 1378 //str.setOffset(startingAt); |
| 1379 // |
| 1380 //int oStr = strIter.next(); |
| 1381 //int oKey = keyIter.next(); |
| 1382 //while (oKey != CollationElementIterator.NULLORDER) { |
| 1383 // while (oStr != CollationElementIterator.NULLORDER && |
| 1384 // CollationElementIterator.primaryOrder(oStr) == 0) |
| 1385 // oStr = strIter.next(); |
| 1386 // |
| 1387 // while (oKey != CollationElementIterator.NULLORDER && |
| 1388 // CollationElementIterator.primaryOrder(oKey) == 0) |
| 1389 // oKey = keyIter.next(); |
| 1390 // |
| 1391 // if (oStr == CollationElementIterator.NULLORDER) { |
| 1392 // return new int[] { -1, 0 }; |
| 1393 // } |
| 1394 // |
| 1395 // if (oKey == CollationElementIterator.NULLORDER) { |
| 1396 // break; |
| 1397 // } |
| 1398 // |
| 1399 // if (CollationElementIterator.primaryOrder(oStr) == |
| 1400 // CollationElementIterator.primaryOrder(oKey)) { |
| 1401 // keyStart = strIter.getOffset(); |
| 1402 // oStr = strIter.next(); |
| 1403 // oKey = keyIter.next(); |
| 1404 // } else { |
| 1405 // if (keyStart != -1) { |
| 1406 // keyStart = -1; |
| 1407 // keyIter.reset(); |
| 1408 // } else { |
| 1409 // oStr = strIter.next(); |
| 1410 // } |
| 1411 // } |
| 1412 //} |
| 1413 // |
| 1414 //if (oKey == CollationElementIterator.NULLORDER) { |
| 1415 // return new int[] { keyStart, strIter.getOffset() - keyStart }; |
| 1416 //} else { |
| 1417 // return new int[] { -1, 0 }; |
| 1418 //} |
| 1419 } |
| 1420 } |
| 1421 |
| 1422 /** |
| 1423 * Checks to see whether a string consists entirely of ignorable |
| 1424 * characters. |
| 1425 * @param str The string to test. |
| 1426 * @return true if the string is empty of consists entirely of |
| 1427 * characters that the number formatter's collator says are |
| 1428 * ignorable at the primary-order level. false otherwise. |
| 1429 */ |
| 1430 UBool |
| 1431 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const |
| 1432 { |
| 1433 // if the string is empty, we can just return true |
| 1434 if (str.length() == 0) { |
| 1435 return TRUE; |
| 1436 } |
| 1437 |
| 1438 #if !UCONFIG_NO_COLLATION |
| 1439 // if lenient parsing is turned on, walk through the string with |
| 1440 // a collation element iterator and make sure each collation |
| 1441 // element is 0 (ignorable) at the primary level |
| 1442 if (formatter->isLenient()) { |
| 1443 RuleBasedCollator* collator = (RuleBasedCollator*)(formatter->getCollato
r()); |
| 1444 CollationElementIterator* iter = collator->createCollationElementIterato
r(str); |
| 1445 |
| 1446 // Memory allocation error check. |
| 1447 if (collator == NULL || iter == NULL) { |
| 1448 delete collator; |
| 1449 delete iter; |
| 1450 status = U_MEMORY_ALLOCATION_ERROR; |
| 1451 return FALSE; |
| 1452 } |
| 1453 |
| 1454 UErrorCode err = U_ZERO_ERROR; |
| 1455 int32_t o = iter->next(err); |
| 1456 while (o != CollationElementIterator::NULLORDER |
| 1457 && CollationElementIterator::primaryOrder(o) == 0) { |
| 1458 o = iter->next(err); |
| 1459 } |
| 1460 |
| 1461 delete iter; |
| 1462 return o == CollationElementIterator::NULLORDER; |
| 1463 } |
| 1464 #endif |
| 1465 |
| 1466 // if lenient parsing is turned off, there is no such thing as |
| 1467 // an ignorable character: return true only if the string is empty |
| 1468 return FALSE; |
| 1469 } |
| 1470 |
| 1471 U_NAMESPACE_END |
| 1472 |
| 1473 /* U_HAVE_RBNF */ |
| 1474 #endif |
| 1475 |
| 1476 |
OLD | NEW |