OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2004-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: xmlparser.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2004jul21 |
| 14 * created by: Andy Heninger |
| 15 */ |
| 16 |
| 17 #include <stdio.h> |
| 18 #include "unicode/uchar.h" |
| 19 #include "unicode/ucnv.h" |
| 20 #include "unicode/regex.h" |
| 21 #include "filestrm.h" |
| 22 #include "xmlparser.h" |
| 23 |
| 24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION |
| 25 |
| 26 // character constants |
// Code points of the ASCII characters the parser needs to name explicitly.
// They are spelled as numbers so the values do not depend on the
// compiler's execution character set.
enum {
    x_QUOT = 0x22,   // "
    x_AMP  = 0x26,   // &
    x_APOS = 0x27,   // '
    x_LT   = 0x3C,   // <
    x_GT   = 0x3E,   // >
    x_l    = 0x6C    // l (lower case L)
};
| 35 |
// Regular-expression fragments, written as C string literals so that they
// can be pasted together with adjacent-literal concatenation.
// The doubled backslashes put a single backslash into the pattern text,
// leaving the \uXXXX escapes for the regex compiler to interpret.

// XML white space: space, tab, carriage return, line feed.
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4  (NameStartChar): a set of the characters that may begin a name.
#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5  (NameChar): the NameStartChar set plus '-', '.', digits,
//         U+00B7 and two further ranges.
#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML #6  (Name): one NameStartChar followed by zero or more NameChars.
#define XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
| 49 |
| 50 U_NAMESPACE_BEGIN |
| 51 |
| 52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) |
| 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) |
| 54 |
//
//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
//                             used for parsing.
//
//   Every matcher is built from a pattern string with flags 0 and reports
//   construction problems through 'status'.  No input is attached here;
//   the matchers are reset() onto the text to be parsed later.
//
UXMLParser::UXMLParser(UErrorCode &status) :
      //  XML Declaration.  XML Production #23.
      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
      //            allow for a possible leading BOM.
      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),

      //  XML Comment   production #15
      //     example:  "<!-- whatever -->
      //       note, does not detect an illegal "--" within comments
      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),

      //  XML Spaces
      //      production [3]
      //      One or more white space characters.
      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),

      //  XML Doctype decl  production #28
      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
      //       or      "<!DOCTYPE foo [internal dtd]>
      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
      //           Some internal dtd subsets could confuse this simple-minded
      //           attempt at skipping over them, specifically, occurrences
      //           of closing square brackets.  These could appear in comments,
      //           or in parameter entity declarations, for example.
      mXMLDoctype(UnicodeString(
           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
           ), 0, status),

      //  XML PI     production #16
      //     example   "<?target stuff?>
      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),

      //  XML Element Start   Productions #40, #41
      //          example   <foo att1='abc'  att2="d e f" >
      //      capture #1:  the tag name
      //
      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"

      //  XML Element End     production #42
      //     example   </foo>
      //     capture #1:  the tag name
      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),

      // XML Element Empty    production #44
      //     example   <foo att1="abc"   att2="d e f" />
      //     Same pattern as the element start above, except that it ends in "/>".
      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"


      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

      // Attribute name = "value".  XML Productions 10, 40/41
      //  Capture group 1 is name,
      //                2 is the attribute value, including the quotes.
      //
      //   Note that attributes are scanned twice.  The first time is with
      //        the regex for an entire element start.  There, the attributes
      //        are checked syntactically, but not separated out one by one.
      //        Here, we match a single attribute, and make its name and
      //        attribute value available to the parser code.
      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
              "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),


      // Matches a single XML white space character; used when normalizing
      // attribute values.
      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

      // Match any of the new-line sequences in content.
      //   All are changed to \u000a.
      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),

      // & char references
      //   We will figure out what we've got based on which capture group has content.
      //   The last one is a catchall for unrecognized entity references..
      //   Capture groups:  1 amp;   2 lt;   3 gt;   4 apos;   5 quot;
      //                    6 hex digits of &#x...;   7 decimal digits of &#...;
      //                    8 any single character (the catchall)
      //   NOTE(review): unlike the other patterns this string is built without
      //                 (-1, US_INV) - confirm whether that is intentional.
      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
                0, status),

      fNames(status),
      fElementStack(status),
      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
  {
  }
| 150 |
| 151 UXMLParser * |
| 152 UXMLParser::createParser(UErrorCode &errorCode) { |
| 153 if (U_FAILURE(errorCode)) { |
| 154 return NULL; |
| 155 } else { |
| 156 return new UXMLParser(errorCode); |
| 157 } |
| 158 } |
| 159 |
| 160 UXMLParser::~UXMLParser() {} |
| 161 |
| 162 UXMLElement * |
| 163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { |
| 164 char bytes[4096], charsetBuffer[100]; |
| 165 FileStream *f; |
| 166 const char *charset, *pb; |
| 167 UnicodeString src; |
| 168 UConverter *cnv; |
| 169 UChar *buffer, *pu; |
| 170 int32_t fileLength, bytesLength, length, capacity; |
| 171 UBool flush; |
| 172 |
| 173 if(U_FAILURE(errorCode)) { |
| 174 return NULL; |
| 175 } |
| 176 |
| 177 f=T_FileStream_open(filename, "rb"); |
| 178 if(f==NULL) { |
| 179 errorCode=U_FILE_ACCESS_ERROR; |
| 180 return NULL; |
| 181 } |
| 182 |
| 183 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
| 184 if(bytesLength<(int32_t)sizeof(bytes)) { |
| 185 // we have already read the entire file |
| 186 fileLength=bytesLength; |
| 187 } else { |
| 188 // get the file length |
| 189 fileLength=T_FileStream_size(f); |
| 190 } |
| 191 |
| 192 /* |
| 193 * get the charset: |
| 194 * 1. Unicode signature |
| 195 * 2. treat as ISO-8859-1 and read XML encoding="charser" |
| 196 * 3. default to UTF-8 |
| 197 */ |
| 198 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); |
| 199 if(U_SUCCESS(errorCode) && charset!=NULL) { |
| 200 // open converter according to Unicode signature |
| 201 cnv=ucnv_open(charset, &errorCode); |
| 202 } else { |
| 203 // read as Latin-1 and parse the XML declaration and encoding |
| 204 cnv=ucnv_open("ISO-8859-1", &errorCode); |
| 205 if(U_FAILURE(errorCode)) { |
| 206 // unexpected error opening Latin-1 converter |
| 207 goto exit; |
| 208 } |
| 209 |
| 210 buffer=src.getBuffer(bytesLength); |
| 211 if(buffer==NULL) { |
| 212 // unexpected failure to reserve some string capacity |
| 213 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 214 goto exit; |
| 215 } |
| 216 pb=bytes; |
| 217 pu=buffer; |
| 218 ucnv_toUnicode( |
| 219 cnv, |
| 220 &pu, buffer+src.getCapacity(), |
| 221 &pb, bytes+bytesLength, |
| 222 NULL, TRUE, &errorCode); |
| 223 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
| 224 ucnv_close(cnv); |
| 225 cnv=NULL; |
| 226 if(U_FAILURE(errorCode)) { |
| 227 // unexpected error in conversion from Latin-1 |
| 228 src.remove(); |
| 229 goto exit; |
| 230 } |
| 231 |
| 232 // parse XML declaration |
| 233 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { |
| 234 int32_t declEnd=mXMLDecl.end(errorCode); |
| 235 // go beyond <?xml |
| 236 int32_t pos=src.indexOf((UChar)x_l)+1; |
| 237 |
| 238 mAttrValue.reset(src); |
| 239 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loo
p runs once per attribute on this element. |
| 240 UnicodeString attName = mAttrValue.group(1, errorCode); |
| 241 UnicodeString attValue = mAttrValue.group(2, errorCode); |
| 242 |
| 243 // Trim the quotes from the att value. These are left over from
the original regex |
| 244 // that parsed the attribue, which couldn't conveniently strip
them. |
| 245 attValue.remove(0,1); // one char from the be
ginning |
| 246 attValue.truncate(attValue.length()-1); // and one from the end
. |
| 247 |
| 248 if(attName==UNICODE_STRING("encoding", 8)) { |
| 249 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32
_t)sizeof(charsetBuffer)); |
| 250 charset=charsetBuffer; |
| 251 break; |
| 252 } |
| 253 pos = mAttrValue.end(2, errorCode); |
| 254 } |
| 255 |
| 256 if(charset==NULL) { |
| 257 // default to UTF-8 |
| 258 charset="UTF-8"; |
| 259 } |
| 260 cnv=ucnv_open(charset, &errorCode); |
| 261 } |
| 262 } |
| 263 |
| 264 if(U_FAILURE(errorCode)) { |
| 265 // unable to open the converter |
| 266 goto exit; |
| 267 } |
| 268 |
| 269 // convert the file contents |
| 270 capacity=fileLength; // estimated capacity |
| 271 src.getBuffer(capacity); |
| 272 src.releaseBuffer(0); // zero length |
| 273 flush=FALSE; |
| 274 for(;;) { |
| 275 // convert contents of bytes[bytesLength] |
| 276 pb=bytes; |
| 277 for(;;) { |
| 278 length=src.length(); |
| 279 buffer=src.getBuffer(capacity); |
| 280 if(buffer==NULL) { |
| 281 // unexpected failure to reserve some string capacity |
| 282 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 283 goto exit; |
| 284 } |
| 285 |
| 286 pu=buffer+length; |
| 287 ucnv_toUnicode( |
| 288 cnv, &pu, buffer+src.getCapacity(), |
| 289 &pb, bytes+bytesLength, |
| 290 NULL, FALSE, &errorCode); |
| 291 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); |
| 292 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| 293 errorCode=U_ZERO_ERROR; |
| 294 capacity=(3*src.getCapacity())/2; // increase capacity by 50% |
| 295 } else { |
| 296 break; |
| 297 } |
| 298 } |
| 299 |
| 300 if(U_FAILURE(errorCode)) { |
| 301 break; // conversion error |
| 302 } |
| 303 |
| 304 if(flush) { |
| 305 break; // completely converted the file |
| 306 } |
| 307 |
| 308 // read next block |
| 309 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); |
| 310 if(bytesLength==0) { |
| 311 // reached end of file, convert once more to flush the converter |
| 312 flush=TRUE; |
| 313 } |
| 314 }; |
| 315 |
| 316 exit: |
| 317 ucnv_close(cnv); |
| 318 T_FileStream_close(f); |
| 319 |
| 320 if(U_SUCCESS(errorCode)) { |
| 321 return parse(src, errorCode); |
| 322 } else { |
| 323 return NULL; |
| 324 } |
| 325 } |
| 326 |
| 327 UXMLElement * |
| 328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { |
| 329 if(U_FAILURE(status)) { |
| 330 return NULL; |
| 331 } |
| 332 |
| 333 UXMLElement *root = NULL; |
| 334 fPos = 0; // TODO use just a local pos variable and pass it into functions |
| 335 // where necessary? |
| 336 |
| 337 // set all matchers to work on the input string |
| 338 mXMLDecl.reset(src); |
| 339 mXMLComment.reset(src); |
| 340 mXMLSP.reset(src); |
| 341 mXMLDoctype.reset(src); |
| 342 mXMLPI.reset(src); |
| 343 mXMLElemStart.reset(src); |
| 344 mXMLElemEnd.reset(src); |
| 345 mXMLElemEmpty.reset(src); |
| 346 mXMLCharData.reset(src); |
| 347 mAttrValue.reset(src); |
| 348 mAttrNormalizer.reset(src); |
| 349 mNewLineNormalizer.reset(src); |
| 350 mAmps.reset(src); |
| 351 |
| 352 // Consume the XML Declaration, if present. |
| 353 if (mXMLDecl.lookingAt(fPos, status)) { |
| 354 fPos = mXMLDecl.end(status); |
| 355 } |
| 356 |
| 357 // Consume "misc" [XML production 27] appearing before DocType |
| 358 parseMisc(status); |
| 359 |
| 360 // Consume a DocType declaration, if present. |
| 361 if (mXMLDoctype.lookingAt(fPos, status)) { |
| 362 fPos = mXMLDoctype.end(status); |
| 363 } |
| 364 |
| 365 // Consume additional "misc" [XML production 27] appearing after the DocType |
| 366 parseMisc(status); |
| 367 |
| 368 // Get the root element |
| 369 if (mXMLElemEmpty.lookingAt(fPos, status)) { |
| 370 // Root is an empty element (no nested elements or content) |
| 371 root = createElement(mXMLElemEmpty, status); |
| 372 fPos = mXMLElemEmpty.end(status); |
| 373 } else { |
| 374 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { |
| 375 error("Root Element expected", status); |
| 376 goto errorExit; |
| 377 } |
| 378 root = createElement(mXMLElemStart, status); |
| 379 UXMLElement *el = root; |
| 380 |
| 381 // |
| 382 // This is the loop that consumes the root element of the document, |
| 383 // including all nested content. Nested elements are handled by |
| 384 // explicit pushes/pops of the element stack; there is no recursion |
| 385 // in the control flow of this code. |
| 386 // "el" always refers to the current element, the one to which cont
ent |
| 387 // is being added. It is above the top of the element stack. |
| 388 for (;;) { |
| 389 // Nested Element Start |
| 390 if (mXMLElemStart.lookingAt(fPos, status)) { |
| 391 UXMLElement *t = createElement(mXMLElemStart, status); |
| 392 el->fChildren.addElement(t, status); |
| 393 t->fParent = el; |
| 394 fElementStack.push(el, status); |
| 395 el = t; |
| 396 continue; |
| 397 } |
| 398 |
| 399 // Text Content. String is concatenated onto the current node's con
tent, |
| 400 // but only if it contains something other than space
s. |
| 401 UnicodeString s = scanContent(status); |
| 402 if (s.length() > 0) { |
| 403 mXMLSP.reset(s); |
| 404 if (mXMLSP.matches(status) == FALSE) { |
| 405 // This chunk of text contains something other than just |
| 406 // white space. Make a child node for it. |
| 407 replaceCharRefs(s, status); |
| 408 el->fChildren.addElement(s.clone(), status); |
| 409 } |
| 410 mXMLSP.reset(src); // The matchers need to stay set to the ma
in input string. |
| 411 continue; |
| 412 } |
| 413 |
| 414 // Comments. Discard. |
| 415 if (mXMLComment.lookingAt(fPos, status)) { |
| 416 fPos = mXMLComment.end(status); |
| 417 continue; |
| 418 } |
| 419 |
| 420 // PIs. Discard. |
| 421 if (mXMLPI.lookingAt(fPos, status)) { |
| 422 fPos = mXMLPI.end(status); |
| 423 continue; |
| 424 } |
| 425 |
| 426 // Element End |
| 427 if (mXMLElemEnd.lookingAt(fPos, status)) { |
| 428 fPos = mXMLElemEnd.end(0, status); |
| 429 const UnicodeString name = mXMLElemEnd.group(1, status); |
| 430 if (name != *el->fName) { |
| 431 error("Element start / end tag mismatch", status); |
| 432 goto errorExit; |
| 433 } |
| 434 if (fElementStack.empty()) { |
| 435 // Close of the root element. We're done with the doc. |
| 436 el = NULL; |
| 437 break; |
| 438 } |
| 439 el = (UXMLElement *)fElementStack.pop(); |
| 440 continue; |
| 441 } |
| 442 |
| 443 // Empty Element. Stored as a child of the current element, but not
stacked. |
| 444 if (mXMLElemEmpty.lookingAt(fPos, status)) { |
| 445 UXMLElement *t = createElement(mXMLElemEmpty, status); |
| 446 el->fChildren.addElement(t, status); |
| 447 continue; |
| 448 } |
| 449 |
| 450 // Hit something within the document that doesn't match anything. |
| 451 // It's an error. |
| 452 error("Unrecognized markup", status); |
| 453 break; |
| 454 } |
| 455 |
| 456 if (el != NULL || !fElementStack.empty()) { |
| 457 // We bailed out early, for some reason. |
| 458 error("Root element not closed.", status); |
| 459 goto errorExit; |
| 460 } |
| 461 } |
| 462 |
| 463 // Root Element parse is complete. |
| 464 // Consume the annoying xml "Misc" that can appear at the end of the doc. |
| 465 parseMisc(status); |
| 466 |
| 467 // We should have reached the end of the input |
| 468 if (fPos != src.length()) { |
| 469 error("Extra content at the end of the document", status); |
| 470 goto errorExit; |
| 471 } |
| 472 |
| 473 // Success! |
| 474 return root; |
| 475 |
| 476 errorExit: |
| 477 delete root; |
| 478 return NULL; |
| 479 } |
| 480 |
| 481 // |
| 482 // createElement |
| 483 // We've just matched an element start tag. Create and fill in a UXMLEleme
nt object |
| 484 // for it. |
| 485 // |
| 486 UXMLElement * |
| 487 UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { |
| 488 // First capture group is the element's name. |
| 489 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status)
, status); |
| 490 |
| 491 // Scan for attributes. |
| 492 int32_t pos = mEl.end(1, status); // The position after the end of the ta
g name |
| 493 |
| 494 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute
on this element. |
| 495 UnicodeString attName = mAttrValue.group(1, status); |
| 496 UnicodeString attValue = mAttrValue.group(2, status); |
| 497 |
| 498 // Trim the quotes from the att value. These are left over from the ori
ginal regex |
| 499 // that parsed the attribue, which couldn't conveniently strip them. |
| 500 attValue.remove(0,1); // one char from the beginning |
| 501 attValue.truncate(attValue.length()-1); // and one from the end. |
| 502 |
| 503 // XML Attribue value normalization. |
| 504 // This is one of the really screwy parts of the XML spec. |
| 505 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize |
| 506 // Note that non-validating parsers must treat all entities as type CDAT
A |
| 507 // which simplifies things some. |
| 508 |
| 509 // Att normalization step 1: normalize any newlines in the attribute va
lue |
| 510 mNewLineNormalizer.reset(attValue); |
| 511 attValue = mNewLineNormalizer.replaceAll(fOneLF, status); |
| 512 |
| 513 // Next change all xml white space chars to plain \u0020 spaces. |
| 514 mAttrNormalizer.reset(attValue); |
| 515 UnicodeString oneSpace((UChar)0x0020); |
| 516 attValue = mAttrNormalizer.replaceAll(oneSpace, status); |
| 517 |
| 518 // Replace character entities. |
| 519 replaceCharRefs(attValue, status); |
| 520 |
| 521 // Save the attribute name and value in our document structure. |
| 522 el->fAttNames.addElement((void *)intern(attName, status), status); |
| 523 el->fAttValues.addElement(attValue.clone(), status); |
| 524 pos = mAttrValue.end(2, status); |
| 525 } |
| 526 fPos = mEl.end(0, status); |
| 527 return el; |
| 528 } |
| 529 |
| 530 // |
| 531 // parseMisc |
| 532 // Consume XML "Misc" [production #27] |
| 533 // which is any combination of space, PI and comments |
| 534 // Need to watch end-of-input because xml MISC stuff is allowed after |
| 535 // the document element, so we WILL scan off the end in this function |
| 536 // |
| 537 void |
| 538 UXMLParser::parseMisc(UErrorCode &status) { |
| 539 for (;;) { |
| 540 if (fPos >= mXMLPI.input().length()) { |
| 541 break; |
| 542 } |
| 543 if (mXMLPI.lookingAt(fPos, status)) { |
| 544 fPos = mXMLPI.end(status); |
| 545 continue; |
| 546 } |
| 547 if (mXMLSP.lookingAt(fPos, status)) { |
| 548 fPos = mXMLSP.end(status); |
| 549 continue; |
| 550 } |
| 551 if (mXMLComment.lookingAt(fPos, status)) { |
| 552 fPos = mXMLComment.end(status); |
| 553 continue; |
| 554 } |
| 555 break; |
| 556 } |
| 557 } |
| 558 |
| 559 // |
| 560 // Scan for document content. |
| 561 // |
| 562 UnicodeString |
| 563 UXMLParser::scanContent(UErrorCode &status) { |
| 564 UnicodeString result; |
| 565 if (mXMLCharData.lookingAt(fPos, status)) { |
| 566 result = mXMLCharData.group((int32_t)0, status); |
| 567 // Normalize the new-lines. (Before char ref substitution) |
| 568 mNewLineNormalizer.reset(result); |
| 569 result = mNewLineNormalizer.replaceAll(fOneLF, status); |
| 570 |
| 571 // TODO: handle CDATA |
| 572 fPos = mXMLCharData.end(0, status); |
| 573 } |
| 574 |
| 575 return result; |
| 576 } |
| 577 |
| 578 // |
| 579 // replaceCharRefs |
| 580 // |
| 581 // replace the char entities < & { ካ etc. in a string |
| 582 // with the corresponding actual character. |
| 583 // |
| 584 void |
| 585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { |
| 586 UnicodeString result; |
| 587 UnicodeString replacement; |
| 588 int i; |
| 589 |
| 590 mAmps.reset(s); |
| 591 // See the initialization for the regex matcher mAmps. |
| 592 // Which entity we've matched is determined by which capture group has co
ntent, |
| 593 // which is flaged by start() of that group not being -1. |
| 594 while (mAmps.find()) { |
| 595 if (mAmps.start(1, status) != -1) { |
| 596 replacement.setTo((UChar)x_AMP); |
| 597 } else if (mAmps.start(2, status) != -1) { |
| 598 replacement.setTo((UChar)x_LT); |
| 599 } else if (mAmps.start(3, status) != -1) { |
| 600 replacement.setTo((UChar)x_GT); |
| 601 } else if (mAmps.start(4, status) != -1) { |
| 602 replacement.setTo((UChar)x_APOS); |
| 603 } else if (mAmps.start(5, status) != -1) { |
| 604 replacement.setTo((UChar)x_QUOT); |
| 605 } else if (mAmps.start(6, status) != -1) { |
| 606 UnicodeString hexString = mAmps.group(6, status); |
| 607 UChar32 val = 0; |
| 608 for (i=0; i<hexString.length(); i++) { |
| 609 val = (val << 4) + u_digit(hexString.charAt(i), 16); |
| 610 } |
| 611 // TODO: some verification that the character is valid |
| 612 replacement.setTo(val); |
| 613 } else if (mAmps.start(7, status) != -1) { |
| 614 UnicodeString decimalString = mAmps.group(7, status); |
| 615 UChar32 val = 0; |
| 616 for (i=0; i<decimalString.length(); i++) { |
| 617 val = val*10 + u_digit(decimalString.charAt(i), 10); |
| 618 } |
| 619 // TODO: some verification that the character is valid |
| 620 replacement.setTo(val); |
| 621 } else { |
| 622 // An unrecognized &entity; Leave it alone. |
| 623 // TODO: check that it really looks like an entity, and is not som
e |
| 624 // random & in the text. |
| 625 replacement = mAmps.group((int32_t)0, status); |
| 626 } |
| 627 mAmps.appendReplacement(result, replacement, status); |
| 628 } |
| 629 mAmps.appendTail(result); |
| 630 s = result; |
| 631 } |
| 632 |
| 633 void |
| 634 UXMLParser::error(const char *message, UErrorCode &status) { |
| 635 // TODO: something better here... |
| 636 const UnicodeString &src=mXMLDecl.input(); |
| 637 int line = 0; |
| 638 int ci = 0; |
| 639 while (ci < fPos && ci>=0) { |
| 640 ci = src.indexOf((UChar)0x0a, ci+1); |
| 641 line++; |
| 642 } |
| 643 fprintf(stderr, "Error: %s at line %d\n", message, line); |
| 644 if (U_SUCCESS(status)) { |
| 645 status = U_PARSE_ERROR; |
| 646 } |
| 647 } |
| 648 |
| 649 // intern strings like in Java |
| 650 |
| 651 const UnicodeString * |
| 652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { |
| 653 const UHashElement *he=fNames.find(s); |
| 654 if(he!=NULL) { |
| 655 // already a known name, return its hashed key pointer |
| 656 return (const UnicodeString *)he->key.pointer; |
| 657 } else { |
| 658 // add this new name and return its hashed key pointer |
| 659 fNames.puti(s, 0, errorCode); |
| 660 he=fNames.find(s); |
| 661 return (const UnicodeString *)he->key.pointer; |
| 662 } |
| 663 } |
| 664 |
| 665 const UnicodeString * |
| 666 UXMLParser::findName(const UnicodeString &s) const { |
| 667 const UHashElement *he=fNames.find(s); |
| 668 if(he!=NULL) { |
| 669 // a known name, return its hashed key pointer |
| 670 return (const UnicodeString *)he->key.pointer; |
| 671 } else { |
| 672 // unknown name |
| 673 return NULL; |
| 674 } |
| 675 } |
| 676 |
| 677 // UXMLElement ------------------------------------------------------------- *** |
| 678 |
| 679 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UE
rrorCode &errorCode) : |
| 680 fParser(parser), |
| 681 fName(name), |
| 682 fAttNames(errorCode), |
| 683 fAttValues(errorCode), |
| 684 fChildren(errorCode), |
| 685 fParent(NULL) |
| 686 { |
| 687 } |
| 688 |
| 689 UXMLElement::~UXMLElement() { |
| 690 int i; |
| 691 // attribute names are owned by the UXMLParser, don't delete them here |
| 692 for (i=fAttValues.size()-1; i>=0; i--) { |
| 693 delete (UObject *)fAttValues.elementAt(i); |
| 694 } |
| 695 for (i=fChildren.size()-1; i>=0; i--) { |
| 696 delete (UObject *)fChildren.elementAt(i); |
| 697 } |
| 698 } |
| 699 |
| 700 const UnicodeString & |
| 701 UXMLElement::getTagName() const { |
| 702 return *fName; |
| 703 } |
| 704 |
| 705 UnicodeString |
| 706 UXMLElement::getText(UBool recurse) const { |
| 707 UnicodeString text; |
| 708 appendText(text, recurse); |
| 709 return text; |
| 710 } |
| 711 |
| 712 void |
| 713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const { |
| 714 const UObject *node; |
| 715 int32_t i, count=fChildren.size(); |
| 716 for(i=0; i<count; ++i) { |
| 717 node=(const UObject *)fChildren.elementAt(i); |
| 718 const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); |
| 719 if(s!=NULL) { |
| 720 text.append(*s); |
| 721 } else if(recurse) /* must be a UXMLElement */ { |
| 722 ((const UXMLElement *)node)->appendText(text, recurse); |
| 723 } |
| 724 } |
| 725 } |
| 726 |
| 727 int32_t |
| 728 UXMLElement::countAttributes() const { |
| 729 return fAttNames.size(); |
| 730 } |
| 731 |
| 732 const UnicodeString * |
| 733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value)
const { |
| 734 if(0<=i && i<fAttNames.size()) { |
| 735 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); |
| 736 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); |
| 737 return &value; // or return (UnicodeString *)fAttValues.elementAt(i); |
| 738 } else { |
| 739 return NULL; |
| 740 } |
| 741 } |
| 742 |
| 743 const UnicodeString * |
| 744 UXMLElement::getAttribute(const UnicodeString &name) const { |
| 745 // search for the attribute name by comparing the interned pointer, |
| 746 // not the string contents |
| 747 const UnicodeString *p=fParser->findName(name); |
| 748 if(p==NULL) { |
| 749 return NULL; // no such attribute seen by the parser at all |
| 750 } |
| 751 |
| 752 int32_t i, count=fAttNames.size(); |
| 753 for(i=0; i<count; ++i) { |
| 754 if(p==(const UnicodeString *)fAttNames.elementAt(i)) { |
| 755 return (const UnicodeString *)fAttValues.elementAt(i); |
| 756 } |
| 757 } |
| 758 return NULL; |
| 759 } |
| 760 |
| 761 int32_t |
| 762 UXMLElement::countChildren() const { |
| 763 return fChildren.size(); |
| 764 } |
| 765 |
| 766 const UObject * |
| 767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { |
| 768 if(0<=i && i<fChildren.size()) { |
| 769 const UObject *node=(const UObject *)fChildren.elementAt(i); |
| 770 if(dynamic_cast<const UXMLElement *>(node)!=NULL) { |
| 771 type=UXML_NODE_TYPE_ELEMENT; |
| 772 } else { |
| 773 type=UXML_NODE_TYPE_STRING; |
| 774 } |
| 775 return node; |
| 776 } else { |
| 777 return NULL; |
| 778 } |
| 779 } |
| 780 |
| 781 const UXMLElement * |
| 782 UXMLElement::nextChildElement(int32_t &i) const { |
| 783 if(i<0) { |
| 784 return NULL; |
| 785 } |
| 786 |
| 787 const UObject *node; |
| 788 int32_t count=fChildren.size(); |
| 789 while(i<count) { |
| 790 node=(const UObject *)fChildren.elementAt(i++); |
| 791 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
| 792 if(elem!=NULL) { |
| 793 return elem; |
| 794 } |
| 795 } |
| 796 return NULL; |
| 797 } |
| 798 |
| 799 const UXMLElement * |
| 800 UXMLElement::getChildElement(const UnicodeString &name) const { |
| 801 // search for the element name by comparing the interned pointer, |
| 802 // not the string contents |
| 803 const UnicodeString *p=fParser->findName(name); |
| 804 if(p==NULL) { |
| 805 return NULL; // no such element seen by the parser at all |
| 806 } |
| 807 |
| 808 const UObject *node; |
| 809 int32_t i, count=fChildren.size(); |
| 810 for(i=0; i<count; ++i) { |
| 811 node=(const UObject *)fChildren.elementAt(i); |
| 812 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
| 813 if(elem!=NULL) { |
| 814 if(p==elem->fName) { |
| 815 return elem; |
| 816 } |
| 817 } |
| 818 } |
| 819 return NULL; |
| 820 } |
| 821 |
| 822 U_NAMESPACE_END |
| 823 |
| 824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */ |
| 825 |
OLD | NEW |