icu46/source/tools/toolutil/xmlparser.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/tools/toolutil/xmlparser.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 *

	4 * Copyright (C) 2004-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 *******************************************************************************

	8 * file name: xmlparser.cpp

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2004jul21

	14 * created by: Andy Heninger

	15 */

	16

	17 #include <stdio.h>

	18 #include "unicode/uchar.h"

	19 #include "unicode/ucnv.h"

	20 #include "unicode/regex.h"

	21 #include "filestrm.h"

	22 #include "xmlparser.h"

	23

	24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

	25

	26 // character constants

	27 enum {

	28 x_QUOT=0x22,

	29 x_AMP=0x26,

	30 x_APOS=0x27,

	31 x_LT=0x3c,

	32 x_GT=0x3e,

	33 x_l=0x6c

	34 };

	35

	36 #define XML_SPACES "[ \\u0009\\u000d\\u000a]"

	37

	38 // XML #4

	39 #define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \

	40 "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C -\\u200D]" \

	41 "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900 -\\uFDCF]" \

	42 "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

	43

	44 // XML #5

	45 #define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\ \u203f-\\u2040]]"

	46

	47 // XML #6

	48 #define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"

	49

	50 U_NAMESPACE_BEGIN

	51

	52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)

	53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)

	54

	55 //

	56 // UXMLParser constructor. Mostly just initializes the ICU regexes that are

	57 // used for parsing.

	58 //

	59 UXMLParser::UXMLParser(UErrorCode &status) :

	60 // XML Declaration. XML Production #23.

	61 // example: "<?xml version=1.0 encoding="utf-16" ?>

	62 // This is a sloppy implementation - just look for the leading <?xml and the closing ?>

	63 // allow for a possible leading BOM.

	64 mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, statu s),

	65

	66 // XML Comment production #15

	67 // example: "<!-- whatever -->

	68 // note, does not detect an illegal "--" within comments

	69 mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),

	70

	71 // XML Spaces

	72 // production [3]

	73 mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),

	74

	75 // XML Doctype decl production #28

	76 // example "<!DOCTYPE foo SYSTEM "somewhere" >

	77 // or "<!DOCTYPE foo [internal dtd]>

	78 // TODO: we don't actually parse the DOCTYPE or internal subsets.

	79 // Some internal dtd subsets could confuse this simple-minded

	80 // attempt at skipping over them, specifically, occcurences

	81 // of closeing square brackets. These could appear in comments,

	82 // or in parameter entity declarations, for example.

	83 mXMLDoctype(UnicodeString(

	84 "(?s)<!DOCTYPE.?(>\|\\[.?\\].*?>)", -1, US_INV

	85 ), 0, status),

	86

	87 // XML PI production #16

	88 // example "<?target stuff?>

	89 mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),

	90

	91 // XML Element Start Productions #40, #41

	92 // example <foo att1='abc' att2="d e f" >

	93 // capture #1: the tag name

	94 //

	95 mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"

	96 "(?:"

	97 XML_SPACES "+" XML_NAME XML_SPACES "=" XML_SPACES "" // ma tch "ATTR_NAME = "

	98 "(?:(?:\\\'[^<\\\']?\\\')\|(?:\\\"[^<\\\"]?\\\"))" // ma tch '"attribute value"'

	99 ")" // for zero or more attributes.

	100 XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"

	101

	102 // XML Element End production #42

	103 // example </foo>

	104 mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV) , 0, status),

	105

	106 // XML Element Empty production #44

	107 // example <foo att1="abc" att2="d e f" />

	108 mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"

	109 "(?:"

	110 XML_SPACES "+" XML_NAME XML_SPACES "=" XML_SPACES "" // ma tch "ATTR_NAME = "

	111 "(?:(?:\\\'[^<\\\']?\\\')\|(?:\\\"[^<\\\"]?\\\"))" // ma tch '"attribute value"'

	112 ")" // for zero or more attributes.

	113 XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"

	114

	115

	116 // XMLCharData. Everything but '<'. Note that & will be dealt with later .

	117 mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

	118

	119 // Attribute name = "value". XML Productions 10, 40/41

	120 // Capture group 1 is name,

	121 // 2 is the attribute value, including the quotes.

	122 //

	123 // Note that attributes are scanned twice. The first time is with

	124 // the regex for an entire element start. There, the attributes

	125 // are checked syntactically, but not separted out one by one.

	126 // Here, we match a single attribute, and make its name and

	127 // attribute value available to the parser code.

	128 mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "=" XM L_SPACES ""

	129 "((?:\\\'[^<\\\']?\\\')\|(?:\\\"[^<\\\"]?\\\"))", -1, US_INV), 0, stat us),

	130

	131

	132 mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

	133

	134 // Match any of the new-line sequences in content.

	135 // All are changed to \u000a.

	136 mNewLineNormalizer(UnicodeString("\\u000d\\u000a\|\\u000d\\u0085\|\\u000a\|\\ u000d\|\\u0085\|\\u2028", -1, US_INV), 0, status),

	137

	138 // & char references

	139 // We will figure out what we've got based on which capture group has co ntent.

	140 // The last one is a catchall for unrecognized entity references..

	141 // 1 2 3 4 5 6 7 8

	142 mAmps(UnicodeString("&(?:(amp;)\|(lt;)\|(gt;)\|(apos;)\|(quot;)\|#x([0-9A-Fa-f] {1,8});\|#([0-9]{1,8});\|(.))"),

	143 0, status),

	144

	145 fNames(status),

	146 fElementStack(status),

	147 fOneLF((UChar)0x0a) // Plain new-line string, used in new line norm alization.

	148 {

	149 }

	150

	151 UXMLParser *

	152 UXMLParser::createParser(UErrorCode &errorCode) {

	153 if (U_FAILURE(errorCode)) {

	154 return NULL;

	155 } else {

	156 return new UXMLParser(errorCode);

	157 }

	158 }

	159

	160 UXMLParser::~UXMLParser() {}

	161

	162 UXMLElement *

	163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {

	164 char bytes[4096], charsetBuffer[100];

	165 FileStream *f;

	166 const char charset, pb;

	167 UnicodeString src;

	168 UConverter *cnv;

	169 UChar buffer, pu;

	170 int32_t fileLength, bytesLength, length, capacity;

	171 UBool flush;

	172

	173 if(U_FAILURE(errorCode)) {

	174 return NULL;

	175 }

	176

	177 f=T_FileStream_open(filename, "rb");

	178 if(f==NULL) {

	179 errorCode=U_FILE_ACCESS_ERROR;

	180 return NULL;

	181 }

	182

	183 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));

	184 if(bytesLength<(int32_t)sizeof(bytes)) {

	185 // we have already read the entire file

	186 fileLength=bytesLength;

	187 } else {

	188 // get the file length

	189 fileLength=T_FileStream_size(f);

	190 }

	191

	192 /*

	193 * get the charset:

	194 * 1. Unicode signature

	195 * 2. treat as ISO-8859-1 and read XML encoding="charser"

	196 * 3. default to UTF-8

	197 */

	198 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);

	199 if(U_SUCCESS(errorCode) && charset!=NULL) {

	200 // open converter according to Unicode signature

	201 cnv=ucnv_open(charset, &errorCode);

	202 } else {

	203 // read as Latin-1 and parse the XML declaration and encoding

	204 cnv=ucnv_open("ISO-8859-1", &errorCode);

	205 if(U_FAILURE(errorCode)) {

	206 // unexpected error opening Latin-1 converter

	207 goto exit;

	208 }

	209

	210 buffer=src.getBuffer(bytesLength);

	211 if(buffer==NULL) {

	212 // unexpected failure to reserve some string capacity

	213 errorCode=U_MEMORY_ALLOCATION_ERROR;

	214 goto exit;

	215 }

	216 pb=bytes;

	217 pu=buffer;

	218 ucnv_toUnicode(

	219 cnv,

	220 &pu, buffer+src.getCapacity(),

	221 &pb, bytes+bytesLength,

	222 NULL, TRUE, &errorCode);

	223 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);

	224 ucnv_close(cnv);

	225 cnv=NULL;

	226 if(U_FAILURE(errorCode)) {

	227 // unexpected error in conversion from Latin-1

	228 src.remove();

	229 goto exit;

	230 }

	231

	232 // parse XML declaration

	233 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {

	234 int32_t declEnd=mXMLDecl.end(errorCode);

	235 // go beyond <?xml

	236 int32_t pos=src.indexOf((UChar)x_l)+1;

	237

	238 mAttrValue.reset(src);

	239 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loo p runs once per attribute on this element.

	240 UnicodeString attName = mAttrValue.group(1, errorCode);

	241 UnicodeString attValue = mAttrValue.group(2, errorCode);

	242

	243 // Trim the quotes from the att value. These are left over from the original regex

	244 // that parsed the attribue, which couldn't conveniently strip them.

	245 attValue.remove(0,1); // one char from the be ginning

	246 attValue.truncate(attValue.length()-1); // and one from the end .

	247

	248 if(attName==UNICODE_STRING("encoding", 8)) {

	249 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32 _t)sizeof(charsetBuffer));

	250 charset=charsetBuffer;

	251 break;

	252 }

	253 pos = mAttrValue.end(2, errorCode);

	254 }

	255

	256 if(charset==NULL) {

	257 // default to UTF-8

	258 charset="UTF-8";

	259 }

	260 cnv=ucnv_open(charset, &errorCode);

	261 }

	262 }

	263

	264 if(U_FAILURE(errorCode)) {

	265 // unable to open the converter

	266 goto exit;

	267 }

	268

	269 // convert the file contents

	270 capacity=fileLength; // estimated capacity

	271 src.getBuffer(capacity);

	272 src.releaseBuffer(0); // zero length

	273 flush=FALSE;

	274 for(;;) {

	275 // convert contents of bytes[bytesLength]

	276 pb=bytes;

	277 for(;;) {

	278 length=src.length();

	279 buffer=src.getBuffer(capacity);

	280 if(buffer==NULL) {

	281 // unexpected failure to reserve some string capacity

	282 errorCode=U_MEMORY_ALLOCATION_ERROR;

	283 goto exit;

	284 }

	285

	286 pu=buffer+length;

	287 ucnv_toUnicode(

	288 cnv, &pu, buffer+src.getCapacity(),

	289 &pb, bytes+bytesLength,

	290 NULL, FALSE, &errorCode);

	291 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);

	292 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {

	293 errorCode=U_ZERO_ERROR;

	294 capacity=(3*src.getCapacity())/2; // increase capacity by 50%

	295 } else {

	296 break;

	297 }

	298 }

	299

	300 if(U_FAILURE(errorCode)) {

	301 break; // conversion error

	302 }

	303

	304 if(flush) {

	305 break; // completely converted the file

	306 }

	307

	308 // read next block

	309 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));

	310 if(bytesLength==0) {

	311 // reached end of file, convert once more to flush the converter

	312 flush=TRUE;

	313 }

	314 };

	315

	316 exit:

	317 ucnv_close(cnv);

	318 T_FileStream_close(f);

	319

	320 if(U_SUCCESS(errorCode)) {

	321 return parse(src, errorCode);

	322 } else {

	323 return NULL;

	324 }

	325 }

	326

	327 UXMLElement *

	328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {

	329 if(U_FAILURE(status)) {

	330 return NULL;

	331 }

	332

	333 UXMLElement *root = NULL;

	334 fPos = 0; // TODO use just a local pos variable and pass it into functions

	335 // where necessary?

	336

	337 // set all matchers to work on the input string

	338 mXMLDecl.reset(src);

	339 mXMLComment.reset(src);

	340 mXMLSP.reset(src);

	341 mXMLDoctype.reset(src);

	342 mXMLPI.reset(src);

	343 mXMLElemStart.reset(src);

	344 mXMLElemEnd.reset(src);

	345 mXMLElemEmpty.reset(src);

	346 mXMLCharData.reset(src);

	347 mAttrValue.reset(src);

	348 mAttrNormalizer.reset(src);

	349 mNewLineNormalizer.reset(src);

	350 mAmps.reset(src);

	351

	352 // Consume the XML Declaration, if present.

	353 if (mXMLDecl.lookingAt(fPos, status)) {

	354 fPos = mXMLDecl.end(status);

	355 }

	356

	357 // Consume "misc" [XML production 27] appearing before DocType

	358 parseMisc(status);

	359

	360 // Consume a DocType declaration, if present.

	361 if (mXMLDoctype.lookingAt(fPos, status)) {

	362 fPos = mXMLDoctype.end(status);

	363 }

	364

	365 // Consume additional "misc" [XML production 27] appearing after the DocType

	366 parseMisc(status);

	367

	368 // Get the root element

	369 if (mXMLElemEmpty.lookingAt(fPos, status)) {

	370 // Root is an empty element (no nested elements or content)

	371 root = createElement(mXMLElemEmpty, status);

	372 fPos = mXMLElemEmpty.end(status);

	373 } else {

	374 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {

	375 error("Root Element expected", status);

	376 goto errorExit;

	377 }

	378 root = createElement(mXMLElemStart, status);

	379 UXMLElement *el = root;

	380

	381 //

	382 // This is the loop that consumes the root element of the document,

	383 // including all nested content. Nested elements are handled by

	384 // explicit pushes/pops of the element stack; there is no recursion

	385 // in the control flow of this code.

	386 // "el" always refers to the current element, the one to which cont ent

	387 // is being added. It is above the top of the element stack.

	388 for (;;) {

	389 // Nested Element Start

	390 if (mXMLElemStart.lookingAt(fPos, status)) {

	391 UXMLElement *t = createElement(mXMLElemStart, status);

	392 el->fChildren.addElement(t, status);

	393 t->fParent = el;

	394 fElementStack.push(el, status);

	395 el = t;

	396 continue;

	397 }

	398

	399 // Text Content. String is concatenated onto the current node's con tent,

	400 // but only if it contains something other than space s.

	401 UnicodeString s = scanContent(status);

	402 if (s.length() > 0) {

	403 mXMLSP.reset(s);

	404 if (mXMLSP.matches(status) == FALSE) {

	405 // This chunk of text contains something other than just

	406 // white space. Make a child node for it.

	407 replaceCharRefs(s, status);

	408 el->fChildren.addElement(s.clone(), status);

	409 }

	410 mXMLSP.reset(src); // The matchers need to stay set to the ma in input string.

	411 continue;

	412 }

	413

	414 // Comments. Discard.

	415 if (mXMLComment.lookingAt(fPos, status)) {

	416 fPos = mXMLComment.end(status);

	417 continue;

	418 }

	419

	420 // PIs. Discard.

	421 if (mXMLPI.lookingAt(fPos, status)) {

	422 fPos = mXMLPI.end(status);

	423 continue;

	424 }

	425

	426 // Element End

	427 if (mXMLElemEnd.lookingAt(fPos, status)) {

	428 fPos = mXMLElemEnd.end(0, status);

	429 const UnicodeString name = mXMLElemEnd.group(1, status);

	430 if (name != *el->fName) {

	431 error("Element start / end tag mismatch", status);

	432 goto errorExit;

	433 }

	434 if (fElementStack.empty()) {

	435 // Close of the root element. We're done with the doc.

	436 el = NULL;

	437 break;

	438 }

	439 el = (UXMLElement *)fElementStack.pop();

	440 continue;

	441 }

	442

	443 // Empty Element. Stored as a child of the current element, but not stacked.

	444 if (mXMLElemEmpty.lookingAt(fPos, status)) {

	445 UXMLElement *t = createElement(mXMLElemEmpty, status);

	446 el->fChildren.addElement(t, status);

	447 continue;

	448 }

	449

	450 // Hit something within the document that doesn't match anything.

	451 // It's an error.

	452 error("Unrecognized markup", status);

	453 break;

	454 }

	455

	456 if (el != NULL \|\| !fElementStack.empty()) {

	457 // We bailed out early, for some reason.

	458 error("Root element not closed.", status);

	459 goto errorExit;

	460 }

	461 }

	462

	463 // Root Element parse is complete.

	464 // Consume the annoying xml "Misc" that can appear at the end of the doc.

	465 parseMisc(status);

	466

	467 // We should have reached the end of the input

	468 if (fPos != src.length()) {

	469 error("Extra content at the end of the document", status);

	470 goto errorExit;

	471 }

	472

	473 // Success!

	474 return root;

	475

	476 errorExit:

	477 delete root;

	478 return NULL;

	479 }

	480

	481 //

	482 // createElement

	483 // We've just matched an element start tag. Create and fill in a UXMLEleme nt object

	484 // for it.

	485 //

	486 UXMLElement *

	487 UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {

	488 // First capture group is the element's name.

	489 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status) , status);

	490

	491 // Scan for attributes.

	492 int32_t pos = mEl.end(1, status); // The position after the end of the ta g name

	493

	494 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element.

	495 UnicodeString attName = mAttrValue.group(1, status);

	496 UnicodeString attValue = mAttrValue.group(2, status);

	497

	498 // Trim the quotes from the att value. These are left over from the ori ginal regex

	499 // that parsed the attribue, which couldn't conveniently strip them.

	500 attValue.remove(0,1); // one char from the beginning

	501 attValue.truncate(attValue.length()-1); // and one from the end.

	502

	503 // XML Attribue value normalization.

	504 // This is one of the really screwy parts of the XML spec.

	505 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize

	506 // Note that non-validating parsers must treat all entities as type CDAT A

	507 // which simplifies things some.

	508

	509 // Att normalization step 1: normalize any newlines in the attribute va lue

	510 mNewLineNormalizer.reset(attValue);

	511 attValue = mNewLineNormalizer.replaceAll(fOneLF, status);

	512

	513 // Next change all xml white space chars to plain \u0020 spaces.

	514 mAttrNormalizer.reset(attValue);

	515 UnicodeString oneSpace((UChar)0x0020);

	516 attValue = mAttrNormalizer.replaceAll(oneSpace, status);

	517

	518 // Replace character entities.

	519 replaceCharRefs(attValue, status);

	520

	521 // Save the attribute name and value in our document structure.

	522 el->fAttNames.addElement((void *)intern(attName, status), status);

	523 el->fAttValues.addElement(attValue.clone(), status);

	524 pos = mAttrValue.end(2, status);

	525 }

	526 fPos = mEl.end(0, status);

	527 return el;

	528 }

	529

	530 //

	531 // parseMisc

	532 // Consume XML "Misc" [production #27]

	533 // which is any combination of space, PI and comments

	534 // Need to watch end-of-input because xml MISC stuff is allowed after

	535 // the document element, so we WILL scan off the end in this function

	536 //

	537 void

	538 UXMLParser::parseMisc(UErrorCode &status) {

	539 for (;;) {

	540 if (fPos >= mXMLPI.input().length()) {

	541 break;

	542 }

	543 if (mXMLPI.lookingAt(fPos, status)) {

	544 fPos = mXMLPI.end(status);

	545 continue;

	546 }

	547 if (mXMLSP.lookingAt(fPos, status)) {

	548 fPos = mXMLSP.end(status);

	549 continue;

	550 }

	551 if (mXMLComment.lookingAt(fPos, status)) {

	552 fPos = mXMLComment.end(status);

	553 continue;

	554 }

	555 break;

	556 }

	557 }

	558

	559 //

	560 // Scan for document content.

	561 //

	562 UnicodeString

	563 UXMLParser::scanContent(UErrorCode &status) {

	564 UnicodeString result;

	565 if (mXMLCharData.lookingAt(fPos, status)) {

	566 result = mXMLCharData.group((int32_t)0, status);

	567 // Normalize the new-lines. (Before char ref substitution)

	568 mNewLineNormalizer.reset(result);

	569 result = mNewLineNormalizer.replaceAll(fOneLF, status);

	570

	571 // TODO: handle CDATA

	572 fPos = mXMLCharData.end(0, status);

	573 }

	574

	575 return result;

	576 }

	577

	578 //

	579 // replaceCharRefs

	580 //

	581 // replace the char entities < & { ካ etc. in a string

	582 // with the corresponding actual character.

	583 //

	584 void

	585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {

	586 UnicodeString result;

	587 UnicodeString replacement;

	588 int i;

	589

	590 mAmps.reset(s);

	591 // See the initialization for the regex matcher mAmps.

	592 // Which entity we've matched is determined by which capture group has co ntent,

	593 // which is flaged by start() of that group not being -1.

	594 while (mAmps.find()) {

	595 if (mAmps.start(1, status) != -1) {

	596 replacement.setTo((UChar)x_AMP);

	597 } else if (mAmps.start(2, status) != -1) {

	598 replacement.setTo((UChar)x_LT);

	599 } else if (mAmps.start(3, status) != -1) {

	600 replacement.setTo((UChar)x_GT);

	601 } else if (mAmps.start(4, status) != -1) {

	602 replacement.setTo((UChar)x_APOS);

	603 } else if (mAmps.start(5, status) != -1) {

	604 replacement.setTo((UChar)x_QUOT);

	605 } else if (mAmps.start(6, status) != -1) {

	606 UnicodeString hexString = mAmps.group(6, status);

	607 UChar32 val = 0;

	608 for (i=0; i<hexString.length(); i++) {

	609 val = (val << 4) + u_digit(hexString.charAt(i), 16);

	610 }

	611 // TODO: some verification that the character is valid

	612 replacement.setTo(val);

	613 } else if (mAmps.start(7, status) != -1) {

	614 UnicodeString decimalString = mAmps.group(7, status);

	615 UChar32 val = 0;

	616 for (i=0; i<decimalString.length(); i++) {

	617 val = val*10 + u_digit(decimalString.charAt(i), 10);

	618 }

	619 // TODO: some verification that the character is valid

	620 replacement.setTo(val);

	621 } else {

	622 // An unrecognized &entity; Leave it alone.

	623 // TODO: check that it really looks like an entity, and is not som e

	624 // random & in the text.

	625 replacement = mAmps.group((int32_t)0, status);

	626 }

	627 mAmps.appendReplacement(result, replacement, status);

	628 }

	629 mAmps.appendTail(result);

	630 s = result;

	631 }

	632

	633 void

	634 UXMLParser::error(const char *message, UErrorCode &status) {

	635 // TODO: something better here...

	636 const UnicodeString &src=mXMLDecl.input();

	637 int line = 0;

	638 int ci = 0;

	639 while (ci < fPos && ci>=0) {

	640 ci = src.indexOf((UChar)0x0a, ci+1);

	641 line++;

	642 }

	643 fprintf(stderr, "Error: %s at line %d\n", message, line);

	644 if (U_SUCCESS(status)) {

	645 status = U_PARSE_ERROR;

	646 }

	647 }

	648

	649 // intern strings like in Java

	650

	651 const UnicodeString *

	652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {

	653 const UHashElement *he=fNames.find(s);

	654 if(he!=NULL) {

	655 // already a known name, return its hashed key pointer

	656 return (const UnicodeString *)he->key.pointer;

	657 } else {

	658 // add this new name and return its hashed key pointer

	659 fNames.puti(s, 0, errorCode);

	660 he=fNames.find(s);

	661 return (const UnicodeString *)he->key.pointer;

	662 }

	663 }

	664

	665 const UnicodeString *

	666 UXMLParser::findName(const UnicodeString &s) const {

	667 const UHashElement *he=fNames.find(s);

	668 if(he!=NULL) {

	669 // a known name, return its hashed key pointer

	670 return (const UnicodeString *)he->key.pointer;

	671 } else {

	672 // unknown name

	673 return NULL;

	674 }

	675 }

	676

	677 // UXMLElement ------------------------------------------------------------- ***

	678

	679 UXMLElement::UXMLElement(const UXMLParser parser, const UnicodeString name, UE rrorCode &errorCode) :

	680 fParser(parser),

	681 fName(name),

	682 fAttNames(errorCode),

	683 fAttValues(errorCode),

	684 fChildren(errorCode),

	685 fParent(NULL)

	686 {

	687 }

	688

	689 UXMLElement::~UXMLElement() {

	690 int i;

	691 // attribute names are owned by the UXMLParser, don't delete them here

	692 for (i=fAttValues.size()-1; i>=0; i--) {

	693 delete (UObject *)fAttValues.elementAt(i);

	694 }

	695 for (i=fChildren.size()-1; i>=0; i--) {

	696 delete (UObject *)fChildren.elementAt(i);

	697 }

	698 }

	699

	700 const UnicodeString &

	701 UXMLElement::getTagName() const {

	702 return *fName;

	703 }

	704

	705 UnicodeString

	706 UXMLElement::getText(UBool recurse) const {

	707 UnicodeString text;

	708 appendText(text, recurse);

	709 return text;

	710 }

	711

	712 void

	713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {

	714 const UObject *node;

	715 int32_t i, count=fChildren.size();

	716 for(i=0; i<count; ++i) {

	717 node=(const UObject *)fChildren.elementAt(i);

	718 const UnicodeString s=dynamic_cast<const UnicodeString >(node);

	719 if(s!=NULL) {

	720 text.append(*s);

	721 } else if(recurse) /* must be a UXMLElement */ {

	722 ((const UXMLElement *)node)->appendText(text, recurse);

	723 }

	724 }

	725 }

	726

	727 int32_t

	728 UXMLElement::countAttributes() const {

	729 return fAttNames.size();

	730 }

	731

	732 const UnicodeString *

	733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {

	734 if(0<=i && i<fAttNames.size()) {

	735 name.setTo((const UnicodeString )fAttNames.elementAt(i));

	736 value.setTo((const UnicodeString )fAttValues.elementAt(i));

	737 return &value; // or return (UnicodeString *)fAttValues.elementAt(i);

	738 } else {

	739 return NULL;

	740 }

	741 }

	742

	743 const UnicodeString *

	744 UXMLElement::getAttribute(const UnicodeString &name) const {

	745 // search for the attribute name by comparing the interned pointer,

	746 // not the string contents

	747 const UnicodeString *p=fParser->findName(name);

	748 if(p==NULL) {

	749 return NULL; // no such attribute seen by the parser at all

	750 }

	751

	752 int32_t i, count=fAttNames.size();

	753 for(i=0; i<count; ++i) {

	754 if(p==(const UnicodeString *)fAttNames.elementAt(i)) {

	755 return (const UnicodeString *)fAttValues.elementAt(i);

	756 }

	757 }

	758 return NULL;

	759 }

	760

	761 int32_t

	762 UXMLElement::countChildren() const {

	763 return fChildren.size();

	764 }

	765

	766 const UObject *

	767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {

	768 if(0<=i && i<fChildren.size()) {

	769 const UObject node=(const UObject )fChildren.elementAt(i);

	770 if(dynamic_cast<const UXMLElement *>(node)!=NULL) {

	771 type=UXML_NODE_TYPE_ELEMENT;

	772 } else {

	773 type=UXML_NODE_TYPE_STRING;

	774 }

	775 return node;

	776 } else {

	777 return NULL;

	778 }

	779 }

	780

	781 const UXMLElement *

	782 UXMLElement::nextChildElement(int32_t &i) const {

	783 if(i<0) {

	784 return NULL;

	785 }

	786

	787 const UObject *node;

	788 int32_t count=fChildren.size();

	789 while(i<count) {

	790 node=(const UObject *)fChildren.elementAt(i++);

	791 const UXMLElement elem=dynamic_cast<const UXMLElement >(node);

	792 if(elem!=NULL) {

	793 return elem;

	794 }

	795 }

	796 return NULL;

	797 }

	798

	799 const UXMLElement *

	800 UXMLElement::getChildElement(const UnicodeString &name) const {

	801 // search for the element name by comparing the interned pointer,

	802 // not the string contents

	803 const UnicodeString *p=fParser->findName(name);

	804 if(p==NULL) {

	805 return NULL; // no such element seen by the parser at all

	806 }

	807

	808 const UObject *node;

	809 int32_t i, count=fChildren.size();

	810 for(i=0; i<count; ++i) {

	811 node=(const UObject *)fChildren.elementAt(i);

	812 const UXMLElement elem=dynamic_cast<const UXMLElement >(node);

	813 if(elem!=NULL) {

	814 if(p==elem->fName) {

	815 return elem;

	816 }

	817 }

	818 }

	819 return NULL;

	820 }

	821

	822 U_NAMESPACE_END

	823

	824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */

	825

OLD	NEW

« no previous file with comments | « icu46/source/tools/toolutil/xmlparser.h ('k') | icu46/source/tools/tzcode/Makefile.in » ('j') | no next file with comments »