| Index: icu46/source/tools/toolutil/xmlparser.cpp
|
| ===================================================================
|
| --- icu46/source/tools/toolutil/xmlparser.cpp (revision 0)
|
| +++ icu46/source/tools/toolutil/xmlparser.cpp (revision 0)
|
| @@ -0,0 +1,825 @@
|
| +/*
|
| +*******************************************************************************
|
| +*
|
| +* Copyright (C) 2004-2010, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*
|
| +*******************************************************************************
|
| +* file name: xmlparser.cpp
|
| +* encoding: US-ASCII
|
| +* tab size: 8 (not used)
|
| +* indentation:4
|
| +*
|
| +* created on: 2004jul21
|
| +* created by: Andy Heninger
|
| +*/
|
| +
|
| +#include <stdio.h>
|
| +#include "unicode/uchar.h"
|
| +#include "unicode/ucnv.h"
|
| +#include "unicode/regex.h"
|
| +#include "filestrm.h"
|
| +#include "xmlparser.h"
|
| +
|
| +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
|
| +
|
| +// Character constants: ASCII code points of the characters the parser looks for.
|
| +enum {
|
| + x_QUOT=0x22, // '"'
|
| + x_AMP=0x26, // '&'
|
| + x_APOS=0x27, // '\''
|
| + x_LT=0x3c, // '<'
|
| + x_GT=0x3e, // '>'
|
| + x_l=0x6c // 'l' (lowercase L)
|
| +};
|
| +
|
| +// Regex set matching one XML white space character: space, tab, CR or LF.
|
| +#define XML_SPACES "[ \\u0009\\u000d\\u000a]"
|
| +
|
| +// XML #4: regex set matching one character that may start an XML name.
|
| +#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
|
| + "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
|
| + "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
|
| + "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
|
| +
|
| +// XML #5: regex set matching one name character: any name start character, or '-', '.', a digit, etc.
|
| +#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
|
| +
|
| +// XML #6: regex for a whole name: one name start character followed by zero or more name characters.
|
| +#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
|
| +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
|
| +
|
| +//
|
| +// UXMLParser constructor. Mostly just initializes the ICU regexes that are
|
| +// used for parsing. Every matcher receives the same status argument.
|
| +//
|
| +UXMLParser::UXMLParser(UErrorCode &status) :
|
| + // XML Declaration. XML Production #23.
|
| + // example: "<?xml version=1.0 encoding="utf-16" ?>
|
| + // This is a sloppy implementation - just look for the leading <?xml and the closing ?>
|
| + // allow for a possible leading BOM.
|
| + mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
|
| +
|
| + // XML Comment production #15
|
| + // example: "<!-- whatever -->
|
| + // note, does not detect an illegal "--" within comments
|
| + mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
|
| +
|
| + // XML Spaces
|
| + // production [3]
|
| + mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
|
| +
|
| + // XML Doctype decl production #28
|
| + // example "<!DOCTYPE foo SYSTEM "somewhere" >
|
| + // or "<!DOCTYPE foo [internal dtd]>
|
| + // TODO: we don't actually parse the DOCTYPE or internal subsets.
|
| + // Some internal dtd subsets could confuse this simple-minded
|
| + // attempt at skipping over them, specifically, occurrences
|
| + // of closing square brackets. These could appear in comments,
|
| + // or in parameter entity declarations, for example.
|
| + mXMLDoctype(UnicodeString(
|
| + "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
|
| + ), 0, status),
|
| +
|
| + // XML PI production #16
|
| + // example "<?target stuff?>
|
| + mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
|
| +
|
| + // XML Element Start Productions #40, #41
|
| + // example <foo att1='abc' att2="d e f" >
|
| + // capture #1: the tag name
|
| + //
|
| + mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
|
| + "(?:"
|
| + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
|
| + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
|
| + ")*" // * for zero or more attributes.
|
| + XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
|
| +
|
| + // XML Element End production #42
|
| + // example </foo>
|
| + // capture #1: the tag name
|
| + mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
|
| +
|
| + // XML Element Empty production #44
|
| + // example <foo att1="abc" att2="d e f" />
|
| + // Same pattern as mXMLElemStart except that it must end in "/>".
|
| + mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
|
| + "(?:"
|
| + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
|
| + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
|
| + ")*" // * for zero or more attributes.
|
| + XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
|
| +
|
| +
|
| + // XMLCharData. Everything but '<'. Note that & will be dealt with later.
|
| + // The pattern can match the empty string.
|
| + mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
|
| +
|
| + // Attribute name = "value". XML Productions 10, 40/41
|
| + // Capture group 1 is name,
|
| + // 2 is the attribute value, including the quotes.
|
| + //
|
| + // Note that attributes are scanned twice. The first time is with
|
| + // the regex for an entire element start. There, the attributes
|
| + // are checked syntactically, but not separated out one by one.
|
| + // Here, we match a single attribute, and make its name and
|
| + // attribute value available to the parser code.
|
| + mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
|
| + "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
|
| +
|
| +
|
| + // Matches a single XML white space character.
|
| + mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
|
| +
|
| + // Match any of the new-line sequences in content.
|
| + // All are changed to \u000a.
|
| + mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
|
| +
|
| + // & char references
|
| + // We will figure out what we've got based on which capture group has content.
|
| + // The last one is a catchall for unrecognized entity references..
|
| + // Groups: 1 amp; 2 lt; 3 gt; 4 apos; 5 quot; 6 hex digits; 7 decimal digits; 8 any other single char
|
| + // NOTE(review): unlike the other patterns, this UnicodeString is built without (-1, US_INV) - confirm that is intended.
|
| + mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
|
| + 0, status),
|
| +
|
| + fNames(status),
|
| + fElementStack(status),
|
| + fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization.
|
| + {
|
| + }
|
| +
|
| +//
|
| +// createParser
|
| +//    Factory function. Returns a new parser, or NULL with errorCode set to
|
| +//    a failure code. Does nothing if errorCode already indicates a failure.
|
| +//
|
| +UXMLParser *
|
| +UXMLParser::createParser(UErrorCode &errorCode) {
|
| +    if (U_FAILURE(errorCode)) {
|
| +        return NULL;
|
| +    }
|
| +
|
| +    UXMLParser *parser = new UXMLParser(errorCode);
|
| +    if (parser == NULL) {
|
| +        errorCode = U_MEMORY_ALLOCATION_ERROR;
|
| +    } else if (U_FAILURE(errorCode)) {
|
| +        // The constructor reported a failure (for example a regex that did
|
| +        // not compile). Do not hand out a parser that cannot work.
|
| +        delete parser;
|
| +        parser = NULL;
|
| +    }
|
| +    return parser;
|
| +}
|
| +
|
| +// Destructor. The body is empty: nothing is released explicitly here.
|
| +UXMLParser::~UXMLParser() {}
|
| +
|
| +//
|
| +// parseFile
|
| +//    Read the named file, convert its bytes to Unicode and hand the text
|
| +//    to parse().
|
| +//
|
| +//    The charset is determined in this order:
|
| +//      1. a Unicode signature at the start of the file
|
| +//      2. the encoding="charset" pseudo-attribute of the XML declaration,
|
| +//         found by reading the first block of the file as ISO-8859-1
|
| +//      3. UTF-8
|
| +//
|
| +//    Returns the result of parse(), or NULL with errorCode set to a failure
|
| +//    code (U_FILE_ACCESS_ERROR if the file cannot be opened).
|
| +//
|
| +UXMLElement *
|
| +UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
|
| +    char bytes[4096], charsetBuffer[100];
|
| +    FileStream *f;
|
| +    const char *charset, *pb;
|
| +    UnicodeString src;
|
| +    UConverter *cnv=NULL;
|
| +    UChar *buffer, *pu;
|
| +    int32_t fileLength, bytesLength, length, capacity;
|
| +    UBool flush;
|
| +
|
| +    if(U_FAILURE(errorCode)) {
|
| +        return NULL;
|
| +    }
|
| +
|
| +    f=T_FileStream_open(filename, "rb");
|
| +    if(f==NULL) {
|
| +        errorCode=U_FILE_ACCESS_ERROR;
|
| +        return NULL;
|
| +    }
|
| +
|
| +    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
|
| +    if(bytesLength<(int32_t)sizeof(bytes)) {
|
| +        // we have already read the entire file
|
| +        fileLength=bytesLength;
|
| +    } else {
|
| +        // get the file length
|
| +        fileLength=T_FileStream_size(f);
|
| +    }
|
| +
|
| +    /*
|
| +     * get the charset:
|
| +     * 1. Unicode signature
|
| +     * 2. treat as ISO-8859-1 and read XML encoding="charset"
|
| +     * 3. default to UTF-8
|
| +     */
|
| +    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
|
| +    if(U_SUCCESS(errorCode) && charset!=NULL) {
|
| +        // open converter according to Unicode signature
|
| +        cnv=ucnv_open(charset, &errorCode);
|
| +    } else {
|
| +        // read as Latin-1 and parse the XML declaration and encoding
|
| +        cnv=ucnv_open("ISO-8859-1", &errorCode);
|
| +        if(U_FAILURE(errorCode)) {
|
| +            // unexpected error opening Latin-1 converter
|
| +            goto exit;
|
| +        }
|
| +
|
| +        buffer=src.getBuffer(bytesLength);
|
| +        if(buffer==NULL) {
|
| +            // unexpected failure to reserve some string capacity
|
| +            errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| +            goto exit;
|
| +        }
|
| +        pb=bytes;
|
| +        pu=buffer;
|
| +        ucnv_toUnicode(
|
| +            cnv,
|
| +            &pu, buffer+src.getCapacity(),
|
| +            &pb, bytes+bytesLength,
|
| +            NULL, TRUE, &errorCode);
|
| +        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
|
| +        ucnv_close(cnv);
|
| +        cnv=NULL;
|
| +        if(U_FAILURE(errorCode)) {
|
| +            // unexpected error in conversion from Latin-1
|
| +            src.remove();
|
| +            goto exit;
|
| +        }
|
| +
|
| +        // parse XML declaration
|
| +        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
|
| +            int32_t declEnd=mXMLDecl.end(errorCode);
|
| +            // go beyond <?xml
|
| +            int32_t pos=src.indexOf((UChar)x_l)+1;
|
| +
|
| +            mAttrValue.reset(src);
|
| +            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per pseudo-attribute of the declaration.
|
| +                UnicodeString attName  = mAttrValue.group(1, errorCode);
|
| +                UnicodeString attValue = mAttrValue.group(2, errorCode);
|
| +
|
| +                // Trim the quotes from the att value. These are left over from the original regex
|
| +                // that parsed the attribute, which couldn't conveniently strip them.
|
| +                attValue.remove(0,1);                    // one char from the beginning
|
| +                attValue.truncate(attValue.length()-1);  // and one from the end.
|
| +
|
| +                if(attName==UNICODE_STRING("encoding", 8)) {
|
| +                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
|
| +                    if(length>=(int32_t)sizeof(charsetBuffer)) {
|
| +                        // The name did not fit, so charsetBuffer is not
|
| +                        // NUL-terminated and must not be used as a C string.
|
| +                        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
| +                        goto exit;
|
| +                    }
|
| +                    charset=charsetBuffer;
|
| +                    break;
|
| +                }
|
| +                pos = mAttrValue.end(2, errorCode);
|
| +            }
|
| +        }
|
| +
|
| +        // Open the real converter whether or not there was an XML
|
| +        // declaration; without one, cnv would stay NULL and the conversion
|
| +        // below could not run.
|
| +        if(charset==NULL) {
|
| +            // default to UTF-8
|
| +            charset="UTF-8";
|
| +        }
|
| +        cnv=ucnv_open(charset, &errorCode);
|
| +    }
|
| +
|
| +    if(U_FAILURE(errorCode)) {
|
| +        // unable to open the converter
|
| +        goto exit;
|
| +    }
|
| +
|
| +    // convert the file contents
|
| +    capacity=fileLength;        // estimated capacity
|
| +    src.getBuffer(capacity);
|
| +    src.releaseBuffer(0);       // zero length
|
| +    flush=FALSE;
|
| +    for(;;) {
|
| +        // convert contents of bytes[bytesLength]
|
| +        pb=bytes;
|
| +        for(;;) {
|
| +            length=src.length();
|
| +            buffer=src.getBuffer(capacity);
|
| +            if(buffer==NULL) {
|
| +                // unexpected failure to reserve some string capacity
|
| +                errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| +                goto exit;
|
| +            }
|
| +
|
| +            pu=buffer+length;
|
| +            // Pass the flush flag so that the final call, made with no more
|
| +            // input, really flushes the converter.
|
| +            ucnv_toUnicode(
|
| +                cnv, &pu, buffer+src.getCapacity(),
|
| +                &pb, bytes+bytesLength,
|
| +                NULL, flush, &errorCode);
|
| +            // On a buffer overflow the text converted so far is valid and pb
|
| +            // has advanced past it, so it must be kept, not discarded.
|
| +            src.releaseBuffer(
|
| +                (U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) ?
|
| +                    (int32_t)(pu-buffer) : 0);
|
| +            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
| +                errorCode=U_ZERO_ERROR;
|
| +                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
|
| +            } else {
|
| +                break;
|
| +            }
|
| +        }
|
| +
|
| +        if(U_FAILURE(errorCode)) {
|
| +            break; // conversion error
|
| +        }
|
| +
|
| +        if(flush) {
|
| +            break; // completely converted the file
|
| +        }
|
| +
|
| +        // read next block
|
| +        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
|
| +        if(bytesLength==0) {
|
| +            // reached end of file, convert once more to flush the converter
|
| +            flush=TRUE;
|
| +        }
|
| +    };
|
| +
|
| +exit:
|
| +    ucnv_close(cnv);
|
| +    T_FileStream_close(f);
|
| +
|
| +    if(U_SUCCESS(errorCode)) {
|
| +        return parse(src, errorCode);
|
| +    } else {
|
| +        return NULL;
|
| +    }
|
| +}
|
| +
|
| +//
|
| +// parse
|
| +//    Parse the XML text in src and build a tree of UXMLElement objects.
|
| +//    Returns the root element, or NULL after a failure; in that case the
|
| +//    partially built tree has been deleted and status holds a failure code.
|
| +//
|
| +UXMLElement *
|
| +UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
|
| +    if(U_FAILURE(status)) {
|
| +        return NULL;
|
| +    }
|
| +
|
| +    UXMLElement *root = NULL;
|
| +    fPos = 0; // TODO use just a local pos variable and pass it into functions
|
| +              // where necessary?
|
| +
|
| +    // Start with an empty element stack. It is a member, so without this
|
| +    // anything left on it by an earlier parse would still be there.
|
| +    fElementStack.removeAllElements();
|
| +
|
| +    // set all matchers to work on the input string
|
| +    mXMLDecl.reset(src);
|
| +    mXMLComment.reset(src);
|
| +    mXMLSP.reset(src);
|
| +    mXMLDoctype.reset(src);
|
| +    mXMLPI.reset(src);
|
| +    mXMLElemStart.reset(src);
|
| +    mXMLElemEnd.reset(src);
|
| +    mXMLElemEmpty.reset(src);
|
| +    mXMLCharData.reset(src);
|
| +    mAttrValue.reset(src);
|
| +    mAttrNormalizer.reset(src);
|
| +    mNewLineNormalizer.reset(src);
|
| +    mAmps.reset(src);
|
| +
|
| +    // Consume the XML Declaration, if present.
|
| +    if (mXMLDecl.lookingAt(fPos, status)) {
|
| +        fPos = mXMLDecl.end(status);
|
| +    }
|
| +
|
| +    // Consume "misc" [XML production 27] appearing before DocType
|
| +    parseMisc(status);
|
| +
|
| +    // Consume a DocType declaration, if present.
|
| +    if (mXMLDoctype.lookingAt(fPos, status)) {
|
| +        fPos = mXMLDoctype.end(status);
|
| +    }
|
| +
|
| +    // Consume additional "misc" [XML production 27] appearing after the DocType
|
| +    parseMisc(status);
|
| +
|
| +    // Get the root element
|
| +    if (mXMLElemEmpty.lookingAt(fPos, status)) {
|
| +        // Root is an empty element (no nested elements or content)
|
| +        root = createElement(mXMLElemEmpty, status);
|
| +        if (root == NULL) {
|
| +            if (U_SUCCESS(status)) {
|
| +                status = U_MEMORY_ALLOCATION_ERROR;
|
| +            }
|
| +            goto errorExit;
|
| +        }
|
| +        fPos = mXMLElemEmpty.end(status);
|
| +    } else {
|
| +        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
|
| +            error("Root Element expected", status);
|
| +            goto errorExit;
|
| +        }
|
| +        root = createElement(mXMLElemStart, status);
|
| +        if (root == NULL) {
|
| +            if (U_SUCCESS(status)) {
|
| +                status = U_MEMORY_ALLOCATION_ERROR;
|
| +            }
|
| +            goto errorExit;
|
| +        }
|
| +        UXMLElement *el = root;
|
| +
|
| +        //
|
| +        // This is the loop that consumes the root element of the document,
|
| +        // including all nested content. Nested elements are handled by
|
| +        // explicit pushes/pops of the element stack; there is no recursion
|
| +        // in the control flow of this code.
|
| +        // "el" always refers to the current element, the one to which content
|
| +        // is being added. It is above the top of the element stack.
|
| +        for (;;) {
|
| +            // Nested Element Start
|
| +            if (mXMLElemStart.lookingAt(fPos, status)) {
|
| +                UXMLElement *t = createElement(mXMLElemStart, status);
|
| +                if (t == NULL) {
|
| +                    if (U_SUCCESS(status)) {
|
| +                        status = U_MEMORY_ALLOCATION_ERROR;
|
| +                    }
|
| +                    goto errorExit;
|
| +                }
|
| +                el->fChildren.addElement(t, status);
|
| +                t->fParent = el;
|
| +                fElementStack.push(el, status);
|
| +                el = t;
|
| +                continue;
|
| +            }
|
| +
|
| +            // Text Content. String is concatenated onto the current node's content,
|
| +            // but only if it contains something other than spaces.
|
| +            UnicodeString s = scanContent(status);
|
| +            if (s.length() > 0) {
|
| +                mXMLSP.reset(s);
|
| +                if (mXMLSP.matches(status) == FALSE) {
|
| +                    // This chunk of text contains something other than just
|
| +                    // white space. Make a child node for it.
|
| +                    replaceCharRefs(s, status);
|
| +                    el->fChildren.addElement(s.clone(), status);
|
| +                }
|
| +                mXMLSP.reset(src); // The matchers need to stay set to the main input string.
|
| +                continue;
|
| +            }
|
| +
|
| +            // Comments. Discard.
|
| +            if (mXMLComment.lookingAt(fPos, status)) {
|
| +                fPos = mXMLComment.end(status);
|
| +                continue;
|
| +            }
|
| +
|
| +            // PIs. Discard.
|
| +            if (mXMLPI.lookingAt(fPos, status)) {
|
| +                fPos = mXMLPI.end(status);
|
| +                continue;
|
| +            }
|
| +
|
| +            // Element End
|
| +            if (mXMLElemEnd.lookingAt(fPos, status)) {
|
| +                fPos = mXMLElemEnd.end(0, status);
|
| +                const UnicodeString name = mXMLElemEnd.group(1, status);
|
| +                if (name != *el->fName) {
|
| +                    error("Element start / end tag mismatch", status);
|
| +                    goto errorExit;
|
| +                }
|
| +                if (fElementStack.empty()) {
|
| +                    // Close of the root element. We're done with the doc.
|
| +                    el = NULL;
|
| +                    break;
|
| +                }
|
| +                el = (UXMLElement *)fElementStack.pop();
|
| +                continue;
|
| +            }
|
| +
|
| +            // Empty Element. Stored as a child of the current element, but not stacked.
|
| +            if (mXMLElemEmpty.lookingAt(fPos, status)) {
|
| +                UXMLElement *t = createElement(mXMLElemEmpty, status);
|
| +                if (t == NULL) {
|
| +                    if (U_SUCCESS(status)) {
|
| +                        status = U_MEMORY_ALLOCATION_ERROR;
|
| +                    }
|
| +                    goto errorExit;
|
| +                }
|
| +                el->fChildren.addElement(t, status);
|
| +                continue;
|
| +            }
|
| +
|
| +            // Hit something within the document that doesn't match anything.
|
| +            // It's an error.
|
| +            error("Unrecognized markup", status);
|
| +            break;
|
| +        }
|
| +
|
| +        if (el != NULL || !fElementStack.empty()) {
|
| +            // We bailed out early, for some reason.
|
| +            error("Root element not closed.", status);
|
| +            goto errorExit;
|
| +        }
|
| +    }
|
| +
|
| +    // Root Element parse is complete.
|
| +    // Consume the annoying xml "Misc" that can appear at the end of the doc.
|
| +    parseMisc(status);
|
| +
|
| +    // We should have reached the end of the input
|
| +    if (fPos != src.length()) {
|
| +        error("Extra content at the end of the document", status);
|
| +        goto errorExit;
|
| +    }
|
| +
|
| +    // Success!
|
| +    return root;
|
| +
|
| +errorExit:
|
| +    delete root;
|
| +    // The stack entries pointed into the tree that was just deleted.
|
| +    fElementStack.removeAllElements();
|
| +    return NULL;
|
| +}
|
| +
|
| +//
|
| +// createElement
|
| +//    We've just matched an element start tag (or an empty-element tag) with
|
| +//    the matcher mEl. Create and fill in a UXMLElement object for it: the
|
| +//    interned tag name plus one name/value pair per attribute.
|
| +//    Sets fPos to the end of the matched tag.
|
| +//    Returns the new element, or NULL if it could not be allocated.
|
| +//
|
| +UXMLElement *
|
| +UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {
|
| +    // First capture group is the element's name.
|
| +    UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
|
| +    if (el == NULL) {
|
| +        // Allocation failed; there is nothing to fill in.
|
| +        if (U_SUCCESS(status)) {
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +        }
|
| +        return NULL;
|
| +    }
|
| +
|
| +    // Replacement text for attribute white space normalization.
|
| +    const UnicodeString oneSpace((UChar)0x0020);
|
| +
|
| +    // Scan for attributes.
|
| +    int32_t pos = mEl.end(1, status); // The position after the end of the tag name
|
| +
|
| +    while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element.
|
| +        UnicodeString attName  = mAttrValue.group(1, status);
|
| +        UnicodeString attValue = mAttrValue.group(2, status);
|
| +
|
| +        // Trim the quotes from the att value. These are left over from the original regex
|
| +        // that parsed the attribute, which couldn't conveniently strip them.
|
| +        attValue.remove(0,1);                    // one char from the beginning
|
| +        attValue.truncate(attValue.length()-1);  // and one from the end.
|
| +
|
| +        // XML Attribute value normalization.
|
| +        // This is one of the really screwy parts of the XML spec.
|
| +        // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
|
| +        // Note that non-validating parsers must treat all entities as type CDATA
|
| +        // which simplifies things some.
|
| +
|
| +        // Att normalization step 1: normalize any newlines in the attribute value
|
| +        mNewLineNormalizer.reset(attValue);
|
| +        attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
|
| +
|
| +        // Next change all xml white space chars to plain \u0020 spaces.
|
| +        mAttrNormalizer.reset(attValue);
|
| +        attValue = mAttrNormalizer.replaceAll(oneSpace, status);
|
| +
|
| +        // Replace character entities.
|
| +        replaceCharRefs(attValue, status);
|
| +
|
| +        // Save the attribute name and value in our document structure.
|
| +        el->fAttNames.addElement((void *)intern(attName, status), status);
|
| +        el->fAttValues.addElement(attValue.clone(), status);
|
| +        pos = mAttrValue.end(2, status);
|
| +    }
|
| +    fPos = mEl.end(0, status);
|
| +    return el;
|
| +}
|
| +
|
| +//
|
| +// parseMisc
|
| +//    Skip over XML "Misc" [production #27]: any run of processing
|
| +//    instructions, white space and comments starting at fPos.
|
| +//    Misc may follow the document element, so the scan can reach the end
|
| +//    of the input; that is checked before every match attempt.
|
| +//
|
| +void
|
| +UXMLParser::parseMisc(UErrorCode &status) {
|
| +    while (fPos < mXMLPI.input().length()) {
|
| +        if (mXMLPI.lookingAt(fPos, status)) {
|
| +            fPos = mXMLPI.end(status);
|
| +        } else if (mXMLSP.lookingAt(fPos, status)) {
|
| +            fPos = mXMLSP.end(status);
|
| +        } else if (mXMLComment.lookingAt(fPos, status)) {
|
| +            fPos = mXMLComment.end(status);
|
| +        } else {
|
| +            // Something that is not Misc: leave it for the caller.
|
| +            return;
|
| +        }
|
| +    }
|
| +}
|
| +
|
| +//
|
| +// scanContent
|
| +//    Collect character data starting at fPos, with new-line sequences
|
| +//    normalized, and advance fPos past it. Returns an empty string when
|
| +//    the character data pattern does not match.
|
| +//
|
| +UnicodeString
|
| +UXMLParser::scanContent(UErrorCode &status) {
|
| +    UnicodeString content;
|
| +    if (!mXMLCharData.lookingAt(fPos, status)) {
|
| +        return content;
|
| +    }
|
| +
|
| +    content = mXMLCharData.group((int32_t)0, status);
|
| +
|
| +    // New-lines are normalized here, before char ref substitution.
|
| +    mNewLineNormalizer.reset(content);
|
| +    content = mNewLineNormalizer.replaceAll(fOneLF, status);
|
| +
|
| +    // TODO: handle CDATA
|
| +    fPos = mXMLCharData.end(0, status);
|
| +    return content;
|
| +}
|
| +
|
| +//
|
| +// replaceCharRefs
|
| +//
|
| +//    replace the char entities &lt; &amp; &#123; &#x12ab; etc. in a string
|
| +//    with the corresponding actual character.
|
| +//
|
| +//    A numeric reference above U+10FFFF is replaced with U+FFFD.
|
| +//    Anything else that starts with '&' is left unchanged.
|
| +//
|
| +void
|
| +UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
|
| +    UnicodeString result;
|
| +    UnicodeString replacement;
|
| +    UnicodeString escaped;
|
| +    int32_t i;
|
| +
|
| +    mAmps.reset(s);
|
| +    // See the initialization for the regex matcher mAmps.
|
| +    // Which entity we've matched is determined by which capture group has content,
|
| +    // which is flagged by start() of that group not being -1.
|
| +    while (mAmps.find()) {
|
| +        if (mAmps.start(1, status) != -1) {
|
| +            replacement.setTo((UChar)x_AMP);
|
| +        } else if (mAmps.start(2, status) != -1) {
|
| +            replacement.setTo((UChar)x_LT);
|
| +        } else if (mAmps.start(3, status) != -1) {
|
| +            replacement.setTo((UChar)x_GT);
|
| +        } else if (mAmps.start(4, status) != -1) {
|
| +            replacement.setTo((UChar)x_APOS);
|
| +        } else if (mAmps.start(5, status) != -1) {
|
| +            replacement.setTo((UChar)x_QUOT);
|
| +        } else if (mAmps.start(6, status) != -1) {
|
| +            UnicodeString hexString = mAmps.group(6, status);
|
| +            // Unsigned: up to eight hex digits do not fit a signed 32-bit value.
|
| +            uint32_t val = 0;
|
| +            for (i=0; i<hexString.length(); i++) {
|
| +                val = (val << 4) + (uint32_t)u_digit(hexString.charAt(i), 16);
|
| +            }
|
| +            // Not a code point at all: substitute U+FFFD.
|
| +            replacement.setTo((UChar32)(val<=0x10ffff ? val : 0xfffd));
|
| +        } else if (mAmps.start(7, status) != -1) {
|
| +            UnicodeString decimalString = mAmps.group(7, status);
|
| +            // At most eight decimal digits, so this cannot overflow.
|
| +            uint32_t val = 0;
|
| +            for (i=0; i<decimalString.length(); i++) {
|
| +                val = val*10 + (uint32_t)u_digit(decimalString.charAt(i), 10);
|
| +            }
|
| +            // Not a code point at all: substitute U+FFFD.
|
| +            replacement.setTo((UChar32)(val<=0x10ffff ? val : 0xfffd));
|
| +        } else {
|
| +            // An unrecognized &entity; Leave it alone.
|
| +            // TODO: check that it really looks like an entity, and is not some
|
| +            // random & in the text.
|
| +            replacement = mAmps.group((int32_t)0, status);
|
| +        }
|
| +
|
| +        // appendReplacement() treats '$' and '\' in the replacement text as
|
| +        // group references and escapes. The text here is literal, so put a
|
| +        // backslash in front of each of them.
|
| +        escaped.remove();
|
| +        for (i=0; i<replacement.length(); i++) {
|
| +            UChar c = replacement.charAt(i);
|
| +            if (c == 0x24 || c == 0x5c) {
|
| +                escaped.append((UChar)0x5c);
|
| +            }
|
| +            escaped.append(c);
|
| +        }
|
| +        mAmps.appendReplacement(result, escaped, status);
|
| +    }
|
| +    mAmps.appendTail(result);
|
| +    s = result;
|
| +}
|
| +
|
| +//
|
| +// error
|
| +//    Print the message, with the 1-based number of the line containing
|
| +//    the current parse position fPos, to stderr. Sets status to
|
| +//    U_PARSE_ERROR unless it already holds a failure code.
|
| +//
|
| +void
|
| +UXMLParser::error(const char *message, UErrorCode &status) {
|
| +    // TODO: something better here...
|
| +    const UnicodeString &src=mXMLDecl.input();
|
| +
|
| +    // The line number is one more than the number of line feeds in front
|
| +    // of fPos, including one at the very start of the input.
|
| +    int line = 1;
|
| +    for (int32_t ci = src.indexOf((UChar)0x0a, 0);
|
| +         0 <= ci && ci < fPos;
|
| +         ci = src.indexOf((UChar)0x0a, ci+1)) {
|
| +        line++;
|
| +    }
|
| +    fprintf(stderr, "Error: %s at line %d\n", message, line);
|
| +    if (U_SUCCESS(status)) {
|
| +        status = U_PARSE_ERROR;
|
| +    }
|
| +}
|
| +
|
| +// intern strings like in Java
|
| +//
|
| +// Returns the pointer to the key stored in fNames that equals s, adding s
|
| +// first if it is not there yet. Equal names therefore yield equal pointers.
|
| +// Returns NULL, with errorCode set to a failure code, if the name could
|
| +// not be added.
|
| +
|
| +const UnicodeString *
|
| +UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
|
| +    const UHashElement *he=fNames.find(s);
|
| +    if(he==NULL) {
|
| +        // add this new name
|
| +        fNames.puti(s, 0, errorCode);
|
| +        he=fNames.find(s);
|
| +        if(he==NULL) {
|
| +            // The name did not make it into the table; there is no key
|
| +            // pointer to return.
|
| +            if(U_SUCCESS(errorCode)) {
|
| +                errorCode=U_MEMORY_ALLOCATION_ERROR;
|
| +            }
|
| +            return NULL;
|
| +        }
|
| +    }
|
| +    // known name (old or just added), return its hashed key pointer
|
| +    return (const UnicodeString *)he->key.pointer;
|
| +}
|
| +
|
| +// Look up s in fNames without adding it.
|
| +// Returns the stored key pointer for a known name, NULL for an unknown one.
|
| +const UnicodeString *
|
| +UXMLParser::findName(const UnicodeString &s) const {
|
| +    const UHashElement *entry=fNames.find(s);
|
| +    return entry==NULL ? NULL : (const UnicodeString *)entry->key.pointer;
|
| +}
|
| +
|
| +// UXMLElement ------------------------------------------------------------- ***
|
| +
|
| +// Constructor. Stores the parser and name pointers as given and creates the
|
| +// attribute and child lists empty; the parent pointer starts out NULL.
|
| +UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
|
| + fParser(parser),
|
| + fName(name),
|
| + fAttNames(errorCode),
|
| + fAttValues(errorCode),
|
| + fChildren(errorCode),
|
| + fParent(NULL)
|
| +{
|
| +}
|
| +
|
| +// Destructor. Deletes the attribute values and the child nodes, each list
|
| +// from its last entry down to its first.
|
| +UXMLElement::~UXMLElement() {
|
| +    // attribute names are owned by the UXMLParser, don't delete them here
|
| +    int32_t idx=fAttValues.size();
|
| +    while (--idx>=0) {
|
| +        delete (UObject *)fAttValues.elementAt(idx);
|
| +    }
|
| +
|
| +    idx=fChildren.size();
|
| +    while (--idx>=0) {
|
| +        delete (UObject *)fChildren.elementAt(idx);
|
| +    }
|
| +}
|
| +
|
| +// Returns a reference to the tag name that fName points to.
|
| +const UnicodeString &
|
| +UXMLElement::getTagName() const {
|
| + return *fName;
|
| +}
|
| +
|
| +// Returns, as a new string, the text that appendText() collects for this
|
| +// element with the same recurse flag.
|
| +UnicodeString
|
| +UXMLElement::getText(UBool recurse) const {
|
| +    UnicodeString collected;
|
| +    appendText(collected, recurse);
|
| +    return collected;
|
| +}
|
| +
|
| +// Appends the text of this element's string children to text, in order.
|
| +// With recurse set, element children contribute their own text the same
|
| +// way at the place where they occur; otherwise they are skipped.
|
| +void
|
| +UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
|
| +    const int32_t childCount=fChildren.size();
|
| +    for(int32_t idx=0; idx<childCount; ++idx) {
|
| +        const UObject *child=(const UObject *)fChildren.elementAt(idx);
|
| +        const UnicodeString *str=dynamic_cast<const UnicodeString *>(child);
|
| +        if(str==NULL) {
|
| +            // not a string, so it must be a UXMLElement
|
| +            if(recurse) {
|
| +                ((const UXMLElement *)child)->appendText(text, recurse);
|
| +            }
|
| +            continue;
|
| +        }
|
| +        text.append(*str);
|
| +    }
|
| +}
|
| +
|
| +// Returns the number of entries in the attribute name list.
|
| +int32_t
|
| +UXMLElement::countAttributes() const {
|
| + return fAttNames.size();
|
| +}
|
| +
|
| +// Copies the name and value of the i-th attribute into name and value.
|
| +// Returns a pointer to value, or NULL (leaving both strings untouched)
|
| +// when i is out of range.
|
| +const UnicodeString *
|
| +UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
|
| +    if(i<0 || i>=fAttNames.size()) {
|
| +        return NULL;
|
| +    }
|
| +
|
| +    const UnicodeString *attName=(const UnicodeString *)fAttNames.elementAt(i);
|
| +    const UnicodeString *attValue=(const UnicodeString *)fAttValues.elementAt(i);
|
| +    name.setTo(*attName);
|
| +    value.setTo(*attValue);
|
| +    return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
|
| +}
|
| +
|
| +// Returns a pointer to the stored value of the attribute called name,
|
| +// or NULL if this element has no such attribute.
|
| +const UnicodeString *
|
| +UXMLElement::getAttribute(const UnicodeString &name) const {
|
| +    // Attribute names are interned, so it is enough to compare the
|
| +    // interned pointer rather than the string contents.
|
| +    const UnicodeString *interned=fParser->findName(name);
|
| +    if(interned!=NULL) {
|
| +        const int32_t attCount=fAttNames.size();
|
| +        for(int32_t idx=0; idx<attCount; ++idx) {
|
| +            if((const UnicodeString *)fAttNames.elementAt(idx)==interned) {
|
| +                return (const UnicodeString *)fAttValues.elementAt(idx);
|
| +            }
|
| +        }
|
| +    }
|
| +    // Either the parser never saw this name at all, or not on this element.
|
| +    return NULL;
|
| +}
|
| +
|
| +// Returns the number of entries in the child list.
|
| +int32_t
|
| +UXMLElement::countChildren() const {
|
| + return fChildren.size();
|
| +}
|
| +
|
| +// Returns the i-th child node and stores its kind in type:
|
| +// UXML_NODE_TYPE_ELEMENT for a UXMLElement, UXML_NODE_TYPE_STRING otherwise.
|
| +// Returns NULL, leaving type untouched, when i is out of range.
|
| +const UObject *
|
| +UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
|
| +    if(i<0 || i>=fChildren.size()) {
|
| +        return NULL;
|
| +    }
|
| +
|
| +    const UObject *child=(const UObject *)fChildren.elementAt(i);
|
| +    const UBool isElement=(dynamic_cast<const UXMLElement *>(child)!=NULL);
|
| +    if(isElement) {
|
| +        type=UXML_NODE_TYPE_ELEMENT;
|
| +    } else {
|
| +        type=UXML_NODE_TYPE_STRING;
|
| +    }
|
| +    return child;
|
| +}
|
| +
|
| +// Returns the first element child at index i or later, and sets i to the
|
| +// index after it so the next call continues from there. Returns NULL when
|
| +// no element child remains (i then equals the child count, unless it was
|
| +// already larger) or when i is negative (i is left unchanged).
|
| +const UXMLElement *
|
| +UXMLElement::nextChildElement(int32_t &i) const {
|
| +    if(i>=0) {
|
| +        const int32_t limit=fChildren.size();
|
| +        while(i<limit) {
|
| +            const UObject *candidate=(const UObject *)fChildren.elementAt(i);
|
| +            ++i;
|
| +            const UXMLElement *found=dynamic_cast<const UXMLElement *>(candidate);
|
| +            if(found!=NULL) {
|
| +                return found;
|
| +            }
|
| +        }
|
| +    }
|
| +    return NULL;
|
| +}
|
| +
|
| +// Returns the first child element whose tag name is name, or NULL if
|
| +// there is none.
|
| +const UXMLElement *
|
| +UXMLElement::getChildElement(const UnicodeString &name) const {
|
| +    // Element names are interned, so it is enough to compare the
|
| +    // interned pointer rather than the string contents.
|
| +    const UnicodeString *interned=fParser->findName(name);
|
| +    if(interned==NULL) {
|
| +        return NULL; // no such element seen by the parser at all
|
| +    }
|
| +
|
| +    const int32_t childCount=fChildren.size();
|
| +    for(int32_t idx=0; idx<childCount; ++idx) {
|
| +        const UObject *child=(const UObject *)fChildren.elementAt(idx);
|
| +        const UXMLElement *candidate=dynamic_cast<const UXMLElement *>(child);
|
| +        if(candidate!=NULL && candidate->fName==interned) {
|
| +            return candidate;
|
| +        }
|
| +    }
|
| +    return NULL;
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
|
| +
|
|
|
| Property changes on: icu46/source/tools/toolutil/xmlparser.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|