OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2004-2005, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: xmlparser.h |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2004jul21 |
| 14 * created by: Andy Heninger |
| 15 * |
| 16 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools. |
| 17 * Not suitable for production use. Not supported. |
| 18 * Not conformant. Not efficient. |
| 19 * But very small. |
| 20 */ |
| 21 |
| 22 #ifndef __XMLPARSER_H__ |
| 23 #define __XMLPARSER_H__ |
| 24 |
| 25 #include "unicode/uobject.h" |
| 26 #include "unicode/unistr.h" |
| 27 #include "unicode/regex.h" |
| 28 #include "uvector.h" |
| 29 #include "hash.h" |
| 30 |
| 31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION |
| 32 |
/**
 * Kinds of child nodes held by a parsed XML element.
 * Used by UXMLElement::getChild() to describe the type of the returned node.
 */
enum UXMLNodeType {
    /** Text contents; the node object is a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Nested element; the node object is a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    /** Number of node types listed above; not a node type itself. */
    UXML_NODE_TYPE_COUNT
};
| 40 |
| 41 U_NAMESPACE_BEGIN |
| 42 |
| 43 class UXMLParser; |
| 44 |
| 45 /** |
| 46 * This class represents an element node in a parsed XML tree. |
| 47 */ |
| 48 class U_TOOLUTIL_API UXMLElement : public UObject { |
| 49 public: |
| 50 /** |
| 51 * Destructor. |
| 52 */ |
| 53 virtual ~UXMLElement(); |
| 54 |
| 55 /** |
| 56 * Get the tag name of this element. |
| 57 */ |
| 58 const UnicodeString &getTagName() const; |
| 59 /** |
| 60 * Get the text contents of the element. |
| 61 * Append the contents of all text child nodes. |
| 62 * @param recurse If TRUE, also recursively appends the contents of all |
| 63 * text child nodes of element children. |
| 64 * @return The text contents. |
| 65 */ |
| 66 UnicodeString getText(UBool recurse) const; |
| 67 /** |
| 68 * Get the number of attributes. |
| 69 */ |
| 70 int32_t countAttributes() const; |
| 71 /** |
| 72 * Get the i-th attribute. |
| 73 * @param i Index of the attribute. |
| 74 * @param name Output parameter, receives the attribute name. |
| 75 * @param value Output parameter, receives the attribute value. |
| 76 * @return A pointer to the attribute value (may be &value or a pointer to a
n |
| 77 * internal string object), or NULL if i is out of bounds. |
| 78 */ |
| 79 const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeStr
ing &value) const; |
| 80 /** |
| 81 * Get the value of the attribute with the given name. |
| 82 * @param name Attribute name to be looked up. |
| 83 * @return A pointer to the attribute value, or NULL if this element |
| 84 * does not have this attribute. |
| 85 */ |
| 86 const UnicodeString *getAttribute(const UnicodeString &name) const; |
| 87 /** |
| 88 * Get the number of child nodes. |
| 89 */ |
| 90 int32_t countChildren() const; |
| 91 /** |
| 92 * Get the i-th child node. |
| 93 * @param i Index of the child node. |
| 94 * @param type The child node type. |
| 95 * @return A pointer to the child node object, or NULL if i is out of bounds
. |
| 96 */ |
| 97 const UObject *getChild(int32_t i, UXMLNodeType &type) const; |
| 98 /** |
| 99 * Get the next child element node, skipping non-element child nodes. |
| 100 * @param i Enumeration index; initialize to 0 before getting the first chil
d element. |
| 101 * @return A pointer to the next child element, or NULL if there is none. |
| 102 */ |
| 103 const UXMLElement *nextChildElement(int32_t &i) const; |
| 104 /** |
| 105 * Get the immediate child element with the given name. |
| 106 * If there are multiple child elements with this name, then return |
| 107 * the first one. |
| 108 * @param name Element name to be looked up. |
| 109 * @return A pointer to the element node, or NULL if this element |
| 110 * does not have this immediate child element. |
| 111 */ |
| 112 const UXMLElement *getChildElement(const UnicodeString &name) const; |
| 113 |
| 114 /** |
| 115 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
| 116 */ |
| 117 virtual UClassID getDynamicClassID() const; |
| 118 |
| 119 /** |
| 120 * ICU "poor man's RTTI", returns a UClassID for this class. |
| 121 */ |
| 122 static UClassID U_EXPORT2 getStaticClassID(); |
| 123 |
| 124 private: |
| 125 // prevent default construction etc. |
| 126 UXMLElement(); |
| 127 UXMLElement(const UXMLElement &other); |
| 128 UXMLElement &operator=(const UXMLElement &other); |
| 129 |
| 130 void appendText(UnicodeString &text, UBool recurse) const; |
| 131 |
| 132 friend class UXMLParser; |
| 133 |
| 134 UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode
&errorCode); |
| 135 |
| 136 const UXMLParser *fParser; |
| 137 const UnicodeString *fName; // The tag name of this element (owned
by the UXMLParser) |
| 138 UnicodeString fContent; // The text content of this node. All
element content is |
| 139 // concatenated even when there are i
ntervening nested elements |
| 140 // (which doesn't happen with most xm
l files we care about) |
| 141 // Sections of content containing onl
y white space are dropped, |
| 142 // which gets rid the bogus white sp
ace content from |
| 143 // elements which are primarily conta
iners for nested elements. |
| 144 UVector fAttNames; // A vector containing the names of thi
s element's attributes |
| 145 // The names are UnicodeString objec
ts, owned by the UXMLParser. |
| 146 UVector fAttValues; // A vector containing the attribute va
lues for |
| 147 // this element's attributes. The o
rder is the same |
| 148 // as that of the attribute name vec
tor. |
| 149 |
| 150 UVector fChildren; // The child nodes of this element (a V
ector) |
| 151 |
| 152 UXMLElement *fParent; // A pointer to the parent element of t
his element. |
| 153 }; |
| 154 |
| 155 /** |
| 156 * A simple XML parser; it is neither efficient nor conformant and only useful f
or |
| 157 * restricted types of XML documents. |
| 158 * |
| 159 * The parse methods parse whole documents and return the parse trees via their |
| 160 * root elements. |
| 161 */ |
| 162 class U_TOOLUTIL_API UXMLParser : public UObject { |
| 163 public: |
| 164 /** |
| 165 * Create an XML parser. |
| 166 */ |
| 167 static UXMLParser *createParser(UErrorCode &errorCode); |
| 168 /** |
| 169 * Destructor. |
| 170 */ |
| 171 virtual ~UXMLParser(); |
| 172 |
| 173 /** |
| 174 * Parse an XML document, create the entire document tree, and |
| 175 * return a pointer to the root element of the parsed tree. |
| 176 * The caller must delete the element. |
| 177 */ |
| 178 UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); |
| 179 /** |
| 180 * Parse an XML file, create the entire document tree, and |
| 181 * return a pointer to the root element of the parsed tree. |
| 182 * The caller must delete the element. |
| 183 */ |
| 184 UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); |
| 185 |
| 186 /** |
| 187 * ICU "poor man's RTTI", returns a UClassID for the actual class. |
| 188 */ |
| 189 virtual UClassID getDynamicClassID() const; |
| 190 |
| 191 /** |
| 192 * ICU "poor man's RTTI", returns a UClassID for this class. |
| 193 */ |
| 194 static UClassID U_EXPORT2 getStaticClassID(); |
| 195 |
| 196 private: |
| 197 // prevent default construction etc. |
| 198 UXMLParser(); |
| 199 UXMLParser(const UXMLParser &other); |
| 200 UXMLParser &operator=(const UXMLParser &other); |
| 201 |
| 202 // constructor |
| 203 UXMLParser(UErrorCode &status); |
| 204 |
| 205 void parseMisc(UErrorCode &status); |
| 206 UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); |
| 207 void error(const char *message, UErrorCode &status); |
| 208 UnicodeString scanContent(UErrorCode &status); |
| 209 void replaceCharRefs(UnicodeString &s, UErrorCode &status); |
| 210 |
| 211 const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); |
| 212 public: |
| 213 // public for UXMLElement only |
| 214 const UnicodeString *findName(const UnicodeString &s) const; |
| 215 private: |
| 216 |
| 217 // There is one ICU regex matcher for each of the major XML syntax items |
| 218 // that are recognized. |
| 219 RegexMatcher mXMLDecl; |
| 220 RegexMatcher mXMLComment; |
| 221 RegexMatcher mXMLSP; |
| 222 RegexMatcher mXMLDoctype; |
| 223 RegexMatcher mXMLPI; |
| 224 RegexMatcher mXMLElemStart; |
| 225 RegexMatcher mXMLElemEnd; |
| 226 RegexMatcher mXMLElemEmpty; |
| 227 RegexMatcher mXMLCharData; |
| 228 RegexMatcher mAttrValue; |
| 229 RegexMatcher mAttrNormalizer; |
| 230 RegexMatcher mNewLineNormalizer; |
| 231 RegexMatcher mAmps; |
| 232 |
| 233 Hashtable fNames; // interned element/attribute name s
trings |
| 234 UStack fElementStack; // Stack holds the parent elements w
hen nested |
| 235 // elements are being parsed. Al
l items on this |
| 236 // stack are of type UXMLElement. |
| 237 int32_t fPos; // String index of the current scan
position in |
| 238 // xml source (in fSrc). |
| 239 UnicodeString fOneLF; |
| 240 }; |
| 241 |
| 242 U_NAMESPACE_END |
| 243 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */ |
| 244 |
| 245 #endif |
OLD | NEW |