diff --git a/icu4c/source/allinone/allinone.sln b/icu4c/source/allinone/allinone.sln index 49c101949ab..729f863b48c 100644 --- a/icu4c/source/allinone/allinone.sln +++ b/icu4c/source/allinone/allinone.sln @@ -168,6 +168,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stubdata", "..\stubdata\stu EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "toolutil", "..\tools\toolutil\toolutil.vcproj", "{6B231032-3CB5-4EED-9210-810D666A23A0}" ProjectSection(ProjectDependencies) = postProject + {0178B127-6269-407D-B112-93877BB62776} = {0178B127-6269-407D-B112-93877BB62776} {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} EndProjectSection EndProject diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index 218ade3770e..53ea83c20f8 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -356,6 +356,13 @@ typedef void* UClassID; * @stable ICU 2.0 */ +/** + * \def U_TOOLUTIL_API + * Set to export library symbols from inside the toolutil library, + * and to import them from outside. 
+ * @draft ICU 3.4 + */ + #if defined(U_COMBINED_IMPLEMENTATION) #define U_DATA_API U_EXPORT #define U_COMMON_API U_EXPORT @@ -363,6 +370,7 @@ typedef void* UClassID; #define U_LAYOUT_API U_EXPORT #define U_LAYOUTEX_API U_EXPORT #define U_IO_API U_EXPORT +#define U_TOOLUTIL_API U_EXPORT #elif defined(U_STATIC_IMPLEMENTATION) #define U_DATA_API #define U_COMMON_API @@ -370,6 +378,7 @@ typedef void* UClassID; #define U_LAYOUT_API #define U_LAYOUTEX_API #define U_IO_API +#define U_TOOLUTIL_API #elif defined(U_COMMON_IMPLEMENTATION) #define U_DATA_API U_IMPORT #define U_COMMON_API U_EXPORT @@ -377,6 +386,7 @@ typedef void* UClassID; #define U_LAYOUT_API U_IMPORT #define U_LAYOUTEX_API U_IMPORT #define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT #elif defined(U_I18N_IMPLEMENTATION) #define U_DATA_API U_IMPORT #define U_COMMON_API U_IMPORT @@ -384,6 +394,7 @@ typedef void* UClassID; #define U_LAYOUT_API U_IMPORT #define U_LAYOUTEX_API U_IMPORT #define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT #elif defined(U_LAYOUT_IMPLEMENTATION) #define U_DATA_API U_IMPORT #define U_COMMON_API U_IMPORT @@ -391,6 +402,7 @@ typedef void* UClassID; #define U_LAYOUT_API U_EXPORT #define U_LAYOUTEX_API U_IMPORT #define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT #elif defined(U_LAYOUTEX_IMPLEMENTATION) #define U_DATA_API U_IMPORT #define U_COMMON_API U_IMPORT @@ -398,6 +410,7 @@ typedef void* UClassID; #define U_LAYOUT_API U_IMPORT #define U_LAYOUTEX_API U_EXPORT #define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT #elif defined(U_IO_IMPLEMENTATION) #define U_DATA_API U_IMPORT #define U_COMMON_API U_IMPORT @@ -405,6 +418,15 @@ typedef void* UClassID; #define U_LAYOUT_API U_IMPORT #define U_LAYOUTEX_API U_IMPORT #define U_IO_API U_EXPORT +#define U_TOOLUTIL_API U_IMPORT +#elif defined(U_TOOLUTIL_IMPLEMENTATION) +#define U_DATA_API U_IMPORT +#define U_COMMON_API U_IMPORT +#define U_I18N_API U_IMPORT +#define U_LAYOUT_API U_IMPORT +#define U_LAYOUTEX_API U_IMPORT 
+#define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_EXPORT #else #define U_DATA_API U_IMPORT #define U_COMMON_API U_IMPORT @@ -412,6 +434,7 @@ typedef void* UClassID; #define U_LAYOUT_API U_IMPORT #define U_LAYOUTEX_API U_IMPORT #define U_IO_API U_IMPORT +#define U_TOOLUTIL_API U_IMPORT #endif /** diff --git a/icu4c/source/tools/toolutil/Makefile.in b/icu4c/source/tools/toolutil/Makefile.in index 1ae5cd5a6c7..a329daf1aac 100644 --- a/icu4c/source/tools/toolutil/Makefile.in +++ b/icu4c/source/tools/toolutil/Makefile.in @@ -39,9 +39,10 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS) LDFLAGS += $(LDFLAGSICUTOOLUTIL) CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common $(LIBCPPFLAGS) -LIBS = $(LIBICUUC) $(DEFAULT_LIBS) +DEFS += -DU_TOOLUTIL_IMPLEMENTATION +LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) -OBJECTS = propsvec.o toolutil.o unewdata.o ucm.o ucmstate.o uoptions.o uparse.o ucbuf.o +OBJECTS = propsvec.o toolutil.o unewdata.o ucm.o ucmstate.o uoptions.o uparse.o ucbuf.o xmlparser.o STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) diff --git a/icu4c/source/tools/toolutil/toolutil.vcproj b/icu4c/source/tools/toolutil/toolutil.vcproj index 1101a4bc01e..8469606eefe 100644 --- a/icu4c/source/tools/toolutil/toolutil.vcproj +++ b/icu4c/source/tools/toolutil/toolutil.vcproj @@ -21,8 +21,8 @@ + + + + +#include "unicode/uchar.h" +#include "unicode/ucnv.h" +#include "unicode/regex.h" +#include "filestrm.h" +#include "xmlparser.h" + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) + +// character constants +enum { + x_QUOT=0x22, + x_AMP=0x26, + x_APOS=0x27, + x_LT=0x3c, + x_GT=0x3e, + x_l=0x6c +}; + +#define XML_SPACES "[ \\u0009\\u000d\\u000a]" + +// XML #4 +#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ + "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ + "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ + 
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" + +// XML #5 +#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" + +// XML #6 +#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" + +// +// UXMLParser constructor. Mostly just initializes the ICU regexes that are +// used for parsing. +// +UXMLParser::UXMLParser(UErrorCode &status) : + // XML Declaration. XML Production #23. + // example: " + // This is a sloppy implementation - just look for the leading + // allow for a possible leading BOM. + mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), + + // XML Comment production #15 + // example: " + // note, does not detect an illegal "--" within comments + mXMLComment(UnicodeString("(?s)", -1, US_INV), 0, status), + + // XML Spaces + // production [3] + mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), + + // XML Doctype decl production #28 + // example " + // TODO: we don't actually parse the DOCTYPE or internal subsets. + // Some internal dtd subsets could confuse this simple-minded + // attempt at skipping over them. + mXMLDoctype(UnicodeString("(?s)", -1, US_INV), 0, status), + + // XML PI production #16 + // example " + mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), + + // XML Element Start Productions #40, #41 + // example + // capture #1: the tag name + // + mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "", -1, US_INV), 0, status), // match " >" + + // XML Element End production #42 + // example + mXMLElemEnd (UnicodeString("", -1, US_INV), 0, status), + + // XML Element Empty production #44 + // example + mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "", -1, US_INV), 0, status), // match " />" + + + // XMLCharData. Everything but '<'. Note that & will be dealt with later. + mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), + + // Attribute name = "value". 
XML Productions 10, 40/41 + // Capture group 1 is name, + // 2 is the attribute value, including the quotes. + // + // Note that attributes are scanned twice. The first time is with + // the regex for an entire element start. There, the attributes + // are checked syntactically, but not separated out one by one. + // Here, we match a single attribute, and make its name and + // attribute value available to the parser code. + mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" + "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), + + + mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), + + // Match any of the new-line sequences in content. + // All are changed to \u000a. + mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), + + // & char references + // We will figure out what we've got based on which capture group has content. + // The last one is a catchall for unrecognized entity references. + // 1 2 3 4 5 6 7 8 + mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))", -1, US_INV), + 0, status), + + fNames(status), + fElementStack(status), + fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization.
+ { + }; + +UXMLParser * +UXMLParser::createParser(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return NULL; + } else { + return new UXMLParser(errorCode); + } +} + +UXMLParser::~UXMLParser() {} + +UXMLElement * +UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { + char bytes[4096], charsetBuffer[100]; + FileStream *f; + const char *charset, *pb; + UnicodeString src; + UConverter *cnv; + UChar *buffer, *pu; + int32_t fileLength, bytesLength, length, capacity; + UBool flush; + + if(U_FAILURE(errorCode)) { + return NULL; + } + + f=T_FileStream_open(filename, "rb"); + if(f==NULL) { + errorCode=U_FILE_ACCESS_ERROR; + return NULL; + } + + bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); + if(bytesLengthfChildren.addElement(t, status); + t->fParent = el; + fElementStack.push(el, status); + el = t; + continue; + } + + // Text Content. String is concatenated onto the current node's content, + // but only if it contains something other than spaces. + UnicodeString s = scanContent(status); + if (s.length() > 0) { + mXMLSP.reset(s); + if (mXMLSP.matches(status) == FALSE) { + // This chunk of text contains something other than just + // white space. Make a child node for it. + replaceCharRefs(s, status); + el->fChildren.addElement(s.clone(), status); + } + mXMLSP.reset(src); // The matchers need to stay set to the main input string. + continue; + } + + // Comments. Discard. + if (mXMLComment.lookingAt(fPos, status)) { + fPos = mXMLComment.end(status); + continue; + } + + // PIs. Discard. + if (mXMLPI.lookingAt(fPos, status)) { + fPos = mXMLPI.end(status); + continue; + } + + // Element End + if (mXMLElemEnd.lookingAt(fPos, status)) { + fPos = mXMLElemEnd.end(0, status); + const UnicodeString name = mXMLElemEnd.group(1, status); + if (name != *el->fName) { + error("Element start / end tag mismatch", status); + goto errorExit; + } + if (fElementStack.empty()) { + // Close of the root element. We're done with the doc. 
+ el = NULL; + break; + } + el = (UXMLElement *)fElementStack.pop(); + continue; + } + + // Empty Element. Stored as a child of the current element, but not stacked. + if (mXMLElemEmpty.lookingAt(fPos, status)) { + UXMLElement *t = createElement(mXMLElemEmpty, status); + el->fChildren.addElement(t, status); + continue; + } + + // Hit something within the document that doesn't match anything. + // It's an error. + error("Unrecognized markup", status); + break; + } + + if (el != NULL || !fElementStack.empty()) { + // We bailed out early, for some reason. + error("Root element not closed.", status); + goto errorExit; + } + } + + // Root Element parse is complete. + // Consume the annoying xml "Misc" that can appear at the end of the doc. + parseMisc(status); + + // We should have reached the end of the input + if (fPos != src.length()) { + error("Extra content at the end of the document", status); + goto errorExit; + } + + // Success! + return root; + +errorExit: + delete root; + return NULL; +} + +// +// createElement +// We've just matched an element start tag. Create and fill in a UXMLElement object +// for it. +// +UXMLElement * +UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { + // First capture group is the element's name. + UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); + + // Scan for attributes. + int32_t pos = mEl.end(1, status); // The position after the end of the tag name + + while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. + UnicodeString attName = mAttrValue.group(1, status); + UnicodeString attValue = mAttrValue.group(2, status); + + // Trim the quotes from the att value. These are left over from the original regex + // that parsed the attribute, which couldn't conveniently strip them. + attValue.remove(0,1); // one char from the beginning + attValue.truncate(attValue.length()-1); // and one from the end. + + // XML Attribute value normalization.
+ // This is one of the really screwy parts of the XML spec. + // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize + // Note that non-validating parsers must treat all entities as type CDATA + // which simplifies things some. + + // Att normalization step 1: normalize any newlines in the attribute value + mNewLineNormalizer.reset(attValue); + attValue = mNewLineNormalizer.replaceAll(fOneLF, status); + + // Next change all xml white space chars to plain \u0020 spaces. + mAttrNormalizer.reset(attValue); + UnicodeString oneSpace((UChar)0x0020); + attValue = mAttrNormalizer.replaceAll(oneSpace, status); + + // Replace character entities. + replaceCharRefs(attValue, status); + + // Save the attribute name and value in our document structure. + el->fAttNames.addElement((void *)intern(attName, status), status); + el->fAttValues.addElement(attValue.clone(), status); + pos = mAttrValue.end(2, status); + } + fPos = mEl.end(0, status); + return el; +} + +// +// parseMisc +// Consume XML "Misc" [production #27] +// which is any combination of space, PI and comments +// Need to watch end-of-input because xml MISC stuff is allowed after +// the document element, so we WILL scan off the end in this function +// +void +UXMLParser::parseMisc(UErrorCode &status) { + for (;;) { + if (fPos >= mXMLPI.input().length()) { + break; + } + if (mXMLPI.lookingAt(fPos, status)) { + fPos = mXMLPI.end(status); + continue; + } + if (mXMLSP.lookingAt(fPos, status)) { + fPos = mXMLSP.end(status); + continue; + } + if (mXMLComment.lookingAt(fPos, status)) { + fPos = mXMLComment.end(status); + continue; + } + break; + } +} + +// +// Scan for document content. +// +UnicodeString +UXMLParser::scanContent(UErrorCode &status) { + UnicodeString result; + if (mXMLCharData.lookingAt(fPos, status)) { + result = mXMLCharData.group(0, status); + // Normalize the new-lines. 
(Before char ref substitution) + mNewLineNormalizer.reset(result); + result = mNewLineNormalizer.replaceAll(fOneLF, status); + + // TODO: handle CDATA + fPos = mXMLCharData.end(0, status); + } + + return result; +} + +// +// replaceCharRefs +// +// replace the char entities &lt; &amp; &#123; &#x12ab; etc. in a string +// with the corresponding actual character. +// +void +UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { + UnicodeString result; + UnicodeString replacement; + int i; + + mAmps.reset(s); + // See the initialization for the regex matcher mAmps. + // Which entity we've matched is determined by which capture group has content, + // which is flagged by start() of that group not being -1. + while (mAmps.find()) { + if (mAmps.start(1, status) != -1) { + replacement.setTo((UChar)x_AMP); + } else if (mAmps.start(2, status) != -1) { + replacement.setTo((UChar)x_LT); + } else if (mAmps.start(3, status) != -1) { + replacement.setTo((UChar)x_GT); + } else if (mAmps.start(4, status) != -1) { + replacement.setTo((UChar)x_APOS); + } else if (mAmps.start(5, status) != -1) { + replacement.setTo((UChar)x_QUOT); + } else if (mAmps.start(6, status) != -1) { + UnicodeString hexString = mAmps.group(6, status); + UChar32 val = 0; + for (i=0; i=0) { + ci = src.indexOf((UChar)0x0a, ci+1); + line++; + } + fprintf(stderr, "Error: %s at line %d\n", message, line); + if (U_SUCCESS(status)) { + status = U_PARSE_ERROR; + } +} + +// intern strings like in Java + +const UnicodeString * +UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { + const UHashElement *he=fNames.find(s); + if(he!=NULL) { + // already a known name, return its hashed key pointer + return (const UnicodeString *)he->key.pointer; + } else { + // add this new name and return its hashed key pointer + fNames.puti(s, 0, errorCode); + he=fNames.find(s); + return (const UnicodeString *)he->key.pointer; + } +} + +const UnicodeString * +UXMLParser::findName(const UnicodeString &s) const { + const UHashElement
*he=fNames.find(s); + if(he!=NULL) { + // a known name, return its hashed key pointer + return (const UnicodeString *)he->key.pointer; + } else { + // unknown name + return NULL; + } +} + +// UXMLElement ------------------------------------------------------------- *** + +UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : + fParser(parser), + fName(name), + fAttNames(errorCode), + fAttValues(errorCode), + fChildren(errorCode), + fParent(NULL) +{ +} + +UXMLElement::~UXMLElement() { + int i; + // attribute names are owned by the UXMLParser, don't delete them here + for (i=fAttValues.size()-1; i>=0; i--) { + delete (UObject *)fAttValues.elementAt(i); + } + for (i=fChildren.size()-1; i>=0; i--) { + delete (UObject *)fChildren.elementAt(i); + } +} + +const UnicodeString & +UXMLElement::getTagName() const { + return *fName; +} + +UnicodeString +UXMLElement::getText(UBool recurse) const { + UnicodeString text; + appendText(text, recurse); + return text; +} + +void +UXMLElement::appendText(UnicodeString &text, UBool recurse) const { + const UObject *node; + int32_t i, count=fChildren.size(); + for(i=0; igetDynamicClassID()==UnicodeString::getStaticClassID()) { + text.append(*(const UnicodeString *)node); + } else if(recurse) /* must be a UXMLElement */ { + ((const UXMLElement *)node)->appendText(text, recurse); + } + } +} + +int32_t +UXMLElement::countAttributes() const { + return fAttNames.size(); +} + +const UnicodeString * +UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { + if(0<=i && ifindName(name); + if(p==NULL) { + return NULL; // no such attribute seen by the parser at all + } + + int32_t i, count=fAttNames.size(); + for(i=0; igetDynamicClassID()==UXMLElement::getStaticClassID()) { + type=UXML_NODE_TYPE_ELEMENT; + } else { + type=UXML_NODE_TYPE_STRING; + } + return node; + } else { + return NULL; + } +} + +const UXMLElement * +UXMLElement::nextChildElement(int32_t &i) const { 
+ if(i<0) { + return NULL; + } + + const UObject *node; + int32_t count=fChildren.size(); + while(igetDynamicClassID()==UXMLElement::getStaticClassID()) { + return (const UXMLElement *)node; + } + } + return NULL; +} + +const UXMLElement * +UXMLElement::getChildElement(const UnicodeString &name) const { + // search for the element name by comparing the interned pointer, + // not the string contents + const UnicodeString *p=fParser->findName(name); + if(p==NULL) { + return NULL; // no such element seen by the parser at all + } + + const UObject *node; + int32_t i, count=fChildren.size(); + for(i=0; igetDynamicClassID()==UXMLElement::getStaticClassID()) { + const UXMLElement *elem=(const UXMLElement *)node; + if(p==elem->fName) { + return elem; + } + } + } + return NULL; +} diff --git a/icu4c/source/tools/toolutil/xmlparser.h b/icu4c/source/tools/toolutil/xmlparser.h new file mode 100644 index 00000000000..483333a1980 --- /dev/null +++ b/icu4c/source/tools/toolutil/xmlparser.h @@ -0,0 +1,236 @@ +/* +******************************************************************************* +* +* Copyright (C) 2004-2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: xmlparser.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004jul21 +* created by: Andy Heninger +* +* Tiny XML parser using ICU and intended for use in ICU tests and in build tools. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. +* But very small. +*/ + +#ifndef __XMLPARSER_H__ +#define __XMLPARSER_H__ + +#include "unicode/uobject.h" +#include "unicode/unistr.h" +#include "unicode/regex.h" +#include "uvector.h" +#include "hash.h" + +enum UXMLNodeType { + /** Node type string (text contents), stored as a UnicodeString. */ + UXML_NODE_TYPE_STRING, + /** Node type element, stored as a UXMLElement. 
*/ + UXML_NODE_TYPE_ELEMENT, + UXML_NODE_TYPE_COUNT +}; + +/** + * This class represents an element node in a parsed XML tree. + */ +class U_TOOLUTIL_API UXMLElement : public UObject { +public: + /** + * Destructor. + */ + virtual ~UXMLElement(); + + /** + * Get the tag name of this element. + */ + const UnicodeString &getTagName() const; + /** + * Get the text contents of the element. + * Append the contents of all text child nodes. + * @param recurse If TRUE, also recursively appends the contents of all + * text child nodes of element children. + * @return The text contents. + */ + UnicodeString getText(UBool recurse) const; + /** + * Get the number of attributes. + */ + int32_t countAttributes() const; + /** + * Get the i-th attribute. + * @param i Index of the attribute. + * @param name Output parameter, receives the attribute name. + * @param value Output parameter, receives the attribute value. + * @return A pointer to the attribute value (may be &value or a pointer to an + * internal string object), or NULL if i is out of bounds. + */ + const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; + /** + * Get the value of the attribute with the given name. + * @param name Attribute name to be looked up. + * @return A pointer to the attribute value, or NULL if this element + * does not have this attribute. + */ + const UnicodeString *getAttribute(const UnicodeString &name) const; + /** + * Get the number of child nodes. + */ + int32_t countChildren() const; + /** + * Get the i-th child node. + * @param i Index of the child node. + * @param type The child node type. + * @return A pointer to the child node object, or NULL if i is out of bounds. + */ + const UObject *getChild(int32_t i, UXMLNodeType &type) const; + /** + * Get the next child element node, skipping non-element child nodes. + * @param i Enumeration index; initialize to 0 before getting the first child element. 
+ * @return A pointer to the next child element, or NULL if there is none. + */ + const UXMLElement *nextChildElement(int32_t &i) const; + /** + * Get the immediate child element with the given name. + * If there are multiple child elements with this name, then return + * the first one. + * @param name Element name to be looked up. + * @return A pointer to the element node, or NULL if this element + * does not have this immediate child element. + */ + const UXMLElement *getChildElement(const UnicodeString &name) const; + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. + UXMLElement(); + UXMLElement(const UXMLElement &other); + UXMLElement &operator=(const UXMLElement &other); + + void appendText(UnicodeString &text, UBool recurse) const; + + friend class UXMLParser; + + UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); + + const UXMLParser *fParser; + const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) + UnicodeString fContent; // The text content of this node. All element content is + // concatenated even when there are intervening nested elements + // (which doesn't happen with most xml files we care about) + // Sections of content containing only white space are dropped, + // which gets rid of the bogus white space content from + // elements which are primarily containers for nested elements. + UVector fAttNames; // A vector containing the names of this element's attributes + // The names are UnicodeString objects, owned by the UXMLParser. + UVector fAttValues; // A vector containing the attribute values for + // this element's attributes. The order is the same + // as that of the attribute name vector.
+ + UXMLElement *fParent; // A pointer to the parent element of this element. + + UVector fChildren; // The child nodes of this element (a Vector) +}; + +/** + * A simple XML parser; it is neither efficient nor conformant and only useful for + * restricted types of XML documents. + * + * The parse methods parse whole documents and return the parse trees via their + * root elements. + */ +class U_TOOLUTIL_API UXMLParser : public UObject { +public: + /** + * Create an XML parser. + */ + static UXMLParser *createParser(UErrorCode &errorCode); + /** + * Destructor. + */ + virtual ~UXMLParser(); + + /** + * Parse an XML document, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. + */ + UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); + /** + * Parse an XML file, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. + */ + UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. 
+ UXMLParser(); + UXMLParser(const UXMLParser &other); + UXMLParser &operator=(const UXMLParser &other); + + // constructor + UXMLParser(UErrorCode &status); + + void parseMisc(UErrorCode &status); + UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); + void error(const char *message, UErrorCode &status); + UnicodeString scanContent(UErrorCode &status); + void replaceCharRefs(UnicodeString &s, UErrorCode &status); + + const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); +public: + // public for UXMLElement only + const UnicodeString *findName(const UnicodeString &s) const; +private: + + // There is one ICU regex matcher for each of the major XML syntax items + // that are recognized. + RegexMatcher mXMLDecl; + RegexMatcher mXMLComment; + RegexMatcher mXMLSP; + RegexMatcher mXMLDoctype; + RegexMatcher mXMLPI; + RegexMatcher mXMLElemStart; + RegexMatcher mXMLElemEnd; + RegexMatcher mXMLElemEmpty; + RegexMatcher mXMLCharData; + RegexMatcher mAttrValue; + RegexMatcher mAttrNormalizer; + RegexMatcher mNewLineNormalizer; + RegexMatcher mAmps; + + Hashtable fNames; // interned element/attribute name strings + UStack fElementStack; // Stack holds the parent elements when nested + // elements are being parsed. All items on this + // stack are of type UXMLElement. + int32_t fPos; // String index of the current scan position in + // xml source (in fSrc). + UnicodeString fOneLF; +}; + +#endif