ICU-4011 add tiny XML parser to toolutil library

X-SVN-Rev: 17485
2025-04-13 08:53:20 +00:00 · 2005-04-17 23:58:53 +00:00 · 2005-04-17 23:58:53 +00:00 · e37a743281
commit e37a743281
parent 59e030b4a5
6 changed files with 1083 additions and 6 deletions
--- a/icu4c/source/allinone/allinone.sln
+++ b/icu4c/source/allinone/allinone.sln
@ -168,6 +168,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stubdata", "..\stubdata\stu
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "toolutil", "..\tools\toolutil\toolutil.vcproj", "{6B231032-3CB5-4EED-9210-810D666A23A0}"
 	ProjectSection(ProjectDependencies) = postProject
+		{0178B127-6269-407D-B112-93877BB62776} = {0178B127-6269-407D-B112-93877BB62776}
 		{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
 	EndProjectSection
 EndProject
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -356,6 +356,13 @@ typedef void* UClassID;
 * @stable ICU 2.0
 */

+/**
+ * \def U_TOOLUTIL_API
+ * Set to export library symbols from inside the toolutil library,
+ * and to import them from outside.
+ * @draft ICU 3.4
+ */
+
 #if defined(U_COMBINED_IMPLEMENTATION)
 #define U_DATA_API     U_EXPORT
 #define U_COMMON_API   U_EXPORT
@ -363,6 +370,7 @@ typedef void* UClassID;
 #define U_LAYOUT_API   U_EXPORT
 #define U_LAYOUTEX_API U_EXPORT
 #define U_IO_API       U_EXPORT
+#define U_TOOLUTIL_API U_EXPORT
 #elif defined(U_STATIC_IMPLEMENTATION)
 #define U_DATA_API
 #define U_COMMON_API
@ -370,6 +378,7 @@ typedef void* UClassID;
 #define U_LAYOUT_API
 #define U_LAYOUTEX_API
 #define U_IO_API
+#define U_TOOLUTIL_API
 #elif defined(U_COMMON_IMPLEMENTATION)
 #define U_DATA_API     U_IMPORT
 #define U_COMMON_API   U_EXPORT
@ -377,6 +386,7 @@ typedef void* UClassID;
 #define U_LAYOUT_API   U_IMPORT
 #define U_LAYOUTEX_API U_IMPORT
 #define U_IO_API       U_IMPORT
+#define U_TOOLUTIL_API U_IMPORT
 #elif defined(U_I18N_IMPLEMENTATION)
 #define U_DATA_API     U_IMPORT
 #define U_COMMON_API   U_IMPORT
@ -384,6 +394,7 @@ typedef void* UClassID;
 #define U_LAYOUT_API   U_IMPORT
 #define U_LAYOUTEX_API U_IMPORT
 #define U_IO_API       U_IMPORT
+#define U_TOOLUTIL_API U_IMPORT
 #elif defined(U_LAYOUT_IMPLEMENTATION)
 #define U_DATA_API     U_IMPORT
 #define U_COMMON_API   U_IMPORT
@ -391,6 +402,7 @@ typedef void* UClassID;
 #define U_LAYOUT_API   U_EXPORT
 #define U_LAYOUTEX_API U_IMPORT
 #define U_IO_API       U_IMPORT
+#define U_TOOLUTIL_API U_IMPORT
 #elif defined(U_LAYOUTEX_IMPLEMENTATION)
 #define U_DATA_API     U_IMPORT
 #define U_COMMON_API   U_IMPORT
@ -398,6 +410,7 @@ typedef void* UClassID;
 #define U_LAYOUT_API   U_IMPORT
 #define U_LAYOUTEX_API U_EXPORT
 #define U_IO_API       U_IMPORT
+#define U_TOOLUTIL_API U_IMPORT
 #elif defined(U_IO_IMPLEMENTATION)
 #define U_DATA_API     U_IMPORT
 #define U_COMMON_API   U_IMPORT
@ -405,6 +418,15 @@ typedef void* UClassID;
 #define U_LAYOUT_API   U_IMPORT
 #define U_LAYOUTEX_API U_IMPORT
 #define U_IO_API       U_EXPORT
+#define U_TOOLUTIL_API U_IMPORT
+#elif defined(U_TOOLUTIL_IMPLEMENTATION)
+#define U_DATA_API     U_IMPORT
+#define U_COMMON_API   U_IMPORT
+#define U_I18N_API     U_IMPORT
+#define U_LAYOUT_API   U_IMPORT
+#define U_LAYOUTEX_API U_IMPORT
+#define U_IO_API       U_IMPORT
+#define U_TOOLUTIL_API U_EXPORT
 #else
 #define U_DATA_API     U_IMPORT
 #define U_COMMON_API   U_IMPORT
@ -412,6 +434,7 @@ typedef void* UClassID;
 #define U_LAYOUT_API   U_IMPORT
 #define U_LAYOUTEX_API U_IMPORT
 #define U_IO_API       U_IMPORT
+#define U_TOOLUTIL_API U_IMPORT
 #endif

 /**
--- a/icu4c/source/tools/toolutil/Makefile.in
+++ b/icu4c/source/tools/toolutil/Makefile.in
@ -39,9 +39,10 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)

 LDFLAGS += $(LDFLAGSICUTOOLUTIL)
 CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common $(LIBCPPFLAGS)
-LIBS = $(LIBICUUC) $(DEFAULT_LIBS)
+DEFS += -DU_TOOLUTIL_IMPLEMENTATION
+LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS)

-OBJECTS = propsvec.o toolutil.o unewdata.o ucm.o ucmstate.o uoptions.o uparse.o ucbuf.o
+OBJECTS = propsvec.o toolutil.o unewdata.o ucm.o ucmstate.o uoptions.o uparse.o ucbuf.o xmlparser.o

 STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))

--- a/icu4c/source/tools/toolutil/toolutil.vcproj
+++ b/icu4c/source/tools/toolutil/toolutil.vcproj
@ -21,8 +21,8 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				ImproveFloatingPointConsistency="TRUE"
-				AdditionalIncludeDirectories="..\..\common"
-				PreprocessorDefinitions="WIN32;NDEBUG"
+				AdditionalIncludeDirectories="..\..\..\include,..\..\common"
+				PreprocessorDefinitions="WIN32;NDEBUG;U_TOOLUTIL_IMPLEMENTATION"
 				StringPooling="TRUE"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="TRUE"
@ -86,8 +86,8 @@
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				ImproveFloatingPointConsistency="TRUE"
-				AdditionalIncludeDirectories="..\..\common"
-				PreprocessorDefinitions="WIN32;_DEBUG"
+				AdditionalIncludeDirectories="..\..\..\include,..\..\common"
+				PreprocessorDefinitions="WIN32;_DEBUG;U_TOOLUTIL_IMPLEMENTATION"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
 				BufferSecurityCheck="TRUE"
@ -171,6 +171,9 @@
 			<File
 				RelativePath=".\uparse.c">
 			</File>
+			<File
+				RelativePath=".\xmlparser.cpp">
+			</File>
 		</Filter>
 		<Filter
 			Name="Header Files"
@ -196,6 +199,9 @@
 			<File
 				RelativePath=".\uparse.h">
 			</File>
+			<File
+				RelativePath=".\xmlparser.h">
+			</File>
 		</Filter>
 		<Filter
 			Name="Resource Files"
--- a/icu4c/source/tools/toolutil/xmlparser.cpp
+++ b/icu4c/source/tools/toolutil/xmlparser.cpp
@ -0,0 +1,810 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 2004-2005, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  xmlparser.cpp
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2004jul21
+*   created by: Andy Heninger
+*/
+
+#include <stdio.h>
+#include "unicode/uchar.h"
+#include "unicode/ucnv.h"
+#include "unicode/regex.h"
+#include "filestrm.h"
+#include "xmlparser.h"
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
+
+// character constants
+enum {
+    x_QUOT=0x22,
+    x_AMP=0x26,
+    x_APOS=0x27,
+    x_LT=0x3c,
+    x_GT=0x3e,
+    x_l=0x6c
+};
+
+#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"
+
+// XML #4
+#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
+                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
+                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
+                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
+
+//  XML #5
+#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
+
+//  XML #6
+#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
+
+//
+//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
+//                             used for parsing.
+//
+UXMLParser::UXMLParser(UErrorCode &status) :
+      //  XML Declaration.  XML Production #23.
+      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
+      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
+      //            allow for a possible leading BOM.
+      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
+      
+      //  XML Comment   production #15
+      //     example:  "<!-- whatever -->
+      //       note, does not detect an illegal "--" within comments
+      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
+      
+      //  XML Spaces
+      //      production [3]
+      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
+      
+      //  XML Doctype decl  production #28
+      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
+      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
+      //           Some internal dtd subsets could confuse this simple-minded
+      //           attempt at skipping over them.
+      mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>", -1, US_INV), 0, status),
+      
+      //  XML PI     production #16
+      //     example   "<?target stuff?>
+      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
+      
+      //  XML Element Start   Productions #40, #41
+      //          example   <foo att1='abc'  att2="d e f" >
+      //      capture #1:  the tag name
+      //
+      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
+          "(?:" 
+                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
+                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
+          ")*"                                                             //   * for zero or more attributes.
+          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
+      
+      //  XML Element End     production #42
+      //     example   </foo>
+      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
+      
+      // XML Element Empty    production #44
+      //     example   <foo att1="abc"   att2="d e f" />
+      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
+          "(?:" 
+                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
+                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
+          ")*"                                                             //   * for zero or more attributes.
+          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
+      
+
+      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
+      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
+
+      // Attribute name = "value".  XML Productions 10, 40/41
+      //  Capture group 1 is name, 
+      //                2 is the attribute value, including the quotes.
+      //
+      //   Note that attributes are scanned twice.  The first time is with
+      //        the regex for an entire element start.  There, the attributes
+      //        are checked syntactically, but not separted out one by one.
+      //        Here, we match a single attribute, and make its name and
+      //        attribute value available to the parser code.
+      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
+         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
+
+
+      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
+
+      // Match any of the new-line sequences in content.
+      //   All are changed to \u000a.
+      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
+
+      // & char references
+      //   We will figure out what we've got based on which capture group has content.
+      //   The last one is a catchall for unrecognized entity references..
+      //             1     2     3      4      5           6                    7          8
+      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))", -1, US_INV),
+                0, status),
+
+      fNames(status),
+      fElementStack(status),
+      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
+      {
+      };
+
+UXMLParser *
+UXMLParser::createParser(UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) {
+        return NULL;
+    } else {
+        return new UXMLParser(errorCode);
+    }
+}
+
+UXMLParser::~UXMLParser() {}
+
+UXMLElement *
+UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
+    char bytes[4096], charsetBuffer[100];
+    FileStream *f;
+    const char *charset, *pb;
+    UnicodeString src;
+    UConverter *cnv;
+    UChar *buffer, *pu;
+    int32_t fileLength, bytesLength, length, capacity;
+    UBool flush;
+
+    if(U_FAILURE(errorCode)) {
+        return NULL;
+    }
+
+    f=T_FileStream_open(filename, "rb");
+    if(f==NULL) {
+        errorCode=U_FILE_ACCESS_ERROR;
+        return NULL;
+    }
+
+    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
+    if(bytesLength<sizeof(bytes)) {
+        // we have already read the entire file
+        fileLength=bytesLength;
+    } else {
+        // get the file length
+        fileLength=T_FileStream_size(f);
+    }
+
+    /*
+     * get the charset:
+     * 1. Unicode signature
+     * 2. treat as ISO-8859-1 and read XML encoding="charser"
+     * 3. default to UTF-8
+     */
+    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
+    if(U_SUCCESS(errorCode) && charset!=NULL) {
+        // open converter according to Unicode signature
+        cnv=ucnv_open(charset, &errorCode);
+    } else {
+        // read as Latin-1 and parse the XML declaration and encoding
+        cnv=ucnv_open("ISO-8859-1", &errorCode);
+        if(U_FAILURE(errorCode)) {
+            // unexpected error opening Latin-1 converter
+            goto exit;
+        }
+
+        buffer=src.getBuffer(bytesLength);
+        if(buffer==NULL) {
+            // unexpected failure to reserve some string capacity
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+            goto exit;
+        }
+        pb=bytes;
+        pu=buffer;
+        ucnv_toUnicode(
+            cnv,
+            &pu, buffer+src.getCapacity(),
+            &pb, bytes+sizeof(bytes),
+            NULL, TRUE, &errorCode);
+        src.releaseBuffer((int32_t)(pu-buffer));
+        ucnv_close(cnv);
+        if(U_FAILURE(errorCode)) {
+            // unexpected error in conversion from Latin-1
+            src.remove();
+            goto exit;
+        }
+
+        // parse XML declaration
+        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
+            int32_t declEnd=mXMLDecl.end(errorCode);
+            // go beyond <?xml
+            int32_t pos=src.indexOf(x_l)+1;
+
+            mAttrValue.reset(src);
+            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
+                UnicodeString attName  = mAttrValue.group(1, errorCode);
+                UnicodeString attValue = mAttrValue.group(2, errorCode);
+
+                // Trim the quotes from the att value.  These are left over from the original regex
+                //   that parsed the attribue, which couldn't conveniently strip them.
+                attValue.remove(0,1);                    // one char from the beginning
+                attValue.truncate(attValue.length()-1);  // and one from the end.
+
+                if(attName==UNICODE_STRING("encoding", 8)) {
+                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer), US_INV);
+                    charset=charsetBuffer;
+                    break;
+                }
+                pos = mAttrValue.end(2, errorCode);
+            }
+
+            if(charset==NULL) {
+                // default to UTF-8
+                charset="UTF-8";
+            }
+            cnv=ucnv_open(charset, &errorCode);
+        }
+    }
+
+    if(U_FAILURE(errorCode)) {
+        // unable to open the converter
+        goto exit;
+    }
+
+    // convert the file contents
+    capacity=fileLength;        // estimated capacity
+    src.getBuffer(capacity);
+    src.releaseBuffer(0);       // zero length
+    flush=FALSE;
+    for(;;) {
+        // convert contents of bytes[bytesLength]
+        pb=bytes;
+        for(;;) {
+            length=src.length();
+            buffer=src.getBuffer(capacity);
+            if(buffer==NULL) {
+                // unexpected failure to reserve some string capacity
+                errorCode=U_MEMORY_ALLOCATION_ERROR;
+                goto exit;
+            }
+
+            pu=buffer+length;
+            ucnv_toUnicode(
+                cnv, &pu, buffer+src.getCapacity(),
+                &pb, bytes+bytesLength,
+                NULL, FALSE, &errorCode);
+            src.releaseBuffer((int32_t)(pu-buffer));
+            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+                errorCode=U_ZERO_ERROR;
+                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
+            } else {
+                break;
+            }
+        }
+
+        if(U_FAILURE(errorCode)) {
+            break; // conversion error
+        }
+
+        if(flush) {
+            break; // completely converted the file
+        }
+
+        // read next block
+        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
+        if(bytesLength==0) {
+            // reached end of file, convert once more to flush the converter
+            flush=TRUE;
+        }
+    };
+
+exit:
+    ucnv_close(cnv);
+    T_FileStream_close(f);
+
+    if(U_SUCCESS(errorCode)) {
+        return parse(src, errorCode);
+    } else {
+        return NULL;
+    }
+}
+
+UXMLElement *
+UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
+    if(U_FAILURE(status)) {
+        return NULL;
+    }
+
+    UXMLElement   *root = NULL;
+    fPos = 0; // TODO use just a local pos variable and pass it into functions
+              // where necessary?
+
+    // set all matchers to work on the input string
+    mXMLDecl.reset(src);
+    mXMLComment.reset(src);
+    mXMLSP.reset(src);
+    mXMLDoctype.reset(src);
+    mXMLPI.reset(src);
+    mXMLElemStart.reset(src);
+    mXMLElemEnd.reset(src);
+    mXMLElemEmpty.reset(src);
+    mXMLCharData.reset(src);
+    mAttrValue.reset(src);
+    mAttrNormalizer.reset(src);
+    mNewLineNormalizer.reset(src);
+    mAmps.reset(src);
+
+    // Consume the XML Declaration, if present.
+    if (mXMLDecl.lookingAt(fPos, status)) {
+        fPos = mXMLDecl.end(status);
+    }
+
+    // Consume "misc" [XML production 27] appearing before DocType
+    parseMisc(status);
+
+    // Consume a DocType declaration, if present.
+    if (mXMLDoctype.lookingAt(fPos, status)) {
+        fPos = mXMLDoctype.end(status);
+    }
+
+    // Consume additional "misc" [XML production 27] appearing after the DocType
+    parseMisc(status);
+
+    // Get the root element
+    if (mXMLElemEmpty.lookingAt(fPos, status)) {
+        // Root is an empty element (no nested elements or content)
+        root = createElement(mXMLElemEmpty, status);
+        fPos = mXMLElemEmpty.end(status);
+    } else {
+        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
+            error("Root Element expected", status);
+            goto errorExit;
+        }
+        root = createElement(mXMLElemStart, status);
+        UXMLElement  *el = root;
+
+        //
+        // This is the loop that consumes the root element of the document,
+        //      including all nested content.   Nested elements are handled by
+        //      explicit pushes/pops of the element stack; there is no recursion
+        //      in the control flow of this code.
+        //      "el" always refers to the current element, the one to which content
+        //      is being added.  It is above the top of the element stack.
+        for (;;) {
+            // Nested Element Start
+            if (mXMLElemStart.lookingAt(fPos, status)) {
+                UXMLElement *t = createElement(mXMLElemStart, status);
+                el->fChildren.addElement(t, status);
+                t->fParent = el;
+                fElementStack.push(el, status);
+                el = t;
+                continue;
+            }
+
+            // Text Content.  String is concatenated onto the current node's content,
+            //                but only if it contains something other than spaces.
+            UnicodeString s = scanContent(status);
+            if (s.length() > 0) {
+                mXMLSP.reset(s);
+                if (mXMLSP.matches(status) == FALSE) {
+                    // This chunk of text contains something other than just
+                    //  white space. Make a child node for it.
+                    replaceCharRefs(s, status);
+                    el->fChildren.addElement(s.clone(), status);
+                }
+                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
+                continue;
+            }
+
+            // Comments.  Discard.
+            if (mXMLComment.lookingAt(fPos, status)) {
+                fPos = mXMLComment.end(status);
+                continue;
+            }
+
+            // PIs.  Discard.
+            if (mXMLPI.lookingAt(fPos, status)) {
+                fPos = mXMLPI.end(status);
+                continue;
+            }
+
+            // Element End
+            if (mXMLElemEnd.lookingAt(fPos, status)) {
+                fPos = mXMLElemEnd.end(0, status);
+                const UnicodeString name = mXMLElemEnd.group(1, status);
+                if (name != *el->fName) {
+                    error("Element start / end tag mismatch", status);
+                    goto errorExit;
+                }
+                if (fElementStack.empty()) {
+                    // Close of the root element.  We're done with the doc.
+                    el = NULL;
+                    break;
+                }
+                el = (UXMLElement *)fElementStack.pop();
+                continue;
+            }
+
+            // Empty Element.  Stored as a child of the current element, but not stacked.
+            if (mXMLElemEmpty.lookingAt(fPos, status)) {
+                UXMLElement *t = createElement(mXMLElemEmpty, status);
+                el->fChildren.addElement(t, status);
+                continue;
+            }
+
+            // Hit something within the document that doesn't match anything.
+            //   It's an error.
+            error("Unrecognized markup", status);
+            break;
+        }
+
+        if (el != NULL || !fElementStack.empty()) {
+            // We bailed out early, for some reason.
+            error("Root element not closed.", status);
+            goto errorExit;
+        }
+    }
+
+    // Root Element parse is complete.
+    // Consume the annoying xml "Misc" that can appear at the end of the doc.
+    parseMisc(status);
+
+    // We should have reached the end of the input
+    if (fPos != src.length()) {
+        error("Extra content at the end of the document", status);
+        goto errorExit;
+    }
+
+    // Success!
+    return root;
+
+errorExit:
+    delete root;
+    return NULL;
+}
+
+//
+//  createElement
+//      We've just matched an element start tag.  Create and fill in a UXMLElement object
+//      for it.
+//
+UXMLElement *
+UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
+    // First capture group is the element's name.
+    UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
+
+    // Scan for attributes.
+    int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
+
+    while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
+        UnicodeString attName  = mAttrValue.group(1, status);
+        UnicodeString attValue = mAttrValue.group(2, status);
+
+        // Trim the quotes from the att value.  These are left over from the original regex
+        //   that parsed the attribue, which couldn't conveniently strip them.
+        attValue.remove(0,1);                    // one char from the beginning
+        attValue.truncate(attValue.length()-1);  // and one from the end.
+        
+        // XML Attribue value normalization. 
+        // This is one of the really screwy parts of the XML spec.
+        // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
+        // Note that non-validating parsers must treat all entities as type CDATA
+        //   which simplifies things some.
+
+        // Att normalization step 1:  normalize any newlines in the attribute value
+        mNewLineNormalizer.reset(attValue);
+        attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
+
+        // Next change all xml white space chars to plain \u0020 spaces.
+        mAttrNormalizer.reset(attValue);
+        UnicodeString oneSpace((UChar)0x0020);
+        attValue = mAttrNormalizer.replaceAll(oneSpace, status);
+
+        // Replace character entities.
+        replaceCharRefs(attValue, status);
+
+        // Save the attribute name and value in our document structure.
+        el->fAttNames.addElement((void *)intern(attName, status), status);
+        el->fAttValues.addElement(attValue.clone(), status);
+        pos = mAttrValue.end(2, status);
+    }
+    fPos = mEl.end(0, status);
+    return el;
+}
+
+//
+//  parseMisc
+//     Consume XML "Misc" [production #27]
+//        which is any combination of space, PI and comments
+//      Need to watch end-of-input because xml MISC stuff is allowed after
+//        the document element, so we WILL scan off the end in this function
+//
+void
+UXMLParser::parseMisc(UErrorCode &status)  {
+    for (;;) {
+        if (fPos >= mXMLPI.input().length()) {
+            break;
+        }
+        if (mXMLPI.lookingAt(fPos, status)) {
+            fPos = mXMLPI.end(status);
+            continue;
+        }
+        if (mXMLSP.lookingAt(fPos, status)) {
+            fPos = mXMLSP.end(status);
+            continue;
+        }
+        if (mXMLComment.lookingAt(fPos, status)) {
+            fPos = mXMLComment.end(status);
+            continue;
+        }
+        break;
+    }
+}
+
+//
+//  Scan for document content.
+//
+UnicodeString
+UXMLParser::scanContent(UErrorCode &status) {
+    UnicodeString  result;
+    if (mXMLCharData.lookingAt(fPos, status)) {
+        result = mXMLCharData.group(0, status);
+        // Normalize the new-lines.  (Before char ref substitution)
+        mNewLineNormalizer.reset(result);
+        result = mNewLineNormalizer.replaceAll(fOneLF, status);
+        
+        // TODO:  handle CDATA
+        fPos = mXMLCharData.end(0, status);
+    }
+
+    return result;
+}
+
+//
+//   replaceCharRefs
+//
+//      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
+//       with the corresponding actual character.
+//
+void
+UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
+    UnicodeString result;
+    UnicodeString replacement;
+    int     i;
+
+    mAmps.reset(s);
+    // See the initialization for the regex matcher mAmps.
+    //    Which entity we've matched is determined by which capture group has content,
+    //      which is flaged by start() of that group not being -1.
+    while (mAmps.find()) {
+        if (mAmps.start(1, status) != -1) {
+            replacement.setTo((UChar)x_AMP);
+        } else if (mAmps.start(2, status) != -1) {
+            replacement.setTo((UChar)x_LT);
+        } else if (mAmps.start(3, status) != -1) {
+            replacement.setTo((UChar)x_GT);
+        } else if (mAmps.start(4, status) != -1) {
+            replacement.setTo((UChar)x_APOS);
+        } else if (mAmps.start(5, status) != -1) {
+            replacement.setTo((UChar)x_QUOT);
+        } else if (mAmps.start(6, status) != -1) {
+            UnicodeString hexString = mAmps.group(6, status);
+            UChar32 val = 0;
+            for (i=0; i<hexString.length(); i++) {
+                val = (val << 4) + u_digit(hexString.charAt(i), 16);
+            }
+            // TODO:  some verification that the character is valid
+            replacement.setTo(val);
+        } else if (mAmps.start(7, status) != -1) {
+            UnicodeString decimalString = mAmps.group(7, status);
+            UChar32 val = 0;
+            for (i=0; i<decimalString.length(); i++) {
+                val = val*10 + u_digit(decimalString.charAt(i), 10);
+            }
+            // TODO:  some verification that the character is valid
+            replacement.setTo(val);
+        } else {
+            // An unrecognized &entity;  Leave it alone.
+            //  TODO:  check that it really looks like an entity, and is not some
+            //         random & in the text.
+            replacement = mAmps.group(0, status);
+        }
+        mAmps.appendReplacement(result, replacement, status);
+    }
+    mAmps.appendTail(result);
+    s = result;
+}
+
+void
+UXMLParser::error(const char *message, UErrorCode &status) {
+    // TODO:  something better here...
+    const UnicodeString &src=mXMLDecl.input();
+    int  line = 0;
+    int  ci = 0;
+    while (ci < fPos && ci>=0) {
+        ci = src.indexOf((UChar)0x0a, ci+1);
+        line++;
+    }
+    fprintf(stderr, "Error: %s at line %d\n", message, line);
+    if (U_SUCCESS(status)) {
+        status = U_PARSE_ERROR;
+    }
+}
+
+// intern strings like in Java
+
+const UnicodeString *
+UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
+    const UHashElement *he=fNames.find(s);
+    if(he!=NULL) {
+        // already a known name, return its hashed key pointer
+        return (const UnicodeString *)he->key.pointer;
+    } else {
+        // add this new name and return its hashed key pointer
+        fNames.puti(s, 0, errorCode);
+        he=fNames.find(s);
+        return (const UnicodeString *)he->key.pointer;
+    }
+}
+
+const UnicodeString *
+UXMLParser::findName(const UnicodeString &s) const {
+    const UHashElement *he=fNames.find(s);
+    if(he!=NULL) {
+        // a known name, return its hashed key pointer
+        return (const UnicodeString *)he->key.pointer;
+    } else {
+        // unknown name
+        return NULL;
+    }
+}
+
+// UXMLElement ------------------------------------------------------------- ***
+
+UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
+   fParser(parser),
+   fName(name),
+   fAttNames(errorCode),
+   fAttValues(errorCode),
+   fChildren(errorCode),
+   fParent(NULL)
+{
+}
+
+UXMLElement::~UXMLElement() {
+    int   i;
+    // attribute names are owned by the UXMLParser, don't delete them here
+    for (i=fAttValues.size()-1; i>=0; i--) {
+        delete (UObject *)fAttValues.elementAt(i);
+    }
+    for (i=fChildren.size()-1; i>=0; i--) {
+        delete (UObject *)fChildren.elementAt(i);
+    }
+}
+
+const UnicodeString &
+UXMLElement::getTagName() const {
+    return *fName;
+}
+
+UnicodeString
+UXMLElement::getText(UBool recurse) const {
+    UnicodeString text;
+    appendText(text, recurse);
+    return text;
+}
+
+void
+UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
+    const UObject *node;
+    int32_t i, count=fChildren.size();
+    for(i=0; i<count; ++i) {
+        node=(const UObject *)fChildren.elementAt(i);
+        if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
+            text.append(*(const UnicodeString *)node);
+        } else if(recurse) /* must be a UXMLElement */ {
+            ((const UXMLElement *)node)->appendText(text, recurse);
+        }
+    }
+}
+
+int32_t
+UXMLElement::countAttributes() const {
+    return fAttNames.size();
+}
+
+const UnicodeString *
+UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
+    if(0<=i && i<fAttNames.size()) {
+        name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
+        value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
+        return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
+    } else {
+        return NULL;
+    }
+}
+
+const UnicodeString *
+UXMLElement::getAttribute(const UnicodeString &name) const {
+    // search for the attribute name by comparing the interned pointer,
+    // not the string contents
+    const UnicodeString *p=fParser->findName(name);
+    if(p==NULL) {
+        return NULL; // no such attribute seen by the parser at all
+    }
+
+    int32_t i, count=fAttNames.size();
+    for(i=0; i<count; ++i) {
+        if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
+            return (const UnicodeString *)fAttValues.elementAt(i);
+        }
+    }
+    return NULL;
+}
+
+int32_t
+UXMLElement::countChildren() const {
+    return fChildren.size();
+}
+
+const UObject *
+UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
+    if(0<=i && i<fChildren.size()) {
+        const UObject *node=(const UObject *)fChildren.elementAt(i);
+        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
+            type=UXML_NODE_TYPE_ELEMENT;
+        } else {
+            type=UXML_NODE_TYPE_STRING;
+        }
+        return node;
+    } else {
+        return NULL;
+    }
+}
+
+const UXMLElement *
+UXMLElement::nextChildElement(int32_t &i) const {
+    if(i<0) {
+        return NULL;
+    }
+
+    const UObject *node;
+    int32_t count=fChildren.size();
+    while(i<count) {
+        node=(const UObject *)fChildren.elementAt(i++);
+        // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
+        // if(node instanceof UXMLElement) {
+        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
+            return (const UXMLElement *)node;
+        }
+    }
+    return NULL;
+}
+
+const UXMLElement *
+UXMLElement::getChildElement(const UnicodeString &name) const {
+    // search for the element name by comparing the interned pointer,
+    // not the string contents
+    const UnicodeString *p=fParser->findName(name);
+    if(p==NULL) {
+        return NULL; // no such element seen by the parser at all
+    }
+
+    const UObject *node;
+    int32_t i, count=fChildren.size();
+    for(i=0; i<count; ++i) {
+        node=(const UObject *)fChildren.elementAt(i);
+        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
+            const UXMLElement *elem=(const UXMLElement *)node;
+            if(p==elem->fName) {
+                return elem;
+            }
+        }
+    }
+    return NULL;
+}
--- a/icu4c/source/tools/toolutil/xmlparser.h
+++ b/icu4c/source/tools/toolutil/xmlparser.h
@ -0,0 +1,236 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 2004-2005, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  xmlparser.h
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2004jul21
+*   created by: Andy Heninger
+*
+* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
+* Not suitable for production use. Not supported.
+* Not conformant. Not efficient.
+* But very small.
+*/
+
+#ifndef __XMLPARSER_H__
+#define __XMLPARSER_H__
+
+#include "unicode/uobject.h"
+#include "unicode/unistr.h"
+#include "unicode/regex.h"
+#include "uvector.h"
+#include "hash.h"
+
+enum UXMLNodeType {
+    /** Node type string (text contents), stored as a UnicodeString. */
+    UXML_NODE_TYPE_STRING,
+    /** Node type element, stored as a UXMLElement. */
+    UXML_NODE_TYPE_ELEMENT,
+    UXML_NODE_TYPE_COUNT
+};
+
+/**
+ * This class represents an element node in a parsed XML tree.
+ */
+class U_TOOLUTIL_API UXMLElement : public UObject {
+public:
+    /**
+     * Destructor.
+     */
+    virtual ~UXMLElement();
+
+    /**
+     * Get the tag name of this element.
+     */
+    const UnicodeString &getTagName() const;
+    /**
+     * Get the text contents of the element.
+     * Append the contents of all text child nodes.
+     * @param recurse If TRUE, also recursively appends the contents of all
+     *        text child nodes of element children.
+     * @return The text contents.
+     */
+    UnicodeString getText(UBool recurse) const;
+    /**
+     * Get the number of attributes.
+     */
+    int32_t countAttributes() const;
+    /**
+     * Get the i-th attribute.
+     * @param i Index of the attribute.
+     * @param name Output parameter, receives the attribute name.
+     * @param value Output parameter, receives the attribute value.
+     * @return A pointer to the attribute value (may be &value or a pointer to an
+     *         internal string object), or NULL if i is out of bounds.
+     */
+    const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
+    /**
+     * Get the value of the attribute with the given name.
+     * @param name Attribute name to be looked up.
+     * @return A pointer to the attribute value, or NULL if this element
+     * does not have this attribute.
+     */
+    const UnicodeString *getAttribute(const UnicodeString &name) const;
+    /**
+     * Get the number of child nodes.
+     */
+    int32_t countChildren() const;
+    /**
+     * Get the i-th child node.
+     * @param i Index of the child node.
+     * @param type The child node type.
+     * @return A pointer to the child node object, or NULL if i is out of bounds.
+     */
+    const UObject *getChild(int32_t i, UXMLNodeType &type) const;
+    /**
+     * Get the next child element node, skipping non-element child nodes.
+     * @param i Enumeration index; initialize to 0 before getting the first child element.
+     * @return A pointer to the next child element, or NULL if there is none.
+     */
+    const UXMLElement *nextChildElement(int32_t &i) const;
+    /**
+     * Get the immediate child element with the given name.
+     * If there are multiple child elements with this name, then return
+     * the first one.
+     * @param name Element name to be looked up.
+     * @return A pointer to the element node, or NULL if this element
+     * does not have this immediate child element.
+     */
+    const UXMLElement *getChildElement(const UnicodeString &name) const;
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for the actual class.
+     */
+    virtual UClassID getDynamicClassID() const;
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for this class.
+     */
+    static UClassID U_EXPORT2 getStaticClassID();
+
+private:
+    // prevent default construction etc.
+    UXMLElement();
+    UXMLElement(const UXMLElement &other);
+    UXMLElement &operator=(const UXMLElement &other);
+
+    void appendText(UnicodeString &text, UBool recurse) const;
+
+    friend class UXMLParser;
+
+    UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
+
+    const UXMLParser *fParser;
+    const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
+    UnicodeString       fContent;        // The text content of this node.  All element content is 
+                                         //   concatenated even when there are intervening nested elements
+                                         //   (which doesn't happen with most xml files we care about)
+                                         //   Sections of content containing only white space are dropped,
+                                         //   which gets rid  the bogus white space content from
+                                         //   elements which are primarily containers for nested elements.
+    UVector             fAttNames;       // A vector containing the names of this element's attributes
+                                         //    The names are UnicodeString objects, owned by the UXMLParser.
+    UVector             fAttValues;      // A vector containing the attribute values for
+                                         //    this element's attributes.  The order is the same
+                                         //    as that of the attribute name vector.
+
+    UXMLElement        *fParent;         // A pointer to the parent element of this element.
+
+    UVector             fChildren;       // The child nodes of this element (a Vector)
+};
+
+/**
+ * A simple XML parser; it is neither efficient nor conformant and only useful for
+ * restricted types of XML documents.
+ *
+ * The parse methods parse whole documents and return the parse trees via their
+ * root elements.
+ */
+class  U_TOOLUTIL_API UXMLParser : public UObject {
+public:
+    /**
+     * Create an XML parser.
+     */
+    static UXMLParser *createParser(UErrorCode &errorCode);
+    /**
+     * Destructor.
+     */
+    virtual ~UXMLParser();
+
+    /**
+     * Parse an XML document, create the entire document tree, and
+     * return a pointer to the root element of the parsed tree.
+     * The caller must delete the element.
+     */
+    UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
+    /**
+     * Parse an XML file, create the entire document tree, and
+     * return a pointer to the root element of the parsed tree.
+     * The caller must delete the element.
+     */
+    UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for the actual class.
+     */
+    virtual UClassID getDynamicClassID() const;
+
+    /**
+     * ICU "poor man's RTTI", returns a UClassID for this class.
+     */
+    static UClassID U_EXPORT2 getStaticClassID();
+
+private:
+    // prevent default construction etc.
+    UXMLParser();
+    UXMLParser(const UXMLParser &other);
+    UXMLParser &operator=(const UXMLParser &other);
+
+    // constructor
+    UXMLParser(UErrorCode &status);
+
+    void           parseMisc(UErrorCode &status);
+    UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
+    void           error(const char *message, UErrorCode &status);
+    UnicodeString  scanContent(UErrorCode &status);
+    void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
+
+    const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
+public:
+    // public for UXMLElement only
+    const UnicodeString *findName(const UnicodeString &s) const;
+private:
+
+    // There is one ICU regex matcher for each of the major XML syntax items
+    //  that are recognized.
+    RegexMatcher mXMLDecl;
+    RegexMatcher mXMLComment;
+    RegexMatcher mXMLSP;
+    RegexMatcher mXMLDoctype;
+    RegexMatcher mXMLPI;
+    RegexMatcher mXMLElemStart;
+    RegexMatcher mXMLElemEnd;
+    RegexMatcher mXMLElemEmpty;
+    RegexMatcher mXMLCharData;
+    RegexMatcher mAttrValue;
+    RegexMatcher mAttrNormalizer;
+    RegexMatcher mNewLineNormalizer;
+    RegexMatcher mAmps;
+
+    Hashtable             fNames;           // interned element/attribute name strings
+    UStack                fElementStack;    // Stack holds the parent elements when nested
+                                            //    elements are being parsed.  All items on this
+                                            //    stack are of type UXMLElement.
+    int32_t               fPos;             // String index of the current scan position in
+                                            //    xml source (in fSrc).
+    UnicodeString         fOneLF;
+};
+
+#endif