mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-4011 add tiny XML parser to toolutil library
X-SVN-Rev: 17485
This commit is contained in:
parent
59e030b4a5
commit
e37a743281
6 changed files with 1083 additions and 6 deletions
icu4c/source
allinone
common/unicode
tools/toolutil
|
@ -168,6 +168,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stubdata", "..\stubdata\stu
|
|||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "toolutil", "..\tools\toolutil\toolutil.vcproj", "{6B231032-3CB5-4EED-9210-810D666A23A0}"
|
||||
ProjectSection(ProjectDependencies) = postProject
|
||||
{0178B127-6269-407D-B112-93877BB62776} = {0178B127-6269-407D-B112-93877BB62776}
|
||||
{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
|
||||
EndProjectSection
|
||||
EndProject
|
||||
|
|
|
@ -356,6 +356,13 @@ typedef void* UClassID;
|
|||
* @stable ICU 2.0
|
||||
*/
|
||||
|
||||
/**
|
||||
* \def U_TOOLUTIL_API
|
||||
* Set to export library symbols from inside the toolutil library,
|
||||
* and to import them from outside.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
|
||||
#if defined(U_COMBINED_IMPLEMENTATION)
|
||||
#define U_DATA_API U_EXPORT
|
||||
#define U_COMMON_API U_EXPORT
|
||||
|
@ -363,6 +370,7 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API U_EXPORT
|
||||
#define U_LAYOUTEX_API U_EXPORT
|
||||
#define U_IO_API U_EXPORT
|
||||
#define U_TOOLUTIL_API U_EXPORT
|
||||
#elif defined(U_STATIC_IMPLEMENTATION)
|
||||
#define U_DATA_API
|
||||
#define U_COMMON_API
|
||||
|
@ -370,6 +378,7 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API
|
||||
#define U_LAYOUTEX_API
|
||||
#define U_IO_API
|
||||
#define U_TOOLUTIL_API
|
||||
#elif defined(U_COMMON_IMPLEMENTATION)
|
||||
#define U_DATA_API U_IMPORT
|
||||
#define U_COMMON_API U_EXPORT
|
||||
|
@ -377,6 +386,7 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API U_IMPORT
|
||||
#define U_LAYOUTEX_API U_IMPORT
|
||||
#define U_IO_API U_IMPORT
|
||||
#define U_TOOLUTIL_API U_IMPORT
|
||||
#elif defined(U_I18N_IMPLEMENTATION)
|
||||
#define U_DATA_API U_IMPORT
|
||||
#define U_COMMON_API U_IMPORT
|
||||
|
@ -384,6 +394,7 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API U_IMPORT
|
||||
#define U_LAYOUTEX_API U_IMPORT
|
||||
#define U_IO_API U_IMPORT
|
||||
#define U_TOOLUTIL_API U_IMPORT
|
||||
#elif defined(U_LAYOUT_IMPLEMENTATION)
|
||||
#define U_DATA_API U_IMPORT
|
||||
#define U_COMMON_API U_IMPORT
|
||||
|
@ -391,6 +402,7 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API U_EXPORT
|
||||
#define U_LAYOUTEX_API U_IMPORT
|
||||
#define U_IO_API U_IMPORT
|
||||
#define U_TOOLUTIL_API U_IMPORT
|
||||
#elif defined(U_LAYOUTEX_IMPLEMENTATION)
|
||||
#define U_DATA_API U_IMPORT
|
||||
#define U_COMMON_API U_IMPORT
|
||||
|
@ -398,6 +410,7 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API U_IMPORT
|
||||
#define U_LAYOUTEX_API U_EXPORT
|
||||
#define U_IO_API U_IMPORT
|
||||
#define U_TOOLUTIL_API U_IMPORT
|
||||
#elif defined(U_IO_IMPLEMENTATION)
|
||||
#define U_DATA_API U_IMPORT
|
||||
#define U_COMMON_API U_IMPORT
|
||||
|
@ -405,6 +418,15 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API U_IMPORT
|
||||
#define U_LAYOUTEX_API U_IMPORT
|
||||
#define U_IO_API U_EXPORT
|
||||
#define U_TOOLUTIL_API U_IMPORT
|
||||
#elif defined(U_TOOLUTIL_IMPLEMENTATION)
|
||||
#define U_DATA_API U_IMPORT
|
||||
#define U_COMMON_API U_IMPORT
|
||||
#define U_I18N_API U_IMPORT
|
||||
#define U_LAYOUT_API U_IMPORT
|
||||
#define U_LAYOUTEX_API U_IMPORT
|
||||
#define U_IO_API U_IMPORT
|
||||
#define U_TOOLUTIL_API U_EXPORT
|
||||
#else
|
||||
#define U_DATA_API U_IMPORT
|
||||
#define U_COMMON_API U_IMPORT
|
||||
|
@ -412,6 +434,7 @@ typedef void* UClassID;
|
|||
#define U_LAYOUT_API U_IMPORT
|
||||
#define U_LAYOUTEX_API U_IMPORT
|
||||
#define U_IO_API U_IMPORT
|
||||
#define U_TOOLUTIL_API U_IMPORT
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
|
|
@ -39,9 +39,10 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
|
|||
|
||||
LDFLAGS += $(LDFLAGSICUTOOLUTIL)
|
||||
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common $(LIBCPPFLAGS)
|
||||
LIBS = $(LIBICUUC) $(DEFAULT_LIBS)
|
||||
DEFS += -DU_TOOLUTIL_IMPLEMENTATION
|
||||
LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS)
|
||||
|
||||
OBJECTS = propsvec.o toolutil.o unewdata.o ucm.o ucmstate.o uoptions.o uparse.o ucbuf.o
|
||||
OBJECTS = propsvec.o toolutil.o unewdata.o ucm.o ucmstate.o uoptions.o uparse.o ucbuf.o xmlparser.o
|
||||
|
||||
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))
|
||||
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
ImproveFloatingPointConsistency="TRUE"
|
||||
AdditionalIncludeDirectories="..\..\common"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG"
|
||||
AdditionalIncludeDirectories="..\..\..\include,..\..\common"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;U_TOOLUTIL_IMPLEMENTATION"
|
||||
StringPooling="TRUE"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="TRUE"
|
||||
|
@ -86,8 +86,8 @@
|
|||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
ImproveFloatingPointConsistency="TRUE"
|
||||
AdditionalIncludeDirectories="..\..\common"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG"
|
||||
AdditionalIncludeDirectories="..\..\..\include,..\..\common"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG;U_TOOLUTIL_IMPLEMENTATION"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="3"
|
||||
BufferSecurityCheck="TRUE"
|
||||
|
@ -171,6 +171,9 @@
|
|||
<File
|
||||
RelativePath=".\uparse.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\xmlparser.cpp">
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Header Files"
|
||||
|
@ -196,6 +199,9 @@
|
|||
<File
|
||||
RelativePath=".\uparse.h">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\xmlparser.h">
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Resource Files"
|
||||
|
|
810
icu4c/source/tools/toolutil/xmlparser.cpp
Normal file
810
icu4c/source/tools/toolutil/xmlparser.cpp
Normal file
|
@ -0,0 +1,810 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2004-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: xmlparser.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2004jul21
|
||||
* created by: Andy Heninger
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "filestrm.h"
|
||||
#include "xmlparser.h"
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
|
||||
|
||||
// Numeric code points of the ASCII characters that the parser refers to
// by value (used below as UChar values and as an indexOf() argument).
enum {
    x_QUOT = 0x22,  // quotation mark
    x_AMP  = 0x26,  // ampersand
    x_APOS = 0x27,  // apostrophe
    x_LT   = 0x3c,  // less-than sign
    x_GT   = 0x3e,  // greater-than sign
    x_l    = 0x6c   // lower-case letter L
};
|
||||
|
||||
// Regex set matching one XML white space character:
// space, tab (U+0009), carriage return (U+000D) or line feed (U+000A).
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML production #4 (NameStartChar) as a regex set.
// The adjacent string literals are concatenated by the compiler
// into one pattern.
#define XML_NAMESTARTCHAR \
    "[" \
    "[A-Z]:_[a-z]" \
    "[\\u00c0-\\u00d6][\\u00d8-\\u00f6][\\u00f8-\\u02ff]" \
    "[\\u0370-\\u037d][\\u037F-\\u1FFF]" \
    "[\\u200C-\\u200D]" \
    "[\\u2070-\\u218F][\\u2C00-\\u2FEF]" \
    "[\\u3001-\\uD7FF]" \
    "[\\uF900-\\uFDCF][\\uFDF0-\\uFFFD]" \
    "[\\U00010000-\\U000EFFFF]" \
    "]"

// XML production #5 (NameChar): a name start character, or one of
// '-', '.', a digit, U+00B7, or the listed combining/extender ranges.
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR \
    "\\-.[0-9]\\u00b7" \
    "[\\u0300-\\u036f][\\u203f-\\u2040]" \
    "]"

// XML production #6 (Name): one name start character followed by
// zero or more name characters.
#define XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
|
||||
|
||||
//
|
||||
// UXMLParser constructor. Mostly just initializes the ICU regexes that are
|
||||
// used for parsing.
|
||||
//
|
||||
UXMLParser::UXMLParser(UErrorCode &status) :
|
||||
// XML Declaration. XML Production #23.
|
||||
// example: "<?xml version=1.0 encoding="utf-16" ?>
|
||||
// This is a sloppy implementation - just look for the leading <?xml and the closing ?>
|
||||
// allow for a possible leading BOM.
|
||||
mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
|
||||
|
||||
// XML Comment production #15
|
||||
// example: "<!-- whatever -->
|
||||
// note, does not detect an illegal "--" within comments
|
||||
mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
|
||||
|
||||
// XML Spaces
|
||||
// production [3]
|
||||
mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
|
||||
|
||||
// XML Doctype decl production #28
|
||||
// example "<!DOCTYPE foo SYSTEM "somewhere" >
|
||||
// TODO: we don't actually parse the DOCTYPE or internal subsets.
|
||||
// Some internal dtd subsets could confuse this simple-minded
|
||||
// attempt at skipping over them.
|
||||
mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>", -1, US_INV), 0, status),
|
||||
|
||||
// XML PI production #16
|
||||
// example "<?target stuff?>
|
||||
mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
|
||||
|
||||
// XML Element Start Productions #40, #41
|
||||
// example <foo att1='abc' att2="d e f" >
|
||||
// capture #1: the tag name
|
||||
//
|
||||
mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
|
||||
"(?:"
|
||||
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
|
||||
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
|
||||
")*" // * for zero or more attributes.
|
||||
XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
|
||||
|
||||
// XML Element End production #42
|
||||
// example </foo>
|
||||
mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
|
||||
|
||||
// XML Element Empty production #44
|
||||
// example <foo att1="abc" att2="d e f" />
|
||||
mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
|
||||
"(?:"
|
||||
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
|
||||
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
|
||||
")*" // * for zero or more attributes.
|
||||
XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
|
||||
|
||||
|
||||
// XMLCharData. Everything but '<'. Note that & will be dealt with later.
|
||||
mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
|
||||
|
||||
// Attribute name = "value". XML Productions 10, 40/41
|
||||
// Capture group 1 is name,
|
||||
// 2 is the attribute value, including the quotes.
|
||||
//
|
||||
// Note that attributes are scanned twice. The first time is with
|
||||
// the regex for an entire element start. There, the attributes
|
||||
// are checked syntactically, but not separted out one by one.
|
||||
// Here, we match a single attribute, and make its name and
|
||||
// attribute value available to the parser code.
|
||||
mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
|
||||
"((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
|
||||
|
||||
|
||||
mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
|
||||
|
||||
// Match any of the new-line sequences in content.
|
||||
// All are changed to \u000a.
|
||||
mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
|
||||
|
||||
// & char references
|
||||
// We will figure out what we've got based on which capture group has content.
|
||||
// The last one is a catchall for unrecognized entity references..
|
||||
// 1 2 3 4 5 6 7 8
|
||||
mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))", -1, US_INV),
|
||||
0, status),
|
||||
|
||||
fNames(status),
|
||||
fElementStack(status),
|
||||
fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization.
|
||||
{
|
||||
};
|
||||
|
||||
/**
 * Factory function: allocate and initialize a new parser.
 *
 * @param errorCode  ICU error code. Must not indicate a failure on entry.
 *                   Set to U_MEMORY_ALLOCATION_ERROR if the parser object
 *                   cannot be allocated, or to whatever error the
 *                   constructor reports.
 * @return a new parser owned by the caller, or NULL on any failure.
 */
UXMLParser *
UXMLParser::createParser(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return NULL;
    }

    UXMLParser *parser = new UXMLParser(errorCode);
    if (parser == NULL) {
        // the constructor did not run, so errorCode is still a success code
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(errorCode)) {
        // Construction failed part-way (e.g. a regex did not compile);
        // do not hand out an unusable parser.
        delete parser;
        return NULL;
    }
    return parser;
}
|
||||
|
||||
// Destructor. The body is intentionally empty: no explicit cleanup is
// performed here.
UXMLParser::~UXMLParser() {
}
|
||||
|
||||
/**
 * Read a file, convert its contents to Unicode and parse it as XML.
 *
 * The charset is determined in this order:
 *   1. a Unicode signature (BOM) at the start of the file,
 *   2. the encoding="..." pseudo-attribute of the XML declaration, found by
 *      decoding the first block of the file as ISO-8859-1,
 *   3. UTF-8 as the default.
 *
 * @param filename   path of the file to read (opened in binary mode)
 * @param errorCode  ICU error code; must not indicate a failure on entry.
 *                   Set to U_FILE_ACCESS_ERROR if the file cannot be opened,
 *                   U_MEMORY_ALLOCATION_ERROR if string capacity cannot be
 *                   reserved, or to a converter/parser error.
 * @return the root element as returned by parse(), or NULL on failure.
 */
UXMLElement *
UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    char bytes[4096], charsetBuffer[100];
    FileStream *f;
    const char *charset, *pb;
    UnicodeString src;
    // Always either NULL or an open converter, so that the single
    // ucnv_close() at "exit" is safe on every path.
    UConverter *cnv=NULL;
    UChar *buffer, *pu;
    int32_t fileLength, bytesLength, length, capacity;
    UBool flush;

    if(U_FAILURE(errorCode)) {
        return NULL;
    }

    f=T_FileStream_open(filename, "rb");
    if(f==NULL) {
        errorCode=U_FILE_ACCESS_ERROR;
        return NULL;
    }

    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
    if(bytesLength<(int32_t)sizeof(bytes)) {
        // we have already read the entire file
        fileLength=bytesLength;
    } else {
        // get the file length
        fileLength=T_FileStream_size(f);
    }

    /*
     * get the charset:
     * 1. Unicode signature
     * 2. treat as ISO-8859-1 and read XML encoding="charset"
     * 3. default to UTF-8
     */
    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
    if(U_SUCCESS(errorCode) && charset!=NULL) {
        // open converter according to Unicode signature
        cnv=ucnv_open(charset, &errorCode);
    } else {
        // read as Latin-1 and parse the XML declaration and encoding
        cnv=ucnv_open("ISO-8859-1", &errorCode);
        if(U_FAILURE(errorCode)) {
            // unexpected error opening Latin-1 converter
            goto exit;
        }

        buffer=src.getBuffer(bytesLength);
        if(buffer==NULL) {
            // unexpected failure to reserve some string capacity
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            goto exit;
        }
        pb=bytes;
        pu=buffer;
        // Convert only the bytes that were actually read; the rest of
        // bytes[] is uninitialized when the file is shorter than the buffer.
        ucnv_toUnicode(
            cnv,
            &pu, buffer+src.getCapacity(),
            &pb, bytes+bytesLength,
            NULL, TRUE, &errorCode);
        src.releaseBuffer((int32_t)(pu-buffer));
        ucnv_close(cnv);
        cnv=NULL;   // closed; must not be closed again or reused below
        if(U_FAILURE(errorCode)) {
            // unexpected error in conversion from Latin-1
            src.remove();
            goto exit;
        }

        // parse XML declaration
        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
            int32_t declEnd=mXMLDecl.end(errorCode);
            // go beyond <?xml
            int32_t pos=src.indexOf(x_l)+1;

            mAttrValue.reset(src);
            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per pseudo-attribute of the declaration.
                UnicodeString attName  = mAttrValue.group(1, errorCode);
                UnicodeString attValue = mAttrValue.group(2, errorCode);

                // Trim the quotes from the att value.  These are left over from the original regex
                //   that parsed the attribute, which couldn't conveniently strip them.
                attValue.remove(0,1);                    // one char from the beginning
                attValue.truncate(attValue.length()-1);  // and one from the end.

                if(attName==UNICODE_STRING("encoding", 8)) {
                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer), US_INV);
                    // Make sure the name is terminated even if it was too
                    // long for the buffer; such a name will simply fail
                    // to open a converter below.
                    charsetBuffer[sizeof(charsetBuffer)-1]=0;
                    charset=charsetBuffer;
                    break;
                }
                pos = mAttrValue.end(2, errorCode);
            }
        }

        // This also covers files without an XML declaration at all.
        if(charset==NULL) {
            // default to UTF-8
            charset="UTF-8";
        }
        cnv=ucnv_open(charset, &errorCode);
    }

    if(U_FAILURE(errorCode)) {
        // unable to open the converter
        goto exit;
    }

    // convert the file contents
    capacity=fileLength;        // estimated capacity
    src.getBuffer(capacity);
    src.releaseBuffer(0);       // zero length
    flush=FALSE;
    for(;;) {
        // convert contents of bytes[bytesLength]
        pb=bytes;
        for(;;) {
            length=src.length();
            buffer=src.getBuffer(capacity);
            if(buffer==NULL) {
                // unexpected failure to reserve some string capacity
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                goto exit;
            }

            pu=buffer+length;
            // flush is TRUE only for the final call, after end of file,
            // so that the converter can emit or reject pending input.
            ucnv_toUnicode(
                cnv, &pu, buffer+src.getCapacity(),
                &pb, bytes+bytesLength,
                NULL, flush, &errorCode);
            src.releaseBuffer((int32_t)(pu-buffer));
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                errorCode=U_ZERO_ERROR;
                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
            } else {
                break;
            }
        }

        if(U_FAILURE(errorCode)) {
            break; // conversion error
        }

        if(flush) {
            break; // completely converted the file
        }

        // read next block
        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
        if(bytesLength==0) {
            // reached end of file, convert once more to flush the converter
            flush=TRUE;
        }
    }

exit:
    ucnv_close(cnv);
    T_FileStream_close(f);

    if(U_SUCCESS(errorCode)) {
        return parse(src, errorCode);
    } else {
        return NULL;
    }
}
|
||||
|
||||
/**
 * Parse an XML document held in a string and build its element tree.
 *
 * The document is scanned with the regex matchers created in the
 * constructor. Nested elements are tracked with an explicit element stack;
 * there is no recursion. Error messages are written by error().
 *
 * @param src     the complete document text; must stay valid and unchanged
 *                for the duration of the call (the matchers refer to it)
 * @param status  ICU error code; must not indicate a failure on entry.
 *                Set to U_PARSE_ERROR by error() for malformed input.
 * @return the root element, owned by the caller, or NULL on failure.
 */
UXMLElement *
UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return NULL;
    }

    UXMLElement   *root = NULL;
    fPos = 0; // TODO use just a local pos variable and pass it into functions
              // where necessary?

    // Start from an empty element stack. An earlier parse that stopped on
    // an error may have left entries that point to deleted elements.
    while (!fElementStack.empty()) {
        fElementStack.pop();
    }

    // set all matchers to work on the input string
    mXMLDecl.reset(src);
    mXMLComment.reset(src);
    mXMLSP.reset(src);
    mXMLDoctype.reset(src);
    mXMLPI.reset(src);
    mXMLElemStart.reset(src);
    mXMLElemEnd.reset(src);
    mXMLElemEmpty.reset(src);
    mXMLCharData.reset(src);
    mAttrValue.reset(src);
    mAttrNormalizer.reset(src);
    mNewLineNormalizer.reset(src);
    mAmps.reset(src);

    // Consume the XML Declaration, if present.
    if (mXMLDecl.lookingAt(fPos, status)) {
        fPos = mXMLDecl.end(status);
    }

    // Consume "misc" [XML production 27] appearing before DocType
    parseMisc(status);

    // Consume a DocType declaration, if present.
    if (mXMLDoctype.lookingAt(fPos, status)) {
        fPos = mXMLDoctype.end(status);
    }

    // Consume additional "misc" [XML production 27] appearing after the DocType
    parseMisc(status);

    // Get the root element
    if (mXMLElemEmpty.lookingAt(fPos, status)) {
        // Root is an empty element (no nested elements or content)
        root = createElement(mXMLElemEmpty, status);
        fPos = mXMLElemEmpty.end(status);
        if (root == NULL) {
            goto errorExit;
        }
    } else {
        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
            error("Root Element expected", status);
            goto errorExit;
        }
        root = createElement(mXMLElemStart, status);
        if (root == NULL) {
            goto errorExit;
        }
        UXMLElement  *el = root;

        //
        // This is the loop that consumes the root element of the document,
        //      including all nested content.   Nested elements are handled by
        //      explicit pushes/pops of the element stack; there is no recursion
        //      in the control flow of this code.
        //      "el" always refers to the current element, the one to which content
        //      is being added.  It is above the top of the element stack.
        for (;;) {
            // Nested Element Start
            if (mXMLElemStart.lookingAt(fPos, status)) {
                UXMLElement *t = createElement(mXMLElemStart, status);
                if (t == NULL) {
                    goto errorExit;
                }
                el->fChildren.addElement(t, status);
                t->fParent = el;
                fElementStack.push(el, status);
                el = t;
                continue;
            }

            // Text Content.  String is concatenated onto the current node's content,
            //                but only if it contains something other than spaces.
            UnicodeString s = scanContent(status);
            if (s.length() > 0) {
                mXMLSP.reset(s);
                if (mXMLSP.matches(status) == FALSE) {
                    // This chunk of text contains something other than just
                    //  white space. Make a child node for it.
                    replaceCharRefs(s, status);
                    el->fChildren.addElement(s.clone(), status);
                }
                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
                continue;
            }

            // Comments.  Discard.
            if (mXMLComment.lookingAt(fPos, status)) {
                fPos = mXMLComment.end(status);
                continue;
            }

            // PIs.  Discard.
            if (mXMLPI.lookingAt(fPos, status)) {
                fPos = mXMLPI.end(status);
                continue;
            }

            // Element End
            if (mXMLElemEnd.lookingAt(fPos, status)) {
                fPos = mXMLElemEnd.end(0, status);
                const UnicodeString name = mXMLElemEnd.group(1, status);
                if (name != *el->fName) {
                    error("Element start / end tag mismatch", status);
                    goto errorExit;
                }
                if (fElementStack.empty()) {
                    // Close of the root element.  We're done with the doc.
                    break;
                }
                el = (UXMLElement *)fElementStack.pop();
                continue;
            }

            // Empty Element.  Stored as a child of the current element, but not stacked.
            if (mXMLElemEmpty.lookingAt(fPos, status)) {
                UXMLElement *t = createElement(mXMLElemEmpty, status);
                if (t == NULL) {
                    goto errorExit;
                }
                el->fChildren.addElement(t, status);
                // record the parent, same as for elements with content
                t->fParent = el;
                continue;
            }

            // Hit something within the document that doesn't match anything.
            // It's an error. Report it once and give up on the document.
            error("Unrecognized markup", status);
            goto errorExit;
        }
    }

    // Root Element parse is complete.
    // Consume the annoying xml "Misc" that can appear at the end of the doc.
    parseMisc(status);

    // A failure that was not reported through error() (for example a failed
    // container operation) must not hand back a partial tree.
    if (U_FAILURE(status)) {
        goto errorExit;
    }

    // We should have reached the end of the input
    if (fPos != src.length()) {
        error("Extra content at the end of the document", status);
        goto errorExit;
    }

    // Success!
    return root;

errorExit:
    // The only way to get here with a success code is a NULL element
    // from createElement(), i.e. a failed allocation.
    if (U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    // The stacked elements are owned by the tree under root; just forget them.
    while (!fElementStack.empty()) {
        fElementStack.pop();
    }
    delete root;
    return NULL;
}
|
||||
|
||||
//
|
||||
// createElement
|
||||
// We've just matched an element start tag. Create and fill in a UXMLElement object
|
||||
// for it.
|
||||
//
|
||||
UXMLElement *
|
||||
UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {
|
||||
// First capture group is the element's name.
|
||||
UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
|
||||
|
||||
// Scan for attributes.
|
||||
int32_t pos = mEl.end(1, status); // The position after the end of the tag name
|
||||
|
||||
while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element.
|
||||
UnicodeString attName = mAttrValue.group(1, status);
|
||||
UnicodeString attValue = mAttrValue.group(2, status);
|
||||
|
||||
// Trim the quotes from the att value. These are left over from the original regex
|
||||
// that parsed the attribue, which couldn't conveniently strip them.
|
||||
attValue.remove(0,1); // one char from the beginning
|
||||
attValue.truncate(attValue.length()-1); // and one from the end.
|
||||
|
||||
// XML Attribue value normalization.
|
||||
// This is one of the really screwy parts of the XML spec.
|
||||
// See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
|
||||
// Note that non-validating parsers must treat all entities as type CDATA
|
||||
// which simplifies things some.
|
||||
|
||||
// Att normalization step 1: normalize any newlines in the attribute value
|
||||
mNewLineNormalizer.reset(attValue);
|
||||
attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
|
||||
|
||||
// Next change all xml white space chars to plain \u0020 spaces.
|
||||
mAttrNormalizer.reset(attValue);
|
||||
UnicodeString oneSpace((UChar)0x0020);
|
||||
attValue = mAttrNormalizer.replaceAll(oneSpace, status);
|
||||
|
||||
// Replace character entities.
|
||||
replaceCharRefs(attValue, status);
|
||||
|
||||
// Save the attribute name and value in our document structure.
|
||||
el->fAttNames.addElement((void *)intern(attName, status), status);
|
||||
el->fAttValues.addElement(attValue.clone(), status);
|
||||
pos = mAttrValue.end(2, status);
|
||||
}
|
||||
fPos = mEl.end(0, status);
|
||||
return el;
|
||||
}
|
||||
|
||||
//
|
||||
// parseMisc
|
||||
// Consume XML "Misc" [production #27]
|
||||
// which is any combination of space, PI and comments
|
||||
// Need to watch end-of-input because xml MISC stuff is allowed after
|
||||
// the document element, so we WILL scan off the end in this function
|
||||
//
|
||||
void
|
||||
UXMLParser::parseMisc(UErrorCode &status) {
|
||||
for (;;) {
|
||||
if (fPos >= mXMLPI.input().length()) {
|
||||
break;
|
||||
}
|
||||
if (mXMLPI.lookingAt(fPos, status)) {
|
||||
fPos = mXMLPI.end(status);
|
||||
continue;
|
||||
}
|
||||
if (mXMLSP.lookingAt(fPos, status)) {
|
||||
fPos = mXMLSP.end(status);
|
||||
continue;
|
||||
}
|
||||
if (mXMLComment.lookingAt(fPos, status)) {
|
||||
fPos = mXMLComment.end(status);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Scan for document content.
|
||||
//
|
||||
UnicodeString
|
||||
UXMLParser::scanContent(UErrorCode &status) {
|
||||
UnicodeString result;
|
||||
if (mXMLCharData.lookingAt(fPos, status)) {
|
||||
result = mXMLCharData.group(0, status);
|
||||
// Normalize the new-lines. (Before char ref substitution)
|
||||
mNewLineNormalizer.reset(result);
|
||||
result = mNewLineNormalizer.replaceAll(fOneLF, status);
|
||||
|
||||
// TODO: handle CDATA
|
||||
fPos = mXMLCharData.end(0, status);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
//
|
||||
// replaceCharRefs
|
||||
//
|
||||
// replace the char entities < & { ካ etc. in a string
|
||||
// with the corresponding actual character.
|
||||
//
|
||||
void
|
||||
UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
|
||||
UnicodeString result;
|
||||
UnicodeString replacement;
|
||||
int i;
|
||||
|
||||
mAmps.reset(s);
|
||||
// See the initialization for the regex matcher mAmps.
|
||||
// Which entity we've matched is determined by which capture group has content,
|
||||
// which is flaged by start() of that group not being -1.
|
||||
while (mAmps.find()) {
|
||||
if (mAmps.start(1, status) != -1) {
|
||||
replacement.setTo((UChar)x_AMP);
|
||||
} else if (mAmps.start(2, status) != -1) {
|
||||
replacement.setTo((UChar)x_LT);
|
||||
} else if (mAmps.start(3, status) != -1) {
|
||||
replacement.setTo((UChar)x_GT);
|
||||
} else if (mAmps.start(4, status) != -1) {
|
||||
replacement.setTo((UChar)x_APOS);
|
||||
} else if (mAmps.start(5, status) != -1) {
|
||||
replacement.setTo((UChar)x_QUOT);
|
||||
} else if (mAmps.start(6, status) != -1) {
|
||||
UnicodeString hexString = mAmps.group(6, status);
|
||||
UChar32 val = 0;
|
||||
for (i=0; i<hexString.length(); i++) {
|
||||
val = (val << 4) + u_digit(hexString.charAt(i), 16);
|
||||
}
|
||||
// TODO: some verification that the character is valid
|
||||
replacement.setTo(val);
|
||||
} else if (mAmps.start(7, status) != -1) {
|
||||
UnicodeString decimalString = mAmps.group(7, status);
|
||||
UChar32 val = 0;
|
||||
for (i=0; i<decimalString.length(); i++) {
|
||||
val = val*10 + u_digit(decimalString.charAt(i), 10);
|
||||
}
|
||||
// TODO: some verification that the character is valid
|
||||
replacement.setTo(val);
|
||||
} else {
|
||||
// An unrecognized &entity; Leave it alone.
|
||||
// TODO: check that it really looks like an entity, and is not some
|
||||
// random & in the text.
|
||||
replacement = mAmps.group(0, status);
|
||||
}
|
||||
mAmps.appendReplacement(result, replacement, status);
|
||||
}
|
||||
mAmps.appendTail(result);
|
||||
s = result;
|
||||
}
|
||||
|
||||
void
|
||||
UXMLParser::error(const char *message, UErrorCode &status) {
|
||||
// TODO: something better here...
|
||||
const UnicodeString &src=mXMLDecl.input();
|
||||
int line = 0;
|
||||
int ci = 0;
|
||||
while (ci < fPos && ci>=0) {
|
||||
ci = src.indexOf((UChar)0x0a, ci+1);
|
||||
line++;
|
||||
}
|
||||
fprintf(stderr, "Error: %s at line %d\n", message, line);
|
||||
if (U_SUCCESS(status)) {
|
||||
status = U_PARSE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
// intern strings like in Java
|
||||
|
||||
const UnicodeString *
|
||||
UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
|
||||
const UHashElement *he=fNames.find(s);
|
||||
if(he!=NULL) {
|
||||
// already a known name, return its hashed key pointer
|
||||
return (const UnicodeString *)he->key.pointer;
|
||||
} else {
|
||||
// add this new name and return its hashed key pointer
|
||||
fNames.puti(s, 0, errorCode);
|
||||
he=fNames.find(s);
|
||||
return (const UnicodeString *)he->key.pointer;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Look up a name among the interned names without adding it.
 *
 * @param s  the name to look for
 * @return the interned string (the hash table key) if the name is known,
 *         otherwise NULL.
 */
const UnicodeString *
UXMLParser::findName(const UnicodeString &s) const {
    const UHashElement *entry=fNames.find(s);
    if(entry==NULL) {
        return NULL;    // unknown name
    }
    return (const UnicodeString *)entry->key.pointer;
}
|
||||
|
||||
// UXMLElement ------------------------------------------------------------- ***
|
||||
|
||||
/**
 * Create an element without attributes, children or parent.
 *
 * @param parser     the parser that creates this element; stored, not owned
 * @param name       the element's tag name; the pointer is stored as is
 * @param errorCode  ICU error code, passed to the attribute and child
 *                   containers
 */
UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode)
    : fParser(parser),
      fName(name),
      fAttNames(errorCode),
      fAttValues(errorCode),
      fChildren(errorCode),
      fParent(NULL) {
}
|
||||
|
||||
/**
 * Destructor. Deletes the attribute values and the child nodes, each from
 * last to first.
 */
UXMLElement::~UXMLElement() {
    int remaining;

    // attribute names are owned by the UXMLParser, don't delete them here
    remaining=fAttValues.size();
    while(remaining>0) {
        --remaining;
        delete (UObject *)fAttValues.elementAt(remaining);
    }

    remaining=fChildren.size();
    while(remaining>0) {
        --remaining;
        delete (UObject *)fChildren.elementAt(remaining);
    }
}
|
||||
|
||||
/**
 * @return a reference to this element's tag name string.
 */
const UnicodeString &
UXMLElement::getTagName() const {
    const UnicodeString &tagName=*fName;
    return tagName;
}
|
||||
|
||||
/**
 * Collect the text content of this element into a new string.
 *
 * @param recurse  if TRUE, the text of nested elements is included as well
 * @return the concatenated text, as produced by appendText()
 */
UnicodeString
UXMLElement::getText(UBool recurse) const {
    UnicodeString collected;
    appendText(collected, recurse);
    return collected;
}
|
||||
|
||||
void
|
||||
UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
|
||||
const UObject *node;
|
||||
int32_t i, count=fChildren.size();
|
||||
for(i=0; i<count; ++i) {
|
||||
node=(const UObject *)fChildren.elementAt(i);
|
||||
if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
|
||||
text.append(*(const UnicodeString *)node);
|
||||
} else if(recurse) /* must be a UXMLElement */ {
|
||||
((const UXMLElement *)node)->appendText(text, recurse);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * @return the number of attributes of this element.
 */
int32_t
UXMLElement::countAttributes() const {
    const int32_t attributeCount=fAttNames.size();
    return attributeCount;
}
|
||||
|
||||
/**
 * Get the i-th attribute of this element.
 *
 * @param i      attribute index, 0..countAttributes()-1
 * @param name   output: set to the attribute name
 * @param value  output: set to the attribute value
 * @return &value if i is a valid index; otherwise NULL, and name and
 *         value are left unchanged.
 */
const UnicodeString *
UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
    if(i<0 || fAttNames.size()<=i) {
        return NULL;    // index out of range
    }
    name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
    value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
    return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
}
|
||||
|
||||
/**
 * Get the value of the attribute with the given name.
 *
 * The lookup goes through the parser's interned names: the name is
 * resolved to its interned pointer once, and attribute names are then
 * compared by pointer rather than by string contents.
 *
 * @param name  the attribute name
 * @return the attribute value stored in this element, or NULL if this
 *         element has no such attribute.
 */
const UnicodeString *
UXMLElement::getAttribute(const UnicodeString &name) const {
    const UnicodeString *interned=fParser->findName(name);
    if(interned==NULL) {
        return NULL; // no such attribute seen by the parser at all
    }

    const int32_t attributeCount=fAttNames.size();
    int32_t idx=0;
    while(idx<attributeCount) {
        if((const UnicodeString *)fAttNames.elementAt(idx)==interned) {
            return (const UnicodeString *)fAttValues.elementAt(idx);
        }
        ++idx;
    }
    return NULL;
}
|
||||
|
||||
/**
 * @return the number of child nodes (text nodes and elements) of this
 *         element.
 */
int32_t
UXMLElement::countChildren() const {
    const int32_t childCount=fChildren.size();
    return childCount;
}
|
||||
|
||||
/*
 * Get the i-th child node and report its node type through type.
 * Returns NULL, leaving type untouched, if i is out of bounds.
 */
const UObject *
UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
    if(i<0 || i>=fChildren.size()) {
        return NULL;
    }

    const UObject *child=static_cast<const UObject *>(fChildren.elementAt(i));
    // every child that is not an element is reported as a string node
    type= child->getDynamicClassID()==UXMLElement::getStaticClassID() ?
            UXML_NODE_TYPE_ELEMENT :
            UXML_NODE_TYPE_STRING;
    return child;
}
|
||||
|
||||
/*
 * Enumerate the child elements of this element, skipping text nodes.
 * i is the enumeration index: it is advanced past every child node that
 * is examined, including the element that is returned, so that the next
 * call continues after it.
 * Returns NULL if i is negative or if there is no further child element.
 */
const UXMLElement *
UXMLElement::nextChildElement(int32_t &i) const {
    if(i<0) {
        return NULL;
    }

    const int32_t childCount=fChildren.size();
    while(i<childCount) {
        const UObject *candidate=static_cast<const UObject *>(fChildren.elementAt(i));
        ++i;
        // TODO: see if ICU can use standard C++ RTTI (dynamic_cast)
        // instead of its own poor man's RTTI
        if(candidate->getDynamicClassID()==UXMLElement::getStaticClassID()) {
            return static_cast<const UXMLElement *>(candidate);
        }
    }
    return NULL;
}
|
||||
|
||||
/*
 * Find the first immediate child element with the given tag name.
 * Element names are interned by the parser, so the lookup first maps
 * the requested name to its interned string and then compares pointers
 * rather than string contents.
 * Returns NULL if there is no such child element.
 */
const UXMLElement *
UXMLElement::getChildElement(const UnicodeString &name) const {
    const UnicodeString *interned=fParser->findName(name);
    if(interned==NULL) {
        // the parser has never seen this name, so no element can have it
        return NULL;
    }

    const int32_t childCount=fChildren.size();
    for(int32_t idx=0; idx<childCount; ++idx) {
        const UObject *child=static_cast<const UObject *>(fChildren.elementAt(idx));
        if(child->getDynamicClassID()!=UXMLElement::getStaticClassID()) {
            continue;  // skip text nodes
        }
        const UXMLElement *element=static_cast<const UXMLElement *>(child);
        if(element->fName==interned) {
            return element;
        }
    }
    return NULL;
}
|
236
icu4c/source/tools/toolutil/xmlparser.h
Normal file
236
icu4c/source/tools/toolutil/xmlparser.h
Normal file
|
@ -0,0 +1,236 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2004-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: xmlparser.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2004jul21
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
|
||||
* Not suitable for production use. Not supported.
|
||||
* Not conformant. Not efficient.
|
||||
* But very small.
|
||||
*/
|
||||
|
||||
#ifndef __XMLPARSER_H__
|
||||
#define __XMLPARSER_H__
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "uvector.h"
|
||||
#include "hash.h"
|
||||
|
||||
/**
 * Types of the child nodes of a UXMLElement,
 * as reported by UXMLElement::getChild().
 */
enum UXMLNodeType {
    /** Text contents; the node object is a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Nested element; the node object is a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    /** Number of node types; not a node type itself. */
    UXML_NODE_TYPE_COUNT
};
|
||||
|
||||
/**
|
||||
* This class represents an element node in a parsed XML tree.
|
||||
*/
|
||||
class U_TOOLUTIL_API UXMLElement : public UObject {
|
||||
public:
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~UXMLElement();
|
||||
|
||||
/**
|
||||
* Get the tag name of this element.
|
||||
*/
|
||||
const UnicodeString &getTagName() const;
|
||||
/**
|
||||
* Get the text contents of the element.
|
||||
* Append the contents of all text child nodes.
|
||||
* @param recurse If TRUE, also recursively appends the contents of all
|
||||
* text child nodes of element children.
|
||||
* @return The text contents.
|
||||
*/
|
||||
UnicodeString getText(UBool recurse) const;
|
||||
/**
|
||||
* Get the number of attributes.
|
||||
*/
|
||||
int32_t countAttributes() const;
|
||||
/**
|
||||
* Get the i-th attribute.
|
||||
* @param i Index of the attribute.
|
||||
* @param name Output parameter, receives the attribute name.
|
||||
* @param value Output parameter, receives the attribute value.
|
||||
* @return A pointer to the attribute value (may be &value or a pointer to an
|
||||
* internal string object), or NULL if i is out of bounds.
|
||||
*/
|
||||
const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
|
||||
/**
|
||||
* Get the value of the attribute with the given name.
|
||||
* @param name Attribute name to be looked up.
|
||||
* @return A pointer to the attribute value, or NULL if this element
|
||||
* does not have this attribute.
|
||||
*/
|
||||
const UnicodeString *getAttribute(const UnicodeString &name) const;
|
||||
/**
|
||||
* Get the number of child nodes.
|
||||
*/
|
||||
int32_t countChildren() const;
|
||||
/**
|
||||
* Get the i-th child node.
|
||||
* @param i Index of the child node.
|
||||
* @param type The child node type.
|
||||
* @return A pointer to the child node object, or NULL if i is out of bounds.
|
||||
*/
|
||||
const UObject *getChild(int32_t i, UXMLNodeType &type) const;
|
||||
/**
|
||||
* Get the next child element node, skipping non-element child nodes.
|
||||
* @param i Enumeration index; initialize to 0 before getting the first child element.
|
||||
* @return A pointer to the next child element, or NULL if there is none.
|
||||
*/
|
||||
const UXMLElement *nextChildElement(int32_t &i) const;
|
||||
/**
|
||||
* Get the immediate child element with the given name.
|
||||
* If there are multiple child elements with this name, then return
|
||||
* the first one.
|
||||
* @param name Element name to be looked up.
|
||||
* @return A pointer to the element node, or NULL if this element
|
||||
* does not have this immediate child element.
|
||||
*/
|
||||
const UXMLElement *getChildElement(const UnicodeString &name) const;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*/
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
|
||||
private:
|
||||
// prevent default construction etc.
|
||||
UXMLElement();
|
||||
UXMLElement(const UXMLElement &other);
|
||||
UXMLElement &operator=(const UXMLElement &other);
|
||||
|
||||
void appendText(UnicodeString &text, UBool recurse) const;
|
||||
|
||||
friend class UXMLParser;
|
||||
|
||||
UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
|
||||
|
||||
const UXMLParser *fParser;
|
||||
const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
|
||||
UnicodeString fContent; // The text content of this node. All element content is
|
||||
// concatenated even when there are intervening nested elements
|
||||
// (which doesn't happen with most xml files we care about)
|
||||
// Sections of content containing only white space are dropped,
|
||||
// which gets rid the bogus white space content from
|
||||
// elements which are primarily containers for nested elements.
|
||||
UVector fAttNames; // A vector containing the names of this element's attributes
|
||||
// The names are UnicodeString objects, owned by the UXMLParser.
|
||||
UVector fAttValues; // A vector containing the attribute values for
|
||||
// this element's attributes. The order is the same
|
||||
// as that of the attribute name vector.
|
||||
|
||||
UXMLElement *fParent; // A pointer to the parent element of this element.
|
||||
|
||||
UVector fChildren; // The child nodes of this element (a Vector)
|
||||
};
|
||||
|
||||
/**
|
||||
* A simple XML parser; it is neither efficient nor conformant and only useful for
|
||||
* restricted types of XML documents.
|
||||
*
|
||||
* The parse methods parse whole documents and return the parse trees via their
|
||||
* root elements.
|
||||
*/
|
||||
class U_TOOLUTIL_API UXMLParser : public UObject {
|
||||
public:
|
||||
/**
|
||||
* Create an XML parser.
|
||||
*/
|
||||
static UXMLParser *createParser(UErrorCode &errorCode);
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~UXMLParser();
|
||||
|
||||
/**
|
||||
* Parse an XML document, create the entire document tree, and
|
||||
* return a pointer to the root element of the parsed tree.
|
||||
* The caller must delete the element.
|
||||
*/
|
||||
UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
|
||||
/**
|
||||
* Parse an XML file, create the entire document tree, and
|
||||
* return a pointer to the root element of the parsed tree.
|
||||
* The caller must delete the element.
|
||||
*/
|
||||
UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*/
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
|
||||
private:
|
||||
// prevent default construction etc.
|
||||
UXMLParser();
|
||||
UXMLParser(const UXMLParser &other);
|
||||
UXMLParser &operator=(const UXMLParser &other);
|
||||
|
||||
// constructor
|
||||
UXMLParser(UErrorCode &status);
|
||||
|
||||
void parseMisc(UErrorCode &status);
|
||||
UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
|
||||
void error(const char *message, UErrorCode &status);
|
||||
UnicodeString scanContent(UErrorCode &status);
|
||||
void replaceCharRefs(UnicodeString &s, UErrorCode &status);
|
||||
|
||||
const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
|
||||
public:
|
||||
// public for UXMLElement only
|
||||
const UnicodeString *findName(const UnicodeString &s) const;
|
||||
private:
|
||||
|
||||
// There is one ICU regex matcher for each of the major XML syntax items
|
||||
// that are recognized.
|
||||
RegexMatcher mXMLDecl;
|
||||
RegexMatcher mXMLComment;
|
||||
RegexMatcher mXMLSP;
|
||||
RegexMatcher mXMLDoctype;
|
||||
RegexMatcher mXMLPI;
|
||||
RegexMatcher mXMLElemStart;
|
||||
RegexMatcher mXMLElemEnd;
|
||||
RegexMatcher mXMLElemEmpty;
|
||||
RegexMatcher mXMLCharData;
|
||||
RegexMatcher mAttrValue;
|
||||
RegexMatcher mAttrNormalizer;
|
||||
RegexMatcher mNewLineNormalizer;
|
||||
RegexMatcher mAmps;
|
||||
|
||||
Hashtable fNames; // interned element/attribute name strings
|
||||
UStack fElementStack; // Stack holds the parent elements when nested
|
||||
// elements are being parsed. All items on this
|
||||
// stack are of type UXMLElement.
|
||||
int32_t fPos; // String index of the current scan position in
|
||||
// xml source (in fSrc).
|
||||
UnicodeString fOneLF;
|
||||
};
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue