Escape TAB character in attribute values with &#9;
This change modifies the table entries for ctx_special_attr to treat TAB character as special, which makes the output code escape it. Before this change, trying to use TAB in an attribute value would output it verbatim; during subsequent parsing, pugixml - and other compliant parsers - would apply attribute-value normalization, turning the TAB into a space and losing the original value. Using &#9; fixes this; if an input document has &#9; in an attribute value, that gets unescaped into \t during parsing and escaped back into &#9; during output, which means we can now roundtrip values like this. Fixes #242.
This commit is contained in:
parent
7d2436ec2f
commit
aac75cd299
2 changed files with 17 additions and 3 deletions
|
@ -1861,7 +1861,7 @@ PUGI__NS_BEGIN
|
|||
enum chartypex_t
|
||||
{
|
||||
ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
|
||||
ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, "
|
||||
ctx_special_attr = 2, // Any symbol >= 0 and < 32, &, <, >, "
|
||||
ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _
|
||||
ctx_digit = 8, // 0-9
|
||||
ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
|
||||
|
@ -1869,7 +1869,7 @@ PUGI__NS_BEGIN
|
|||
|
||||
static const unsigned char chartypex_table[256] =
|
||||
{
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0-15
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31
|
||||
0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47
|
||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63
|
||||
|
|
|
@ -193,7 +193,21 @@ TEST_XML(write_escape, "<node attr=''>text</node>")
|
|||
doc.child(STR("node")).attribute(STR("attr")) = STR("<>'\"&\x04\r\n\t");
|
||||
doc.child(STR("node")).first_child().set_value(STR("<>'\"&\x04\r\n\t"));
|
||||
|
||||
CHECK_NODE(doc, STR("<node attr=\"&lt;&gt;'&quot;&amp;&#04;&#13;&#10;\t\">&lt;&gt;'\"&amp;&#04;\r\n\t</node>"));
|
||||
CHECK_NODE(doc, STR("<node attr=\"&lt;&gt;'&quot;&amp;&#04;&#13;&#10;&#09;\">&lt;&gt;'\"&amp;&#04;\r\n\t</node>"));
|
||||
}
|
||||
|
||||
TEST_XML(write_escape_roundtrip, "<node attr=''>text</node>")
|
||||
{
|
||||
doc.child(STR("node")).attribute(STR("attr")) = STR("<>'\"&\x04\r\n\t");
|
||||
doc.child(STR("node")).first_child().set_value(STR("<>'\"&\x04\r\n\t"));
|
||||
|
||||
std::string contents = write_narrow(doc, format_raw, encoding_utf8);
|
||||
|
||||
CHECK(doc.load_buffer(contents.c_str(), contents.size()));
|
||||
|
||||
// Note: this string is almost identical to the string from write_escape with the exception of \r
|
||||
// \r in PCDATA doesn't roundtrip because it has to go through newline conversion (which could be disabled, but is active by default)
|
||||
CHECK_NODE(doc, STR("<node attr=\"&lt;&gt;'&quot;&amp;&#04;&#13;&#10;&#09;\">&lt;&gt;'\"&amp;&#04;\n\t</node>"));
|
||||
}
|
||||
|
||||
TEST_XML(write_escape_unicode, "<node attr='&#x3c00;'/>")
|
||||
|
|
Loading…
Add table
Reference in a new issue