Introduced encoding_latin1 support (conversion on loading, conversion on saving, encoding name in declaration in document::save)

git-svn-id: http://pugixml.googlecode.com/svn/trunk@829 99668b35-9821-0410-8761-19e4c4f06640
This commit is contained in:
arseny.kapoulkine 2011-12-20 09:45:10 +00:00
parent 5a312a8ea8
commit a0769dfe38
6 changed files with 190 additions and 13 deletions

View file

@ -754,6 +754,27 @@ namespace
}
};
// Output policy that stores code points as single Latin-1 bytes.
// Anything that does not fit into one byte is replaced with '?'.
struct latin1_writer
{
	typedef uint8_t* value_type;

	// Store one code point: values 0..255 are written unchanged, larger values become '?'.
	// Returns the position just past the written byte.
	static value_type low(value_type result, uint32_t ch)
	{
		if (ch > 255)
			*result = static_cast<uint8_t>('?');
		else
			*result = static_cast<uint8_t>(ch);

		return result + 1;
	}

	// Always stores '?'; the code point itself is ignored.
	// Returns the position just past the written byte.
	static value_type high(value_type result, uint32_t ch)
	{
		(void)ch;

		*result++ = '?';

		return result;
	}
};
template <size_t size> struct wchar_selector;
template <> struct wchar_selector<2>
@ -904,6 +925,16 @@ namespace
return result;
}
// Passes each of the `size` input bytes, in order, to Traits::low as a code point
// and returns the output position produced after the last byte.
static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
{
	size_t remaining = size;

	while (remaining > 0)
	{
		result = Traits::low(result, *data);

		++data;
		--remaining;
	}

	return result;
}
};
template <typename T> inline void convert_utf_endian_swap(T* result, const T* data, size_t length)
@ -1172,6 +1203,27 @@ namespace
return true;
}
// Converts a latin1 buffer into a newly allocated char_t buffer (wchar_t variant).
// On success out_buffer/out_length describe the converted text and true is returned;
// false is returned when the allocation fails.
bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
{
	// one output unit per input byte
	out_length = size;

	// request at least one unit so that empty input still yields a buffer
	size_t alloc_units = out_length > 0 ? out_length : 1;

	out_buffer = static_cast<char_t*>(global_allocate(alloc_units * sizeof(char_t)));
	if (!out_buffer) return false;

	// widen every latin1 byte into the output
	const uint8_t* data = static_cast<const uint8_t*>(contents);

	wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
	wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_latin1_block(data, size, out_begin);

	// the decoder must have produced exactly the predicted amount of output
	assert(out_end == out_begin + out_length);
	(void)!out_end;

	return true;
}
bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
{
// get native encoding
@ -1206,6 +1258,9 @@ namespace
convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
}
// source encoding is latin1
if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
assert(!"Invalid encoding");
return false;
}
@ -1254,6 +1309,48 @@ namespace
return true;
}
// Returns the number of leading bytes in data[0..size) that are all <= 127,
// i.e. the index of the first byte above 127, or size if there is none.
size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
{
	size_t prefix = 0;

	while (prefix < size && data[prefix] <= 127)
		++prefix;

	return prefix;
}
// Converts a latin1 buffer to utf8 (char variant).
// Input made only of bytes <= 127 is handed to get_mutable_buffer unchanged; otherwise a new
// buffer is allocated, the 7-bit prefix is copied verbatim and the remainder is re-encoded.
// Returns false when the allocation fails.
bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
{
	const uint8_t* data = static_cast<const uint8_t*>(contents);

	// split input into a 7-bit prefix (identical in utf8) and the part that needs conversion
	size_t prefix_length = get_latin1_7bit_prefix_length(data, size);
	assert(prefix_length <= size);

	size_t postfix_length = size - prefix_length;
	const uint8_t* postfix = data + prefix_length;

	// nothing to convert: reuse the original contents
	if (postfix_length == 0)
	{
		return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
	}

	// counting pass: total utf8 length is the prefix plus the encoded remainder
	out_length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0);

	// request at least one unit from the allocator
	size_t alloc_units = out_length > 0 ? out_length : 1;

	out_buffer = static_cast<char_t*>(global_allocate(alloc_units * sizeof(char_t)));
	if (!out_buffer) return false;

	// writing pass: copy the prefix as-is, then encode the remainder right after it
	uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);

	memcpy(out_begin, data, prefix_length);

	uint8_t* out_end = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, out_begin + prefix_length);

	// both passes must agree on the output size
	assert(out_end == out_begin + out_length);
	(void)!out_end;

	return true;
}
bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
{
// fast path: no conversion required
@ -1279,6 +1376,9 @@ namespace
convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
}
// source encoding is latin1
if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
assert(!"Invalid encoding");
return false;
}
@ -2580,6 +2680,18 @@ namespace
return static_cast<size_t>(end - dest) * sizeof(uint32_t);
}
// convert to latin1
if (encoding == encoding_latin1)
{
uint8_t* dest = reinterpret_cast<uint8_t*>(result);
uint8_t* end = sizeof(wchar_t) == 2 ?
utf_decoder<latin1_writer>::decode_utf16_block(reinterpret_cast<const uint16_t*>(data), length, dest) :
utf_decoder<latin1_writer>::decode_utf32_block(reinterpret_cast<const uint32_t*>(data), length, dest);
return static_cast<size_t>(end - dest);
}
assert(!"Invalid encoding");
return 0;
}
@ -2632,6 +2744,14 @@ namespace
return static_cast<size_t>(end - dest) * sizeof(uint32_t);
}
if (encoding == encoding_latin1)
{
uint8_t* dest = reinterpret_cast<uint8_t*>(result);
uint8_t* end = utf_decoder<latin1_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
return static_cast<size_t>(end - dest);
}
assert(!"Invalid encoding");
return 0;
}
@ -2822,6 +2942,9 @@ namespace
writer.write("\xff\xfe\x00\x00", 4);
break;
case encoding_latin1:
break;
default:
assert(!"Invalid encoding");
}
@ -4806,7 +4929,9 @@ namespace pugi
if (!(flags & format_no_declaration) && !has_declaration(*this))
{
buffered_writer.write(PUGIXML_TEXT("<?xml version=\"1.0\"?>"));
buffered_writer.write(PUGIXML_TEXT("<?xml version=\"1.0\""));
if (encoding == encoding_latin1) buffered_writer.write(PUGIXML_TEXT(" encoding=\"ISO-8859-1\""));
buffered_writer.write('?', '>');
if (!(flags & format_raw)) buffered_writer.write('\n');
}

View file

@ -194,7 +194,8 @@ namespace pugi
encoding_utf32_le, // Little-endian UTF32
encoding_utf32_be, // Big-endian UTF32
encoding_utf32, // UTF32 with native endianness
encoding_wchar // The same encoding wchar_t has (either UTF16 or UTF32)
encoding_wchar, // The same encoding wchar_t has (either UTF16 or UTF32)
encoding_latin1
};
// Formatting flags

View file

@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><EXAMPLE><!--This is a comment with special chars: <äöü>--><ORDER version="1.0" xml:lang="de"><!--This is another comment with special chars: <äöü>--><HEADER><X_ORDER_ID>0000053535</X_ORDER_ID><CUSTOMER_ID>1010</CUSTOMER_ID><NAME_1>Müller</NAME_1><NAME_2>Jörg</NAME_2></HEADER><ENTRIES><ENTRY><ARTICLE>&lt;Test&gt;</ARTICLE><ENTRY_NO>10</ENTRY_NO></ENTRY><ENTRY><ARTICLE>&lt;Test 2&gt;</ARTICLE><ENTRY_NO>20</ENTRY_NO></ENTRY></ENTRIES><FOOTER><TEXT>This is a text.</TEXT></FOOTER></ORDER></EXAMPLE>

View file

@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><EXAMPLE><!--This is a comment with special chars: <äöü>--><ORDER version="1.0" xml:lang="de"><!--This is another comment with special chars: <äöü>--><HEADER><X_ORDER_ID>0000053535</X_ORDER_ID><CUSTOMER_ID>1010</CUSTOMER_ID><NAME_1>Müller</NAME_1><NAME_2>Jörg</NAME_2></HEADER><ENTRIES><ENTRY><ARTICLE>&lt;Test&gt;</ARTICLE><ENTRY_NO>10</ENTRY_NO></ENTRY><ENTRY><ARTICLE>&lt;Test 2&gt;</ARTICLE><ENTRY_NO>20</ENTRY_NO></ENTRY></ENTRIES><FOOTER><TEXT>This is a text.</TEXT></FOOTER></ORDER></EXAMPLE>

View file

@ -307,6 +307,7 @@ TEST_XML(document_save_bom, "<n/>")
CHECK(test_save_narrow(doc, flags, encoding_utf16_le, "\xff\xfe<\x00n\x00 \x00/\x00>\x00", 12));
CHECK(test_save_narrow(doc, flags, encoding_utf32_be, "\x00\x00\xfe\xff\x00\x00\x00<\x00\x00\x00n\x00\x00\x00 \x00\x00\x00/\x00\x00\x00>", 24));
CHECK(test_save_narrow(doc, flags, encoding_utf32_le, "\xff\xfe\x00\x00<\x00\x00\x00n\x00\x00\x00 \x00\x00\x00/\x00\x00\x00>\x00\x00\x00", 24));
CHECK(test_save_narrow(doc, flags, encoding_latin1, "<n />", 5));
// encodings synonyms
CHECK(save_narrow(doc, flags, encoding_utf16) == save_narrow(doc, flags, (is_little_endian() ? encoding_utf16_le : encoding_utf16_be)));
@ -371,6 +372,15 @@ TEST_XML(document_save_declaration_present_last, "<node/>")
CHECK(writer.as_string() == STR("<?xml version=\"1.0\"?>\n<node />\n<?xml encoding=\"utf8\"?>\n"));
}
// Saving a document that has no declaration of its own with encoding_latin1
// is expected to emit a declaration that names the encoding as ISO-8859-1.
TEST_XML(document_save_declaration_latin1, "<node/>")
{
xml_writer_string writer;
// default formatting, latin1 output
doc.save(writer, STR(""), pugi::format_default, encoding_latin1);
// the output is compared as narrow text against the expected declaration and node
CHECK(writer.as_narrow() == "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n<node />\n");
}
struct temp_file
{
char path[512];
@ -704,18 +714,19 @@ static bool load_file_in_memory(const char* path, char*& data, size_t& size)
return true;
}
// Describes one test document on disk together with its in-memory copy.
// Instances are initialized positionally, so the member order matters.
struct file_data_t
{
// path handed to load_file_in_memory
const char* path;
// encoding used when parsing this file and when saving for comparison against it
xml_encoding encoding;
// file contents filled in by load_file_in_memory; the tests release it with delete[]
char* data;
// size of data as reported by load_file_in_memory (presumably in bytes)
size_t size;
};
TEST(document_contents_preserve)
{
struct file_t
{
const char* path;
xml_encoding encoding;
char* data;
size_t size;
};
file_t files[] =
file_data_t files[] =
{
{"tests/data/utftest_utf16_be_clean.xml", encoding_utf16_be, 0, 0},
{"tests/data/utftest_utf16_le_clean.xml", encoding_utf16_le, 0, 0},
@ -751,6 +762,41 @@ TEST(document_contents_preserve)
}
}
// Round-trips between the utf8 and latin1 fixtures: loading any of them and saving in the
// encoding of any other must reproduce that other file exactly.
TEST(document_contents_preserve_latin1)
{
	// fixtures that are expected to hold the same document in different encodings
	file_data_t files[] =
	{
		{"tests/data/latintest_utf8.xml", encoding_utf8, 0, 0},
		{"tests/data/latintest_latin1.xml", encoding_latin1, 0, 0}
	};

	const unsigned int file_count = sizeof(files) / sizeof(files[0]);

	// read every fixture into memory
	for (unsigned int i = 0; i != file_count; ++i)
	{
		CHECK(load_file_in_memory(files[i].path, files[i].data, files[i].size));
	}

	// try every (source, destination) pair, including identical ones
	for (unsigned int src = 0; src != file_count; ++src)
	{
		for (unsigned int dst = 0; dst != file_count; ++dst)
		{
			xml_document doc;

			// keep comments, declaration and whitespace pcdata so nothing is lost on load
			CHECK(doc.load_buffer(files[src].data, files[src].size, parse_default | parse_ws_pcdata | parse_declaration | parse_comments, files[src].encoding));

			// raw output without an extra declaration has to match the destination file byte for byte
			CHECK(test_save_narrow(doc, format_raw | format_no_declaration | format_write_bom, files[dst].encoding, files[dst].data, files[dst].size));
		}
	}

	// release the loaded file contents
	for (unsigned int j = 0; j != file_count; ++j)
	{
		delete[] files[j].data;
	}
}
static bool test_parse_fail(const void* buffer, size_t size, xml_encoding encoding = encoding_utf8)
{
// copy buffer to heap (to enable out-of-bounds checks)
@ -811,7 +857,8 @@ TEST(document_load_buffer_empty)
encoding_utf32_le,
encoding_utf32_be,
encoding_utf32,
encoding_wchar
encoding_wchar,
encoding_latin1
};
char buffer[1];

View file

@ -189,6 +189,8 @@ TEST(write_encodings)
{
CHECK(v.size() == 10 && v[0] == '<' && v[1] == 0x54 && v[2] == 0xA2 && v[3] == 0x20AC && v[4] == 0xd852 && v[5] == 0xdf62 && v[6] == ' ' && v[7] == '/' && v[8] == '>' && v[9] == '\n');
}
CHECK(test_write_narrow(doc, format_default, encoding_latin1, "<\x54\xA2?? />\n", 9));
}
#ifdef PUGIXML_WCHAR_MODE