mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-09 15:27:38 +00:00
ICU-3944 Text Access, new tests, bugs fixed
X-SVN-Rev: 18106
This commit is contained in:
parent
c4d57fd411
commit
091627dceb
3 changed files with 312 additions and 22 deletions
|
@ -436,6 +436,8 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
|
|||
if (spaceRequired>0) {
|
||||
ut->extraSize = extraSpace;
|
||||
ut->pExtra = &((ExtendedUText *)ut)->extension;
|
||||
uprv_memset(ut->pExtra, 0, extraSpace); // Purify whines about copying untouched extra [buffer]
|
||||
// space when cloning, so init it now.
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -467,6 +469,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
|
|||
} else {
|
||||
ut->extraSize = extraSpace;
|
||||
ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
|
||||
uprv_memset(ut->pExtra, 0, extraSpace);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -613,7 +616,10 @@ U_CDECL_END
|
|||
// UText implementation for UTF-8 strings (read-only)
|
||||
//
|
||||
// Use of UText data members:
|
||||
// context pointer to UTF-8 string
|
||||
// context pointer to UTF-8 string
|
||||
// utext.b is the input string length (bytes).
|
||||
// utext.p pointer to allocated utf-8 string if owned by this utext (after a clone)
|
||||
// utext.q pointer to the filled part of the Map array.
|
||||
//
|
||||
// TODO: make creation of the index mapping array lazy.
|
||||
// Create it for a chunk the first time the user asks for an index.
|
||||
|
@ -638,9 +644,6 @@ struct UTF8Extra {
|
|||
int32_t map[UTF8_TEXT_CHUNK_SIZE+2];
|
||||
};
|
||||
|
||||
// utext.b is the input string length (bytes).
|
||||
// utext.q pointer to the filled part of the Map array.
|
||||
//
|
||||
// because backwards iteration fills the buffers starting at the end and
|
||||
// working towards the front, the filled part of the buffers may not begin
|
||||
// at the start of the available storage for the buffers.
|
||||
|
@ -679,12 +682,12 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
chunk->nativeStart=index;
|
||||
c=s8[index];
|
||||
if(c<=0x7f) {
|
||||
// get a run of ASCII characters.
|
||||
// Even if we don't fill the buffer, we will stop with the first
|
||||
// non-ascii char, so that the buffer can use utf-16 indexing.
|
||||
chunk->nativeStart=index;
|
||||
u16buf[0]=(UChar)c;
|
||||
for(i=1, ++index;
|
||||
i<UTF8_TEXT_CHUNK_SIZE && index<length && (c=s8[index])<=0x7f;
|
||||
|
@ -696,6 +699,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
|
|||
} else {
|
||||
// get a chunk of characters starting with a non-ASCII one
|
||||
U8_SET_CP_START(s8, 0, index); // put utf-8 index at first byte of char, if not there already.
|
||||
chunk->nativeStart=index;
|
||||
for(i=0; i<UTF8_TEXT_CHUNK_SIZE && index<length; ) {
|
||||
// i is utf-16 index into chunk buffer.
|
||||
// index is utf-8 index into original string
|
||||
|
@ -724,10 +728,10 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
chunk->nativeLimit=index;
|
||||
c=s8[index-1];
|
||||
if(c<=0x7f) {
|
||||
// get a chunk of ASCII characters. Don't build the index map
|
||||
chunk->nativeLimit=index;
|
||||
i=UTF8_TEXT_CHUNK_SIZE;
|
||||
do {
|
||||
u16buf[--i]=(UChar)c;
|
||||
|
@ -739,6 +743,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
|
|||
if(index<length) {
|
||||
U8_SET_CP_START(s8, 0, index);
|
||||
}
|
||||
chunk->nativeLimit=index;
|
||||
i=UTF8_TEXT_CHUNK_SIZE;
|
||||
map[i]=index; // map position for char following the last one in the buffer.
|
||||
do {
|
||||
|
@ -781,6 +786,80 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// This is a slightly modified copy of u_strFromUTF8,
|
||||
// Inserts a Replacement Char (U+FFFD) rather than failing on invalid UTF-8.
|
||||
// Removes unnecessary features.
|
||||
//
|
||||
static UChar*
|
||||
utext_strFromUTF8(UChar *dest,
|
||||
int32_t destCapacity,
|
||||
int32_t *pDestLength,
|
||||
const char* src,
|
||||
int32_t srcLength, // required. NUL terminated not supported.
|
||||
UErrorCode *pErrorCode
|
||||
)
|
||||
{
|
||||
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
UChar32 ch=0;
|
||||
int32_t index = 0;
|
||||
int32_t reqLength = 0;
|
||||
uint8_t* pSrc = (uint8_t*) src;
|
||||
|
||||
|
||||
while((index < srcLength)&&(pDest<pDestLimit)){
|
||||
ch = pSrc[index++];
|
||||
if(ch <=0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
}else{
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
|
||||
if(ch<0){
|
||||
ch = 0xfffd;
|
||||
}
|
||||
if(ch<=0xFFFF){
|
||||
*(pDest++)=(UChar)ch;
|
||||
}else{
|
||||
*(pDest++)=UTF16_LEAD(ch);
|
||||
if(pDest<pDestLimit){
|
||||
*(pDest++)=UTF16_TRAIL(ch);
|
||||
}else{
|
||||
reqLength++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* do not fill the dest buffer; just count the UChars needed */
|
||||
while(index < srcLength){
|
||||
ch = pSrc[index++];
|
||||
if(ch <= 0x7f){
|
||||
reqLength++;
|
||||
}else{
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
|
||||
if(ch<0){
|
||||
ch = 0xfffd;
|
||||
}
|
||||
reqLength+=UTF_CHAR_LENGTH(ch);
|
||||
}
|
||||
}
|
||||
|
||||
reqLength+=(int32_t)(pDest - dest);
|
||||
|
||||
if(pDestLength){
|
||||
*pDestLength = reqLength;
|
||||
}
|
||||
|
||||
/* Terminate the buffer */
|
||||
u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
utf8TextExtract(UText *ut,
|
||||
int32_t start, int32_t limit,
|
||||
|
@ -791,17 +870,23 @@ utf8TextExtract(UText *ut,
|
|||
}
|
||||
if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if(start<0 || start>limit || ut->b<limit) {
|
||||
if(start<0 || start>limit) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if (limit>ut->b) {
|
||||
limit = ut->b;
|
||||
}
|
||||
if (start>ut->b) {
|
||||
start = ut->b;
|
||||
}
|
||||
int32_t destLength=0;
|
||||
u_strFromUTF8(dest, destCapacity, &destLength,
|
||||
utext_strFromUTF8(dest, destCapacity, &destLength,
|
||||
(const char *)ut->context+start, limit-start,
|
||||
pErrorCode);
|
||||
return destLength;
|
||||
// TODO: if U_INVALID|ILLEGAL_CHAR_FOUND, extract text anyway and use SUB for illegal sequences?
|
||||
}
|
||||
|
||||
// Assume nonUTF16Indexes and 0<=offset<=chunk->length
|
||||
|
@ -823,6 +908,19 @@ utf8TextMapIndexToUTF16(UText *ut, int32_t index) {
|
|||
while(index>map[offset]) {
|
||||
++offset;
|
||||
}
|
||||
if (index<map[offset]) {
|
||||
// index was to a trail byte of a multi-byte utf-8 char.
|
||||
// The loop above advanced offset to the start of the following char, now
|
||||
// offset must be backed up to the start of the utf-16 char into which
|
||||
// the utf-8 index pointed.
|
||||
offset--;
|
||||
if (offset>0 && map[offset] == map[offset-1]) {
|
||||
// index was to a utf-8 trail byte of a supplementary char.
|
||||
// Offset now points to the trail surrogate (one in back of the following char)
|
||||
// Back offset up one more time to get to the utf-16 lead surrogate.
|
||||
offset--;
|
||||
}
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
|
|
|
@ -43,11 +43,13 @@ UTextTest::~UTextTest() {
|
|||
|
||||
void
|
||||
UTextTest::runIndexedTest(int32_t index, UBool exec,
|
||||
const char* &name, char* /*par*/) {
|
||||
const char* &name, char* /*par*/) {
|
||||
switch (index) {
|
||||
case 0: name = "TextTest";
|
||||
if(exec) TextTest(); break;
|
||||
default: name = ""; break;
|
||||
if (exec) TextTest(); break;
|
||||
case 1: name = "ErrorTest";
|
||||
if (exec) ErrorTest(); break;
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -62,10 +64,23 @@ static uint32_t m_rand()
|
|||
}
|
||||
|
||||
|
||||
//
|
||||
// TextTest()
|
||||
//
|
||||
// Top Level function for UText testing.
|
||||
// Specifies the strings to be tested, with the actual testing itself
|
||||
// being carried out in another function, TestString().
|
||||
//
|
||||
void UTextTest::TextTest() {
|
||||
int32_t i, j;
|
||||
|
||||
TestString("abcd\\U00010001xyz");
|
||||
TestString("");
|
||||
|
||||
// Supplementary chars at start or end
|
||||
TestString("\\U00010001");
|
||||
TestString("abc\\U00010001");
|
||||
TestString("\\U00010001abc");
|
||||
|
||||
// Test simple strings of lengths 1 to 60, looking for glitches at buffer boundaries
|
||||
UnicodeString s;
|
||||
|
@ -126,14 +141,11 @@ void UTextTest::TextTest() {
|
|||
TestString(s);
|
||||
}
|
||||
|
||||
//
|
||||
// mapping between native indexes and code points.
|
||||
// native indexes could be utf-8, utf-16, utf32, or some code page.
|
||||
// The general purpose UText test function takes an array of these as
|
||||
// expected contents of the text being accessed.
|
||||
//
|
||||
|
||||
|
||||
//
|
||||
// TestString() Run a suite of UText tests on a string.
|
||||
// The test string is unescaped before use.
|
||||
//
|
||||
void UTextTest::TestString(const UnicodeString &s) {
|
||||
int32_t i;
|
||||
int32_t j;
|
||||
|
@ -147,7 +159,7 @@ void UTextTest::TestString(const UnicodeString &s) {
|
|||
saLen = sa.length();
|
||||
|
||||
//
|
||||
// Build up the mapping between code points and UTF-16 code unit indexes.
|
||||
// Build up a mapping between code points and UTF-16 code unit indexes.
|
||||
//
|
||||
m *cpMap = new m[sa.length() + 1];
|
||||
j = 0;
|
||||
|
@ -161,7 +173,7 @@ void UTextTest::TestString(const UnicodeString &s) {
|
|||
cpMap[j].nativeIdx = i; // position following the last char in utf-16 string.
|
||||
|
||||
|
||||
// UChar * test, null term
|
||||
// UChar * test, null terminated
|
||||
status = U_ZERO_ERROR;
|
||||
UChar *buf = new UChar[saLen+1];
|
||||
sa.extract(buf, saLen+1, status);
|
||||
|
@ -502,6 +514,11 @@ cleanupAndReturn:
|
|||
utext_close(targetUT);
|
||||
}
|
||||
|
||||
//
|
||||
// TestAccess() Test the read only access functions on a UText.
|
||||
// The text is accessed in a variety of ways, and compared with
|
||||
// the reference UnicodeString.
|
||||
//
|
||||
void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
gTestNum++;
|
||||
|
@ -711,7 +728,11 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c
|
|||
|
||||
status = U_ZERO_ERROR;
|
||||
len = utext_extract(ut, 0, utlen, NULL, 0, &status);
|
||||
TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR)
|
||||
if (utlen == 0) {
|
||||
TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
|
||||
} else {
|
||||
TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
TEST_ASSERT(len == expectedLen);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
|
@ -731,6 +752,176 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c
|
|||
}
|
||||
|
||||
delete buf;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// ErrorTest() Check various error and edge cases.
|
||||
//
|
||||
void UTextTest::ErrorTest()
|
||||
{
|
||||
// Close of an uninitialized UText. Shouldn't blow up.
|
||||
{
|
||||
UText ut;
|
||||
memset(&ut, 0, sizeof(UText));
|
||||
utext_close(&ut);
|
||||
utext_close(NULL);
|
||||
}
|
||||
|
||||
// Double-close of a UText. Shouldn't blow up. UText should still be usable.
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText ut = UTEXT_INITIALIZER;
|
||||
UnicodeString s("Hello, World");
|
||||
UText *ut2 = utext_openUnicodeString(&ut, &s, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(ut2 == &ut);
|
||||
|
||||
UText *ut3 = utext_close(&ut);
|
||||
TEST_ASSERT(ut3 == &ut);
|
||||
|
||||
UText *ut4 = utext_close(&ut);
|
||||
TEST_ASSERT(ut4 == &ut);
|
||||
|
||||
utext_openUnicodeString(&ut, &s, &status);
|
||||
TEST_SUCCESS(status);
|
||||
utext_close(&ut);
|
||||
}
|
||||
|
||||
// Re-use of a UText, chaining through each of the types of UText
|
||||
// (If it doesn't blow up, and doesn't leak, it's probably working fine)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText ut = UTEXT_INITIALIZER;
|
||||
UText *utp;
|
||||
UnicodeString s1("Hello, World");
|
||||
UChar s2[] = {(UChar)0x41, (UChar)0x42, (UChar)0};
|
||||
char *s3 = "\x66\x67\x68";
|
||||
|
||||
utp = utext_openUnicodeString(&ut, &s1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(utp == &ut);
|
||||
|
||||
utp = utext_openConstUnicodeString(&ut, &s1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(utp == &ut);
|
||||
|
||||
utp = utext_openUTF8(&ut, s3, -1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(utp == &ut);
|
||||
|
||||
utp = utext_openUChars(&ut, s2, -1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(utp == &ut);
|
||||
|
||||
utp = utext_close(&ut);
|
||||
TEST_ASSERT(utp == &ut);
|
||||
|
||||
utp = utext_openUnicodeString(&ut, &s1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(utp == &ut);
|
||||
}
|
||||
|
||||
//
|
||||
// UTF-8 with malformed sequences.
|
||||
// These should come through as the Unicode replacement char, \ufffd
|
||||
//
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText *ut = NULL;
|
||||
char *badUTF8 = "\x41\x81\x42\xf0\x81\x81\x43";
|
||||
UChar32 c;
|
||||
|
||||
ut = utext_openUTF8(NULL, badUTF8, -1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
c = utext_char32At(ut, 1);
|
||||
TEST_ASSERT(c == 0xfffd);
|
||||
c = utext_char32At(ut, 3);
|
||||
TEST_ASSERT(c == 0xfffd);
|
||||
c = utext_char32At(ut, 5);
|
||||
TEST_ASSERT(c == 0xfffd);
|
||||
c = utext_char32At(ut, 6);
|
||||
TEST_ASSERT(c == 0x43);
|
||||
|
||||
UChar buf[10];
|
||||
int n = utext_extract(ut, 0, 9, buf, 10, &status);
|
||||
TEST_SUCCESS(status);
|
||||
TEST_ASSERT(n==5);
|
||||
TEST_ASSERT(buf[1] == 0xfffd);
|
||||
TEST_ASSERT(buf[3] == 0xfffd);
|
||||
TEST_ASSERT(buf[2] == 0x42);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// isLengthExpensive - does it make the expected transitions after
|
||||
// getting the length of a nul terminated string?
|
||||
//
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString sa("Hello, this is a string");
|
||||
UBool isExpensive;
|
||||
|
||||
UChar sb[100];
|
||||
memset(sb, 0x20, sizeof(sb));
|
||||
sb[99] = 0;
|
||||
|
||||
UText *uta = utext_openUnicodeString(NULL, &sa, &status);
|
||||
TEST_SUCCESS(status);
|
||||
isExpensive = utext_isLengthExpensive(uta);
|
||||
TEST_ASSERT(isExpensive == FALSE);
|
||||
utext_close(uta);
|
||||
|
||||
UText *utb = utext_openUChars(NULL, sb, -1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
isExpensive = utext_isLengthExpensive(utb);
|
||||
TEST_ASSERT(isExpensive == TRUE);
|
||||
int32_t len = utext_nativeLength(utb);
|
||||
TEST_ASSERT(len == 99);
|
||||
isExpensive = utext_isLengthExpensive(utb);
|
||||
TEST_ASSERT(isExpensive == FALSE);
|
||||
utext_close(utb);
|
||||
}
|
||||
|
||||
//
|
||||
// get/set native index to positions not on code point boundaries.
|
||||
//
|
||||
{
|
||||
char *u8str = "\xc8\x81\xe1\x82\x83\xf1\x84\x85\x86";
|
||||
int32_t startMap[] = { 0, 0, 2, 2, 2, 5, 5, 5, 5, 9, 9};
|
||||
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText *ut = utext_openUTF8(NULL, u8str, -1, &status);
|
||||
TEST_SUCCESS(status);
|
||||
|
||||
int32_t i;
|
||||
int32_t startMapLimit = sizeof(startMap) / sizeof(int32_t);
|
||||
for (i=0; i<startMapLimit; i++) {
|
||||
utext_setNativeIndex(ut, i);
|
||||
int32_t cpIndex = utext_getNativeIndex(ut);
|
||||
TEST_ASSERT(cpIndex == startMap[i]);
|
||||
}
|
||||
utext_close(ut);
|
||||
|
||||
// Similar test, with utf16 instead of utf8
|
||||
UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000");
|
||||
int32_t start16Map[] ={ 0, 1, 1, 3, 4, 4, 6, 6};
|
||||
u16str = u16str.unescape();
|
||||
status = U_ZERO_ERROR;
|
||||
ut = utext_openUnicodeString(NULL, &u16str, &status);
|
||||
TEST_SUCCESS(status);
|
||||
|
||||
startMapLimit = sizeof(start16Map) / sizeof(int32_t);
|
||||
for (i=0; i<startMapLimit; i++) {
|
||||
utext_setNativeIndex(ut, i);
|
||||
int32_t cpIndex = utext_getNativeIndex(ut);
|
||||
TEST_ASSERT(cpIndex == start16Map[i]);
|
||||
}
|
||||
utext_close(ut);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ public:
|
|||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
|
||||
void TextTest();
|
||||
void ErrorTest();
|
||||
|
||||
private:
|
||||
struct m { // Map between native indices & code points.
|
||||
|
|
Loading…
Add table
Reference in a new issue