ICU-3944 Text Access, new tests, bugs fixed

X-SVN-Rev: 18106
This commit is contained in:
Andy Heninger 2005-07-01 00:39:24 +00:00
parent c4d57fd411
commit 091627dceb
3 changed files with 312 additions and 22 deletions

View file

@ -436,6 +436,8 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
if (spaceRequired>0) {
ut->extraSize = extraSpace;
ut->pExtra = &((ExtendedUText *)ut)->extension;
uprv_memset(ut->pExtra, 0, extraSpace); // Purify whines about copying untouched extra [buffer]
// space when cloning, so init it now.
}
}
} else {
@ -467,6 +469,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
} else {
ut->extraSize = extraSpace;
ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
uprv_memset(ut->pExtra, 0, extraSpace);
}
}
}
@ -613,7 +616,10 @@ U_CDECL_END
// UText implementation for UTF-8 strings (read-only)
//
// Use of UText data members:
// context pointer to UTF-8 string
// context pointer to UTF-8 string
// utext.b is the input string length (bytes).
// utext.p pointer to allocated utf-8 string if owned by this utext (after a clone)
// utext.q pointer to the filled part of the Map array.
//
// TODO: make creation of the index mapping array lazy.
// Create it for a chunk the first time the user asks for an index.
@ -638,9 +644,6 @@ struct UTF8Extra {
int32_t map[UTF8_TEXT_CHUNK_SIZE+2];
};
// utext.b is the input string length (bytes).
// utext.q pointer to the filled part of the Map array.
//
// because backwards iteration fills the buffers starting at the end and
// working towards the front, the filled part of the buffers may not begin
// at the start of the available storage for the buffers.
@ -679,12 +682,12 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
return FALSE;
}
chunk->nativeStart=index;
c=s8[index];
if(c<=0x7f) {
// get a run of ASCII characters.
// Even if we don't fill the buffer, we will stop with the first
// non-ascii char, so that the buffer can use utf-16 indexing.
chunk->nativeStart=index;
u16buf[0]=(UChar)c;
for(i=1, ++index;
i<UTF8_TEXT_CHUNK_SIZE && index<length && (c=s8[index])<=0x7f;
@ -696,6 +699,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
} else {
// get a chunk of characters starting with a non-ASCII one
U8_SET_CP_START(s8, 0, index); // put utf-8 index at first byte of char, if not there already.
chunk->nativeStart=index;
for(i=0; i<UTF8_TEXT_CHUNK_SIZE && index<length; ) {
// i is utf-16 index into chunk buffer.
// index is utf-8 index into original string
@ -724,10 +728,10 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
return FALSE;
}
chunk->nativeLimit=index;
c=s8[index-1];
if(c<=0x7f) {
// get a chunk of ASCII characters. Don't build the index map
chunk->nativeLimit=index;
i=UTF8_TEXT_CHUNK_SIZE;
do {
u16buf[--i]=(UChar)c;
@ -739,6 +743,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
if(index<length) {
U8_SET_CP_START(s8, 0, index);
}
chunk->nativeLimit=index;
i=UTF8_TEXT_CHUNK_SIZE;
map[i]=index; // map position for char following the last one in the buffer.
do {
@ -781,6 +786,80 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
}
}
//
//  utext_strFromUTF8()
//
//  This is a slightly modified copy of u_strFromUTF8,
//     Inserts a Replacement Char rather than failing on invalid UTF-8
//     Removes unnecessary features.
//
//  Converts the srcLength bytes of UTF-8 starting at src into UTF-16 code units in dest.
//
//    dest          Output buffer.  May be NULL when destCapacity is 0 (preflighting).
//    destCapacity  Capacity of dest, in UChars.
//    pDestLength   If not NULL, receives the number of UChars needed for the complete
//                  conversion, whether or not they all fit in dest.
//    src           The UTF-8 input.
//    srcLength     Number of input bytes.  Required; NUL terminated input is not supported.
//    pErrorCode    Passed through to u_terminateUChars(), which is relied on for
//                  terminating the output and reporting overflow.
//
//    Returns dest.
//
static UChar*
utext_strFromUTF8(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,        // required.  NUL terminated not supported.
              UErrorCode *pErrorCode
              )
{
    UChar         *pDest      = dest;
    // Don't do pointer arithmetic on a NULL dest (preflight call);
    //   a NULL limit makes the fill loop below fall straight through.
    UChar         *pDestLimit = (dest != NULL) ? dest + destCapacity : NULL;
    // Read the source through a const pointer; it is never modified here.
    const uint8_t *pSrc       = (const uint8_t *)src;
    UChar32        ch         = 0;
    int32_t        index      = 0;
    int32_t        reqLength  = 0;

    // Phase 1: convert for as long as there is both input and output space.
    while ((index < srcLength) && (pDest < pDestLimit)) {
        ch = pSrc[index++];
        if (ch <= 0x7f) {
            // ASCII, copied straight across.
            *pDest++ = (UChar)ch;
            continue;
        }

        // Multi-byte sequence.  A negative result signals ill-formed input,
        //   which is replaced rather than treated as an error.
        ch = utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
        if (ch < 0) {
            ch = 0xfffd;
        }

        if (ch <= 0xFFFF) {
            *(pDest++) = (UChar)ch;
        } else {
            // Supplementary code point, needs a surrogate pair.
            *(pDest++) = UTF16_LEAD(ch);
            if (pDest < pDestLimit) {
                *(pDest++) = UTF16_TRAIL(ch);
            } else {
                // Only the lead surrogate fit.  Count the trail surrogate as
                //   required-but-unwritten, and stop filling.
                reqLength++;
                break;
            }
        }
    }

    // Phase 2: the output is full (or absent).  Do not fill the dest buffer,
    //   just count the UChars needed for whatever input remains.
    while (index < srcLength) {
        ch = pSrc[index++];
        if (ch <= 0x7f) {
            reqLength++;
        } else {
            ch = utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
            if (ch < 0) {
                ch = 0xfffd;
            }
            reqLength += UTF_CHAR_LENGTH(ch);
        }
    }

    // Total required length = units actually written + units only counted.
    reqLength += (int32_t)(pDest - dest);

    if (pDestLength) {
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);

    return dest;
}
static int32_t U_CALLCONV
utf8TextExtract(UText *ut,
int32_t start, int32_t limit,
@ -791,17 +870,23 @@ utf8TextExtract(UText *ut,
}
if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if(start<0 || start>limit || ut->b<limit) {
if(start<0 || start>limit) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
if (limit>ut->b) {
limit = ut->b;
}
if (start>ut->b) {
start = ut->b;
}
int32_t destLength=0;
u_strFromUTF8(dest, destCapacity, &destLength,
utext_strFromUTF8(dest, destCapacity, &destLength,
(const char *)ut->context+start, limit-start,
pErrorCode);
return destLength;
// TODO: if U_INVALID|ILLEGAL_CHAR_FOUND, extract text anyway and use SUB for illegal sequences?
}
// Assume nonUTF16Indexes and 0<=offset<=chunk->length
@ -823,6 +908,19 @@ utf8TextMapIndexToUTF16(UText *ut, int32_t index) {
while(index>map[offset]) {
++offset;
}
if (index<map[offset]) {
// index was to a trail byte of a multi-byte utf-8 char.
// The loop above advanced offset to the start of the following char, now
// offset must be backed up to the start of the utf-16 char into which
// the utf-8 index pointed.
offset--;
if (offset>0 && map[offset] == map[offset-1]) {
// index was to a utf-8 trail byte of a supplementary char.
// Offset now points to the trail surrogate (one in back of the following char)
// Back offset up one more time to get to the utf-16 lead surrogate.
offset--;
}
}
return offset;
}

View file

@ -43,11 +43,13 @@ UTextTest::~UTextTest() {
void
UTextTest::runIndexedTest(int32_t index, UBool exec,
const char* &name, char* /*par*/) {
const char* &name, char* /*par*/) {
switch (index) {
case 0: name = "TextTest";
if(exec) TextTest(); break;
default: name = ""; break;
if (exec) TextTest(); break;
case 1: name = "ErrorTest";
if (exec) ErrorTest(); break;
default: name = ""; break;
}
}
@ -62,10 +64,23 @@ static uint32_t m_rand()
}
//
// TextTest()
//
// Top Level function for UText testing.
// Specifies the strings to be tested, with the actual testing itself
// being carried out in another function, TestString().
//
void UTextTest::TextTest() {
int32_t i, j;
TestString("abcd\\U00010001xyz");
TestString("");
// Supplementary chars at start or end
TestString("\\U00010001");
TestString("abc\\U00010001");
TestString("\\U00010001abc");
// Test simple strings of lengths 1 to 60, looking for glitches at buffer boundaries
UnicodeString s;
@ -126,14 +141,11 @@ void UTextTest::TextTest() {
TestString(s);
}
//
// mapping between native indexes and code points.
// native indexes could be utf-8, utf-16, utf32, or some code page.
// The general purpose UText test funciton takes an array of these as
// expected contents of the text being accessed.
//
//
// TestString() Run a suite of UText tests on a string.
// The test string is unescaped before use.
//
void UTextTest::TestString(const UnicodeString &s) {
int32_t i;
int32_t j;
@ -147,7 +159,7 @@ void UTextTest::TestString(const UnicodeString &s) {
saLen = sa.length();
//
// Build up the mapping between code points and UTF-16 code unit indexes.
// Build up a mapping between code points and UTF-16 code unit indexes.
//
m *cpMap = new m[sa.length() + 1];
j = 0;
@ -161,7 +173,7 @@ void UTextTest::TestString(const UnicodeString &s) {
cpMap[j].nativeIdx = i; // position following the last char in utf-16 string.
// UChar * test, null term
// UChar * test, null terminated
status = U_ZERO_ERROR;
UChar *buf = new UChar[saLen+1];
sa.extract(buf, saLen+1, status);
@ -502,6 +514,11 @@ cleanupAndReturn:
utext_close(targetUT);
}
//
// TestAccess() Test the read only access functions on a UText.
// The text is accessed in a variety of ways, and compared with
// the reference UnicodeString.
//
void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
UErrorCode status = U_ZERO_ERROR;
gTestNum++;
@ -711,7 +728,11 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c
status = U_ZERO_ERROR;
len = utext_extract(ut, 0, utlen, NULL, 0, &status);
TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR)
if (utlen == 0) {
TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
} else {
TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
}
TEST_ASSERT(len == expectedLen);
status = U_ZERO_ERROR;
@ -731,6 +752,176 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c
}
delete buf;
}
//
// ErrorTest() Check various error and edge cases.
//
void UTextTest::ErrorTest()
{
// Close of an unitialized UText. Shouldn't blow up.
{
UText ut;
memset(&ut, 0, sizeof(UText));
utext_close(&ut);
utext_close(NULL);
}
// Double-close of a UText. Shouldn't blow up. UText should still be usable.
{
UErrorCode status = U_ZERO_ERROR;
UText ut = UTEXT_INITIALIZER;
UnicodeString s("Hello, World");
UText *ut2 = utext_openUnicodeString(&ut, &s, &status);
TEST_SUCCESS(status);
TEST_ASSERT(ut2 == &ut);
UText *ut3 = utext_close(&ut);
TEST_ASSERT(ut3 == &ut);
UText *ut4 = utext_close(&ut);
TEST_ASSERT(ut4 == &ut);
utext_openUnicodeString(&ut, &s, &status);
TEST_SUCCESS(status);
utext_close(&ut);
}
// Re-use of a UText, chaining through each of the types of UText
// (If it doesn't blow up, and doesn't leak, it's probably working fine)
{
UErrorCode status = U_ZERO_ERROR;
UText ut = UTEXT_INITIALIZER;
UText *utp;
UnicodeString s1("Hello, World");
UChar s2[] = {(UChar)0x41, (UChar)0x42, (UChar)0};
char *s3 = "\x66\x67\x68";
utp = utext_openUnicodeString(&ut, &s1, &status);
TEST_SUCCESS(status);
TEST_ASSERT(utp == &ut);
utp = utext_openConstUnicodeString(&ut, &s1, &status);
TEST_SUCCESS(status);
TEST_ASSERT(utp == &ut);
utp = utext_openUTF8(&ut, s3, -1, &status);
TEST_SUCCESS(status);
TEST_ASSERT(utp == &ut);
utp = utext_openUChars(&ut, s2, -1, &status);
TEST_SUCCESS(status);
TEST_ASSERT(utp == &ut);
utp = utext_close(&ut);
TEST_ASSERT(utp == &ut);
utp = utext_openUnicodeString(&ut, &s1, &status);
TEST_SUCCESS(status);
TEST_ASSERT(utp == &ut);
}
//
// UTF-8 with malformed sequences.
// These should come through as the Unicode replacement char, \ufffd
//
{
UErrorCode status = U_ZERO_ERROR;
UText *ut = NULL;
char *badUTF8 = "\x41\x81\x42\xf0\x81\x81\x43";
UChar32 c;
ut = utext_openUTF8(NULL, badUTF8, -1, &status);
TEST_SUCCESS(status);
c = utext_char32At(ut, 1);
TEST_ASSERT(c == 0xfffd);
c = utext_char32At(ut, 3);
TEST_ASSERT(c == 0xfffd);
c = utext_char32At(ut, 5);
TEST_ASSERT(c == 0xfffd);
c = utext_char32At(ut, 6);
TEST_ASSERT(c == 0x43);
UChar buf[10];
int n = utext_extract(ut, 0, 9, buf, 10, &status);
TEST_SUCCESS(status);
TEST_ASSERT(n==5);
TEST_ASSERT(buf[1] == 0xfffd);
TEST_ASSERT(buf[3] == 0xfffd);
TEST_ASSERT(buf[2] == 0x42);
}
//
// isLengthExpensive - does it make the exptected transitions after
// getting the length of a nul terminated string?
//
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString sa("Hello, this is a string");
UBool isExpensive;
UChar sb[100];
memset(sb, 0x20, sizeof(sb));
sb[99] = 0;
UText *uta = utext_openUnicodeString(NULL, &sa, &status);
TEST_SUCCESS(status);
isExpensive = utext_isLengthExpensive(uta);
TEST_ASSERT(isExpensive == FALSE);
utext_close(uta);
UText *utb = utext_openUChars(NULL, sb, -1, &status);
TEST_SUCCESS(status);
isExpensive = utext_isLengthExpensive(utb);
TEST_ASSERT(isExpensive == TRUE);
int32_t len = utext_nativeLength(utb);
TEST_ASSERT(len == 99);
isExpensive = utext_isLengthExpensive(utb);
TEST_ASSERT(isExpensive == FALSE);
utext_close(utb);
}
//
// get/set native index to positions not on code point boundaries.
//
{
char *u8str = "\xc8\x81\xe1\x82\x83\xf1\x84\x85\x86";
int32_t startMap[] = { 0, 0, 2, 2, 2, 5, 5, 5, 5, 9, 9};
UErrorCode status = U_ZERO_ERROR;
UText *ut = utext_openUTF8(NULL, u8str, -1, &status);
TEST_SUCCESS(status);
int32_t i;
int32_t startMapLimit = sizeof(startMap) / sizeof(int32_t);
for (i=0; i<startMapLimit; i++) {
utext_setNativeIndex(ut, i);
int32_t cpIndex = utext_getNativeIndex(ut);
TEST_ASSERT(cpIndex == startMap[i]);
}
utext_close(ut);
// Similar test, with utf16 instead of utf8
UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000");
int32_t start16Map[] ={ 0, 1, 1, 3, 4, 4, 6, 6};
u16str = u16str.unescape();
status = U_ZERO_ERROR;
ut = utext_openUnicodeString(NULL, &u16str, &status);
TEST_SUCCESS(status);
startMapLimit = sizeof(start16Map) / sizeof(int32_t);
for (i=0; i<startMapLimit; i++) {
utext_setNativeIndex(ut, i);
int32_t cpIndex = utext_getNativeIndex(ut);
TEST_ASSERT(cpIndex == start16Map[i]);
}
utext_close(ut);
}
}

View file

@ -29,6 +29,7 @@ public:
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
void TextTest();
void ErrorTest();
private:
struct m { // Map between native indices & code points.