ICU-13244 add U16_GET_OR_FFFD(), U16_NEXT_OR_FFFD(), U16_PREV_OR_FFFD()

X-SVN-Rev: 40404
This commit is contained in:
Markus Scherer 2017-09-13 22:29:43 +00:00
parent dda03c710a
commit edac6e7206
2 changed files with 190 additions and 24 deletions

View file

@ -185,8 +185,8 @@
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then that itself
* will be returned as the code point.
* If the offset points to a single, unpaired surrogate, then
* c is set to that unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
@ -213,6 +213,53 @@
} \
}
#ifndef U_HIDE_DRAFT_API
/**
 * Random-access read of one code point from a UTF-16 string.
 * The offset itself is not modified.
 * "Safe" macro: checks string boundaries and never yields a surrogate code point.
 *
 * The offset may sit on either half of a surrogate pair:
 * on a lead surrogate the following unit is inspected,
 * on a trail surrogate the preceding unit is inspected.
 * If the neighbor completes the pair, c is set to the supplementary code point.
 * If the unit at the offset is a single, unpaired surrogate, c is set to U+FFFD.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * For iterating through a string, U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD
 * are more efficient.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @draft ICU 60
 */
#define U16_GET_OR_FFFD(s, start, i, length, c) { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t u16Neighbor_; \
        if(!U16_IS_SURROGATE_LEAD(c)) { \
            /* trail surrogate: a lead must directly precede it, inside the string */ \
            if((i)<=(start) || !U16_IS_LEAD(u16Neighbor_=(s)[(i)-1])) { \
                (c)=0xfffd; \
            } else { \
                (c)=U16_GET_SUPPLEMENTARY(u16Neighbor_, (c)); \
            } \
        } else if((i)+1==(length) || !U16_IS_TRAIL(u16Neighbor_=(s)[(i)+1])) { \
            /* lead surrogate at the end of the string or without a trail */ \
            (c)=0xfffd; \
        } else { \
            (c)=U16_GET_SUPPLEMENTARY((c), u16Neighbor_); \
        } \
    } \
}
#endif // U_HIDE_DRAFT_API
/* definitions with forward iteration --------------------------------------- */
/**
@ -253,8 +300,7 @@
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then that itself
* will be returned as the code point.
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param i string offset, must be i<length
@ -274,6 +320,44 @@
} \
}
#ifndef U_HIDE_DRAFT_API
/**
 * Read the code point that starts at a code point boundary offset
 * and move the offset forward to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro: checks string boundaries and never yields a surrogate code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset is on the lead surrogate of a supplementary code point,
 * the following trail surrogate is consumed as well and the offset
 * advances by two code units.
 * If the offset is on a trail surrogate or on a single, unpaired lead surrogate,
 * c is set to U+FFFD and the offset advances by one code unit.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @draft ICU 60
 */
#define U16_NEXT_OR_FFFD(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t u16Trail_; \
        if(!U16_IS_SURROGATE_LEAD(c) || (i)==(length) || !U16_IS_TRAIL(u16Trail_=(s)[(i)])) { \
            /* trail first, lead at the end of the string, or lead without a trail */ \
            (c)=0xfffd; \
        } else { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), u16Trail_); \
        } \
    } \
}
#endif // U_HIDE_DRAFT_API
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
@ -481,8 +565,7 @@
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then that itself
* will be returned as the code point.
* trail surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
@ -502,6 +585,43 @@
} \
}
#ifndef U_HIDE_DRAFT_API
/**
 * Step the string offset back from one code point boundary to the previous one
 * and read the code point that lies between the two boundaries.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro: checks string boundaries and never yields a surrogate code point.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind the trail surrogate of a supplementary code point,
 * the preceding lead surrogate is consumed as well and the offset
 * moves back by two code units.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, c is set to U+FFFD and the offset moves back by one code unit.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @draft ICU 60
 */
#define U16_PREV_OR_FFFD(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t u16Lead_; \
        if(!U16_IS_SURROGATE_TRAIL(c) || (i)<=(start) || !U16_IS_LEAD(u16Lead_=(s)[(i)-1])) { \
            /* lead last, trail at the start of the string, or trail without a lead */ \
            (c)=0xfffd; \
        } else { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(u16Lead_, (c)); \
        } \
    } \
}
#endif // U_HIDE_DRAFT_API
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)

View file

@ -147,7 +147,7 @@ static void TestGetChar()
0x11734, 0xd800, UTF_ERROR_VALUE
};
uint16_t i=0;
UChar32 c;
UChar32 c, expected;
uint16_t offset=0;
for(offset=0; offset<UPRV_LENGTHOF(input); offset++) {
if(0<offset && offset<UPRV_LENGTHOF(input)-1){
@ -163,13 +163,20 @@ static void TestGetChar()
}
UTF16_GET_CHAR_SAFE(input, 0, offset, UPRV_LENGTHOF(input), c, FALSE);
if(c != result[i+1]){
log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
expected=result[i+1];
if(c != expected) {
log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
U16_GET(input, 0, offset, UPRV_LENGTHOF(input), c);
if(c != result[i+1]){
log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
if(c != expected) {
log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
U16_GET_OR_FFFD(input, 0, offset, UPRV_LENGTHOF(input), c);
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
if(c != expected) {
log_err("ERROR: U16_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
UTF16_GET_CHAR_SAFE(input, 0, offset, UPRV_LENGTHOF(input), c, TRUE);
@ -216,7 +223,7 @@ static void TestNextPrevChar(){
};
UChar32 c=0x0000;
UChar32 c=0x0000, expected;
uint16_t i=0;
uint16_t offset=0, setOffset=0;
for(offset=0; offset<UPRV_LENGTHOF(input); offset++){
@ -246,9 +253,10 @@ static void TestNextPrevChar(){
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
}
if(c != result[i+1]){
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
expected=result[i+1];
if(c != expected) {
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
U16_NEXT(input, setOffset, UPRV_LENGTHOF(input), c);
@ -256,9 +264,20 @@ static void TestNextPrevChar(){
log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
}
if(c != result[i+1]){
log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
if(c != expected){
log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
U16_NEXT_OR_FFFD(input, setOffset, UPRV_LENGTHOF(input), c);
if(setOffset != movedOffset[i+1]){
log_err("ERROR: U16_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
}
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
if(c != expected){
log_err("ERROR: U16_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
UTF16_NEXT_CHAR_SAFE(input, setOffset, UPRV_LENGTHOF(input), c, TRUE);
@ -310,9 +329,21 @@ static void TestNextPrevChar(){
log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+4], setOffset);
}
if(c != result[i+4]){
log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
}
expected = result[i+4];
if(c != expected) {
log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
U16_PREV_OR_FFFD(input, 0, setOffset, c);
if(setOffset != movedOffset[i+4]){
log_err("ERROR: U16_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+4], setOffset);
}
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
if(c != expected) {
log_err("ERROR: U16_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
}
setOffset=offset;
UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
@ -349,14 +380,24 @@ static void TestNulTerminated() {
0
};
UChar32 c, c2;
UChar32 c, c2, expected;
int32_t i0, i=0, j, k, expectedIndex;
int32_t cpIndex=0;
do {
i0=i;
U16_NEXT(input, i, -1, c);
if(c!=result[cpIndex]) {
log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]);
expected=result[cpIndex];
if(c!=expected) {
log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
}
j=i0;
U16_NEXT_OR_FFFD(input, j, -1, c);
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
if(c!=expected) {
log_err("U16_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
}
if(j!=i) {
log_err("U16_NEXT_OR_FFFD() moved to index %d but U16_NEXT() moved to %d\n", j, i);
}
j=i0;
U16_FWD_1(input, j, -1);
@ -385,6 +426,11 @@ static void TestNulTerminated() {
if(c2!=c) {
log_err("U16_NEXT(from %d)=U+%04x != U+%04x=U16_GET(at %d)\n", i0, c, c2, j);
}
U16_GET_OR_FFFD(input, 0, j, -1, c2);
expected= U_IS_SURROGATE(c) ? 0xfffd : c;
if(c2!=expected) {
log_err("U16_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U16_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
}
/* U16_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
k=j+1;
U16_SET_CP_LIMIT(input, 0, k, -1);