mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 15:05:53 +00:00
ICU-13244 add U16_GET_OR_FFFD(), U16_NEXT_OR_FFFD(), U16_PREV_OR_FFFD()
X-SVN-Rev: 40404
This commit is contained in:
parent
dda03c710a
commit
edac6e7206
2 changed files with 190 additions and 24 deletions
|
@ -185,8 +185,8 @@
|
|||
*
|
||||
* The length can be negative for a NUL-terminated string.
|
||||
*
|
||||
* If the offset points to a single, unpaired surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* If the offset points to a single, unpaired surrogate, then
|
||||
* c is set to that unpaired surrogate.
|
||||
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
|
@ -213,6 +213,53 @@
|
|||
} \
|
||||
}
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to a single, unpaired surrogate, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
 *
 * The macro expands to a braced block and evaluates s, i and c more than once:
 * pass expressions without side effects, and an assignable variable for c.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @draft ICU 60
 */
#define U16_GET_OR_FFFD(s, start, i, length, c) { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* lead surrogate: only look ahead if i+1 is not the string limit */ \
            if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } else { \
                (c)=0xfffd; \
            } \
        } else { \
            /* trail surrogate: only look back if i is past the string start */ \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } else { \
                (c)=0xfffd; \
            } \
        } \
    } \
}
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/* definitions with forward iteration --------------------------------------- */
|
||||
|
||||
/**
|
||||
|
@ -253,8 +300,7 @@
|
|||
* for a supplementary code point, in which case the macro will read
|
||||
* the following trail surrogate as well.
|
||||
* If the offset points to a trail surrogate or
|
||||
* to a single, unpaired lead surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset, must be i<length
|
||||
|
@ -274,6 +320,44 @@
|
|||
} \
|
||||
}
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well and advance the offset by 2.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then c is set to U+FFFD
 * and the offset advances by 1.
 *
 * The macro expands to a braced block and evaluates s, i and c more than once:
 * pass expressions without side effects, and assignable variables for i and c.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @draft ICU 60
 */
#define U16_NEXT_OR_FFFD(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        /* i already points past the first unit; only read s[i] if it is not the limit */ \
        if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } else { \
            (c)=0xfffd; \
        } \
    } \
}
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Append a code point to a string, overwriting 1 or 2 code units.
|
||||
* The offset points to the current end of the string contents
|
||||
|
@ -481,8 +565,7 @@
|
|||
* for a supplementary code point, then the macro will read
|
||||
* the preceding lead surrogate as well.
|
||||
* If the offset is behind a lead surrogate or behind a single, unpaired
|
||||
* trail surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* trail surrogate, then c is set to that unpaired surrogate.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
|
@ -502,6 +585,43 @@
|
|||
} \
|
||||
}
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well and move the offset back by 2.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then c is set to U+FFFD and the offset moves back by 1.
 *
 * The macro expands to a braced block and evaluates s, i and c more than once:
 * pass expressions without side effects, and assignable variables for i and c.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @draft ICU 60
 */
#define U16_PREV_OR_FFFD(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        /* i now indexes the unit just read; only read s[i-1] if i is past the start */ \
        if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } else { \
            (c)=0xfffd; \
        } \
    } \
}
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
|
|
|
@ -147,7 +147,7 @@ static void TestGetChar()
|
|||
0x11734, 0xd800, UTF_ERROR_VALUE
|
||||
};
|
||||
uint16_t i=0;
|
||||
UChar32 c;
|
||||
UChar32 c, expected;
|
||||
uint16_t offset=0;
|
||||
for(offset=0; offset<UPRV_LENGTHOF(input); offset++) {
|
||||
if(0<offset && offset<UPRV_LENGTHOF(input)-1){
|
||||
|
@ -163,13 +163,20 @@ static void TestGetChar()
|
|||
}
|
||||
|
||||
UTF16_GET_CHAR_SAFE(input, 0, offset, UPRV_LENGTHOF(input), c, FALSE);
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
expected=result[i+1];
|
||||
if(c != expected) {
|
||||
log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
U16_GET(input, 0, offset, UPRV_LENGTHOF(input), c);
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
U16_GET_OR_FFFD(input, 0, offset, UPRV_LENGTHOF(input), c);
|
||||
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U16_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
UTF16_GET_CHAR_SAFE(input, 0, offset, UPRV_LENGTHOF(input), c, TRUE);
|
||||
|
@ -216,7 +223,7 @@ static void TestNextPrevChar(){
|
|||
};
|
||||
|
||||
|
||||
UChar32 c=0x0000;
|
||||
UChar32 c=0x0000, expected;
|
||||
uint16_t i=0;
|
||||
uint16_t offset=0, setOffset=0;
|
||||
for(offset=0; offset<UPRV_LENGTHOF(input); offset++){
|
||||
|
@ -246,9 +253,10 @@ static void TestNextPrevChar(){
|
|||
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
expected=result[i+1];
|
||||
if(c != expected) {
|
||||
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_NEXT(input, setOffset, UPRV_LENGTHOF(input), c);
|
||||
|
@ -256,9 +264,20 @@ static void TestNextPrevChar(){
|
|||
log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
if(c != expected){
|
||||
log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_NEXT_OR_FFFD(input, setOffset, UPRV_LENGTHOF(input), c);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
log_err("ERROR: U16_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
|
||||
if(c != expected){
|
||||
log_err("ERROR: U16_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_NEXT_CHAR_SAFE(input, setOffset, UPRV_LENGTHOF(input), c, TRUE);
|
||||
|
@ -310,9 +329,21 @@ static void TestNextPrevChar(){
|
|||
log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(c != result[i+4]){
|
||||
log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
|
||||
}
|
||||
expected = result[i+4];
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_PREV_OR_FFFD(input, 0, setOffset, c);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
log_err("ERROR: U16_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
|
||||
if(c != expected) {
|
||||
log_err("ERROR: U16_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
|
||||
|
@ -349,14 +380,24 @@ static void TestNulTerminated() {
|
|||
0
|
||||
};
|
||||
|
||||
UChar32 c, c2;
|
||||
UChar32 c, c2, expected;
|
||||
int32_t i0, i=0, j, k, expectedIndex;
|
||||
int32_t cpIndex=0;
|
||||
do {
|
||||
i0=i;
|
||||
U16_NEXT(input, i, -1, c);
|
||||
if(c!=result[cpIndex]) {
|
||||
log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]);
|
||||
expected=result[cpIndex];
|
||||
if(c!=expected) {
|
||||
log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
|
||||
}
|
||||
j=i0;
|
||||
U16_NEXT_OR_FFFD(input, j, -1, c);
|
||||
if(U_IS_SURROGATE(expected)) { expected=0xfffd; }
|
||||
if(c!=expected) {
|
||||
log_err("U16_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
|
||||
}
|
||||
if(j!=i) {
|
||||
log_err("U16_NEXT_OR_FFFD() moved to index %d but U16_NEXT() moved to %d\n", j, i);
|
||||
}
|
||||
j=i0;
|
||||
U16_FWD_1(input, j, -1);
|
||||
|
@ -385,6 +426,11 @@ static void TestNulTerminated() {
|
|||
if(c2!=c) {
|
||||
log_err("U16_NEXT(from %d)=U+%04x != U+%04x=U16_GET(at %d)\n", i0, c, c2, j);
|
||||
}
|
||||
U16_GET_OR_FFFD(input, 0, j, -1, c2);
|
||||
expected= U_IS_SURROGATE(c) ? 0xfffd : c;
|
||||
if(c2!=expected) {
|
||||
log_err("U16_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U16_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
|
||||
}
|
||||
/* U16_SET_CP_LIMIT moves from a non-lead code unit to the limit of the code point */
|
||||
k=j+1;
|
||||
U16_SET_CP_LIMIT(input, 0, k, -1);
|
||||
|
|
Loading…
Add table
Reference in a new issue