ICU-21655 widen Java character APIs from char to int arguments

This commit is contained in:
Markus Scherer 2021-08-23 18:36:57 -07:00
parent 280f0f2a25
commit a36f06eaae
22 changed files with 241 additions and 162 deletions

View file

@ -1551,6 +1551,22 @@ itself public can be placed in different places:
4. If it is used by multiple packages, make it public and place the class in
the `com.ibm.icu.impl` package.
### ICU4J API Stability
General discussion: See [ICU Design / ICU API compatibility](../icu/design.md#icu-api-compatibility).
Occasionally, we “broaden” or “widen” a Java API by making a parameter broader
(e.g., `char` (code unit) to `int` (code point), or `String` to `CharSequence`)
or a return type narrower (e.g., `Object` to `UnicodeSet`).
Such a change is source-compatible but not binary-compatible.
Before we do this, we need to check with users like Android whether this is OK.
For example, in a class that Android exposes via its SDK,
Android may need to retain hidden compatibility overloads with the old input types.
In addition, we should test with code using both the old and new types,
so that if someone has such compatibility overloads they all get exercised.
### Error Handling and Exceptions
Errors should be indicated by throwing exceptions, not by returning “bogus”

View file

@ -326,7 +326,7 @@ class CharsetASCII extends CharsetICU {
* if the character is a lead surrogate, we need to call encodeTrail to attempt to match
* it up with a trail surrogate. if not, the character is unmappable.
*/
return (UTF16.isSurrogate((char) ch))
return (UTF16.isSurrogate(ch))
? encodeTrail(source, (char) ch, flush)
: CoderResult.unmappableForLength(1);
}

View file

@ -473,7 +473,7 @@ class CharsetBOCU1 extends CharsetICU {
if(UTF16.isTrailSurrogate(trail)){
source.position(source.position()+1);
++nextSourceIndex;
c=UCharacter.getCodePoint((char)c, trail);
c=UCharacter.getCodePoint(c, trail);
}
} else {
/*no more input*/
@ -518,7 +518,7 @@ class CharsetBOCU1 extends CharsetICU {
continue;
}
if(UTF16.isLeadSurrogate((char)c)){
if(UTF16.isLeadSurrogate(c)){
getTrail(source, target, offsets);
if(checkNegative){
break;

View file

@ -375,8 +375,8 @@ class CharsetCompoundText extends CharsetICU {
tmpTargetBuffer.limit(3);
/* check if the char is a First surrogate */
if (UTF16.isSurrogate((char)sourceChar) || gotoGetTrail) {
if (UTF16.isLeadSurrogate((char)sourceChar) || gotoGetTrail) {
if (UTF16.isSurrogate(sourceChar) || gotoGetTrail) {
if (UTF16.isLeadSurrogate(sourceChar) || gotoGetTrail) {
// getTrail label
/* reset gotoGetTrail flag*/
gotoGetTrail = false;
@ -388,7 +388,7 @@ class CharsetCompoundText extends CharsetICU {
source.position(source.position()-1);
if (UTF16.isTrailSurrogate(trail)) {
source.get();
sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
sourceChar = UCharacter.getCodePoint(sourceChar, trail);
fromUChar32 = 0x00;
/* convert this supplementary code point */
/* exit this condition tree */

View file

@ -342,7 +342,7 @@ class CharsetHZ extends CharsetICU {
/* Handle surrogates */
/* check if the char is a First surrogate */
if (UTF16.isSurrogate((char) mySourceChar)) {
if (UTF16.isSurrogate(mySourceChar)) {
// use that handy handleSurrogates method everyone's been talking about!
CoderResult cr = handleSurrogates(source, (char) mySourceChar);
return (cr != null) ? cr : CoderResult.unmappableForLength(2);

View file

@ -1364,7 +1364,7 @@ class CharsetISCII extends CharsetICU {
if (cr.isOverflow()) {
break;
}
} else if (UTF16.isSurrogate((char)sourceChar)) {
} else if (UTF16.isSurrogate(sourceChar)) {
cr = handleSurrogates(source, (char) sourceChar);
return (cr != null) ? cr : CoderResult.unmappableForLength(2);
} else {

View file

@ -1759,8 +1759,8 @@ class CharsetISO2022 extends CharsetICU {
sourceChar = source.get();
}
/* check if the char is a First surrogate */
if (getTrail || UTF16.isSurrogate((char)sourceChar)) {
if (getTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
if (getTrail || UTF16.isSurrogate(sourceChar)) {
if (getTrail || UTF16.isLeadSurrogate(sourceChar)) {
// getTrail:
if (getTrail) {
getTrail = false;
@ -1773,7 +1773,7 @@ class CharsetISO2022 extends CharsetICU {
source.position(source.position()-1);
if (UTF16.isTrailSurrogate(trail)) {
source.get();
sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
sourceChar = UCharacter.getCodePoint(sourceChar, trail);
fromUChar32 = 0x00;
/* convert this supplementary code point */
/* exit this condition tree */
@ -2267,8 +2267,8 @@ class CharsetISO2022 extends CharsetICU {
sourceChar = source.get();
}
/* check if the char is a First surrogate */
if (UTF16.isSurrogate((char)sourceChar) || gotoGetTrail) {
if (UTF16.isLeadSurrogate((char)sourceChar) || gotoGetTrail) {
if (UTF16.isSurrogate(sourceChar) || gotoGetTrail) {
if (UTF16.isLeadSurrogate(sourceChar) || gotoGetTrail) {
// getTrail label
/* reset gotoGetTrail flag*/
gotoGetTrail = false;
@ -2280,7 +2280,7 @@ class CharsetISO2022 extends CharsetICU {
source.position(source.position()-1);
if (UTF16.isTrailSurrogate(trail)) {
source.get();
sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
sourceChar = UCharacter.getCodePoint(sourceChar, trail);
fromUChar32 = 0x00;
/* convert this supplementary code point */
/* exit this condition tree */
@ -2767,8 +2767,8 @@ class CharsetISO2022 extends CharsetICU {
*/
/* check if the char is a First surrogate */
if (gotoGetTrail || UTF16.isSurrogate((char)sourceChar)) {
if (gotoGetTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
if (gotoGetTrail || UTF16.isSurrogate(sourceChar)) {
if (gotoGetTrail || UTF16.isLeadSurrogate(sourceChar)) {
// getTrail label
// reset gotoGetTrail flag
gotoGetTrail = false;
@ -2780,7 +2780,7 @@ class CharsetISO2022 extends CharsetICU {
source.position(source.position()-1);
if (UTF16.isTrailSurrogate(trail)) {
source.get();
sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
sourceChar = UCharacter.getCodePoint(sourceChar, trail);
err = CoderResult.unmappableForLength(2);
/* convert this surrogate code point */
/* exit this condition tree */

View file

@ -2946,7 +2946,7 @@ class CharsetMBCS extends CharsetICU {
boolean doloop = true;
boolean doread = true;
if (c != 0 && target.hasRemaining()) {
if (UTF16.isLeadSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
if (UTF16.isLeadSurrogate(c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
// c is a lead surrogate, read another input
SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
prevSourceIndex, prevLength);
@ -2989,9 +2989,9 @@ class CharsetMBCS extends CharsetICU {
* are not paired but mapped separately. Note that in this case unmatched surrogates are
* not detected.
*/
if (UTF16.isSurrogate((char) c)
if (UTF16.isSurrogate(c)
&& (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
if (UTF16.isLeadSurrogate((char) c)) {
if (UTF16.isLeadSurrogate(c)) {
// getTrail:
SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex,
nextSourceIndex, prevSourceIndex, prevLength);
@ -4064,9 +4064,9 @@ class CharsetMBCS extends CharsetICU {
/* normal end of conversion: prepare for a new character */
c = 0;
continue;
} else if (!UTF16.isSurrogate((char) c)) {
} else if (!UTF16.isSurrogate(c)) {
/* normal, unassigned BMP character */
} else if (UTF16.isLeadSurrogate((char) c)) {
} else if (UTF16.isLeadSurrogate(c)) {
// getTrail:
SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
doloop = getTrailSingleBMP(source, x, cr);
@ -4195,7 +4195,7 @@ class CharsetMBCS extends CharsetICU {
boolean doloop = true;
boolean doread = true;
if (c != 0 && target.hasRemaining()) {
if (UTF16.isLeadSurrogate((char) c)) {
if (UTF16.isLeadSurrogate(c)) {
SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
doread = x.doread;
@ -4225,8 +4225,8 @@ class CharsetMBCS extends CharsetICU {
if (doread) {
c = source.get(sourceArrayIndex++);
++nextSourceIndex;
if (UTF16.isSurrogate((char) c)) {
if (UTF16.isLeadSurrogate((char) c)) {
if (UTF16.isSurrogate(c)) {
if (UTF16.isLeadSurrogate(c)) {
// getTrail:
SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
nextSourceIndex);
@ -4340,7 +4340,7 @@ class CharsetMBCS extends CharsetICU {
boolean doloop = true;
boolean doread = true;
if (c != 0 && target.hasRemaining()) {
if (UTF16.isLeadSurrogate((char) c)) {
if (UTF16.isLeadSurrogate(c)) {
SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
doread = x.doread;
@ -4374,8 +4374,8 @@ class CharsetMBCS extends CharsetICU {
* not paired but mapped separately. Note that in this case unmatched surrogates are not
* detected.
*/
if (UTF16.isSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
if (UTF16.isLeadSurrogate((char) c)) {
if (UTF16.isSurrogate(c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
if (UTF16.isLeadSurrogate(c)) {
// getTrail:
SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
nextSourceIndex);
@ -4504,7 +4504,7 @@ class CharsetMBCS extends CharsetICU {
char trail = source.get(x.sourceArrayIndex);
if (UTF16.isTrailSurrogate(trail)) {
++x.sourceArrayIndex;
x.c = UCharacter.getCodePoint((char) x.c, trail);
x.c = UCharacter.getCodePoint(x.c, trail);
/* this codepage does not map supplementary code points */
/* callback(unassigned) */
cr[0] = CoderResult.unmappableForLength(2);
@ -4548,7 +4548,7 @@ class CharsetMBCS extends CharsetICU {
++x.sourceArrayIndex;
++x.nextSourceIndex;
/* convert this supplementary code point */
x.c = UCharacter.getCodePoint((char) x.c, trail);
x.c = UCharacter.getCodePoint(x.c, trail);
if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
fromUnicodeStatus = x.prevLength; /* save the old state */
@ -4622,7 +4622,7 @@ class CharsetMBCS extends CharsetICU {
++x.sourceArrayIndex;
++x.nextSourceIndex;
/* convert this supplementary code point */
x.c = UCharacter.getCodePoint((char) x.c, trail);
x.c = UCharacter.getCodePoint(x.c, trail);
if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
/* callback(unassigned) */

View file

@ -813,9 +813,9 @@ class CharsetSCSU extends CharsetICU{
offsets.put(sourceIndex);
}
--targetCapacity;
} else if(AfterGetTrail || UTF16.isSurrogate((char)c)){
} else if(AfterGetTrail || UTF16.isSurrogate(c)){
if(!AfterGetTrail){
if(UTF16.isLeadSurrogate((char)c)){
if(UTF16.isLeadSurrogate(c)){
label = getTrail(source, target, offsets);
if(label==EndLoop){
return label;
@ -1058,7 +1058,7 @@ class CharsetSCSU extends CharsetICU{
if(UTF16.isTrailSurrogate(trail)){
source.position(source.position()+1);
++nextSourceIndex;
c = UCharacter.getCodePoint((char)c, trail);
c = UCharacter.getCodePoint(c, trail);
label = Loop;
} else {
/*this is unmatched lead code unit (1st Surrogate)*/
@ -1078,7 +1078,7 @@ class CharsetSCSU extends CharsetICU{
int label = EndLoop;
AfterGetTrailUnicode = true;
/*c is surrogate*/
if(UTF16.isLeadSurrogate((char)c)){
if(UTF16.isLeadSurrogate(c)){
// getTrailUnicode:
lead = (char)c;
if(source.hasRemaining()){
@ -1087,7 +1087,7 @@ class CharsetSCSU extends CharsetICU{
if(UTF16.isTrailSurrogate(trail)){
source.get();
++nextSourceIndex;
c = UCharacter.getCodePoint((char)c, trail);
c = UCharacter.getCodePoint(c, trail);
/*convert this surrogate code point*/
/*exit this condition tree*/
} else {

View file

@ -397,7 +397,7 @@ class CharsetUTF8 extends CharsetICU {
}
targetArray[tgtIdx++] = encodeLastTail(char32);
} else if (!UTF16.isSurrogate((char) char32) || isCESU8) {
} else if (!UTF16.isSurrogate(char32) || isCESU8) {
/* 3 bytes to encode from char32 */
targetArray[tgtIdx++] = encodeHeadOf3(char32);
@ -481,7 +481,7 @@ class CharsetUTF8 extends CharsetICU {
}
target.put(encodeLastTail(char32));
} else if (!UTF16.isSurrogate((char) char32) || isCESU8) {
} else if (!UTF16.isSurrogate(char32) || isCESU8) {
/* 3 bytes to encode from char32 */
target.put(encodeHeadOf3(char32));

View file

@ -33,30 +33,30 @@ public final class CharacterIteration {
// which leaves it in position for underlying iterator's next() to work.
int c = ci.current();
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE && c<=UTF16.LEAD_SURROGATE_MAX_VALUE) {
c = ci.next();
c = ci.next();
if (c<UTF16.TRAIL_SURROGATE_MIN_VALUE || c>UTF16.TRAIL_SURROGATE_MAX_VALUE) {
ci.previous();
ci.previous();
}
}
// For BMP chars, this next() is the real deal.
c = ci.next();
// If we might have a lead surrogate, we need to peek ahead to get the trail
// If we might have a lead surrogate, we need to peek ahead to get the trail
// even though we don't want to really be positioned there.
if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
c = nextTrail32(ci, c);
c = nextTrail32(ci, c);
}
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c != DONE32) {
// We got a supplementary char. Back the iterator up to the position
// of the lead surrogate.
ci.previous();
ci.previous();
}
return c;
}
// Out-of-line portion of the in-line Next32 code.
// The call site does an initial ci.next() and calls this function
// if the 16 bit value it gets is >= LEAD_SURROGATE_MIN_VALUE.
@ -81,36 +81,36 @@ public final class CharacterIteration {
}
return retVal;
}
public static int previous32(CharacterIterator ci) {
if (ci.getIndex() <= ci.getBeginIndex()) {
return DONE32;
return DONE32;
}
char trail = ci.previous();
int retVal = trail;
if (UTF16.isTrailSurrogate(trail) && ci.getIndex()>ci.getBeginIndex()) {
char lead = ci.previous();
if (UTF16.isLeadSurrogate(lead)) {
retVal = (((int)lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
((int)trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
(trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
UTF16.SUPPLEMENTARY_MIN_VALUE;
} else {
ci.next();
}
}
}
return retVal;
}
public static int current32(CharacterIterator ci) {
char lead = ci.current();
int retVal = lead;
if (retVal < UTF16.LEAD_SURROGATE_MIN_VALUE) {
return retVal;
return retVal;
}
if (UTF16.isLeadSurrogate(lead)) {
int trail = (int)ci.next();
int trail = ci.next();
ci.previous();
if (UTF16.isTrailSurrogate((char)trail)) {
if (UTF16.isTrailSurrogate(trail)) {
retVal = ((lead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10) +
(trail - UTF16.TRAIL_SURROGATE_MIN_VALUE) +
UTF16.SUPPLEMENTARY_MIN_VALUE;
@ -118,7 +118,7 @@ public final class CharacterIteration {
} else {
if (lead == CharacterIterator.DONE) {
if (ci.getIndex() >= ci.getEndIndex()) {
retVal = DONE32;
retVal = DONE32;
}
}
}

View file

@ -102,7 +102,7 @@ public class ReplaceableUCharacterIterator extends UCharacterIterator {
// trail surrogate, check for surrogates
int ch = current();
if(UTF16.isLeadSurrogate((char)ch)){
if(UTF16.isLeadSurrogate(ch)){
// advance the index to get the next code point
next();
// due to post increment semantics current() after next()
@ -111,7 +111,7 @@ public class ReplaceableUCharacterIterator extends UCharacterIterator {
// current should never change the current index so back off
previous();
if(UTF16.isTrailSurrogate((char)ch2)){
if(UTF16.isTrailSurrogate(ch2)){
// we found a surrogate pair
return Character.toCodePoint((char)ch, (char)ch2);
}

View file

@ -865,7 +865,7 @@ public final class Utility {
// if there is a trail surrogate after it, either as an
// escape or as a literal. If so, join them up into a
// supplementary.
if (offset < length && result <= 0xffff && UTF16.isLeadSurrogate((char) result)) {
if (offset < length && UTF16.isLeadSurrogate(result)) {
int ahead = offset+1;
c = s.charAt(offset); // [sic] get 16-bit code unit
if (c == '\\' && ahead < length) {
@ -875,7 +875,7 @@ public final class Utility {
ahead += cpAndLength & 0xff;
}
}
if (c <= 0xffff && UTF16.isTrailSurrogate((char) c)) {
if (UTF16.isTrailSurrogate(c)) {
offset = ahead;
result = Character.toCodePoint((char) result, (char) c);
}

View file

@ -160,6 +160,28 @@ import com.ibm.icu.util.VersionInfo;
public final class UCharacter implements ECharacterCategory, ECharacterDirection
{
/**
* Lead surrogate bitmask
*/
private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
/**
* Trail surrogate bitmask
*/
private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
/**
* Lead surrogate bits
*/
private static final int LEAD_SURROGATE_BITS = 0xD800;
/**
* Trail surrogate bits
*/
private static final int TRAIL_SURROGATE_BITS = 0xDC00;
private static final int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
// public inner classes ----------------------------------------------
/**
@ -5266,19 +5288,21 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
/**
* {@icu} Returns a code point corresponding to the two surrogate code units.
*
* @param lead the lead char
* @param trail the trail char
* @return code point if surrogate characters are valid.
* @param lead the lead unit
* (In ICU 2.1-69 the type of both parameters was <code>char</code>.)
* @param trail the trail unit
* @return code point if lead and trail form a valid surrogate pair.
* @exception IllegalArgumentException thrown when the code units do
* not form a valid code point
* @stable ICU 2.1
* not form a valid surrogate pair
* @stable ICU 70
* @see #toCodePoint(int, int)
*/
public static int getCodePoint(char lead, char trail)
public static int getCodePoint(int lead, int trail)
{
if (Character.isSurrogatePair(lead, trail)) {
return Character.toCodePoint(lead, trail);
if (isHighSurrogate(lead) && isLowSurrogate(trail)) {
return toCodePoint(lead, trail);
}
throw new IllegalArgumentException("Illegal surrogate characters");
throw new IllegalArgumentException("Not a valid surrogate pair");
}
/**
@ -6180,37 +6204,43 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
}
/**
* Same as {@link Character#isHighSurrogate}.
* Same as {@link Character#isHighSurrogate},
* except that the ICU version accepts <code>int</code> for code points.
*
* @param ch the char to check
* @return true if ch is a high (lead) surrogate
* @stable ICU 3.0
* @param codePoint the code point to check
* (In ICU 3.0-69 the type of this parameter was <code>char</code>.)
* @return true if codePoint is a high (lead) surrogate
* @stable ICU 70
*/
public static boolean isHighSurrogate(char ch) {
return Character.isHighSurrogate(ch);
public static boolean isHighSurrogate(int codePoint) {
return (codePoint & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
}
/**
* Same as {@link Character#isLowSurrogate}.
* Same as {@link Character#isLowSurrogate},
* except that the ICU version accepts <code>int</code> for code points.
*
* @param ch the char to check
* @return true if ch is a low (trail) surrogate
* @stable ICU 3.0
* @param codePoint the code point to check
* (In ICU 3.0-69 the type of this parameter was <code>char</code>.)
* @return true if codePoint is a low (trail) surrogate
* @stable ICU 70
*/
public static boolean isLowSurrogate(char ch) {
return Character.isLowSurrogate(ch);
public static boolean isLowSurrogate(int codePoint) {
return (codePoint & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
}
/**
* Same as {@link Character#isSurrogatePair}.
* Same as {@link Character#isSurrogatePair},
* except that the ICU version accepts <code>int</code> for code points.
*
* @param high the high (lead) char
* @param low the low (trail) char
* @param high the high (lead) unit
* (In ICU 3.0-69 the type of both parameters was <code>char</code>.)
* @param low the low (trail) unit
* @return true if high, low form a surrogate pair
* @stable ICU 3.0
* @stable ICU 70
*/
public static final boolean isSurrogatePair(char high, char low) {
return Character.isSurrogatePair(high, low);
public static final boolean isSurrogatePair(int high, int low) {
return isHighSurrogate(high) && isLowSurrogate(low);
}
/**
@ -6227,17 +6257,21 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
}
/**
* Same as {@link Character#toCodePoint}.
* Same as {@link Character#toCodePoint},
* except that the ICU version accepts <code>int</code> for code points.
* Returns the code point represented by the two surrogate code units.
* This does not check the surrogate pair for validity.
*
* @param high the high (lead) surrogate
* (In ICU 3.0-69 the type of both parameters was <code>char</code>.)
* @param low the low (trail) surrogate
* @return the code point formed by the surrogate pair
* @stable ICU 3.0
* @stable ICU 70
* @see #getCodePoint(int, int)
*/
public static final int toCodePoint(char high, char low) {
return Character.toCodePoint(high, low);
public static final int toCodePoint(int high, int low) {
// see ICU4C U16_GET_SUPPLEMENTARY()
return (high << 10) + low - U16_SURROGATE_OFFSET;
}
/**

View file

@ -2237,7 +2237,7 @@ public final class Normalizer implements Cloneable {
/* get complete code points for c1, c2 for lookups if either is a surrogate */
cp1=c1;
if(UTF16.isSurrogate((char)c1)) {
if(UTF16.isSurrogate(c1)) {
char c;
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
@ -2253,7 +2253,7 @@ public final class Normalizer implements Cloneable {
}
cp2=c2;
if(UTF16.isSurrogate((char)c2)) {
if(UTF16.isSurrogate(c2)) {
char c;
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
@ -2277,7 +2277,7 @@ public final class Normalizer implements Cloneable {
(length=csp.toFullFolding(cp1, fold1, options))>=0
) {
/* cp1 case-folds to the code point "length" or to p[length] */
if(UTF16.isSurrogate((char)c1)) {
if(UTF16.isSurrogate(c1)) {
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
/* advance beyond source surrogate pair if it case-folds */
++s1;
@ -2325,7 +2325,7 @@ public final class Normalizer implements Cloneable {
(length=csp.toFullFolding(cp2, fold2, options))>=0
) {
/* cp2 case-folds to the code point "length" or to p[length] */
if(UTF16.isSurrogate((char)c2)) {
if(UTF16.isSurrogate(c2)) {
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
/* advance beyond source surrogate pair if it case-folds */
++s2;
@ -2373,7 +2373,7 @@ public final class Normalizer implements Cloneable {
(decomp1=nfcImpl.getDecomposition(cp1))!=null
) {
/* cp1 decomposes into p[length] */
if(UTF16.isSurrogate((char)c1)) {
if(UTF16.isSurrogate(c1)) {
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
/* advance beyond source surrogate pair if it decomposes */
++s1;
@ -2417,7 +2417,7 @@ public final class Normalizer implements Cloneable {
(decomp2=nfcImpl.getDecomposition(cp2))!=null
) {
/* cp2 decomposes into p[length] */
if(UTF16.isSurrogate((char)c2)) {
if(UTF16.isSurrogate(c2)) {
if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
/* advance beyond source surrogate pair if it decomposes */
++s2;

View file

@ -153,7 +153,7 @@ public abstract class UCharacterIterator implements Cloneable, UForwardCharacter
*/
public int currentCodePoint() {
int ch = current();
if (UTF16.isLeadSurrogate((char) ch)) {
if (UTF16.isLeadSurrogate(ch)) {
// advance the index to get the
// next code point
next();
@ -165,7 +165,7 @@ public abstract class UCharacterIterator implements Cloneable, UForwardCharacter
// the current index so back off
previous();
if (UTF16.isTrailSurrogate((char) ch2)) {
if (UTF16.isTrailSurrogate(ch2)) {
// we found a surrogate pair
// return the codepoint
return Character.toCodePoint((char) ch, (char) ch2);
@ -211,9 +211,9 @@ public abstract class UCharacterIterator implements Cloneable, UForwardCharacter
@Override
public int nextCodePoint() {
int ch1 = next();
if (UTF16.isLeadSurrogate((char) ch1)) {
if (UTF16.isLeadSurrogate(ch1)) {
int ch2 = next();
if (UTF16.isTrailSurrogate((char) ch2)) {
if (UTF16.isTrailSurrogate(ch2)) {
return Character.toCodePoint((char) ch1, (char) ch2);
} else if (ch2 != DONE) {
// unmatched surrogate so back out
@ -243,9 +243,9 @@ public abstract class UCharacterIterator implements Cloneable, UForwardCharacter
*/
public int previousCodePoint() {
int ch1 = previous();
if (UTF16.isTrailSurrogate((char) ch1)) {
if (UTF16.isTrailSurrogate(ch1)) {
int ch2 = previous();
if (UTF16.isLeadSurrogate((char) ch2)) {
if (UTF16.isLeadSurrogate(ch2)) {
return Character.toCodePoint((char) ch2, (char) ch1);
} else if (ch2 != DONE) {
// unmatched trail surrogate so back out

View file

@ -596,36 +596,39 @@ public final class UTF16 {
}
/**
* Determines whether the code value is a surrogate.
* Determines whether the code point is a surrogate.
*
* @param char16 The input character.
* @return true If the input character is a surrogate.
* @stable ICU 2.1
* @param codePoint The input code point.
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
* @return true If the input code point is a surrogate.
* @stable ICU 70
*/
public static boolean isSurrogate(char char16) {
return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
public static boolean isSurrogate(int codePoint) {
return (codePoint & SURROGATE_BITMASK) == SURROGATE_BITS;
}
/**
* Determines whether the character is a trail surrogate.
* Determines whether the code point is a trail surrogate.
*
* @param char16 The input character.
* @return true If the input character is a trail surrogate.
* @stable ICU 2.1
* @param codePoint The input code point.
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
* @return true If the input code point is a trail surrogate.
* @stable ICU 70
*/
public static boolean isTrailSurrogate(char char16) {
return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
public static boolean isTrailSurrogate(int codePoint) {
return (codePoint & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
}
/**
* Determines whether the character is a lead surrogate.
* Determines whether the code point is a lead surrogate.
*
* @param char16 The input character.
* @return true If the input character is a lead surrogate
* @stable ICU 2.1
* @param codePoint The input code point.
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
* @return true If the input code point is a lead surrogate
* @stable ICU 70
*/
public static boolean isLeadSurrogate(char char16) {
return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
public static boolean isLeadSurrogate(int codePoint) {
return (codePoint & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
}
/**
@ -1545,7 +1548,7 @@ public final class UTF16 {
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
int result = source.indexOf((char) char32);
if (result >= 0) {
if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
if (isLeadSurrogate(char32) && (result < source.length() - 1)
&& isTrailSurrogate(source.charAt(result + 1))) {
return indexOf(source, char32, result + 1);
}
@ -1646,7 +1649,7 @@ public final class UTF16 {
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
int result = source.indexOf((char) char32, fromIndex);
if (result >= 0) {
if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
if (isLeadSurrogate(char32) && (result < source.length() - 1)
&& isTrailSurrogate(source.charAt(result + 1))) {
return indexOf(source, char32, result + 1);
}
@ -1748,7 +1751,7 @@ public final class UTF16 {
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
int result = source.lastIndexOf((char) char32);
if (result >= 0) {
if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
if (isLeadSurrogate(char32) && (result < source.length() - 1)
&& isTrailSurrogate(source.charAt(result + 1))) {
return lastIndexOf(source, char32, result - 1);
}
@ -1859,7 +1862,7 @@ public final class UTF16 {
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
int result = source.lastIndexOf((char) char32, fromIndex);
if (result >= 0) {
if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
if (isLeadSurrogate(char32) && (result < source.length() - 1)
&& isTrailSurrogate(source.charAt(result + 1))) {
return lastIndexOf(source, char32, result - 1);
}

View file

@ -544,7 +544,7 @@ public class TestCharset extends TestFmwk {
bytes[x + 1] = (byte) (0x80 | ((i >> 6) & 0x3f));
bytes[x + 2] = (byte) (0x80 | ((i >> 0) & 0x3f));
chars[y] = (char) i;
if (!UTF16.isSurrogate((char)i)) {
if (!UTF16.isSurrogate(i)) {
bs = ByteBuffer.wrap(bytes, x, 3).slice();
us = CharBuffer.wrap(chars, y, 1).slice();
try {

View file

@ -78,50 +78,58 @@ public final class UCharacterSurrogateTest extends TestFmwk {
@Test
public void TestIsHighSurrogate() {
if (UCharacter
.isHighSurrogate((char) (UCharacter.MIN_HIGH_SURROGATE - 1)))
// Test with both int & char values.
if (UCharacter.isHighSurrogate(UCharacter.MIN_HIGH_SURROGATE - 1) ||
UCharacter.isHighSurrogate((char) (UCharacter.MIN_HIGH_SURROGATE - 1)))
errln("0xd7ff");
if (!UCharacter.isHighSurrogate(UCharacter.MIN_HIGH_SURROGATE))
errln("0xd800");
if (!UCharacter.isHighSurrogate(UCharacter.MAX_HIGH_SURROGATE))
errln("0xdbff");
if (UCharacter
.isHighSurrogate((char) (UCharacter.MAX_HIGH_SURROGATE + 1)))
if (UCharacter.isHighSurrogate(UCharacter.MAX_HIGH_SURROGATE + 1) ||
UCharacter.isHighSurrogate((char) (UCharacter.MAX_HIGH_SURROGATE + 1)))
errln("0xdc00");
}
@Test
public void TestIsLowSurrogate() {
if (UCharacter
.isLowSurrogate((char) (UCharacter.MIN_LOW_SURROGATE - 1)))
// Test with both int & char values.
if (UCharacter.isLowSurrogate(UCharacter.MIN_LOW_SURROGATE - 1) ||
UCharacter.isLowSurrogate((char) (UCharacter.MIN_LOW_SURROGATE - 1)))
errln("0xdbff");
if (!UCharacter.isLowSurrogate(UCharacter.MIN_LOW_SURROGATE))
errln("0xdc00");
if (!UCharacter.isLowSurrogate(UCharacter.MAX_LOW_SURROGATE))
errln("0xdfff");
if (UCharacter
.isLowSurrogate((char) (UCharacter.MAX_LOW_SURROGATE + 1)))
if (UCharacter.isLowSurrogate(UCharacter.MAX_LOW_SURROGATE + 1) ||
UCharacter.isLowSurrogate((char) (UCharacter.MAX_LOW_SURROGATE + 1)))
errln("0xe000");
}
@Test
public void TestIsSurrogatePair() {
// Test with both int & char values.
if (UCharacter.isSurrogatePair(
(char) (UCharacter.MIN_HIGH_SURROGATE - 1),
UCharacter.MIN_LOW_SURROGATE))
UCharacter.MIN_HIGH_SURROGATE - 1, UCharacter.MIN_LOW_SURROGATE) ||
UCharacter.isSurrogatePair(
(char) (UCharacter.MIN_HIGH_SURROGATE - 1), UCharacter.MIN_LOW_SURROGATE))
errln("0xd7ff,0xdc00");
if (UCharacter.isSurrogatePair(
(char) (UCharacter.MAX_HIGH_SURROGATE + 1),
UCharacter.MIN_LOW_SURROGATE))
UCharacter.MAX_HIGH_SURROGATE + 1, UCharacter.MIN_LOW_SURROGATE) ||
UCharacter.isSurrogatePair(
(char) (UCharacter.MAX_HIGH_SURROGATE + 1), UCharacter.MIN_LOW_SURROGATE))
errln("0xd800,0xdc00");
if (UCharacter.isSurrogatePair(UCharacter.MIN_HIGH_SURROGATE,
(char) (UCharacter.MIN_LOW_SURROGATE - 1)))
if (UCharacter.isSurrogatePair(
UCharacter.MIN_HIGH_SURROGATE, UCharacter.MIN_LOW_SURROGATE - 1) ||
UCharacter.isSurrogatePair(
UCharacter.MIN_HIGH_SURROGATE, (char) (UCharacter.MIN_LOW_SURROGATE - 1)))
errln("0xd800,0xdbff");
if (UCharacter.isSurrogatePair(UCharacter.MIN_HIGH_SURROGATE,
(char) (UCharacter.MAX_LOW_SURROGATE + 1)))
if (UCharacter.isSurrogatePair(
UCharacter.MIN_HIGH_SURROGATE, UCharacter.MAX_LOW_SURROGATE + 1) ||
UCharacter.isSurrogatePair(
UCharacter.MIN_HIGH_SURROGATE, (char) (UCharacter.MAX_LOW_SURROGATE + 1)))
errln("0xd800,0xe000");
if (!UCharacter.isSurrogatePair(UCharacter.MIN_HIGH_SURROGATE,
UCharacter.MIN_LOW_SURROGATE))
if (!UCharacter.isSurrogatePair(UCharacter.MIN_HIGH_SURROGATE, UCharacter.MIN_LOW_SURROGATE))
errln("0xd800,0xdc00");
}
@ -157,6 +165,9 @@ public final class UCharacterSurrogateTest extends TestFmwk {
errln(Integer.toHexString(pairs[i]) + ", " + pairs[i + 1]);
break;
}
// Also test with int values.
int cp2 = UCharacter.toCodePoint(pairs[i], pairs[i + 1]);
assertEquals("pairs at " + i, cp, cp2);
}
}

View file

@ -1526,8 +1526,14 @@ public final class UCharacterTest extends TestFmwk
ch ++;
}
}
try
{
// Test with both char & int values.
try {
UCharacter.getCodePoint(0xD7ff, 0xDC00);
errln("Invalid surrogate characters should not form a " +
"supplementary");
} catch(Exception e) {
}
try {
UCharacter.getCodePoint((char)0xD7ff, (char)0xDC00);
errln("Invalid surrogate characters should not form a " +
"supplementary");

View file

@ -480,26 +480,35 @@ public final class UTF16Test extends TestFmwk
@Test
public void TestGetCharCountSurrogate()
{
if (UTF16.getCharCount(0x61) != 1 ||
UTF16.getCharCount(0x10000) != 2) {
errln("FAIL getCharCount result failure");
if (UTF16.getCharCount(0x61) != 1 || UTF16.getCharCount(0x10000) != 2) {
errln("FAIL getCharCount result failure");
}
// ICU-21655 (ICU 70) widened the surrogate functions from char to int.
// Test with both types, in case someone like Android retains binary-compatibility overloads.
if (UTF16.getLeadSurrogate(0x61) != 0 ||
UTF16.getTrailSurrogate(0x61) != 0x61 ||
UTF16.isLeadSurrogate((char)0x61) ||
UTF16.isTrailSurrogate((char)0x61) ||
UTF16.getLeadSurrogate(0x10000) != 0xd800 ||
UTF16.getTrailSurrogate(0x10000) != 0xdc00 ||
UTF16.isLeadSurrogate((char)0xd800) != true ||
UTF16.isTrailSurrogate((char)0xd800) ||
UTF16.isLeadSurrogate((char)0xdc00) ||
UTF16.isTrailSurrogate((char)0xdc00) != true) {
errln("FAIL *Surrogate result failure");
UTF16.getTrailSurrogate(0x61) != 0x61 ||
UTF16.isLeadSurrogate((char)0x61) ||
UTF16.isTrailSurrogate((char)0x61) ||
UTF16.isLeadSurrogate(0x61) ||
UTF16.isTrailSurrogate(0x61) ||
UTF16.getLeadSurrogate(0x10000) != 0xd800 ||
UTF16.getTrailSurrogate(0x10000) != 0xdc00 ||
UTF16.isLeadSurrogate((char)0xd800) != true ||
UTF16.isTrailSurrogate((char)0xd800) ||
UTF16.isLeadSurrogate((char)0xdc00) ||
UTF16.isTrailSurrogate((char)0xdc00) != true ||
UTF16.isLeadSurrogate(0xd800) != true ||
UTF16.isTrailSurrogate(0xd800) ||
UTF16.isLeadSurrogate(0xdc00) ||
UTF16.isTrailSurrogate(0xdc00) != true) {
errln("FAIL *Surrogate result failure");
}
if (UTF16.isSurrogate((char)0x61) || !UTF16.isSurrogate((char)0xd800)
|| !UTF16.isSurrogate((char)0xdc00)) {
errln("FAIL isSurrogate result failure");
|| !UTF16.isSurrogate((char)0xdc00)
|| UTF16.isSurrogate(0x61) || !UTF16.isSurrogate(0xd800)
|| !UTF16.isSurrogate(0xdc00)) {
errln("FAIL isSurrogate result failure");
}
}

View file

@ -287,7 +287,7 @@ public final class TrieTest extends TestFmwk
+ Integer.toHexString(value2) + " instead of 0x"
+ Integer.toHexString(value));
}
if (!UTF16.isLeadSurrogate((char)start)) {
if (!UTF16.isLeadSurrogate(start)) {
value2 = trie.getLeadValue((char)start);
if (value != value2) {
errln("serialized trie.getLeadValue(U+"