ICU-12725 Update u_isIDStart and u_isIDPart to TR31

ICU-12725 move to uprops.cpp

ICU-12725 change dependency

ICU-12725 Fix Java implementation
This commit is contained in:
Frank Tang 2023-01-18 18:00:35 -08:00 committed by Frank Yung-Fong Tang
parent bb0e745e25
commit de9cb9a133
7 changed files with 47 additions and 73 deletions

View file

@ -304,30 +304,6 @@ u_ispunct(UChar32 c) {
return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
}
/*
 * Reports whether code point c may begin a Unicode identifier.
 * The decision is made purely on the general category of c: any letter
 * category (L) or letter number (Nl) qualifies.
 */
U_CAPI UBool U_EXPORT2
u_isIDStart(UChar32 c) {
    /* General categories accepted in the first position of an identifier. */
    const uint32_t idStartMask = U_GC_L_MASK | U_GC_NL_MASK;
    uint32_t props;

    GET_PROPS(c, props);
    return (UBool)((CAT_MASK(props) & idStartMask) != 0);
}
/*
 * Reports whether code point c may appear in a Unicode identifier after the
 * first character. A code point qualifies when its general category is a
 * letter (L), letter number (Nl), decimal digit (Nd), combining mark (Mc, Mn)
 * or connector punctuation (Pc), or when u_isIDIgnorable(c) accepts it.
 */
U_CAPI UBool U_EXPORT2
u_isIDPart(UChar32 c) {
    /* General categories accepted inside an identifier. */
    const uint32_t idPartMask =
        U_GC_L_MASK | U_GC_NL_MASK | U_GC_ND_MASK |
        U_GC_MC_MASK | U_GC_MN_MASK | U_GC_PC_MASK;
    uint32_t props;
    UBool inCategory;

    GET_PROPS(c, props);
    inCategory = (UBool)((CAT_MASK(props) & idPartMask) != 0);

    /* u_isIDIgnorable() is consulted only when the category test fails. */
    return (UBool)(inCategory || u_isIDIgnorable(c));
}
/*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
U_CAPI UBool U_EXPORT2
u_isIDIgnorable(UChar32 c) {

View file

@ -3837,9 +3837,8 @@ u_getPropertyValueEnum(UProperty property,
/**
* Determines if the specified character is permissible as the
* first character in an identifier according to Unicode
* (The Unicode Standard, Version 3.0, chapter 5.16 Identifiers).
* True for characters with general categories "L" (letters) and "Nl" (letter numbers).
* first character in an identifier as ID_Start according to
* Unicode® Standard Annex #31 UNICODE IDENTIFIER AND PATTERN SYNTAX
*
* Same as java.lang.Character.isUnicodeIdentifierStart().
* Same as UCHAR_ID_START
@ -3856,12 +3855,9 @@ U_CAPI UBool U_EXPORT2
u_isIDStart(UChar32 c);
/**
* Determines if the specified character is permissible
* in an identifier according to Java.
* True for characters with general categories "L" (letters),
* "Nl" (letter numbers), "Nd" (decimal digits),
* "Mc" and "Mn" (combining marks), "Pc" (connecting punctuation), and
* u_isIDIgnorable(c).
* Determines if the specified character is permissible as a
* character other than the first character in an identifier as ID_Continue
* according to Unicode® Standard Annex #31 UNICODE IDENTIFIER AND PATTERN SYNTAX
*
* Same as java.lang.Character.isUnicodeIdentifierPart().
* Almost the same as Unicode's ID_Continue (UCHAR_ID_CONTINUE)
@ -3869,7 +3865,8 @@ u_isIDStart(UChar32 c);
* u_isIDIgnorable(c).
*
* @param c the code point to be tested
* @return true if the code point may occur in an identifier according to Java
* @return true if the code point may occur in an identifier other than the
* first character.
*
* @see UCHAR_ID_CONTINUE
* @see u_isIDStart

View file

@ -423,6 +423,19 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
}
}
/*
 * Reports whether code point c may begin a Unicode identifier.
 * The answer is whatever the UCHAR_ID_START binary property says for c.
 */
U_CAPI UBool U_EXPORT2
u_isIDStart(UChar32 c) {
    const UBool hasIdStart = u_hasBinaryProperty(c, UCHAR_ID_START);
    return hasIdStart;
}
/*
 * Reports whether code point c may appear in a Unicode identifier after the
 * first character. The answer is whatever the UCHAR_ID_CONTINUE binary
 * property says for c.
 */
U_CAPI UBool U_EXPORT2
u_isIDPart(UChar32 c) {
    const UBool hasIdContinue = u_hasBinaryProperty(c, UCHAR_ID_CONTINUE);
    return hasIdContinue;
}
U_CAPI UBool U_EXPORT2
u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which) {
if (s == nullptr && length != 0) { return false; }

View file

@ -895,10 +895,10 @@ static void TestIdentifier()
const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061, 0x1885, 0x212e, 0x309b};
const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019, 0x2e2f};
const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045, 0x1886, 0x212e, 0x309c};
const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020, 0x2019, 0x2e2f};
const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};

View file

@ -351,7 +351,7 @@ group: uniset_core
group: icu_utility_with_props
util_props.o
deps
icu_utility uchar ucase
icu_utility uchar ucase uprops
group: icu_utility
util.o

View file

@ -4550,20 +4550,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static boolean isUnicodeIdentifierPart(int ch)
{
// if props == 0, it will just fall through and return false
// cat == format
// General-category test: builds a one-bit mask from getType(ch) and checks it
// against the letter, letter-number, connector-punctuation, decimal-digit and
// combining-mark categories; isIdentifierIgnorable(ch) is the fallback.
return ((1 << getType(ch))
& ((1 << UCharacterCategory.UPPERCASE_LETTER)
| (1 << UCharacterCategory.LOWERCASE_LETTER)
| (1 << UCharacterCategory.TITLECASE_LETTER)
| (1 << UCharacterCategory.MODIFIER_LETTER)
| (1 << UCharacterCategory.OTHER_LETTER)
| (1 << UCharacterCategory.LETTER_NUMBER)
| (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
| (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
| (1 << UCharacterCategory.COMBINING_SPACING_MARK)
| (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
|| isIdentifierIgnorable(ch);
// NOTE(review): this return directly follows another return, so as written it
// is unreachable. Presumably it is the replacement implementation (a lookup of
// the ID_Continue binary property) and the statement above is the superseded
// one -- confirm which of the two is meant to remain.
return hasBinaryProperty(ch, UProperty.ID_CONTINUE); // single code point
}
/**
@ -4588,15 +4575,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static boolean isUnicodeIdentifierStart(int ch)
{
/*int cat = getType(ch);*/
// if props == 0, it will just fall through and return false
// General-category test: builds a one-bit mask from getType(ch) and checks it
// against the letter and letter-number categories.
return ((1 << getType(ch))
& ((1 << UCharacterCategory.UPPERCASE_LETTER)
| (1 << UCharacterCategory.LOWERCASE_LETTER)
| (1 << UCharacterCategory.TITLECASE_LETTER)
| (1 << UCharacterCategory.MODIFIER_LETTER)
| (1 << UCharacterCategory.OTHER_LETTER)
| (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
// NOTE(review): this return directly follows another return, so as written it
// is unreachable. Presumably it is the replacement implementation (a lookup of
// the ID_Start binary property) and the statement above is the superseded
// one -- confirm which of the two is meant to remain.
return hasBinaryProperty(ch, UProperty.ID_START); // single code point
}
/**

View file

@ -623,15 +623,14 @@ public final class UCharacterTest extends TestFmwk
@Test
public void TestIdentifier()
{
int unicodeidstart[] = {0x0250, 0x0000e2, 0x000061};
int nonunicodeidstart[] = {0x2000, 0x00000a, 0x002019};
int unicodeidpart[] = {0x005f, 0x000032, 0x000045};
int nonunicodeidpart[] = {0x2030, 0x0000a3, 0x000020};
int unicodeidstart[] = {0x0250, 0x0000e2, 0x000061, 0x001885, 0x00212e, 0x00309b};
int nonunicodeidstart[] = {0x2000, 0x00000a, 0x002019, 0x002e2f};
int unicodeidpart[] = {0x005f, 0x000032, 0x000045, 0x001886, 0x00212e, 0x00309c};
int nonunicodeidpart[] = {0x2030, 0x0000a3, 0x000020, 0x002019, 0x002e2f};
int idignore[] = {0x0006, 0x0010, 0x206b};
int nonidignore[] = {0x0075, 0x0000a3, 0x000061};
int size = unicodeidstart.length;
for (int i = 0; i < size; i ++)
for (int i = 0; i < unicodeidstart.length; i ++)
{
if (!UCharacter.isUnicodeIdentifierStart(unicodeidstart[i]))
{
@ -639,6 +638,9 @@ public final class UCharacterTest extends TestFmwk
" expected to be a unicode identifier start character");
break;
}
}
for (int i = 0; i < nonunicodeidstart.length; i ++)
{
if (UCharacter.isUnicodeIdentifierStart(nonunicodeidstart[i]))
{
errln("FAIL \\u" + hex(nonunicodeidstart[i]) +
@ -646,12 +648,18 @@ public final class UCharacterTest extends TestFmwk
"character");
break;
}
}
for (int i = 0; i < unicodeidpart.length; i ++)
{
if (!UCharacter.isUnicodeIdentifierPart(unicodeidpart[i]))
{
errln("FAIL \\u" + hex(unicodeidpart[i]) +
" expected to be a unicode identifier part character");
break;
}
}
for (int i = 0; i < nonunicodeidpart.length; i ++)
{
if (UCharacter.isUnicodeIdentifierPart(nonunicodeidpart[i]))
{
errln("FAIL \\u" + hex(nonunicodeidpart[i]) +
@ -659,23 +667,24 @@ public final class UCharacterTest extends TestFmwk
"character");
break;
}
}
for (int i = 0; i < idignore.length; i ++)
{
if (!UCharacter.isIdentifierIgnorable(idignore[i]))
{
errln("FAIL \\u" + hex(idignore[i]) +
" expected to be a ignorable unicode character");
break;
}
}
for (int i = 0; i < nonidignore.length; i ++)
{
if (UCharacter.isIdentifierIgnorable(nonidignore[i]))
{
errln("FAIL \\u" + hex(nonidignore[i]) +
" expected not to be a ignorable unicode character");
break;
}
logln("Ok \\u" + hex(unicodeidstart[i]) + " and \\u" +
hex(nonunicodeidstart[i]) + " and \\u" +
hex(unicodeidpart[i]) + " and \\u" +
hex(nonunicodeidpart[i]) + " and \\u" +
hex(idignore[i]) + " and \\u" + hex(nonidignore[i]));
}
}