mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-1923 upgrade case folding to Unicode 3.2
X-SVN-Rev: 8769
This commit is contained in:
parent
f8749a02dd
commit
0a70506190
5 changed files with 120 additions and 27 deletions
|
@ -1434,6 +1434,45 @@ u_internalToTitle(UChar32 c, UCharIterator *iter,
|
|||
* a full mapping, i.e., a string.
|
||||
* If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
|
||||
* then only the lowercase mapping is stored.
|
||||
*
|
||||
* Some special cases are hardcoded because their conditions cannot be
|
||||
* parsed and processed from CaseFolding.txt.
|
||||
*
|
||||
* Unicode 3.2 CaseFolding.txt specifies for its status field:
|
||||
|
||||
# C: common case folding, common mappings shared by both simple and full mappings.
|
||||
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
|
||||
# S: simple case folding, mappings to single characters where different from F.
|
||||
# T: special case for uppercase I and dotted uppercase I
|
||||
# - For non-Turkic languages, this mapping is normally not used.
|
||||
# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
|
||||
#
|
||||
# Usage:
|
||||
# A. To do a simple case folding, use the mappings with status C + S.
|
||||
# B. To do a full case folding, use the mappings with status C + F.
|
||||
#
|
||||
# The mappings with status T can be used or omitted depending on the desired case-folding
|
||||
# behavior. (The default option is to exclude them.)
|
||||
|
||||
* Unicode 3.2 has 'T' mappings as follows:
|
||||
|
||||
0049; T; 0131; # LATIN CAPITAL LETTER I
|
||||
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
||||
* while the default mappings for these code points are:
|
||||
|
||||
0049; C; 0069; # LATIN CAPITAL LETTER I
|
||||
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
||||
* U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt).
|
||||
*
|
||||
* In case this code is used with CaseFolding.txt from an older version of Unicode
|
||||
* where CaseFolding.txt contains mappings with a status of 'I' that
|
||||
* have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
|
||||
* we must also hardcode the Unicode 3.2 mappings for the code points
|
||||
* with 'I' mappings.
|
||||
* Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
|
||||
* Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
|
||||
*/
|
||||
|
||||
/* return the simple case folding mapping for c */
|
||||
|
@ -1467,11 +1506,26 @@ u_foldCase(UChar32 c, uint32_t options) {
|
|||
pe=oldPE;
|
||||
} else {
|
||||
/* special case folding mappings, hardcoded */
|
||||
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT && (uint32_t)(c-0x130)<=1) {
|
||||
/* map dotted I and dotless i to U+0069 small i */
|
||||
return 0x69;
|
||||
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
|
||||
/* default mappings */
|
||||
if(c==0x49) {
|
||||
/* 0049; C; 0069; # LATIN CAPITAL LETTER I */
|
||||
return 0x69;
|
||||
} else if(c==0x130) {
|
||||
/* no simple default case folding for U+0130; use its simple lowercase mapping from UnicodeData.txt (U+0069) */
|
||||
return 0x69;
|
||||
}
|
||||
} else {
|
||||
/* Turkic mappings */
|
||||
if(c==0x49) {
|
||||
/* 0049; T; 0131; # LATIN CAPITAL LETTER I */
|
||||
return 0x131;
|
||||
} else if(c==0x130) {
|
||||
/* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
|
||||
return 0x69;
|
||||
}
|
||||
}
|
||||
/* return c itself because it is excluded from case folding */
|
||||
/* return c itself because there is no special mapping for it */
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
@ -1526,12 +1580,33 @@ u_internalFoldCase(UChar32 c,
|
|||
return length;
|
||||
} else {
|
||||
/* special case folding mappings, hardcoded */
|
||||
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT && (uint32_t)(c-0x130)<=1) {
|
||||
/* map dotted I and dotless i to U+0069 small i */
|
||||
result =0x69;
|
||||
/* goto single; */
|
||||
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
|
||||
/* default mappings */
|
||||
if(c==0x49) {
|
||||
/* 0049; C; 0069; # LATIN CAPITAL LETTER I */
|
||||
result=0x69;
|
||||
} else if(c==0x130) {
|
||||
/* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
|
||||
if(0<destCapacity) {
|
||||
dest[0]=0x69;
|
||||
}
|
||||
if(1<destCapacity) {
|
||||
dest[1]=0x307;
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
} else {
|
||||
/* Turkic mappings */
|
||||
if(c==0x49) {
|
||||
/* 0049; T; 0131; # LATIN CAPITAL LETTER I */
|
||||
result=0x131;
|
||||
} else if(c==0x130) {
|
||||
/* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
|
||||
result=0x69;
|
||||
}
|
||||
}
|
||||
/* return c itself because it is excluded from case folding */
|
||||
/* return c itself because there is no special mapping for it */
|
||||
/* goto single; */
|
||||
}
|
||||
} else if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
|
||||
i=EXC_LOWERCASE;
|
||||
|
|
|
@ -1746,9 +1746,25 @@ u_toupper(UChar32 c);
|
|||
U_CAPI UChar32 U_EXPORT2
|
||||
u_totitle(UChar32 c);
|
||||
|
||||
/** Option value for case folding: use all mappings defined in CaseFolding.txt. @draft ICU 1.8 */
|
||||
/** Option value for case folding: use default mappings defined in CaseFolding.txt. @stable */
|
||||
#define U_FOLD_CASE_DEFAULT 0
|
||||
/** Option value for case folding: exclude the mappings for dotted I and dotless i marked with 'I' in CaseFolding.txt. @draft ICU 1.8 */
|
||||
|
||||
/**
|
||||
* Option value for case folding:
|
||||
*
|
||||
* Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
|
||||
* and dotless i appropriately for Turkic languages (tr, az).
|
||||
*
|
||||
* Before Unicode 3.2, CaseFolding.txt contained mappings marked with 'I' that
|
||||
* are to be included for default mappings and
|
||||
* excluded for the Turkic-specific mappings.
|
||||
*
|
||||
* Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that
|
||||
* are to be excluded for default mappings and
|
||||
* included for the Turkic-specific mappings.
|
||||
*
|
||||
* @stable
|
||||
*/
|
||||
#define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
|
||||
|
||||
/**
|
||||
|
|
|
@ -331,17 +331,19 @@ TestCaseFolding() {
|
|||
simple[]={
|
||||
/* input, default, exclude special i */
|
||||
0x61, 0x61, 0x61,
|
||||
0x49, 0x69, 0x69,
|
||||
0x131, 0x69, 0x131,
|
||||
0x49, 0x69, 0x131,
|
||||
0x130, 0x69, 0x69,
|
||||
0x131, 0x131, 0x131,
|
||||
0xdf, 0xdf, 0xdf,
|
||||
0xfb03, 0xfb03, 0xfb03,
|
||||
0x1040e,0x10436,0x10436,
|
||||
0x5ffff,0x5ffff,0x5ffff
|
||||
};
|
||||
|
||||
static const UChar
|
||||
mixed[]= { 0x61, 0x42, 0x131, 0x3d0, 0xdf, 0xfb03, 0xd93f, 0xdfff },
|
||||
foldedDefault[]= { 0x61, 0x62, 0x69, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff },
|
||||
foldedExcludeSpecialI[]={ 0x61, 0x62, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff };
|
||||
mixed[]= { 0x61, 0x42, 0x130, 0x49, 0x131, 0x3d0, 0xdf, 0xfb03, 0xd93f, 0xdfff },
|
||||
foldedDefault[]= { 0x61, 0x62, 0x69, 0x307, 0x69, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff },
|
||||
foldedExcludeSpecialI[]={ 0x61, 0x62, 0x69, 0x131, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff };
|
||||
|
||||
UVersionInfo unicodeVersion={ 0, 0, 17, 89 }, unicode_3_1={ 3, 1, 0, 0 };
|
||||
|
||||
|
@ -544,9 +546,9 @@ TestCaseCompare() {
|
|||
static const UChar
|
||||
|
||||
mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0xfb03, 0xd93f, 0xdfff, 0 },
|
||||
otherDefault[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
|
||||
otherDefault[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
|
||||
otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
|
||||
different[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
|
||||
different[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
|
||||
|
||||
UVersionInfo unicodeVersion={ 0, 0, 17, 89 }, unicode_3_1={ 3, 1, 0, 0 };
|
||||
|
||||
|
|
|
@ -290,8 +290,9 @@ StringCaseTest::TestCaseConversion()
|
|||
// test case folding
|
||||
{
|
||||
UnicodeString
|
||||
s=UNICODE_STRING("A\\u00df\\u00b5\\ufb03\\U0001040c\\u0131", 35).unescape(),
|
||||
f=UNICODE_STRING("ass\\u03bcffi\\U00010434i", 23).unescape(),
|
||||
s=UnicodeString("A\\u00df\\u00b5\\ufb03\\U0001040c\\u0130\\u0131", "").unescape(),
|
||||
f=UnicodeString("ass\\u03bcffi\\U00010434i\\u0307\\u0131", "").unescape(),
|
||||
g=UnicodeString("ass\\u03bcffi\\U00010434i\\u0131", "").unescape(),
|
||||
t;
|
||||
|
||||
(t=s).foldCase();
|
||||
|
@ -300,10 +301,9 @@ StringCaseTest::TestCaseConversion()
|
|||
}
|
||||
|
||||
// alternate handling for dotted I/dotless i (U+0130, U+0131)
|
||||
f.setCharAt(f.length()-1, 0x131);
|
||||
(t=s).foldCase(U_FOLD_CASE_EXCLUDE_SPECIAL_I);
|
||||
if(f!=t) {
|
||||
errln("error in foldCase(\"" + s + "\", U_FOLD_CASE_EXCLUDE_SPECIAL_I)=\"" + t + "\" but expected \"" + f + "\"");
|
||||
if(g!=t) {
|
||||
errln("error in foldCase(\"" + s + "\", U_FOLD_CASE_EXCLUDE_SPECIAL_I)=\"" + t + "\" but expected \"" + g + "\"");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -299,10 +299,10 @@ UnicodeStringTest::TestCompare()
|
|||
/* test caseCompare() */
|
||||
{
|
||||
static const UChar
|
||||
_mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0xfb03, 0xd93f, 0xdfff, 0 },
|
||||
_otherDefault[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
|
||||
_otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
|
||||
_different[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
|
||||
_mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0x130, 0x49, 0xfb03, 0xd93f, 0xdfff, 0 },
|
||||
_otherDefault[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x69, 0x307, 0x69, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
|
||||
_otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x69, 0x131, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
|
||||
_different[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x130, 0x49, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
|
||||
|
||||
UnicodeString
|
||||
mixed(TRUE, _mixed, -1),
|
||||
|
|
Loading…
Add table
Reference in a new issue