ICU-1923 upgrade case folding to Unicode 3.2

X-SVN-Rev: 8769
This commit is contained in:
Markus Scherer 2002-06-03 03:33:44 +00:00
parent f8749a02dd
commit 0a70506190
5 changed files with 120 additions and 27 deletions

View file

@ -1434,6 +1434,45 @@ u_internalToTitle(UChar32 c, UCharIterator *iter,
* a full mapping, i.e., a string.
* If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
* then only the lowercase mapping is stored.
*
* Some special cases are hardcoded because their conditions cannot be
* parsed and processed from CaseFolding.txt.
*
* Unicode 3.2 CaseFolding.txt specifies for its status field:
# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
# - For non-Turkic languages, this mapping is normally not used.
# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
#
# Usage:
# A. To do a simple case folding, use the mappings with status C + S.
# B. To do a full case folding, use the mappings with status C + F.
#
# The mappings with status T can be used or omitted depending on the desired case-folding
# behavior. (The default option is to exclude them.)
* Unicode 3.2 has 'T' mappings as follows:
0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
* while the default mappings for these code points are:
0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
* U+0130 is otherwise lowercased to U+0069 (UnicodeData.txt).
*
* In case this code is used with CaseFolding.txt from an older version of Unicode
* where CaseFolding.txt contains mappings with a status of 'I' that
* have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
* we must also hardcode the Unicode 3.2 mappings for the code points
* with 'I' mappings.
* Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
* Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
*/
/* return the simple case folding mapping for c */
@ -1467,11 +1506,26 @@ u_foldCase(UChar32 c, uint32_t options) {
pe=oldPE;
} else {
/* special case folding mappings, hardcoded */
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT && (uint32_t)(c-0x130)<=1) {
/* map dotted I and dotless i to U+0069 small i */
return 0x69;
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
/* default mappings */
if(c==0x49) {
/* 0049; C; 0069; # LATIN CAPITAL LETTER I */
return 0x69;
} else if(c==0x130) {
/* no simple default mapping for U+0130, use its lowercase mapping from UnicodeData.txt */
return 0x69;
}
} else {
/* Turkic mappings */
if(c==0x49) {
/* 0049; T; 0131; # LATIN CAPITAL LETTER I */
return 0x131;
} else if(c==0x130) {
/* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
return 0x69;
}
}
/* return c itself because it is excluded from case folding */
/* return c itself because there is no special mapping for it */
return c;
}
}
@ -1526,12 +1580,33 @@ u_internalFoldCase(UChar32 c,
return length;
} else {
/* special case folding mappings, hardcoded */
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT && (uint32_t)(c-0x130)<=1) {
/* map dotted I and dotless i to U+0069 small i */
result =0x69;
/* goto single; */
if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
/* default mappings */
if(c==0x49) {
/* 0049; C; 0069; # LATIN CAPITAL LETTER I */
result=0x69;
} else if(c==0x130) {
/* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
if(0<destCapacity) {
dest[0]=0x69;
}
if(1<destCapacity) {
dest[1]=0x307;
}
return 2;
}
} else {
/* Turkic mappings */
if(c==0x49) {
/* 0049; T; 0131; # LATIN CAPITAL LETTER I */
result=0x131;
} else if(c==0x130) {
/* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
result=0x69;
}
}
/* return c itself because it is excluded from case folding */
/* return c itself because there is no special mapping for it */
/* goto single; */
}
} else if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
i=EXC_LOWERCASE;

View file

@ -1746,9 +1746,25 @@ u_toupper(UChar32 c);
U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c);
/** Option value for case folding: use all mappings defined in CaseFolding.txt. @draft ICU 1.8 */
/**
* Option value for case folding: use default mappings defined in CaseFolding.txt.
* The Turkic-specific mappings for dotted I and dotless i are not applied.
* @stable
*/
#define U_FOLD_CASE_DEFAULT 0
/** Option value for case folding: exclude the mappings for dotted I and dotless i marked with 'I' in CaseFolding.txt. @draft ICU 1.8 */
/**
* Option value for case folding:
*
* Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
* and dotless i appropriately for Turkic languages (tr, az).
*
* Before Unicode 3.2, CaseFolding.txt contained mappings marked with 'I' that
* were to be included for default mappings and
* excluded for the Turkic-specific mappings.
*
* Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that
* are to be excluded for default mappings and
* included for the Turkic-specific mappings
* (U+0049 folds to U+0131, and U+0130 folds to U+0069).
*
* @stable
*/
#define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
/**

View file

@ -331,17 +331,19 @@ TestCaseFolding() {
simple[]={
/* input, default, exclude special i */
0x61, 0x61, 0x61,
0x49, 0x69, 0x69,
0x131, 0x69, 0x131,
0x49, 0x69, 0x131,
0x130, 0x69, 0x69,
0x131, 0x131, 0x131,
0xdf, 0xdf, 0xdf,
0xfb03, 0xfb03, 0xfb03,
0x1040e,0x10436,0x10436,
0x5ffff,0x5ffff,0x5ffff
};
static const UChar
mixed[]= { 0x61, 0x42, 0x131, 0x3d0, 0xdf, 0xfb03, 0xd93f, 0xdfff },
foldedDefault[]= { 0x61, 0x62, 0x69, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff },
foldedExcludeSpecialI[]={ 0x61, 0x62, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff };
mixed[]= { 0x61, 0x42, 0x130, 0x49, 0x131, 0x3d0, 0xdf, 0xfb03, 0xd93f, 0xdfff },
foldedDefault[]= { 0x61, 0x62, 0x69, 0x307, 0x69, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff },
foldedExcludeSpecialI[]={ 0x61, 0x62, 0x69, 0x131, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff };
UVersionInfo unicodeVersion={ 0, 0, 17, 89 }, unicode_3_1={ 3, 1, 0, 0 };
@ -544,9 +546,9 @@ TestCaseCompare() {
static const UChar
mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0xfb03, 0xd93f, 0xdfff, 0 },
otherDefault[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
otherDefault[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
different[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
different[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
UVersionInfo unicodeVersion={ 0, 0, 17, 89 }, unicode_3_1={ 3, 1, 0, 0 };

View file

@ -290,8 +290,9 @@ StringCaseTest::TestCaseConversion()
// test case folding
{
UnicodeString
s=UNICODE_STRING("A\\u00df\\u00b5\\ufb03\\U0001040c\\u0131", 35).unescape(),
f=UNICODE_STRING("ass\\u03bcffi\\U00010434i", 23).unescape(),
s=UnicodeString("A\\u00df\\u00b5\\ufb03\\U0001040c\\u0130\\u0131", "").unescape(),
f=UnicodeString("ass\\u03bcffi\\U00010434i\\u0307\\u0131", "").unescape(),
g=UnicodeString("ass\\u03bcffi\\U00010434i\\u0131", "").unescape(),
t;
(t=s).foldCase();
@ -300,10 +301,9 @@ StringCaseTest::TestCaseConversion()
}
// alternate handling for dotted I/dotless i (U+0130, U+0131)
f.setCharAt(f.length()-1, 0x131);
(t=s).foldCase(U_FOLD_CASE_EXCLUDE_SPECIAL_I);
if(f!=t) {
errln("error in foldCase(\"" + s + "\", U_FOLD_CASE_EXCLUDE_SPECIAL_I)=\"" + t + "\" but expected \"" + f + "\"");
if(g!=t) {
errln("error in foldCase(\"" + s + "\", U_FOLD_CASE_EXCLUDE_SPECIAL_I)=\"" + t + "\" but expected \"" + g + "\"");
}
}
}

View file

@ -299,10 +299,10 @@ UnicodeStringTest::TestCompare()
/* test caseCompare() */
{
static const UChar
_mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0xfb03, 0xd93f, 0xdfff, 0 },
_otherDefault[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
_otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
_different[]= { 0x41, 0x62, 0x69, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
_mixed[]= { 0x61, 0x42, 0x131, 0x3a3, 0xdf, 0x130, 0x49, 0xfb03, 0xd93f, 0xdfff, 0 },
_otherDefault[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x69, 0x307, 0x69, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
_otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x69, 0x131, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
_different[]= { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x130, 0x49, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
UnicodeString
mixed(TRUE, _mixed, -1),