Added Unicode normalization

@TODO make code smaller
This commit is contained in:
Alex Zolotarev 2011-05-28 23:28:01 +02:00 committed by Alex Zolotarev
parent 015250ae97
commit 7841b70b09
6 changed files with 4607 additions and 16 deletions

View file

@ -21,6 +21,7 @@ SOURCES += \
path_utils.cpp \
condition.cpp \
lower_case.cpp \
normalize_unicode.cpp \
HEADERS += \
SRC_FIRST.hpp \

View file

@ -252,3 +252,14 @@ UNIT_TEST(MakeUniString_Smoke)
char const s [] = "Hello!";
TEST_EQUAL(strings::UniString(&s[0], &s[0] + ARRAY_SIZE(s) - 1), strings::MakeUniString(s), ());
}
/// Feeds five code points through the in-place strings::Normalize and checks
/// that the string is rewritten into the expected eleven-element sequence.
UNIT_TEST(Normalize)
{
  // Expected output, grouped by the input element it is assumed to come from.
  // NOTE(review): the grouping follows the Unicode compatibility decompositions
  // of these code points; confirm against the generated normalization tables.
  strings::UniChar const expected[] = {
    0x30, 0x2c,                  // from 0x1f101 (DIGIT ZERO COMMA)
    'H',                         // plain ASCII, left untouched
    0x649,                       // from 0xfef0 (ALEF MAKSURA FINAL FORM)
    0x631, 0x6cc, 0x627, 0x644,  // from 0xfdfc (RIAL SIGN)
    0x31, 0x2044, 0x37           // from 0x2150 (VULGAR FRACTION ONE SEVENTH)
  };
  strings::UniChar const * const expectedBegin = &expected[0];
  strings::UniString result(expectedBegin, expectedBegin + ARRAY_SIZE(expected));

  strings::UniChar const input[] = { 0x1f101, 'H', 0xfef0, 0xfdfc, 0x2150 };
  strings::UniChar const * const inputBegin = &input[0];
  strings::UniString actual(inputBegin, inputBegin + ARRAY_SIZE(input));

  // A non-const argument selects the void Normalize(UniString &) overload.
  strings::Normalize(actual);
  TEST_EQUAL(actual, result, ());
}

View file

@ -22,22 +22,22 @@ static uint16_t const smallff[] = {0xff00,0xff01,0xff02,0xff03,0xff04,0xff05,0xf
/// @return 0 if char should be replaced with 2 or more chars
UniChar LowerUniChar(UniChar c)
{
switch (c & 0x00ffff00)
switch (c & 0xffffff00)
{
case 0x0000: return small00[static_cast<uint16_t>(c & 0x00ff)];
case 0x0100: return small01[static_cast<uint16_t>(c & 0x00ff)];
case 0x0200: return small02[static_cast<uint16_t>(c & 0x00ff)];
case 0x0300: return small03[static_cast<uint16_t>(c & 0x00ff)];
case 0x0400: return small04[static_cast<uint16_t>(c & 0x00ff)];
case 0x0500: return small05[static_cast<uint16_t>(c & 0x00ff)];
case 0x1000: return small10[static_cast<uint16_t>(c & 0x00ff)];
case 0x1e00: return small1e[static_cast<uint16_t>(c & 0x00ff)];
case 0x1f00: return small1f[static_cast<uint16_t>(c & 0x00ff)];
case 0x2100: return small21[static_cast<uint16_t>(c & 0x00ff)];
case 0x2400: return small24[static_cast<uint16_t>(c & 0x00ff)];
case 0x2c00: return small2c[static_cast<uint16_t>(c & 0x00ff)];
case 0xa600: return smalla6[static_cast<uint16_t>(c & 0x00ff)];
case 0xa700: return smalla7[static_cast<uint16_t>(c & 0x00ff)];
case 0x0000: return small00[static_cast<uint8_t>(c & 0x00ff)];
case 0x0100: return small01[static_cast<uint8_t>(c & 0x00ff)];
case 0x0200: return small02[static_cast<uint8_t>(c & 0x00ff)];
case 0x0300: return small03[static_cast<uint8_t>(c & 0x00ff)];
case 0x0400: return small04[static_cast<uint8_t>(c & 0x00ff)];
case 0x0500: return small05[static_cast<uint8_t>(c & 0x00ff)];
case 0x1000: return small10[static_cast<uint8_t>(c & 0x00ff)];
case 0x1e00: return small1e[static_cast<uint8_t>(c & 0x00ff)];
case 0x1f00: return small1f[static_cast<uint8_t>(c & 0x00ff)];
case 0x2100: return small21[static_cast<uint8_t>(c & 0x00ff)];
case 0x2400: return small24[static_cast<uint8_t>(c & 0x00ff)];
case 0x2c00: return small2c[static_cast<uint8_t>(c & 0x00ff)];
case 0xa600: return smalla6[static_cast<uint8_t>(c & 0x00ff)];
case 0xa700: return smalla7[static_cast<uint8_t>(c & 0x00ff)];
case 0xfb00:
{
if (c >= 0xfb00 && c <= 0xfb06)
@ -46,7 +46,7 @@ UniChar LowerUniChar(UniChar c)
return 0;
return c;
}
case 0xff00: return smallff[static_cast<uint16_t>(c & 0x00ff)];
case 0xff00: return smallff[static_cast<uint8_t>(c & 0x00ff)];
case 0x10400:
{
if (c >= 0x10400 && c <= 0x10427)

4569
base/normalize_unicode.cpp Normal file

File diff suppressed because one or more lines are too long

View file

@ -95,6 +95,13 @@ string MakeLowerCase(string const & s)
return result;
}
/// Non-mutating flavour of Normalize.
/// @param s string to normalize; it is left unchanged
/// @return a copy of s after the in-place Normalize has been applied to it
UniString Normalize(UniString const & s)
{
  UniString normalized = s;
  // normalized is a non-const lvalue, so this call binds to the
  // void Normalize(UniString &) overload rather than recursing into this one.
  Normalize(normalized);
  return normalized;
}
bool EqualNoCase(string const & s1, string const & s2)
{
return MakeLowerCase(s1) == MakeLowerCase(s2);

View file

@ -17,6 +17,9 @@ typedef buffer_vector<UniChar, 32> UniString;
/// Value-returning lower-casing overload: takes s by const reference and
/// returns a new UniString.
UniString MakeLowerCase(UniString const & s);
/// Lower-casing overload that takes s by non-const reference and returns nothing.
/// NOTE(review): presumably rewrites s in place - implementation not visible here.
void MakeLowerCase(UniString & s);
/// @return a copy of s to which the in-place Normalize below has been applied;
/// s itself is not modified.
UniString Normalize(UniString const & s);
/// Normalizes s in place; the result may have a different length than the input
/// (the Normalize unit test turns 5 code points into 11).
/// NOTE(review): the implementation lives in normalize_unicode.cpp, which is not
/// visible here; the unit test data looks like Unicode compatibility
/// decomposition - confirm the exact normalization form.
void Normalize(UniString & s);
/// Lower-casing overload for string that takes s by non-const reference and
/// returns nothing.
/// NOTE(review): presumably rewrites s in place - implementation not visible here.
void MakeLowerCase(string & s);
/// Value-returning lower-casing overload for string.
string MakeLowerCase(string const & s);
/// @return true when MakeLowerCase(s1) == MakeLowerCase(s2)
bool EqualNoCase(string const & s1, string const & s2);