forked from organicmaps/organicmaps
Compare commits
1 commit
master
...
ab-from-ch
Author | SHA1 | Date | |
---|---|---|---|
|
a738fa9f9c |
2 changed files with 178 additions and 174 deletions
|
@ -29,69 +29,73 @@ UNIT_TEST(LowerUniChar)
|
|||
|
||||
static char constexpr kFile[] = "./data/CaseFolding.test";
|
||||
std::ifstream file(kFile);
|
||||
TEST(file.is_open(), (kFile));
|
||||
TEST(file, (kFile));
|
||||
|
||||
size_t fCount = 0, cCount = 0;
|
||||
std::unordered_map<strings::UniChar, strings::UniString> m;
|
||||
std::string line;
|
||||
while (file.good())
|
||||
while (file)
|
||||
{
|
||||
std::getline(file, line);
|
||||
|
||||
// strip comments
|
||||
size_t const sharp = line.find('#');
|
||||
if (sharp != std::string::npos)
|
||||
if (auto const sharp = line.find('#'); sharp != std::string::npos)
|
||||
line.erase(sharp);
|
||||
strings::SimpleTokenizer semicolon(line, ";");
|
||||
if (!semicolon)
|
||||
continue;
|
||||
|
||||
std::istringstream stream{std::string{*semicolon}};
|
||||
uint32_t uc;
|
||||
stream >> std::hex >> uc;
|
||||
ASSERT(stream, ("Overflow"));
|
||||
uint32_t uniChar;
|
||||
{
|
||||
std::string_view const hex = *semicolon;
|
||||
auto const [_, ec] = std::from_chars(hex.begin(), hex.end(), uniChar, 16);
|
||||
TEST(ec == std::errc{}, ());
|
||||
}
|
||||
++semicolon;
|
||||
|
||||
auto const type = *semicolon;
|
||||
std::string_view const type = *semicolon;
|
||||
if (type == " S" || type == " T")
|
||||
continue;
|
||||
if (type != " C" && type != " F")
|
||||
continue;
|
||||
++semicolon;
|
||||
|
||||
strings::UniString us;
|
||||
strings::UniString lowerCaseChars;
|
||||
strings::SimpleTokenizer spacer(*semicolon, " ");
|
||||
while (spacer)
|
||||
{
|
||||
stream.clear();
|
||||
stream.str(std::string(*spacer));
|
||||
uint32_t smallCode;
|
||||
stream >> std::hex >> smallCode;
|
||||
us.push_back(smallCode);
|
||||
std::string_view const lowerCaseCharsSv = *spacer;
|
||||
uint32_t lowerUniChar;
|
||||
auto const [_, ec] = std::from_chars(lowerCaseCharsSv.begin(), lowerCaseCharsSv.end(), lowerUniChar, 16);
|
||||
TEST(ec == std::errc{}, ());
|
||||
lowerCaseChars.push_back(lowerUniChar);
|
||||
++spacer;
|
||||
}
|
||||
|
||||
switch (us.size())
|
||||
switch (lowerCaseChars.size())
|
||||
{
|
||||
case 0: continue;
|
||||
case 0:
|
||||
TEST(false, ("No valid lower chars in line:", line));
|
||||
break;
|
||||
case 1:
|
||||
{
|
||||
m[uc] = us;
|
||||
m[uniChar] = lowerCaseChars;
|
||||
++cCount;
|
||||
TEST_EQUAL(strings::LowerUniChar(uc), us[0], ());
|
||||
TEST_EQUAL(strings::LowerUniChar(uniChar), lowerCaseChars[0], ());
|
||||
TEST_EQUAL(type, " C", ());
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
m[uc] = us;
|
||||
m[uniChar] = lowerCaseChars;
|
||||
++fCount;
|
||||
TEST_EQUAL(type, " F", ());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG(LINFO, ("Loaded", cCount, "common foldings and", fCount, "full foldings"));
|
||||
TEST_EQUAL(1022, cCount, ("Update the count after updating the test cases file"));
|
||||
TEST_EQUAL(104, fCount, ("Update the count after updating the test cases file"));
|
||||
|
||||
// full range unicode table test
|
||||
for (strings::UniChar c = 0; c < 0x11000; ++c)
|
||||
|
|
|
@ -1,153 +1,153 @@
|
|||
Basic_Latin 0x0000 0x007F
|
||||
Latin-1_Supplement 0x0080 0x00FF
|
||||
Latin_Extended-A 0x0100 0x017F
|
||||
Latin_Extended-B 0x0180 0x024F
|
||||
IPA_Extensions 0x0250 0x02AF
|
||||
Spacing_Modifier_Letters 0x02B0 0x02FF
|
||||
Combining_Diacritical_Marks 0x0300 0x036F
|
||||
Greek_and_Coptic 0x0370 0x03FF
|
||||
Cyrillic 0x0400 0x04FF
|
||||
Cyrillic_Supplement 0x0500 0x052F
|
||||
Armenian 0x0530 0x058F
|
||||
Hebrew 0x0590 0x05FF
|
||||
Arabic 0x0600 0x06FF
|
||||
Syriac 0x0700 0x074F
|
||||
Arabic_Supplement 0x0750 0x077F
|
||||
Thaana 0x0780 0x07BF
|
||||
NKo 0x07C0 0x07FF
|
||||
Samaritan 0x0800 0x083F
|
||||
Mandaic 0x0840 0x085F
|
||||
Arabic_Extended-A 0x08A0 0x08FF
|
||||
Devanagari 0x0900 0x097F
|
||||
Bengali 0x0980 0x09FF
|
||||
Gurmukhi 0x0A00 0x0A7F
|
||||
Gujarati 0x0A80 0x0AFF
|
||||
Oriya 0x0B00 0x0B7F
|
||||
Tamil 0x0B80 0x0BFF
|
||||
Telugu 0x0C00 0x0C7F
|
||||
Kannada 0x0C80 0x0CFF
|
||||
Malayalam 0x0D00 0x0D7F
|
||||
Sinhala 0x0D80 0x0DFF
|
||||
Thai 0x0E00 0x0E7F
|
||||
Lao 0x0E80 0x0EFF
|
||||
Tibetan 0x0F00 0x0FFF
|
||||
Myanmar 0x1000 0x109F
|
||||
Georgian 0x10A0 0x10FF
|
||||
Hangul_Jamo 0x1100 0x11FF
|
||||
Ethiopic 0x1200 0x137F
|
||||
Ethiopic_Supplement 0x1380 0x139F
|
||||
Cherokee 0x13A0 0x13FF
|
||||
Unified_Canadian_Aboriginal_Syllabics 0x1400 0x167F
|
||||
Ogham 0x1680 0x169F
|
||||
Runic 0x16A0 0x16FF
|
||||
Tagalog 0x1700 0x171F
|
||||
Hanunoo 0x1720 0x173F
|
||||
Buhid 0x1740 0x175F
|
||||
Tagbanwa 0x1760 0x177F
|
||||
Khmer 0x1780 0x17FF
|
||||
Mongolian 0x1800 0x18AF
|
||||
Unified_Canadian_Aboriginal_Syllabics_Extended 0x18B0 0x18FF
|
||||
Limbu 0x1900 0x194F
|
||||
Tai_Le 0x1950 0x197F
|
||||
New_Tai_Lue 0x1980 0x19DF
|
||||
Khmer_Symbols 0x19E0 0x19FF
|
||||
Buginese 0x1A00 0x1A1F
|
||||
Tai_Tham 0x1A20 0x1AAF
|
||||
Balinese 0x1B00 0x1B7F
|
||||
Sundanese 0x1B80 0x1BBF
|
||||
Batak 0x1BC0 0x1BFF
|
||||
Lepcha 0x1C00 0x1C4F
|
||||
Ol_Chiki 0x1C50 0x1C7F
|
||||
Vedic_Extensions 0x1CD0 0x1CFF
|
||||
Phonetic_Extensions 0x1D00 0x1D7F
|
||||
Phonetic_Extensions_Supplement 0x1D80 0x1DBF
|
||||
Combining_Diacritical_Marks_Supplement 0x1DC0 0x1DFF
|
||||
Latin_Extended_Additional 0x1E00 0x1EFF
|
||||
Greek_Extended 0x1F00 0x1FFF
|
||||
General_Punctuation 0x2000 0x206F
|
||||
Superscripts_and_Subscripts 0x2070 0x209F
|
||||
Currency_Symbols 0x20A0 0x20CF
|
||||
Combining_Diacritical_Marks_for_Symbols 0x20D0 0x20FF
|
||||
Letterlike_Symbols 0x2100 0x214F
|
||||
Number_Forms 0x2150 0x218F
|
||||
Arrows 0x2190 0x21FF
|
||||
Mathematical_Operators 0x2200 0x22FF
|
||||
Miscellaneous_Technical 0x2300 0x23FF
|
||||
Control_Pictures 0x2400 0x243F
|
||||
Optical_Character_Recognition 0x2440 0x245F
|
||||
Enclosed_Alphanumerics 0x2460 0x24FF
|
||||
Box_Drawing 0x2500 0x257F
|
||||
Block_Elements 0x2580 0x259F
|
||||
Geometric_Shapes 0x25A0 0x25FF
|
||||
Miscellaneous_Symbols 0x2600 0x26FF
|
||||
Dingbats 0x2700 0x27BF
|
||||
Miscellaneous_Mathematical_Symbols-A 0x27C0 0x27EF
|
||||
Supplemental_Arrows-A 0x27F0 0x27FF
|
||||
Braille_Patterns 0x2800 0x28FF
|
||||
Supplemental_Arrows-B 0x2900 0x297F
|
||||
Miscellaneous_Mathematical_Symbols-B 0x2980 0x29FF
|
||||
Supplemental_Mathematical_Operators 0x2A00 0x2AFF
|
||||
Miscellaneous_Symbols_and_Arrows 0x2B00 0x2BFF
|
||||
Glagolitic 0x2C00 0x2C5F
|
||||
Latin_Extended-C 0x2C60 0x2C7F
|
||||
Coptic 0x2C80 0x2CFF
|
||||
Georgian_Supplement 0x2D00 0x2D2F
|
||||
Tifinagh 0x2D30 0x2D7F
|
||||
Ethiopic_Extended 0x2D80 0x2DDF
|
||||
Cyrillic_Extended-A 0x2DE0 0x2DFF
|
||||
Supplemental_Punctuation 0x2E00 0x2E7F
|
||||
CJK_Radicals_Supplement 0x2E80 0x2EFF
|
||||
Kangxi_Radicals 0x2F00 0x2FDF
|
||||
Ideographic_Description_Characters 0x2FF0 0x2FFF
|
||||
CJK_Symbols_and_Punctuation 0x3000 0x303F
|
||||
Hiragana 0x3040 0x309F
|
||||
Katakana 0x30A0 0x30FF
|
||||
Bopomofo 0x3100 0x312F
|
||||
Hangul_Compatibility_Jamo 0x3130 0x318F
|
||||
Kanbun 0x3190 0x319F
|
||||
Bopomofo_Extended 0x31A0 0x31BF
|
||||
CJK_Strokes 0x31C0 0x31EF
|
||||
Katakana_Phonetic_Extensions 0x31F0 0x31FF
|
||||
Enclosed_CJK_Letters_and_Months 0x3200 0x32FF
|
||||
CJK_Compatibility 0x3300 0x33FF
|
||||
CJK_Unified_Ideographs_Extension_A 0x3400 0x4DBF
|
||||
Yijing_Hexagram_Symbols 0x4DC0 0x4DFF
|
||||
CJK_Unified_Ideographs 0x4E00 0x9FFF
|
||||
Yi_Syllables 0xA000 0xA48F
|
||||
Yi_Radicals 0xA490 0xA4CF
|
||||
Lisu 0xA4D0 0xA4FF
|
||||
Vai 0xA500 0xA63F
|
||||
Cyrillic_Extended-B 0xA640 0xA69F
|
||||
Bamum 0xA6A0 0xA6FF
|
||||
Modifier_Tone_Letters 0xA700 0xA71F
|
||||
Latin_Extended-D 0xA720 0xA7FF
|
||||
Syloti_Nagri 0xA800 0xA82F
|
||||
Common_Indic_Number_Forms 0xA830 0xA83F
|
||||
Phags-pa 0xA840 0xA87F
|
||||
Saurashtra 0xA880 0xA8DF
|
||||
Devanagari_Extended 0xA8E0 0xA8FF
|
||||
Kayah_Li 0xA900 0xA92F
|
||||
Rejang 0xA930 0xA95F
|
||||
Hangul_Jamo_Extended-A 0xA960 0xA97F
|
||||
Javanese 0xA980 0xA9DF
|
||||
Cham 0xAA00 0xAA5F
|
||||
Myanmar_Extended-A 0xAA60 0xAA7F
|
||||
Tai_Viet 0xAA80 0xAADF
|
||||
Ethiopic_Extended-A 0xAB00 0xAB2F
|
||||
Meetei_Mayek 0xABC0 0xABFF
|
||||
Hangul_Syllables 0xAC00 0xD7AF
|
||||
Hangul_Jamo_Extended-B 0xD7B0 0xD7FF
|
||||
High_Surrogates 0xD800 0xDB7F
|
||||
High_Private_Use_Surrogates 0xDB80 0xDBFF
|
||||
Low_Surrogates 0xDC00 0xDFFF
|
||||
Private_Use_Area 0xE000 0xF8FF
|
||||
CJK_Compatibility_Ideographs 0xF900 0xFAFF
|
||||
Alphabetic_Presentation_Forms 0xFB00 0xFB4F
|
||||
Arabic_Presentation_Forms-A 0xFB50 0xFDFF
|
||||
Variation_Selectors 0xFE00 0xFE0F
|
||||
Vertical_Forms 0xFE10 0xFE1F
|
||||
Combining_Half_Marks 0xFE20 0xFE2F
|
||||
CJK_Compatibility_Forms 0xFE30 0xFE4F
|
||||
Small_Form_Variants 0xFE50 0xFE6F
|
||||
Arabic_Presentation_Forms-B 0xFE70 0xFEFF
|
||||
Halfwidth_and_Fullwidth_Forms 0xFF00 0xFFEF
|
||||
Basic_Latin 0000 007F
|
||||
Latin-1_Supplement 0080 00FF
|
||||
Latin_Extended-A 0100 017F
|
||||
Latin_Extended-B 0180 024F
|
||||
IPA_Extensions 0250 02AF
|
||||
Spacing_Modifier_Letters 02B0 02FF
|
||||
Combining_Diacritical_Marks 0300 036F
|
||||
Greek_and_Coptic 0370 03FF
|
||||
Cyrillic 0400 04FF
|
||||
Cyrillic_Supplement 0500 052F
|
||||
Armenian 0530 058F
|
||||
Hebrew 0590 05FF
|
||||
Arabic 0600 06FF
|
||||
Syriac 0700 074F
|
||||
Arabic_Supplement 0750 077F
|
||||
Thaana 0780 07BF
|
||||
NKo 07C0 07FF
|
||||
Samaritan 0800 083F
|
||||
Mandaic 0840 085F
|
||||
Arabic_Extended-A 08A0 08FF
|
||||
Devanagari 0900 097F
|
||||
Bengali 0980 09FF
|
||||
Gurmukhi 0A00 0A7F
|
||||
Gujarati 0A80 0AFF
|
||||
Oriya 0B00 0B7F
|
||||
Tamil 0B80 0BFF
|
||||
Telugu 0C00 0C7F
|
||||
Kannada 0C80 0CFF
|
||||
Malayalam 0D00 0D7F
|
||||
Sinhala 0D80 0DFF
|
||||
Thai 0E00 0E7F
|
||||
Lao 0E80 0EFF
|
||||
Tibetan 0F00 0FFF
|
||||
Myanmar 1000 109F
|
||||
Georgian 10A0 10FF
|
||||
Hangul_Jamo 1100 11FF
|
||||
Ethiopic 1200 137F
|
||||
Ethiopic_Supplement 1380 139F
|
||||
Cherokee 13A0 13FF
|
||||
Unified_Canadian_Aboriginal_Syllabics 1400 167F
|
||||
Ogham 1680 169F
|
||||
Runic 16A0 16FF
|
||||
Tagalog 1700 171F
|
||||
Hanunoo 1720 173F
|
||||
Buhid 1740 175F
|
||||
Tagbanwa 1760 177F
|
||||
Khmer 1780 17FF
|
||||
Mongolian 1800 18AF
|
||||
Unified_Canadian_Aboriginal_Syllabics_Extended 18B0 18FF
|
||||
Limbu 1900 194F
|
||||
Tai_Le 1950 197F
|
||||
New_Tai_Lue 1980 19DF
|
||||
Khmer_Symbols 19E0 19FF
|
||||
Buginese 1A00 1A1F
|
||||
Tai_Tham 1A20 1AAF
|
||||
Balinese 1B00 1B7F
|
||||
Sundanese 1B80 1BBF
|
||||
Batak 1BC0 1BFF
|
||||
Lepcha 1C00 1C4F
|
||||
Ol_Chiki 1C50 1C7F
|
||||
Vedic_Extensions 1CD0 1CFF
|
||||
Phonetic_Extensions 1D00 1D7F
|
||||
Phonetic_Extensions_Supplement 1D80 1DBF
|
||||
Combining_Diacritical_Marks_Supplement 1DC0 1DFF
|
||||
Latin_Extended_Additional 1E00 1EFF
|
||||
Greek_Extended 1F00 1FFF
|
||||
General_Punctuation 2000 206F
|
||||
Superscripts_and_Subscripts 2070 209F
|
||||
Currency_Symbols 20A0 20CF
|
||||
Combining_Diacritical_Marks_for_Symbols 20D0 20FF
|
||||
Letterlike_Symbols 2100 214F
|
||||
Number_Forms 2150 218F
|
||||
Arrows 2190 21FF
|
||||
Mathematical_Operators 2200 22FF
|
||||
Miscellaneous_Technical 2300 23FF
|
||||
Control_Pictures 2400 243F
|
||||
Optical_Character_Recognition 2440 245F
|
||||
Enclosed_Alphanumerics 2460 24FF
|
||||
Box_Drawing 2500 257F
|
||||
Block_Elements 2580 259F
|
||||
Geometric_Shapes 25A0 25FF
|
||||
Miscellaneous_Symbols 2600 26FF
|
||||
Dingbats 2700 27BF
|
||||
Miscellaneous_Mathematical_Symbols-A 27C0 27EF
|
||||
Supplemental_Arrows-A 27F0 27FF
|
||||
Braille_Patterns 2800 28FF
|
||||
Supplemental_Arrows-B 2900 297F
|
||||
Miscellaneous_Mathematical_Symbols-B 2980 29FF
|
||||
Supplemental_Mathematical_Operators 2A00 2AFF
|
||||
Miscellaneous_Symbols_and_Arrows 2B00 2BFF
|
||||
Glagolitic 2C00 2C5F
|
||||
Latin_Extended-C 2C60 2C7F
|
||||
Coptic 2C80 2CFF
|
||||
Georgian_Supplement 2D00 2D2F
|
||||
Tifinagh 2D30 2D7F
|
||||
Ethiopic_Extended 2D80 2DDF
|
||||
Cyrillic_Extended-A 2DE0 2DFF
|
||||
Supplemental_Punctuation 2E00 2E7F
|
||||
CJK_Radicals_Supplement 2E80 2EFF
|
||||
Kangxi_Radicals 2F00 2FDF
|
||||
Ideographic_Description_Characters 2FF0 2FFF
|
||||
CJK_Symbols_and_Punctuation 3000 303F
|
||||
Hiragana 3040 309F
|
||||
Katakana 30A0 30FF
|
||||
Bopomofo 3100 312F
|
||||
Hangul_Compatibility_Jamo 3130 318F
|
||||
Kanbun 3190 319F
|
||||
Bopomofo_Extended 31A0 31BF
|
||||
CJK_Strokes 31C0 31EF
|
||||
Katakana_Phonetic_Extensions 31F0 31FF
|
||||
Enclosed_CJK_Letters_and_Months 3200 32FF
|
||||
CJK_Compatibility 3300 33FF
|
||||
CJK_Unified_Ideographs_Extension_A 3400 4DBF
|
||||
Yijing_Hexagram_Symbols 4DC0 4DFF
|
||||
CJK_Unified_Ideographs 4E00 9FFF
|
||||
Yi_Syllables A000 A48F
|
||||
Yi_Radicals A490 A4CF
|
||||
Lisu A4D0 A4FF
|
||||
Vai A500 A63F
|
||||
Cyrillic_Extended-B A640 A69F
|
||||
Bamum A6A0 A6FF
|
||||
Modifier_Tone_Letters A700 A71F
|
||||
Latin_Extended-D A720 A7FF
|
||||
Syloti_Nagri A800 A82F
|
||||
Common_Indic_Number_Forms A830 A83F
|
||||
Phags-pa A840 A87F
|
||||
Saurashtra A880 A8DF
|
||||
Devanagari_Extended A8E0 A8FF
|
||||
Kayah_Li A900 A92F
|
||||
Rejang A930 A95F
|
||||
Hangul_Jamo_Extended-A A960 A97F
|
||||
Javanese A980 A9DF
|
||||
Cham AA00 AA5F
|
||||
Myanmar_Extended-A AA60 AA7F
|
||||
Tai_Viet AA80 AADF
|
||||
Ethiopic_Extended-A AB00 AB2F
|
||||
Meetei_Mayek ABC0 ABFF
|
||||
Hangul_Syllables AC00 D7AF
|
||||
Hangul_Jamo_Extended-B D7B0 D7FF
|
||||
High_Surrogates D800 DB7F
|
||||
High_Private_Use_Surrogates DB80 DBFF
|
||||
Low_Surrogates DC00 DFFF
|
||||
Private_Use_Area E000 F8FF
|
||||
CJK_Compatibility_Ideographs F900 FAFF
|
||||
Alphabetic_Presentation_Forms FB00 FB4F
|
||||
Arabic_Presentation_Forms-A FB50 FDFF
|
||||
Variation_Selectors FE00 FE0F
|
||||
Vertical_Forms FE10 FE1F
|
||||
Combining_Half_Marks FE20 FE2F
|
||||
CJK_Compatibility_Forms FE30 FE4F
|
||||
Small_Form_Variants FE50 FE6F
|
||||
Arabic_Presentation_Forms-B FE70 FEFF
|
||||
Halfwidth_and_Fullwidth_Forms FF00 FFEF
|
||||
|
|
Loading…
Add table
Reference in a new issue