Compare commits

...
Sign in to create a new pull request.

1 commit

Author SHA1 Message Date
Alexander Borsuk
a738fa9f9c Minor test improvements, use from_chars
Signed-off-by: Alexander Borsuk <me@alex.bio>
2024-02-26 00:20:51 +02:00
2 changed files with 178 additions and 174 deletions

View file

@ -29,69 +29,73 @@ UNIT_TEST(LowerUniChar)
static char constexpr kFile[] = "./data/CaseFolding.test";
std::ifstream file(kFile);
TEST(file.is_open(), (kFile));
TEST(file, (kFile));
size_t fCount = 0, cCount = 0;
std::unordered_map<strings::UniChar, strings::UniString> m;
std::string line;
while (file.good())
while (file)
{
std::getline(file, line);
// strip comments
size_t const sharp = line.find('#');
if (sharp != std::string::npos)
if (auto const sharp = line.find('#'); sharp != std::string::npos)
line.erase(sharp);
strings::SimpleTokenizer semicolon(line, ";");
if (!semicolon)
continue;
std::istringstream stream{std::string{*semicolon}};
uint32_t uc;
stream >> std::hex >> uc;
ASSERT(stream, ("Overflow"));
uint32_t uniChar;
{
std::string_view const hex = *semicolon;
auto const [_, ec] = std::from_chars(hex.begin(), hex.end(), uniChar, 16);
TEST(ec == std::errc{}, ());
}
++semicolon;
auto const type = *semicolon;
std::string_view const type = *semicolon;
if (type == " S" || type == " T")
continue;
if (type != " C" && type != " F")
continue;
++semicolon;
strings::UniString us;
strings::UniString lowerCaseChars;
strings::SimpleTokenizer spacer(*semicolon, " ");
while (spacer)
{
stream.clear();
stream.str(std::string(*spacer));
uint32_t smallCode;
stream >> std::hex >> smallCode;
us.push_back(smallCode);
std::string_view const lowerCaseCharsSv = *spacer;
uint32_t lowerUniChar;
auto const [_, ec] = std::from_chars(lowerCaseCharsSv.begin(), lowerCaseCharsSv.end(), lowerUniChar, 16);
TEST(ec == std::errc{}, ());
lowerCaseChars.push_back(lowerUniChar);
++spacer;
}
switch (us.size())
switch (lowerCaseChars.size())
{
case 0: continue;
case 0:
TEST(false, ("No valid lower chars in line:", line));
break;
case 1:
{
m[uc] = us;
m[uniChar] = lowerCaseChars;
++cCount;
TEST_EQUAL(strings::LowerUniChar(uc), us[0], ());
TEST_EQUAL(strings::LowerUniChar(uniChar), lowerCaseChars[0], ());
TEST_EQUAL(type, " C", ());
break;
}
default:
{
m[uc] = us;
m[uniChar] = lowerCaseChars;
++fCount;
TEST_EQUAL(type, " F", ());
break;
}
}
}
LOG(LINFO, ("Loaded", cCount, "common foldings and", fCount, "full foldings"));
TEST_EQUAL(1022, cCount, ("Update the count after updating the test cases file"));
TEST_EQUAL(104, fCount, ("Update the count after updating the test cases file"));
// full range unicode table test
for (strings::UniChar c = 0; c < 0x11000; ++c)

View file

@ -1,153 +1,153 @@
Basic_Latin 0x0000 0x007F
Latin-1_Supplement 0x0080 0x00FF
Latin_Extended-A 0x0100 0x017F
Latin_Extended-B 0x0180 0x024F
IPA_Extensions 0x0250 0x02AF
Spacing_Modifier_Letters 0x02B0 0x02FF
Combining_Diacritical_Marks 0x0300 0x036F
Greek_and_Coptic 0x0370 0x03FF
Cyrillic 0x0400 0x04FF
Cyrillic_Supplement 0x0500 0x052F
Armenian 0x0530 0x058F
Hebrew 0x0590 0x05FF
Arabic 0x0600 0x06FF
Syriac 0x0700 0x074F
Arabic_Supplement 0x0750 0x077F
Thaana 0x0780 0x07BF
NKo 0x07C0 0x07FF
Samaritan 0x0800 0x083F
Mandaic 0x0840 0x085F
Arabic_Extended-A 0x08A0 0x08FF
Devanagari 0x0900 0x097F
Bengali 0x0980 0x09FF
Gurmukhi 0x0A00 0x0A7F
Gujarati 0x0A80 0x0AFF
Oriya 0x0B00 0x0B7F
Tamil 0x0B80 0x0BFF
Telugu 0x0C00 0x0C7F
Kannada 0x0C80 0x0CFF
Malayalam 0x0D00 0x0D7F
Sinhala 0x0D80 0x0DFF
Thai 0x0E00 0x0E7F
Lao 0x0E80 0x0EFF
Tibetan 0x0F00 0x0FFF
Myanmar 0x1000 0x109F
Georgian 0x10A0 0x10FF
Hangul_Jamo 0x1100 0x11FF
Ethiopic 0x1200 0x137F
Ethiopic_Supplement 0x1380 0x139F
Cherokee 0x13A0 0x13FF
Unified_Canadian_Aboriginal_Syllabics 0x1400 0x167F
Ogham 0x1680 0x169F
Runic 0x16A0 0x16FF
Tagalog 0x1700 0x171F
Hanunoo 0x1720 0x173F
Buhid 0x1740 0x175F
Tagbanwa 0x1760 0x177F
Khmer 0x1780 0x17FF
Mongolian 0x1800 0x18AF
Unified_Canadian_Aboriginal_Syllabics_Extended 0x18B0 0x18FF
Limbu 0x1900 0x194F
Tai_Le 0x1950 0x197F
New_Tai_Lue 0x1980 0x19DF
Khmer_Symbols 0x19E0 0x19FF
Buginese 0x1A00 0x1A1F
Tai_Tham 0x1A20 0x1AAF
Balinese 0x1B00 0x1B7F
Sundanese 0x1B80 0x1BBF
Batak 0x1BC0 0x1BFF
Lepcha 0x1C00 0x1C4F
Ol_Chiki 0x1C50 0x1C7F
Vedic_Extensions 0x1CD0 0x1CFF
Phonetic_Extensions 0x1D00 0x1D7F
Phonetic_Extensions_Supplement 0x1D80 0x1DBF
Combining_Diacritical_Marks_Supplement 0x1DC0 0x1DFF
Latin_Extended_Additional 0x1E00 0x1EFF
Greek_Extended 0x1F00 0x1FFF
General_Punctuation 0x2000 0x206F
Superscripts_and_Subscripts 0x2070 0x209F
Currency_Symbols 0x20A0 0x20CF
Combining_Diacritical_Marks_for_Symbols 0x20D0 0x20FF
Letterlike_Symbols 0x2100 0x214F
Number_Forms 0x2150 0x218F
Arrows 0x2190 0x21FF
Mathematical_Operators 0x2200 0x22FF
Miscellaneous_Technical 0x2300 0x23FF
Control_Pictures 0x2400 0x243F
Optical_Character_Recognition 0x2440 0x245F
Enclosed_Alphanumerics 0x2460 0x24FF
Box_Drawing 0x2500 0x257F
Block_Elements 0x2580 0x259F
Geometric_Shapes 0x25A0 0x25FF
Miscellaneous_Symbols 0x2600 0x26FF
Dingbats 0x2700 0x27BF
Miscellaneous_Mathematical_Symbols-A 0x27C0 0x27EF
Supplemental_Arrows-A 0x27F0 0x27FF
Braille_Patterns 0x2800 0x28FF
Supplemental_Arrows-B 0x2900 0x297F
Miscellaneous_Mathematical_Symbols-B 0x2980 0x29FF
Supplemental_Mathematical_Operators 0x2A00 0x2AFF
Miscellaneous_Symbols_and_Arrows 0x2B00 0x2BFF
Glagolitic 0x2C00 0x2C5F
Latin_Extended-C 0x2C60 0x2C7F
Coptic 0x2C80 0x2CFF
Georgian_Supplement 0x2D00 0x2D2F
Tifinagh 0x2D30 0x2D7F
Ethiopic_Extended 0x2D80 0x2DDF
Cyrillic_Extended-A 0x2DE0 0x2DFF
Supplemental_Punctuation 0x2E00 0x2E7F
CJK_Radicals_Supplement 0x2E80 0x2EFF
Kangxi_Radicals 0x2F00 0x2FDF
Ideographic_Description_Characters 0x2FF0 0x2FFF
CJK_Symbols_and_Punctuation 0x3000 0x303F
Hiragana 0x3040 0x309F
Katakana 0x30A0 0x30FF
Bopomofo 0x3100 0x312F
Hangul_Compatibility_Jamo 0x3130 0x318F
Kanbun 0x3190 0x319F
Bopomofo_Extended 0x31A0 0x31BF
CJK_Strokes 0x31C0 0x31EF
Katakana_Phonetic_Extensions 0x31F0 0x31FF
Enclosed_CJK_Letters_and_Months 0x3200 0x32FF
CJK_Compatibility 0x3300 0x33FF
CJK_Unified_Ideographs_Extension_A 0x3400 0x4DBF
Yijing_Hexagram_Symbols 0x4DC0 0x4DFF
CJK_Unified_Ideographs 0x4E00 0x9FFF
Yi_Syllables 0xA000 0xA48F
Yi_Radicals 0xA490 0xA4CF
Lisu 0xA4D0 0xA4FF
Vai 0xA500 0xA63F
Cyrillic_Extended-B 0xA640 0xA69F
Bamum 0xA6A0 0xA6FF
Modifier_Tone_Letters 0xA700 0xA71F
Latin_Extended-D 0xA720 0xA7FF
Syloti_Nagri 0xA800 0xA82F
Common_Indic_Number_Forms 0xA830 0xA83F
Phags-pa 0xA840 0xA87F
Saurashtra 0xA880 0xA8DF
Devanagari_Extended 0xA8E0 0xA8FF
Kayah_Li 0xA900 0xA92F
Rejang 0xA930 0xA95F
Hangul_Jamo_Extended-A 0xA960 0xA97F
Javanese 0xA980 0xA9DF
Cham 0xAA00 0xAA5F
Myanmar_Extended-A 0xAA60 0xAA7F
Tai_Viet 0xAA80 0xAADF
Ethiopic_Extended-A 0xAB00 0xAB2F
Meetei_Mayek 0xABC0 0xABFF
Hangul_Syllables 0xAC00 0xD7AF
Hangul_Jamo_Extended-B 0xD7B0 0xD7FF
High_Surrogates 0xD800 0xDB7F
High_Private_Use_Surrogates 0xDB80 0xDBFF
Low_Surrogates 0xDC00 0xDFFF
Private_Use_Area 0xE000 0xF8FF
CJK_Compatibility_Ideographs 0xF900 0xFAFF
Alphabetic_Presentation_Forms 0xFB00 0xFB4F
Arabic_Presentation_Forms-A 0xFB50 0xFDFF
Variation_Selectors 0xFE00 0xFE0F
Vertical_Forms 0xFE10 0xFE1F
Combining_Half_Marks 0xFE20 0xFE2F
CJK_Compatibility_Forms 0xFE30 0xFE4F
Small_Form_Variants 0xFE50 0xFE6F
Arabic_Presentation_Forms-B 0xFE70 0xFEFF
Halfwidth_and_Fullwidth_Forms 0xFF00 0xFFEF
Basic_Latin 0000 007F
Latin-1_Supplement 0080 00FF
Latin_Extended-A 0100 017F
Latin_Extended-B 0180 024F
IPA_Extensions 0250 02AF
Spacing_Modifier_Letters 02B0 02FF
Combining_Diacritical_Marks 0300 036F
Greek_and_Coptic 0370 03FF
Cyrillic 0400 04FF
Cyrillic_Supplement 0500 052F
Armenian 0530 058F
Hebrew 0590 05FF
Arabic 0600 06FF
Syriac 0700 074F
Arabic_Supplement 0750 077F
Thaana 0780 07BF
NKo 07C0 07FF
Samaritan 0800 083F
Mandaic 0840 085F
Arabic_Extended-A 08A0 08FF
Devanagari 0900 097F
Bengali 0980 09FF
Gurmukhi 0A00 0A7F
Gujarati 0A80 0AFF
Oriya 0B00 0B7F
Tamil 0B80 0BFF
Telugu 0C00 0C7F
Kannada 0C80 0CFF
Malayalam 0D00 0D7F
Sinhala 0D80 0DFF
Thai 0E00 0E7F
Lao 0E80 0EFF
Tibetan 0F00 0FFF
Myanmar 1000 109F
Georgian 10A0 10FF
Hangul_Jamo 1100 11FF
Ethiopic 1200 137F
Ethiopic_Supplement 1380 139F
Cherokee 13A0 13FF
Unified_Canadian_Aboriginal_Syllabics 1400 167F
Ogham 1680 169F
Runic 16A0 16FF
Tagalog 1700 171F
Hanunoo 1720 173F
Buhid 1740 175F
Tagbanwa 1760 177F
Khmer 1780 17FF
Mongolian 1800 18AF
Unified_Canadian_Aboriginal_Syllabics_Extended 18B0 18FF
Limbu 1900 194F
Tai_Le 1950 197F
New_Tai_Lue 1980 19DF
Khmer_Symbols 19E0 19FF
Buginese 1A00 1A1F
Tai_Tham 1A20 1AAF
Balinese 1B00 1B7F
Sundanese 1B80 1BBF
Batak 1BC0 1BFF
Lepcha 1C00 1C4F
Ol_Chiki 1C50 1C7F
Vedic_Extensions 1CD0 1CFF
Phonetic_Extensions 1D00 1D7F
Phonetic_Extensions_Supplement 1D80 1DBF
Combining_Diacritical_Marks_Supplement 1DC0 1DFF
Latin_Extended_Additional 1E00 1EFF
Greek_Extended 1F00 1FFF
General_Punctuation 2000 206F
Superscripts_and_Subscripts 2070 209F
Currency_Symbols 20A0 20CF
Combining_Diacritical_Marks_for_Symbols 20D0 20FF
Letterlike_Symbols 2100 214F
Number_Forms 2150 218F
Arrows 2190 21FF
Mathematical_Operators 2200 22FF
Miscellaneous_Technical 2300 23FF
Control_Pictures 2400 243F
Optical_Character_Recognition 2440 245F
Enclosed_Alphanumerics 2460 24FF
Box_Drawing 2500 257F
Block_Elements 2580 259F
Geometric_Shapes 25A0 25FF
Miscellaneous_Symbols 2600 26FF
Dingbats 2700 27BF
Miscellaneous_Mathematical_Symbols-A 27C0 27EF
Supplemental_Arrows-A 27F0 27FF
Braille_Patterns 2800 28FF
Supplemental_Arrows-B 2900 297F
Miscellaneous_Mathematical_Symbols-B 2980 29FF
Supplemental_Mathematical_Operators 2A00 2AFF
Miscellaneous_Symbols_and_Arrows 2B00 2BFF
Glagolitic 2C00 2C5F
Latin_Extended-C 2C60 2C7F
Coptic 2C80 2CFF
Georgian_Supplement 2D00 2D2F
Tifinagh 2D30 2D7F
Ethiopic_Extended 2D80 2DDF
Cyrillic_Extended-A 2DE0 2DFF
Supplemental_Punctuation 2E00 2E7F
CJK_Radicals_Supplement 2E80 2EFF
Kangxi_Radicals 2F00 2FDF
Ideographic_Description_Characters 2FF0 2FFF
CJK_Symbols_and_Punctuation 3000 303F
Hiragana 3040 309F
Katakana 30A0 30FF
Bopomofo 3100 312F
Hangul_Compatibility_Jamo 3130 318F
Kanbun 3190 319F
Bopomofo_Extended 31A0 31BF
CJK_Strokes 31C0 31EF
Katakana_Phonetic_Extensions 31F0 31FF
Enclosed_CJK_Letters_and_Months 3200 32FF
CJK_Compatibility 3300 33FF
CJK_Unified_Ideographs_Extension_A 3400 4DBF
Yijing_Hexagram_Symbols 4DC0 4DFF
CJK_Unified_Ideographs 4E00 9FFF
Yi_Syllables A000 A48F
Yi_Radicals A490 A4CF
Lisu A4D0 A4FF
Vai A500 A63F
Cyrillic_Extended-B A640 A69F
Bamum A6A0 A6FF
Modifier_Tone_Letters A700 A71F
Latin_Extended-D A720 A7FF
Syloti_Nagri A800 A82F
Common_Indic_Number_Forms A830 A83F
Phags-pa A840 A87F
Saurashtra A880 A8DF
Devanagari_Extended A8E0 A8FF
Kayah_Li A900 A92F
Rejang A930 A95F
Hangul_Jamo_Extended-A A960 A97F
Javanese A980 A9DF
Cham AA00 AA5F
Myanmar_Extended-A AA60 AA7F
Tai_Viet AA80 AADF
Ethiopic_Extended-A AB00 AB2F
Meetei_Mayek ABC0 ABFF
Hangul_Syllables AC00 D7AF
Hangul_Jamo_Extended-B D7B0 D7FF
High_Surrogates D800 DB7F
High_Private_Use_Surrogates DB80 DBFF
Low_Surrogates DC00 DFFF
Private_Use_Area E000 F8FF
CJK_Compatibility_Ideographs F900 FAFF
Alphabetic_Presentation_Forms FB00 FB4F
Arabic_Presentation_Forms-A FB50 FDFF
Variation_Selectors FE00 FE0F
Vertical_Forms FE10 FE1F
Combining_Half_Marks FE20 FE2F
CJK_Compatibility_Forms FE30 FE4F
Small_Form_Variants FE50 FE6F
Arabic_Presentation_Forms-B FE70 FEFF
Halfwidth_and_Fullwidth_Forms FF00 FFEF