diff --git a/tools/unicode/c/genuca/collationbasedatabuilder.cpp b/tools/unicode/c/genuca/collationbasedatabuilder.cpp index e700586387a..bacb032a969 100644 --- a/tools/unicode/c/genuca/collationbasedatabuilder.cpp +++ b/tools/unicode/c/genuca/collationbasedatabuilder.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2012-2014, International Business Machines +* Copyright (C) 2012-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationbasedatabuilder.cpp @@ -85,7 +85,10 @@ CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode) : CollationDataBuilder(errorCode), numericPrimary(0x12000000), firstHanPrimary(0), lastHanPrimary(0), hanStep(2), - rootElements(errorCode) { + rootElements(errorCode), + scriptStartsLength(1) { + uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex)); + uprv_memset(scriptStarts, 0, sizeof(scriptStarts)); } CollationBaseDataBuilder::~CollationBaseDataBuilder() { @@ -309,27 +312,43 @@ CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) { } void -CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte, - const UnicodeString &groupScripts, - UErrorCode &errorCode) { - if(U_FAILURE(errorCode)) { return; } - if(groupScripts.isEmpty()) { - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return; +CollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) { + // The primary weight must be the lowest possible for a two-byte prefix. + // It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary. + U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2); + p >>= 8; + U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2); + p >>= 8; + uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2; + if((p & 0xff) == lowestP2) { + // The script really starts on a lead byte boundary. Round down to that. + p &= 0xff00; } - if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) { - // Zzzz must not occur. - // It is the code used in the API to separate low and high scripts. - errorCode = U_ILLEGAL_ARGUMENT_ERROR; - return; + // Script starts should be added in ascending order, otherwise we would need to sort them. + if(script < UCOL_REORDER_CODE_FIRST) { + U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT); + } else { + U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15)); + script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST; + } + if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) { + // Two scripts share a range (e.g., Hira & Kana). + scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1); + } else { + U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p); + U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts)); + scriptsIndex[script] = (uint16_t)scriptStartsLength; + scriptStarts[scriptStartsLength++] = (uint16_t)p; + } + if(script == USCRIPT_UNKNOWN) { + // The last script start is for unassigned code points + // (with high implict primary weights). + // Add one more entry with the limit of this range, + // which is the start of the trailing-weights range. + U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts)); + scriptStarts[scriptStartsLength++] = + (uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00); } - // Note: We are mostly trusting the input data, - // rather than verifying that reordering groups do not intersect - // with their lead byte ranges nor their sets of scripts, - // and that all script codes are valid. - scripts.append((UChar)((firstByte << 8) | lastByte)); - scripts.append((UChar)groupScripts.length()); - scripts.append(groupScripts); } void @@ -337,8 +356,18 @@ CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) { buildMappings(data, errorCode); data.numericPrimary = numericPrimary; data.compressibleBytes = compressibleBytes; - data.scripts = reinterpret_cast(scripts.getBuffer()); - data.scriptsLength = scripts.length(); + + int32_t numScripts = USCRIPT_CODE_LIMIT; + while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; } + // Move the 16 special groups (not all used) + // down for contiguous storage of the script and special-group indexes. + for(int32_t i = 0; i < 16; ++i) { + scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i]; + } + data.numScripts = numScripts; + data.scriptsIndex = scriptsIndex; + data.scriptStarts = scriptStarts; + data.scriptStartsLength = scriptStartsLength; buildFastLatinTable(data, errorCode); } diff --git a/tools/unicode/c/genuca/collationbasedatabuilder.h b/tools/unicode/c/genuca/collationbasedatabuilder.h index aaa6be10fc3..214b5217371 100644 --- a/tools/unicode/c/genuca/collationbasedatabuilder.h +++ b/tools/unicode/c/genuca/collationbasedatabuilder.h @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2012-2014, International Business Machines +* Copyright (C) 2012-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationbasedatabuilder.h @@ -18,6 +18,7 @@ #include "unicode/uniset.h" #include "unicode/unistr.h" +#include "unicode/uscript.h" #include "collation.h" #include "collationdata.h" #include "collationdatabuilder.h" @@ -66,9 +67,7 @@ public: void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); void addRootElement(int64_t ce, UErrorCode &errorCode); - void addReorderingGroup(uint32_t firstByte, uint32_t lastByte, - const UnicodeString &groupScripts, - UErrorCode &errorCode); + void addScriptStart(int32_t script, uint32_t p); virtual void build(CollationData &data, UErrorCode &errorCode); @@ -86,7 +85,9 @@ private: uint32_t lastHanPrimary; int32_t hanStep; UVector64 rootElements; - UnicodeString scripts; + uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16]; // need exactly this many + uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16]; // should be safely more than needed + int32_t scriptStartsLength; }; U_NAMESPACE_END diff --git a/tools/unicode/c/genuca/genuca.cpp b/tools/unicode/c/genuca/genuca.cpp index 94793919e5c..c0aa67433c2 100644 --- a/tools/unicode/c/genuca/genuca.cpp +++ b/tools/unicode/c/genuca/genuca.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000-2014, International Business Machines +* Copyright (C) 2000-2015, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -25,6 +25,8 @@ #include "unicode/utypes.h" #include "unicode/errorcode.h" #include "unicode/localpointer.h" +#include "unicode/ucol.h" +#include "unicode/uscript.h" #include "unicode/utf8.h" #include "charstr.h" #include "cmemory.h" @@ -46,8 +48,6 @@ #include "uparse.h" #include "writesrc.h" -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - #if UCONFIG_NO_COLLATION extern "C" int @@ -83,7 +83,7 @@ static UDataInfo ucaDataInfo={ 0, { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" - { 4, 1, 0, 0 }, // formatVersion + { 5, 0, 0, 0 }, // formatVersion { 6, 3, 0, 0 } // dataVersion }; @@ -206,32 +206,151 @@ static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode } } +// Hardcoded mapping from script sample characters to script codes. +// Pro: Available without complete and updated UCD scripts data, +// easy to add non-script codes specific to collation. +// Con: Needs manual update for each new script or change in sample character. static const struct { - const char *name; - int32_t code; -} specialReorderTokens[] = { - { "TERMINATOR", -2 }, // -2 means "ignore" - { "LEVEL-SEPARATOR", -2 }, - { "FIELD-SEPARATOR", -2 }, - { "COMPRESS", -3 }, - // The standard name is "PUNCT" but FractionalUCA.txt uses the long form. - { "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION }, - { "IMPLICIT", USCRIPT_HAN }, // Implicit weights are usually for Han characters. Han & unassigned share a lead byte. - { "TRAILING", -2 }, // We do not reorder trailing weights (those after implicits). - { "SPECIAL", -2 } // We must never reorder internal, special CE lead bytes. + UChar32 sampleChar; + int32_t script; +} sampleCharsToScripts[] = { + { 0x00A0, UCOL_REORDER_CODE_SPACE }, + { 0x201C, UCOL_REORDER_CODE_PUNCTUATION }, + { 0x263A, UCOL_REORDER_CODE_SYMBOL }, + { 0x20AC, UCOL_REORDER_CODE_CURRENCY }, + { 0x0034, UCOL_REORDER_CODE_DIGIT }, + { 0x004C, USCRIPT_LATIN }, + { 0x03A9, USCRIPT_GREEK }, + { 0x03E2, USCRIPT_COPTIC }, + { 0x042F, USCRIPT_CYRILLIC }, + { 0x2C00, USCRIPT_GLAGOLITIC }, + { 0x1036B, USCRIPT_OLD_PERMIC }, + { 0x10D3, USCRIPT_GEORGIAN }, + { 0x0531, USCRIPT_ARMENIAN }, + { 0x05D0, USCRIPT_HEBREW }, + { 0x10900, USCRIPT_PHOENICIAN }, + { 0x0800, USCRIPT_SAMARITAN }, + { 0x0628, USCRIPT_ARABIC }, + { 0x0710, USCRIPT_SYRIAC }, + { 0x0840, USCRIPT_MANDAIC }, + { 0x078C, USCRIPT_THAANA }, + { 0x07CA, USCRIPT_NKO }, + { 0x2D5E, USCRIPT_TIFINAGH }, + { 0x12A0, USCRIPT_ETHIOPIC }, + { 0x0905, USCRIPT_DEVANAGARI }, + { 0x0995, USCRIPT_BENGALI }, + { 0x0A15, USCRIPT_GURMUKHI }, + { 0x0A95, USCRIPT_GUJARATI }, + { 0x0B15, USCRIPT_ORIYA }, + { 0x0B95, USCRIPT_TAMIL }, + { 0x0C15, USCRIPT_TELUGU }, + { 0x0C95, USCRIPT_KANNADA }, + { 0x0D15, USCRIPT_MALAYALAM }, + { 0x0D85, USCRIPT_SINHALA }, + { 0xABC0, USCRIPT_MEITEI_MAYEK }, + { 0xA800, USCRIPT_SYLOTI_NAGRI }, + { 0xA882, USCRIPT_SAURASHTRA }, + { 0x11083, USCRIPT_KAITHI }, + { 0x11152, USCRIPT_MAHAJANI }, + { 0x11183, USCRIPT_SHARADA }, + { 0x11208, USCRIPT_KHOJKI }, + { 0x112BE, USCRIPT_KHUDAWADI }, + { 0x11315, USCRIPT_GRANTHA }, + { 0x11484, USCRIPT_TIRHUTA }, + { 0x1158E, USCRIPT_SIDDHAM }, + { 0x1160E, USCRIPT_MODI }, + { 0x11680, USCRIPT_TAKRI }, + { 0x1B83, USCRIPT_SUNDANESE }, + { 0x11005, USCRIPT_BRAHMI }, + { 0x10A00, USCRIPT_KHAROSHTHI }, + { 0x0E17, USCRIPT_THAI }, + { 0x0EA5, USCRIPT_LAO }, + { 0xAA80, USCRIPT_TAI_VIET }, + { 0x0F40, USCRIPT_TIBETAN }, + { 0x1C00, USCRIPT_LEPCHA }, + { 0xA840, USCRIPT_PHAGS_PA }, + { 0x1900, USCRIPT_LIMBU }, + { 0x1703, USCRIPT_TAGALOG }, + { 0x1723, USCRIPT_HANUNOO }, + { 0x1743, USCRIPT_BUHID }, + { 0x1763, USCRIPT_TAGBANWA }, + { 0x1A00, USCRIPT_BUGINESE }, + { 0x1BC0, USCRIPT_BATAK }, + { 0xA930, USCRIPT_REJANG }, + { 0xA90A, USCRIPT_KAYAH_LI }, + { 0x1000, USCRIPT_MYANMAR }, + { 0x11103, USCRIPT_CHAKMA }, + { 0x1780, USCRIPT_KHMER }, + { 0x1950, USCRIPT_TAI_LE }, + { 0x1980, USCRIPT_NEW_TAI_LUE }, + { 0x1A20, USCRIPT_LANNA }, + { 0xAA00, USCRIPT_CHAM }, + { 0x1B05, USCRIPT_BALINESE }, + { 0xA984, USCRIPT_JAVANESE }, + { 0x1826, USCRIPT_MONGOLIAN }, + { 0x1C5A, USCRIPT_OL_CHIKI }, + { 0x13C4, USCRIPT_CHEROKEE }, + { 0x14C0, USCRIPT_CANADIAN_ABORIGINAL }, + { 0x168F, USCRIPT_OGHAM }, + { 0x16A0, USCRIPT_RUNIC }, + { 0x10C00, USCRIPT_ORKHON }, + { 0xA549, USCRIPT_VAI }, + { 0xA6A0, USCRIPT_BAMUM }, + { 0x16AE6, USCRIPT_BASSA_VAH }, + { 0x1E802, USCRIPT_MENDE }, + { 0xAC00, USCRIPT_HANGUL }, + { 0x304B, USCRIPT_HIRAGANA }, + { 0x30AB, USCRIPT_KATAKANA }, + { 0x3105, USCRIPT_BOPOMOFO }, + { 0xA288, USCRIPT_YI }, + { 0xA4D0, USCRIPT_LISU }, + { 0x16F00, USCRIPT_MIAO }, + { 0x118B4, USCRIPT_WARANG_CITI }, + { 0x11AC0, USCRIPT_PAU_CIN_HAU }, + { 0x16B1C, USCRIPT_PAHAWH_HMONG }, + { 0x10280, USCRIPT_LYCIAN }, + { 0x102A0, USCRIPT_CARIAN }, + { 0x10920, USCRIPT_LYDIAN }, + { 0x10300, USCRIPT_OLD_ITALIC }, + { 0x10330, USCRIPT_GOTHIC }, + { 0x10414, USCRIPT_DESERET }, + { 0x10450, USCRIPT_SHAVIAN }, + { 0x1BC20, USCRIPT_DUPLOYAN }, + { 0x10480, USCRIPT_OSMANYA }, + { 0x10500, USCRIPT_ELBASAN }, + { 0x10537, USCRIPT_CAUCASIAN_ALBANIAN }, + { 0x110D0, USCRIPT_SORA_SOMPENG }, + { 0x16A4F, USCRIPT_MRO }, + { 0x10000, USCRIPT_LINEAR_B }, + { 0x10647, USCRIPT_LINEAR_A }, + { 0x10800, USCRIPT_CYPRIOT }, + { 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN }, + { 0x10A95, USCRIPT_OLD_NORTH_ARABIAN }, + { 0x10B00, USCRIPT_AVESTAN }, + { 0x10873, USCRIPT_PALMYRENE }, + { 0x10896, USCRIPT_NABATAEAN }, + { 0x10840, USCRIPT_IMPERIAL_ARAMAIC }, + { 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN }, + { 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI }, + { 0x10B8F, USCRIPT_PSALTER_PAHLAVI }, + { 0x10AD8, USCRIPT_MANICHAEAN }, + { 0x10380, USCRIPT_UGARITIC }, + { 0x103A0, USCRIPT_OLD_PERSIAN }, + { 0x12000, USCRIPT_CUNEIFORM }, + { 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS }, + { 0x109A0, USCRIPT_MEROITIC_CURSIVE }, + { 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS }, + { 0x5B57, USCRIPT_HAN }, + { 0xFDD0, USCRIPT_UNKNOWN } // unassigned-implicit primary weights }; -int32_t getReorderCode(const char* name) { - int32_t code = CollationRuleParser::getReorderCode(name); - if (code >= 0) { - return code; - } - for (int32_t i = 0; i < LENGTHOF(specialReorderTokens); ++i) { - if (0 == strcmp(name, specialReorderTokens[i].name)) { - return specialReorderTokens[i].code; +static int32_t getCharScript(UChar32 c) { + for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) { + if(c == sampleCharsToScripts[i].sampleChar) { + return sampleCharsToScripts[i].script; } } - return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE. + return USCRIPT_INVALID_CODE; // -1 } /** @@ -333,7 +452,7 @@ static struct { }; static int64_t getOptionValue(const char *name) { - for (int32_t i = 0; i < LENGTHOF(vt); ++i) { + for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) { if(uprv_strcmp(name, vt[i].name) == 0) { return vt[i].value; } @@ -341,11 +460,9 @@ static int64_t getOptionValue(const char *name) { return 0; } -static UnicodeString *leadByteScripts = NULL; - static void readAnOption( CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) { - for (int32_t cnt = 0; cnt= 2 && c == 0xFDD1) { + UChar32 c2 = s.char32At(1); + int32_t script = getCharScript(c2); + if(script < 0) { + fprintf(stderr, + "Error: Unknown script for first-primary sample character " + "U+%04x on line %u of %s\n" + " (add the character to genuca.cpp sampleCharsToScripts[])\n", + c2, (int)line, filename); + exit(U_INVALID_FORMAT_ERROR); + } + if(script == USCRIPT_UNKNOWN) { + // FDD1 FDD0, first unassigned-implicit primary + builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY); + continue; + } + builder.addScriptStart(script, p); + if(script == USCRIPT_HIRAGANA) { + builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p); + } else if(script == USCRIPT_HAN) { + builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p); + builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p); + } } if(0xe0000000 <= p && p < 0xf0000000) { @@ -887,40 +995,6 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder, return; } - if(leadByteScripts != NULL) { - uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1; - do { - // Find the range of lead bytes with this set of scripts. - const UnicodeString &firstScripts = leadByteScripts[firstLead]; - if(firstScripts.isEmpty()) { - fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead); - errorCode = U_INVALID_FORMAT_ERROR; - return; - } - uint32_t lead = firstLead; - for(;;) { - ++lead; - const UnicodeString &scripts = leadByteScripts[lead]; - // The scripts should either be the same or disjoint. - // We do not test if all reordering groups have disjoint sets of scripts. - if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; } - if(scripts != firstScripts) { - fprintf(stderr, - "[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] " - "but not all scripts match\n", - (int)firstLead, scripts[0], (int)lead); - errorCode = U_INVALID_FORMAT_ERROR; - return; - } - } - // lead is one greater than the last lead byte with the same set of scripts as firstLead. - builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode); - if(U_FAILURE(errorCode)) { return; } - firstLead = lead; - } while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE); - delete[] leadByteScripts; - } - CollationData data(*Normalizer2Factory::getNFCImpl(errorCode)); builder.enableFastLatin(); builder.build(data, errorCode); @@ -1168,7 +1242,7 @@ extern "C" int main(int argc, char* argv[]) { U_MAIN_INIT_ARGS(argc, argv); - argc=u_parseArgs(argc, argv, LENGTHOF(options), options); + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); /* error handling, printing usage message */ if(argc<0) {