mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-11449 collation formatVersion 5: reorder single scripts not groups, write new scripts data with reserved ranges
X-SVN-Rev: 36914
This commit is contained in:
parent
b2c4740141
commit
12f91f5673
3 changed files with 248 additions and 144 deletions
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationbasedatabuilder.cpp
|
||||
|
@ -85,7 +85,10 @@ CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
|
|||
: CollationDataBuilder(errorCode),
|
||||
numericPrimary(0x12000000),
|
||||
firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
|
||||
rootElements(errorCode) {
|
||||
rootElements(errorCode),
|
||||
scriptStartsLength(1) {
|
||||
uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
|
||||
uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
|
||||
}
|
||||
|
||||
CollationBaseDataBuilder::~CollationBaseDataBuilder() {
|
||||
|
@ -309,27 +312,43 @@ CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
|
|||
}
|
||||
|
||||
void
|
||||
CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
|
||||
const UnicodeString &groupScripts,
|
||||
UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(groupScripts.isEmpty()) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
CollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) {
|
||||
// The primary weight must be the lowest possible for a two-byte prefix.
|
||||
// It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary.
|
||||
U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
|
||||
p >>= 8;
|
||||
U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
|
||||
p >>= 8;
|
||||
uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2;
|
||||
if((p & 0xff) == lowestP2) {
|
||||
// The script really starts on a lead byte boundary. Round down to that.
|
||||
p &= 0xff00;
|
||||
}
|
||||
if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
|
||||
// Zzzz must not occur.
|
||||
// It is the code used in the API to separate low and high scripts.
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
// Script starts should be added in ascending order, otherwise we would need to sort them.
|
||||
if(script < UCOL_REORDER_CODE_FIRST) {
|
||||
U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT);
|
||||
} else {
|
||||
U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15));
|
||||
script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST;
|
||||
}
|
||||
if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) {
|
||||
// Two scripts share a range (e.g., Hira & Kana).
|
||||
scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1);
|
||||
} else {
|
||||
U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p);
|
||||
U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
|
||||
scriptsIndex[script] = (uint16_t)scriptStartsLength;
|
||||
scriptStarts[scriptStartsLength++] = (uint16_t)p;
|
||||
}
|
||||
if(script == USCRIPT_UNKNOWN) {
|
||||
// The last script start is for unassigned code points
|
||||
// (with high implicit primary weights).
|
||||
// Add one more entry with the limit of this range,
|
||||
// which is the start of the trailing-weights range.
|
||||
U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
|
||||
scriptStarts[scriptStartsLength++] =
|
||||
(uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00);
|
||||
}
|
||||
// Note: We are mostly trusting the input data,
|
||||
// rather than verifying that reordering groups do not intersect
|
||||
// with their lead byte ranges nor their sets of scripts,
|
||||
// and that all script codes are valid.
|
||||
scripts.append((UChar)((firstByte << 8) | lastByte));
|
||||
scripts.append((UChar)groupScripts.length());
|
||||
scripts.append(groupScripts);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -337,8 +356,18 @@ CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
|
|||
buildMappings(data, errorCode);
|
||||
data.numericPrimary = numericPrimary;
|
||||
data.compressibleBytes = compressibleBytes;
|
||||
data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
|
||||
data.scriptsLength = scripts.length();
|
||||
|
||||
int32_t numScripts = USCRIPT_CODE_LIMIT;
|
||||
while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; }
|
||||
// Move the 16 special groups (not all used)
|
||||
// down for contiguous storage of the script and special-group indexes.
|
||||
for(int32_t i = 0; i < 16; ++i) {
|
||||
scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i];
|
||||
}
|
||||
data.numScripts = numScripts;
|
||||
data.scriptsIndex = scriptsIndex;
|
||||
data.scriptStarts = scriptStarts;
|
||||
data.scriptStartsLength = scriptStartsLength;
|
||||
buildFastLatinTable(data, errorCode);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationbasedatabuilder.h
|
||||
|
@ -18,6 +18,7 @@
|
|||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "collation.h"
|
||||
#include "collationdata.h"
|
||||
#include "collationdatabuilder.h"
|
||||
|
@ -66,9 +67,7 @@ public:
|
|||
void addRootElements(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
|
||||
void addRootElement(int64_t ce, UErrorCode &errorCode);
|
||||
|
||||
void addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
|
||||
const UnicodeString &groupScripts,
|
||||
UErrorCode &errorCode);
|
||||
void addScriptStart(int32_t script, uint32_t p);
|
||||
|
||||
virtual void build(CollationData &data, UErrorCode &errorCode);
|
||||
|
||||
|
@ -86,7 +85,9 @@ private:
|
|||
uint32_t lastHanPrimary;
|
||||
int32_t hanStep;
|
||||
UVector64 rootElements;
|
||||
UnicodeString scripts;
|
||||
uint16_t scriptsIndex[USCRIPT_CODE_LIMIT + 16]; // need exactly this many
|
||||
uint16_t scriptStarts[USCRIPT_CODE_LIMIT + 16]; // should be safely more than needed
|
||||
int32_t scriptStartsLength;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2014, International Business Machines
|
||||
* Copyright (C) 2000-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -25,6 +25,8 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/ucol.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -46,8 +48,6 @@
|
|||
#include "uparse.h"
|
||||
#include "writesrc.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
#if UCONFIG_NO_COLLATION
|
||||
|
||||
extern "C" int
|
||||
|
@ -83,7 +83,7 @@ static UDataInfo ucaDataInfo={
|
|||
0,
|
||||
|
||||
{ 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
|
||||
{ 4, 1, 0, 0 }, // formatVersion
|
||||
{ 5, 0, 0, 0 }, // formatVersion
|
||||
{ 6, 3, 0, 0 } // dataVersion
|
||||
};
|
||||
|
||||
|
@ -206,32 +206,151 @@ static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode
|
|||
}
|
||||
}
|
||||
|
||||
// Hardcoded mapping from script sample characters to script codes.
|
||||
// Pro: Available without complete and updated UCD scripts data,
|
||||
// easy to add non-script codes specific to collation.
|
||||
// Con: Needs manual update for each new script or change in sample character.
|
||||
static const struct {
|
||||
const char *name;
|
||||
int32_t code;
|
||||
} specialReorderTokens[] = {
|
||||
{ "TERMINATOR", -2 }, // -2 means "ignore"
|
||||
{ "LEVEL-SEPARATOR", -2 },
|
||||
{ "FIELD-SEPARATOR", -2 },
|
||||
{ "COMPRESS", -3 },
|
||||
// The standard name is "PUNCT" but FractionalUCA.txt uses the long form.
|
||||
{ "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION },
|
||||
{ "IMPLICIT", USCRIPT_HAN }, // Implicit weights are usually for Han characters. Han & unassigned share a lead byte.
|
||||
{ "TRAILING", -2 }, // We do not reorder trailing weights (those after implicits).
|
||||
{ "SPECIAL", -2 } // We must never reorder internal, special CE lead bytes.
|
||||
UChar32 sampleChar;
|
||||
int32_t script;
|
||||
} sampleCharsToScripts[] = {
|
||||
{ 0x00A0, UCOL_REORDER_CODE_SPACE },
|
||||
{ 0x201C, UCOL_REORDER_CODE_PUNCTUATION },
|
||||
{ 0x263A, UCOL_REORDER_CODE_SYMBOL },
|
||||
{ 0x20AC, UCOL_REORDER_CODE_CURRENCY },
|
||||
{ 0x0034, UCOL_REORDER_CODE_DIGIT },
|
||||
{ 0x004C, USCRIPT_LATIN },
|
||||
{ 0x03A9, USCRIPT_GREEK },
|
||||
{ 0x03E2, USCRIPT_COPTIC },
|
||||
{ 0x042F, USCRIPT_CYRILLIC },
|
||||
{ 0x2C00, USCRIPT_GLAGOLITIC },
|
||||
{ 0x1036B, USCRIPT_OLD_PERMIC },
|
||||
{ 0x10D3, USCRIPT_GEORGIAN },
|
||||
{ 0x0531, USCRIPT_ARMENIAN },
|
||||
{ 0x05D0, USCRIPT_HEBREW },
|
||||
{ 0x10900, USCRIPT_PHOENICIAN },
|
||||
{ 0x0800, USCRIPT_SAMARITAN },
|
||||
{ 0x0628, USCRIPT_ARABIC },
|
||||
{ 0x0710, USCRIPT_SYRIAC },
|
||||
{ 0x0840, USCRIPT_MANDAIC },
|
||||
{ 0x078C, USCRIPT_THAANA },
|
||||
{ 0x07CA, USCRIPT_NKO },
|
||||
{ 0x2D5E, USCRIPT_TIFINAGH },
|
||||
{ 0x12A0, USCRIPT_ETHIOPIC },
|
||||
{ 0x0905, USCRIPT_DEVANAGARI },
|
||||
{ 0x0995, USCRIPT_BENGALI },
|
||||
{ 0x0A15, USCRIPT_GURMUKHI },
|
||||
{ 0x0A95, USCRIPT_GUJARATI },
|
||||
{ 0x0B15, USCRIPT_ORIYA },
|
||||
{ 0x0B95, USCRIPT_TAMIL },
|
||||
{ 0x0C15, USCRIPT_TELUGU },
|
||||
{ 0x0C95, USCRIPT_KANNADA },
|
||||
{ 0x0D15, USCRIPT_MALAYALAM },
|
||||
{ 0x0D85, USCRIPT_SINHALA },
|
||||
{ 0xABC0, USCRIPT_MEITEI_MAYEK },
|
||||
{ 0xA800, USCRIPT_SYLOTI_NAGRI },
|
||||
{ 0xA882, USCRIPT_SAURASHTRA },
|
||||
{ 0x11083, USCRIPT_KAITHI },
|
||||
{ 0x11152, USCRIPT_MAHAJANI },
|
||||
{ 0x11183, USCRIPT_SHARADA },
|
||||
{ 0x11208, USCRIPT_KHOJKI },
|
||||
{ 0x112BE, USCRIPT_KHUDAWADI },
|
||||
{ 0x11315, USCRIPT_GRANTHA },
|
||||
{ 0x11484, USCRIPT_TIRHUTA },
|
||||
{ 0x1158E, USCRIPT_SIDDHAM },
|
||||
{ 0x1160E, USCRIPT_MODI },
|
||||
{ 0x11680, USCRIPT_TAKRI },
|
||||
{ 0x1B83, USCRIPT_SUNDANESE },
|
||||
{ 0x11005, USCRIPT_BRAHMI },
|
||||
{ 0x10A00, USCRIPT_KHAROSHTHI },
|
||||
{ 0x0E17, USCRIPT_THAI },
|
||||
{ 0x0EA5, USCRIPT_LAO },
|
||||
{ 0xAA80, USCRIPT_TAI_VIET },
|
||||
{ 0x0F40, USCRIPT_TIBETAN },
|
||||
{ 0x1C00, USCRIPT_LEPCHA },
|
||||
{ 0xA840, USCRIPT_PHAGS_PA },
|
||||
{ 0x1900, USCRIPT_LIMBU },
|
||||
{ 0x1703, USCRIPT_TAGALOG },
|
||||
{ 0x1723, USCRIPT_HANUNOO },
|
||||
{ 0x1743, USCRIPT_BUHID },
|
||||
{ 0x1763, USCRIPT_TAGBANWA },
|
||||
{ 0x1A00, USCRIPT_BUGINESE },
|
||||
{ 0x1BC0, USCRIPT_BATAK },
|
||||
{ 0xA930, USCRIPT_REJANG },
|
||||
{ 0xA90A, USCRIPT_KAYAH_LI },
|
||||
{ 0x1000, USCRIPT_MYANMAR },
|
||||
{ 0x11103, USCRIPT_CHAKMA },
|
||||
{ 0x1780, USCRIPT_KHMER },
|
||||
{ 0x1950, USCRIPT_TAI_LE },
|
||||
{ 0x1980, USCRIPT_NEW_TAI_LUE },
|
||||
{ 0x1A20, USCRIPT_LANNA },
|
||||
{ 0xAA00, USCRIPT_CHAM },
|
||||
{ 0x1B05, USCRIPT_BALINESE },
|
||||
{ 0xA984, USCRIPT_JAVANESE },
|
||||
{ 0x1826, USCRIPT_MONGOLIAN },
|
||||
{ 0x1C5A, USCRIPT_OL_CHIKI },
|
||||
{ 0x13C4, USCRIPT_CHEROKEE },
|
||||
{ 0x14C0, USCRIPT_CANADIAN_ABORIGINAL },
|
||||
{ 0x168F, USCRIPT_OGHAM },
|
||||
{ 0x16A0, USCRIPT_RUNIC },
|
||||
{ 0x10C00, USCRIPT_ORKHON },
|
||||
{ 0xA549, USCRIPT_VAI },
|
||||
{ 0xA6A0, USCRIPT_BAMUM },
|
||||
{ 0x16AE6, USCRIPT_BASSA_VAH },
|
||||
{ 0x1E802, USCRIPT_MENDE },
|
||||
{ 0xAC00, USCRIPT_HANGUL },
|
||||
{ 0x304B, USCRIPT_HIRAGANA },
|
||||
{ 0x30AB, USCRIPT_KATAKANA },
|
||||
{ 0x3105, USCRIPT_BOPOMOFO },
|
||||
{ 0xA288, USCRIPT_YI },
|
||||
{ 0xA4D0, USCRIPT_LISU },
|
||||
{ 0x16F00, USCRIPT_MIAO },
|
||||
{ 0x118B4, USCRIPT_WARANG_CITI },
|
||||
{ 0x11AC0, USCRIPT_PAU_CIN_HAU },
|
||||
{ 0x16B1C, USCRIPT_PAHAWH_HMONG },
|
||||
{ 0x10280, USCRIPT_LYCIAN },
|
||||
{ 0x102A0, USCRIPT_CARIAN },
|
||||
{ 0x10920, USCRIPT_LYDIAN },
|
||||
{ 0x10300, USCRIPT_OLD_ITALIC },
|
||||
{ 0x10330, USCRIPT_GOTHIC },
|
||||
{ 0x10414, USCRIPT_DESERET },
|
||||
{ 0x10450, USCRIPT_SHAVIAN },
|
||||
{ 0x1BC20, USCRIPT_DUPLOYAN },
|
||||
{ 0x10480, USCRIPT_OSMANYA },
|
||||
{ 0x10500, USCRIPT_ELBASAN },
|
||||
{ 0x10537, USCRIPT_CAUCASIAN_ALBANIAN },
|
||||
{ 0x110D0, USCRIPT_SORA_SOMPENG },
|
||||
{ 0x16A4F, USCRIPT_MRO },
|
||||
{ 0x10000, USCRIPT_LINEAR_B },
|
||||
{ 0x10647, USCRIPT_LINEAR_A },
|
||||
{ 0x10800, USCRIPT_CYPRIOT },
|
||||
{ 0x10A60, USCRIPT_OLD_SOUTH_ARABIAN },
|
||||
{ 0x10A95, USCRIPT_OLD_NORTH_ARABIAN },
|
||||
{ 0x10B00, USCRIPT_AVESTAN },
|
||||
{ 0x10873, USCRIPT_PALMYRENE },
|
||||
{ 0x10896, USCRIPT_NABATAEAN },
|
||||
{ 0x10840, USCRIPT_IMPERIAL_ARAMAIC },
|
||||
{ 0x10B40, USCRIPT_INSCRIPTIONAL_PARTHIAN },
|
||||
{ 0x10B60, USCRIPT_INSCRIPTIONAL_PAHLAVI },
|
||||
{ 0x10B8F, USCRIPT_PSALTER_PAHLAVI },
|
||||
{ 0x10AD8, USCRIPT_MANICHAEAN },
|
||||
{ 0x10380, USCRIPT_UGARITIC },
|
||||
{ 0x103A0, USCRIPT_OLD_PERSIAN },
|
||||
{ 0x12000, USCRIPT_CUNEIFORM },
|
||||
{ 0x13153, USCRIPT_EGYPTIAN_HIEROGLYPHS },
|
||||
{ 0x109A0, USCRIPT_MEROITIC_CURSIVE },
|
||||
{ 0x10980, USCRIPT_MEROITIC_HIEROGLYPHS },
|
||||
{ 0x5B57, USCRIPT_HAN },
|
||||
{ 0xFDD0, USCRIPT_UNKNOWN } // unassigned-implicit primary weights
|
||||
};
|
||||
|
||||
int32_t getReorderCode(const char* name) {
|
||||
int32_t code = CollationRuleParser::getReorderCode(name);
|
||||
if (code >= 0) {
|
||||
return code;
|
||||
}
|
||||
for (int32_t i = 0; i < LENGTHOF(specialReorderTokens); ++i) {
|
||||
if (0 == strcmp(name, specialReorderTokens[i].name)) {
|
||||
return specialReorderTokens[i].code;
|
||||
static int32_t getCharScript(UChar32 c) {
|
||||
for(int32_t i = 0; i < UPRV_LENGTHOF(sampleCharsToScripts); ++i) {
|
||||
if(c == sampleCharsToScripts[i].sampleChar) {
|
||||
return sampleCharsToScripts[i].script;
|
||||
}
|
||||
}
|
||||
return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
|
||||
return USCRIPT_INVALID_CODE; // -1
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -333,7 +452,7 @@ static struct {
|
|||
};
|
||||
|
||||
static int64_t getOptionValue(const char *name) {
|
||||
for (int32_t i = 0; i < LENGTHOF(vt); ++i) {
|
||||
for (int32_t i = 0; i < UPRV_LENGTHOF(vt); ++i) {
|
||||
if(uprv_strcmp(name, vt[i].name) == 0) {
|
||||
return vt[i].value;
|
||||
}
|
||||
|
@ -341,11 +460,9 @@ static int64_t getOptionValue(const char *name) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
static UnicodeString *leadByteScripts = NULL;
|
||||
|
||||
static void readAnOption(
|
||||
CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
|
||||
for (int32_t cnt = 0; cnt<LENGTHOF(vt); cnt++) {
|
||||
for (int32_t cnt = 0; cnt<UPRV_LENGTHOF(vt); cnt++) {
|
||||
int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
|
||||
if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
|
||||
ActionType what_to_do = vt[cnt].what_to_do;
|
||||
|
@ -491,54 +608,12 @@ static void readAnOption(
|
|||
fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
|
||||
}
|
||||
} else if (what_to_do == READLEADBYTETOSCRIPTS) {
|
||||
uint16_t leadByte = (hex2num(*pointer++) * 16);
|
||||
leadByte += hex2num(*pointer++);
|
||||
|
||||
if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) {
|
||||
// Extend the Hani range to the end of what this implementation uses.
|
||||
// FractionalUCA.txt assumes a different algorithm for implicit primary weights,
|
||||
// and different high-lead byte ranges.
|
||||
leadByteScripts[leadByte] = leadByteScripts[0xdf];
|
||||
return;
|
||||
}
|
||||
|
||||
UnicodeString scripts;
|
||||
for(;;) {
|
||||
pointer = skipWhiteSpace(pointer);
|
||||
if (*pointer == ']') {
|
||||
break;
|
||||
}
|
||||
const char *scriptName = pointer;
|
||||
char c;
|
||||
while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; }
|
||||
if(c == 0) {
|
||||
fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer);
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
*pointer = 0;
|
||||
int32_t reorderCode = getReorderCode(scriptName);
|
||||
*pointer = c;
|
||||
if (reorderCode == -3) { // COMPRESS
|
||||
builder.setCompressibleLeadByte(leadByte);
|
||||
continue;
|
||||
}
|
||||
if (reorderCode == -2) {
|
||||
continue; // Ignore "TERMINATOR" etc.
|
||||
}
|
||||
if (reorderCode < 0 || 0xffff < reorderCode) {
|
||||
fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName);
|
||||
*status = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
scripts.append((UChar)reorderCode);
|
||||
}
|
||||
if(!scripts.isEmpty()) {
|
||||
if(leadByteScripts == NULL) {
|
||||
leadByteScripts = new UnicodeString[256];
|
||||
}
|
||||
leadByteScripts[leadByte] = scripts;
|
||||
if (strstr(pointer, "COMPRESS") != NULL) {
|
||||
uint16_t leadByte = (hex2num(*pointer++) * 16);
|
||||
leadByte += hex2num(*pointer++);
|
||||
builder.setCompressibleLeadByte(leadByte);
|
||||
}
|
||||
// We do not need the list of scripts on this line.
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -730,9 +805,21 @@ parseFractionalUCA(const char *filename,
|
|||
// are only entered into the inverse table,
|
||||
// not into the normal collation data.
|
||||
builder.addRootElements(ces, cesLength, *status);
|
||||
if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) {
|
||||
// Lead byte for numeric sorting.
|
||||
builder.setNumericPrimary(p);
|
||||
if(s.length() == 2 && cesLength == 1) {
|
||||
switch(s[1]) {
|
||||
case 0x34:
|
||||
// Lead byte for numeric sorting.
|
||||
builder.setNumericPrimary(p);
|
||||
break;
|
||||
case 0xFF21:
|
||||
builder.addScriptStart(CollationData::REORDER_RESERVED_BEFORE_LATIN, p);
|
||||
break;
|
||||
case 0xFF3A:
|
||||
builder.addScriptStart(CollationData::REORDER_RESERVED_AFTER_LATIN, p);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
UChar32 c = s.char32At(0);
|
||||
|
@ -742,8 +829,29 @@ parseFractionalUCA(const char *filename,
|
|||
// CollationBaseDataBuilder::init() maps them to special CEs.
|
||||
// Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
|
||||
if(0xfffd <= c && c <= 0xffff) { continue; }
|
||||
if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) {
|
||||
continue;
|
||||
if(s.length() >= 2 && c == 0xFDD1) {
|
||||
UChar32 c2 = s.char32At(1);
|
||||
int32_t script = getCharScript(c2);
|
||||
if(script < 0) {
|
||||
fprintf(stderr,
|
||||
"Error: Unknown script for first-primary sample character "
|
||||
"U+%04x on line %u of %s\n"
|
||||
" (add the character to genuca.cpp sampleCharsToScripts[])\n",
|
||||
c2, (int)line, filename);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
if(script == USCRIPT_UNKNOWN) {
|
||||
// FDD1 FDD0, first unassigned-implicit primary
|
||||
builder.addScriptStart(script, Collation::FIRST_UNASSIGNED_PRIMARY);
|
||||
continue;
|
||||
}
|
||||
builder.addScriptStart(script, p);
|
||||
if(script == USCRIPT_HIRAGANA) {
|
||||
builder.addScriptStart(USCRIPT_KATAKANA_OR_HIRAGANA, p);
|
||||
} else if(script == USCRIPT_HAN) {
|
||||
builder.addScriptStart(USCRIPT_SIMPLIFIED_HAN, p);
|
||||
builder.addScriptStart(USCRIPT_TRADITIONAL_HAN, p);
|
||||
}
|
||||
}
|
||||
|
||||
if(0xe0000000 <= p && p < 0xf0000000) {
|
||||
|
@ -887,40 +995,6 @@ buildAndWriteBaseData(CollationBaseDataBuilder &builder,
|
|||
return;
|
||||
}
|
||||
|
||||
if(leadByteScripts != NULL) {
|
||||
uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1;
|
||||
do {
|
||||
// Find the range of lead bytes with this set of scripts.
|
||||
const UnicodeString &firstScripts = leadByteScripts[firstLead];
|
||||
if(firstScripts.isEmpty()) {
|
||||
fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead);
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
uint32_t lead = firstLead;
|
||||
for(;;) {
|
||||
++lead;
|
||||
const UnicodeString &scripts = leadByteScripts[lead];
|
||||
// The scripts should either be the same or disjoint.
|
||||
// We do not test if all reordering groups have disjoint sets of scripts.
|
||||
if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; }
|
||||
if(scripts != firstScripts) {
|
||||
fprintf(stderr,
|
||||
"[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] "
|
||||
"but not all scripts match\n",
|
||||
(int)firstLead, scripts[0], (int)lead);
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// lead is one greater than the last lead byte with the same set of scripts as firstLead.
|
||||
builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
firstLead = lead;
|
||||
} while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE);
|
||||
delete[] leadByteScripts;
|
||||
}
|
||||
|
||||
CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
|
||||
builder.enableFastLatin();
|
||||
builder.build(data, errorCode);
|
||||
|
@ -1168,7 +1242,7 @@ extern "C" int
|
|||
main(int argc, char* argv[]) {
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
|
||||
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
|
||||
argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
|
|
Loading…
Add table
Reference in a new issue