mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-11449 reorder single scripts not groups, scripts/groups can start on top-16-bit boundaries, data formatVersion 5 for new scripts data and optional reorderRanges appended to reorderCodes
X-SVN-Rev: 36924
This commit is contained in:
parent
bcdcc4dc67
commit
a9d7c3e4bd
33 changed files with 22801 additions and 22306 deletions
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2014, International Business Machines
|
||||
* Copyright (C) 2003-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -340,7 +340,7 @@ swapFormatVersion3(const UDataSwapper *ds,
|
|||
return header.size;
|
||||
}
|
||||
|
||||
// swap formatVersion 4 ---------------------------------------------------- ***
|
||||
// swap formatVersion 4 or 5 ----------------------------------------------- ***
|
||||
|
||||
// The following are copied from CollationDataReader, trading an awkward copy of constants
|
||||
// for an awkward relocation of the i18n collationdatareader.h file into the common library.
|
||||
|
@ -566,7 +566,7 @@ ucol_swap(const UDataSwapper *ds,
|
|||
info.dataFormat[1]==0x43 &&
|
||||
info.dataFormat[2]==0x6f &&
|
||||
info.dataFormat[3]==0x6c &&
|
||||
(info.formatVersion[0]==3 || info.formatVersion[0]==4)
|
||||
(3<=info.formatVersion[0] && info.formatVersion[0]<=5)
|
||||
)) {
|
||||
udata_printError(ds, "ucol_swap(): data format %02x.%02x.%02x.%02x "
|
||||
"(format version %02x.%02x) is not recognized as collation data\n",
|
||||
|
|
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2014, International Business Machines
|
||||
* Copyright (C) 2010-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collation.h
|
||||
|
@ -488,10 +488,6 @@ public:
|
|||
return makeCE(unassignedPrimaryFromCodePoint(c));
|
||||
}
|
||||
|
||||
static inline uint32_t reorder(const uint8_t reorderTable[256], uint32_t primary) {
|
||||
return ((uint32_t)reorderTable[primary >> 24] << 24) | (primary & 0xffffff);
|
||||
}
|
||||
|
||||
private:
|
||||
Collation(); // No instantiation.
|
||||
};
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines
|
||||
* Copyright (C) 1996-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationcompare.cpp
|
||||
|
@ -95,10 +95,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
|||
|
||||
if(leftPrimary != rightPrimary) {
|
||||
// Return the primary difference, with script reordering.
|
||||
const uint8_t *reorderTable = settings.reorderTable;
|
||||
if (reorderTable != NULL) {
|
||||
leftPrimary = Collation::reorder(reorderTable, leftPrimary);
|
||||
rightPrimary = Collation::reorder(reorderTable, rightPrimary);
|
||||
if(settings.hasReordering()) {
|
||||
leftPrimary = settings.reorder(leftPrimary);
|
||||
rightPrimary = settings.reorder(rightPrimary);
|
||||
}
|
||||
return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
|
||||
}
|
||||
|
@ -340,10 +339,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
|
|||
|
||||
if(leftQuaternary != rightQuaternary) {
|
||||
// Return the difference, with script reordering.
|
||||
const uint8_t *reorderTable = settings.reorderTable;
|
||||
if (reorderTable != NULL) {
|
||||
leftQuaternary = Collation::reorder(reorderTable, leftQuaternary);
|
||||
rightQuaternary = Collation::reorder(reorderTable, rightQuaternary);
|
||||
if(settings.hasReordering()) {
|
||||
leftQuaternary = settings.reorder(leftQuaternary);
|
||||
rightQuaternary = settings.reorder(rightQuaternary);
|
||||
}
|
||||
return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationdata.cpp
|
||||
|
@ -21,6 +21,7 @@
|
|||
#include "collationdata.h"
|
||||
#include "uassert.h"
|
||||
#include "utrie2.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -114,48 +115,57 @@ CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
|
|||
|
||||
uint32_t
|
||||
CollationData::getFirstPrimaryForGroup(int32_t script) const {
|
||||
int32_t index = findScript(script);
|
||||
if(index < 0) {
|
||||
return 0;
|
||||
}
|
||||
uint32_t head = scripts[index];
|
||||
return (head & 0xff00) << 16;
|
||||
int32_t index = getScriptIndex(script);
|
||||
return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
CollationData::getLastPrimaryForGroup(int32_t script) const {
|
||||
int32_t index = findScript(script);
|
||||
if(index < 0) {
|
||||
int32_t index = getScriptIndex(script);
|
||||
if(index == 0) {
|
||||
return 0;
|
||||
}
|
||||
uint32_t head = scripts[index];
|
||||
uint32_t lastByte = head & 0xff;
|
||||
return ((lastByte + 1) << 24) - 1;
|
||||
uint32_t limit = scriptStarts[index + 1];
|
||||
return (limit << 16) - 1;
|
||||
}
|
||||
|
||||
int32_t
|
||||
CollationData::getGroupForPrimary(uint32_t p) const {
|
||||
p >>= 24; // Reordering groups are distinguished by primary lead bytes.
|
||||
for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) {
|
||||
uint32_t lastByte = scripts[i] & 0xff;
|
||||
if(p <= lastByte) {
|
||||
return scripts[i + 2];
|
||||
p >>= 16;
|
||||
if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
|
||||
return -1;
|
||||
}
|
||||
int32_t index = 1;
|
||||
while(p >= scriptStarts[index + 1]) { ++index; }
|
||||
for(int32_t i = 0; i < numScripts; ++i) {
|
||||
if(scriptsIndex[i] == index) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
|
||||
if(scriptsIndex[numScripts + i] == index) {
|
||||
return UCOL_REORDER_CODE_FIRST + i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t
|
||||
CollationData::findScript(int32_t script) const {
|
||||
if(script < 0 || 0xffff < script) { return -1; }
|
||||
for(int32_t i = 0; i < scriptsLength;) {
|
||||
int32_t limit = i + 2 + scripts[i + 1];
|
||||
for(int32_t j = i + 2; j < limit; ++j) {
|
||||
if(script == scripts[j]) { return i; }
|
||||
CollationData::getScriptIndex(int32_t script) const {
|
||||
if(script < 0) {
|
||||
return 0;
|
||||
} else if(script < numScripts) {
|
||||
return scriptsIndex[script];
|
||||
} else if(script < UCOL_REORDER_CODE_FIRST) {
|
||||
return 0;
|
||||
} else {
|
||||
script -= UCOL_REORDER_CODE_FIRST;
|
||||
if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
|
||||
return scriptsIndex[numScripts + script];
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
i = limit;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
@ -163,85 +173,114 @@ CollationData::getEquivalentScripts(int32_t script,
|
|||
int32_t dest[], int32_t capacity,
|
||||
UErrorCode &errorCode) const {
|
||||
if(U_FAILURE(errorCode)) { return 0; }
|
||||
int32_t i = findScript(script);
|
||||
if(i < 0) { return 0; }
|
||||
int32_t length = scripts[i + 1];
|
||||
U_ASSERT(length != 0);
|
||||
int32_t index = getScriptIndex(script);
|
||||
if(index == 0) { return 0; }
|
||||
if(script >= UCOL_REORDER_CODE_FIRST) {
|
||||
// Special groups have no aliases.
|
||||
if(capacity > 0) {
|
||||
dest[0] = script;
|
||||
} else {
|
||||
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t length = 0;
|
||||
for(int32_t i = 0; i < numScripts; ++i) {
|
||||
if(scriptsIndex[i] == index) {
|
||||
if(length < capacity) {
|
||||
dest[length] = i;
|
||||
}
|
||||
++length;
|
||||
}
|
||||
}
|
||||
if(length > capacity) {
|
||||
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
||||
return length;
|
||||
}
|
||||
i += 2;
|
||||
dest[0] = scripts[i++];
|
||||
for(int32_t j = 1; j < length; ++j) {
|
||||
script = scripts[i++];
|
||||
// Sorted insertion.
|
||||
for(int32_t k = j;; --k) {
|
||||
// Invariant: dest[k] is free to receive either script or dest[k - 1].
|
||||
if(k > 0 && script < dest[k - 1]) {
|
||||
dest[k] = dest[k - 1];
|
||||
} else {
|
||||
dest[k] = script;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
void
|
||||
CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
|
||||
uint8_t table[256], UErrorCode &errorCode) const {
|
||||
CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
|
||||
UVector32 &ranges, UErrorCode &errorCode) const {
|
||||
makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
|
||||
UBool latinMustMove,
|
||||
UVector32 &ranges, UErrorCode &errorCode) const {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
ranges.removeAllElements();
|
||||
if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Maps each script-or-group range to a new lead byte.
|
||||
uint8_t table[MAX_NUM_SCRIPT_RANGES];
|
||||
uprv_memset(table, 0, sizeof(table));
|
||||
|
||||
{
|
||||
// Set "don't care" values for reserved ranges.
|
||||
int32_t index = scriptsIndex[
|
||||
numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
|
||||
if(index != 0) {
|
||||
table[index] = 0xff;
|
||||
}
|
||||
index = scriptsIndex[
|
||||
numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
|
||||
if(index != 0) {
|
||||
table[index] = 0xff;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize the table.
|
||||
// Never reorder special low and high primary lead bytes.
|
||||
int32_t lowByte;
|
||||
for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {
|
||||
table[lowByte] = lowByte;
|
||||
}
|
||||
// lowByte == 03
|
||||
|
||||
int32_t highByte;
|
||||
for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {
|
||||
table[highByte] = highByte;
|
||||
}
|
||||
// highByte == FE
|
||||
|
||||
// Set intermediate bytes to 0 to indicate that they have not been set yet.
|
||||
for(int32_t i = lowByte; i <= highByte; ++i) {
|
||||
table[i] = 0;
|
||||
}
|
||||
U_ASSERT(scriptStartsLength >= 2);
|
||||
U_ASSERT(scriptStarts[0] == 0);
|
||||
int32_t lowStart = scriptStarts[1];
|
||||
U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
|
||||
int32_t highLimit = scriptStarts[scriptStartsLength - 1];
|
||||
U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
|
||||
|
||||
// Get the set of special reorder codes in the input list.
|
||||
// This supports up to 32 special reorder codes;
|
||||
// This supports a fixed number of special reorder codes;
|
||||
// it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
|
||||
uint32_t specials = 0;
|
||||
for(int32_t i = 0; i < length; ++i) {
|
||||
int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
|
||||
if(0 <= reorderCode && reorderCode <= 31) {
|
||||
if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
|
||||
specials |= (uint32_t)1 << reorderCode;
|
||||
}
|
||||
}
|
||||
|
||||
// Start the reordering with the special low reorder codes that do not occur in the input.
|
||||
for(int32_t i = 0;; i += 3) {
|
||||
if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes.
|
||||
int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST;
|
||||
if(reorderCode < 0) { break; } // Went beyond special reorder codes.
|
||||
if((specials & ((uint32_t)1 << reorderCode)) == 0) {
|
||||
int32_t head = scripts[i];
|
||||
int32_t firstByte = head >> 8;
|
||||
int32_t lastByte = head & 0xff;
|
||||
do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
|
||||
for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
|
||||
int32_t index = scriptsIndex[numScripts + i];
|
||||
if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
|
||||
lowStart = addLowScriptRange(table, index, lowStart);
|
||||
}
|
||||
}
|
||||
|
||||
// Reorder according to the input scripts, continuing from the bottom of the bytes range.
|
||||
// Skip the reserved range before Latin if Latin is the first script,
|
||||
// so that we do not move it unnecessarily.
|
||||
int32_t skippedReserved = 0;
|
||||
if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
|
||||
int32_t index = scriptsIndex[USCRIPT_LATIN];
|
||||
U_ASSERT(index != 0);
|
||||
int32_t start = scriptStarts[index];
|
||||
U_ASSERT(lowStart <= start);
|
||||
skippedReserved = start - lowStart;
|
||||
lowStart = start;
|
||||
}
|
||||
|
||||
// Reorder according to the input scripts, continuing from the bottom of the primary range.
|
||||
int32_t originalLength = length; // length will be decremented if "others" is in the list.
|
||||
UBool hasReorderToEnd = FALSE;
|
||||
for(int32_t i = 0; i < length;) {
|
||||
int32_t script = reorder[i++];
|
||||
if(script == USCRIPT_UNKNOWN) {
|
||||
// Put the remaining scripts at the top.
|
||||
hasReorderToEnd = TRUE;
|
||||
while(i < length) {
|
||||
script = reorder[--length];
|
||||
if(script == USCRIPT_UNKNOWN || // Must occur at most once.
|
||||
|
@ -249,16 +288,13 @@ CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
|
|||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
int32_t index = findScript(script);
|
||||
if(index < 0) { continue; }
|
||||
int32_t head = scripts[index];
|
||||
int32_t firstByte = head >> 8;
|
||||
int32_t lastByte = head & 0xff;
|
||||
if(table[firstByte] != 0) { // Duplicate or equivalent script.
|
||||
int32_t index = getScriptIndex(script);
|
||||
if(index == 0) { continue; }
|
||||
if(table[index] != 0) { // Duplicate or equivalent script.
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
do { table[lastByte--] = highByte--; } while(firstByte <= lastByte);
|
||||
highLimit = addHighScriptRange(table, index, highLimit);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -268,24 +304,83 @@ CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
|
|||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
int32_t index = findScript(script);
|
||||
if(index < 0) { continue; }
|
||||
int32_t head = scripts[index];
|
||||
int32_t firstByte = head >> 8;
|
||||
int32_t lastByte = head & 0xff;
|
||||
if(table[firstByte] != 0) { // Duplicate or equivalent script.
|
||||
int32_t index = getScriptIndex(script);
|
||||
if(index == 0) { continue; }
|
||||
if(table[index] != 0) { // Duplicate or equivalent script.
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
|
||||
lowStart = addLowScriptRange(table, index, lowStart);
|
||||
}
|
||||
|
||||
// Put all remaining scripts into the middle.
|
||||
// Avoid table[0] which must remain 0.
|
||||
for(int32_t i = 1; i <= 0xff; ++i) {
|
||||
if(table[i] == 0) { table[i] = lowByte++; }
|
||||
for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
|
||||
int32_t leadByte = table[i];
|
||||
if(leadByte != 0) { continue; }
|
||||
int32_t start = scriptStarts[i];
|
||||
if(!hasReorderToEnd && start > lowStart) {
|
||||
// No need to move this script.
|
||||
lowStart = start;
|
||||
}
|
||||
lowStart = addLowScriptRange(table, i, lowStart);
|
||||
}
|
||||
U_ASSERT(lowByte == highByte + 1);
|
||||
if(lowStart > highLimit) {
|
||||
if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
|
||||
// Try not skipping the before-Latin reserved range.
|
||||
makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
|
||||
return;
|
||||
}
|
||||
// We need more primary lead bytes than available, despite the reserved ranges.
|
||||
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Turn lead bytes into a list of (limit, offset) pairs.
|
||||
// Encode each pair in one list element:
|
||||
// Upper 16 bits = limit, lower 16 = signed lead byte offset.
|
||||
int32_t offset = 0;
|
||||
for(int32_t i = 1;; ++i) {
|
||||
int32_t nextOffset = offset;
|
||||
while(i < scriptStartsLength - 1) {
|
||||
int32_t newLeadByte = table[i];
|
||||
if(newLeadByte == 0xff) {
|
||||
// "Don't care" lead byte for reserved range, continue with current offset.
|
||||
} else {
|
||||
nextOffset = newLeadByte - (scriptStarts[i] >> 8);
|
||||
if(nextOffset != offset) { break; }
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if(offset != 0 || i < scriptStartsLength - 1) {
|
||||
ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
|
||||
}
|
||||
if(i == scriptStartsLength - 1) { break; }
|
||||
offset = nextOffset;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Places the script range scriptStarts[index]..scriptStarts[index + 1]
 * at the current low boundary of the reordered primary-weight space.
 * Stores the range's new lead byte in table[index] and returns the
 * updated low boundary (the position right after the relocated range).
 *
 * Boundaries are 16-bit values like the scriptStarts entries:
 * lead byte in bits 15..8, second byte in bits 7..0.
 *
 * @param table    per-range array, indexed like scriptStarts, receiving the new lead byte
 * @param index    index of the range in scriptStarts
 * @param lowStart current low boundary
 * @return the new low boundary after this range
 */
int32_t
|
||||
CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
|
||||
int32_t start = scriptStarts[index];
|
||||
// The range's original second byte is lower than the second byte of lowStart:
// move on to the next lead byte before assigning.
// NOTE(review): presumably because only lead bytes are remapped and the range
// keeps its second bytes, so it must not overlap what was placed before -- confirm.
if((start & 0xff) < (lowStart & 0xff)) {
|
||||
lowStart += 0x100;
|
||||
}
|
||||
// New lead byte for this range.
table[index] = (uint8_t)(lowStart >> 8);
|
||||
int32_t limit = scriptStarts[index + 1];
|
||||
// Advance the lead byte by the number of lead bytes between the range's start and limit,
// and take the second byte from the range's original limit.
lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
|
||||
return lowStart;
|
||||
}
|
||||
|
||||
/**
 * Places the script range scriptStarts[index]..scriptStarts[index + 1]
 * directly below the current high boundary of the reordered primary-weight space.
 * Stores the range's new lead byte in table[index] and returns the
 * updated high boundary (the new start of the relocated range).
 *
 * Boundaries are 16-bit values like the scriptStarts entries:
 * lead byte in bits 15..8, second byte in bits 7..0.
 *
 * @param table     per-range array, indexed like scriptStarts, receiving the new lead byte
 * @param index     index of the range in scriptStarts
 * @param highLimit current high boundary (exclusive limit)
 * @return the new high boundary below this range
 */
int32_t
|
||||
CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
|
||||
int32_t limit = scriptStarts[index + 1];
|
||||
// The range's original limit has a higher second byte than highLimit:
// step down one lead byte before assigning.
// NOTE(review): presumably the mirror image of the adjustment in addLowScriptRange -- confirm.
if((limit & 0xff) > (highLimit & 0xff)) {
|
||||
highLimit -= 0x100;
|
||||
}
|
||||
int32_t start = scriptStarts[index];
|
||||
// Move the lead byte down by the number of lead bytes between the range's start and limit,
// and take the second byte from the range's original start.
highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
|
||||
// New lead byte for this range.
table[index] = (uint8_t)(highLimit >> 8);
|
||||
return highLimit;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2014, International Business Machines
|
||||
* Copyright (C) 2010-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationdata.h
|
||||
|
@ -16,6 +16,7 @@
|
|||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
#include "unicode/ucol.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "collation.h"
|
||||
#include "normalizer2impl.h"
|
||||
|
@ -25,6 +26,8 @@ struct UDataMemory;
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UVector32;
|
||||
|
||||
/**
|
||||
* Collation data container.
|
||||
* Immutable data created by a CollationDataBuilder, or loaded from a file,
|
||||
|
@ -33,6 +36,20 @@ U_NAMESPACE_BEGIN
|
|||
* Includes data for the collation base (root/default), aliased if this is not the base.
|
||||
*/
|
||||
struct U_I18N_API CollationData : public UMemory {
|
||||
// Note: The ucadata.icu loader could discover the reserved ranges by setting an array
|
||||
// parallel with the ranges, and resetting ranges that are indexed.
|
||||
// The reordering builder code could clone the resulting template array.
|
||||
// Special reorder codes for the reserved primary ranges before and after Latin.
// makeReorderRanges() looks them up in scriptsIndex at
// numScripts + (code - UCOL_REORDER_CODE_FIRST) and marks their ranges as "don't care".
enum {
|
||||
REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,
|
||||
REORDER_RESERVED_AFTER_LATIN
|
||||
};
|
||||
|
||||
enum {
|
||||
/** Number of special reorder codes (offsets from UCOL_REORDER_CODE_FIRST) that the reordering code handles. */
MAX_NUM_SPECIAL_REORDER_CODES = 8,
|
||||
/** C++ only: the data reader checks scriptStartsLength against this limit. */
|
||||
MAX_NUM_SCRIPT_RANGES = 256
|
||||
};
|
||||
|
||||
CollationData(const Normalizer2Impl &nfc)
|
||||
: trie(NULL),
|
||||
ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
|
||||
|
@ -43,7 +60,7 @@ struct U_I18N_API CollationData : public UMemory {
|
|||
compressibleBytes(NULL),
|
||||
unsafeBackwardSet(NULL),
|
||||
fastLatinTable(NULL), fastLatinTableLength(0),
|
||||
scripts(NULL), scriptsLength(0),
|
||||
numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),
|
||||
rootElements(NULL), rootElementsLength(0) {}
|
||||
|
||||
uint32_t getCE32(UChar32 c) const {
|
||||
|
@ -137,13 +154,17 @@ struct U_I18N_API CollationData : public UMemory {
|
|||
int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Writes the permutation table for the given reordering of scripts and groups,
|
||||
* mapping from default-order primary-weight lead bytes to reordered lead bytes.
|
||||
* Writes the permutation of primary-weight ranges
|
||||
* for the given reordering of scripts and groups.
|
||||
* The caller checks for illegal arguments and
|
||||
* takes care of [DEFAULT] and memory allocation.
|
||||
*
|
||||
* Each list element will be a (limit, offset) pair as described
|
||||
* for the CollationSettings::reorderRanges.
|
||||
* The list will be empty if no ranges are reordered.
|
||||
*/
|
||||
void makeReorderTable(const int32_t *reorder, int32_t length,
|
||||
uint8_t table[256], UErrorCode &errorCode) const;
|
||||
void makeReorderRanges(const int32_t *reorder, int32_t length,
|
||||
UVector32 &ranges, UErrorCode &errorCode) const;
|
||||
|
||||
/** @see jamoCE32s */
|
||||
static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
|
||||
|
@ -195,22 +216,26 @@ struct U_I18N_API CollationData : public UMemory {
|
|||
* Data for scripts and reordering groups.
|
||||
* Uses include building a reordering permutation table and
|
||||
* providing script boundaries to AlphabeticIndex.
|
||||
*
|
||||
* This data is a sorted list of primary-weight lead byte ranges (reordering groups),
|
||||
* each with a list of pairs sorted in base collation order;
|
||||
* each pair contains a script/reorder code and the lowest primary weight for that script.
|
||||
*
|
||||
* Data structure:
|
||||
* - Each reordering group is encoded in n+2 16-bit integers.
|
||||
* - First integer:
|
||||
* Bits 15..8: First byte of the reordering group's range.
|
||||
* Bits 7..0: Last byte of the reordering group's range.
|
||||
* - Second integer:
|
||||
* Length n of the list of script/reordering codes.
|
||||
* - Each further integer is a script or reordering code.
|
||||
*/
|
||||
const uint16_t *scripts;
|
||||
int32_t scriptsLength;
|
||||
int32_t numScripts;
|
||||
/**
|
||||
* The length of scriptsIndex is numScripts+16.
|
||||
* It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
|
||||
* 16 special reorder codes (not all used) are mapped starting at numScripts.
|
||||
* Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
|
||||
* There are special codes at the end for reorder-reserved primary ranges.
|
||||
*
|
||||
* Multiple scripts may share a range and index, for example Hira & Kana.
|
||||
*/
|
||||
const uint16_t *scriptsIndex;
|
||||
/**
|
||||
* Start primary weight (top 16 bits only) for a group/script/reserved range
|
||||
* indexed by scriptsIndex.
|
||||
* The first range (separators & terminators) and the last range (trailing weights)
|
||||
* are not reorderable, and no scriptsIndex entry points to them.
|
||||
*/
|
||||
const uint16_t *scriptStarts;
|
||||
int32_t scriptStartsLength;
|
||||
|
||||
/**
|
||||
* Collation elements in the root collator.
|
||||
|
@ -221,7 +246,12 @@ struct U_I18N_API CollationData : public UMemory {
|
|||
int32_t rootElementsLength;
|
||||
|
||||
private:
|
||||
int32_t findScript(int32_t script) const;
|
||||
int32_t getScriptIndex(int32_t script) const;
|
||||
void makeReorderRanges(const int32_t *reorder, int32_t length,
|
||||
UBool latinMustMove,
|
||||
UVector32 &ranges, UErrorCode &errorCode) const;
|
||||
int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
|
||||
int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationdatabuilder.cpp
|
||||
|
@ -1213,8 +1213,10 @@ CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
|
|||
if(base != NULL) {
|
||||
data.numericPrimary = base->numericPrimary;
|
||||
data.compressibleBytes = base->compressibleBytes;
|
||||
data.scripts = base->scripts;
|
||||
data.scriptsLength = base->scriptsLength;
|
||||
data.numScripts = base->numScripts;
|
||||
data.scriptsIndex = base->scriptsIndex;
|
||||
data.scriptStarts = base->scriptStarts;
|
||||
data.scriptStartsLength = base->scriptStartsLength;
|
||||
}
|
||||
buildFastLatinTable(data, errorCode);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationdatareader.cpp
|
||||
|
@ -102,6 +102,8 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
|
|||
const CollationData *baseData = base == NULL ? NULL : base->data;
|
||||
const int32_t *reorderCodes = NULL;
|
||||
int32_t reorderCodesLength = 0;
|
||||
const uint32_t *reorderRanges = NULL;
|
||||
int32_t reorderRangesLength = 0;
|
||||
index = IX_REORDER_CODES_OFFSET;
|
||||
offset = getIndex(inIndexes, indexesLength, index);
|
||||
length = getIndex(inIndexes, indexesLength, index + 1) - offset;
|
||||
|
@ -114,6 +116,20 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
|
|||
}
|
||||
reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
|
||||
reorderCodesLength = length / 4;
|
||||
|
||||
// The reorderRanges (if any) are the trailing reorderCodes entries.
|
||||
// Split the array at the boundary.
|
||||
// Script or reorder codes do not exceed 16-bit values.
|
||||
// Range limits are stored in the upper 16 bits, and are never 0.
|
||||
while(reorderRangesLength < reorderCodesLength &&
|
||||
(reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
|
||||
++reorderRangesLength;
|
||||
}
|
||||
U_ASSERT(reorderRangesLength < reorderCodesLength);
|
||||
if(reorderRangesLength != 0) {
|
||||
reorderCodesLength -= reorderRangesLength;
|
||||
reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
|
||||
}
|
||||
}
|
||||
|
||||
// There should be a reorder table only if there are reorder codes.
|
||||
|
@ -337,13 +353,32 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
|
|||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
|
||||
data->scriptsLength = length / 2;
|
||||
const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
|
||||
int32_t scriptsLength = length / 2;
|
||||
data->numScripts = scripts[0];
|
||||
// There must be enough entries for both arrays, including more than two range starts.
|
||||
data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
|
||||
if(data->scriptStartsLength <= 2 ||
|
||||
CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
data->scriptsIndex = scripts + 1;
|
||||
data->scriptStarts = scripts + 1 + data->numScripts + 16;
|
||||
if(!(data->scriptStarts[0] == 0 &&
|
||||
data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
|
||||
data->scriptStarts[data->scriptStartsLength - 1] ==
|
||||
(Collation::TRAIL_WEIGHT_BYTE << 8))) {
|
||||
errorCode = U_INVALID_FORMAT_ERROR;
|
||||
return;
|
||||
}
|
||||
} else if(data == NULL) {
|
||||
// Nothing to do.
|
||||
} else if(baseData != NULL) {
|
||||
data->scripts = baseData->scripts;
|
||||
data->scriptsLength = baseData->scriptsLength;
|
||||
data->numScripts = baseData->numScripts;
|
||||
data->scriptsIndex = baseData->scriptsIndex;
|
||||
data->scriptStarts = baseData->scriptStarts;
|
||||
data->scriptStartsLength = baseData->scriptStartsLength;
|
||||
}
|
||||
|
||||
index = IX_COMPRESSIBLE_BYTES_OFFSET;
|
||||
|
@ -393,16 +428,10 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
|
|||
return;
|
||||
}
|
||||
|
||||
if(reorderCodesLength == 0 || reorderTable != NULL) {
|
||||
settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable);
|
||||
} else {
|
||||
uint8_t table[256];
|
||||
baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
if(reorderCodesLength != 0) {
|
||||
settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
|
||||
reorderRanges, reorderRangesLength,
|
||||
reorderTable, errorCode);
|
||||
}
|
||||
|
||||
settings->fastLatinOptions = CollationFastLatin::getOptions(
|
||||
|
@ -422,7 +451,7 @@ CollationDataReader::isAcceptable(void *context,
|
|||
pInfo->dataFormat[1] == 0x43 &&
|
||||
pInfo->dataFormat[2] == 0x6f &&
|
||||
pInfo->dataFormat[3] == 0x6c &&
|
||||
pInfo->formatVersion[0] == 4
|
||||
pInfo->formatVersion[0] == 5
|
||||
) {
|
||||
UVersionInfo *version = static_cast<UVersionInfo *>(context);
|
||||
if(version != NULL) {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationdatareader.h
|
||||
|
@ -109,7 +109,7 @@ private:
|
|||
|
||||
/*
|
||||
* Format of collation data (ucadata.icu, binary data in coll/ *.res files).
|
||||
* Format version 4.1.
|
||||
* Format version 5.
|
||||
*
|
||||
* The root collation data is stored in the ucadata.icu file.
|
||||
* Tailorings are stored inside .res resource bundle files, with a complete file header.
|
||||
|
@ -151,10 +151,30 @@ private:
|
|||
* int32_t reorderCodes[]; -- empty in root
|
||||
* The list of script and reordering codes.
|
||||
*
|
||||
* Beginning with format version 5, this array may optionally
|
||||
* have trailing entries with a full list of reorder ranges
|
||||
* as described for CollationSettings::reorderRanges.
|
||||
*
|
||||
* Script or reorder codes are first and do not exceed 16-bit values.
|
||||
* Range limits are stored in the upper 16 bits, and are never 0.
|
||||
* Split this array into reorder codes and ranges at the first entry
|
||||
* with non-zero upper 16 bits.
|
||||
*
|
||||
* If the ranges are missing but needed for split-reordered primary lead bytes,
|
||||
* then they are regenerated at load time.
|
||||
*
|
||||
* uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
|
||||
* Primary-weight lead byte permutation table.
|
||||
* Normally present when the reorderCodes are, but can be built at load time.
|
||||
*
|
||||
* Beginning with format version 5, a 0 entry at a non-zero index
|
||||
* (which is otherwise an illegal value)
|
||||
* means that the primary lead byte is "split"
|
||||
* (there are different offsets for primaries that share that lead byte)
|
||||
* and the reordering offset must be determined via the reorder ranges
|
||||
* that are either stored as part of the reorderCodes array
|
||||
* or regenerated at load time.
|
||||
*
|
||||
* UTrie2 trie; -- see utrie2_impl.h and utrie2.h
|
||||
* The trie holds the main collation data. Each code point is mapped to a 32-bit value.
|
||||
* It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
|
||||
|
@ -194,6 +214,13 @@ private:
|
|||
* See the CollationFastLatin class.
|
||||
*
|
||||
* uint16_t scripts[]; -- empty in all tailorings
|
||||
* Format version 5:
|
||||
* uint16_t numScripts;
|
||||
* uint16_t scriptsIndex[numScripts+16];
|
||||
* uint16_t scriptStarts[];
|
||||
* See CollationData::numScripts etc.
|
||||
*
|
||||
* Format version 4:
|
||||
* Table of the reordering groups with their first and last lead bytes,
|
||||
* and their script and reordering codes.
|
||||
* See CollationData::scripts.
|
||||
|
@ -202,15 +229,20 @@ private:
|
|||
* Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
|
||||
*
|
||||
* -----------------
|
||||
* Changes for formatVersion 4.1
|
||||
* Changes for formatVersion 5 (ICU 55)
|
||||
*
|
||||
* Reordering moves single scripts, not groups of scripts.
|
||||
* Reorder ranges are optionally appended to the reorderCodes,
|
||||
* and a 0 entry in the reorderTable indicates a split lead byte.
|
||||
* The scripts data has a new format.
|
||||
*
|
||||
* The rootElements may contain secondary and tertiary weights below common=05.
|
||||
* (Used for small Hiragana letters.)
|
||||
* Where it occurs, there is also an explicit unit with common secondary & tertiary weights.
|
||||
* There are no other data structure changes, but builder code needs to be able to handle such data.
|
||||
*
|
||||
* ICU 55 ucadata.icu uses formatVersion 4.1.
|
||||
* ICU 55 tailoring data continues to use formatVersion 4.0.
|
||||
* The collation element for the merge separator code point U+FFFE
|
||||
* does not necessarily have special, unique secondary/tertiary weights any more.
|
||||
*/
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationdatawriter.cpp
|
||||
|
@ -68,7 +68,7 @@ static const UDataInfo dataInfo = {
|
|||
0,
|
||||
|
||||
{ 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
|
||||
{ 4, 0, 0, 0 }, // formatVersion
|
||||
{ 5, 0, 0, 0 }, // formatVersion
|
||||
{ 6, 3, 0, 0 } // dataVersion
|
||||
};
|
||||
|
||||
|
@ -157,6 +157,23 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
|
|||
}
|
||||
}
|
||||
|
||||
UVector32 codesAndRanges(errorCode);
|
||||
const int32_t *reorderCodes = settings.reorderCodes;
|
||||
int32_t reorderCodesLength = settings.reorderCodesLength;
|
||||
if(settings.hasReordering() &&
|
||||
CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
|
||||
// Rebuild the full list of reorder ranges.
|
||||
// The list in the settings is truncated for efficiency.
|
||||
data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
|
||||
// Write the codes, then the ranges.
|
||||
for(int32_t i = 0; i < reorderCodesLength; ++i) {
|
||||
codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
|
||||
}
|
||||
if(U_FAILURE(errorCode)) { return 0; }
|
||||
reorderCodes = codesAndRanges.getBuffer();
|
||||
reorderCodesLength = codesAndRanges.size();
|
||||
}
|
||||
|
||||
int32_t headerSize;
|
||||
if(isBase) {
|
||||
headerSize = 0; // udata_create() writes the header
|
||||
|
@ -171,7 +188,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
|
|||
if(hasMappings && data.cesLength != 0) {
|
||||
// Sum of the sizes of the data items which are
|
||||
// not automatically multiples of 8 bytes and which are placed before the CEs.
|
||||
int32_t sum = headerSize + (indexesLength + settings.reorderCodesLength) * 4;
|
||||
int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
|
||||
if((sum & 7) != 0) {
|
||||
// We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
|
||||
// We add to the header size here.
|
||||
|
@ -211,7 +228,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
|
|||
}
|
||||
|
||||
indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
|
||||
totalSize += settings.reorderCodesLength * 4;
|
||||
totalSize += reorderCodesLength * 4;
|
||||
|
||||
indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
|
||||
if(settings.reorderTable != NULL) {
|
||||
|
@ -280,9 +297,13 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
|
|||
indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
|
||||
totalSize += fastLatinTableLength * 2;
|
||||
|
||||
UnicodeString scripts;
|
||||
indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
|
||||
if(isBase) {
|
||||
totalSize += data.scriptsLength * 2;
|
||||
scripts.append((UChar)data.numScripts);
|
||||
scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
|
||||
scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
|
||||
totalSize += scripts.length() * 2;
|
||||
}
|
||||
|
||||
indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
|
||||
|
@ -299,7 +320,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
|
|||
}
|
||||
|
||||
uprv_memcpy(dest, indexes, indexesLength * 4);
|
||||
copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, settings.reorderCodes, dest);
|
||||
copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
|
||||
copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
|
||||
// The trie has already been serialized into the dest buffer.
|
||||
copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
|
||||
|
@ -308,7 +329,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
|
|||
copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
|
||||
// The unsafeBackwardSet has already been serialized into the dest buffer.
|
||||
copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
|
||||
copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, data.scripts, dest);
|
||||
copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
|
||||
copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
|
||||
|
||||
return headerSize + totalSize;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationfastlatin.cpp
|
||||
|
@ -36,33 +36,50 @@ CollationFastLatin::getOptions(const CollationData *data, const CollationSetting
|
|||
// lowest long mini primary.
|
||||
miniVarTop = MIN_LONG - 1;
|
||||
} else {
|
||||
uint32_t v1 = settings.variableTop >> 24;
|
||||
int32_t headerLength = *table & 0xff;
|
||||
int32_t i = headerLength - 1;
|
||||
if(i <= 0 || v1 > (table[i] & 0x7f)) {
|
||||
int32_t i = 1 + settings.getMaxVariable();
|
||||
if(i >= headerLength) {
|
||||
return -1; // variableTop >= digits, should not occur
|
||||
}
|
||||
while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; }
|
||||
// In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
|
||||
// Shift right to make it comparable with long mini primaries in bits 15..3.
|
||||
miniVarTop = (table[i] & 0xff80) >> 4;
|
||||
miniVarTop = table[i];
|
||||
}
|
||||
|
||||
const uint8_t *reorderTable = settings.reorderTable;
|
||||
if(reorderTable != NULL) {
|
||||
const uint16_t *scripts = data->scripts;
|
||||
int32_t length = data->scriptsLength;
|
||||
uint32_t prevLastByte = 0;
|
||||
for(int32_t i = 0; i < length;) {
|
||||
// reordered last byte of the group
|
||||
uint32_t lastByte = reorderTable[scripts[i] & 0xff];
|
||||
if(lastByte < prevLastByte) {
|
||||
// The permutation affects the groups up to Latin.
|
||||
return -1;
|
||||
UBool digitsAreReordered = FALSE;
|
||||
if(settings.hasReordering()) {
|
||||
uint32_t prevStart = 0;
|
||||
uint32_t beforeDigitStart = 0;
|
||||
uint32_t digitStart = 0;
|
||||
uint32_t afterDigitStart = 0;
|
||||
for(int32_t group = UCOL_REORDER_CODE_FIRST;
|
||||
group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
|
||||
++group) {
|
||||
uint32_t start = data->getFirstPrimaryForGroup(group);
|
||||
start = settings.reorder(start);
|
||||
if(group == UCOL_REORDER_CODE_DIGIT) {
|
||||
beforeDigitStart = prevStart;
|
||||
digitStart = start;
|
||||
} else if(start != 0) {
|
||||
if(start < prevStart) {
|
||||
// The permutation affects the groups up to Latin.
|
||||
return -1;
|
||||
}
|
||||
// In the future, there might be a special group between digits & Latin.
|
||||
if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
|
||||
afterDigitStart = start;
|
||||
}
|
||||
prevStart = start;
|
||||
}
|
||||
if(scripts[i + 2] == USCRIPT_LATIN) { break; }
|
||||
i = i + 2 + scripts[i + 1];
|
||||
prevLastByte = lastByte;
|
||||
}
|
||||
uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
|
||||
latinStart = settings.reorder(latinStart);
|
||||
if(latinStart < prevStart) {
|
||||
return -1;
|
||||
}
|
||||
if(afterDigitStart == 0) {
|
||||
afterDigitStart = latinStart;
|
||||
}
|
||||
if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
|
||||
digitsAreReordered = TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -78,7 +95,7 @@ CollationFastLatin::getOptions(const CollationData *data, const CollationSetting
|
|||
}
|
||||
primaries[c] = (uint16_t)p;
|
||||
}
|
||||
if((settings.options & CollationSettings::NUMERIC) != 0) {
|
||||
if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
|
||||
// Bail out for digits.
|
||||
for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationfastlatin.h
|
||||
|
@ -31,7 +31,7 @@ public:
|
|||
* When the major version number of the main data format changes,
|
||||
* we can reset this fast Latin version to 1.
|
||||
*/
|
||||
static const uint16_t VERSION = 1;
|
||||
static const uint16_t VERSION = 2;
|
||||
|
||||
static const int32_t LATIN_MAX = 0x17f;
|
||||
static const int32_t LATIN_LIMIT = LATIN_MAX + 1;
|
||||
|
@ -252,7 +252,7 @@ private:
|
|||
|
||||
/*
|
||||
* Format of the CollationFastLatin data table.
|
||||
* CollationFastLatin::VERSION = 1.
|
||||
* CollationFastLatin::VERSION = 2.
|
||||
*
|
||||
* This table contains data for a Latin-text collation fastpath.
|
||||
* The data is stored as an array of uint16_t which contains the following parts.
|
||||
|
@ -262,6 +262,12 @@ private:
|
|||
* 7..0: length of the header
|
||||
*
|
||||
* uint16_t varTops[header length - 1]
|
||||
* Version 2:
|
||||
* varTops[m] is the highest CollationFastLatin long-primary weight
|
||||
* of supported maxVariable group m
|
||||
* (special reorder group space, punct, symbol, currency).
|
||||
*
|
||||
* Version 1:
|
||||
* Each of these values maps the variable top lead byte of a supported maxVariable group
|
||||
* to the highest CollationFastLatin long-primary weight.
|
||||
* The values are stored in ascending order.
|
||||
|
@ -293,6 +299,16 @@ private:
|
|||
* Each list is terminated by an entry with CONTR_CHAR_MASK.
|
||||
* Each list starts with such an entry which also contains the default result
|
||||
* for when there is no contraction match.
|
||||
*
|
||||
* -----------------
|
||||
* Changes for version 2 (ICU 55)
|
||||
*
|
||||
* Special reorder groups do not necessarily start on whole primary lead bytes any more.
|
||||
* Therefore, the varTops data has a new format:
|
||||
* Version 1 stored the lead bytes of the highest root primaries for
|
||||
* the maxVariable-supported special reorder groups.
|
||||
* Now the top 16 bits would need to be stored,
|
||||
* and it is simpler to store only the fast-Latin weights.
|
||||
*/
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationfastlatinbuilder.cpp
|
||||
|
@ -136,42 +136,26 @@ CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorC
|
|||
UBool
|
||||
CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return FALSE; }
|
||||
result.append(0); // reserved for version & headerLength
|
||||
headerLength = 1 + NUM_SPECIAL_GROUPS;
|
||||
uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;
|
||||
result.append((UChar)r0);
|
||||
// The first few reordering groups should be special groups
|
||||
// (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
|
||||
for(int32_t i = 0;;) {
|
||||
if(i >= data.scriptsLength) {
|
||||
// no Latn script
|
||||
errorCode = U_INTERNAL_PROGRAM_ERROR;
|
||||
for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
|
||||
lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i);
|
||||
if(lastSpecialPrimaries[i] == 0) {
|
||||
// missing data
|
||||
return FALSE;
|
||||
}
|
||||
uint32_t head = data.scripts[i];
|
||||
uint32_t lastByte = head & 0xff; // last primary byte in the group
|
||||
int32_t group = data.scripts[i + 2];
|
||||
if(group == UCOL_REORDER_CODE_DIGIT) {
|
||||
firstDigitPrimary = (head & 0xff00) << 16;
|
||||
headerLength = result.length();
|
||||
uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;
|
||||
result.setCharAt(0, (UChar)r0);
|
||||
} else if(group == USCRIPT_LATIN) {
|
||||
if(firstDigitPrimary == 0) {
|
||||
// no digit group
|
||||
errorCode = U_INTERNAL_PROGRAM_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
firstLatinPrimary = (head & 0xff00) << 16;
|
||||
lastLatinPrimary = (lastByte << 24) | 0xffffff;
|
||||
break;
|
||||
} else if(firstDigitPrimary == 0) {
|
||||
// a group below digits
|
||||
if(lastByte > 0x7f) {
|
||||
// We only use 7 bits for the last byte of a below-digits group.
|
||||
// This does not warrant an errorCode, but we do not build a fast Latin table.
|
||||
return FALSE;
|
||||
}
|
||||
result.append((UChar)lastByte);
|
||||
}
|
||||
i = i + 2 + data.scripts[i + 1];
|
||||
result.append(0); // reserve a slot for this group
|
||||
}
|
||||
|
||||
firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT);
|
||||
firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN);
|
||||
lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN);
|
||||
if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
|
||||
// missing data
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
@ -187,23 +171,21 @@ CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const {
|
|||
}
|
||||
// Both or neither must be potentially-variable,
|
||||
// so that we can test only one and determine if both are variable.
|
||||
if(p >= firstDigitPrimary) {
|
||||
return q >= firstDigitPrimary;
|
||||
} else if(q >= firstDigitPrimary) {
|
||||
uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
|
||||
if(p > lastVariablePrimary) {
|
||||
return q > lastVariablePrimary;
|
||||
} else if(q > lastVariablePrimary) {
|
||||
return FALSE;
|
||||
}
|
||||
// Both will be encoded with long mini primaries.
|
||||
// They must be in the same special reordering group,
|
||||
// so that we can test only one and determine if both are variable.
|
||||
p >>= 24; // first primary byte
|
||||
q >>= 24;
|
||||
U_ASSERT(p != 0 && q != 0);
|
||||
U_ASSERT(p <= result[headerLength - 1]); // the loop will terminate
|
||||
for(int32_t i = 1;; ++i) {
|
||||
uint32_t lastByte = result[i];
|
||||
if(p <= lastByte) {
|
||||
return q <= lastByte;
|
||||
} else if(q <= lastByte) {
|
||||
for(int32_t i = 0;; ++i) { // will terminate
|
||||
uint32_t lastPrimary = lastSpecialPrimaries[i];
|
||||
if(p <= lastPrimary) {
|
||||
return q <= lastPrimary;
|
||||
} else if(q <= lastPrimary) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
@ -451,8 +433,8 @@ CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
|
|||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
int32_t group = 1;
|
||||
uint32_t lastGroupByte = result[group];
|
||||
int32_t group = 0;
|
||||
uint32_t lastGroupPrimary = lastSpecialPrimaries[group];
|
||||
// The lowest unique CE must be at least a secondary CE.
|
||||
U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0);
|
||||
uint32_t prevPrimary = 0;
|
||||
|
@ -466,16 +448,15 @@ CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
|
|||
// (uniqueCEs does not store case bits.)
|
||||
uint32_t p = (uint32_t)(ce >> 32);
|
||||
if(p != prevPrimary) {
|
||||
uint32_t p1 = p >> 24;
|
||||
while(p1 > lastGroupByte) {
|
||||
while(p > lastGroupPrimary) {
|
||||
U_ASSERT(pri <= CollationFastLatin::MAX_LONG);
|
||||
// Add the last "long primary" in or before the group
|
||||
// into the upper 9 bits of the group entry.
|
||||
result.setCharAt(group, (UChar)((pri << 4) | lastGroupByte));
|
||||
if(++group < headerLength) { // group is 1-based
|
||||
lastGroupByte = result[group];
|
||||
// Set the group's header entry to the
|
||||
// last "long primary" in or before the group.
|
||||
result.setCharAt(1 + group, (UChar)pri);
|
||||
if(++group < NUM_SPECIAL_GROUPS) {
|
||||
lastGroupPrimary = lastSpecialPrimaries[group];
|
||||
} else {
|
||||
lastGroupByte = 0xff;
|
||||
lastGroupPrimary = 0xffffffff;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationfastlatinbuilder.h
|
||||
|
@ -39,6 +39,9 @@ public:
|
|||
int32_t lengthOfTable() const { return result.length(); }
|
||||
|
||||
private:
|
||||
// space, punct, symbol, currency (not digit)
|
||||
enum { NUM_SPECIAL_GROUPS = UCOL_REORDER_CODE_CURRENCY - UCOL_REORDER_CODE_FIRST + 1 };
|
||||
|
||||
UBool loadGroups(const CollationData &data, UErrorCode &errorCode);
|
||||
UBool inSameGroup(uint32_t p, uint32_t q) const;
|
||||
|
||||
|
@ -73,7 +76,8 @@ private:
|
|||
/** One 16-bit mini CE per unique CE. */
|
||||
uint16_t *miniCEs;
|
||||
|
||||
// These are constant for a given list of CollationData.scripts.
|
||||
// These are constant for a given root collator.
|
||||
uint32_t lastSpecialPrimaries[NUM_SPECIAL_GROUPS];
|
||||
uint32_t firstDigitPrimary;
|
||||
uint32_t firstLatinPrimary;
|
||||
uint32_t lastLatinPrimary;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 1999-2014, International Business Machines
|
||||
* Copyright (C) 1999-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
* file name: collationfcd.cpp
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationkeys.cpp
|
||||
|
@ -246,7 +246,6 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
|||
// +1 so that we can use "<" and primary ignorables test out early.
|
||||
variableTop = settings.variableTop + 1;
|
||||
}
|
||||
const uint8_t *reorderTable = settings.reorderTable;
|
||||
|
||||
uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
|
||||
|
||||
|
@ -255,7 +254,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
|||
SortKeyLevel tertiaries;
|
||||
SortKeyLevel quaternaries;
|
||||
|
||||
uint32_t compressedP1 = 0; // 0==no compression; otherwise reordered compressible lead byte
|
||||
uint32_t prevReorderedPrimary = 0; // 0==no compression
|
||||
int32_t commonCases = 0;
|
||||
int32_t commonSecondaries = 0;
|
||||
int32_t commonTertiaries = 0;
|
||||
|
@ -284,14 +283,15 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
|||
}
|
||||
do {
|
||||
if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) {
|
||||
uint32_t p1 = p >> 24;
|
||||
if(reorderTable != NULL) { p1 = reorderTable[p1]; }
|
||||
if(p1 >= QUAT_SHIFTED_LIMIT_BYTE) {
|
||||
if(settings.hasReordering()) {
|
||||
p = settings.reorder(p);
|
||||
}
|
||||
if((p >> 24) >= QUAT_SHIFTED_LIMIT_BYTE) {
|
||||
// Prevent shifted primary lead bytes from
|
||||
// overlapping with the common compression range.
|
||||
quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE);
|
||||
}
|
||||
quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff));
|
||||
quaternaries.appendWeight32(p);
|
||||
}
|
||||
do {
|
||||
ce = iter.nextCE(errorCode);
|
||||
|
@ -304,11 +304,15 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
|||
// If ce==NO_CE, then write nothing for the primary level but
|
||||
// terminate compression on all levels and then exit the loop.
|
||||
if(p > Collation::NO_CE_PRIMARY && (levels & Collation::PRIMARY_LEVEL_FLAG) != 0) {
|
||||
// Test the un-reordered primary for compressibility.
|
||||
UBool isCompressible = compressibleBytes[p >> 24];
|
||||
if(settings.hasReordering()) {
|
||||
p = settings.reorder(p);
|
||||
}
|
||||
uint32_t p1 = p >> 24;
|
||||
if(reorderTable != NULL) { p1 = reorderTable[p1]; }
|
||||
if(p1 != compressedP1) {
|
||||
if(compressedP1 != 0) {
|
||||
if(p1 < compressedP1) {
|
||||
if(!isCompressible || p1 != (prevReorderedPrimary >> 24)) {
|
||||
if(prevReorderedPrimary != 0) {
|
||||
if(p < prevReorderedPrimary) {
|
||||
// No primary compression terminator
|
||||
// at the end of the level or merged segment.
|
||||
if(p1 > Collation::MERGE_SEPARATOR_BYTE) {
|
||||
|
@ -319,12 +323,10 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
|
|||
}
|
||||
}
|
||||
sink.Append(p1);
|
||||
// Test the un-reordered lead byte for compressibility but
|
||||
// remember the reordered lead byte.
|
||||
if(compressibleBytes[p >> 24]) {
|
||||
compressedP1 = p1;
|
||||
if(isCompressible) {
|
||||
prevReorderedPrimary = p;
|
||||
} else {
|
||||
compressedP1 = 0;
|
||||
prevReorderedPrimary = 0;
|
||||
}
|
||||
}
|
||||
char p2 = (char)(p >> 16);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationruleparser.cpp
|
||||
|
@ -706,17 +706,7 @@ CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &error
|
|||
if(U_FAILURE(errorCode)) { return; }
|
||||
i = limit;
|
||||
}
|
||||
int32_t length = reorderCodes.size();
|
||||
if(length == 1 && reorderCodes.elementAti(0) == UCOL_REORDER_CODE_NONE) {
|
||||
settings->resetReordering();
|
||||
return;
|
||||
}
|
||||
uint8_t table[256];
|
||||
baseData->makeReorderTable(reorderCodes.getBuffer(), length, table, errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(!settings->setReordering(reorderCodes.getBuffer(), length, table)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
|
||||
}
|
||||
|
||||
static const char *const gSpecialReorderCodes[] = {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationsettings.cpp
|
||||
|
@ -16,10 +16,12 @@
|
|||
#include "unicode/ucol.h"
|
||||
#include "cmemory.h"
|
||||
#include "collation.h"
|
||||
#include "collationdata.h"
|
||||
#include "collationsettings.h"
|
||||
#include "sharedobject.h"
|
||||
#include "uassert.h"
|
||||
#include "umutex.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -27,19 +29,12 @@ CollationSettings::CollationSettings(const CollationSettings &other)
|
|||
: SharedObject(other),
|
||||
options(other.options), variableTop(other.variableTop),
|
||||
reorderTable(NULL),
|
||||
minHighNoReorder(other.minHighNoReorder),
|
||||
reorderRanges(NULL), reorderRangesLength(0),
|
||||
reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
|
||||
fastLatinOptions(other.fastLatinOptions) {
|
||||
int32_t length = other.reorderCodesLength;
|
||||
if(length == 0) {
|
||||
U_ASSERT(other.reorderTable == NULL);
|
||||
} else {
|
||||
U_ASSERT(other.reorderTable != NULL);
|
||||
if(other.reorderCodesCapacity == 0) {
|
||||
aliasReordering(other.reorderCodes, length, other.reorderTable);
|
||||
} else {
|
||||
setReordering(other.reorderCodes, length, other.reorderTable);
|
||||
}
|
||||
}
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
copyReorderingFrom(other, errorCode);
|
||||
if(fastLatinOptions >= 0) {
|
||||
uprv_memcpy(fastLatinPrimaries, other.fastLatinPrimaries, sizeof(fastLatinPrimaries));
|
||||
}
|
||||
|
@ -79,14 +74,22 @@ CollationSettings::resetReordering() {
|
|||
// rather than a no-op permutation.
|
||||
// Keep the memory via reorderCodes and its capacity.
|
||||
reorderTable = NULL;
|
||||
minHighNoReorder = 0;
|
||||
reorderRangesLength = 0;
|
||||
reorderCodesLength = 0;
|
||||
}
|
||||
|
||||
void
|
||||
CollationSettings::aliasReordering(const int32_t *codes, int32_t length, const uint8_t *table) {
|
||||
if(length == 0) {
|
||||
resetReordering();
|
||||
} else {
|
||||
CollationSettings::aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
|
||||
const uint32_t *ranges, int32_t rangesLength,
|
||||
const uint8_t *table, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(table != NULL &&
|
||||
(rangesLength == 0 ?
|
||||
!reorderTableHasSplitBytes(table) :
|
||||
rangesLength >= 2 &&
|
||||
// The first offset must be 0. The last offset must not be 0.
|
||||
(ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0)) {
|
||||
// We need to release the memory before setting the alias pointer.
|
||||
if(reorderCodesCapacity != 0) {
|
||||
uprv_free(const_cast<int32_t *>(reorderCodes));
|
||||
|
@ -95,36 +98,170 @@ CollationSettings::aliasReordering(const int32_t *codes, int32_t length, const u
|
|||
reorderTable = table;
|
||||
reorderCodes = codes;
|
||||
reorderCodesLength = length;
|
||||
// Drop ranges before the first split byte. They are reordered by the table.
|
||||
// This then speeds up reordering of the remaining ranges.
|
||||
int32_t firstSplitByteRangeIndex = 0;
|
||||
while(firstSplitByteRangeIndex < rangesLength &&
|
||||
(ranges[firstSplitByteRangeIndex] & 0xff0000) == 0) {
|
||||
// The second byte of the primary limit is 0.
|
||||
++firstSplitByteRangeIndex;
|
||||
}
|
||||
if(firstSplitByteRangeIndex == rangesLength) {
|
||||
U_ASSERT(!reorderTableHasSplitBytes(table));
|
||||
minHighNoReorder = 0;
|
||||
reorderRanges = NULL;
|
||||
reorderRangesLength = 0;
|
||||
} else {
|
||||
U_ASSERT(table[ranges[firstSplitByteRangeIndex] >> 24] == 0);
|
||||
minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000;
|
||||
reorderRanges = ranges + firstSplitByteRangeIndex;
|
||||
reorderRangesLength = rangesLength - firstSplitByteRangeIndex;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Regenerate missing data.
|
||||
setReordering(data, codes, length, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
CollationSettings::setReordering(const CollationData &data,
|
||||
const int32_t *codes, int32_t codesLength,
|
||||
UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(codesLength == 0 || (codesLength == 1 && codes[0] == UCOL_REORDER_CODE_NONE)) {
|
||||
resetReordering();
|
||||
return;
|
||||
}
|
||||
UVector32 rangesList(errorCode);
|
||||
data.makeReorderRanges(codes, codesLength, rangesList, errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
int32_t rangesLength = rangesList.size();
|
||||
if(rangesLength == 0) {
|
||||
resetReordering();
|
||||
return;
|
||||
}
|
||||
const uint32_t *ranges = reinterpret_cast<uint32_t *>(rangesList.getBuffer());
|
||||
// ranges[] contains at least two (limit, offset) pairs.
|
||||
// The first offset must be 0. The last offset must not be 0.
|
||||
// Separators (at the low end) and trailing weights (at the high end)
|
||||
// are never reordered.
|
||||
U_ASSERT(rangesLength >= 2);
|
||||
U_ASSERT((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);
|
||||
minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000;
|
||||
|
||||
// Write the lead byte permutation table.
|
||||
// Set a 0 for each lead byte that has a range boundary in the middle.
|
||||
uint8_t table[256];
|
||||
int32_t b = 0;
|
||||
int32_t firstSplitByteRangeIndex = -1;
|
||||
for(int32_t i = 0; i < rangesLength; ++i) {
|
||||
uint32_t pair = ranges[i];
|
||||
int32_t limit1 = (int32_t)(pair >> 24);
|
||||
while(b < limit1) {
|
||||
table[b] = (uint8_t)(b + pair);
|
||||
++b;
|
||||
}
|
||||
// Check the second byte of the limit.
|
||||
if((pair & 0xff0000) != 0) {
|
||||
table[limit1] = 0;
|
||||
b = limit1 + 1;
|
||||
if(firstSplitByteRangeIndex < 0) {
|
||||
firstSplitByteRangeIndex = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
while(b <= 0xff) {
|
||||
table[b] = (uint8_t)b;
|
||||
++b;
|
||||
}
|
||||
if(firstSplitByteRangeIndex < 0) {
|
||||
// The lead byte permutation table alone suffices for reordering.
|
||||
rangesLength = 0;
|
||||
} else {
|
||||
// Remove the ranges below the first split byte.
|
||||
ranges += firstSplitByteRangeIndex;
|
||||
rangesLength -= firstSplitByteRangeIndex;
|
||||
}
|
||||
setReorderArrays(codes, codesLength, ranges, rangesLength, table, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
CollationSettings::setReorderArrays(const int32_t *codes, int32_t codesLength,
|
||||
const uint32_t *ranges, int32_t rangesLength,
|
||||
const uint8_t *table, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
int32_t *ownedCodes;
|
||||
int32_t totalLength = codesLength + rangesLength;
|
||||
U_ASSERT(totalLength > 0);
|
||||
if(totalLength <= reorderCodesCapacity) {
|
||||
ownedCodes = const_cast<int32_t *>(reorderCodes);
|
||||
} else {
|
||||
// Allocate one memory block for the codes, the ranges, and the 16-aligned table.
|
||||
int32_t capacity = (totalLength + 3) & ~3; // round up to a multiple of 4 ints
|
||||
ownedCodes = (int32_t *)uprv_malloc(capacity * 4 + 256);
|
||||
if(ownedCodes == NULL) {
|
||||
resetReordering();
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
if(reorderCodesCapacity != 0) {
|
||||
uprv_free(const_cast<int32_t *>(reorderCodes));
|
||||
}
|
||||
reorderCodes = ownedCodes;
|
||||
reorderCodesCapacity = capacity;
|
||||
}
|
||||
uprv_memcpy(ownedCodes + reorderCodesCapacity, table, 256);
|
||||
uprv_memcpy(ownedCodes, codes, codesLength * 4);
|
||||
uprv_memcpy(ownedCodes + codesLength, ranges, rangesLength * 4);
|
||||
reorderTable = reinterpret_cast<const uint8_t *>(reorderCodes + reorderCodesCapacity);
|
||||
reorderCodesLength = codesLength;
|
||||
reorderRanges = reinterpret_cast<uint32_t *>(ownedCodes) + codesLength;
|
||||
reorderRangesLength = rangesLength;
|
||||
}
|
||||
|
||||
void
|
||||
CollationSettings::copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(!other.hasReordering()) {
|
||||
resetReordering();
|
||||
return;
|
||||
}
|
||||
minHighNoReorder = other.minHighNoReorder;
|
||||
if(other.reorderCodesCapacity == 0) {
|
||||
// The reorder arrays are aliased to memory-mapped data.
|
||||
reorderTable = other.reorderTable;
|
||||
reorderRanges = other.reorderRanges;
|
||||
reorderRangesLength = other.reorderRangesLength;
|
||||
reorderCodes = other.reorderCodes;
|
||||
reorderCodesLength = other.reorderCodesLength;
|
||||
} else {
|
||||
setReorderArrays(other.reorderCodes, other.reorderCodesLength,
|
||||
other.reorderRanges, other.reorderRangesLength,
|
||||
other.reorderTable, errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
UBool
|
||||
CollationSettings::setReordering(const int32_t *codes, int32_t length, const uint8_t table[256]) {
|
||||
if(length == 0) {
|
||||
resetReordering();
|
||||
} else {
|
||||
uint8_t *ownedTable;
|
||||
int32_t *ownedCodes;
|
||||
if(length <= reorderCodesCapacity) {
|
||||
ownedTable = const_cast<uint8_t *>(reorderTable);
|
||||
ownedCodes = const_cast<int32_t *>(reorderCodes);
|
||||
} else {
|
||||
// Allocate one memory block for the codes and the 16-aligned table.
|
||||
int32_t capacity = (length + 3) & ~3; // round up to a multiple of 4 ints
|
||||
uint8_t *bytes = (uint8_t *)uprv_malloc(256 + capacity * 4);
|
||||
if(bytes == NULL) { return FALSE; }
|
||||
if(reorderCodesCapacity != 0) {
|
||||
uprv_free(const_cast<int32_t *>(reorderCodes));
|
||||
}
|
||||
reorderTable = ownedTable = bytes + capacity * 4;
|
||||
reorderCodes = ownedCodes = (int32_t *)bytes;
|
||||
reorderCodesCapacity = capacity;
|
||||
CollationSettings::reorderTableHasSplitBytes(const uint8_t table[256]) {
|
||||
U_ASSERT(table[0] == 0);
|
||||
for(int32_t i = 1; i < 256; ++i) {
|
||||
if(table[i] == 0) {
|
||||
return TRUE;
|
||||
}
|
||||
uprv_memcpy(ownedTable, table, 256);
|
||||
uprv_memcpy(ownedCodes, codes, length * 4);
|
||||
reorderCodesLength = length;
|
||||
}
|
||||
return TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
CollationSettings::reorderEx(uint32_t p) const {
|
||||
if(p >= minHighNoReorder) { return p; }
|
||||
// Round up p so that its lower 16 bits are >= any offset bits.
|
||||
// Then compare q directly with (limit, offset) pairs.
|
||||
uint32_t q = p | 0xffff;
|
||||
uint32_t r;
|
||||
const uint32_t *ranges = reorderRanges;
|
||||
while(q >= (r = *ranges)) { ++ranges; }
|
||||
return p + (r << 24);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationsettings.h
|
||||
|
@ -23,6 +23,8 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct CollationData;
|
||||
|
||||
/**
|
||||
* Collation settings/options/attributes.
|
||||
* These are the values that can be changed via API.
|
||||
|
@ -103,6 +105,8 @@ struct U_I18N_API CollationSettings : public SharedObject {
|
|||
(MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
|
||||
variableTop(0),
|
||||
reorderTable(NULL),
|
||||
minHighNoReorder(0),
|
||||
reorderRanges(NULL), reorderRangesLength(0),
|
||||
reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
|
||||
fastLatinOptions(-1) {}
|
||||
|
||||
|
@ -118,8 +122,23 @@ struct U_I18N_API CollationSettings : public SharedObject {
|
|||
int32_t hashCode() const;
|
||||
|
||||
void resetReordering();
|
||||
void aliasReordering(const int32_t *codes, int32_t length, const uint8_t *table);
|
||||
UBool setReordering(const int32_t *codes, int32_t length, const uint8_t table[256]);
|
||||
void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
|
||||
const uint32_t *ranges, int32_t rangesLength,
|
||||
const uint8_t *table, UErrorCode &errorCode);
|
||||
void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
|
||||
UErrorCode &errorCode);
|
||||
void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
|
||||
|
||||
/** Returns TRUE if a reorder table is set (reorderTable is not NULL). */
inline UBool hasReordering() const { return reorderTable != NULL; }
|
||||
static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
|
||||
inline uint32_t reorder(uint32_t p) const {
|
||||
uint8_t b = reorderTable[p >> 24];
|
||||
if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
|
||||
return ((uint32_t)b << 24) | (p & 0xffffff);
|
||||
} else {
|
||||
return reorderEx(p);
|
||||
}
|
||||
}
|
||||
|
||||
void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
|
||||
|
||||
|
@ -194,23 +213,57 @@ struct U_I18N_API CollationSettings : public SharedObject {
|
|||
int32_t options;
|
||||
/** Variable-top primary weight. */
|
||||
uint32_t variableTop;
|
||||
/** 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering. */
|
||||
/**
|
||||
* 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
|
||||
* A 0 entry at a non-zero index means that the primary lead byte is "split"
|
||||
* (there are different offsets for primaries that share that lead byte)
|
||||
* and the reordering offset must be determined via the reorderRanges.
|
||||
*/
|
||||
const uint8_t *reorderTable;
|
||||
/** Limit of last reordered range. 0 if no reordering or no split bytes. */
|
||||
uint32_t minHighNoReorder;
|
||||
/**
|
||||
* Primary-weight ranges for script reordering,
|
||||
* to be used by reorder(p) for split-reordered primary lead bytes.
|
||||
*
|
||||
* Each entry is a (limit, offset) pair.
|
||||
* The upper 16 bits of the entry are the upper 16 bits of the
|
||||
* exclusive primary limit of a range.
|
||||
* Primaries between the previous limit and this one have their lead bytes
|
||||
* modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
|
||||
*
|
||||
* CollationData::makeReorderRanges() writes a full list where the first range
|
||||
* (at least for terminators and separators) has a 0 offset.
|
||||
* The last range has a non-zero offset.
|
||||
* minHighNoReorder is set to the limit of that last range.
|
||||
*
|
||||
* In the settings object, the initial ranges before the first split lead byte
|
||||
* are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
|
||||
* If there are no split-reordered lead bytes, then no ranges are needed.
|
||||
*/
|
||||
const uint32_t *reorderRanges;
|
||||
int32_t reorderRangesLength;
|
||||
/** Array of reorder codes; ignored if reorderCodesLength == 0. */
|
||||
const int32_t *reorderCodes;
|
||||
/** Number of reorder codes; 0 if no reordering. */
|
||||
int32_t reorderCodesLength;
|
||||
/**
|
||||
* Capacity of reorderCodes.
|
||||
* If 0, then the table and codes are aliases.
|
||||
* If 0, then the codes, the ranges, and the table are aliases.
|
||||
* Otherwise, this object owns the memory via the reorderCodes pointer;
|
||||
* the table and the codes are in the same memory block, with the codes first.
|
||||
* the codes, the ranges, and the table are in the same memory block, in that order.
|
||||
*/
|
||||
int32_t reorderCodesCapacity;
|
||||
|
||||
/** Options for CollationFastLatin. Negative if disabled. */
|
||||
int32_t fastLatinOptions;
|
||||
uint16_t fastLatinPrimaries[0x180];
|
||||
|
||||
private:
|
||||
void setReorderArrays(const int32_t *codes, int32_t codesLength,
|
||||
const uint32_t *ranges, int32_t rangesLength,
|
||||
const uint8_t *table, UErrorCode &errorCode);
|
||||
uint32_t reorderEx(uint32_t p) const;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationtailoring.cpp
|
||||
|
@ -40,6 +40,7 @@ CollationTailoring::CollationTailoring(const CollationSettings *baseSettings)
|
|||
if(baseSettings != NULL) {
|
||||
U_ASSERT(baseSettings->reorderCodesLength == 0);
|
||||
U_ASSERT(baseSettings->reorderTable == NULL);
|
||||
U_ASSERT(baseSettings->minHighNoReorder == 0);
|
||||
} else {
|
||||
settings = new CollationSettings();
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines
|
||||
* Copyright (C) 1996-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* rulebasedcollator.cpp
|
||||
|
@ -673,9 +673,7 @@ RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
|
|||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
ownedSettings->aliasReordering(defaultSettings.reorderCodes,
|
||||
defaultSettings.reorderCodesLength,
|
||||
defaultSettings.reorderTable);
|
||||
ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
|
||||
setFastLatinOptions(*ownedSettings);
|
||||
}
|
||||
return;
|
||||
|
@ -685,17 +683,7 @@ RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
|
|||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
if(length == 0) {
|
||||
ownedSettings->resetReordering();
|
||||
} else {
|
||||
uint8_t reorderTable[256];
|
||||
data->makeReorderTable(reorderCodes, length, reorderTable, errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
if(!ownedSettings->setReordering(reorderCodes, length, reorderTable)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
|
||||
setFastLatinOptions(*ownedSettings);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines
|
||||
* Copyright (C) 1996-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
@ -607,7 +607,7 @@ public:
|
|||
* Retrieves the reordering codes for this collator.
|
||||
* @param dest The array to fill with the script ordering.
|
||||
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
|
||||
* will only return the length of the result without writing any of the result string (pre-flighting).
|
||||
* will only return the length of the result without writing any codes (pre-flighting).
|
||||
* @param status A reference to an error code value, which must not indicate
|
||||
* a failure before the function call.
|
||||
* @return The length of the script ordering array.
|
||||
|
@ -630,6 +630,7 @@ public:
|
|||
* length is also set to 0. An empty array will clear any reordering codes on the collator.
|
||||
* @param reorderCodesLength The length of reorderCodes.
|
||||
* @param status error code
|
||||
* @see ucol_setReorderCodes
|
||||
* @see Collator#getReorderCodes
|
||||
* @see Collator#getEquivalentReorderCodes
|
||||
* @see UScriptCode
|
||||
|
@ -643,11 +644,13 @@ public:
|
|||
/**
|
||||
* Retrieves the reorder codes that are grouped with the given reorder code. Some reorder
|
||||
* codes will be grouped and must reorder together.
|
||||
* Beginning with ICU 55, scripts only reorder together if they are primary-equal,
|
||||
* for example Hiragana and Katakana.
|
||||
*
|
||||
* @param reorderCode The reorder code to determine equivalence for.
|
||||
* @param dest The array to fill with the script equivalence reordering codes.
|
||||
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the
|
||||
* function will only return the length of the result without writing any of the result
|
||||
* string (pre-flighting).
|
||||
* function will only return the length of the result without writing any codes (pre-flighting).
|
||||
* @param status A reference to an error code value, which must not indicate
|
||||
* a failure before the function call.
|
||||
* @return The length of the reordering code equivalence array.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* Copyright (C) 1996-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
@ -651,7 +651,7 @@ public:
|
|||
* Retrieves the reordering codes for this collator.
|
||||
* @param dest The array to fill with the script ordering.
|
||||
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
|
||||
* will only return the length of the result without writing any of the result string (pre-flighting).
|
||||
* will only return the length of the result without writing any codes (pre-flighting).
|
||||
* @param status A reference to an error code value, which must not indicate
|
||||
* a failure before the function call.
|
||||
* @return The length of the script ordering array.
|
||||
|
@ -670,6 +670,7 @@ public:
|
|||
* length is also set to 0. An empty array will clear any reordering codes on the collator.
|
||||
* @param reorderCodesLength The length of reorderCodes.
|
||||
* @param status error code
|
||||
* @see ucol_setReorderCodes
|
||||
* @see Collator#getReorderCodes
|
||||
* @see Collator#getEquivalentReorderCodes
|
||||
* @stable ICU 4.8
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (c) 1996-2014, International Business Machines Corporation and others.
|
||||
* Copyright (c) 1996-2015, International Business Machines Corporation and others.
|
||||
* All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -685,7 +685,7 @@ ucol_setStrength(UCollator *coll,
|
|||
* @param coll The UCollator to query.
|
||||
* @param dest The array to fill with the script ordering.
|
||||
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
|
||||
* will only return the length of the result without writing any of the result string (pre-flighting).
|
||||
* will only return the length of the result without writing any codes (pre-flighting).
|
||||
* @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a
|
||||
* failure before the function call.
|
||||
* @return The number of reordering codes written to the dest array.
|
||||
|
@ -702,27 +702,32 @@ ucol_getReorderCodes(const UCollator* coll,
|
|||
UErrorCode *pErrorCode);
|
||||
/**
|
||||
* Sets the reordering codes for this collator.
|
||||
* Collation reordering allows scripts and some other defined blocks of characters
|
||||
* to be moved relative to each other as a block. This reordering is done on top of
|
||||
* Collation reordering allows scripts and some other groups of characters
|
||||
* to be moved relative to each other. This reordering is done on top of
|
||||
* the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
|
||||
* at the start and/or the end of the collation order. These groups are specified using
|
||||
* UScript codes and UColReorderCode entries.
|
||||
*
|
||||
* <p>By default, reordering codes specified for the start of the order are placed in the
|
||||
* order given after a group of "special" non-script blocks. These special groups of characters
|
||||
* order given after several special non-script blocks. These special groups of characters
|
||||
* are space, punctuation, symbol, currency, and digit. These special groups are represented with
|
||||
* UColReorderCode entries. Script groups can be intermingled with
|
||||
* these special non-script blocks if those special blocks are explicitly specified in the reordering.
|
||||
* these special non-script groups if those special groups are explicitly specified in the reordering.
|
||||
*
|
||||
* <p>The special code OTHERS stands for any script that is not explicitly
|
||||
* mentioned in the list of reordering codes given. Anything that is after OTHERS
|
||||
* will go at the very end of the reordering in the order given.
|
||||
*
|
||||
* <p>The special reorder code DEFAULT will reset the reordering for this collator
|
||||
* to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
|
||||
* was specified when this collator was created from resource data or from rules. The
|
||||
* DEFAULT code <b>must</b> be the sole code supplied when it used. If not
|
||||
* that will result in a U_ILLEGAL_ARGUMENT_ERROR being set.
|
||||
* DEFAULT code <b>must</b> be the sole code supplied when it is used.
|
||||
* If not, then U_ILLEGAL_ARGUMENT_ERROR will be set.
|
||||
*
|
||||
* <p>The special reorder code NONE will remove any reordering for this collator.
|
||||
* The result of setting no reordering will be to have the DUCET/CLDR ordering used. The
|
||||
* NONE code <b>must</b> be the sole code supplied when it used.
|
||||
* NONE code <b>must</b> be the sole code supplied when it is used.
|
||||
*
|
||||
* @param coll The UCollator to set.
|
||||
* @param reorderCodes An array of script codes in the new order. This can be NULL if the
|
||||
* length is also set to 0. An empty array will clear any reordering codes on the collator.
|
||||
|
@ -744,10 +749,13 @@ ucol_setReorderCodes(UCollator* coll,
|
|||
/**
|
||||
* Retrieves the reorder codes that are grouped with the given reorder code. Some reorder
|
||||
* codes will be grouped and must reorder together.
|
||||
* Beginning with ICU 55, scripts only reorder together if they are primary-equal,
|
||||
* for example Hiragana and Katakana.
|
||||
*
|
||||
* @param reorderCode The reorder code to determine equivalence for.
|
||||
* @param dest The array to fill with the script ordering.
|
||||
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
|
||||
* will only return the length of the result without writing any of the result string (pre-flighting).
|
||||
* will only return the length of the result without writing any codes (pre-flighting).
|
||||
* @param pErrorCode Must be a valid pointer to an error code value, which must not indicate
|
||||
* a failure before the function call.
|
||||
* @return The number of reordering codes written to the dest array.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2001-2014, International Business Machines Corporation and
|
||||
* Copyright (c) 2001-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/*******************************************************************************
|
||||
|
@ -4693,7 +4693,7 @@ static void TestReorderingAPI(void)
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollator *myCollation;
|
||||
int32_t reorderCodes[3] = {USCRIPT_GREEK, USCRIPT_HAN, UCOL_REORDER_CODE_PUNCTUATION};
|
||||
int32_t duplicateReorderCodes[] = {USCRIPT_CUNEIFORM, USCRIPT_GREEK, UCOL_REORDER_CODE_CURRENCY, USCRIPT_EGYPTIAN_HIEROGLYPHS};
|
||||
int32_t duplicateReorderCodes[] = {USCRIPT_HIRAGANA, USCRIPT_GREEK, UCOL_REORDER_CODE_CURRENCY, USCRIPT_KATAKANA};
|
||||
int32_t reorderCodesStartingWithDefault[] = {UCOL_REORDER_CODE_DEFAULT, USCRIPT_GREEK, USCRIPT_HAN, UCOL_REORDER_CODE_PUNCTUATION};
|
||||
int32_t reorderCodeNone = UCOL_REORDER_CODE_NONE;
|
||||
UCollationResult collResult;
|
||||
|
@ -4952,86 +4952,117 @@ static UBool containsExpectedScript(const int32_t scripts[], int32_t length, int
|
|||
}
|
||||
|
||||
static void TestEquivalentReorderingScripts(void) {
|
||||
// Beginning with ICU 55, collation reordering moves single scripts
|
||||
// rather than groups of scripts,
|
||||
// except where scripts share a range and sort primary-equal.
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t equivalentScripts[100];
|
||||
int32_t length;
|
||||
int i;
|
||||
int32_t prevScript;
|
||||
/* At least these scripts are expected to be equivalent. There may be more. */
|
||||
/* These scripts are expected to be equivalent. */
|
||||
static const int32_t expectedScripts[] = {
|
||||
USCRIPT_BOPOMOFO,
|
||||
USCRIPT_LISU,
|
||||
USCRIPT_LYCIAN,
|
||||
USCRIPT_CARIAN,
|
||||
USCRIPT_LYDIAN,
|
||||
USCRIPT_YI,
|
||||
USCRIPT_OLD_ITALIC,
|
||||
USCRIPT_GOTHIC,
|
||||
USCRIPT_DESERET,
|
||||
USCRIPT_SHAVIAN,
|
||||
USCRIPT_OSMANYA,
|
||||
USCRIPT_LINEAR_B,
|
||||
USCRIPT_CYPRIOT,
|
||||
USCRIPT_OLD_SOUTH_ARABIAN,
|
||||
USCRIPT_AVESTAN,
|
||||
USCRIPT_IMPERIAL_ARAMAIC,
|
||||
USCRIPT_INSCRIPTIONAL_PARTHIAN,
|
||||
USCRIPT_INSCRIPTIONAL_PAHLAVI,
|
||||
USCRIPT_UGARITIC,
|
||||
USCRIPT_OLD_PERSIAN,
|
||||
USCRIPT_CUNEIFORM,
|
||||
USCRIPT_EGYPTIAN_HIEROGLYPHS,
|
||||
USCRIPT_PHONETIC_POLLARD,
|
||||
USCRIPT_SORA_SOMPENG,
|
||||
USCRIPT_MEROITIC_CURSIVE,
|
||||
USCRIPT_MEROITIC_HIEROGLYPHS
|
||||
USCRIPT_HIRAGANA,
|
||||
USCRIPT_KATAKANA,
|
||||
USCRIPT_KATAKANA_OR_HIRAGANA
|
||||
};
|
||||
|
||||
/* UScript.GOTHIC */
|
||||
equivalentScripts[0] = 0;
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_GOTHIC, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err_status(status, "ERROR/Gothic: retrieving equivalent reorder codes: %s\n", myErrorName(status));
|
||||
return;
|
||||
}
|
||||
if (length < LEN(expectedScripts)) {
|
||||
log_err("ERROR/Gothic: retrieved equivalent script length wrong: "
|
||||
"expected at least %d, was = %d\n",
|
||||
if (length != 1 || equivalentScripts[0] != USCRIPT_GOTHIC) {
|
||||
log_err("ERROR/Gothic: retrieved equivalent scripts wrong: "
|
||||
"length expected 1, was = %d; expected [%d] was [%d]\n",
|
||||
length, USCRIPT_GOTHIC, equivalentScripts[0]);
|
||||
}
|
||||
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_HIRAGANA, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err_status(status, "ERROR/Hiragana: retrieving equivalent reorder codes: %s\n", myErrorName(status));
|
||||
return;
|
||||
}
|
||||
if (length != LEN(expectedScripts)) {
|
||||
log_err("ERROR/Hiragana: retrieved equivalent script length wrong: "
|
||||
"expected %d, was = %d\n",
|
||||
LEN(expectedScripts), length);
|
||||
}
|
||||
prevScript = -1;
|
||||
for (i = 0; i < length; ++i) {
|
||||
int32_t script = equivalentScripts[i];
|
||||
if (script <= prevScript) {
|
||||
log_err("ERROR/Gothic: equivalent scripts out of order at index %d\n", i);
|
||||
log_err("ERROR/Hiragana: equivalent scripts out of order at index %d\n", i);
|
||||
}
|
||||
prevScript = script;
|
||||
}
|
||||
for (i = 0; i < LEN(expectedScripts); i++) {
|
||||
if (!containsExpectedScript(equivalentScripts, length, expectedScripts[i])) {
|
||||
log_err("ERROR/Gothic: equivalent scripts do not contain %d\n",
|
||||
log_err("ERROR/Hiragana: equivalent scripts do not contain %d\n",
|
||||
expectedScripts[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/* UScript.SHAVIAN */
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_SHAVIAN, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
USCRIPT_KATAKANA, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err_status(status, "ERROR/Shavian: retrieving equivalent reorder codes: %s\n", myErrorName(status));
|
||||
log_err_status(status, "ERROR/Katakana: retrieving equivalent reorder codes: %s\n", myErrorName(status));
|
||||
return;
|
||||
}
|
||||
if (length < LEN(expectedScripts)) {
|
||||
log_err("ERROR/Shavian: retrieved equivalent script length wrong: "
|
||||
"expected at least %d, was = %d\n",
|
||||
if (length != LEN(expectedScripts)) {
|
||||
log_err("ERROR/Katakana: retrieved equivalent script length wrong: "
|
||||
"expected %d, was = %d\n",
|
||||
LEN(expectedScripts), length);
|
||||
}
|
||||
for (i = 0; i < LEN(expectedScripts); i++) {
|
||||
if (!containsExpectedScript(equivalentScripts, length, expectedScripts[i])) {
|
||||
log_err("ERROR/Shavian: equivalent scripts do not contain %d\n",
|
||||
log_err("ERROR/Katakana: equivalent scripts do not contain %d\n",
|
||||
expectedScripts[i]);
|
||||
}
|
||||
}
|
||||
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_KATAKANA_OR_HIRAGANA, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status) || length != LEN(expectedScripts)) {
|
||||
log_err("ERROR/Hrkt: retrieved equivalent script length wrong: "
|
||||
"expected %d, was = %d\n",
|
||||
LEN(expectedScripts), length);
|
||||
}
|
||||
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_HAN, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status) || length != 3) {
|
||||
log_err("ERROR/Hani: retrieved equivalent script length wrong: "
|
||||
"expected 3, was = %d\n", length);
|
||||
}
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_SIMPLIFIED_HAN, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status) || length != 3) {
|
||||
log_err("ERROR/Hans: retrieved equivalent script length wrong: "
|
||||
"expected 3, was = %d\n", length);
|
||||
}
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_TRADITIONAL_HAN, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status) || length != 3) {
|
||||
log_err("ERROR/Hant: retrieved equivalent script length wrong: "
|
||||
"expected 3, was = %d\n", length);
|
||||
}
|
||||
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_MEROITIC_CURSIVE, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status) || length != 2) {
|
||||
log_err("ERROR/Merc: retrieved equivalent script length wrong: "
|
||||
"expected 2, was = %d\n", length);
|
||||
}
|
||||
length = ucol_getEquivalentReorderCodes(
|
||||
USCRIPT_MEROITIC_HIEROGLYPHS, equivalentScripts, LEN(equivalentScripts), &status);
|
||||
if (U_FAILURE(status) || length != 2) {
|
||||
log_err("ERROR/Mero: retrieved equivalent script length wrong: "
|
||||
"expected 2, was = %d\n", length);
|
||||
}
|
||||
}
|
||||
|
||||
static void TestReorderingAcrossCloning(void)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2014, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2015, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
//===============================================================================
|
||||
|
@ -1537,7 +1537,10 @@ void CollationAPITest::TestVariableTopSetting() {
|
|||
status = U_ZERO_ERROR;
|
||||
vt[0] = 0x24; // dollar sign (currency symbol)
|
||||
uint32_t newVarTop = coll->setVariableTop(vt, 1, status);
|
||||
|
||||
if(U_FAILURE(status)) {
|
||||
errln("setVariableTop(dollar sign) failed: %s", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
if(newVarTop != coll->getVariableTop(status)) {
|
||||
errln("setVariableTop(dollar sign) != following getVariableTop()");
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2012-2014, International Business Machines
|
||||
* Copyright (C) 2012-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationtest.cpp
|
||||
|
@ -190,7 +190,7 @@ void CollationTest::TestImplicits() {
|
|||
IcuTestErrorCode errorCode(*this, "TestImplicits");
|
||||
|
||||
const CollationData *cd = CollationRoot::getData(errorCode);
|
||||
if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
|
||||
if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
14
icu4c/source/test/testdata/collationtest.txt
vendored
14
icu4c/source/test/testdata/collationtest.txt
vendored
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2012-2014 International Business Machines
|
||||
# Copyright (c) 2012-2015 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# This file should be in UTF-8 with a signature byte sequence ("BOM").
|
||||
|
@ -2526,3 +2526,15 @@
|
|||
<3 あ
|
||||
<3 ァ
|
||||
<1 い
|
||||
|
||||
** test: reorder single scripts not groups, ICU ticket 11449
|
||||
@ root
|
||||
% reorder Goth Latn
|
||||
* compare
|
||||
<1 4
|
||||
<1 𐌰 # Gothic
|
||||
<1 L
|
||||
<1 Ω
|
||||
# Before ICU 55, the following reordered together with Gothic.
|
||||
<1 𐌈 # Old Italic
|
||||
<1 𐑐 # Shavian
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1998-2014, International Business Machines
|
||||
* Copyright (C) 1998-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -1069,6 +1069,11 @@ addCollation(ParseState* state, struct SResource *result, const char *collation
|
|||
if(isVerbose()) {
|
||||
printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
|
||||
icu::CollationInfo::printSizes(totalSize, indexes);
|
||||
if(t->settings->hasReordering()) {
|
||||
printf("%s~%s collation reordering ranges:\n", state->filename, collationType);
|
||||
icu::CollationInfo::printReorderRanges(
|
||||
*t->data, t->settings->reorderCodes, t->settings->reorderCodesLength);
|
||||
}
|
||||
}
|
||||
struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, NULL, NULL, status);
|
||||
table_add(result, collationBin, line, status);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationinfo.cpp
|
||||
|
@ -16,9 +16,11 @@
|
|||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
#include "collationdata.h"
|
||||
#include "collationdatareader.h"
|
||||
#include "collationinfo.h"
|
||||
#include "uassert.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -112,6 +114,37 @@ CollationInfo::getDataLength(const int32_t indexes[], int32_t startIndex) {
|
|||
return indexes[startIndex + 1] - indexes[startIndex];
|
||||
}
|
||||
|
||||
void
|
||||
CollationInfo::printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length) {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
UVector32 ranges(errorCode);
|
||||
data.makeReorderRanges(codes, length, ranges, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
printf(" error building reorder ranges: %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t start = 0;
|
||||
for(int32_t i = 0; i < ranges.size(); ++i) {
|
||||
int32_t pair = ranges.elementAti(i);
|
||||
int32_t limit = (pair >> 16) & 0xffff;
|
||||
int16_t offset = (int16_t)pair;
|
||||
if(offset == 0) {
|
||||
// [inclusive-start, exclusive-limit[
|
||||
printf(" [%04x, %04x[\n", start, limit);
|
||||
} else if(offset > 0) {
|
||||
printf(" reorder [%04x, %04x[ by offset %02x to [%04x, %04x[\n",
|
||||
start, limit, offset,
|
||||
start + (offset << 8), limit + (offset << 8));
|
||||
} else /* offset < 0 */ {
|
||||
printf(" reorder [%04x, %04x[ by offset -%02x to [%04x, %04x[\n",
|
||||
start, limit, -offset,
|
||||
start + (offset << 8), limit + (offset << 8));
|
||||
}
|
||||
start = limit;
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_COLLATION
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013-2014, International Business Machines
|
||||
* Copyright (C) 2013-2015, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* collationinfo.h
|
||||
|
@ -18,12 +18,15 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct CollationData;
|
||||
|
||||
/**
|
||||
* Collation-related code for tools & demos.
|
||||
*/
|
||||
class U_TOOLUTIL_API CollationInfo /* all static */ {
|
||||
public:
|
||||
static void printSizes(int32_t sizeWithHeader, const int32_t indexes[]);
|
||||
static void printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length);
|
||||
|
||||
private:
|
||||
CollationInfo(); // no constructor
|
||||
|
|
Loading…
Add table
Reference in a new issue