ICU-11449 reorder single scripts not groups, scripts/groups can start on top-16-bit boundaries, data formatVersion 5 for new scripts data and optional reorderRanges appended to reorderCodes

X-SVN-Rev: 36924
This commit is contained in:
Markus Scherer 2015-01-07 03:37:11 +00:00
parent bcdcc4dc67
commit a9d7c3e4bd
33 changed files with 22801 additions and 22306 deletions

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2014, International Business Machines
* Copyright (C) 2003-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -340,7 +340,7 @@ swapFormatVersion3(const UDataSwapper *ds,
return header.size;
}
// swap formatVersion 4 ---------------------------------------------------- ***
// swap formatVersion 4 or 5 ----------------------------------------------- ***
// The following are copied from CollationDataReader, trading an awkward copy of constants
// for an awkward relocation of the i18n collationdatareader.h file into the common library.
@ -566,7 +566,7 @@ ucol_swap(const UDataSwapper *ds,
info.dataFormat[1]==0x43 &&
info.dataFormat[2]==0x6f &&
info.dataFormat[3]==0x6c &&
(info.formatVersion[0]==3 || info.formatVersion[0]==4)
(3<=info.formatVersion[0] && info.formatVersion[0]<=5)
)) {
udata_printError(ds, "ucol_swap(): data format %02x.%02x.%02x.%02x "
"(format version %02x.%02x) is not recognized as collation data\n",

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collation.h
@ -488,10 +488,6 @@ public:
return makeCE(unassignedPrimaryFromCodePoint(c));
}
/**
 * Applies a primary-weight lead byte permutation.
 * The top byte of the primary is replaced with its reorderTable entry;
 * the lower three bytes are passed through unchanged.
 */
static inline uint32_t reorder(const uint8_t reorderTable[256], uint32_t primary) {
    const uint32_t leadByte = primary >> 24;
    const uint32_t lowerBytes = primary & 0xffffff;
    const uint32_t mappedLeadByte = reorderTable[leadByte];
    return (mappedLeadByte << 24) | lowerBytes;
}
private:
Collation(); // No instantiation.
};

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationcompare.cpp
@ -95,10 +95,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
if(leftPrimary != rightPrimary) {
// Return the primary difference, with script reordering.
const uint8_t *reorderTable = settings.reorderTable;
if (reorderTable != NULL) {
leftPrimary = Collation::reorder(reorderTable, leftPrimary);
rightPrimary = Collation::reorder(reorderTable, rightPrimary);
if(settings.hasReordering()) {
leftPrimary = settings.reorder(leftPrimary);
rightPrimary = settings.reorder(rightPrimary);
}
return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
}
@ -340,10 +339,9 @@ CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterat
if(leftQuaternary != rightQuaternary) {
// Return the difference, with script reordering.
const uint8_t *reorderTable = settings.reorderTable;
if (reorderTable != NULL) {
leftQuaternary = Collation::reorder(reorderTable, leftQuaternary);
rightQuaternary = Collation::reorder(reorderTable, rightQuaternary);
if(settings.hasReordering()) {
leftQuaternary = settings.reorder(leftQuaternary);
rightQuaternary = settings.reorder(rightQuaternary);
}
return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdata.cpp
@ -21,6 +21,7 @@
#include "collationdata.h"
#include "uassert.h"
#include "utrie2.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
@ -114,48 +115,57 @@ CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
uint32_t
CollationData::getFirstPrimaryForGroup(int32_t script) const {
int32_t index = findScript(script);
if(index < 0) {
return 0;
}
uint32_t head = scripts[index];
return (head & 0xff00) << 16;
int32_t index = getScriptIndex(script);
return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
}
uint32_t
CollationData::getLastPrimaryForGroup(int32_t script) const {
int32_t index = findScript(script);
if(index < 0) {
int32_t index = getScriptIndex(script);
if(index == 0) {
return 0;
}
uint32_t head = scripts[index];
uint32_t lastByte = head & 0xff;
return ((lastByte + 1) << 24) - 1;
uint32_t limit = scriptStarts[index + 1];
return (limit << 16) - 1;
}
int32_t
CollationData::getGroupForPrimary(uint32_t p) const {
p >>= 24; // Reordering groups are distinguished by primary lead bytes.
for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) {
uint32_t lastByte = scripts[i] & 0xff;
if(p <= lastByte) {
return scripts[i + 2];
p >>= 16;
if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
return -1;
}
int32_t index = 1;
while(p >= scriptStarts[index + 1]) { ++index; }
for(int32_t i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
return i;
}
}
for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
if(scriptsIndex[numScripts + i] == index) {
return UCOL_REORDER_CODE_FIRST + i;
}
}
return -1;
}
int32_t
CollationData::findScript(int32_t script) const {
if(script < 0 || 0xffff < script) { return -1; }
for(int32_t i = 0; i < scriptsLength;) {
int32_t limit = i + 2 + scripts[i + 1];
for(int32_t j = i + 2; j < limit; ++j) {
if(script == scripts[j]) { return i; }
CollationData::getScriptIndex(int32_t script) const {
if(script < 0) {
return 0;
} else if(script < numScripts) {
return scriptsIndex[script];
} else if(script < UCOL_REORDER_CODE_FIRST) {
return 0;
} else {
script -= UCOL_REORDER_CODE_FIRST;
if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
return scriptsIndex[numScripts + script];
} else {
return 0;
}
i = limit;
}
return -1;
}
int32_t
@ -163,85 +173,114 @@ CollationData::getEquivalentScripts(int32_t script,
int32_t dest[], int32_t capacity,
UErrorCode &errorCode) const {
if(U_FAILURE(errorCode)) { return 0; }
int32_t i = findScript(script);
if(i < 0) { return 0; }
int32_t length = scripts[i + 1];
U_ASSERT(length != 0);
int32_t index = getScriptIndex(script);
if(index == 0) { return 0; }
if(script >= UCOL_REORDER_CODE_FIRST) {
// Special groups have no aliases.
if(capacity > 0) {
dest[0] = script;
} else {
errorCode = U_BUFFER_OVERFLOW_ERROR;
}
return 1;
}
int32_t length = 0;
for(int32_t i = 0; i < numScripts; ++i) {
if(scriptsIndex[i] == index) {
if(length < capacity) {
dest[length] = i;
}
++length;
}
}
if(length > capacity) {
errorCode = U_BUFFER_OVERFLOW_ERROR;
return length;
}
i += 2;
dest[0] = scripts[i++];
for(int32_t j = 1; j < length; ++j) {
script = scripts[i++];
// Sorted insertion.
for(int32_t k = j;; --k) {
// Invariant: dest[k] is free to receive either script or dest[k - 1].
if(k > 0 && script < dest[k - 1]) {
dest[k] = dest[k - 1];
} else {
dest[k] = script;
break;
}
}
}
return length;
}
void
CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
uint8_t table[256], UErrorCode &errorCode) const {
CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
UVector32 &ranges, UErrorCode &errorCode) const {
makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
}
void
CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
UBool latinMustMove,
UVector32 &ranges, UErrorCode &errorCode) const {
if(U_FAILURE(errorCode)) { return; }
ranges.removeAllElements();
if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
return;
}
// Maps each script-or-group range to a new lead byte.
uint8_t table[MAX_NUM_SCRIPT_RANGES];
uprv_memset(table, 0, sizeof(table));
{
// Set "don't care" values for reserved ranges.
int32_t index = scriptsIndex[
numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
if(index != 0) {
table[index] = 0xff;
}
index = scriptsIndex[
numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
if(index != 0) {
table[index] = 0xff;
}
}
// Initialize the table.
// Never reorder special low and high primary lead bytes.
int32_t lowByte;
for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) {
table[lowByte] = lowByte;
}
// lowByte == 03
int32_t highByte;
for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) {
table[highByte] = highByte;
}
// highByte == FE
// Set intermediate bytes to 0 to indicate that they have not been set yet.
for(int32_t i = lowByte; i <= highByte; ++i) {
table[i] = 0;
}
U_ASSERT(scriptStartsLength >= 2);
U_ASSERT(scriptStarts[0] == 0);
int32_t lowStart = scriptStarts[1];
U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
int32_t highLimit = scriptStarts[scriptStartsLength - 1];
U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
// Get the set of special reorder codes in the input list.
// This supports up to 32 special reorder codes;
// This supports a fixed number of special reorder codes;
// it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
uint32_t specials = 0;
for(int32_t i = 0; i < length; ++i) {
int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
if(0 <= reorderCode && reorderCode <= 31) {
if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
specials |= (uint32_t)1 << reorderCode;
}
}
// Start the reordering with the special low reorder codes that do not occur in the input.
for(int32_t i = 0;; i += 3) {
if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes.
int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST;
if(reorderCode < 0) { break; } // Went beyond special reorder codes.
if((specials & ((uint32_t)1 << reorderCode)) == 0) {
int32_t head = scripts[i];
int32_t firstByte = head >> 8;
int32_t lastByte = head & 0xff;
do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
int32_t index = scriptsIndex[numScripts + i];
if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
lowStart = addLowScriptRange(table, index, lowStart);
}
}
// Reorder according to the input scripts, continuing from the bottom of the bytes range.
// Skip the reserved range before Latin if Latin is the first script,
// so that we do not move it unnecessarily.
int32_t skippedReserved = 0;
if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
int32_t index = scriptsIndex[USCRIPT_LATIN];
U_ASSERT(index != 0);
int32_t start = scriptStarts[index];
U_ASSERT(lowStart <= start);
skippedReserved = start - lowStart;
lowStart = start;
}
// Reorder according to the input scripts, continuing from the bottom of the primary range.
int32_t originalLength = length; // length will be decremented if "others" is in the list.
UBool hasReorderToEnd = FALSE;
for(int32_t i = 0; i < length;) {
int32_t script = reorder[i++];
if(script == USCRIPT_UNKNOWN) {
// Put the remaining scripts at the top.
hasReorderToEnd = TRUE;
while(i < length) {
script = reorder[--length];
if(script == USCRIPT_UNKNOWN || // Must occur at most once.
@ -249,16 +288,13 @@ CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
int32_t index = findScript(script);
if(index < 0) { continue; }
int32_t head = scripts[index];
int32_t firstByte = head >> 8;
int32_t lastByte = head & 0xff;
if(table[firstByte] != 0) { // Duplicate or equivalent script.
int32_t index = getScriptIndex(script);
if(index == 0) { continue; }
if(table[index] != 0) { // Duplicate or equivalent script.
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
do { table[lastByte--] = highByte--; } while(firstByte <= lastByte);
highLimit = addHighScriptRange(table, index, highLimit);
}
break;
}
@ -268,24 +304,83 @@ CollationData::makeReorderTable(const int32_t *reorder, int32_t length,
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
int32_t index = findScript(script);
if(index < 0) { continue; }
int32_t head = scripts[index];
int32_t firstByte = head >> 8;
int32_t lastByte = head & 0xff;
if(table[firstByte] != 0) { // Duplicate or equivalent script.
int32_t index = getScriptIndex(script);
if(index == 0) { continue; }
if(table[index] != 0) { // Duplicate or equivalent script.
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte);
lowStart = addLowScriptRange(table, index, lowStart);
}
// Put all remaining scripts into the middle.
// Avoid table[0] which must remain 0.
for(int32_t i = 1; i <= 0xff; ++i) {
if(table[i] == 0) { table[i] = lowByte++; }
for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
int32_t leadByte = table[i];
if(leadByte != 0) { continue; }
int32_t start = scriptStarts[i];
if(!hasReorderToEnd && start > lowStart) {
// No need to move this script.
lowStart = start;
}
lowStart = addLowScriptRange(table, i, lowStart);
}
U_ASSERT(lowByte == highByte + 1);
if(lowStart > highLimit) {
if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
// Try not skipping the before-Latin reserved range.
makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
return;
}
// We need more primary lead bytes than available, despite the reserved ranges.
errorCode = U_BUFFER_OVERFLOW_ERROR;
return;
}
// Turn lead bytes into a list of (limit, offset) pairs.
// Encode each pair in one list element:
// Upper 16 bits = limit, lower 16 = signed lead byte offset.
int32_t offset = 0;
for(int32_t i = 1;; ++i) {
int32_t nextOffset = offset;
while(i < scriptStartsLength - 1) {
int32_t newLeadByte = table[i];
if(newLeadByte == 0xff) {
// "Don't care" lead byte for reserved range, continue with current offset.
} else {
nextOffset = newLeadByte - (scriptStarts[i] >> 8);
if(nextOffset != offset) { break; }
}
++i;
}
if(offset != 0 || i < scriptStartsLength - 1) {
ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
}
if(i == scriptStartsLength - 1) { break; }
offset = nextOffset;
}
}
/**
 * Moves the range scriptStarts[index]..scriptStarts[index + 1] to the current
 * low position and records its new lead byte in table[index].
 *
 * All positions are 16-bit values: bits 15..8 are the lead byte,
 * bits 7..0 are the second byte.
 *
 * @param table    receives the new lead byte for this range at table[index]
 * @param index    index of the range in scriptStarts
 * @param lowStart position where the moved range should begin
 * @return the position just past the moved range, to be used as the next lowStart
 */
int32_t
CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
    const int32_t rangeStart = scriptStarts[index];
    // The second byte of the range is kept as is. If the current position's
    // second byte is already past it, the range must begin in the next lead byte.
    if((lowStart & 0xff) > (rangeStart & 0xff)) {
        lowStart += 0x100;
    }
    table[index] = static_cast<uint8_t>(lowStart >> 8);
    const int32_t rangeLimit = scriptStarts[index + 1];
    // Advance by the number of lead bytes the range spans,
    // and take the second byte from the range's original limit.
    const int32_t leadByteSpan = (rangeLimit & 0xff00) - (rangeStart & 0xff00);
    return ((lowStart & 0xff00) + leadByteSpan) | (rangeLimit & 0xff);
}
/**
 * Moves the range scriptStarts[index]..scriptStarts[index + 1] so that it ends
 * at the current high limit, and records its new lead byte in table[index].
 *
 * All positions are 16-bit values: bits 15..8 are the lead byte,
 * bits 7..0 are the second byte.
 *
 * @param table     receives the new lead byte for this range at table[index]
 * @param index     index of the range in scriptStarts
 * @param highLimit position where the moved range should end
 * @return the new start of the moved range, to be used as the next highLimit
 */
int32_t
CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
    const int32_t rangeLimit = scriptStarts[index + 1];
    const int32_t rangeStart = scriptStarts[index];
    // The second byte of the limit is kept as is. If it lies above the current
    // limit's second byte, the range must end in the previous lead byte.
    if((highLimit & 0xff) < (rangeLimit & 0xff)) {
        highLimit -= 0x100;
    }
    // Step down by the number of lead bytes the range spans,
    // and take the second byte from the range's original start.
    const int32_t leadByteSpan = (rangeLimit & 0xff00) - (rangeStart & 0xff00);
    const int32_t newStart = ((highLimit & 0xff00) - leadByteSpan) | (rangeStart & 0xff);
    table[index] = static_cast<uint8_t>(newStart >> 8);
    return newStart;
}
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdata.h
@ -16,6 +16,7 @@
#if !UCONFIG_NO_COLLATION
#include "unicode/ucol.h"
#include "unicode/uniset.h"
#include "collation.h"
#include "normalizer2impl.h"
@ -25,6 +26,8 @@ struct UDataMemory;
U_NAMESPACE_BEGIN
class UVector32;
/**
* Collation data container.
* Immutable data created by a CollationDataBuilder, or loaded from a file,
@ -33,6 +36,20 @@ U_NAMESPACE_BEGIN
* Includes data for the collation base (root/default), aliased if this is not the base.
*/
struct U_I18N_API CollationData : public UMemory {
// Note: The ucadata.icu loader could discover the reserved ranges by setting an array
// parallel with the ranges, and resetting ranges that are indexed.
// The reordering builder code could clone the resulting template array.
/**
 * Internal reorder codes for the reserved primary ranges before and after Latin.
 * makeReorderRanges() looks them up in scriptsIndex at
 * numScripts + (code - UCOL_REORDER_CODE_FIRST)
 * and marks their table entries as "don't care".
 */
enum {
REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,
REORDER_RESERVED_AFTER_LATIN
};
enum {
/**
 * Number of special reorder codes, counted from UCOL_REORDER_CODE_FIRST,
 * that getScriptIndex(), getGroupForPrimary() and makeReorderRanges() handle.
 */
MAX_NUM_SPECIAL_REORDER_CODES = 8,
/**
 * C++ only: size of the local lead byte table in makeReorderRanges().
 * The data reader checks that scriptStartsLength does not exceed this value.
 */
MAX_NUM_SCRIPT_RANGES = 256
};
CollationData(const Normalizer2Impl &nfc)
: trie(NULL),
ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
@ -43,7 +60,7 @@ struct U_I18N_API CollationData : public UMemory {
compressibleBytes(NULL),
unsafeBackwardSet(NULL),
fastLatinTable(NULL), fastLatinTableLength(0),
scripts(NULL), scriptsLength(0),
numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),
rootElements(NULL), rootElementsLength(0) {}
uint32_t getCE32(UChar32 c) const {
@ -137,13 +154,17 @@ struct U_I18N_API CollationData : public UMemory {
int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
/**
* Writes the permutation table for the given reordering of scripts and groups,
* mapping from default-order primary-weight lead bytes to reordered lead bytes.
* Writes the permutation of primary-weight ranges
* for the given reordering of scripts and groups.
* The caller checks for illegal arguments and
* takes care of [DEFAULT] and memory allocation.
*
* Each list element will be a (limit, offset) pair as described
* for the CollationSettings::reorderRanges.
* The list will be empty if no ranges are reordered.
*/
void makeReorderTable(const int32_t *reorder, int32_t length,
uint8_t table[256], UErrorCode &errorCode) const;
void makeReorderRanges(const int32_t *reorder, int32_t length,
UVector32 &ranges, UErrorCode &errorCode) const;
/** @see jamoCE32s */
static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
@ -195,22 +216,26 @@ struct U_I18N_API CollationData : public UMemory {
* Data for scripts and reordering groups.
* Uses include building a reordering permutation table and
* providing script boundaries to AlphabeticIndex.
*
* This data is a sorted list of primary-weight lead byte ranges (reordering groups),
* each with a list of pairs sorted in base collation order;
* each pair contains a script/reorder code and the lowest primary weight for that script.
*
* Data structure:
* - Each reordering group is encoded in n+2 16-bit integers.
* - First integer:
* Bits 15..8: First byte of the reordering group's range.
* Bits 7..0: Last byte of the reordering group's range.
* - Second integer:
* Length n of the list of script/reordering codes.
* - Each further integer is a script or reordering code.
*/
const uint16_t *scripts;
int32_t scriptsLength;
int32_t numScripts;
/**
* The length of scriptsIndex is numScripts+16.
* It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
* 16 special reorder codes (not all used) are mapped starting at numScripts.
* Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
* There are special codes at the end for reorder-reserved primary ranges.
*
* Multiple scripts may share a range and index, for example Hira & Kana.
*/
const uint16_t *scriptsIndex;
/**
* Start primary weight (top 16 bits only) for a group/script/reserved range
* indexed by scriptsIndex.
* The first range (separators & terminators) and the last range (trailing weights)
* are not reorderable, and no scriptsIndex entry points to them.
*/
const uint16_t *scriptStarts;
int32_t scriptStartsLength;
/**
* Collation elements in the root collator.
@ -221,7 +246,12 @@ struct U_I18N_API CollationData : public UMemory {
int32_t rootElementsLength;
private:
int32_t findScript(int32_t script) const;
int32_t getScriptIndex(int32_t script) const;
void makeReorderRanges(const int32_t *reorder, int32_t length,
UBool latinMustMove,
UVector32 &ranges, UErrorCode &errorCode) const;
int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
};
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatabuilder.cpp
@ -1213,8 +1213,10 @@ CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
if(base != NULL) {
data.numericPrimary = base->numericPrimary;
data.compressibleBytes = base->compressibleBytes;
data.scripts = base->scripts;
data.scriptsLength = base->scriptsLength;
data.numScripts = base->numScripts;
data.scriptsIndex = base->scriptsIndex;
data.scriptStarts = base->scriptStarts;
data.scriptStartsLength = base->scriptStartsLength;
}
buildFastLatinTable(data, errorCode);
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatareader.cpp
@ -102,6 +102,8 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
const CollationData *baseData = base == NULL ? NULL : base->data;
const int32_t *reorderCodes = NULL;
int32_t reorderCodesLength = 0;
const uint32_t *reorderRanges = NULL;
int32_t reorderRangesLength = 0;
index = IX_REORDER_CODES_OFFSET;
offset = getIndex(inIndexes, indexesLength, index);
length = getIndex(inIndexes, indexesLength, index + 1) - offset;
@ -114,6 +116,20 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
}
reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
reorderCodesLength = length / 4;
// The reorderRanges (if any) are the trailing reorderCodes entries.
// Split the array at the boundary.
// Script or reorder codes do not exceed 16-bit values.
// Range limits are stored in the upper 16 bits, and are never 0.
while(reorderRangesLength < reorderCodesLength &&
(reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
++reorderRangesLength;
}
U_ASSERT(reorderRangesLength < reorderCodesLength);
if(reorderRangesLength != 0) {
reorderCodesLength -= reorderRangesLength;
reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
}
}
// There should be a reorder table only if there are reorder codes.
@ -337,13 +353,32 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
data->scriptsLength = length / 2;
const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
int32_t scriptsLength = length / 2;
data->numScripts = scripts[0];
// There must be enough entries for both arrays, including more than two range starts.
data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
if(data->scriptStartsLength <= 2 ||
CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
data->scriptsIndex = scripts + 1;
data->scriptStarts = scripts + 1 + data->numScripts + 16;
if(!(data->scriptStarts[0] == 0 &&
data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
data->scriptStarts[data->scriptStartsLength - 1] ==
(Collation::TRAIL_WEIGHT_BYTE << 8))) {
errorCode = U_INVALID_FORMAT_ERROR;
return;
}
} else if(data == NULL) {
// Nothing to do.
} else if(baseData != NULL) {
data->scripts = baseData->scripts;
data->scriptsLength = baseData->scriptsLength;
data->numScripts = baseData->numScripts;
data->scriptsIndex = baseData->scriptsIndex;
data->scriptStarts = baseData->scriptStarts;
data->scriptStartsLength = baseData->scriptStartsLength;
}
index = IX_COMPRESSIBLE_BYTES_OFFSET;
@ -393,16 +428,10 @@ CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes
return;
}
if(reorderCodesLength == 0 || reorderTable != NULL) {
settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable);
} else {
uint8_t table[256];
baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
if(U_FAILURE(errorCode)) { return; }
if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
if(reorderCodesLength != 0) {
settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
reorderRanges, reorderRangesLength,
reorderTable, errorCode);
}
settings->fastLatinOptions = CollationFastLatin::getOptions(
@ -422,7 +451,7 @@ CollationDataReader::isAcceptable(void *context,
pInfo->dataFormat[1] == 0x43 &&
pInfo->dataFormat[2] == 0x6f &&
pInfo->dataFormat[3] == 0x6c &&
pInfo->formatVersion[0] == 4
pInfo->formatVersion[0] == 5
) {
UVersionInfo *version = static_cast<UVersionInfo *>(context);
if(version != NULL) {

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatareader.h
@ -109,7 +109,7 @@ private:
/*
* Format of collation data (ucadata.icu, binary data in coll/ *.res files).
* Format version 4.1.
* Format version 5.
*
* The root collation data is stored in the ucadata.icu file.
* Tailorings are stored inside .res resource bundle files, with a complete file header.
@ -151,10 +151,30 @@ private:
* int32_t reorderCodes[]; -- empty in root
* The list of script and reordering codes.
*
* Beginning with format version 5, this array may optionally
* have trailing entries with a full list of reorder ranges
* as described for CollationSettings::reorderRanges.
*
* Script or reorder codes are first and do not exceed 16-bit values.
* Range limits are stored in the upper 16 bits, and are never 0.
* Split this array into reorder codes and ranges at the first entry
* with non-zero upper 16 bits.
*
* If the ranges are missing but needed for split-reordered primary lead bytes,
* then they are regenerated at load time.
*
* uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
* Primary-weight lead byte permutation table.
* Normally present when the reorderCodes are, but can be built at load time.
*
* Beginning with format version 5, a 0 entry at a non-zero index
* (which is otherwise an illegal value)
* means that the primary lead byte is "split"
* (there are different offsets for primaries that share that lead byte)
* and the reordering offset must be determined via the reorder ranges
* that are either stored as part of the reorderCodes array
* or regenerated at load time.
*
* UTrie2 trie; -- see utrie2_impl.h and utrie2.h
* The trie holds the main collation data. Each code point is mapped to a 32-bit value.
* It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
@ -194,6 +214,13 @@ private:
* See the CollationFastLatin class.
*
* uint16_t scripts[]; -- empty in all tailorings
* Format version 5:
* uint16_t numScripts;
* uint16_t scriptsIndex[numScripts+16];
* uint16_t scriptStarts[];
* See CollationData::numScripts etc.
*
* Format version 4:
* Table of the reordering groups with their first and last lead bytes,
* and their script and reordering codes.
* See CollationData::scripts.
@ -202,15 +229,20 @@ private:
* Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
*
* -----------------
* Changes for formatVersion 4.1
* Changes for formatVersion 5 (ICU 55)
*
* Reordering moves single scripts, not groups of scripts.
* Reorder ranges are optionally appended to the reorderCodes,
* and a 0 entry in the reorderTable indicates a split lead byte.
* The scripts data has a new format.
*
* The rootElements may contain secondary and tertiary weights below common=05.
* (Used for small Hiragana letters.)
* Where it occurs, there is also an explicit unit with common secondary & tertiary weights.
* There are no other data structure changes, but builder code needs to be able to handle such data.
*
* ICU 55 ucadata.icu uses formatVersion 4.1.
* ICU 55 tailoring data continues to use formatVersion 4.0.
* The collation element for the merge separator code point U+FFFE
* does not necessarily have special, unique secondary/tertiary weights any more.
*/
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationdatawriter.cpp
@ -68,7 +68,7 @@ static const UDataInfo dataInfo = {
0,
{ 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
{ 4, 0, 0, 0 }, // formatVersion
{ 5, 0, 0, 0 }, // formatVersion
{ 6, 3, 0, 0 } // dataVersion
};
@ -157,6 +157,23 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
}
}
UVector32 codesAndRanges(errorCode);
const int32_t *reorderCodes = settings.reorderCodes;
int32_t reorderCodesLength = settings.reorderCodesLength;
if(settings.hasReordering() &&
CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
// Rebuild the full list of reorder ranges.
// The list in the settings is truncated for efficiency.
data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
// Write the codes, then the ranges.
for(int32_t i = 0; i < reorderCodesLength; ++i) {
codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
}
if(U_FAILURE(errorCode)) { return 0; }
reorderCodes = codesAndRanges.getBuffer();
reorderCodesLength = codesAndRanges.size();
}
int32_t headerSize;
if(isBase) {
headerSize = 0; // udata_create() writes the header
@ -171,7 +188,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
if(hasMappings && data.cesLength != 0) {
// Sum of the sizes of the data items which are
// not automatically multiples of 8 bytes and which are placed before the CEs.
int32_t sum = headerSize + (indexesLength + settings.reorderCodesLength) * 4;
int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
if((sum & 7) != 0) {
// We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
// We add to the header size here.
@ -211,7 +228,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
}
indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
totalSize += settings.reorderCodesLength * 4;
totalSize += reorderCodesLength * 4;
indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
if(settings.reorderTable != NULL) {
@ -280,9 +297,13 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
totalSize += fastLatinTableLength * 2;
UnicodeString scripts;
indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
if(isBase) {
totalSize += data.scriptsLength * 2;
scripts.append((UChar)data.numScripts);
scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
totalSize += scripts.length() * 2;
}
indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
@ -299,7 +320,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
}
uprv_memcpy(dest, indexes, indexesLength * 4);
copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, settings.reorderCodes, dest);
copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
// The trie has already been serialized into the dest buffer.
copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
@ -308,7 +329,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
// The unsafeBackwardSet has already been serialized into the dest buffer.
copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, data.scripts, dest);
copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
return headerSize + totalSize;

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationfastlatin.cpp
@ -36,33 +36,50 @@ CollationFastLatin::getOptions(const CollationData *data, const CollationSetting
// lowest long mini primary.
miniVarTop = MIN_LONG - 1;
} else {
uint32_t v1 = settings.variableTop >> 24;
int32_t headerLength = *table & 0xff;
int32_t i = headerLength - 1;
if(i <= 0 || v1 > (table[i] & 0x7f)) {
int32_t i = 1 + settings.getMaxVariable();
if(i >= headerLength) {
return -1; // variableTop >= digits, should not occur
}
while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; }
// In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
// Shift right to make it comparable with long mini primaries in bits 15..3.
miniVarTop = (table[i] & 0xff80) >> 4;
miniVarTop = table[i];
}
const uint8_t *reorderTable = settings.reorderTable;
if(reorderTable != NULL) {
const uint16_t *scripts = data->scripts;
int32_t length = data->scriptsLength;
uint32_t prevLastByte = 0;
for(int32_t i = 0; i < length;) {
// reordered last byte of the group
uint32_t lastByte = reorderTable[scripts[i] & 0xff];
if(lastByte < prevLastByte) {
// The permutation affects the groups up to Latin.
return -1;
UBool digitsAreReordered = FALSE;
if(settings.hasReordering()) {
uint32_t prevStart = 0;
uint32_t beforeDigitStart = 0;
uint32_t digitStart = 0;
uint32_t afterDigitStart = 0;
for(int32_t group = UCOL_REORDER_CODE_FIRST;
group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
++group) {
uint32_t start = data->getFirstPrimaryForGroup(group);
start = settings.reorder(start);
if(group == UCOL_REORDER_CODE_DIGIT) {
beforeDigitStart = prevStart;
digitStart = start;
} else if(start != 0) {
if(start < prevStart) {
// The permutation affects the groups up to Latin.
return -1;
}
// In the future, there might be a special group between digits & Latin.
if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
afterDigitStart = start;
}
prevStart = start;
}
if(scripts[i + 2] == USCRIPT_LATIN) { break; }
i = i + 2 + scripts[i + 1];
prevLastByte = lastByte;
}
uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
latinStart = settings.reorder(latinStart);
if(latinStart < prevStart) {
return -1;
}
if(afterDigitStart == 0) {
afterDigitStart = latinStart;
}
if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
digitsAreReordered = TRUE;
}
}
@ -78,7 +95,7 @@ CollationFastLatin::getOptions(const CollationData *data, const CollationSetting
}
primaries[c] = (uint16_t)p;
}
if((settings.options & CollationSettings::NUMERIC) != 0) {
if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
// Bail out for digits.
for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationfastlatin.h
@ -31,7 +31,7 @@ public:
* When the major version number of the main data format changes,
* we can reset this fast Latin version to 1.
*/
static const uint16_t VERSION = 1;
static const uint16_t VERSION = 2;
static const int32_t LATIN_MAX = 0x17f;
static const int32_t LATIN_LIMIT = LATIN_MAX + 1;
@ -252,7 +252,7 @@ private:
/*
* Format of the CollationFastLatin data table.
* CollationFastLatin::VERSION = 1.
* CollationFastLatin::VERSION = 2.
*
* This table contains data for a Latin-text collation fastpath.
* The data is stored as an array of uint16_t which contains the following parts.
@ -262,6 +262,12 @@ private:
* 7..0: length of the header
*
* uint16_t varTops[header length - 1]
* Version 2:
* varTops[m] is the highest CollationFastLatin long-primary weight
* of supported maxVariable group m
* (special reorder group space, punct, symbol, currency).
*
* Version 1:
* Each of these values maps the variable top lead byte of a supported maxVariable group
* to the highest CollationFastLatin long-primary weight.
* The values are stored in ascending order.
@ -293,6 +299,16 @@ private:
* Each list is terminated by an entry with CONTR_CHAR_MASK.
* Each list starts with such an entry which also contains the default result
* for when there is no contraction match.
*
* -----------------
* Changes for version 2 (ICU 55)
*
* Special reorder groups do not necessarily start on whole primary lead bytes any more.
* Therefore, the varTops data has a new format:
* Version 1 stored the lead bytes of the highest root primaries for
* the maxVariable-supported special reorder groups.
* Now the top 16 bits would need to be stored,
* and it is simpler to store only the fast-Latin weights.
*/
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationfastlatinbuilder.cpp
@ -136,42 +136,26 @@ CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorC
UBool
CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return FALSE; }
result.append(0); // reserved for version & headerLength
headerLength = 1 + NUM_SPECIAL_GROUPS;
uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;
result.append((UChar)r0);
// The first few reordering groups should be special groups
// (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
for(int32_t i = 0;;) {
if(i >= data.scriptsLength) {
// no Latn script
errorCode = U_INTERNAL_PROGRAM_ERROR;
for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i);
if(lastSpecialPrimaries[i] == 0) {
// missing data
return FALSE;
}
uint32_t head = data.scripts[i];
uint32_t lastByte = head & 0xff; // last primary byte in the group
int32_t group = data.scripts[i + 2];
if(group == UCOL_REORDER_CODE_DIGIT) {
firstDigitPrimary = (head & 0xff00) << 16;
headerLength = result.length();
uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;
result.setCharAt(0, (UChar)r0);
} else if(group == USCRIPT_LATIN) {
if(firstDigitPrimary == 0) {
// no digit group
errorCode = U_INTERNAL_PROGRAM_ERROR;
return FALSE;
}
firstLatinPrimary = (head & 0xff00) << 16;
lastLatinPrimary = (lastByte << 24) | 0xffffff;
break;
} else if(firstDigitPrimary == 0) {
// a group below digits
if(lastByte > 0x7f) {
// We only use 7 bits for the last byte of a below-digits group.
// This does not warrant an errorCode, but we do not build a fast Latin table.
return FALSE;
}
result.append((UChar)lastByte);
}
i = i + 2 + data.scripts[i + 1];
result.append(0); // reserve a slot for this group
}
firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT);
firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN);
lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN);
if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
// missing data
return FALSE;
}
return TRUE;
}
@ -187,23 +171,21 @@ CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const {
}
// Both or neither must be potentially-variable,
// so that we can test only one and determine if both are variable.
if(p >= firstDigitPrimary) {
return q >= firstDigitPrimary;
} else if(q >= firstDigitPrimary) {
uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
if(p > lastVariablePrimary) {
return q > lastVariablePrimary;
} else if(q > lastVariablePrimary) {
return FALSE;
}
// Both will be encoded with long mini primaries.
// They must be in the same special reordering group,
// so that we can test only one and determine if both are variable.
p >>= 24; // first primary byte
q >>= 24;
U_ASSERT(p != 0 && q != 0);
U_ASSERT(p <= result[headerLength - 1]); // the loop will terminate
for(int32_t i = 1;; ++i) {
uint32_t lastByte = result[i];
if(p <= lastByte) {
return q <= lastByte;
} else if(q <= lastByte) {
for(int32_t i = 0;; ++i) { // will terminate
uint32_t lastPrimary = lastSpecialPrimaries[i];
if(p <= lastPrimary) {
return q <= lastPrimary;
} else if(q <= lastPrimary) {
return FALSE;
}
}
@ -451,8 +433,8 @@ CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return FALSE;
}
int32_t group = 1;
uint32_t lastGroupByte = result[group];
int32_t group = 0;
uint32_t lastGroupPrimary = lastSpecialPrimaries[group];
// The lowest unique CE must be at least a secondary CE.
U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0);
uint32_t prevPrimary = 0;
@ -466,16 +448,15 @@ CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
// (uniqueCEs does not store case bits.)
uint32_t p = (uint32_t)(ce >> 32);
if(p != prevPrimary) {
uint32_t p1 = p >> 24;
while(p1 > lastGroupByte) {
while(p > lastGroupPrimary) {
U_ASSERT(pri <= CollationFastLatin::MAX_LONG);
// Add the last "long primary" in or before the group
// into the upper 9 bits of the group entry.
result.setCharAt(group, (UChar)((pri << 4) | lastGroupByte));
if(++group < headerLength) { // group is 1-based
lastGroupByte = result[group];
// Set the group's header entry to the
// last "long primary" in or before the group.
result.setCharAt(1 + group, (UChar)pri);
if(++group < NUM_SPECIAL_GROUPS) {
lastGroupPrimary = lastSpecialPrimaries[group];
} else {
lastGroupByte = 0xff;
lastGroupPrimary = 0xffffffff;
break;
}
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationfastlatinbuilder.h
@ -39,6 +39,9 @@ public:
int32_t lengthOfTable() const { return result.length(); }
private:
// space, punct, symbol, currency (not digit)
enum { NUM_SPECIAL_GROUPS = UCOL_REORDER_CODE_CURRENCY - UCOL_REORDER_CODE_FIRST + 1 };
UBool loadGroups(const CollationData &data, UErrorCode &errorCode);
UBool inSameGroup(uint32_t p, uint32_t q) const;
@ -73,7 +76,8 @@ private:
/** One 16-bit mini CE per unique CE. */
uint16_t *miniCEs;
// These are constant for a given list of CollationData.scripts.
// These are constant for a given root collator.
uint32_t lastSpecialPrimaries[NUM_SPECIAL_GROUPS];
uint32_t firstDigitPrimary;
uint32_t firstLatinPrimary;
uint32_t lastLatinPrimary;

View file

@ -1,5 +1,5 @@
/*
* Copyright (C) 1999-2014, International Business Machines
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
* file name: collationfcd.cpp

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationkeys.cpp
@ -246,7 +246,6 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
// +1 so that we can use "<" and primary ignorables test out early.
variableTop = settings.variableTop + 1;
}
const uint8_t *reorderTable = settings.reorderTable;
uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
@ -255,7 +254,7 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
SortKeyLevel tertiaries;
SortKeyLevel quaternaries;
uint32_t compressedP1 = 0; // 0==no compression; otherwise reordered compressible lead byte
uint32_t prevReorderedPrimary = 0; // 0==no compression
int32_t commonCases = 0;
int32_t commonSecondaries = 0;
int32_t commonTertiaries = 0;
@ -284,14 +283,15 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
}
do {
if((levels & Collation::QUATERNARY_LEVEL_FLAG) != 0) {
uint32_t p1 = p >> 24;
if(reorderTable != NULL) { p1 = reorderTable[p1]; }
if(p1 >= QUAT_SHIFTED_LIMIT_BYTE) {
if(settings.hasReordering()) {
p = settings.reorder(p);
}
if((p >> 24) >= QUAT_SHIFTED_LIMIT_BYTE) {
// Prevent shifted primary lead bytes from
// overlapping with the common compression range.
quaternaries.appendByte(QUAT_SHIFTED_LIMIT_BYTE);
}
quaternaries.appendWeight32((p1 << 24) | (p & 0xffffff));
quaternaries.appendWeight32(p);
}
do {
ce = iter.nextCE(errorCode);
@ -304,11 +304,15 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
// If ce==NO_CE, then write nothing for the primary level but
// terminate compression on all levels and then exit the loop.
if(p > Collation::NO_CE_PRIMARY && (levels & Collation::PRIMARY_LEVEL_FLAG) != 0) {
// Test the un-reordered primary for compressibility.
UBool isCompressible = compressibleBytes[p >> 24];
if(settings.hasReordering()) {
p = settings.reorder(p);
}
uint32_t p1 = p >> 24;
if(reorderTable != NULL) { p1 = reorderTable[p1]; }
if(p1 != compressedP1) {
if(compressedP1 != 0) {
if(p1 < compressedP1) {
if(!isCompressible || p1 != (prevReorderedPrimary >> 24)) {
if(prevReorderedPrimary != 0) {
if(p < prevReorderedPrimary) {
// No primary compression terminator
// at the end of the level or merged segment.
if(p1 > Collation::MERGE_SEPARATOR_BYTE) {
@ -319,12 +323,10 @@ CollationKeys::writeSortKeyUpToQuaternary(CollationIterator &iter,
}
}
sink.Append(p1);
// Test the un-reordered lead byte for compressibility but
// remember the reordered lead byte.
if(compressibleBytes[p >> 24]) {
compressedP1 = p1;
if(isCompressible) {
prevReorderedPrimary = p;
} else {
compressedP1 = 0;
prevReorderedPrimary = 0;
}
}
char p2 = (char)(p >> 16);

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationruleparser.cpp
@ -706,17 +706,7 @@ CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &error
if(U_FAILURE(errorCode)) { return; }
i = limit;
}
int32_t length = reorderCodes.size();
if(length == 1 && reorderCodes.elementAti(0) == UCOL_REORDER_CODE_NONE) {
settings->resetReordering();
return;
}
uint8_t table[256];
baseData->makeReorderTable(reorderCodes.getBuffer(), length, table, errorCode);
if(U_FAILURE(errorCode)) { return; }
if(!settings->setReordering(reorderCodes.getBuffer(), length, table)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
}
settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
}
static const char *const gSpecialReorderCodes[] = {

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationsettings.cpp
@ -16,10 +16,12 @@
#include "unicode/ucol.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationsettings.h"
#include "sharedobject.h"
#include "uassert.h"
#include "umutex.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
@ -27,19 +29,12 @@ CollationSettings::CollationSettings(const CollationSettings &other)
: SharedObject(other),
options(other.options), variableTop(other.variableTop),
reorderTable(NULL),
minHighNoReorder(other.minHighNoReorder),
reorderRanges(NULL), reorderRangesLength(0),
reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
fastLatinOptions(other.fastLatinOptions) {
int32_t length = other.reorderCodesLength;
if(length == 0) {
U_ASSERT(other.reorderTable == NULL);
} else {
U_ASSERT(other.reorderTable != NULL);
if(other.reorderCodesCapacity == 0) {
aliasReordering(other.reorderCodes, length, other.reorderTable);
} else {
setReordering(other.reorderCodes, length, other.reorderTable);
}
}
UErrorCode errorCode = U_ZERO_ERROR;
copyReorderingFrom(other, errorCode);
if(fastLatinOptions >= 0) {
uprv_memcpy(fastLatinPrimaries, other.fastLatinPrimaries, sizeof(fastLatinPrimaries));
}
@ -79,14 +74,22 @@ CollationSettings::resetReordering() {
// rather than a no-op permutation.
// Keep the memory via reorderCodes and its capacity.
reorderTable = NULL;
minHighNoReorder = 0;
reorderRangesLength = 0;
reorderCodesLength = 0;
}
void
CollationSettings::aliasReordering(const int32_t *codes, int32_t length, const uint8_t *table) {
if(length == 0) {
resetReordering();
} else {
CollationSettings::aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
const uint32_t *ranges, int32_t rangesLength,
const uint8_t *table, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
if(table != NULL &&
(rangesLength == 0 ?
!reorderTableHasSplitBytes(table) :
rangesLength >= 2 &&
// The first offset must be 0. The last offset must not be 0.
(ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0)) {
// We need to release the memory before setting the alias pointer.
if(reorderCodesCapacity != 0) {
uprv_free(const_cast<int32_t *>(reorderCodes));
@ -95,36 +98,170 @@ CollationSettings::aliasReordering(const int32_t *codes, int32_t length, const u
reorderTable = table;
reorderCodes = codes;
reorderCodesLength = length;
// Drop ranges before the first split byte. They are reordered by the table.
// This then speeds up reordering of the remaining ranges.
int32_t firstSplitByteRangeIndex = 0;
while(firstSplitByteRangeIndex < rangesLength &&
(ranges[firstSplitByteRangeIndex] & 0xff0000) == 0) {
// The second byte of the primary limit is 0.
++firstSplitByteRangeIndex;
}
if(firstSplitByteRangeIndex == rangesLength) {
U_ASSERT(!reorderTableHasSplitBytes(table));
minHighNoReorder = 0;
reorderRanges = NULL;
reorderRangesLength = 0;
} else {
U_ASSERT(table[ranges[firstSplitByteRangeIndex] >> 24] == 0);
minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000;
reorderRanges = ranges + firstSplitByteRangeIndex;
reorderRangesLength = rangesLength - firstSplitByteRangeIndex;
}
return;
}
// Regenerate missing data.
setReordering(data, codes, length, errorCode);
}
/**
 * Computes and stores the script reordering data for a list of reorder codes.
 *
 * An empty list, or the single code UCOL_REORDER_CODE_NONE, turns reordering off.
 * Otherwise data.makeReorderRanges() is asked for the (limit, offset) pairs,
 * a 256-entry lead byte permutation table is derived from them,
 * and the codes, the remaining ranges and the table are copied into this object
 * via setReorderArrays().
 *
 * @param data        collation data that generates the reorder ranges
 * @param codes       reorder codes
 * @param codesLength number of reorder codes
 * @param errorCode   in/out error code; nothing is done if it already indicates a failure.
 *                    setReorderArrays() sets U_MEMORY_ALLOCATION_ERROR if it cannot allocate.
 */
void
CollationSettings::setReordering(const CollationData &data,
                                 const int32_t *codes, int32_t codesLength,
                                 UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(codesLength == 0 || (codesLength == 1 && codes[0] == UCOL_REORDER_CODE_NONE)) {
        resetReordering();
        return;
    }
    UVector32 rangesList(errorCode);
    data.makeReorderRanges(codes, codesLength, rangesList, errorCode);
    if(U_FAILURE(errorCode)) { return; }
    int32_t rangesLength = rangesList.size();
    if(rangesLength == 0) {
        // The codes do not change the default order.
        resetReordering();
        return;
    }
    const uint32_t *ranges = reinterpret_cast<uint32_t *>(rangesList.getBuffer());
    // ranges[] contains at least two (limit, offset) pairs.
    // The first offset must be 0. The last offset must not be 0.
    // Separators (at the low end) and trailing weights (at the high end)
    // are never reordered.
    U_ASSERT(rangesLength >= 2);
    U_ASSERT((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);

    // Write the lead byte permutation table.
    // Set a 0 for each lead byte that has a range boundary in the middle.
    uint8_t table[256];
    int32_t b = 0;
    int32_t firstSplitByteRangeIndex = -1;
    for(int32_t i = 0; i < rangesLength; ++i) {
        uint32_t pair = ranges[i];
        int32_t limit1 = (int32_t)(pair >> 24);
        while(b < limit1) {
            // Only the low 8 bits of the sum matter: lead byte + offset.
            table[b] = (uint8_t)(b + pair);
            ++b;
        }
        // Check the second byte of the limit.
        if((pair & 0xff0000) != 0) {
            table[limit1] = 0;
            b = limit1 + 1;
            if(firstSplitByteRangeIndex < 0) {
                firstSplitByteRangeIndex = i;
            }
        }
    }
    // Lead bytes at and above the last limit are not reordered.
    while(b <= 0xff) {
        table[b] = (uint8_t)b;
        ++b;
    }
    if(firstSplitByteRangeIndex < 0) {
        // The lead byte permutation table alone suffices for reordering.
        // No ranges are stored, so minHighNoReorder must be 0 (as in aliasReordering()):
        // reorderEx() then returns its input rather than scanning an empty range list.
        minHighNoReorder = 0;
        rangesLength = 0;
    } else {
        // Limit of the last reordered range; taken before the leading ranges are dropped.
        minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000;
        // Remove the ranges below the first split byte.
        ranges += firstSplitByteRangeIndex;
        rangesLength -= firstSplitByteRangeIndex;
    }
    setReorderArrays(codes, codesLength, ranges, rangesLength, table, errorCode);
}
/**
 * Copies the reorder codes, the reorder ranges and the 256-byte lead byte table
 * into one memory block owned by this object, in that order,
 * with the table starting right after reorderCodesCapacity ints.
 * The current block is reused if it has room for the codes plus the ranges;
 * otherwise a larger one is allocated and the old one is freed.
 * On allocation failure the reordering is reset and
 * errorCode is set to U_MEMORY_ALLOCATION_ERROR.
 */
void
CollationSettings::setReorderArrays(const int32_t *codes, int32_t codesLength,
                                    const uint32_t *ranges, int32_t rangesLength,
                                    const uint8_t *table, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    const int32_t numInts = codesLength + rangesLength;
    U_ASSERT(numInts > 0);
    if(numInts > reorderCodesCapacity) {
        // Allocate one memory block for the codes, the ranges, and the 16-aligned table.
        const int32_t newCapacity = (numInts + 3) & ~3;  // round up to a multiple of 4 ints
        int32_t *block = (int32_t *)uprv_malloc(newCapacity * 4 + 256);
        if(block == NULL) {
            resetReordering();
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        if(reorderCodesCapacity != 0) {
            uprv_free(const_cast<int32_t *>(reorderCodes));
        }
        reorderCodes = block;
        reorderCodesCapacity = newCapacity;
    }
    // From here on reorderCodes is the block that receives the data.
    int32_t *codesDest = const_cast<int32_t *>(reorderCodes);
    int32_t *rangesDest = codesDest + codesLength;
    int32_t *tableDest = codesDest + reorderCodesCapacity;
    uprv_memcpy(tableDest, table, 256);
    uprv_memcpy(codesDest, codes, codesLength * 4);
    uprv_memcpy(rangesDest, ranges, rangesLength * 4);
    reorderTable = reinterpret_cast<const uint8_t *>(tableDest);
    reorderCodesLength = codesLength;
    reorderRanges = reinterpret_cast<uint32_t *>(rangesDest);
    reorderRangesLength = rangesLength;
}
void
CollationSettings::copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
if(!other.hasReordering()) {
resetReordering();
return;
}
minHighNoReorder = other.minHighNoReorder;
if(other.reorderCodesCapacity == 0) {
// The reorder arrays are aliased to memory-mapped data.
reorderTable = other.reorderTable;
reorderRanges = other.reorderRanges;
reorderRangesLength = other.reorderRangesLength;
reorderCodes = other.reorderCodes;
reorderCodesLength = other.reorderCodesLength;
} else {
setReorderArrays(other.reorderCodes, other.reorderCodesLength,
other.reorderRanges, other.reorderRangesLength,
other.reorderTable, errorCode);
}
}
UBool
CollationSettings::setReordering(const int32_t *codes, int32_t length, const uint8_t table[256]) {
if(length == 0) {
resetReordering();
} else {
uint8_t *ownedTable;
int32_t *ownedCodes;
if(length <= reorderCodesCapacity) {
ownedTable = const_cast<uint8_t *>(reorderTable);
ownedCodes = const_cast<int32_t *>(reorderCodes);
} else {
// Allocate one memory block for the codes and the 16-aligned table.
int32_t capacity = (length + 3) & ~3; // round up to a multiple of 4 ints
uint8_t *bytes = (uint8_t *)uprv_malloc(256 + capacity * 4);
if(bytes == NULL) { return FALSE; }
if(reorderCodesCapacity != 0) {
uprv_free(const_cast<int32_t *>(reorderCodes));
}
reorderTable = ownedTable = bytes + capacity * 4;
reorderCodes = ownedCodes = (int32_t *)bytes;
reorderCodesCapacity = capacity;
CollationSettings::reorderTableHasSplitBytes(const uint8_t table[256]) {
U_ASSERT(table[0] == 0);
for(int32_t i = 1; i < 256; ++i) {
if(table[i] == 0) {
return TRUE;
}
uprv_memcpy(ownedTable, table, 256);
uprv_memcpy(ownedCodes, codes, length * 4);
reorderCodesLength = length;
}
return TRUE;
return FALSE;
}
/**
 * Reorders a primary weight by looking up its (limit, offset) pair in reorderRanges.
 * Primaries at or above minHighNoReorder are returned unchanged.
 */
uint32_t
CollationSettings::reorderEx(uint32_t p) const {
    if(p >= minHighNoReorder) { return p; }
    // Fill the lower 16 bits of p with ones so that they are >= any offset bits;
    // the result can then be compared directly with whole (limit, offset) pairs.
    const uint32_t roundedUp = p | 0xffff;
    int32_t index = 0;
    // Find the first pair that is greater than the rounded-up primary.
    while(roundedUp >= reorderRanges[index]) { ++index; }
    const uint32_t pair = reorderRanges[index];
    // The shift keeps only the low byte of the offset and adds it to the lead byte.
    return p + (pair << 24);
}
void

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationsettings.h
@ -23,6 +23,8 @@
U_NAMESPACE_BEGIN
struct CollationData;
/**
* Collation settings/options/attributes.
* These are the values that can be changed via API.
@ -103,6 +105,8 @@ struct U_I18N_API CollationSettings : public SharedObject {
(MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
variableTop(0),
reorderTable(NULL),
minHighNoReorder(0),
reorderRanges(NULL), reorderRangesLength(0),
reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
fastLatinOptions(-1) {}
@ -118,8 +122,23 @@ struct U_I18N_API CollationSettings : public SharedObject {
int32_t hashCode() const;
void resetReordering();
void aliasReordering(const int32_t *codes, int32_t length, const uint8_t *table);
UBool setReordering(const int32_t *codes, int32_t length, const uint8_t table[256]);
void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
const uint32_t *ranges, int32_t rangesLength,
const uint8_t *table, UErrorCode &errorCode);
void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
UErrorCode &errorCode);
void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
/** Returns whether a reordering is set, that is, whether reorderTable is not NULL. */
inline UBool hasReordering() const { return reorderTable != NULL; }
static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
/**
 * Reorders the primary weight p.
 * The lead byte is mapped through reorderTable. A 0 table entry for a primary
 * above Collation::NO_CE_PRIMARY is resolved by reorderEx(p) instead.
 * Reads reorderTable without a NULL check.
 */
inline uint32_t reorder(uint32_t p) const {
    const uint8_t mappedLead = reorderTable[p >> 24];
    if(mappedLead == 0 && p > Collation::NO_CE_PRIMARY) {
        return reorderEx(p);
    }
    // Replace the lead byte and keep the lower three bytes.
    return ((uint32_t)mappedLead << 24) | (p & 0xffffff);
}
void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
@ -194,23 +213,57 @@ struct U_I18N_API CollationSettings : public SharedObject {
int32_t options;
/** Variable-top primary weight. */
uint32_t variableTop;
/** 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering. */
/**
* 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
* A 0 entry at a non-zero index means that the primary lead byte is "split"
* (there are different offsets for primaries that share that lead byte)
* and the reordering offset must be determined via the reorderRanges.
*/
const uint8_t *reorderTable;
/** Limit of last reordered range. 0 if no reordering or no split bytes. */
uint32_t minHighNoReorder;
/**
* Primary-weight ranges for script reordering,
* to be used by reorder(p) for split-reordered primary lead bytes.
*
* Each entry is a (limit, offset) pair.
* The upper 16 bits of the entry are the upper 16 bits of the
* exclusive primary limit of a range.
* Primaries between the previous limit and this one have their lead bytes
* modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
*
* CollationData::makeReorderRanges() writes a full list where the first range
* (at least for terminators and separators) has a 0 offset.
* The last range has a non-zero offset.
* minHighNoReorder is set to the limit of that last range.
*
* In the settings object, the initial ranges before the first split lead byte
* are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
* If there are no split-reordered lead bytes, then no ranges are needed.
*/
const uint32_t *reorderRanges;
int32_t reorderRangesLength;
/** Array of reorder codes; ignored if reorderCodesLength == 0. */
const int32_t *reorderCodes;
/** Number of reorder codes; 0 if no reordering. */
int32_t reorderCodesLength;
/**
* Capacity of reorderCodes.
* If 0, then the table and codes are aliases.
* If 0, then the codes, the ranges, and the table are aliases.
* Otherwise, this object owns the memory via the reorderCodes pointer;
* the table and the codes are in the same memory block, with the codes first.
* the codes, the ranges, and the table are in the same memory block, in that order.
*/
int32_t reorderCodesCapacity;
/** Options for CollationFastLatin. Negative if disabled. */
int32_t fastLatinOptions;
uint16_t fastLatinPrimaries[0x180];
private:
void setReorderArrays(const int32_t *codes, int32_t codesLength,
const uint32_t *ranges, int32_t rangesLength,
const uint8_t *table, UErrorCode &errorCode);
uint32_t reorderEx(uint32_t p) const;
};
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationtailoring.cpp
@ -40,6 +40,7 @@ CollationTailoring::CollationTailoring(const CollationSettings *baseSettings)
if(baseSettings != NULL) {
U_ASSERT(baseSettings->reorderCodesLength == 0);
U_ASSERT(baseSettings->reorderTable == NULL);
U_ASSERT(baseSettings->minHighNoReorder == 0);
} else {
settings = new CollationSettings();
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* rulebasedcollator.cpp
@ -673,9 +673,7 @@ RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
ownedSettings->aliasReordering(defaultSettings.reorderCodes,
defaultSettings.reorderCodesLength,
defaultSettings.reorderTable);
ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
setFastLatinOptions(*ownedSettings);
}
return;
@ -685,17 +683,7 @@ RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
if(length == 0) {
ownedSettings->resetReordering();
} else {
uint8_t reorderTable[256];
data->makeReorderTable(reorderCodes, length, reorderTable, errorCode);
if(U_FAILURE(errorCode)) { return; }
if(!ownedSettings->setReordering(reorderCodes, length, reorderTable)) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
setFastLatinOptions(*ownedSettings);
}

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1996-2014, International Business Machines
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
*/
@ -607,7 +607,7 @@ public:
* Retrieves the reordering codes for this collator.
* @param dest The array to fill with the script ordering.
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
* will only return the length of the result without writing any of the result string (pre-flighting).
* will only return the length of the result without writing any codes (pre-flighting).
* @param status A reference to an error code value, which must not indicate
* a failure before the function call.
* @return The length of the script ordering array.
@ -630,6 +630,7 @@ public:
* length is also set to 0. An empty array will clear any reordering codes on the collator.
* @param reorderCodesLength The length of reorderCodes.
* @param status error code
* @see ucol_setReorderCodes
* @see Collator#getReorderCodes
* @see Collator#getEquivalentReorderCodes
* @see UScriptCode
@ -643,11 +644,13 @@ public:
/**
* Retrieves the reorder codes that are grouped with the given reorder code. Some reorder
* codes will be grouped and must reorder together.
* Beginning with ICU 55, scripts only reorder together if they are primary-equal,
* for example Hiragana and Katakana.
*
* @param reorderCode The reorder code to determine equivalence for.
* @param dest The array to fill with the script equivalence reordering codes.
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the
* function will only return the length of the result without writing any of the result
* string (pre-flighting).
* function will only return the length of the result without writing any codes (pre-flighting).
* @param status A reference to an error code value, which must not indicate
* a failure before the function call.
* @return The length of the reordering code equivalence array.

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* Copyright (C) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/
@ -651,7 +651,7 @@ public:
* Retrieves the reordering codes for this collator.
* @param dest The array to fill with the script ordering.
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
* will only return the length of the result without writing any of the result string (pre-flighting).
* will only return the length of the result without writing any codes (pre-flighting).
* @param status A reference to an error code value, which must not indicate
* a failure before the function call.
* @return The length of the script ordering array.
@ -670,6 +670,7 @@ public:
* length is also set to 0. An empty array will clear any reordering codes on the collator.
* @param reorderCodesLength The length of reorderCodes.
* @param status error code
* @see ucol_setReorderCodes
* @see Collator#getReorderCodes
* @see Collator#getEquivalentReorderCodes
* @stable ICU 4.8

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (c) 1996-2014, International Business Machines Corporation and others.
* Copyright (c) 1996-2015, International Business Machines Corporation and others.
* All Rights Reserved.
*******************************************************************************
*/
@ -685,7 +685,7 @@ ucol_setStrength(UCollator *coll,
* @param coll The UCollator to query.
* @param dest The array to fill with the script ordering.
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
* will only return the length of the result without writing any of the result string (pre-flighting).
* will only return the length of the result without writing any codes (pre-flighting).
* @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a
* failure before the function call.
* @return The number of reordering codes written to the dest array.
@ -702,27 +702,32 @@ ucol_getReorderCodes(const UCollator* coll,
UErrorCode *pErrorCode);
/**
* Sets the reordering codes for this collator.
* Collation reordering allows scripts and some other defined blocks of characters
* to be moved relative to each other as a block. This reordering is done on top of
* Collation reordering allows scripts and some other groups of characters
* to be moved relative to each other. This reordering is done on top of
* the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
* at the start and/or the end of the collation order. These groups are specified using
* UScript codes and UColReorderCode entries.
*
* <p>By default, reordering codes specified for the start of the order are placed in the
* order given after a group of "special" non-script blocks. These special groups of characters
* order given after several special non-script blocks. These special groups of characters
* are space, punctuation, symbol, currency, and digit. These special groups are represented with
* UColReorderCode entries. Script groups can be intermingled with
* these special non-script blocks if those special blocks are explicitly specified in the reordering.
* these special non-script groups if those special groups are explicitly specified in the reordering.
*
* <p>The special code OTHERS stands for any script that is not explicitly
* mentioned in the list of reordering codes given. Anything that is after OTHERS
* will go at the very end of the reordering in the order given.
*
* <p>The special reorder code DEFAULT will reset the reordering for this collator
* to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
* was specified when this collator was created from resource data or from rules. The
* DEFAULT code <b>must</b> be the sole code supplied when it used. If not
* that will result in a U_ILLEGAL_ARGUMENT_ERROR being set.
* DEFAULT code <b>must</b> be the sole code supplied when it is used.
* If not, then U_ILLEGAL_ARGUMENT_ERROR will be set.
*
* <p>The special reorder code NONE will remove any reordering for this collator.
* The result of setting no reordering will be to have the DUCET/CLDR ordering used. The
* NONE code <b>must</b> be the sole code supplied when it used.
* NONE code <b>must</b> be the sole code supplied when it is used.
*
* @param coll The UCollator to set.
* @param reorderCodes An array of script codes in the new order. This can be NULL if the
* length is also set to 0. An empty array will clear any reordering codes on the collator.
@ -744,10 +749,13 @@ ucol_setReorderCodes(UCollator* coll,
/**
* Retrieves the reorder codes that are grouped with the given reorder code. Some reorder
* codes will be grouped and must reorder together.
* Beginning with ICU 55, scripts only reorder together if they are primary-equal,
* for example Hiragana and Katakana.
*
* @param reorderCode The reorder code to determine equivalence for.
* @param dest The array to fill with the script ordering.
* @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
* will only return the length of the result without writing any of the result string (pre-flighting).
* will only return the length of the result without writing any codes (pre-flighting).
* @param pErrorCode Must be a valid pointer to an error code value, which must not indicate
* a failure before the function call.
* @return The number of reordering codes written to the dest array.

View file

@ -1,7 +1,7 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2001-2014, International Business Machines Corporation and
* Copyright (c) 2001-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*******************************************************************************
@ -4693,7 +4693,7 @@ static void TestReorderingAPI(void)
UErrorCode status = U_ZERO_ERROR;
UCollator *myCollation;
int32_t reorderCodes[3] = {USCRIPT_GREEK, USCRIPT_HAN, UCOL_REORDER_CODE_PUNCTUATION};
int32_t duplicateReorderCodes[] = {USCRIPT_CUNEIFORM, USCRIPT_GREEK, UCOL_REORDER_CODE_CURRENCY, USCRIPT_EGYPTIAN_HIEROGLYPHS};
int32_t duplicateReorderCodes[] = {USCRIPT_HIRAGANA, USCRIPT_GREEK, UCOL_REORDER_CODE_CURRENCY, USCRIPT_KATAKANA};
int32_t reorderCodesStartingWithDefault[] = {UCOL_REORDER_CODE_DEFAULT, USCRIPT_GREEK, USCRIPT_HAN, UCOL_REORDER_CODE_PUNCTUATION};
int32_t reorderCodeNone = UCOL_REORDER_CODE_NONE;
UCollationResult collResult;
@ -4952,86 +4952,117 @@ static UBool containsExpectedScript(const int32_t scripts[], int32_t length, int
}
static void TestEquivalentReorderingScripts(void) {
// Beginning with ICU 55, collation reordering moves single scripts
// rather than groups of scripts,
// except where scripts share a range and sort primary-equal.
UErrorCode status = U_ZERO_ERROR;
int32_t equivalentScripts[100];
int32_t length;
int i;
int32_t prevScript;
/* At least these scripts are expected to be equivalent. There may be more. */
/* These scripts are expected to be equivalent. */
static const int32_t expectedScripts[] = {
USCRIPT_BOPOMOFO,
USCRIPT_LISU,
USCRIPT_LYCIAN,
USCRIPT_CARIAN,
USCRIPT_LYDIAN,
USCRIPT_YI,
USCRIPT_OLD_ITALIC,
USCRIPT_GOTHIC,
USCRIPT_DESERET,
USCRIPT_SHAVIAN,
USCRIPT_OSMANYA,
USCRIPT_LINEAR_B,
USCRIPT_CYPRIOT,
USCRIPT_OLD_SOUTH_ARABIAN,
USCRIPT_AVESTAN,
USCRIPT_IMPERIAL_ARAMAIC,
USCRIPT_INSCRIPTIONAL_PARTHIAN,
USCRIPT_INSCRIPTIONAL_PAHLAVI,
USCRIPT_UGARITIC,
USCRIPT_OLD_PERSIAN,
USCRIPT_CUNEIFORM,
USCRIPT_EGYPTIAN_HIEROGLYPHS,
USCRIPT_PHONETIC_POLLARD,
USCRIPT_SORA_SOMPENG,
USCRIPT_MEROITIC_CURSIVE,
USCRIPT_MEROITIC_HIEROGLYPHS
USCRIPT_HIRAGANA,
USCRIPT_KATAKANA,
USCRIPT_KATAKANA_OR_HIRAGANA
};
/* UScript.GOTHIC */
equivalentScripts[0] = 0;
length = ucol_getEquivalentReorderCodes(
USCRIPT_GOTHIC, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status)) {
log_err_status(status, "ERROR/Gothic: retrieving equivalent reorder codes: %s\n", myErrorName(status));
return;
}
if (length < LEN(expectedScripts)) {
log_err("ERROR/Gothic: retrieved equivalent script length wrong: "
"expected at least %d, was = %d\n",
if (length != 1 || equivalentScripts[0] != USCRIPT_GOTHIC) {
log_err("ERROR/Gothic: retrieved equivalent scripts wrong: "
"length expected 1, was = %d; expected [%d] was [%d]\n",
length, USCRIPT_GOTHIC, equivalentScripts[0]);
}
length = ucol_getEquivalentReorderCodes(
USCRIPT_HIRAGANA, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status)) {
log_err_status(status, "ERROR/Hiragana: retrieving equivalent reorder codes: %s\n", myErrorName(status));
return;
}
if (length != LEN(expectedScripts)) {
log_err("ERROR/Hiragana: retrieved equivalent script length wrong: "
"expected %d, was = %d\n",
LEN(expectedScripts), length);
}
prevScript = -1;
for (i = 0; i < length; ++i) {
int32_t script = equivalentScripts[i];
if (script <= prevScript) {
log_err("ERROR/Gothic: equivalent scripts out of order at index %d\n", i);
log_err("ERROR/Hiragana: equivalent scripts out of order at index %d\n", i);
}
prevScript = script;
}
for (i = 0; i < LEN(expectedScripts); i++) {
if (!containsExpectedScript(equivalentScripts, length, expectedScripts[i])) {
log_err("ERROR/Gothic: equivalent scripts do not contain %d\n",
log_err("ERROR/Hiragana: equivalent scripts do not contain %d\n",
expectedScripts[i]);
}
}
/* UScript.SHAVIAN */
length = ucol_getEquivalentReorderCodes(
USCRIPT_SHAVIAN, equivalentScripts, LEN(equivalentScripts), &status);
USCRIPT_KATAKANA, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status)) {
log_err_status(status, "ERROR/Shavian: retrieving equivalent reorder codes: %s\n", myErrorName(status));
log_err_status(status, "ERROR/Katakana: retrieving equivalent reorder codes: %s\n", myErrorName(status));
return;
}
if (length < LEN(expectedScripts)) {
log_err("ERROR/Shavian: retrieved equivalent script length wrong: "
"expected at least %d, was = %d\n",
if (length != LEN(expectedScripts)) {
log_err("ERROR/Katakana: retrieved equivalent script length wrong: "
"expected %d, was = %d\n",
LEN(expectedScripts), length);
}
for (i = 0; i < LEN(expectedScripts); i++) {
if (!containsExpectedScript(equivalentScripts, length, expectedScripts[i])) {
log_err("ERROR/Shavian: equivalent scripts do not contain %d\n",
log_err("ERROR/Katakana: equivalent scripts do not contain %d\n",
expectedScripts[i]);
}
}
length = ucol_getEquivalentReorderCodes(
USCRIPT_KATAKANA_OR_HIRAGANA, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status) || length != LEN(expectedScripts)) {
log_err("ERROR/Hrkt: retrieved equivalent script length wrong: "
"expected %d, was = %d\n",
LEN(expectedScripts), length);
}
length = ucol_getEquivalentReorderCodes(
USCRIPT_HAN, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status) || length != 3) {
log_err("ERROR/Hani: retrieved equivalent script length wrong: "
"expected 3, was = %d\n", length);
}
length = ucol_getEquivalentReorderCodes(
USCRIPT_SIMPLIFIED_HAN, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status) || length != 3) {
log_err("ERROR/Hans: retrieved equivalent script length wrong: "
"expected 3, was = %d\n", length);
}
length = ucol_getEquivalentReorderCodes(
USCRIPT_TRADITIONAL_HAN, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status) || length != 3) {
log_err("ERROR/Hant: retrieved equivalent script length wrong: "
"expected 3, was = %d\n", length);
}
length = ucol_getEquivalentReorderCodes(
USCRIPT_MEROITIC_CURSIVE, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status) || length != 2) {
log_err("ERROR/Merc: retrieved equivalent script length wrong: "
"expected 2, was = %d\n", length);
}
length = ucol_getEquivalentReorderCodes(
USCRIPT_MEROITIC_HIEROGLYPHS, equivalentScripts, LEN(equivalentScripts), &status);
if (U_FAILURE(status) || length != 2) {
log_err("ERROR/Mero: retrieved equivalent script length wrong: "
"expected 2, was = %d\n", length);
}
}
static void TestReorderingAcrossCloning(void)

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2014, International Business Machines Corporation and
* Copyright (c) 1997-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
//===============================================================================
@ -1537,7 +1537,10 @@ void CollationAPITest::TestVariableTopSetting() {
status = U_ZERO_ERROR;
vt[0] = 0x24; // dollar sign (currency symbol)
uint32_t newVarTop = coll->setVariableTop(vt, 1, status);
if(U_FAILURE(status)) {
errln("setVariableTop(dollar sign) failed: %s", u_errorName(status));
return;
}
if(newVarTop != coll->getVariableTop(status)) {
errln("setVariableTop(dollar sign) != following getVariableTop()");
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Copyright (C) 2012-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationtest.cpp
@ -190,7 +190,7 @@ void CollationTest::TestImplicits() {
IcuTestErrorCode errorCode(*this, "TestImplicits");
const CollationData *cd = CollationRoot::getData(errorCode);
if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
return;
}

View file

@ -1,4 +1,4 @@
# Copyright (c) 2012-2014 International Business Machines
# Copyright (c) 2012-2015 International Business Machines
# Corporation and others. All Rights Reserved.
#
# This file should be in UTF-8 with a signature byte sequence ("BOM").
@ -2526,3 +2526,15 @@
<3 あ
<3 ァ
<1 い
** test: reorder single scripts not groups, ICU ticket 11449
@ root
% reorder Goth Latn
* compare
<1 4
<1 𐌰 # Gothic
<1 L
<1 Ω
# Before ICU 55, the following reordered together with Gothic.
<1 𐌈 # Old Italic
<1 𐑐 # Shavian

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1998-2014, International Business Machines
* Copyright (C) 1998-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -1069,6 +1069,11 @@ addCollation(ParseState* state, struct SResource *result, const char *collation
if(isVerbose()) {
printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
icu::CollationInfo::printSizes(totalSize, indexes);
if(t->settings->hasReordering()) {
printf("%s~%s collation reordering ranges:\n", state->filename, collationType);
icu::CollationInfo::printReorderRanges(
*t->data, t->settings->reorderCodes, t->settings->reorderCodesLength);
}
}
struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, NULL, NULL, status);
table_add(result, collationBin, line, status);

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationinfo.cpp
@ -16,9 +16,11 @@
#if !UCONFIG_NO_COLLATION
#include "collationdata.h"
#include "collationdatareader.h"
#include "collationinfo.h"
#include "uassert.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
@ -112,6 +114,37 @@ CollationInfo::getDataLength(const int32_t indexes[], int32_t startIndex) {
return indexes[startIndex + 1] - indexes[startIndex];
}
/**
 * Prints the reorder ranges that result from the given reorder codes to stdout.
 *
 * Calls data.makeReorderRanges() to build the ranges, then prints one line per range.
 * Each element of the resulting vector is decoded as:
 *   - bits 31..16: the exclusive limit of the range
 *   - bits 15..0:  a signed 16-bit offset
 * The inclusive start of each range is the limit of the previous one (0 for the first).
 * A range with offset 0 is printed as-is; otherwise the line also shows
 * the target range, which is the source range shifted by offset * 0x100.
 *
 * If building the ranges fails, an error line with the ICU error name is printed instead.
 *
 * @param data   collation data used to build the reorder ranges
 * @param codes  reorder codes, passed through to makeReorderRanges()
 * @param length number of entries in codes
 */
void
CollationInfo::printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length) {
    UErrorCode errorCode = U_ZERO_ERROR;
    UVector32 ranges(errorCode);
    data.makeReorderRanges(codes, length, ranges, errorCode);
    if(U_FAILURE(errorCode)) {
        printf(" error building reorder ranges: %s\n", u_errorName(errorCode));
        return;
    }

    int32_t start = 0;
    for(int32_t i = 0; i < ranges.size(); ++i) {
        int32_t pair = ranges.elementAti(i);
        int32_t limit = (pair >> 16) & 0xffff;
        int16_t offset = (int16_t)pair;
        // Scale the offset by multiplying rather than with (offset << 8):
        // left-shifting a negative signed value is undefined behavior before C++20.
        int32_t delta = (int32_t)offset * 0x100;
        if(offset == 0) {
            // [inclusive-start, exclusive-limit[
            printf(" [%04x, %04x[\n", start, limit);
        } else if(offset > 0) {
            printf(" reorder [%04x, %04x[ by offset %02x to [%04x, %04x[\n",
                    start, limit, offset,
                    start + delta, limit + delta);
        } else /* offset < 0 */ {
            // Print the magnitude; the minus sign is part of the format string.
            printf(" reorder [%04x, %04x[ by offset -%02x to [%04x, %04x[\n",
                    start, limit, -offset,
                    start + delta, limit + delta);
        }
        start = limit;
    }
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationinfo.h
@ -18,12 +18,15 @@
U_NAMESPACE_BEGIN
struct CollationData;
/**
* Collation-related code for tools & demos.
*/
class U_TOOLUTIL_API CollationInfo /* all static */ {
public:
static void printSizes(int32_t sizeWithHeader, const int32_t indexes[]);
static void printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length);
private:
CollationInfo(); // no constructor