ICU-11448 collation common tertiary weights for normal Hiragana, change builder to cope with below-common tertiary weights

X-SVN-Rev: 36906
This commit is contained in:
Markus Scherer 2014-12-27 00:38:34 +00:00
parent b3c29ef95e
commit 590a85cbf1
11 changed files with 943 additions and 749 deletions

File diff suppressed because it is too large Load diff

View file

@ -277,15 +277,22 @@ CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
// We will add it later, as part of the Han ranges.
uint32_t p = (uint32_t)(ce >> 32);
uint32_t secTer = (uint32_t)ce;
if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
if(firstHanPrimary <= p && p <= lastHanPrimary) {
if(firstHanPrimary <= p && p <= lastHanPrimary) {
if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
// buildRootElementsTable() does not currently handle this case.
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
} else {
// Check that secondary and tertiary weights are >= "common".
if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
return;
}
}
if(secTer != Collation::COMMON_SEC_AND_TER_CE) { // minor optimization
// Check that secondary and tertiary weights are > 01.
uint32_t s = secTer >> 16;
uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collation::COMMON_WEIGHT16)) {
if((s != 0 && s <= Collation::BEFORE_WEIGHT16) ||
(t != 0 && t <= Collation::BEFORE_WEIGHT16)) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
@ -337,14 +344,26 @@ CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
void
CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
// Limit sentinel for root elements.
// This allows us to reduce range checks at runtime.
rootElements.addElement(Collation::makeCE(CollationRootElements::PRIMARY_SENTINEL), errorCode);
if(U_FAILURE(errorCode)) { return; }
uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the last Han range.
uint32_t prevPrimary = 0; // Start with primary ignorable CEs.
UBool tryRange = FALSE;
UBool needCommonSecTerUnit = FALSE;
UBool hasDeltaUnit = FALSE;
for(int32_t i = 0; i < rootElements.size(); ++i) {
int64_t ce = rootElements.elementAti(i);
uint32_t p = (uint32_t)(ce >> 32);
uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
if((p != prevPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE) && needCommonSecTerUnit) {
// The last primary had low sec/ter weights but no common sec/ter combination.
// The next unit is either a new primary or an above-common sec/ter unit.
// Insert a common sec/ter unit so that the builder will reliably
// tailor to either before or after a common weight but not across it.
table.addElement((int32_t)Collation::COMMON_SEC_AND_TER_CE |
CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
}
if(p != prevPrimary) {
U_ASSERT((p & 0xff) == 0);
int32_t end;
@ -352,7 +371,7 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
// Add a Han primary weight or range.
// We omitted them initially, and omitted all CEs with Han primaries
// and common secondary/tertiary weights.
U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_AND_TER_CE);
U_ASSERT(p > lastHanPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE);
if(p == nextHanPrimary) {
// One single Han primary with non-common secondary/tertiary weights.
table.addElement((int32_t)p, errorCode);
@ -370,6 +389,7 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
// nextHanPrimary == lastHanPrimary < p
// We just wrote the single last Han primary.
nextHanPrimary = 0xffffffff;
table.addElement((int32_t)p, errorCode);
} else if(p < lastHanPrimary) {
// nextHanPrimary < p < lastHanPrimary
// End the Han range on p, prepare for the next range.
@ -388,7 +408,14 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
table.addElement((int32_t)p, errorCode);
}
}
} else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE &&
} else if(prevPrimary != 0 &&
// If there has not been an intervening delta unit,
// then we will try to combine the previous primary and
// the next several primaries into a range.
!hasDeltaUnit &&
// Might get a range with more than two primaries if the current CE
// has common sec/ter weights.
secTer == Collation::COMMON_SEC_AND_TER_CE &&
(end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
// Multiple CEs with only common secondary/tertiary weights were
// combined into a primary range.
@ -402,22 +429,24 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
table.addElement((int32_t)p, errorCode);
}
prevPrimary = p;
needCommonSecTerUnit = FALSE;
hasDeltaUnit = FALSE;
}
if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
if(secTer == Collation::COMMON_SEC_AND_TER_CE && !needCommonSecTerUnit) {
// The common secondary/tertiary weights are implied in the primary unit.
// If there is no intervening delta unit, then we will try to combine
// the next several primaries into a range.
tryRange = TRUE;
} else {
if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
// Remember to not suppress a common sec/ter unit if p!=0.
needCommonSecTerUnit = p != 0;
} else if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
// Real common sec/ter unit, no need to insert an artificial one.
needCommonSecTerUnit = FALSE;
}
// For each new set of secondary/tertiary weights we write a delta unit.
table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
tryRange = FALSE;
hasDeltaUnit = TRUE;
}
}
// Limit sentinel for root elements.
// This allows us to reduce range checks at runtime.
table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode);
}
int32_t

View file

@ -313,7 +313,7 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
if(U_FAILURE(errorCode)) { return; }
int64_t node = nodes.elementAti(index);
// If the index is for a "weaker" tailored node,
// If the index is for a "weaker" node,
// then skip backwards over this and further "weaker" nodes.
while(strengthFromNode(node) > strength) {
index = previousIndexFromNode(node);
@ -360,6 +360,8 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
if(strength >= UCOL_TERTIARY) {
index = findCommonNode(index, UCOL_TERTIARY);
}
// findCommonNode() stayed on the stronger node or moved to
// an explicit common-weight node of the reset-before strength.
node = nodes.elementAti(index);
if(strengthFromNode(node) == strength) {
// Found a same-strength node with an explicit weight.
@ -373,89 +375,45 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
}
return;
}
U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16);
U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);
// Reset to just before this node.
// Insert the preceding same-level explicit weight if it is not there already.
// Which explicit weight immediately precedes this one?
weight16 = getWeight16Before(index, node, strength);
// Does this preceding weight have a node?
uint32_t previousWeight16;
int32_t previousIndex = previousIndexFromNode(node);
if(weight16 == Collation::COMMON_WEIGHT16) {
// Reset to just before this same-strength common-weight node.
for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
node = nodes.elementAti(i);
int32_t previousStrength = strengthFromNode(node);
if(previousStrength < strength) {
U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);
// Either the reset element has an above-common weight and
// the parent node provides the implied common weight,
// or the reset element has a weight<=common in the node
// right after the parent, and we need to insert the preceding weight.
previousWeight16 = Collation::COMMON_WEIGHT16;
break;
} else if(previousStrength == strength && !isTailoredNode(node)) {
previousWeight16 = weight16FromNode(node);
break;
}
// Skip weaker nodes and same-level tailored nodes.
}
if(previousWeight16 == weight16) {
// The preceding weight has a node,
// maybe with following weaker or tailored nodes.
// Reset to the last of them.
index = previousIndex;
} else {
// A non-common weight is only possible from a root CE.
// Find the higher-level weights, which must all be explicit,
// and then find the preceding weight for this level.
uint32_t previousWeight16 = 0;
int32_t previousWeightIndex = -1;
int32_t i = index;
if(strength == UCOL_SECONDARY) {
uint32_t p;
do {
i = previousIndexFromNode(node);
node = nodes.elementAti(i);
if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) &&
previousWeightIndex < 0) {
previousWeightIndex = i;
previousWeight16 = weight16FromNode(node);
}
} while(strengthFromNode(node) > UCOL_PRIMARY);
U_ASSERT(!isTailoredNode(node));
p = weight32FromNode(node);
weight16 = rootElements.getSecondaryBefore(p, weight16);
} else {
uint32_t p, s;
do {
i = previousIndexFromNode(node);
node = nodes.elementAti(i);
if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) &&
previousWeightIndex < 0) {
previousWeightIndex = i;
previousWeight16 = weight16FromNode(node);
}
} while(strengthFromNode(node) > UCOL_SECONDARY);
U_ASSERT(!isTailoredNode(node));
if(strengthFromNode(node) == UCOL_SECONDARY) {
s = weight16FromNode(node);
do {
i = previousIndexFromNode(node);
node = nodes.elementAti(i);
} while(strengthFromNode(node) > UCOL_PRIMARY);
U_ASSERT(!isTailoredNode(node));
} else {
U_ASSERT(!nodeHasBefore2(node));
s = Collation::COMMON_WEIGHT16;
}
p = weight32FromNode(node);
weight16 = rootElements.getTertiaryBefore(p, s, weight16);
U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
}
// Find or insert the new explicit weight before the current one.
if(previousWeightIndex >= 0 && weight16 == previousWeight16) {
// Tailor after the last node between adjacent root nodes.
index = previousIndex;
} else {
node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
index = insertNodeBetween(previousIndex, index, node, errorCode);
}
// Insert a node with the preceding weight, reset to that.
node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
index = insertNodeBetween(previousIndex, index, node, errorCode);
}
} else {
// Found a stronger node with implied strength-common weight.
int64_t hasBefore3 = 0;
if(strength == UCOL_SECONDARY) {
U_ASSERT(!nodeHasBefore2(node));
// Move the HAS_BEFORE3 flag from the parent node
// to the new secondary common node.
hasBefore3 = node & HAS_BEFORE3;
node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2;
} else {
U_ASSERT(!nodeHasBefore3(node));
node |= HAS_BEFORE3;
}
nodes.setElementAt(node, index);
int32_t nextIndex = nextIndexFromNode(node);
// Insert default nodes with weights 01 and 05, reset to the 01 node.
node = nodeFromWeight16(Collation::BEFORE_WEIGHT16) | nodeFromStrength(strength);
index = insertNodeBetween(index, nextIndex, node, errorCode);
node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 |
nodeFromStrength(strength);
insertNodeBetween(index, nextIndex, node, errorCode);
uint32_t weight16 = getWeight16Before(index, node, strength);
index = findOrInsertWeakNode(index, weight16, strength, errorCode);
}
// Strength of the temporary CE = strength of its reset position.
// Code above raises an error if the before-strength is stronger.
@ -468,6 +426,49 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
}
/**
 * Returns the secondary or tertiary weight that precedes the weight of
 * the given node (node == nodes[index]) on the requested level.
 *
 * If the node belongs to a root CE [p, s, t], the preceding weight is looked up
 * in the root elements. If a tailored node is found while walking back to the
 * secondary or primary ancestor, BEFORE_WEIGHT16 is returned instead.
 *
 * @param index index of the node in the nodes list
 * @param node  the node at that index
 * @param level UCOL_SECONDARY or UCOL_TERTIARY
 * @return the preceding weight for the requested level
 */
uint32_t
CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
    U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));

    // Tertiary weight of the starting node: explicit on a tertiary node,
    // otherwise the common weight implied by a stronger node.
    const uint32_t tertiary = (strengthFromNode(node) == UCOL_TERTIARY) ?
            weight16FromNode(node) : Collation::COMMON_WEIGHT16;

    // Walk backwards to the nearest node that is secondary or stronger.
    while(strengthFromNode(node) > UCOL_SECONDARY) {
        index = previousIndexFromNode(node);
        node = nodes.elementAti(index);
    }
    if(isTailoredNode(node)) {
        // Not a root CE: use the low non-primary boundary for a tailored CE.
        return Collation::BEFORE_WEIGHT16;
    }

    // Secondary weight: explicit on a secondary node,
    // otherwise the common weight implied by the primary node.
    const uint32_t secondary = (strengthFromNode(node) == UCOL_SECONDARY) ?
            weight16FromNode(node) : Collation::COMMON_WEIGHT16;

    // Continue backwards to the primary node.
    while(strengthFromNode(node) > UCOL_PRIMARY) {
        index = previousIndexFromNode(node);
        node = nodes.elementAti(index);
    }
    if(isTailoredNode(node)) {
        // Tailored primary: again not a root CE.
        return Collation::BEFORE_WEIGHT16;
    }

    // [primary, secondary, tertiary] is a root CE.
    // Ask the root elements for the weight preceding it on the requested level.
    const uint32_t primary = weight32FromNode(node);
    if(level == UCOL_SECONDARY) {
        return rootElements.getSecondaryBefore(primary, secondary);
    }
    uint32_t tertiaryBefore = rootElements.getTertiaryBefore(primary, secondary, tertiary);
    U_ASSERT((tertiaryBefore & ~Collation::ONLY_TERTIARY_MASK) == 0);
    return tertiaryBefore;
}
int64_t
CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
const char *&parserErrorReason, UErrorCode &errorCode) {
@ -793,7 +794,7 @@ CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UError
// down to the requested level/strength.
// Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
U_ASSERT((ce & 0xc0) == 0);
int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode);
int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode);
if(strength >= UCOL_SECONDARY) {
uint32_t lower32 = (uint32_t)ce;
index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
@ -863,17 +864,44 @@ int32_t
CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return 0; }
U_ASSERT(0 <= index && index < nodes.size());
U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);
U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16);
// Only reset-before inserts common weights.
if(weight16 == Collation::COMMON_WEIGHT16) {
return findCommonNode(index, level);
}
// If this will be the first below-common weight for the parent node,
// then we will also need to insert a common weight after it.
int64_t node = nodes.elementAti(index);
U_ASSERT(strengthFromNode(node) < level); // parent node is stronger
if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
if((node & hasThisLevelBefore) == 0) {
// The parent node has an implied level-common weight.
int64_t commonNode =
nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);
if(level == UCOL_SECONDARY) {
// Move the HAS_BEFORE3 flag from the parent node
// to the new secondary common node.
commonNode |= node & HAS_BEFORE3;
node &= ~(int64_t)HAS_BEFORE3;
}
nodes.setElementAt(node | hasThisLevelBefore, index);
// Insert below-common-weight node.
int32_t nextIndex = nextIndexFromNode(node);
node = nodeFromWeight16(weight16) | nodeFromStrength(level);
index = insertNodeBetween(index, nextIndex, node, errorCode);
// Insert common-weight node.
insertNodeBetween(index, nextIndex, commonNode, errorCode);
// Return index of below-common-weight node.
return index;
}
}
// Find the root CE's weight for this level.
// Postpone insertion if not found:
// Insert the new root node before the next stronger node,
// or before the next root node with the same strength and a larger weight.
int64_t node = nodes.elementAti(index);
int32_t nextIndex;
while((nextIndex = nextIndexFromNode(node)) != 0) {
node = nodes.elementAti(nextIndex);
@ -961,13 +989,14 @@ CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
index = nextIndexFromNode(node);
node = nodes.elementAti(index);
U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
weight16FromNode(node) == Collation::BEFORE_WEIGHT16);
weight16FromNode(node) < Collation::COMMON_WEIGHT16);
// Skip to the explicit common node.
do {
index = nextIndexFromNode(node);
node = nodes.elementAti(index);
U_ASSERT(strengthFromNode(node) >= strength);
} while(isTailoredNode(node) || strengthFromNode(node) > strength);
} while(isTailoredNode(node) || strengthFromNode(node) > strength ||
weight16FromNode(node) < Collation::COMMON_WEIGHT16);
U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
return index;
}
@ -1351,6 +1380,9 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
CollationWeights primaries, secondaries, tertiaries;
int64_t *nodesArray = nodes.getBuffer();
#ifdef DEBUG_COLLATION_BUILDER
puts("\nCollationBuilder::makeTailoredCEs()");
#endif
for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
int32_t i = rootPrimaryIndexes.elementAti(rpi);
@ -1398,11 +1430,11 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
// Gap at the beginning of the tertiary CE range.
t = rootElements.getTertiaryBoundary() - 0x100;
tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
} else if(t == Collation::BEFORE_WEIGHT16) {
tLimit = Collation::COMMON_WEIGHT16;
} else if(!pIsTailored && !sIsTailored) {
// p and s are root weights.
tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
} else if(t == Collation::BEFORE_WEIGHT16) {
tLimit = Collation::COMMON_WEIGHT16;
} else {
// [p, s] is tailored.
U_ASSERT(t == Collation::COMMON_WEIGHT16);
@ -1441,11 +1473,11 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
// Gap at the beginning of the secondary CE range.
s = rootElements.getSecondaryBoundary() - 0x100;
sLimit = rootElements.getFirstSecondaryCE() >> 16;
} else if(s == Collation::BEFORE_WEIGHT16) {
sLimit = Collation::COMMON_WEIGHT16;
} else if(!pIsTailored) {
// p is a root primary.
sLimit = rootElements.getSecondaryAfter(pIndex, s);
} else if(s == Collation::BEFORE_WEIGHT16) {
sLimit = Collation::COMMON_WEIGHT16;
} else {
// p is a tailored primary.
U_ASSERT(s == Collation::COMMON_WEIGHT16);
@ -1460,6 +1492,11 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
if(!secondaries.allocWeights(s, sLimit, sCount)) {
errorCode = U_BUFFER_OVERFLOW_ERROR;
errorReason = "secondary tailoring gap too small";
#ifdef DEBUG_COLLATION_BUILDER
printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",
(long)alignWeightRight(s), (long)alignWeightRight(sLimit),
(long)alignWeightRight(sCount));
#endif
return;
}
sIsTailored = TRUE;

View file

@ -56,6 +56,11 @@ private:
/** Implements CollationRuleParser::Sink. */
virtual void addReset(int32_t strength, const UnicodeString &str,
const char *&errorReason, UErrorCode &errorCode);
/**
* Returns the secondary or tertiary weight preceding the current node's weight.
* node=nodes[index].
*/
uint32_t getWeight16Before(int32_t index, int64_t node, int32_t level);
int64_t getSpecialResetPosition(const UnicodeString &str,
const char *&parserErrorReason, UErrorCode &errorCode);
@ -96,7 +101,7 @@ private:
/**
* Finds the node which implies or contains a common=05 weight of the given strength
* (secondary or tertiary).
* (secondary or tertiary), if the current node is stronger.
* Skips weaker nodes and tailored nodes if the current node is stronger
* and is followed by an explicit-common-weight node.
* Always returns the input index if that node is no stronger than the given strength.
@ -218,15 +223,13 @@ private:
/** At most 1M nodes, limited by the 20 bits in node bit fields. */
static const int32_t MAX_INDEX = 0xfffff;
/**
* Node bit 6 is set on a primary node if there are tailored nodes
* with secondary values below the common secondary weight (05),
* from a reset-secondary-before (&[before 2]).
* Node bit 6 is set on a primary node if there are nodes
* with secondary values below the common secondary weight (05).
*/
static const int32_t HAS_BEFORE2 = 0x40;
/**
* Node bit 5 is set on a primary or secondary node if there are tailored nodes
* with tertiary values below the common tertiary weight (05),
* from a reset-tertiary-before (&[before 3]).
* Node bit 5 is set on a primary or secondary node if there are nodes
* with tertiary values below the common tertiary weight (05).
*/
static const int32_t HAS_BEFORE3 = 0x20;
/**
@ -338,15 +341,16 @@ private:
* A node of a given strength normally implies "common" weights on weaker levels.
*
* A node with HAS_BEFORE2 must be immediately followed by
* a secondary node with BEFORE_WEIGHT16, then a secondary tailored node,
* a secondary node with an explicit below-common weight, then a secondary tailored node,
* and later an explicit common-secondary node.
* (&[before 2] resets to the BEFORE_WEIGHT16 node so that
* The below-common weight can be a root weight,
* or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
* or before the lowest root weight.
* (&[before 2] resets to an explicit secondary node so that
* the following addRelation(secondary) tailors right after that.
* If we did not have this node and instead were to reset on the primary node,
* then addRelation(secondary) would skip forward to the COMMON_WEIGHT16 node.)
*
* All secondary tailored nodes between these two explicit ones
* will be assigned lower-than-common secondary weights.
* If the flag is not set, then there are no explicit secondary nodes
* with the common or lower weights.
*

View file

@ -109,7 +109,7 @@ private:
/*
* Format of collation data (ucadata.icu, binary data in coll/ *.res files).
* Format version 4.0.
* Format version 4.1.
*
* The root collation data is stored in the ucadata.icu file.
* Tailorings are stored inside .res resource bundle files, with a complete file header.
@ -200,6 +200,17 @@ private:
*
* UBool compressibleBytes[]; -- empty in all tailorings
* Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
*
* -----------------
* Changes for formatVersion 4.1
*
* The rootElements may contain secondary and tertiary weights below common=05.
* (Used for small Hiragana letters.)
* Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
* There are no other data structure changes, but builder code needs to be able to handle such data.
*
* ICU 55 ucadata.icu uses formatVersion 4.1.
* ICU 55 tailoring data continues to use formatVersion 4.0.
*/
U_NAMESPACE_END

View file

@ -125,7 +125,7 @@ CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const {
} else {
index = findPrimary(p) + 1;
previousSec = Collation::BEFORE_WEIGHT16;
sec = Collation::COMMON_WEIGHT16;
sec = getFirstSecTerForPrimary(index) >> 16;
}
U_ASSERT(s >= sec);
while(s > sec) {
@ -155,7 +155,7 @@ CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) con
} else {
index = findPrimary(p) + 1;
previousTer = Collation::BEFORE_WEIGHT16;
secTer = Collation::COMMON_SEC_AND_TER_CE;
secTer = getFirstSecTerForPrimary(index);
}
uint32_t st = (s << 16) | t;
while(st > secTer) {
@ -191,33 +191,38 @@ CollationRootElements::getPrimaryAfter(uint32_t p, int32_t index, UBool isCompre
uint32_t
CollationRootElements::getSecondaryAfter(int32_t index, uint32_t s) const {
uint32_t secTer;
uint32_t secLimit;
if(index == 0) {
// primary = 0
U_ASSERT(s != 0);
index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
secTer = elements[index];
// Gap at the end of the secondary CE range.
secLimit = 0x10000;
} else {
U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
++index;
secTer = getFirstSecTerForPrimary(index + 1);
// If this is an explicit sec/ter unit, then it will be read once more.
// Gap for secondaries of primary CEs.
secLimit = getSecondaryBoundary();
}
for(;;) {
uint32_t secTer = elements[index];
if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
uint32_t sec = secTer >> 16;
if(sec > s) { return sec; }
++index;
secTer = elements[++index];
if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
}
}
uint32_t
CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const {
uint32_t secTer;
uint32_t terLimit;
if(index == 0) {
// primary = 0
if(s == 0) {
U_ASSERT(t != 0);
index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX];
// Gap at the end of the tertiary CE range.
terLimit = 0x4000;
@ -226,22 +231,42 @@ CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) c
// Gap for tertiaries of primary/secondary CEs.
terLimit = getTertiaryBoundary();
}
secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
} else {
U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
++index;
secTer = getFirstSecTerForPrimary(index + 1);
// If this is an explicit sec/ter unit, then it will be read once more.
terLimit = getTertiaryBoundary();
}
uint32_t st = (s << 16) | t;
for(;;) {
uint32_t secTer = elements[index];
if(secTer > st) {
U_ASSERT((secTer >> 16) == s);
return secTer & 0xffff;
}
secTer = elements[++index];
// No tertiary greater than t for this primary+secondary.
if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; }
secTer &= ~SEC_TER_DELTA_FLAG;
if(secTer > st) { return secTer & 0xffff; }
++index;
}
}
/**
 * Returns the first secondary & tertiary weights for a primary,
 * given the index of the element that follows the primary's own element.
 *
 * If that element is a sec/ter delta unit whose weights are not above
 * common/common, those explicit weights are returned (with the delta flag removed).
 * In every other case the common sec/ter combination is returned.
 */
uint32_t
CollationRootElements::getFirstSecTerForPrimary(int32_t index) const {
    const uint32_t unit = elements[index];
    if((unit & SEC_TER_DELTA_FLAG) != 0) {
        // This is a sec/ter delta unit; strip the flag to get the weights.
        const uint32_t explicitSecTer = unit & ~SEC_TER_DELTA_FLAG;
        if(explicitSecTer <= Collation::COMMON_SEC_AND_TER_CE) {
            // Explicit sec/ter at or below common/common.
            return explicitSecTer;
        }
        // Above-common delta: the common sec/ter is implied before it.
    }
    // Either no sec/ter delta follows, or the common sec/ter is implied.
    return Collation::COMMON_SEC_AND_TER_CE;
}
int32_t
CollationRootElements::findPrimary(uint32_t p) const {
// Requirement: p must occur as a root primary.

View file

@ -189,15 +189,32 @@ public:
/**
* Returns the secondary weight after [p, s] where index=findPrimary(p)
* except use index=0 for p=0.
*
* Must return a weight for every root [p, s] as well as for every weight
* returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
*
* Exception: [0, 0] is handled by the CollationBuilder:
* Both its lower and upper boundaries are special.
*/
uint32_t getSecondaryAfter(int32_t index, uint32_t s) const;
/**
* Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
* except use index=0 for p=0.
*
* Must return a weight for every root [p, s, t] as well as for every weight
* returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
*
* Exception: [0, 0, 0] is handled by the CollationBuilder:
* Both its lower and upper boundaries are special.
*/
uint32_t getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const;
private:
/**
* Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1.
*/
uint32_t getFirstSecTerForPrimary(int32_t index) const;
/**
* Finds the largest index i where elements[i]<=p.
* Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).
@ -216,15 +233,18 @@ private:
* See the comments on the IX_ constants.
*
* All other elements are a compact form of the root collator CEs
* in collation order.
* mostly in collation order.
*
* Primary weights have the SEC_TER_DELTA_FLAG flag not set.
* A primary-weight element by itself represents a root CE
* with Collation::COMMON_SEC_AND_TER_CE.
* A sequence of one or more root CEs with the same primary weight is stored as
* one element with the primary weight, with the SEC_TER_DELTA_FLAG flag not set,
* followed by elements with only the secondary/tertiary weights,
* each with that flag set.
* If the lowest secondary/tertiary combination is Collation::COMMON_SEC_AND_TER_CE,
* then the element for that combination is omitted.
*
* If there are root CEs with the same primary but other secondary/tertiary weights,
* then for each such CE there is an element with those secondary and tertiary weights,
* and with the SEC_TER_DELTA_FLAG flag set.
* Note: If the first actual secondary/tertiary combination is higher than
* Collation::COMMON_SEC_AND_TER_CE (which is unusual),
* the runtime code will assume anyway that Collation::COMMON_SEC_AND_TER_CE is present.
*
* A range of only-primary CEs with a consistent "step" increment
* from each primary to the next may be stored as a range.

View file

@ -723,7 +723,26 @@ public:
// Simple primary CE.
++index;
pri = p;
secTer = Collation::COMMON_SEC_AND_TER_CE;
// Does this have an explicit below-common sec/ter unit,
// or does it imply a common one?
if(index == length) {
secTer = Collation::COMMON_SEC_AND_TER_CE;
} else {
secTer = elements[index];
if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
// No sec/ter delta.
secTer = Collation::COMMON_SEC_AND_TER_CE;
} else {
secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
// Implied sec/ter.
secTer = Collation::COMMON_SEC_AND_TER_CE;
} else {
// Explicit sec/ter below common/common.
++index;
}
}
}
return TRUE;
}

View file

@ -2476,3 +2476,53 @@
* compare
<1 AA
<2 aą
** test: tailor tertiary-after a common tertiary where there is a lower one
# Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one.
# See ICU ticket 11448 & CLDR ticket 7222.
@ rules
&あ<<<x<<<y<<<z
* compare
<1 ぁ
<3 あ
<3 x
<3 y
<3 z
<3 ァ
<1 い
** test: tailor tertiary-after a below-common tertiary
@ rules
&ぁ<<<x<<<y<<<z
* compare
<1 ぁ
<3 x
<3 y
<3 z
<3 あ
<3 ァ
<1 い
** test: tailor tertiary-before a common tertiary where there is a lower one
@ rules
&[before 3]あ<<<x<<<y<<<z
* compare
<1 ぁ
<3 x
<3 y
<3 z
<3 あ
<3 ァ
<1 い
** test: tailor tertiary-before a below-common tertiary
@ rules
&[before 3]ぁ<<<x<<<y<<<z
* compare
<1 x
<3 y
<3 z
<3 ぁ
<3 あ
<3 ァ
<1 い