mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-11448 collation common tertiary weights for normal Hiragana, change builder to cope with below-common tertiary weights
X-SVN-Rev: 36906
This commit is contained in:
parent
b3c29ef95e
commit
590a85cbf1
11 changed files with 943 additions and 749 deletions
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
@ -277,15 +277,22 @@ CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
|
|||
// We will add it later, as part of the Han ranges.
|
||||
uint32_t p = (uint32_t)(ce >> 32);
|
||||
uint32_t secTer = (uint32_t)ce;
|
||||
if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
|
||||
if(firstHanPrimary <= p && p <= lastHanPrimary) {
|
||||
if(firstHanPrimary <= p && p <= lastHanPrimary) {
|
||||
if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
|
||||
// buildRootElementsTable() does not currently handle this case.
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// Check that secondary and tertiary weights are >= "common".
|
||||
if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if(secTer != Collation::COMMON_SEC_AND_TER_CE) { // minor optimization
|
||||
// Check that secondary and tertiary weights are > 01.
|
||||
uint32_t s = secTer >> 16;
|
||||
uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
|
||||
if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collation::COMMON_WEIGHT16)) {
|
||||
if((s != 0 && s <= Collation::BEFORE_WEIGHT16) ||
|
||||
(t != 0 && t <= Collation::BEFORE_WEIGHT16)) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
@ -337,14 +344,26 @@ CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
|
|||
|
||||
void
|
||||
CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
|
||||
// Limit sentinel for root elements.
|
||||
// This allows us to reduce range checks at runtime.
|
||||
rootElements.addElement(Collation::makeCE(CollationRootElements::PRIMARY_SENTINEL), errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the last Han range.
|
||||
uint32_t prevPrimary = 0; // Start with primary ignorable CEs.
|
||||
UBool tryRange = FALSE;
|
||||
UBool needCommonSecTerUnit = FALSE;
|
||||
UBool hasDeltaUnit = FALSE;
|
||||
for(int32_t i = 0; i < rootElements.size(); ++i) {
|
||||
int64_t ce = rootElements.elementAti(i);
|
||||
uint32_t p = (uint32_t)(ce >> 32);
|
||||
uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
|
||||
if((p != prevPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE) && needCommonSecTerUnit) {
|
||||
// The last primary had low sec/ter weights but no common sec/ter combination.
|
||||
// The next unit is either a new primary or an above-common sec/ter unit.
|
||||
// Insert a common sec/ter unit so that the builder will reliably
|
||||
// tailor to either before or after a common weight but not across it.
|
||||
table.addElement((int32_t)Collation::COMMON_SEC_AND_TER_CE |
|
||||
CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
|
||||
}
|
||||
if(p != prevPrimary) {
|
||||
U_ASSERT((p & 0xff) == 0);
|
||||
int32_t end;
|
||||
|
@ -352,7 +371,7 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
|
|||
// Add a Han primary weight or range.
|
||||
// We omitted them initially, and omitted all CEs with Han primaries
|
||||
// and common secondary/tertiary weights.
|
||||
U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_AND_TER_CE);
|
||||
U_ASSERT(p > lastHanPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE);
|
||||
if(p == nextHanPrimary) {
|
||||
// One single Han primary with non-common secondary/tertiary weights.
|
||||
table.addElement((int32_t)p, errorCode);
|
||||
|
@ -370,6 +389,7 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
|
|||
// nextHanPrimary == lastHanPrimary < p
|
||||
// We just wrote the single last Han primary.
|
||||
nextHanPrimary = 0xffffffff;
|
||||
table.addElement((int32_t)p, errorCode);
|
||||
} else if(p < lastHanPrimary) {
|
||||
// nextHanPrimary < p < lastHanPrimary
|
||||
// End the Han range on p, prepare for the next range.
|
||||
|
@ -388,7 +408,14 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
|
|||
table.addElement((int32_t)p, errorCode);
|
||||
}
|
||||
}
|
||||
} else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE &&
|
||||
} else if(prevPrimary != 0 &&
|
||||
// If there has not been an intervening delta unit,
|
||||
// then we will try to combine the previous primary and
|
||||
// the next several primaries into a range.
|
||||
!hasDeltaUnit &&
|
||||
// Might get a range with more than two primaries if the current CE
|
||||
// has common sec/ter weights.
|
||||
secTer == Collation::COMMON_SEC_AND_TER_CE &&
|
||||
(end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
|
||||
// Multiple CEs with only common secondary/tertiary weights were
|
||||
// combined into a primary range.
|
||||
|
@ -402,22 +429,24 @@ CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &e
|
|||
table.addElement((int32_t)p, errorCode);
|
||||
}
|
||||
prevPrimary = p;
|
||||
needCommonSecTerUnit = FALSE;
|
||||
hasDeltaUnit = FALSE;
|
||||
}
|
||||
if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
|
||||
if(secTer == Collation::COMMON_SEC_AND_TER_CE && !needCommonSecTerUnit) {
|
||||
// The common secondary/tertiary weights are implied in the primary unit.
|
||||
// If there is no intervening delta unit, then we will try to combine
|
||||
// the next several primaries into a range.
|
||||
tryRange = TRUE;
|
||||
} else {
|
||||
if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
|
||||
// Remember to not suppress a common sec/ter unit if p!=0.
|
||||
needCommonSecTerUnit = p != 0;
|
||||
} else if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
|
||||
// Real common sec/ter unit, no need to insert an artificial one.
|
||||
needCommonSecTerUnit = FALSE;
|
||||
}
|
||||
// For each new set of secondary/tertiary weights we write a delta unit.
|
||||
table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
|
||||
tryRange = FALSE;
|
||||
hasDeltaUnit = TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
// Limit sentinel for root elements.
|
||||
// This allows us to reduce range checks at runtime.
|
||||
table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode);
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
|
|
@ -313,7 +313,7 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
|
|||
if(U_FAILURE(errorCode)) { return; }
|
||||
|
||||
int64_t node = nodes.elementAti(index);
|
||||
// If the index is for a "weaker" tailored node,
|
||||
// If the index is for a "weaker" node,
|
||||
// then skip backwards over this and further "weaker" nodes.
|
||||
while(strengthFromNode(node) > strength) {
|
||||
index = previousIndexFromNode(node);
|
||||
|
@ -360,6 +360,8 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
|
|||
if(strength >= UCOL_TERTIARY) {
|
||||
index = findCommonNode(index, UCOL_TERTIARY);
|
||||
}
|
||||
// findCommonNode() stayed on the stronger node or moved to
|
||||
// an explicit common-weight node of the reset-before strength.
|
||||
node = nodes.elementAti(index);
|
||||
if(strengthFromNode(node) == strength) {
|
||||
// Found a same-strength node with an explicit weight.
|
||||
|
@ -373,89 +375,45 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
|
|||
}
|
||||
return;
|
||||
}
|
||||
U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16);
|
||||
U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);
|
||||
// Reset to just before this node.
|
||||
// Insert the preceding same-level explicit weight if it is not there already.
|
||||
// Which explicit weight immediately precedes this one?
|
||||
weight16 = getWeight16Before(index, node, strength);
|
||||
// Does this preceding weight have a node?
|
||||
uint32_t previousWeight16;
|
||||
int32_t previousIndex = previousIndexFromNode(node);
|
||||
if(weight16 == Collation::COMMON_WEIGHT16) {
|
||||
// Reset to just before this same-strength common-weight node.
|
||||
for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
|
||||
node = nodes.elementAti(i);
|
||||
int32_t previousStrength = strengthFromNode(node);
|
||||
if(previousStrength < strength) {
|
||||
U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);
|
||||
// Either the reset element has an above-common weight and
|
||||
// the parent node provides the implied common weight,
|
||||
// or the reset element has a weight<=common in the node
|
||||
// right after the parent, and we need to insert the preceding weight.
|
||||
previousWeight16 = Collation::COMMON_WEIGHT16;
|
||||
break;
|
||||
} else if(previousStrength == strength && !isTailoredNode(node)) {
|
||||
previousWeight16 = weight16FromNode(node);
|
||||
break;
|
||||
}
|
||||
// Skip weaker nodes and same-level tailored nodes.
|
||||
}
|
||||
if(previousWeight16 == weight16) {
|
||||
// The preceding weight has a node,
|
||||
// maybe with following weaker or tailored nodes.
|
||||
// Reset to the last of them.
|
||||
index = previousIndex;
|
||||
} else {
|
||||
// A non-common weight is only possible from a root CE.
|
||||
// Find the higher-level weights, which must all be explicit,
|
||||
// and then find the preceding weight for this level.
|
||||
uint32_t previousWeight16 = 0;
|
||||
int32_t previousWeightIndex = -1;
|
||||
int32_t i = index;
|
||||
if(strength == UCOL_SECONDARY) {
|
||||
uint32_t p;
|
||||
do {
|
||||
i = previousIndexFromNode(node);
|
||||
node = nodes.elementAti(i);
|
||||
if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) &&
|
||||
previousWeightIndex < 0) {
|
||||
previousWeightIndex = i;
|
||||
previousWeight16 = weight16FromNode(node);
|
||||
}
|
||||
} while(strengthFromNode(node) > UCOL_PRIMARY);
|
||||
U_ASSERT(!isTailoredNode(node));
|
||||
p = weight32FromNode(node);
|
||||
weight16 = rootElements.getSecondaryBefore(p, weight16);
|
||||
} else {
|
||||
uint32_t p, s;
|
||||
do {
|
||||
i = previousIndexFromNode(node);
|
||||
node = nodes.elementAti(i);
|
||||
if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) &&
|
||||
previousWeightIndex < 0) {
|
||||
previousWeightIndex = i;
|
||||
previousWeight16 = weight16FromNode(node);
|
||||
}
|
||||
} while(strengthFromNode(node) > UCOL_SECONDARY);
|
||||
U_ASSERT(!isTailoredNode(node));
|
||||
if(strengthFromNode(node) == UCOL_SECONDARY) {
|
||||
s = weight16FromNode(node);
|
||||
do {
|
||||
i = previousIndexFromNode(node);
|
||||
node = nodes.elementAti(i);
|
||||
} while(strengthFromNode(node) > UCOL_PRIMARY);
|
||||
U_ASSERT(!isTailoredNode(node));
|
||||
} else {
|
||||
U_ASSERT(!nodeHasBefore2(node));
|
||||
s = Collation::COMMON_WEIGHT16;
|
||||
}
|
||||
p = weight32FromNode(node);
|
||||
weight16 = rootElements.getTertiaryBefore(p, s, weight16);
|
||||
U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
|
||||
}
|
||||
// Find or insert the new explicit weight before the current one.
|
||||
if(previousWeightIndex >= 0 && weight16 == previousWeight16) {
|
||||
// Tailor after the last node between adjacent root nodes.
|
||||
index = previousIndex;
|
||||
} else {
|
||||
node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
|
||||
index = insertNodeBetween(previousIndex, index, node, errorCode);
|
||||
}
|
||||
// Insert a node with the preceding weight, reset to that.
|
||||
node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
|
||||
index = insertNodeBetween(previousIndex, index, node, errorCode);
|
||||
}
|
||||
} else {
|
||||
// Found a stronger node with implied strength-common weight.
|
||||
int64_t hasBefore3 = 0;
|
||||
if(strength == UCOL_SECONDARY) {
|
||||
U_ASSERT(!nodeHasBefore2(node));
|
||||
// Move the HAS_BEFORE3 flag from the parent node
|
||||
// to the new secondary common node.
|
||||
hasBefore3 = node & HAS_BEFORE3;
|
||||
node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2;
|
||||
} else {
|
||||
U_ASSERT(!nodeHasBefore3(node));
|
||||
node |= HAS_BEFORE3;
|
||||
}
|
||||
nodes.setElementAt(node, index);
|
||||
int32_t nextIndex = nextIndexFromNode(node);
|
||||
// Insert default nodes with weights 01 and 05, reset to the 01 node.
|
||||
node = nodeFromWeight16(Collation::BEFORE_WEIGHT16) | nodeFromStrength(strength);
|
||||
index = insertNodeBetween(index, nextIndex, node, errorCode);
|
||||
node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 |
|
||||
nodeFromStrength(strength);
|
||||
insertNodeBetween(index, nextIndex, node, errorCode);
|
||||
uint32_t weight16 = getWeight16Before(index, node, strength);
|
||||
index = findOrInsertWeakNode(index, weight16, strength, errorCode);
|
||||
}
|
||||
// Strength of the temporary CE = strength of its reset position.
|
||||
// Code above raises an error if the before-strength is stronger.
|
||||
|
@ -468,6 +426,49 @@ CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
|
|||
ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
|
||||
}
|
||||
|
||||
/**
 * Returns the secondary or tertiary weight (selected by level) that precedes
 * the weight of the given node, where node=nodes[index].
 *
 * Walks back through the node list to the enclosing secondary and primary nodes.
 * If either of them is tailored, then the node is not for a root CE and
 * BEFORE_WEIGHT16 (the low non-primary boundary for a tailored CE) is returned.
 * Otherwise [p, s, t] is a root CE and the preceding weight is looked up
 * in the root elements.
 */
uint32_t
CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
    U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));

    // Tertiary weight: explicit on a tertiary node,
    // otherwise the common weight implied by the stronger node.
    uint32_t ter = Collation::COMMON_WEIGHT16;
    if(strengthFromNode(node) == UCOL_TERTIARY) {
        ter = weight16FromNode(node);
    }

    // Step back to the nearest node that is at least secondary-strength.
    while(strengthFromNode(node) > UCOL_SECONDARY) {
        index = previousIndexFromNode(node);
        node = nodes.elementAti(index);
    }
    if(isTailoredNode(node)) {
        // Not a root CE.
        return Collation::BEFORE_WEIGHT16;
    }

    // Secondary weight: explicit on a secondary node,
    // otherwise the common weight implied by the stronger node.
    uint32_t sec = Collation::COMMON_WEIGHT16;
    if(strengthFromNode(node) == UCOL_SECONDARY) {
        sec = weight16FromNode(node);
    }

    // Step back to the primary node.
    while(strengthFromNode(node) > UCOL_PRIMARY) {
        index = previousIndexFromNode(node);
        node = nodes.elementAti(index);
    }
    if(isTailoredNode(node)) {
        // Not a root CE.
        return Collation::BEFORE_WEIGHT16;
    }

    // [pri, sec, ter] is a root CE. Return the preceding weight for the requested level.
    uint32_t pri = weight32FromNode(node);
    if(level == UCOL_SECONDARY) {
        return rootElements.getSecondaryBefore(pri, sec);
    }
    uint32_t terBefore = rootElements.getTertiaryBefore(pri, sec, ter);
    U_ASSERT((terBefore & ~Collation::ONLY_TERTIARY_MASK) == 0);
    return terBefore;
}
|
||||
|
||||
int64_t
|
||||
CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
|
||||
const char *&parserErrorReason, UErrorCode &errorCode) {
|
||||
|
@ -793,7 +794,7 @@ CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UError
|
|||
// down to the requested level/strength.
|
||||
// Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
|
||||
U_ASSERT((ce & 0xc0) == 0);
|
||||
int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode);
|
||||
int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode);
|
||||
if(strength >= UCOL_SECONDARY) {
|
||||
uint32_t lower32 = (uint32_t)ce;
|
||||
index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
|
||||
|
@ -863,17 +864,44 @@ int32_t
|
|||
CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) { return 0; }
|
||||
U_ASSERT(0 <= index && index < nodes.size());
|
||||
U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);
|
||||
|
||||
U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16);
|
||||
// Only reset-before inserts common weights.
|
||||
if(weight16 == Collation::COMMON_WEIGHT16) {
|
||||
return findCommonNode(index, level);
|
||||
}
|
||||
|
||||
// If this will be the first below-common weight for the parent node,
|
||||
// then we will also need to insert a common weight after it.
|
||||
int64_t node = nodes.elementAti(index);
|
||||
U_ASSERT(strengthFromNode(node) < level); // parent node is stronger
|
||||
if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
|
||||
int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
|
||||
if((node & hasThisLevelBefore) == 0) {
|
||||
// The parent node has an implied level-common weight.
|
||||
int64_t commonNode =
|
||||
nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);
|
||||
if(level == UCOL_SECONDARY) {
|
||||
// Move the HAS_BEFORE3 flag from the parent node
|
||||
// to the new secondary common node.
|
||||
commonNode |= node & HAS_BEFORE3;
|
||||
node &= ~(int64_t)HAS_BEFORE3;
|
||||
}
|
||||
nodes.setElementAt(node | hasThisLevelBefore, index);
|
||||
// Insert below-common-weight node.
|
||||
int32_t nextIndex = nextIndexFromNode(node);
|
||||
node = nodeFromWeight16(weight16) | nodeFromStrength(level);
|
||||
index = insertNodeBetween(index, nextIndex, node, errorCode);
|
||||
// Insert common-weight node.
|
||||
insertNodeBetween(index, nextIndex, commonNode, errorCode);
|
||||
// Return index of below-common-weight node.
|
||||
return index;
|
||||
}
|
||||
}
|
||||
|
||||
// Find the root CE's weight for this level.
|
||||
// Postpone insertion if not found:
|
||||
// Insert the new root node before the next stronger node,
|
||||
// or before the next root node with the same strength and a larger weight.
|
||||
int64_t node = nodes.elementAti(index);
|
||||
int32_t nextIndex;
|
||||
while((nextIndex = nextIndexFromNode(node)) != 0) {
|
||||
node = nodes.elementAti(nextIndex);
|
||||
|
@ -961,13 +989,14 @@ CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
|
|||
index = nextIndexFromNode(node);
|
||||
node = nodes.elementAti(index);
|
||||
U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
|
||||
weight16FromNode(node) == Collation::BEFORE_WEIGHT16);
|
||||
weight16FromNode(node) < Collation::COMMON_WEIGHT16);
|
||||
// Skip to the explicit common node.
|
||||
do {
|
||||
index = nextIndexFromNode(node);
|
||||
node = nodes.elementAti(index);
|
||||
U_ASSERT(strengthFromNode(node) >= strength);
|
||||
} while(isTailoredNode(node) || strengthFromNode(node) > strength);
|
||||
} while(isTailoredNode(node) || strengthFromNode(node) > strength ||
|
||||
weight16FromNode(node) < Collation::COMMON_WEIGHT16);
|
||||
U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
|
||||
return index;
|
||||
}
|
||||
|
@ -1351,6 +1380,9 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
|
|||
|
||||
CollationWeights primaries, secondaries, tertiaries;
|
||||
int64_t *nodesArray = nodes.getBuffer();
|
||||
#ifdef DEBUG_COLLATION_BUILDER
|
||||
puts("\nCollationBuilder::makeTailoredCEs()");
|
||||
#endif
|
||||
|
||||
for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
|
||||
int32_t i = rootPrimaryIndexes.elementAti(rpi);
|
||||
|
@ -1398,11 +1430,11 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
|
|||
// Gap at the beginning of the tertiary CE range.
|
||||
t = rootElements.getTertiaryBoundary() - 0x100;
|
||||
tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
|
||||
} else if(t == Collation::BEFORE_WEIGHT16) {
|
||||
tLimit = Collation::COMMON_WEIGHT16;
|
||||
} else if(!pIsTailored && !sIsTailored) {
|
||||
// p and s are root weights.
|
||||
tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
|
||||
} else if(t == Collation::BEFORE_WEIGHT16) {
|
||||
tLimit = Collation::COMMON_WEIGHT16;
|
||||
} else {
|
||||
// [p, s] is tailored.
|
||||
U_ASSERT(t == Collation::COMMON_WEIGHT16);
|
||||
|
@ -1441,11 +1473,11 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
|
|||
// Gap at the beginning of the secondary CE range.
|
||||
s = rootElements.getSecondaryBoundary() - 0x100;
|
||||
sLimit = rootElements.getFirstSecondaryCE() >> 16;
|
||||
} else if(s == Collation::BEFORE_WEIGHT16) {
|
||||
sLimit = Collation::COMMON_WEIGHT16;
|
||||
} else if(!pIsTailored) {
|
||||
// p is a root primary.
|
||||
sLimit = rootElements.getSecondaryAfter(pIndex, s);
|
||||
} else if(s == Collation::BEFORE_WEIGHT16) {
|
||||
sLimit = Collation::COMMON_WEIGHT16;
|
||||
} else {
|
||||
// p is a tailored primary.
|
||||
U_ASSERT(s == Collation::COMMON_WEIGHT16);
|
||||
|
@ -1460,6 +1492,11 @@ CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
|
|||
if(!secondaries.allocWeights(s, sLimit, sCount)) {
|
||||
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
||||
errorReason = "secondary tailoring gap too small";
|
||||
#ifdef DEBUG_COLLATION_BUILDER
|
||||
printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",
|
||||
(long)alignWeightRight(s), (long)alignWeightRight(sLimit),
|
||||
(long)alignWeightRight(sCount));
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
sIsTailored = TRUE;
|
||||
|
|
|
@ -56,6 +56,11 @@ private:
|
|||
/** Implements CollationRuleParser::Sink. */
|
||||
virtual void addReset(int32_t strength, const UnicodeString &str,
|
||||
const char *&errorReason, UErrorCode &errorCode);
|
||||
/**
|
||||
* Returns the secondary or tertiary weight preceding the current node's weight.
|
||||
* node=nodes[index].
|
||||
*/
|
||||
uint32_t getWeight16Before(int32_t index, int64_t node, int32_t level);
|
||||
|
||||
int64_t getSpecialResetPosition(const UnicodeString &str,
|
||||
const char *&parserErrorReason, UErrorCode &errorCode);
|
||||
|
@ -96,7 +101,7 @@ private:
|
|||
|
||||
/**
|
||||
* Finds the node which implies or contains a common=05 weight of the given strength
|
||||
* (secondary or tertiary).
|
||||
* (secondary or tertiary), if the current node is stronger.
|
||||
* Skips weaker nodes and tailored nodes if the current node is stronger
|
||||
* and is followed by an explicit-common-weight node.
|
||||
* Always returns the input index if that node is no stronger than the given strength.
|
||||
|
@ -218,15 +223,13 @@ private:
|
|||
/** At most 1M nodes, limited by the 20 bits in node bit fields. */
|
||||
static const int32_t MAX_INDEX = 0xfffff;
|
||||
/**
|
||||
* Node bit 6 is set on a primary node if there are tailored nodes
|
||||
* with secondary values below the common secondary weight (05),
|
||||
* from a reset-secondary-before (&[before 2]).
|
||||
* Node bit 6 is set on a primary node if there are nodes
|
||||
* with secondary values below the common secondary weight (05).
|
||||
*/
|
||||
static const int32_t HAS_BEFORE2 = 0x40;
|
||||
/**
|
||||
* Node bit 5 is set on a primary or secondary node if there are tailored nodes
|
||||
* with tertiary values below the common tertiary weight (05),
|
||||
* from a reset-tertiary-before (&[before 3]).
|
||||
* Node bit 5 is set on a primary or secondary node if there are nodes
|
||||
* with tertiary values below the common tertiary weight (05).
|
||||
*/
|
||||
static const int32_t HAS_BEFORE3 = 0x20;
|
||||
/**
|
||||
|
@ -338,15 +341,16 @@ private:
|
|||
* A node of a given strength normally implies "common" weights on weaker levels.
|
||||
*
|
||||
* A node with HAS_BEFORE2 must be immediately followed by
|
||||
* a secondary node with BEFORE_WEIGHT16, then a secondary tailored node,
|
||||
* a secondary node with an explicit below-common weight, then a secondary tailored node,
|
||||
* and later an explicit common-secondary node.
|
||||
* (&[before 2] resets to the BEFORE_WEIGHT16 node so that
|
||||
* The below-common weight can be a root weight,
|
||||
* or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
|
||||
* or before the lowest root weight.
|
||||
* (&[before 2] resets to an explicit secondary node so that
|
||||
* the following addRelation(secondary) tailors right after that.
|
||||
* If we did not have this node and instead were to reset on the primary node,
|
||||
* then addRelation(secondary) would skip forward to the COMMON_WEIGHT16 node.)
|
||||
*
|
||||
* All secondary tailored nodes between these two explicit ones
|
||||
* will be assigned lower-than-common secondary weights.
|
||||
* If the flag is not set, then there are no explicit secondary nodes
|
||||
* with the common or lower weights.
|
||||
*
|
||||
|
|
|
@ -109,7 +109,7 @@ private:
|
|||
|
||||
/*
|
||||
* Format of collation data (ucadata.icu, binary data in coll/ *.res files).
|
||||
* Format version 4.0.
|
||||
* Format version 4.1.
|
||||
*
|
||||
* The root collation data is stored in the ucadata.icu file.
|
||||
* Tailorings are stored inside .res resource bundle files, with a complete file header.
|
||||
|
@ -200,6 +200,17 @@ private:
|
|||
*
|
||||
* UBool compressibleBytes[]; -- empty in all tailorings
|
||||
* Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
|
||||
*
|
||||
* -----------------
|
||||
* Changes for formatVersion 4.1
|
||||
*
|
||||
* The rootElements may contain secondary and tertiary weights below common=05.
|
||||
* (Used for small Hiragana letters.)
|
||||
* Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
|
||||
* There are no other data structure changes, but builder code needs to be able to handle such data.
|
||||
*
|
||||
* ICU 55 ucadata.icu uses formatVersion 4.1.
|
||||
* ICU 55 tailoring data continues to use formatVersion 4.0.
|
||||
*/
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -125,7 +125,7 @@ CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const {
|
|||
} else {
|
||||
index = findPrimary(p) + 1;
|
||||
previousSec = Collation::BEFORE_WEIGHT16;
|
||||
sec = Collation::COMMON_WEIGHT16;
|
||||
sec = getFirstSecTerForPrimary(index) >> 16;
|
||||
}
|
||||
U_ASSERT(s >= sec);
|
||||
while(s > sec) {
|
||||
|
@ -155,7 +155,7 @@ CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) con
|
|||
} else {
|
||||
index = findPrimary(p) + 1;
|
||||
previousTer = Collation::BEFORE_WEIGHT16;
|
||||
secTer = Collation::COMMON_SEC_AND_TER_CE;
|
||||
secTer = getFirstSecTerForPrimary(index);
|
||||
}
|
||||
uint32_t st = (s << 16) | t;
|
||||
while(st > secTer) {
|
||||
|
@ -191,33 +191,38 @@ CollationRootElements::getPrimaryAfter(uint32_t p, int32_t index, UBool isCompre
|
|||
|
||||
uint32_t
|
||||
CollationRootElements::getSecondaryAfter(int32_t index, uint32_t s) const {
|
||||
uint32_t secTer;
|
||||
uint32_t secLimit;
|
||||
if(index == 0) {
|
||||
// primary = 0
|
||||
U_ASSERT(s != 0);
|
||||
index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
|
||||
secTer = elements[index];
|
||||
// Gap at the end of the secondary CE range.
|
||||
secLimit = 0x10000;
|
||||
} else {
|
||||
U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
|
||||
++index;
|
||||
secTer = getFirstSecTerForPrimary(index + 1);
|
||||
// If this is an explicit sec/ter unit, then it will be read once more.
|
||||
// Gap for secondaries of primary CEs.
|
||||
secLimit = getSecondaryBoundary();
|
||||
}
|
||||
for(;;) {
|
||||
uint32_t secTer = elements[index];
|
||||
if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
|
||||
uint32_t sec = secTer >> 16;
|
||||
if(sec > s) { return sec; }
|
||||
++index;
|
||||
secTer = elements[++index];
|
||||
if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t
|
||||
CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const {
|
||||
uint32_t secTer;
|
||||
uint32_t terLimit;
|
||||
if(index == 0) {
|
||||
// primary = 0
|
||||
if(s == 0) {
|
||||
U_ASSERT(t != 0);
|
||||
index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX];
|
||||
// Gap at the end of the tertiary CE range.
|
||||
terLimit = 0x4000;
|
||||
|
@ -226,22 +231,42 @@ CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) c
|
|||
// Gap for tertiaries of primary/secondary CEs.
|
||||
terLimit = getTertiaryBoundary();
|
||||
}
|
||||
secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
|
||||
} else {
|
||||
U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
|
||||
++index;
|
||||
secTer = getFirstSecTerForPrimary(index + 1);
|
||||
// If this is an explicit sec/ter unit, then it will be read once more.
|
||||
terLimit = getTertiaryBoundary();
|
||||
}
|
||||
uint32_t st = (s << 16) | t;
|
||||
for(;;) {
|
||||
uint32_t secTer = elements[index];
|
||||
if(secTer > st) {
|
||||
U_ASSERT((secTer >> 16) == s);
|
||||
return secTer & 0xffff;
|
||||
}
|
||||
secTer = elements[++index];
|
||||
// No tertiary greater than t for this primary+secondary.
|
||||
if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; }
|
||||
secTer &= ~SEC_TER_DELTA_FLAG;
|
||||
if(secTer > st) { return secTer & 0xffff; }
|
||||
++index;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the first secondary & tertiary weights for a primary,
 * given the index of the element right after that primary's element.
 *
 * An explicit sec/ter delta unit is returned (without its flag) only if
 * its weights do not exceed common/common. In every other case
 * Collation::COMMON_SEC_AND_TER_CE is returned: either there is no delta unit,
 * or the first delta unit is above common so the common pair is implied.
 */
uint32_t
CollationRootElements::getFirstSecTerForPrimary(int32_t index) const {
    uint32_t unit = elements[index];
    if((unit & SEC_TER_DELTA_FLAG) != 0) {
        // This is a sec/ter delta unit: strip the flag to get the weights.
        uint32_t weights = unit & ~SEC_TER_DELTA_FLAG;
        if(weights <= Collation::COMMON_SEC_AND_TER_CE) {
            // Explicit sec/ter at or below common/common.
            return weights;
        }
    }
    // No sec/ter delta, or an above-common one: common sec/ter is implied.
    return Collation::COMMON_SEC_AND_TER_CE;
}
|
||||
|
||||
int32_t
|
||||
CollationRootElements::findPrimary(uint32_t p) const {
|
||||
// Requirement: p must occur as a root primary.
|
||||
|
|
|
@ -189,15 +189,32 @@ public:
|
|||
/**
|
||||
* Returns the secondary weight after [p, s] where index=findPrimary(p)
|
||||
* except use index=0 for p=0.
|
||||
*
|
||||
* Must return a weight for every root [p, s] as well as for every weight
|
||||
* returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
|
||||
*
|
||||
* Exception: [0, 0] is handled by the CollationBuilder:
|
||||
* Both its lower and upper boundaries are special.
|
||||
*/
|
||||
uint32_t getSecondaryAfter(int32_t index, uint32_t s) const;
|
||||
/**
|
||||
* Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
|
||||
* except use index=0 for p=0.
|
||||
*
|
||||
* Must return a weight for every root [p, s, t] as well as for every weight
|
||||
* returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
|
||||
*
|
||||
* Exception: [0, 0, 0] is handled by the CollationBuilder:
|
||||
* Both its lower and upper boundaries are special.
|
||||
*/
|
||||
uint32_t getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1.
|
||||
*/
|
||||
uint32_t getFirstSecTerForPrimary(int32_t index) const;
|
||||
|
||||
/**
|
||||
* Finds the largest index i where elements[i]<=p.
|
||||
* Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).
|
||||
|
@ -216,15 +233,18 @@ private:
|
|||
* See the comments on the IX_ constants.
|
||||
*
|
||||
* All other elements are a compact form of the root collator CEs
|
||||
* in collation order.
|
||||
* mostly in collation order.
|
||||
*
|
||||
* Primary weights have the SEC_TER_DELTA_FLAG flag not set.
|
||||
* A primary-weight element by itself represents a root CE
|
||||
* with Collation::COMMON_SEC_AND_TER_CE.
|
||||
* A sequence of one or more root CEs with the same primary weight is stored as
|
||||
* one element with the primary weight, with the SEC_TER_DELTA_FLAG flag not set,
|
||||
* followed by elements with only the secondary/tertiary weights,
|
||||
* each with that flag set.
|
||||
* If the lowest secondary/tertiary combination is Collation::COMMON_SEC_AND_TER_CE,
|
||||
* then the element for that combination is omitted.
|
||||
*
|
||||
* If there are root CEs with the same primary but other secondary/tertiary weights,
|
||||
* then for each such CE there is an element with those secondary and tertiary weights,
|
||||
* and with the SEC_TER_DELTA_FLAG flag set.
|
||||
* Note: If the first actual secondary/tertiary combination is higher than
|
||||
* Collation::COMMON_SEC_AND_TER_CE (which is unusual),
|
||||
* the runtime code will assume anyway that Collation::COMMON_SEC_AND_TER_CE is present.
|
||||
*
|
||||
* A range of only-primary CEs with a consistent "step" increment
|
||||
* from each primary to the next may be stored as a range.
|
||||
|
|
|
@ -723,7 +723,26 @@ public:
|
|||
// Simple primary CE.
|
||||
++index;
|
||||
pri = p;
|
||||
secTer = Collation::COMMON_SEC_AND_TER_CE;
|
||||
// Does this have an explicit below-common sec/ter unit,
|
||||
// or does it imply a common one?
|
||||
if(index == length) {
|
||||
secTer = Collation::COMMON_SEC_AND_TER_CE;
|
||||
} else {
|
||||
secTer = elements[index];
|
||||
if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
|
||||
// No sec/ter delta.
|
||||
secTer = Collation::COMMON_SEC_AND_TER_CE;
|
||||
} else {
|
||||
secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
|
||||
if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
|
||||
// Implied sec/ter.
|
||||
secTer = Collation::COMMON_SEC_AND_TER_CE;
|
||||
} else {
|
||||
// Explicit sec/ter below common/common.
|
||||
++index;
|
||||
}
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
|
50
icu4c/source/test/testdata/collationtest.txt
vendored
50
icu4c/source/test/testdata/collationtest.txt
vendored
|
@ -2476,3 +2476,53 @@
|
|||
* compare
|
||||
<1 AA
|
||||
<2 aą
|
||||
|
||||
** test: tailor tertiary-after a common tertiary where there is a lower one
|
||||
# Assume that Hiragana small A has a below-common tertiary, and Hiragana A has a common one.
|
||||
# See ICU ticket 11448 & CLDR ticket 7222.
|
||||
@ rules
|
||||
&あ<<<x<<<y<<<z
|
||||
* compare
|
||||
<1 ぁ
|
||||
<3 あ
|
||||
<3 x
|
||||
<3 y
|
||||
<3 z
|
||||
<3 ァ
|
||||
<1 い
|
||||
|
||||
** test: tailor tertiary-after a below-common tertiary
|
||||
@ rules
|
||||
&ぁ<<<x<<<y<<<z
|
||||
* compare
|
||||
<1 ぁ
|
||||
<3 x
|
||||
<3 y
|
||||
<3 z
|
||||
<3 あ
|
||||
<3 ァ
|
||||
<1 い
|
||||
|
||||
** test: tailor tertiary-before a common tertiary where there is a lower one
|
||||
@ rules
|
||||
&[before 3]あ<<<x<<<y<<<z
|
||||
* compare
|
||||
<1 ぁ
|
||||
<3 x
|
||||
<3 y
|
||||
<3 z
|
||||
<3 あ
|
||||
<3 ァ
|
||||
<1 い
|
||||
|
||||
** test: tailor tertiary-before a below-common tertiary
|
||||
@ rules
|
||||
&[before 3]ぁ<<<x<<<y<<<z
|
||||
* compare
|
||||
<1 x
|
||||
<3 y
|
||||
<3 z
|
||||
<3 ぁ
|
||||
<3 あ
|
||||
<3 ァ
|
||||
<1 い
|
||||
|
|
Loading…
Add table
Reference in a new issue