ICU-13197 catch one more edge case where a mapping has no composition boundary after it

X-SVN-Rev: 40156
This commit is contained in:
Markus Scherer 2017-06-07 07:13:34 +00:00
parent 19d53e7641
commit 6c1e41e0f2
7 changed files with 21 additions and 15 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -208,7 +208,7 @@ void Normalizer2DataBuilder::removeMapping(UChar32 c) {
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer) {
if(buffer.isEmpty()) {
return TRUE; // maps-to-empty-string is no boundary of any kind
return TRUE; // Maps-to-empty-string is no boundary of any kind.
}
int32_t lastStarterIndex=buffer.lastStarterIndex();
if(lastStarterIndex<0) {
@ -223,31 +223,37 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(const BuilderReorderingBuff
return lastStarterIndex==buffer.length()-1;
}
// Note: There can be no Hangul syllable in the fully decomposed mapping.
const Norm *starterNorm=&norms.getNormRef(starter);
if(starterNorm->compositions==NULL) {
return FALSE; // the last starter does not combine forward
const Norm *starterNorm=norms.getNorm(starter);
if(starterNorm==nullptr || starterNorm->compositions==nullptr) {
return FALSE; // The last starter does not combine forward.
}
// Compose as far as possible, and see if further compositions are possible.
uint8_t prevCC=0;
for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length(); ++combMarkIndex) {
uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter
if(norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
// The starter combines with a mark that reorders before the current one.
return TRUE;
}
if(prevCC<cc && (starter=starterNorm->combine(buffer.charAt(combMarkIndex)))>=0) {
starterNorm=&norms.getNormRef(starter);
if(starterNorm->compositions==NULL) {
return FALSE; // the composite does not combine further
// The starter combines with this mark into a composite replacement starter.
starterNorm=norms.getNorm(starter);
if(starterNorm==nullptr || starterNorm->compositions==nullptr) {
return FALSE; // The composite does not combine further.
}
// Keep prevCC because we "removed" the combining mark.
} else {
prevCC=cc;
}
}
// TRUE if the final, forward-combining starter is at the end.
return prevCC==0;
// TODO?! prevCC==0 || norms.combinesWithCCBetween(*starterNorm, prevCC, int32_t! 0x100)
// TODO?! actually, should check if it combines with any cc not seen here
if(prevCC==0) {
return TRUE; // forward-combining starter at the very end
}
if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
// The starter combines with another mark.
return TRUE;
}
return FALSE;
}
void Normalizer2DataBuilder::postProcess(Norm &norm) {

View file

@ -77,8 +77,6 @@ private:
* (especially for a "YesNo" which has a round-trip mapping).
* This flag is used in Normalizer2Impl::hasCompBoundaryAfter().
*
* Modifies the buffer (partially composes it).
*
* A starter character with a mapping does not have a composition boundary after it
* if the character itself combines-forward (which is tested by the caller of this function),
* or it is deleted (mapped to the empty string),

View file

@ -131,7 +131,7 @@ void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) con
}
}
UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const {
UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
if((highCC-lowCC)>=2) {
int32_t length;
const CompositionPair *pairs=norm.getCompositionPairs(length);

View file

@ -156,7 +156,9 @@ public:
uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const;
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
// int32_t highCC not uint8_t so that we can pass in 256 as the upper limit.
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const;
class Enumerator {
public: