mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-13197 more gennorm2 cleanup, set smallFCD for some characters with algorithmic mappings
X-SVN-Rev: 40155
This commit is contained in:
parent
b88c0cd8a9
commit
19d53e7641
8 changed files with 249 additions and 217 deletions
Binary file not shown.
Binary file not shown.
|
@ -25,14 +25,7 @@ ExtraData::ExtraData(Norms &n, UBool fast) :
|
|||
Norms::Enumerator(n),
|
||||
yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
|
||||
yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul, 1=start of normal data
|
||||
optimizeFast(fast) {
|
||||
memset(smallFCD, 0, sizeof(smallFCD));
|
||||
}
|
||||
|
||||
// Sets the bit in smallFCD[] that covers code point c.
// A BMP code point is recorded under its own value; a supplementary code point
// is recorded under its UTF-16 lead surrogate (U16_LEAD(c)).
void ExtraData::setSmallFCD(UChar32 c) {
|
||||
UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
|
||||
// Byte index = lead>>8 (one byte per 256 code units),
// bit index = (lead>>5)&7 (one bit per block of 32 code units).
smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
|
||||
}
|
||||
optimizeFast(fast) {}
|
||||
|
||||
int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
|
||||
UnicodeString &m=*norm.mapping;
|
||||
|
@ -44,26 +37,8 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
|
|||
(long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
int32_t leadCC, trailCC;
|
||||
if(length==0) {
|
||||
leadCC=trailCC=0;
|
||||
} else {
|
||||
leadCC=norms.getCC(m.char32At(0));
|
||||
trailCC=norms.getCC(m.char32At(length-1));
|
||||
}
|
||||
if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || leadCC!=0)) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
|
||||
(long)c);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
// Write small-FCD data.
|
||||
if((leadCC|trailCC)!=0) {
|
||||
setSmallFCD(c);
|
||||
}
|
||||
// Write the mapping & raw mapping extraData.
|
||||
int32_t firstUnit=length|(trailCC<<8);
|
||||
int32_t firstUnit=length|(norm.trailCC<<8);
|
||||
int32_t preMappingLength=0;
|
||||
if(norm.rawMapping!=NULL) {
|
||||
UnicodeString &rm=*norm.rawMapping;
|
||||
|
@ -98,7 +73,7 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
|
|||
}
|
||||
firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
|
||||
}
|
||||
int32_t cccLccc=norm.cc|(leadCC<<8);
|
||||
int32_t cccLccc=norm.cc|(norm.leadCC<<8);
|
||||
if(cccLccc!=0) {
|
||||
dataString.append((UChar)cccLccc);
|
||||
++preMappingLength;
|
||||
|
@ -187,65 +162,31 @@ void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
|
|||
(long)start, (long)end);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
if(norm.error!=nullptr) {
|
||||
fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
writeExtraData(start, norm);
|
||||
}
|
||||
|
||||
void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
|
||||
if(!norm.hasMapping()) {
|
||||
// Write small-FCD data.
|
||||
// There is similar code in writeMapping() for characters that do have a mapping.
|
||||
if(norm.cc!=0) {
|
||||
if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
|
||||
(long)c);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
setSmallFCD(c);
|
||||
}
|
||||
}
|
||||
if(norm.combinesBack) {
|
||||
if(norm.hasMapping()) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
|
||||
(long)c);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
if(norm.compositions!=NULL) {
|
||||
norm.offset=
|
||||
(maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
|
||||
Norm::OFFSET_MAYBE_YES;
|
||||
writeCompositions(c, norm, maybeYesCompositions);
|
||||
}
|
||||
} else if(!norm.hasMapping()) {
|
||||
if(norm.compositions!=NULL) {
|
||||
norm.offset=
|
||||
(yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
|
||||
Norm::OFFSET_YES_YES;
|
||||
writeCompositions(c, norm, yesYesCompositions);
|
||||
}
|
||||
} else if(norm.mappingType==Norm::ROUND_TRIP) {
|
||||
if(norm.compositions!=NULL) {
|
||||
int32_t offset=yesNoMappingsAndCompositions.length()+
|
||||
writeMapping(c, norm, yesNoMappingsAndCompositions);
|
||||
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
|
||||
writeCompositions(c, norm, yesNoMappingsAndCompositions);
|
||||
} else {
|
||||
int32_t offset=yesNoMappingsOnly.length()+
|
||||
writeMapping(c, norm, yesNoMappingsOnly);
|
||||
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
|
||||
}
|
||||
} else /* one-way */ {
|
||||
if(norm.compositions!=NULL) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX combines-forward and has a one-way mapping, "
|
||||
"not possible in Unicode normalization\n",
|
||||
(long)c);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
switch(norm.type) {
|
||||
case Norm::INERT:
|
||||
break; // no extra data
|
||||
case Norm::YES_YES_COMBINES_FWD:
|
||||
norm.offset=yesYesCompositions.length();
|
||||
writeCompositions(c, norm, yesYesCompositions);
|
||||
break;
|
||||
case Norm::YES_NO_COMBINES_FWD:
|
||||
norm.offset=yesNoMappingsAndCompositions.length()+
|
||||
writeMapping(c, norm, yesNoMappingsAndCompositions);
|
||||
writeCompositions(c, norm, yesNoMappingsAndCompositions);
|
||||
break;
|
||||
case Norm::YES_NO_MAPPING_ONLY:
|
||||
norm.offset=yesNoMappingsOnly.length()+
|
||||
writeMapping(c, norm, yesNoMappingsOnly);
|
||||
break;
|
||||
case Norm::NO_NO:
|
||||
if(norm.cc==0 && !optimizeFast) {
|
||||
// Try a compact, algorithmic encoding.
|
||||
// Only for ccc=0, because we can't store additional information
|
||||
|
@ -260,15 +201,25 @@ void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
|
|||
(!norm.hasNoCompBoundaryAfter || 1!=norm.mapping->countChar32())) {
|
||||
int32_t delta=norm.mappingCP-c;
|
||||
if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
|
||||
norm.offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
|
||||
norm.type=Norm::NO_NO_DELTA;
|
||||
norm.offset=delta;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(norm.offset==0) {
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
int32_t offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
|
||||
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
|
||||
}
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
norm.offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
|
||||
break;
|
||||
case Norm::MAYBE_YES_COMBINES_FWD:
|
||||
norm.offset=maybeYesCompositions.length();
|
||||
writeCompositions(c, norm, maybeYesCompositions);
|
||||
break;
|
||||
case Norm::MAYBE_YES_SIMPLE:
|
||||
break; // no extra data
|
||||
case Norm::YES_YES_WITH_CC:
|
||||
break; // no extra data
|
||||
default: // Should not occur.
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -37,10 +37,8 @@ public:
|
|||
UnicodeString yesNoMappingsAndCompositions;
|
||||
UnicodeString yesNoMappingsOnly;
|
||||
UnicodeString noNoMappings;
|
||||
uint8_t smallFCD[0x100];
|
||||
|
||||
private:
|
||||
void setSmallFCD(UChar32 c);
|
||||
/**
|
||||
* Requires norm.hasMapping().
|
||||
* Returns the offset of the "first unit" from the beginning of the extraData for c.
|
||||
|
|
|
@ -206,7 +206,7 @@ void Normalizer2DataBuilder::removeMapping(UChar32 c) {
|
|||
}
|
||||
}
|
||||
|
||||
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
|
||||
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer) {
|
||||
if(buffer.isEmpty()) {
|
||||
return TRUE; // maps-to-empty-string is no boundary of any kind
|
||||
}
|
||||
|
@ -215,7 +215,7 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
|
|||
return TRUE; // no starter
|
||||
}
|
||||
UChar32 starter=buffer.charAt(lastStarterIndex);
|
||||
if( Hangul::isJamoL(starter) ||
|
||||
if(Hangul::isJamoL(starter) ||
|
||||
(Hangul::isJamoV(starter) &&
|
||||
0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
|
||||
// A Jamo leading consonant or an LV pair combines-forward if it is at the end,
|
||||
|
@ -229,26 +229,93 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
|
|||
}
|
||||
// Compose as far as possible, and see if further compositions are possible.
|
||||
uint8_t prevCC=0;
|
||||
for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
|
||||
for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length(); ++combMarkIndex) {
|
||||
uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter
|
||||
if(norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
|
||||
return TRUE;
|
||||
}
|
||||
if(prevCC<cc && (starter=starterNorm->combine(buffer.charAt(combMarkIndex)))>=0) {
|
||||
buffer.setComposite(starter, combMarkIndex);
|
||||
starterNorm=&norms.getNormRef(starter);
|
||||
if(starterNorm->compositions==NULL) {
|
||||
return FALSE; // the composite does not combine further
|
||||
}
|
||||
// The combining mark at combMarkIndex has been removed.
|
||||
// Do not increment combMarkIndex now.
|
||||
// Keep prevCC because we "removed" the combining mark.
|
||||
} else {
|
||||
prevCC=cc;
|
||||
++combMarkIndex;
|
||||
}
|
||||
}
|
||||
// TRUE if the final, forward-combining starter is at the end.
|
||||
return prevCC==0;
|
||||
// TODO?! prevCC==0 || norms.combinesWithCCBetween(*starterNorm, prevCC, int32_t! 0x100)
|
||||
// TODO?! actually, should check if it combines with any cc not seen here
|
||||
}
|
||||
|
||||
// Finalizes one Norm struct: puts its mapping(s) into canonical order,
// records norm.leadCC/norm.trailCC, sets norm.hasNoCompBoundaryAfter for entries
// with a mapping, and assigns norm.type -- or norm.error when the combination of
// properties is not possible.
void Normalizer2DataBuilder::postProcess(Norm &norm) {
|
||||
// Prerequisites: Compositions are built, mappings are recursively decomposed.
|
||||
// Mappings are not yet in canonical order.
|
||||
//
|
||||
// This function works on a Norm struct. We do not know which code point(s) map(s) to it.
|
||||
// Therefore, we cannot compute algorithmic mapping deltas here.
|
||||
// Error conditions are checked, but printed later when we do know the offending code point.
|
||||
if(norm.hasMapping()) {
|
||||
// Ensure canonical order.
|
||||
BuilderReorderingBuffer buffer;
|
||||
if(norm.rawMapping!=nullptr) {
|
||||
norms.reorder(*norm.rawMapping, buffer);
|
||||
// Discard the raw mapping's contents so that, after the next reorder() call,
// the buffer holds only norm.mapping for the checks below.
buffer.reset();
|
||||
}
|
||||
norms.reorder(*norm.mapping, buffer);
|
||||
// Lead/trail ccc come from the first/last entry of the reordered mapping;
// an empty mapping has neither.
if(buffer.isEmpty()) {
|
||||
norm.leadCC=norm.trailCC=0;
|
||||
} else {
|
||||
norm.leadCC=buffer.ccAt(0);
|
||||
norm.trailCC=buffer.ccAt(buffer.length()-1);
|
||||
}
|
||||
|
||||
// Set the hasNoCompBoundaryAfter flag for use by the last code branch
|
||||
// in Normalizer2Impl::hasCompBoundaryAfter().
|
||||
// For details see the comments on hasNoCompBoundaryAfter(buffer).
|
||||
if(norm.compositions!=nullptr) {
|
||||
norm.hasNoCompBoundaryAfter=TRUE;
|
||||
} else {
|
||||
norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
|
||||
}
|
||||
|
||||
// Classify an entry that has a mapping. In the two error cases norm.type is
// not assigned here; only norm.error is set.
if(norm.combinesBack) {
|
||||
norm.error="combines-back and decomposes, not possible in Unicode normalization";
|
||||
} else if(norm.mappingType==Norm::ROUND_TRIP) {
|
||||
if(norm.compositions!=NULL) {
|
||||
norm.type=Norm::YES_NO_COMBINES_FWD;
|
||||
} else {
|
||||
norm.type=Norm::YES_NO_MAPPING_ONLY;
|
||||
}
|
||||
} else { // one-way mapping
|
||||
if(norm.compositions!=NULL) {
|
||||
norm.error="combines-forward and has a one-way mapping, "
|
||||
"not possible in Unicode normalization";
|
||||
} else {
|
||||
norm.type=Norm::NO_NO;
|
||||
}
|
||||
}
|
||||
} else { // no mapping
|
||||
// Without a mapping, the character's own ccc serves as both lead and trail ccc.
norm.leadCC=norm.trailCC=norm.cc;
|
||||
|
||||
if(norm.combinesBack) {
|
||||
if(norm.compositions!=nullptr) {
|
||||
// Earlier code checked ccc=0.
|
||||
norm.type=Norm::MAYBE_YES_COMBINES_FWD;
|
||||
} else {
|
||||
norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc
|
||||
}
|
||||
} else if(norm.compositions!=nullptr) {
|
||||
// Earlier code checked ccc=0.
|
||||
norm.type=Norm::YES_YES_COMBINES_FWD;
|
||||
} else if(norm.cc!=0) {
|
||||
norm.type=Norm::YES_YES_WITH_CC;
|
||||
} else {
|
||||
norm.type=Norm::INERT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class Norm16Writer : public Norms::Enumerator {
|
||||
|
@ -260,55 +327,70 @@ public:
|
|||
Normalizer2DataBuilder &builder;
|
||||
};
|
||||
|
||||
// Sets the bit in smallFCD[] that covers code point c.
// A BMP code point is recorded under its own value; a supplementary code point
// is recorded under its UTF-16 lead surrogate (U16_LEAD(c)).
void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
|
||||
UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
|
||||
// Byte index = lead>>8 (one byte per 256 code units),
// bit index = (lead>>5)&7 (one bit per block of 32 code units).
smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
|
||||
}
|
||||
|
||||
void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) {
|
||||
int32_t offset=norm.offset>>Norm::OFFSET_SHIFT;
|
||||
int32_t norm16=0;
|
||||
UBool isDecompNo=FALSE; // TRUE if need to ensure start>=minDecompNoCP
|
||||
UBool isCompNoMaybe=FALSE; // TRUE if need to ensure start>=minCompNoMaybeCP
|
||||
switch(norm.offset&Norm::OFFSET_MASK) {
|
||||
case Norm::OFFSET_NONE:
|
||||
// No mapping, no compositions list.
|
||||
if(norm.combinesBack) {
|
||||
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc;
|
||||
isDecompNo=(UBool)(norm.cc!=0);
|
||||
isCompNoMaybe=TRUE;
|
||||
} else if(norm.cc!=0) {
|
||||
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc;
|
||||
isDecompNo=isCompNoMaybe=TRUE;
|
||||
if(start<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || norm.leadCC!=0)) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
|
||||
(long)start);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
if((norm.leadCC|norm.trailCC)!=0) {
|
||||
for(UChar32 c=start; c<=end; ++c) {
|
||||
setSmallFCD(c);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t norm16;
|
||||
switch(norm.type) {
|
||||
case Norm::INERT:
|
||||
norm16=0;
|
||||
break;
|
||||
case Norm::OFFSET_MAYBE_YES:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
|
||||
isCompNoMaybe=TRUE;
|
||||
case Norm::YES_YES_COMBINES_FWD:
|
||||
norm16=norm.offset;
|
||||
break;
|
||||
case Norm::OFFSET_YES_YES:
|
||||
norm16=offset;
|
||||
case Norm::YES_NO_COMBINES_FWD:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset;
|
||||
break;
|
||||
case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
|
||||
isDecompNo=TRUE;
|
||||
break;
|
||||
case Norm::OFFSET_YES_NO_MAPPING_ONLY:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
|
||||
isDecompNo=TRUE;
|
||||
break;
|
||||
case Norm::OFFSET_NO_NO:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
|
||||
isDecompNo=isCompNoMaybe=TRUE;
|
||||
case Norm::YES_NO_MAPPING_ONLY:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset;
|
||||
break;
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
case Norm::OFFSET_DELTA:
|
||||
norm16=getCenterNoNoDelta()+offset;
|
||||
isDecompNo=isCompNoMaybe=TRUE;
|
||||
case Norm::NO_NO:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset;
|
||||
break;
|
||||
case Norm::NO_NO_DELTA:
|
||||
norm16=getCenterNoNoDelta()+norm.offset;
|
||||
break;
|
||||
case Norm::MAYBE_YES_COMBINES_FWD:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset;
|
||||
break;
|
||||
case Norm::MAYBE_YES_SIMPLE:
|
||||
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc; // ccc=0..255
|
||||
break;
|
||||
case Norm::YES_YES_WITH_CC:
|
||||
U_ASSERT(norm.cc!=0);
|
||||
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc; // ccc=1..255
|
||||
break;
|
||||
default: // Should not occur.
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
|
||||
utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
|
||||
|
||||
// Set the minimum code points for real data lookups in the quick check loops.
|
||||
UBool isDecompNo=
|
||||
(Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) ||
|
||||
norm.cc!=0;
|
||||
if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
|
||||
}
|
||||
UBool isCompNoMaybe= norm.type>=Norm::NO_NO;
|
||||
if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
|
||||
}
|
||||
|
@ -367,65 +449,48 @@ void Normalizer2DataBuilder::processData() {
|
|||
norm16Trie=utrie2_open(0, 0, errorCode);
|
||||
errorCode.assertSuccess();
|
||||
|
||||
// Build composition lists before recursive decomposition,
|
||||
// so that we still have the raw, pair-wise mappings.
|
||||
CompositionBuilder compBuilder(norms);
|
||||
norms.enumRanges(compBuilder);
|
||||
|
||||
// Recursively decompose all mappings.
|
||||
Decomposer decomposer(norms);
|
||||
do {
|
||||
decomposer.didDecompose=FALSE;
|
||||
norms.enumRanges(decomposer);
|
||||
} while(decomposer.didDecompose);
|
||||
|
||||
BuilderReorderingBuffer buffer;
|
||||
// Set the Norm::Type and other properties.
|
||||
int32_t normsLength=norms.length();
|
||||
for(int32_t i=1; i<normsLength; ++i) {
|
||||
// Set the hasNoCompBoundaryAfter flag for use by the last code branch
|
||||
// in Normalizer2Impl::hasCompBoundaryAfter().
|
||||
// For details see the comments on hasNoCompBoundaryAfter(buffer).
|
||||
Norm &norm=norms.getNormRefByIndex(i);
|
||||
if(norm.hasMapping()) {
|
||||
if(norm.compositions!=NULL) {
|
||||
norm.hasNoCompBoundaryAfter=TRUE;
|
||||
} else {
|
||||
buffer.reset();
|
||||
norms.reorder(norm, buffer);
|
||||
norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
|
||||
}
|
||||
}
|
||||
postProcess(norms.getNormRefByIndex(i));
|
||||
}
|
||||
|
||||
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
|
||||
|
||||
// Write the properties, mappings and composition lists to
|
||||
// appropriate parts of the "extra data" array.
|
||||
ExtraData extra(norms, optimization==OPTIMIZE_FAST);
|
||||
norms.enumRanges(extra);
|
||||
|
||||
extraData=extra.maybeYesCompositions;
|
||||
extraData.append(extra.yesYesCompositions).
|
||||
append(extra.yesNoMappingsAndCompositions).
|
||||
append(extra.yesNoMappingsOnly).
|
||||
append(extra.noNoMappings);
|
||||
extraData=extra.yesYesCompositions;
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length();
|
||||
extraData.append(extra.yesNoMappingsAndCompositions);
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length();
|
||||
extraData.append(extra.yesNoMappingsOnly);
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length();
|
||||
extraData.append(extra.noNoMappings);
|
||||
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length();
|
||||
|
||||
extraData.insert(0, extra.maybeYesCompositions);
|
||||
indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
|
||||
Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
|
||||
extra.maybeYesCompositions.length();
|
||||
|
||||
// Pad to even length for 4-byte alignment of following data.
|
||||
if(extraData.length()&1) {
|
||||
extraData.append((UChar)0);
|
||||
}
|
||||
memcpy(smallFCD, extra.smallFCD, sizeof(smallFCD));
|
||||
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO]=
|
||||
extra.yesYesCompositions.length();
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO]+
|
||||
extra.yesNoMappingsAndCompositions.length();
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO]=
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
|
||||
extra.yesNoMappingsOnly.length();
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO]+
|
||||
extra.noNoMappings.length();
|
||||
indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
|
||||
Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
|
||||
extra.maybeYesCompositions.length();
|
||||
|
||||
int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
|
||||
if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
|
||||
|
@ -435,6 +500,13 @@ void Normalizer2DataBuilder::processData() {
|
|||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
|
||||
// writeNorm16() and setHangulData() reduce these as needed.
|
||||
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
|
||||
|
||||
// Map each code point to its norm16 value,
|
||||
// including the properties that fit directly,
|
||||
// and the offset to the "extra data" if necessary.
|
||||
Norm16Writer norm16Writer(norms, *this);
|
||||
norms.enumRanges(norm16Writer);
|
||||
|
||||
|
@ -442,7 +514,7 @@ void Normalizer2DataBuilder::processData() {
|
|||
|
||||
// Look for the "worst" norm16 value of any supplementary code point
|
||||
// corresponding to a lead surrogate, and set it as that surrogate's value.
|
||||
// Enables quick check inner loops to look at only code units.
|
||||
// Enables UTF-16 quick check inner loops to look at only code units.
|
||||
//
|
||||
// We could be more sophisticated:
|
||||
// We could collect a bit set for whether there are values in the different
|
||||
|
@ -605,13 +677,6 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
|
|||
line,
|
||||
smallFCD, 8, sizeof(smallFCD),
|
||||
"\n};\n\n");
|
||||
/*fputs( // TODO
|
||||
"static const UCaseProps %s_singleton={\n"
|
||||
" NULL,\n"
|
||||
" %s_indexes,\n"
|
||||
" %s_extraData,\n"
|
||||
" %s_smallFCD,\n",
|
||||
f);*/
|
||||
sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
|
||||
char line2[100];
|
||||
sprintf(line2, "%s_trieIndex", dataName.data());
|
||||
|
|
|
@ -85,13 +85,15 @@ private:
|
|||
* or its mapping contains no starter,
|
||||
* or the last starter combines-forward.
|
||||
*/
|
||||
UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
|
||||
void setHangulData();
|
||||
UBool hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer);
|
||||
void postProcess(Norm &norm);
|
||||
|
||||
void setSmallFCD(UChar32 c);
|
||||
/**
* Returns the norm16 value that corresponds to an algorithmic mapping delta of 0.
* writeNorm16() adds the signed delta to this value for Norm::NO_NO_DELTA,
* so a delta of +MAX_DELTA yields indexes[IX_MIN_MAYBE_YES]-1.
* Reads indexes[IX_MIN_MAYBE_YES], so that index must already be set.
*/
int32_t getCenterNoNoDelta() {
|
||||
return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
|
||||
}
|
||||
void writeNorm16(UChar32 start, UChar32 end, Norm &norm);
|
||||
void setHangulData();
|
||||
void processData();
|
||||
|
||||
Norms norms;
|
||||
|
|
|
@ -43,23 +43,13 @@ void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
|
|||
fDidReorder=TRUE;
|
||||
}
|
||||
|
||||
void BuilderReorderingBuffer::toString(UnicodeString &dest) {
|
||||
void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
|
||||
dest.remove();
|
||||
for(int32_t i=0; i<fLength; ++i) {
|
||||
dest.append(charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Replaces the entry at fLastStarterIndex with the composite and deletes the
// entry at combMarkIndex, shifting all later entries down by one.
void BuilderReorderingBuffer::setComposite(UChar32 composite, int32_t combMarkIndex) {
|
||||
// The composite is stored shifted left by 8, leaving the low 8 bits zero.
// NOTE(review): presumably the low byte of each fArray entry holds the ccc -- confirm against append().
fArray[fLastStarterIndex]=composite<<8;
|
||||
// Remove the combining mark that contributed to the composite.
|
||||
--fLength;
|
||||
while(combMarkIndex<fLength) {
|
||||
fArray[combMarkIndex]=fArray[combMarkIndex+1];
|
||||
++combMarkIndex;
|
||||
}
|
||||
}
|
||||
|
||||
UChar32 Norm::combine(UChar32 trail) const {
|
||||
int32_t length;
|
||||
const CompositionPair *pairs=getCompositionPairs(length);
|
||||
|
@ -77,7 +67,9 @@ UChar32 Norm::combine(UChar32 trail) const {
|
|||
Norms::Norms(UErrorCode &errorCode) {
|
||||
normTrie=utrie2_open(0, 0, &errorCode);
|
||||
normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
|
||||
norms=allocNorm(); // unused Norm struct at index 0
|
||||
// Default "inert" Norm struct at index 0. Practically immutable.
|
||||
norms=allocNorm();
|
||||
norms->type=Norm::INERT;
|
||||
}
|
||||
|
||||
Norms::~Norms() {
|
||||
|
@ -122,13 +114,12 @@ Norm *Norms::createNorm(UChar32 c) {
|
|||
}
|
||||
}
|
||||
|
||||
void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
|
||||
UnicodeString &m=*norm.mapping;
|
||||
int32_t length=m.length();
|
||||
void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
|
||||
int32_t length=mapping.length();
|
||||
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
|
||||
return; // writeMapping() will complain about it and print the code point.
|
||||
}
|
||||
const UChar *s=toUCharPtr(m.getBuffer());
|
||||
const char16_t *s=mapping.getBuffer();
|
||||
int32_t i=0;
|
||||
UChar32 c;
|
||||
while(i<length) {
|
||||
|
@ -136,7 +127,7 @@ void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
|
|||
buffer.append(c, getCC(c));
|
||||
}
|
||||
if(buffer.didReorder()) {
|
||||
buffer.toString(m);
|
||||
buffer.toString(mapping);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -40,8 +40,7 @@ public:
|
|||
UBool didReorder() const { return fDidReorder; }
|
||||
|
||||
void append(UChar32 c, uint8_t cc);
|
||||
void toString(UnicodeString &dest);
|
||||
void setComposite(UChar32 composite, int32_t combMarkIndex);
|
||||
void toString(UnicodeString &dest) const;
|
||||
|
||||
private:
|
||||
int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
|
||||
|
@ -88,28 +87,54 @@ struct Norm {
|
|||
MappingType mappingType;
|
||||
|
||||
UVector32 *compositions; // (trail, composite) pairs
|
||||
uint8_t cc;
|
||||
uint8_t cc, leadCC, trailCC;
|
||||
UBool combinesBack;
|
||||
UBool hasNoCompBoundaryAfter;
|
||||
|
||||
enum OffsetType {
|
||||
OFFSET_NONE,
|
||||
// Composition for back-combining character. Allowed, but not normally used.
|
||||
OFFSET_MAYBE_YES,
|
||||
// Composition for a starter that does not have a decomposition mapping.
|
||||
OFFSET_YES_YES,
|
||||
// Round-trip mapping & composition for a starter.
|
||||
OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
|
||||
// Round-trip mapping for a starter that itself does not combine-forward.
|
||||
OFFSET_YES_NO_MAPPING_ONLY,
|
||||
/**
|
||||
* Overall type of normalization properties.
|
||||
* Set after most processing is done.
|
||||
*
|
||||
* Corresponds to the rows in the chart on
|
||||
* http://site.icu-project.org/design/normalization/custom
|
||||
* in numerical (but reverse visual) order.
|
||||
*
|
||||
* YES_NO means composition quick check=yes, decomposition QC=no -- etc.
|
||||
*/
|
||||
enum Type {
|
||||
/** Initial value until most processing is done. */
|
||||
UNKNOWN,
|
||||
/** No mapping, does not combine, ccc=0. */
|
||||
INERT,
|
||||
/** Starter, no mapping, has compositions. */
|
||||
YES_YES_COMBINES_FWD,
|
||||
/** Starter with a round-trip mapping and compositions. */
|
||||
YES_NO_COMBINES_FWD,
|
||||
/** Starter with a round-trip mapping but no compositions. */
|
||||
YES_NO_MAPPING_ONLY,
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
// One-way mapping.
|
||||
OFFSET_NO_NO,
|
||||
// Delta for an algorithmic one-way mapping.
|
||||
OFFSET_DELTA
|
||||
};
|
||||
enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
|
||||
/** Has a one-way mapping. */
|
||||
NO_NO,
|
||||
/** Has an algorithmic one-way mapping to a single code point. */
|
||||
NO_NO_DELTA,
|
||||
/**
|
||||
* Combines both backward and forward, has compositions.
|
||||
* Allowed, but not normally used.
|
||||
*/
|
||||
MAYBE_YES_COMBINES_FWD,
|
||||
/** Combines only backward. */
|
||||
MAYBE_YES_SIMPLE,
|
||||
/** Non-zero ccc but does not combine backward. */
|
||||
YES_YES_WITH_CC
|
||||
} type;
|
||||
/** Offset into the type's part of the extra data, or the algorithmic-mapping delta. */
|
||||
int32_t offset;
|
||||
|
||||
/**
|
||||
* Error string set by processing functions that do not have access
|
||||
* to the code point, deferred for readable reporting.
|
||||
*/
|
||||
const char *error;
|
||||
};
|
||||
|
||||
class Norms {
|
||||
|
@ -130,7 +155,7 @@ public:
|
|||
const Norm &getNormRef(UChar32 c) const;
|
||||
uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
|
||||
|
||||
void reorder(Norm &norm, BuilderReorderingBuffer &buffer) const;
|
||||
void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const;
|
||||
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
|
||||
|
||||
class Enumerator {
|
||||
|
|
Loading…
Add table
Reference in a new issue