ICU-13197 more gennorm2 cleanup, set smallFCD for some characters with algorithmic mappings

X-SVN-Rev: 40155
This commit is contained in:
Markus Scherer 2017-06-07 05:23:53 +00:00
parent b88c0cd8a9
commit 19d53e7641
8 changed files with 249 additions and 217 deletions

Binary file not shown.

Binary file not shown.

View file

@ -25,14 +25,7 @@ ExtraData::ExtraData(Norms &n, UBool fast) :
Norms::Enumerator(n),
yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul, 1=start of normal data
optimizeFast(fast) {
memset(smallFCD, 0, sizeof(smallFCD));
}
/**
 * Marks c in the smallFCD bit set.
 * A supplementary code point is represented by its lead surrogate.
 * Each byte of smallFCD covers 256 code units and each bit covers 32 of them.
 */
void ExtraData::setSmallFCD(UChar32 c) {
    UChar32 unit=c;
    if(unit>0xffff) {
        unit=U16_LEAD(c);
    }
    const int32_t byteIndex=unit>>8;
    const int32_t bitIndex=(unit>>5)&7;
    smallFCD[byteIndex]|=(uint8_t)1<<bitIndex;
}
optimizeFast(fast) {}
int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
UnicodeString &m=*norm.mapping;
@ -44,26 +37,8 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
(long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
exit(U_INVALID_FORMAT_ERROR);
}
int32_t leadCC, trailCC;
if(length==0) {
leadCC=trailCC=0;
} else {
leadCC=norms.getCC(m.char32At(0));
trailCC=norms.getCC(m.char32At(length-1));
}
if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || leadCC!=0)) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
// Write small-FCD data.
if((leadCC|trailCC)!=0) {
setSmallFCD(c);
}
// Write the mapping & raw mapping extraData.
int32_t firstUnit=length|(trailCC<<8);
int32_t firstUnit=length|(norm.trailCC<<8);
int32_t preMappingLength=0;
if(norm.rawMapping!=NULL) {
UnicodeString &rm=*norm.rawMapping;
@ -98,7 +73,7 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
}
firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
}
int32_t cccLccc=norm.cc|(leadCC<<8);
int32_t cccLccc=norm.cc|(norm.leadCC<<8);
if(cccLccc!=0) {
dataString.append((UChar)cccLccc);
++preMappingLength;
@ -187,65 +162,31 @@ void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
(long)start, (long)end);
exit(U_INTERNAL_PROGRAM_ERROR);
}
if(norm.error!=nullptr) {
fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
exit(U_INVALID_FORMAT_ERROR);
}
writeExtraData(start, norm);
}
void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
if(!norm.hasMapping()) {
// Write small-FCD data.
// There is similar code in writeMapping() for characters that do have a mapping.
if(norm.cc!=0) {
if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
setSmallFCD(c);
}
}
if(norm.combinesBack) {
if(norm.hasMapping()) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
if(norm.compositions!=NULL) {
norm.offset=
(maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
Norm::OFFSET_MAYBE_YES;
writeCompositions(c, norm, maybeYesCompositions);
}
} else if(!norm.hasMapping()) {
if(norm.compositions!=NULL) {
norm.offset=
(yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
Norm::OFFSET_YES_YES;
writeCompositions(c, norm, yesYesCompositions);
}
} else if(norm.mappingType==Norm::ROUND_TRIP) {
if(norm.compositions!=NULL) {
int32_t offset=yesNoMappingsAndCompositions.length()+
writeMapping(c, norm, yesNoMappingsAndCompositions);
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
writeCompositions(c, norm, yesNoMappingsAndCompositions);
} else {
int32_t offset=yesNoMappingsOnly.length()+
writeMapping(c, norm, yesNoMappingsOnly);
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
}
} else /* one-way */ {
if(norm.compositions!=NULL) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX combines-forward and has a one-way mapping, "
"not possible in Unicode normalization\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
switch(norm.type) {
case Norm::INERT:
break; // no extra data
case Norm::YES_YES_COMBINES_FWD:
norm.offset=yesYesCompositions.length();
writeCompositions(c, norm, yesYesCompositions);
break;
case Norm::YES_NO_COMBINES_FWD:
norm.offset=yesNoMappingsAndCompositions.length()+
writeMapping(c, norm, yesNoMappingsAndCompositions);
writeCompositions(c, norm, yesNoMappingsAndCompositions);
break;
case Norm::YES_NO_MAPPING_ONLY:
norm.offset=yesNoMappingsOnly.length()+
writeMapping(c, norm, yesNoMappingsOnly);
break;
case Norm::NO_NO:
if(norm.cc==0 && !optimizeFast) {
// Try a compact, algorithmic encoding.
// Only for ccc=0, because we can't store additional information
@ -260,15 +201,25 @@ void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
(!norm.hasNoCompBoundaryAfter || 1!=norm.mapping->countChar32())) {
int32_t delta=norm.mappingCP-c;
if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
norm.offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
norm.type=Norm::NO_NO_DELTA;
norm.offset=delta;
break;
}
}
}
if(norm.offset==0) {
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
int32_t offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
}
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
norm.offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
break;
case Norm::MAYBE_YES_COMBINES_FWD:
norm.offset=maybeYesCompositions.length();
writeCompositions(c, norm, maybeYesCompositions);
break;
case Norm::MAYBE_YES_SIMPLE:
break; // no extra data
case Norm::YES_YES_WITH_CC:
break; // no extra data
default: // Should not occur.
exit(U_INTERNAL_PROGRAM_ERROR);
}
}

View file

@ -37,10 +37,8 @@ public:
UnicodeString yesNoMappingsAndCompositions;
UnicodeString yesNoMappingsOnly;
UnicodeString noNoMappings;
uint8_t smallFCD[0x100];
private:
void setSmallFCD(UChar32 c);
/**
* Requires norm.hasMapping().
* Returns the offset of the "first unit" from the beginning of the extraData for c.

View file

@ -206,7 +206,7 @@ void Normalizer2DataBuilder::removeMapping(UChar32 c) {
}
}
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer) {
if(buffer.isEmpty()) {
return TRUE; // maps-to-empty-string is no boundary of any kind
}
@ -215,7 +215,7 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
return TRUE; // no starter
}
UChar32 starter=buffer.charAt(lastStarterIndex);
if( Hangul::isJamoL(starter) ||
if(Hangul::isJamoL(starter) ||
(Hangul::isJamoV(starter) &&
0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
// A Jamo leading consonant or an LV pair combines-forward if it is at the end,
@ -229,26 +229,93 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
}
// Compose as far as possible, and see if further compositions are possible.
uint8_t prevCC=0;
for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length(); ++combMarkIndex) {
uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter
if(norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
return TRUE;
}
if(prevCC<cc && (starter=starterNorm->combine(buffer.charAt(combMarkIndex)))>=0) {
buffer.setComposite(starter, combMarkIndex);
starterNorm=&norms.getNormRef(starter);
if(starterNorm->compositions==NULL) {
return FALSE; // the composite does not combine further
}
// The combining mark at combMarkIndex has been removed.
// Do not increment combMarkIndex now.
// Keep prevCC because we "removed" the combining mark.
} else {
prevCC=cc;
++combMarkIndex;
}
}
// TRUE if the final, forward-combining starter is at the end.
return prevCC==0;
// TODO?! prevCC==0 || norms.combinesWithCCBetween(*starterNorm, prevCC, int32_t! 0x100)
// TODO?! actually, should check if it combines with any cc not seen here
}
/**
 * Finalizes one Norm struct: puts its mapping(s) into canonical order,
 * sets leadCC/trailCC and hasNoCompBoundaryAfter, and classifies the struct
 * by setting norm.type, or norm.error for an impossible combination of properties.
 *
 * Expected state on entry: composition lists are built and mappings are
 * recursively decomposed, but mappings are not yet in canonical order.
 *
 * Only the Norm struct is available here, not the code point(s) mapping to it.
 * Algorithmic mapping deltas therefore cannot be computed in this function, and
 * error conditions are only recorded so that they can be printed later,
 * when the offending code point is known.
 */
void Normalizer2DataBuilder::postProcess(Norm &norm) {
    const UBool combinesFwd=(UBool)(norm.compositions!=nullptr);

    if(!norm.hasMapping()) {
        // Without a mapping, lead and trail ccc are the character's own ccc.
        norm.leadCC=norm.trailCC=norm.cc;
        if(norm.combinesBack) {
            // With compositions: earlier code checked ccc=0.
            // Without compositions: any ccc is possible.
            norm.type= combinesFwd ? Norm::MAYBE_YES_COMBINES_FWD : Norm::MAYBE_YES_SIMPLE;
        } else if(combinesFwd) {
            // Earlier code checked ccc=0.
            norm.type=Norm::YES_YES_COMBINES_FWD;
        } else {
            norm.type= norm.cc!=0 ? Norm::YES_YES_WITH_CC : Norm::INERT;
        }
        return;
    }

    // Bring the raw mapping (if any) and then the mapping into canonical order.
    // After this, the buffer holds the reordered contents of norm.mapping.
    BuilderReorderingBuffer buffer;
    if(norm.rawMapping!=nullptr) {
        norms.reorder(*norm.rawMapping, buffer);
        buffer.reset();
    }
    norms.reorder(*norm.mapping, buffer);

    // Lead/trail ccc come from the first/last entry of the reordered mapping;
    // both stay 0 when the buffer is empty.
    norm.leadCC=norm.trailCC=0;
    if(!buffer.isEmpty()) {
        norm.leadCC=buffer.ccAt(0);
        norm.trailCC=buffer.ccAt(buffer.length()-1);
    }

    // The hasNoCompBoundaryAfter flag is used by the last code branch
    // in Normalizer2Impl::hasCompBoundaryAfter().
    // For details see the comments on hasNoCompBoundaryAfter(buffer).
    // The buffer is inspected only when the struct has no compositions.
    norm.hasNoCompBoundaryAfter=
        combinesFwd ? (UBool)TRUE : hasNoCompBoundaryAfter(buffer);

    if(norm.combinesBack) {
        norm.error="combines-back and decomposes, not possible in Unicode normalization";
        return;
    }
    if(norm.mappingType==Norm::ROUND_TRIP) {
        norm.type= combinesFwd ? Norm::YES_NO_COMBINES_FWD : Norm::YES_NO_MAPPING_ONLY;
    } else if(combinesFwd) {
        // One-way mapping together with compositions.
        norm.error="combines-forward and has a one-way mapping, "
                   "not possible in Unicode normalization";
    } else {
        // One-way mapping, no compositions.
        norm.type=Norm::NO_NO;
    }
}
class Norm16Writer : public Norms::Enumerator {
@ -260,55 +327,70 @@ public:
Normalizer2DataBuilder &builder;
};
/**
 * Sets the smallFCD bit that covers c.
 * For a supplementary code point, the bit for its lead surrogate is set instead.
 * The byte index is the code unit's top 8 bits; the bit index is its next 3 bits.
 */
void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
    const UChar32 unit= c>0xffff ? U16_LEAD(c) : c;
    const uint8_t bit=(uint8_t)1<<((unit>>5)&7);
    smallFCD[unit>>8]|=bit;
}
void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) {
int32_t offset=norm.offset>>Norm::OFFSET_SHIFT;
int32_t norm16=0;
UBool isDecompNo=FALSE; // TRUE if need to ensure start>=minDecompNoCP
UBool isCompNoMaybe=FALSE; // TRUE if need to ensure start>=minCompNoMaybeCP
switch(norm.offset&Norm::OFFSET_MASK) {
case Norm::OFFSET_NONE:
// No mapping, no compositions list.
if(norm.combinesBack) {
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc;
isDecompNo=(UBool)(norm.cc!=0);
isCompNoMaybe=TRUE;
} else if(norm.cc!=0) {
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc;
isDecompNo=isCompNoMaybe=TRUE;
if(start<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || norm.leadCC!=0)) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
(long)start);
exit(U_INVALID_FORMAT_ERROR);
}
if((norm.leadCC|norm.trailCC)!=0) {
for(UChar32 c=start; c<=end; ++c) {
setSmallFCD(c);
}
}
int32_t norm16;
switch(norm.type) {
case Norm::INERT:
norm16=0;
break;
case Norm::OFFSET_MAYBE_YES:
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
isCompNoMaybe=TRUE;
case Norm::YES_YES_COMBINES_FWD:
norm16=norm.offset;
break;
case Norm::OFFSET_YES_YES:
norm16=offset;
case Norm::YES_NO_COMBINES_FWD:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset;
break;
case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
isDecompNo=TRUE;
break;
case Norm::OFFSET_YES_NO_MAPPING_ONLY:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
isDecompNo=TRUE;
break;
case Norm::OFFSET_NO_NO:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
isDecompNo=isCompNoMaybe=TRUE;
case Norm::YES_NO_MAPPING_ONLY:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset;
break;
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
case Norm::OFFSET_DELTA:
norm16=getCenterNoNoDelta()+offset;
isDecompNo=isCompNoMaybe=TRUE;
case Norm::NO_NO:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset;
break;
case Norm::NO_NO_DELTA:
norm16=getCenterNoNoDelta()+norm.offset;
break;
case Norm::MAYBE_YES_COMBINES_FWD:
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset;
break;
case Norm::MAYBE_YES_SIMPLE:
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc; // ccc=0..255
break;
case Norm::YES_YES_WITH_CC:
U_ASSERT(norm.cc!=0);
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc; // ccc=1..255
break;
default: // Should not occur.
exit(U_INTERNAL_PROGRAM_ERROR);
}
IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
// Set the minimum code points for real data lookups in the quick check loops.
UBool isDecompNo=
(Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) ||
norm.cc!=0;
if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
}
UBool isCompNoMaybe= norm.type>=Norm::NO_NO;
if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
}
@ -367,65 +449,48 @@ void Normalizer2DataBuilder::processData() {
norm16Trie=utrie2_open(0, 0, errorCode);
errorCode.assertSuccess();
// Build composition lists before recursive decomposition,
// so that we still have the raw, pair-wise mappings.
CompositionBuilder compBuilder(norms);
norms.enumRanges(compBuilder);
// Recursively decompose all mappings.
Decomposer decomposer(norms);
do {
decomposer.didDecompose=FALSE;
norms.enumRanges(decomposer);
} while(decomposer.didDecompose);
BuilderReorderingBuffer buffer;
// Set the Norm::Type and other properties.
int32_t normsLength=norms.length();
for(int32_t i=1; i<normsLength; ++i) {
// Set the hasNoCompBoundaryAfter flag for use by the last code branch
// in Normalizer2Impl::hasCompBoundaryAfter().
// For details see the comments on hasNoCompBoundaryAfter(buffer).
Norm &norm=norms.getNormRefByIndex(i);
if(norm.hasMapping()) {
if(norm.compositions!=NULL) {
norm.hasNoCompBoundaryAfter=TRUE;
} else {
buffer.reset();
norms.reorder(norm, buffer);
norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
}
}
postProcess(norms.getNormRefByIndex(i));
}
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
// Write the properties, mappings and composition lists to
// appropriate parts of the "extra data" array.
ExtraData extra(norms, optimization==OPTIMIZE_FAST);
norms.enumRanges(extra);
extraData=extra.maybeYesCompositions;
extraData.append(extra.yesYesCompositions).
append(extra.yesNoMappingsAndCompositions).
append(extra.yesNoMappingsOnly).
append(extra.noNoMappings);
extraData=extra.yesYesCompositions;
indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length();
extraData.append(extra.yesNoMappingsAndCompositions);
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length();
extraData.append(extra.yesNoMappingsOnly);
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length();
extraData.append(extra.noNoMappings);
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length();
extraData.insert(0, extra.maybeYesCompositions);
indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
extra.maybeYesCompositions.length();
// Pad to even length for 4-byte alignment of following data.
if(extraData.length()&1) {
extraData.append((UChar)0);
}
memcpy(smallFCD, extra.smallFCD, sizeof(smallFCD));
indexes[Normalizer2Impl::IX_MIN_YES_NO]=
extra.yesYesCompositions.length();
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
indexes[Normalizer2Impl::IX_MIN_YES_NO]+
extra.yesNoMappingsAndCompositions.length();
indexes[Normalizer2Impl::IX_MIN_NO_NO]=
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
extra.yesNoMappingsOnly.length();
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
indexes[Normalizer2Impl::IX_MIN_NO_NO]+
extra.noNoMappings.length();
indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
extra.maybeYesCompositions.length();
int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
@ -435,6 +500,13 @@ void Normalizer2DataBuilder::processData() {
exit(U_BUFFER_OVERFLOW_ERROR);
}
// writeNorm16() and setHangulData() reduce these as needed.
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
// Map each code point to its norm16 value,
// including the properties that fit directly,
// and the offset to the "extra data" if necessary.
Norm16Writer norm16Writer(norms, *this);
norms.enumRanges(norm16Writer);
@ -442,7 +514,7 @@ void Normalizer2DataBuilder::processData() {
// Look for the "worst" norm16 value of any supplementary code point
// corresponding to a lead surrogate, and set it as that surrogate's value.
// Enables quick check inner loops to look at only code units.
// Enables UTF-16 quick check inner loops to look at only code units.
//
// We could be more sophisticated:
// We could collect a bit set for whether there are values in the different
@ -605,13 +677,6 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
line,
smallFCD, 8, sizeof(smallFCD),
"\n};\n\n");
/*fputs( // TODO
"static const UCaseProps %s_singleton={\n"
" NULL,\n"
" %s_indexes,\n"
" %s_extraData,\n"
" %s_smallFCD,\n",
f);*/
sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
char line2[100];
sprintf(line2, "%s_trieIndex", dataName.data());

View file

@ -85,13 +85,15 @@ private:
* or its mapping contains no starter,
* or the last starter combines-forward.
*/
UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
void setHangulData();
UBool hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer);
void postProcess(Norm &norm);
void setSmallFCD(UChar32 c);
/**
 * Returns the norm16 value for an algorithmic mapping delta of 0.
 * Callers add a delta in -MAX_DELTA..MAX_DELTA to this value,
 * so the whole delta range ends just below indexes[IX_MIN_MAYBE_YES].
 */
int32_t getCenterNoNoDelta() {
    const int32_t minMaybeYes=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES];
    return minMaybeYes-Normalizer2Impl::MAX_DELTA-1;
}
void writeNorm16(UChar32 start, UChar32 end, Norm &norm);
void setHangulData();
void processData();
Norms norms;

View file

@ -43,23 +43,13 @@ void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
fDidReorder=TRUE;
}
void BuilderReorderingBuffer::toString(UnicodeString &dest) {
/**
 * Replaces the contents of dest with the buffer's code points, in buffer order.
 */
void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
    dest.remove();
    int32_t index=0;
    while(index<fLength) {
        dest.append(charAt(index));
        ++index;
    }
}
/**
 * Replaces the last starter with the given composite and removes the
 * combining mark at combMarkIndex, closing the gap it leaves behind.
 */
void BuilderReorderingBuffer::setComposite(UChar32 composite, int32_t combMarkIndex) {
    // The composite goes into the bits above the low 8 of the starter's slot;
    // the low 8 bits become 0 (presumably the ccc field -- confirm against append()).
    fArray[fLastStarterIndex]=composite<<8;
    // Shift every later entry down by one to remove the combining mark
    // that contributed to the composite.
    const int32_t newLength=fLength-1;
    for(int32_t i=combMarkIndex; i<newLength; ++i) {
        fArray[i]=fArray[i+1];
    }
    fLength=newLength;
}
UChar32 Norm::combine(UChar32 trail) const {
int32_t length;
const CompositionPair *pairs=getCompositionPairs(length);
@ -77,7 +67,9 @@ UChar32 Norm::combine(UChar32 trail) const {
Norms::Norms(UErrorCode &errorCode) {
normTrie=utrie2_open(0, 0, &errorCode);
normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
norms=allocNorm(); // unused Norm struct at index 0
// Default "inert" Norm struct at index 0. Practically immutable.
norms=allocNorm();
norms->type=Norm::INERT;
}
Norms::~Norms() {
@ -122,13 +114,12 @@ Norm *Norms::createNorm(UChar32 c) {
}
}
void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
UnicodeString &m=*norm.mapping;
int32_t length=m.length();
void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
int32_t length=mapping.length();
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
return; // writeMapping() will complain about it and print the code point.
}
const UChar *s=toUCharPtr(m.getBuffer());
const char16_t *s=mapping.getBuffer();
int32_t i=0;
UChar32 c;
while(i<length) {
@ -136,7 +127,7 @@ void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
buffer.append(c, getCC(c));
}
if(buffer.didReorder()) {
buffer.toString(m);
buffer.toString(mapping);
}
}

View file

@ -40,8 +40,7 @@ public:
UBool didReorder() const { return fDidReorder; }
void append(UChar32 c, uint8_t cc);
void toString(UnicodeString &dest);
void setComposite(UChar32 composite, int32_t combMarkIndex);
void toString(UnicodeString &dest) const;
private:
int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
@ -88,28 +87,54 @@ struct Norm {
MappingType mappingType;
UVector32 *compositions; // (trail, composite) pairs
uint8_t cc;
uint8_t cc, leadCC, trailCC;
UBool combinesBack;
UBool hasNoCompBoundaryAfter;
enum OffsetType {
OFFSET_NONE,
// Composition for back-combining character. Allowed, but not normally used.
OFFSET_MAYBE_YES,
// Composition for a starter that does not have a decomposition mapping.
OFFSET_YES_YES,
// Round-trip mapping & composition for a starter.
OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
// Round-trip mapping for a starter that itself does not combine-forward.
OFFSET_YES_NO_MAPPING_ONLY,
/**
* Overall type of normalization properties.
* Set after most processing is done.
*
* Corresponds to the rows in the chart on
* http://site.icu-project.org/design/normalization/custom
* in numerical (but reverse visual) order.
*
* YES_NO means composition quick check=yes, decomposition QC=no -- etc.
*/
enum Type {
/** Initial value until most processing is done. */
UNKNOWN,
/** No mapping, does not combine, ccc=0. */
INERT,
/** Starter, no mapping, has compositions. */
YES_YES_COMBINES_FWD,
/** Starter with a round-trip mapping and compositions. */
YES_NO_COMBINES_FWD,
/** Starter with a round-trip mapping but no compositions. */
YES_NO_MAPPING_ONLY,
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
// One-way mapping.
OFFSET_NO_NO,
// Delta for an algorithmic one-way mapping.
OFFSET_DELTA
};
enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
/** Has a one-way mapping. */
NO_NO,
/** Has an algorithmic one-way mapping to a single code point. */
NO_NO_DELTA,
/**
* Combines both backward and forward, has compositions.
* Allowed, but not normally used.
*/
MAYBE_YES_COMBINES_FWD,
/** Combines only backward. */
MAYBE_YES_SIMPLE,
/** Non-zero ccc but does not combine backward. */
YES_YES_WITH_CC
} type;
/** Offset into the type's part of the extra data, or the algorithmic-mapping delta. */
int32_t offset;
/**
* Error string set by processing functions that do not have access
* to the code point, deferred for readable reporting.
*/
const char *error;
};
class Norms {
@ -130,7 +155,7 @@ public:
const Norm &getNormRef(UChar32 c) const;
uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
void reorder(Norm &norm, BuilderReorderingBuffer &buffer) const;
void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const;
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
class Enumerator {