ICU-13197 split gennorm2/n2builder into 3 more manageable pieces; no output change

X-SVN-Rev: 40150
This commit is contained in:
Markus Scherer 2017-06-05 03:53:14 +00:00
parent acf2b4cc82
commit 425204d84b
8 changed files with 972 additions and 794 deletions

View file

@ -27,7 +27,7 @@ TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = gennorm2.o n2builder.o
OBJECTS = gennorm2.o n2builder.o extradata.o norms.o
DEPS = $(OBJECTS:.o=.d)

View file

@ -0,0 +1,277 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// extradata.cpp
// created: 2017jun04 Markus W. Scherer
// (pulled out of n2builder.cpp)
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/errorcode.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "extradata.h"
#include "normalizer2impl.h"
#include "norms.h"
#include "toolutil.h"
#include "utrie2.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Builds an extra-data collector that enumerates the given Norms.
// "fast" is stored as optimizeFast, which writeExtraData() consults
// before trying the compact delta encoding for one-way mappings.
ExtraData::ExtraData(Norms &n, UBool fast)
        : Norms::Enumerator(n),
          // 0=inert, 1=Jamo L, 2=start of compositions
          yesYesCompositions(1000, (UChar32)0xffff, 2),
          // 0=Hangul, 1=start of normal data
          yesNoMappingsAndCompositions(1000, (UChar32)0, 1),
          optimizeFast(fast) {
    // No small-FCD bits are set until setSmallFCD() is called.
    for(uint8_t &bits : smallFCD) {
        bits=0;
    }
}
// Sets the smallFCD bit for c.
// A code point above U+FFFF is represented by its lead surrogate.
// The byte index is (unit>>8) and the bit within that byte is ((unit>>5)&7).
void ExtraData::setSmallFCD(UChar32 c) {
    UChar32 unit=c;
    if(unit>0xffff) {
        unit=U16_LEAD(c);
    }
    const int32_t byteIndex=unit>>8;
    const int32_t bitIndex=(unit>>5)&7;
    smallFCD[byteIndex]|=(uint8_t)1<<bitIndex;
}
// Appends the extra data for c's mapping to dataString, in this order:
//   1. optional raw-mapping data (either just its first unit, or the raw mapping plus its length),
//   2. optional ccc/lccc word,
//   3. the "first unit" (mapping length, trail cc in bits 8 and up, and flag bits),
//   4. the mapping itself.
// Returns the number of units written before the "first unit" (items 1 and 2).
// Prints an error and exits with U_INVALID_FORMAT_ERROR if the mapping or raw mapping
// is longer than MAPPING_LENGTH_MASK, or if c is below MIN_CCC_LCCC_CP
// and has a nonzero ccc or lccc.
// Also sets the small-FCD bit for c when the mapping has a nonzero lead or trail cc.
int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
UnicodeString &m=*norm.mapping;
int32_t length=m.length();
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
fprintf(stderr,
"gennorm2 error: "
"mapping for U+%04lX longer than maximum of %d\n",
(long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
exit(U_INVALID_FORMAT_ERROR);
}
// Combining classes of the first and last code points of the mapping;
// both are 0 for an empty mapping.
int32_t leadCC, trailCC;
if(length==0) {
leadCC=trailCC=0;
} else {
leadCC=norms.getCC(m.char32At(0));
trailCC=norms.getCC(m.char32At(length-1));
}
if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || leadCC!=0)) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
// Write small-FCD data.
if((leadCC|trailCC)!=0) {
setSmallFCD(c);
}
// Write the mapping & raw mapping extraData.
int32_t firstUnit=length|(trailCC<<8);
// Counts the units appended before firstUnit; this is the return value.
int32_t preMappingLength=0;
if(norm.rawMapping!=NULL) {
UnicodeString &rm=*norm.rawMapping;
int32_t rmLength=rm.length();
if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
fprintf(stderr,
"gennorm2 error: "
"raw mapping for U+%04lX longer than maximum of %d\n",
(long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
exit(U_INVALID_FORMAT_ERROR);
}
UChar rm0=rm.charAt(0);
// The compressed form requires rm0>MAPPING_LENGTH_MASK,
// while the uncompressed form ends with rmLength, which is at most MAPPING_LENGTH_MASK.
if( rmLength==length-1 &&
// 99: overlong substring lengths get pinned to remainder lengths anyway
0==rm.compare(1, 99, m, 2, 99) &&
rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
// Compression:
// rawMapping=rm0+mapping.substring(2) -> store only rm0
//
// The raw mapping is the same as the final mapping after replacing
// the final mapping's first two code units with the raw mapping's first one.
// In this case, we store only that first unit, rm0.
// This helps with a few hundred mappings.
dataString.append(rm0);
preMappingLength=1;
} else {
// Store the raw mapping with its length.
dataString.append(rm);
dataString.append((UChar)rmLength);
preMappingLength=rmLength+1;
}
firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
}
// One word with c's own ccc in the low byte and the mapping's lead cc in the high byte,
// written only if at least one of them is nonzero.
int32_t cccLccc=norm.cc|(leadCC<<8);
if(cccLccc!=0) {
dataString.append((UChar)cccLccc);
++preMappingLength;
firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
}
if(norm.hasNoCompBoundaryAfter) {
firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
}
dataString.append((UChar)firstUnit);
dataString.append(m);
return preMappingLength;
}
// Writes c's mapping to the end of dataString via writeMapping(),
// then de-duplicates it against previousMappings.
// The hashtable stores offset+1 per written unit sequence, because 0 means "not found".
// Returns the offset of the mapping's "first unit" in dataString:
// the earlier offset if an identical sequence was already written
// (the new units are removed again), otherwise the offset of the new data.
int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
                                    UnicodeString &dataString,
                                    Hashtable &previousMappings) {
    const int32_t start=dataString.length();
    const int32_t preMappingLength=writeMapping(c, norm, dataString);
    // All of the units that writeMapping() just appended.
    UnicodeString written=dataString.tempSubString(start);
    const int32_t stored=previousMappings.geti(written);
    if(stored==0) {
        // First occurrence: remember it, shifted by one to avoid the "not found" value.
        int32_t offset=start+preMappingLength;
        IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
        previousMappings.puti(written, offset+1, errorCode);
        return offset;
    }
    // Duplicate: drop the new units and point to the old ones.
    dataString.truncate(start);
    return stored-1;
}
// Appends the encoded list of norm's (trail, composite) pairs to dataString.
// Each pair takes two units, or three when the trail character is at least
// COMP_1_TRAIL_LIMIT or when the shifted composite does not fit into 16 bits.
// The first unit of the last pair gets the COMP_1_LAST_TUPLE bit.
// Prints an error and exits with U_INVALID_FORMAT_ERROR if norm.cc!=0.
void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
    if(norm.cc!=0) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    int32_t count;
    const CompositionPair *pairs=norm.getCompositionPairs(count);
    for(int32_t index=0; index<count; ++index) {
        const UChar32 trail=pairs[index].trail;
        const UChar32 composite=pairs[index].composite;
        // 22 bits for the composite character and whether it combines forward.
        UChar32 compositeAndFwd=composite<<1;
        if(norms.getNormRef(composite).compositions!=nullptr) {
            compositeAndFwd|=1;  // The composite character also combines-forward.
        }
        // Encode most pairs in two units and some in three.
        // unit3 stays negative when there is no third unit.
        int32_t unit1, unit2, unit3=-1;
        if(trail>=Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
            unit1=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
                   (trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
                  Normalizer2Impl::COMP_1_TRIPLE;
            unit2=(trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
                  (compositeAndFwd>>16);
            unit3=compositeAndFwd;
        } else if(compositeAndFwd>0xffff) {
            unit1=(trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
            unit2=compositeAndFwd>>16;
            unit3=compositeAndFwd;
        } else {
            unit1=trail<<1;
            unit2=compositeAndFwd;
        }
        // Set the high bit of the first unit if this is the last composition pair.
        if(index==(count-1)) {
            unit1|=Normalizer2Impl::COMP_1_LAST_TUPLE;
        }
        dataString.append((UChar)unit1);
        dataString.append((UChar)unit2);
        if(unit3>=0) {
            dataString.append((UChar)unit3);
        }
    }
}
// Enumeration callback: writes the extra data for a single code point.
// A range with more than one code point is treated as an internal error,
// which prints a message and exits with U_INTERNAL_PROGRAM_ERROR.
void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    if(start==end) {
        writeExtraData(start, norm);
        return;
    }
    fprintf(stderr,
            "gennorm2 error: unexpected shared data for "
            "multiple code points U+%04lX..U+%04lX\n",
            (long)start, (long)end);
    exit(U_INTERNAL_PROGRAM_ERROR);
}
// Writes the extra data for c into one of the output strings and records
// where it went in norm.offset, as (offset<<Norm::OFFSET_SHIFT)|offset type.
// The destination depends on the kind of data:
// - combines-back with compositions: maybeYesCompositions (OFFSET_MAYBE_YES)
// - no mapping, with compositions: yesYesCompositions (OFFSET_YES_YES)
// - round-trip mapping with compositions: yesNoMappingsAndCompositions
// - round-trip mapping without compositions: yesNoMappingsOnly
// - one-way mapping: either a delta stored directly in norm.offset (OFFSET_DELTA),
//   or noNoMappings (OFFSET_NO_NO)
// norm.offset is left unchanged for a character with neither mapping nor compositions.
// Inconsistent input prints an error and exits with U_INVALID_FORMAT_ERROR.
void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
if(!norm.hasMapping()) {
// Write small-FCD data.
// There is similar code in writeMapping() for characters that do have a mapping.
if(norm.cc!=0) {
if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
setSmallFCD(c);
}
}
if(norm.combinesBack) {
if(norm.hasMapping()) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
if(norm.compositions!=NULL) {
// The offset is the string length before writing, that is, the start of this list.
norm.offset=
(maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
Norm::OFFSET_MAYBE_YES;
writeCompositions(c, norm, maybeYesCompositions);
}
} else if(!norm.hasMapping()) {
if(norm.compositions!=NULL) {
norm.offset=
(yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
Norm::OFFSET_YES_YES;
writeCompositions(c, norm, yesYesCompositions);
}
} else if(norm.mappingType==Norm::ROUND_TRIP) {
if(norm.compositions!=NULL) {
// The offset points to the mapping's "first unit":
// the string length before writing plus the pre-mapping length from writeMapping().
// The compositions list follows the mapping.
int32_t offset=yesNoMappingsAndCompositions.length()+
writeMapping(c, norm, yesNoMappingsAndCompositions);
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
writeCompositions(c, norm, yesNoMappingsAndCompositions);
} else {
int32_t offset=yesNoMappingsOnly.length()+
writeMapping(c, norm, yesNoMappingsOnly);
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
}
} else /* one-way */ {
if(norm.compositions!=NULL) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX combines-forward and has a one-way mapping, "
"not possible in Unicode normalization\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
if(norm.cc==0 && !optimizeFast) {
// Try a compact, algorithmic encoding.
// Only for ccc=0, because we can't store additional information
// and we do not recursively follow an algorithmic encoding for access to the ccc.
//
// Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
// if the mappingCP decomposes further, to ensure that there is a place to store it.
// We want to see that the final mapping does not have exactly 1 code point,
// or else we would have to recursively ensure that the final mapping is stored
// in normal extraData.
if(norm.mappingCP>=0 &&
(!norm.hasNoCompBoundaryAfter || 1!=norm.mapping->countChar32())) {
int32_t delta=norm.mappingCP-c;
if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
norm.offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
}
}
}
// norm.offset is still 0 here if the delta encoding above was not used.
// NOTE(review): this relies on norm.offset being 0 before this function runs -- confirm
// that Norm structs are zero-initialized when allocated.
if(norm.offset==0) {
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
int32_t offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
norm.offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
}
}
}
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_NORMALIZATION

View file

@ -0,0 +1,65 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// extradata.h
// created: 2017jun04 Markus W. Scherer
// (pulled out of n2builder.cpp)
// Write mappings and compositions in compact form for Normalizer2 "extra data",
// the data that does not fit into the trie itself.
#ifndef __EXTRADATA_H__
#define __EXTRADATA_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/errorcode.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "hash.h"
#include "norms.h"
#include "toolutil.h"
#include "utrie2.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/**
 * Norms enumerator that writes each character's mappings and compositions
 * into one of several output strings, and records in each Norm
 * where its data was written (Norm::offset).
 * The output strings and the smallFCD bit set are public so that
 * the caller can read them after the enumeration.
 */
class ExtraData : public Norms::Enumerator {
public:
ExtraData(Norms &n, UBool fast);
/** Writes the extra data for one code point. Exits with an error if start!=end. */
void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override;
// Composition lists for characters that combine back.
UnicodeString maybeYesCompositions;
// Composition lists for characters without a mapping.
UnicodeString yesYesCompositions;
// Round-trip mappings followed by composition lists.
UnicodeString yesNoMappingsAndCompositions;
// Round-trip mappings for characters without compositions.
UnicodeString yesNoMappingsOnly;
// One-way mappings that are not encoded as a delta.
UnicodeString noNoMappings;
// Bit set written by setSmallFCD(): one bit per 0x20 code units.
uint8_t smallFCD[0x100];
private:
/** Sets the smallFCD bit for c, or for its lead surrogate if c is above U+FFFF. */
void setSmallFCD(UChar32 c);
/**
* Requires norm.hasMapping().
* Returns the offset of the "first unit" from the beginning of the extraData for c.
* That is the same as the length of the optional data
* for the raw mapping and the ccc/lccc word.
*/
int32_t writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString);
/**
 * Writes the mapping like writeMapping(), but reuses identical data
 * that was written before, according to previousMappings.
 * Returns the offset of the "first unit" in dataString.
 */
int32_t writeNoNoMapping(UChar32 c, const Norm &norm,
UnicodeString &dataString, Hashtable &previousMappings);
/** Requires norm.compositions!=nullptr. */
void writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString);
/** Chooses the output string for c's data, writes it, and sets norm.offset. */
void writeExtraData(UChar32 c, Norm &norm);
// If true, the compact delta encoding for one-way mappings is not attempted.
UBool optimizeFast;
// Maps previously written noNoMappings data to its offset+1, for writeNoNoMapping().
Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode.
};
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_NORMALIZATION
#endif // __EXTRADATA_H__

View file

@ -249,11 +249,15 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="extradata.cpp" />
<ClCompile Include="gennorm2.cpp" />
<ClCompile Include="n2builder.cpp" />
<ClCompile Include="norms.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="extradata.h" />
<ClInclude Include="n2builder.h" />
<ClInclude Include="norms.h" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\common\common.vcxproj">

File diff suppressed because it is too large Load diff

View file

@ -28,16 +28,12 @@
#include "normalizer2impl.h" // for IX_COUNT
#include "toolutil.h"
#include "utrie2.h"
#include "norms.h"
U_NAMESPACE_BEGIN
extern UBool beVerbose, haveCopyright;
struct Norm;
class BuilderReorderingBuffer;
class ExtraDataWriter;
class Normalizer2DataBuilder {
public:
Normalizer2DataBuilder(UErrorCode &errorCode);
@ -69,42 +65,36 @@ public:
void writeCSourceFile(const char *filename);
private:
friend class CompositionBuilder;
friend class Decomposer;
friend class ExtraDataWriter;
friend class Norm16Writer;
// No copy constructor nor assignment operator.
Normalizer2DataBuilder(const Normalizer2DataBuilder &other);
Normalizer2DataBuilder &operator=(const Normalizer2DataBuilder &other);
Normalizer2DataBuilder(const Normalizer2DataBuilder &other) = delete;
Normalizer2DataBuilder &operator=(const Normalizer2DataBuilder &other) = delete;
Norm *allocNorm();
Norm *getNorm(UChar32 c);
Norm *createNorm(UChar32 c);
Norm *checkNormForMapping(Norm *p, UChar32 c); // check for permitted overrides
const Norm &getNormRef(UChar32 c) const;
uint8_t getCC(UChar32 c) const;
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
UChar32 combine(const Norm &norm, UChar32 trail) const;
void addComposition(UChar32 start, UChar32 end, uint32_t value);
UBool decompose(UChar32 start, UChar32 end, uint32_t value);
void reorder(Norm *p, BuilderReorderingBuffer &buffer);
/**
* Computes the MAPPING_NO_COMP_BOUNDARY_AFTER flag for a character's mapping
* (especially for a "YesNo" which has a round-trip mapping).
* This flag is used in Normalizer2Impl::hasCompBoundaryAfter().
*
* Modifies the buffer (partially composes it).
*
* A starter character with a mapping does not have a composition boundary after it
* if the character itself combines-forward (which is tested by the caller of this function),
* or it is deleted (mapped to the empty string),
* or its mapping contains no starter,
* or the last starter combines-forward.
*/
UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
void setHangulData();
int32_t writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString);
void writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString);
void writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer);
int32_t getCenterNoNoDelta() {
return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
}
void writeNorm16(UChar32 start, UChar32 end, uint32_t value);
void writeNorm16(UChar32 start, UChar32 end, Norm &norm);
void processData();
UTrie2 *normTrie;
UToolMemory *normMem;
Norm *norms;
Norms norms;
int32_t phase;
OverrideHandling overrideHandling;

View file

@ -0,0 +1,333 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// norms.cpp
// created: 2017jun04 Markus W. Scherer
// (pulled out of n2builder.cpp)
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/errorcode.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "normalizer2impl.h"
#include "norms.h"
#include "toolutil.h"
#include "utrie2.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Adds c with combining class cc, keeping the buffer in canonical order.
// Each entry is stored as (c<<8)|cc.
// A character whose cc is lower than that of the preceding entry is inserted
// further back, but never at or before the last starter; that sets fDidReorder.
void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
    const int32_t entry=(c<<8)|cc;
    if(cc!=0 && fLength!=0 && ccAt(fLength-1)>cc) {
        // Let this character bubble back to its canonical order:
        // find the position after the last starter or after the last entry with prevCC<=cc.
        int32_t insertAt=fLength;
        while((insertAt-1)>fLastStarterIndex && ccAt(insertAt-1)>cc) {
            --insertAt;
        }
        // Move the following characters forward one to make space.
        int32_t to=fLength;
        while(insertAt<to) {
            fArray[to]=fArray[to-1];
            --to;
        }
        fArray[insertAt]=entry;
        ++fLength;
        fDidReorder=TRUE;
        return;
    }
    // Already in order: simply add it at the end.
    if(cc==0) {
        fLastStarterIndex=fLength;
    }
    fArray[fLength]=entry;
    ++fLength;
}
// Replaces the contents of dest with the buffer's code points, in buffer order.
void BuilderReorderingBuffer::toString(UnicodeString &dest) {
    dest.remove();
    int32_t index=0;
    while(index<fLength) {
        const UChar32 cp=charAt(index);
        dest.append(cp);
        ++index;
    }
}
// Replaces the last starter with the composite (stored with cc=0)
// and removes the entry at combMarkIndex, shifting later entries down by one.
void BuilderReorderingBuffer::setComposite(UChar32 composite, int32_t combMarkIndex) {
    fArray[fLastStarterIndex]=composite<<8;
    // Remove the combining mark that contributed to the composite.
    --fLength;
    for(int32_t index=combMarkIndex; index<fLength; ++index) {
        fArray[index]=fArray[index+1];
    }
}
// Returns the composite for this character combined with trail,
// or U_SENTINEL if there is no such composition.
// The scan stops at the first pair whose trail is greater than the one sought.
UChar32 Norm::combine(UChar32 trail) const {
    int32_t count;
    const CompositionPair *pairs=getCompositionPairs(count);
    int32_t index=0;
    while(index<count && pairs[index].trail<=trail) {
        if(pairs[index].trail==trail) {
            return pairs[index].composite;
        }
        ++index;
    }
    return U_SENTINEL;
}
// Opens the code point trie and the Norm storage,
// and allocates the Norm at index 0, which is not used for any code point data.
Norms::Norms(UErrorCode &errorCode)
        : normTrie(utrie2_open(0, 0, &errorCode)),
          normMem(utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm))),
          norms(nullptr) {
    norms=allocNorm();  // unused Norm struct at index 0
}
// Closes the trie, deletes the strings and vectors owned by each Norm
// (skipping the unused Norm at index 0), and then releases the Norm storage.
Norms::~Norms() {
    utrie2_close(normTrie);
    const int32_t count=utm_countItems(normMem);
    for(int32_t index=1; index<count; ++index) {
        Norm &norm=norms[index];
        delete norm.mapping;
        delete norm.rawMapping;
        delete norm.compositions;
    }
    utm_close(normMem);
}
// Allocates one more Norm and returns a pointer to it.
// Also refreshes the norms base pointer, in case the storage got reallocated.
Norm *Norms::allocNorm() {
    Norm *fresh=static_cast<Norm *>(utm_alloc(normMem));
    norms=static_cast<Norm *>(utm_getStart(normMem));
    return fresh;
}
// Returns the Norm for c, or nullptr if the trie value for c is 0 (no data).
Norm *Norms::getNorm(UChar32 c) {
    const uint32_t index=utrie2_get32(normTrie, c);
    return index!=0 ? norms+index : nullptr;
}
// Returns the Norm for c. A code point without data has trie value 0
// and therefore yields the Norm at index 0.
const Norm &Norms::getNormRef(UChar32 c) const {
    const uint32_t index=utrie2_get32(normTrie, c);
    return norms[index];
}
// Returns the existing Norm for c, or allocates a new one
// and stores its index in the trie for c.
Norm *Norms::createNorm(UChar32 c) {
    const uint32_t existing=utrie2_get32(normTrie, c);
    if(existing!=0) {
        return norms+existing;
    }
    /* allocate Norm */
    Norm *created=allocNorm();
    IcuToolErrorCode errorCode("gennorm2/createNorm()");
    // allocNorm() refreshed norms, so the pointer difference is the new index.
    utrie2_set32(normTrie, c, (uint32_t)(created-norms), errorCode);
    return created;
}
// Appends the code points of norm's mapping to the buffer, which puts them
// into canonical order, and writes the result back into the mapping
// if the buffer reports that it reordered anything.
// Does nothing for a mapping longer than MAPPING_LENGTH_MASK.
void Norms::reorder(Norm &norm, BuilderReorderingBuffer &buffer) const {
    UnicodeString &mapping=*norm.mapping;
    const int32_t limit=mapping.length();
    if(limit<=Normalizer2Impl::MAPPING_LENGTH_MASK) {
        const UChar *units=toUCharPtr(mapping.getBuffer());
        for(int32_t pos=0; pos<limit;) {
            UChar32 cp;
            U16_NEXT(units, pos, limit, cp);
            buffer.append(cp, getCC(cp));
        }
        if(buffer.didReorder()) {
            buffer.toString(mapping);
        }
    }
    // else: writeMapping() will complain about it and print the code point.
}
// Returns TRUE if norm has a composition whose trail character
// has a combining class strictly between lowCC and highCC.
UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const {
    // There is no value strictly between the two unless they differ by at least 2.
    if((highCC-lowCC)<2) {
        return FALSE;
    }
    int32_t count;
    const CompositionPair *pairs=norm.getCompositionPairs(count);
    for(int32_t index=0; index<count; ++index) {
        const uint8_t trailCC=getCC(pairs[index].trail);
        if(lowCC<trailCC && trailCC<highCC) {
            return TRUE;
        }
    }
    return FALSE;
}
U_CDECL_BEGIN
// C callback for utrie2_enum(): the context is the Norms::Enumerator
// that Norms::enumRanges() passed in; forwards to its value-based rangeHandler().
static UBool U_CALLCONV
enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    Norms::Enumerator *enumerator=
        static_cast<Norms::Enumerator *>(const_cast<void *>(context));
    return enumerator->rangeHandler(start, end, value);
}
U_CDECL_END
// Enumerates the trie's ranges and passes each one to e via enumRangeHandler().
void Norms::enumRanges(Enumerator &e) {
    Enumerator *context=&e;
    utrie2_enum(normTrie, nullptr, enumRangeHandler, context);
}
// Out-of-line definition of the virtual destructor; there is nothing to release.
Norms::Enumerator::~Enumerator() = default;
// Trie enumeration entry point: looks up the Norm for a nonzero trie value
// and passes it to the virtual rangeHandler(). Ranges with value 0 are skipped.
// Always returns TRUE.
UBool Norms::Enumerator::rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    if(value==0) {
        return TRUE;
    }
    Norm &norm=norms.getNormRefByIndex(value);
    rangeHandler(start, end, norm);
    return TRUE;
}
// For a character with a round-trip mapping, registers the composition
// lead + trail -> start: marks the trail character as combining back,
// and inserts the (trail, start) pair into the lead character's compositions list,
// which is kept sorted by trail character.
// Does nothing for other mapping types.
// Prints an error and exits with U_INVALID_FORMAT_ERROR if the range has more than
// one code point, if the character or the mapping's first character has ccc!=0,
// or if the lead character already has a pair with the same trail character.
void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
if(start!=end) {
fprintf(stderr,
"gennorm2 error: same round-trip mapping for "
"more than 1 code point U+%04lX..U+%04lX\n",
(long)start, (long)end);
exit(U_INVALID_FORMAT_ERROR);
}
if(norm.cc!=0) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX has a round-trip mapping and ccc!=0, "
"not possible in Unicode normalization\n",
(long)start);
exit(U_INVALID_FORMAT_ERROR);
}
// setRoundTripMapping() ensured that there are exactly two code points.
const UnicodeString &m=*norm.mapping;
UChar32 lead=m.char32At(0);
UChar32 trail=m.char32At(m.length()-1);
if(norms.getCC(lead)!=0) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
"not possible in Unicode normalization\n",
(long)start, (long)lead);
exit(U_INVALID_FORMAT_ERROR);
}
// NOTE(review): createNorm() may allocate a Norm, and allocNorm() refreshes its base pointer
// "in case it got reallocated"; norm is not used below this point -- keep it that way.
// Flag for trailing character.
norms.createNorm(trail)->combinesBack=TRUE;
// Insert (trail, composite) pair into compositions list for the lead character.
IcuToolErrorCode errorCode("gennorm2/addComposition()");
Norm *leadNorm=norms.createNorm(lead);
UVector32 *compositions=leadNorm->compositions;
// Pair index at which to insert; each pair occupies two vector elements.
int32_t i;
if(compositions==nullptr) {
compositions=leadNorm->compositions=new UVector32(errorCode);
i=0; // "insert" the first pair at index 0
} else {
// Insertion sort, and check for duplicate trail characters.
int32_t length;
const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
for(i=0; i<length; ++i) {
if(trail==pairs[i].trail) {
fprintf(stderr,
"gennorm2 error: same round-trip mapping for "
"more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
(long)start, (long)lead, (long)trail);
exit(U_INVALID_FORMAT_ERROR);
}
if(trail<pairs[i].trail) {
break;
}
}
}
// The composite is start, the character that has the round-trip mapping.
compositions->insertElementAt(trail, 2*i, errorCode);
compositions->insertElementAt(start, 2*i+1, errorCode);
}
// Decomposes norm's mapping one level further: each code point of the mapping
// that itself has a mapping is replaced with that mapping, and each Hangul syllable
// is replaced with its Hangul::decompose() result.
// If anything was replaced, the new string becomes norm.mapping, the first original
// mapping is kept as norm.rawMapping, and didDecompose is set.
// Does nothing if norm has no mapping.
// Prints an error and exits with U_INVALID_FORMAT_ERROR if the mapping contains
// a code point from start..end, or if a round-trip mapping decomposes further
// in one of the ways checked below.
void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
if(!norm.hasMapping()) { return; }
const UnicodeString &m=*norm.mapping;
// Created lazily when the first code point needs to be replaced;
// until then the original mapping is unchanged.
UnicodeString *decomposed=nullptr;
const UChar *s=toUCharPtr(m.getBuffer());
int32_t length=m.length();
// prev is the index of the start of the current code point c, i is the index after it.
int32_t prev, i=0;
UChar32 c;
while(i<length) {
prev=i;
U16_NEXT(s, i, length, c);
if(start<=c && c<=end) {
fprintf(stderr,
"gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
(long)c);
exit(U_INVALID_FORMAT_ERROR);
}
const Norm &cNorm=norms.getNormRef(c);
if(cNorm.hasMapping()) {
if(norm.mappingType==Norm::ROUND_TRIP) {
// In a round-trip mapping, only the first code point may decompose further,
// and only via another round-trip mapping.
if(prev==0) {
if(cNorm.mappingType!=Norm::ROUND_TRIP) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX's round-trip mapping's starter "
"U+%04lX one-way-decomposes, "
"not possible in Unicode normalization\n",
(long)start, (long)c);
exit(U_INVALID_FORMAT_ERROR);
}
// Compare the cc of the code point after c in this mapping
// with the cc of the last code point of c's own mapping.
uint8_t myTrailCC=norms.getCC(m.char32At(i));
UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
uint8_t cTrailCC=norms.getCC(cTrailChar);
if(cTrailCC>myTrailCC) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX's round-trip mapping's starter "
"U+%04lX decomposes and the "
"inner/earlier tccc=%hu > outer/following tccc=%hu, "
"not possible in Unicode normalization\n",
(long)start, (long)c,
(short)cTrailCC, (short)myTrailCC);
exit(U_INVALID_FORMAT_ERROR);
}
} else {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX's round-trip mapping's non-starter "
"U+%04lX decomposes, "
"not possible in Unicode normalization\n",
(long)start, (long)c);
exit(U_INVALID_FORMAT_ERROR);
}
}
if(decomposed==nullptr) {
// Start the new string with the unchanged part of the mapping before c.
decomposed=new UnicodeString(m, 0, prev);
}
decomposed->append(*cNorm.mapping);
} else if(Hangul::isHangul(c)) {
UChar buffer[3];
int32_t hangulLength=Hangul::decompose(c, buffer);
if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX's round-trip mapping's non-starter "
"U+%04lX decomposes, "
"not possible in Unicode normalization\n",
(long)start, (long)c);
exit(U_INVALID_FORMAT_ERROR);
}
if(decomposed==nullptr) {
decomposed=new UnicodeString(m, 0, prev);
}
decomposed->append(buffer, hangulLength);
} else if(decomposed!=nullptr) {
// c does not decompose: copy it as is, once a new string has been started.
decomposed->append(m, prev, i-prev);
}
}
if(decomposed!=nullptr) {
if(norm.rawMapping==nullptr) {
// Remember the original mapping when decomposing recursively.
norm.rawMapping=norm.mapping;
} else {
// The raw mapping is already saved; the intermediate mapping is no longer needed.
delete norm.mapping;
}
norm.mapping=decomposed;
// Not norm.setMappingCP(); because the original mapping
// is most likely to be encodable as a delta.
didDecompose|=TRUE;
}
}
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_NORMALIZATION

View file

@ -0,0 +1,178 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// norms.h
// created: 2017jun04 Markus W. Scherer
// (pulled out of n2builder.cpp)
// Storing & manipulating Normalizer2 builder data.
#ifndef __NORMS_H__
#define __NORMS_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/errorcode.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "normalizer2impl.h"
#include "toolutil.h"
#include "utrie2.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/**
 * Fixed-capacity buffer of code points with their combining classes.
 * Each entry is stored in one int32_t as (code point<<8)|cc.
 * append() keeps the entries in canonical order.
 * There is no overflow check: callers must not add more than
 * Normalizer2Impl::MAPPING_LENGTH_MASK entries.
 */
class BuilderReorderingBuffer {
public:
    BuilderReorderingBuffer() { reset(); }

    /** Empties the buffer and clears the last-starter index and the did-reorder flag. */
    void reset() {
        fLength=0;
        fLastStarterIndex=-1;
        fDidReorder=FALSE;
    }

    int32_t length() const { return fLength; }
    UBool isEmpty() const { return length()==0; }
    /** Index of the last entry appended with cc==0, or -1 if there is none. */
    int32_t lastStarterIndex() const { return fLastStarterIndex; }
    /** Code point of entry i: the bits above the low 8. */
    UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
    /** Combining class of entry i: the low 8 bits. */
    uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
    /** TRUE if append() had to insert a character before the end since the last reset(). */
    UBool didReorder() const { return fDidReorder; }

    void append(UChar32 c, uint8_t cc);
    void toString(UnicodeString &dest);
    void setComposite(UChar32 composite, int32_t combMarkIndex);

private:
    int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
    int32_t fLength;
    int32_t fLastStarterIndex;
    UBool fDidReorder;
};
/**
 * One composition: the owning character followed by trail composes to composite.
 * Norm::getCompositionPairs() reinterprets the compositions vector's buffer
 * as an array of these, so the layout must stay two consecutive UChar32 values.
 */
struct CompositionPair {
    CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}

    UChar32 trail;
    UChar32 composite;
};
struct Norm {
enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
UBool hasMapping() const { return mappingType>REMOVED; }
// Requires hasMapping() and well-formed mapping.
void setMappingCP() {
UChar32 c;
if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
mappingCP=c;
} else {
mappingCP=U_SENTINEL;
}
}
const CompositionPair *getCompositionPairs(int32_t &length) const {
if(compositions==nullptr) {
length=0;
return nullptr;
} else {
length=compositions->size()/2;
return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
}
}
UChar32 combine(UChar32 trail) const;
UnicodeString *mapping;
UnicodeString *rawMapping; // non-nullptr if the mapping is further decomposed
UChar32 mappingCP; // >=0 if mapping to 1 code point
int32_t mappingPhase;
MappingType mappingType;
UVector32 *compositions; // (trail, composite) pairs
uint8_t cc;
UBool combinesBack;
UBool hasNoCompBoundaryAfter;
enum OffsetType {
OFFSET_NONE,
// Composition for back-combining character. Allowed, but not normally used.
OFFSET_MAYBE_YES,
// Composition for a starter that does not have a decomposition mapping.
OFFSET_YES_YES,
// Round-trip mapping & composition for a starter.
OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
// Round-trip mapping for a starter that itself does not combine-forward.
OFFSET_YES_NO_MAPPING_ONLY,
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
// One-way mapping.
OFFSET_NO_NO,
// Delta for an algorithmic one-way mapping.
OFFSET_DELTA
};
enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
int32_t offset;
};
/**
 * Owns the Norm structs and a trie that maps each code point to the index of its Norm.
 * Trie value 0 means that the code point has no data;
 * the Norm at index 0 is allocated by the constructor and not assigned to any code point.
 * Not copyable.
 */
class Norms {
public:
Norms(UErrorCode &errorCode);
~Norms();
/** Number of allocated Norm structs, including the one at index 0. */
int32_t length() const { return utm_countItems(normMem); }
/** Returns the Norm at index i, without range checking. */
const Norm &getNormRefByIndex(int32_t i) const { return norms[i]; }
Norm &getNormRefByIndex(int32_t i) { return norms[i]; }
/** Allocates one more Norm. Norm pointers and references obtained earlier may become invalid. */
Norm *allocNorm();
/** Returns an existing Norm unit, or nullptr if c has no data. */
Norm *getNorm(UChar32 c);
/** Returns a Norm unit, creating a new one if necessary. */
Norm *createNorm(UChar32 c);
/** Returns an existing Norm unit, or an immutable empty object if c has no data. */
const Norm &getNormRef(UChar32 c) const;
/** Returns the cc field of c's Norm. */
uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
/** Puts norm's mapping into canonical order, using the buffer. */
void reorder(Norm &norm, BuilderReorderingBuffer &buffer) const;
/** Returns TRUE if norm composes with a trail character that has lowCC<cc<highCC. */
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
/** Base class for callbacks passed to enumRanges(). */
class Enumerator {
public:
Enumerator(Norms &n) : norms(n) {}
virtual ~Enumerator();
/** Called for enumerated value!=0. */
virtual void rangeHandler(UChar32 start, UChar32 end, Norm &norm) = 0;
/** @internal Public only for C callback. */
UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value);
protected:
Norms &norms;
};
/** Enumerates the trie and calls e.rangeHandler() for each range with a nonzero value. */
void enumRanges(Enumerator &e);
private:
Norms(const Norms &other) = delete;
Norms &operator=(const Norms &other) = delete;
UTrie2 *normTrie;
UToolMemory *normMem;
// Start of the Norm array inside normMem; refreshed by allocNorm().
Norm *norms;
};
/**
 * Enumerator that builds the compositions lists from the round-trip mappings.
 * Creates Norm structs for lead and trail characters as needed.
 */
class CompositionBuilder : public Norms::Enumerator {
public:
CompositionBuilder(Norms &n) : Norms::Enumerator(n) {}
/** Adds a composition mapping for the first character in a round-trip mapping. */
void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override;
};
/**
 * Enumerator that decomposes each mapping by one more level.
 * didDecompose starts out FALSE and is only ever set, never cleared, by rangeHandler().
 */
class Decomposer : public Norms::Enumerator {
public:
Decomposer(Norms &n) : Norms::Enumerator(n), didDecompose(FALSE) {}
/** Decomposes each character of the current mapping. Sets didDecompose if any. */
void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override;
// Set if any rangeHandler() call replaced a mapping since construction.
UBool didDecompose;
};
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_NORMALIZATION
#endif // __NORMS_H__