mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-13177 Merging trunk to branch
X-SVN-Rev: 40460
This commit is contained in:
commit
c09ca5d6b9
407 changed files with 13072 additions and 12173 deletions
|
@ -1,3 +1,6 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
|
||||
|
||||
Copyright © 1991-2017 Unicode, Inc. All rights reserved.
|
||||
|
@ -131,7 +134,7 @@ property of their respective owners.
|
|||
# ---------COPYING.libtabe ---- BEGIN--------------------
|
||||
#
|
||||
# /*
|
||||
# * Copyrighy (c) 1999 TaBE Project.
|
||||
# * Copyright (c) 1999 TaBE Project.
|
||||
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
|
||||
# * All rights reserved.
|
||||
# *
|
||||
|
|
|
@ -630,6 +630,14 @@
|
|||
(via -D or uconfig.h, as above)
|
||||
and include those header files explicitly that you actually need.<br />
|
||||
Note: The ICU test suites cannot be compiled with this setting.</li>
|
||||
<li><b>utf_old.h:</b>
|
||||
All of utf_old.h is deprecated or obsolete.<br />
|
||||
Beginning with ICU 60,
|
||||
you should define <code>U_HIDE_OBSOLETE_UTF_OLD_H</code> to 1
|
||||
(via -D or uconfig.h, as above).
|
||||
Use of any of these macros should be replaced as noted
|
||||
in the comments for the obsolete macro.<br />
|
||||
Note: The ICU test suites <i>can</i> be compiled with this setting.</li>
|
||||
<li><b>.dat file:</b> By default, the ICU data is built into
|
||||
a shared library (DLL). This is convenient because it requires no
|
||||
install-time or runtime configuration,
|
||||
|
|
|
@ -194,7 +194,7 @@ EXPAND_ONLY_PREDEF = YES
|
|||
SEARCH_INCLUDES = YES
|
||||
INCLUDE_PATH =
|
||||
INCLUDE_FILE_PATTERNS =
|
||||
PREDEFINED = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END= U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE=override U_FINAL=final UCONFIG_ENABLE_PLUGINS=1 U_CHAR16_IS_TYPEDEF=0 U_CPLUSPLUS_VERSION=11 U_WCHAR_IS_UTF16
|
||||
PREDEFINED = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV_FPTR= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END= U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE= U_FINAL=final UCONFIG_ENABLE_PLUGINS=1 U_CHAR16_IS_TYPEDEF=0 U_CPLUSPLUS_VERSION=11 U_WCHAR_IS_UTF16 U_NOEXCEPT=
|
||||
EXPAND_AS_DEFINED =
|
||||
SKIP_FUNCTION_MACROS = YES
|
||||
#---------------------------------------------------------------------------
|
||||
|
|
|
@ -89,7 +89,7 @@ ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_
|
|||
resource.o uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \
|
||||
ucurr.o \
|
||||
messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o locdspnm.o loclikely.o locresdata.o \
|
||||
bytestream.o stringpiece.o \
|
||||
bytestream.o stringpiece.o bytesinkutil.o \
|
||||
stringtriebuilder.o bytestriebuilder.o \
|
||||
bytestrie.o bytestrieiterator.o \
|
||||
ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \
|
||||
|
@ -104,7 +104,7 @@ patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwr
|
|||
uscript.o uscript_props.o usc_impl.o unames.o \
|
||||
utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
|
||||
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o filteredbrk.o \
|
||||
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
|
||||
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o rbbi_cache.o \
|
||||
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \
|
||||
uidna.o usprep.o uts46.o punycode.o \
|
||||
util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o dtintrv.o ucnvsel.o propsvec.o \
|
||||
|
|
|
@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
|
||||
list(parentList), listLength(parentListLength) {
|
||||
uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
|
||||
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
|
||||
uprv_memset(table7FF, 0, sizeof(table7FF));
|
||||
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
|
||||
|
||||
|
@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
|
|||
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
|
||||
}
|
||||
list4kStarts[0x11]=listLength-1;
|
||||
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
|
||||
|
||||
initBits();
|
||||
overrideIllegal();
|
||||
}
|
||||
|
||||
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
|
||||
containsFFFD(otherBMPSet.containsFFFD),
|
||||
list(newParentList), listLength(newParentListLength) {
|
||||
uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
|
||||
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
|
||||
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
|
||||
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
|
||||
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
|
||||
|
@ -120,7 +122,7 @@ void BMPSet::initBits() {
|
|||
UChar32 start, limit;
|
||||
int32_t listIndex=0;
|
||||
|
||||
// Set asciiBytes[].
|
||||
// Set latin1Contains[].
|
||||
do {
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
|
@ -128,13 +130,30 @@ void BMPSet::initBits() {
|
|||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
if(start>=0x80) {
|
||||
if(start>=0x100) {
|
||||
break;
|
||||
}
|
||||
do {
|
||||
asciiBytes[start++]=1;
|
||||
} while(start<limit && start<0x80);
|
||||
} while(limit<=0x80);
|
||||
latin1Contains[start++]=1;
|
||||
} while(start<limit && start<0x100);
|
||||
} while(limit<=0x100);
|
||||
|
||||
// Find the first range overlapping with (or after) 80..FF again,
|
||||
// to include them in table7FF as well.
|
||||
for(listIndex=0;;) {
|
||||
start=list[listIndex++];
|
||||
if(listIndex<listLength) {
|
||||
limit=list[listIndex++];
|
||||
} else {
|
||||
limit=0x110000;
|
||||
}
|
||||
if(limit>0x80) {
|
||||
if(start<0x80) {
|
||||
start=0x80;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Set table7FF[].
|
||||
while(start<0x800) {
|
||||
|
@ -204,19 +223,14 @@ void BMPSet::initBits() {
|
|||
* for faster validity checking at runtime.
|
||||
* No need to set 0 values where they were reset to 0 in the constructor
|
||||
* and not modified by initBits().
|
||||
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
|
||||
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
|
||||
* Need to set 0 values for surrogates D800..DFFF.
|
||||
*/
|
||||
void BMPSet::overrideIllegal() {
|
||||
uint32_t bits, mask;
|
||||
int32_t i;
|
||||
|
||||
if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
|
||||
// contains(FFFD)==TRUE
|
||||
for(i=0x80; i<0xc0; ++i) {
|
||||
asciiBytes[i]=1;
|
||||
}
|
||||
|
||||
if(containsFFFD) {
|
||||
bits=3; // Lead bytes 0xC0 and 0xC1.
|
||||
for(i=0; i<64; ++i) {
|
||||
table7FF[i]|=bits;
|
||||
|
@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
|
|||
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
|
||||
}
|
||||
} else {
|
||||
// contains(FFFD)==FALSE
|
||||
mask=~(0x10001<<0xd); // Lead byte 0xED.
|
||||
for(i=32; i<64; ++i) { // Second half of 4k block.
|
||||
bmpBlockBits[i]&=mask;
|
||||
|
@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
|
|||
|
||||
UBool
|
||||
BMPSet::contains(UChar32 c) const {
|
||||
if((uint32_t)c<=0x7f) {
|
||||
return (UBool)asciiBytes[c];
|
||||
if((uint32_t)c<=0xff) {
|
||||
return (UBool)latin1Contains[c];
|
||||
} else if((uint32_t)c<=0x7ff) {
|
||||
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
|
||||
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
|
||||
|
@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
|
|||
// span
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0x7f) {
|
||||
if(!asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
|
@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
|
|||
// span not
|
||||
do {
|
||||
c=*s;
|
||||
if(c<=0x7f) {
|
||||
if(asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
|
@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
|
|||
// span
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0x7f) {
|
||||
if(!asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
|
@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
|
|||
// span not
|
||||
for(;;) {
|
||||
c=*(--limit);
|
||||
if(c<=0x7f) {
|
||||
if(asciiBytes[c]) {
|
||||
if(c<=0xff) {
|
||||
if(latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if(c<=0x7ff) {
|
||||
|
@ -497,22 +510,22 @@ const uint8_t *
|
|||
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
|
||||
const uint8_t *limit=s+length;
|
||||
uint8_t b=*s;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// Initial all-ASCII span.
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b] || ++s==limit) {
|
||||
if(!latin1Contains[b] || ++s==limit) {
|
||||
return s;
|
||||
}
|
||||
b=*s;
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b] || ++s==limit) {
|
||||
if(latin1Contains[b] || ++s==limit) {
|
||||
return s;
|
||||
}
|
||||
b=*s;
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
length=(int32_t)(limit-s);
|
||||
}
|
||||
|
@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
|||
// single trail byte, check for preceding 3- or 4-byte lead byte
|
||||
if(length>=2 && (b=*(limit-2))>=0xe0) {
|
||||
limit-=2;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
|
||||
// 4-byte lead byte with only two trail bytes
|
||||
limit-=3;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// lead byte with no trail bytes
|
||||
--limit;
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
limit0=limit;
|
||||
}
|
||||
}
|
||||
|
@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
|||
|
||||
while(s<limit) {
|
||||
b=*s;
|
||||
if(b<0xc0) {
|
||||
// ASCII; or trail bytes with the result of contains(FFFD).
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// ASCII
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b]) {
|
||||
if(!latin1Contains[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(b<0xc0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b]) {
|
||||
if(latin1Contains[b]) {
|
||||
return s;
|
||||
} else if(++s==limit) {
|
||||
return limit0;
|
||||
}
|
||||
b=*s;
|
||||
} while(b<0xc0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
}
|
||||
++s; // Advance past the lead byte.
|
||||
|
@ -619,7 +632,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
|||
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
|
||||
if( ( (0x10000<=c && c<=0x10ffff) ?
|
||||
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
|
||||
asciiBytes[0x80]
|
||||
containsFFFD
|
||||
) != spanCondition
|
||||
) {
|
||||
return s-1;
|
||||
|
@ -627,8 +640,9 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
|||
s+=3;
|
||||
continue;
|
||||
}
|
||||
} else /* 0xc0<=b<0xe0 */ {
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
b>=0xc0 &&
|
||||
(t1=(uint8_t)(*s-0x80)) <= 0x3f
|
||||
) {
|
||||
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
|
||||
|
@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
|
|||
// Give an illegal sequence the same value as the result of contains(FFFD).
|
||||
// Handle each byte of an illegal sequence separately to simplify the code;
|
||||
// no need to optimize error handling.
|
||||
if(asciiBytes[0x80]!=spanCondition) {
|
||||
if(containsFFFD!=spanCondition) {
|
||||
return s-1;
|
||||
}
|
||||
}
|
||||
|
@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
|
|||
|
||||
do {
|
||||
b=s[--length];
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
// ASCII sub-span
|
||||
if(spanCondition) {
|
||||
do {
|
||||
if(!asciiBytes[b]) {
|
||||
if(!latin1Contains[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
} else {
|
||||
do {
|
||||
if(asciiBytes[b]) {
|
||||
if(latin1Contains[b]) {
|
||||
return length+1;
|
||||
} else if(length==0) {
|
||||
return 0;
|
||||
}
|
||||
b=s[--length];
|
||||
} while((int8_t)b>=0);
|
||||
} while(U8_IS_SINGLE(b));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
|
|||
* Helper class for frozen UnicodeSets, implements contains() and span()
|
||||
* optimized for BMP code points. Structured to be UTF-8-friendly.
|
||||
*
|
||||
* ASCII: Look up bytes.
|
||||
* Latin-1: Look up bytes.
|
||||
* 2-byte characters: Bits organized vertically.
|
||||
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
|
||||
* with mixed for illegal ranges.
|
||||
* Supplementary characters: Call contains() on the parent set.
|
||||
* Supplementary characters: Binary search over
|
||||
* the supplementary part of the parent set's inversion list.
|
||||
*/
|
||||
class BMPSet : public UMemory {
|
||||
public:
|
||||
|
@ -96,12 +97,12 @@ private:
|
|||
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
|
||||
|
||||
/*
|
||||
* One byte per ASCII character, or trail byte in lead position.
|
||||
* 0 or 1 for ASCII characters.
|
||||
* The value for trail bytes is the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
* One byte 0 or 1 per Latin-1 character.
|
||||
*/
|
||||
UBool asciiBytes[0xc0];
|
||||
UBool latin1Contains[0x100];
|
||||
|
||||
/* TRUE if contains(U+FFFD). */
|
||||
UBool containsFFFD;
|
||||
|
||||
/*
|
||||
* One bit per code point from U+0000..U+07FF.
|
||||
|
|
|
@ -11,9 +11,6 @@
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "cmemory.h"
|
||||
#include "dictbe.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/chariter.h"
|
||||
|
@ -24,6 +21,10 @@
|
|||
#include "unicode/uscript.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "cmemory.h"
|
||||
#include "dictbe.h"
|
||||
#include "charstr.h"
|
||||
#include "dictionarydata.h"
|
||||
#include "mutex.h"
|
||||
|
@ -80,23 +81,15 @@ UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
|
|||
|
||||
int32_t
|
||||
UnhandledEngine::findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &/*foundBreaks*/ ) const {
|
||||
int32_t /* startPos */,
|
||||
int32_t endPos,
|
||||
int32_t breakType,
|
||||
UVector32 &/*foundBreaks*/ ) const {
|
||||
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
|
||||
UChar32 c = utext_current32(text);
|
||||
if (reverse) {
|
||||
while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
|
||||
c = utext_previous32(text);
|
||||
}
|
||||
}
|
||||
else {
|
||||
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
|
||||
utext_next32(text); // TODO: recast loop to work with post-increment operations.
|
||||
c = utext_current32(text);
|
||||
}
|
||||
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
|
||||
utext_next32(text); // TODO: recast loop to work with post-increment operations.
|
||||
c = utext_current32(text);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
|
|
@ -19,6 +19,7 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
class UnicodeSet;
|
||||
class UStack;
|
||||
class UVector32;
|
||||
class DictionaryMatcher;
|
||||
|
||||
/*******************************************************************
|
||||
|
@ -67,18 +68,15 @@ class LanguageBreakEngine : public UMemory {
|
|||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param reverse Whether the caller is looking for breaks in a reverse
|
||||
* direction.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks An allocated C array of the breaks found, if any
|
||||
* @param foundBreaks A Vector of int32_t to receive the breaks.
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const = 0;
|
||||
UVector32 &foundBreaks ) const = 0;
|
||||
|
||||
};
|
||||
|
||||
|
@ -192,8 +190,6 @@ class UnhandledEngine : public LanguageBreakEngine {
|
|||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param reverse Whether the caller is looking for breaks in a reverse
|
||||
* direction.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks A Vector of int32_t to receive the breaks.
|
||||
* @return The number of breaks found.
|
||||
|
@ -201,9 +197,8 @@ class UnhandledEngine : public LanguageBreakEngine {
|
|||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const;
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
/**
|
||||
* <p>Tell the engine to handle a particular character and break type.</p>
|
||||
|
|
123
icu4c/source/common/bytesinkutil.cpp
Normal file
123
icu4c/source/common/bytesinkutil.cpp
Normal file
|
@ -0,0 +1,123 @@
|
|||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// bytesinkutil.cpp
|
||||
// created: 2017sep14 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "bytesinkutil.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Converts the UTF-16 text (s16, s16Length) to UTF-8, appends it to the sink,
 * and records the replacement in the optional Edits.
 *
 * The text is converted chunk by chunk into whatever buffer the sink hands out
 * (or into a local scratch buffer), so no full-size temporary is allocated here.
 *
 * @param length    Source length that was replaced; only forwarded to
 *                  edits->addReplace() as the old length.
 * @param s16       Replacement text. Decoded with U16_NEXT_UNSAFE, so it is
 *                  expected to be well-formed UTF-16.
 * @param s16Length Number of char16_t units in s16.
 * @param sink      Receives the UTF-8 bytes.
 * @param edits     May be nullptr; otherwise receives one addReplace(length, bytes written).
 * @param errorCode In/out error code. Set to U_INDEX_OUTOFBOUNDS_ERROR if the
 *                  UTF-8 length would exceed INT32_MAX.
 * @return FALSE if errorCode indicated a failure on entry or the length
 *         overflowed; TRUE otherwise. On overflow, chunks that were already
 *         appended stay in the sink and edits is not updated.
 */
UBool
ByteSinkUtil::appendChange(int32_t length, const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    char scratch[200];
    int32_t totalBytes = 0;
    int32_t srcIndex = 0;
    while (srcIndex < s16Length) {
        // Capacity hint for the sink: ideally 3 bytes per remaining code unit
        // (max 3 UTF-8 bytes per UTF-16 code unit), scaled back so that the
        // multiplication cannot overflow int32_t.
        const int32_t remaining = s16Length - srcIndex;
        int32_t capacityHint;
        if (remaining < (INT32_MAX / 3)) {
            capacityHint = remaining * 3;
        } else if (remaining < (INT32_MAX / 2)) {
            capacityHint = remaining * 2;
        } else {
            capacityHint = INT32_MAX;
        }

        int32_t capacity;
        char *buffer = sink.GetAppendBuffer(U8_MAX_LENGTH, capacityHint,
                                            scratch, UPRV_LENGTHOF(scratch), &capacity);
        // Stop early enough that one more code point (up to U8_MAX_LENGTH
        // bytes) always fits when the loop condition still holds.
        const int32_t writeLimit = capacity - (U8_MAX_LENGTH - 1);

        int32_t written = 0;
        while (srcIndex < s16Length && written < writeLimit) {
            UChar32 c;
            U16_NEXT_UNSAFE(s16, srcIndex, c);
            U8_APPEND_UNSAFE(buffer, written, c);
        }

        // Refuse to let the running total wrap around.
        if (written > (INT32_MAX - totalBytes)) {
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return FALSE;
        }
        sink.Append(buffer, written);
        totalBytes += written;
    }
    if (edits != nullptr) {
        edits->addReplace(length, totalBytes);
    }
    return TRUE;
}
|
||||
|
||||
/**
 * Pointer-range overload: the source bytes [s, limit[ were mapped to the
 * UTF-16 text (s16, s16Length).
 *
 * Computes the source length from the two pointers and delegates to the
 * length-based appendChange().
 *
 * @param errorCode In/out error code. Set to U_INDEX_OUTOFBOUNDS_ERROR if
 *                  (limit - s) does not fit into an int32_t.
 * @return FALSE on a pre-existing or newly set error, otherwise the result of
 *         the length-based overload.
 */
UBool
ByteSinkUtil::appendChange(const uint8_t *s, const uint8_t *limit,
                           const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    const auto srcLength = limit - s;
    if (srcLength <= INT32_MAX) {
        return appendChange(static_cast<int32_t>(srcLength), s16, s16Length,
                            sink, edits, errorCode);
    }
    errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    return FALSE;
}
|
||||
|
||||
void
|
||||
ByteSinkUtil::appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits) {
|
||||
char s8[U8_MAX_LENGTH];
|
||||
int32_t s8Length = 0;
|
||||
U8_APPEND_UNSAFE(s8, s8Length, c);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(length, s8Length);
|
||||
}
|
||||
sink.Append(s8, s8Length);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// See unicode/utf8.h U8_APPEND_UNSAFE().
|
||||
inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
|
||||
inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
|
||||
|
||||
} // namespace
|
||||
|
||||
void
|
||||
ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) {
|
||||
U_ASSERT(0x80 <= c && c <= 0x7ff); // 2-byte UTF-8
|
||||
char s8[2] = { (char)getTwoByteLead(c), (char)getTwoByteTrail(c) };
|
||||
sink.Append(s8, 2);
|
||||
}
|
||||
|
||||
/**
 * Handles a run of (length) source bytes starting at s that were not changed.
 *
 * Nothing happens for length <= 0. Otherwise the run is recorded in the
 * optional Edits, and the bytes are copied to the sink unless the
 * U_OMIT_UNCHANGED_TEXT bit is set in options.
 *
 * @param edits     May be nullptr.
 * @param errorCode In/out error code; this function only reads it.
 * @return FALSE if errorCode indicated a failure on entry, otherwise TRUE.
 */
UBool
ByteSinkUtil::appendUnchanged(const uint8_t *s, int32_t length,
                              ByteSink &sink, uint32_t options, Edits *edits,
                              UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    if (length <= 0) {
        return TRUE;  // empty run: nothing to record or copy
    }
    if (edits != nullptr) {
        edits->addUnchanged(length);
    }
    const bool omitUnchanged = (options & U_OMIT_UNCHANGED_TEXT) != 0;
    if (!omitUnchanged) {
        sink.Append(reinterpret_cast<const char *>(s), length);
    }
    return TRUE;
}
|
||||
|
||||
/**
 * Pointer-range overload: the source bytes [s, limit[ were not changed.
 *
 * Computes the run length from the two pointers and delegates to the
 * length-based appendUnchanged().
 *
 * @param errorCode In/out error code. Set to U_INDEX_OUTOFBOUNDS_ERROR if
 *                  (limit - s) does not fit into an int32_t.
 * @return FALSE on a pre-existing or newly set error, otherwise the result of
 *         the length-based overload.
 */
UBool
ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit,
                              ByteSink &sink, uint32_t options, Edits *edits,
                              UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    const auto runLength = limit - s;
    if (runLength <= INT32_MAX) {
        return appendUnchanged(s, static_cast<int32_t>(runLength),
                               sink, options, edits, errorCode);
    }
    errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    return FALSE;
}
|
||||
|
||||
U_NAMESPACE_END
|
53
icu4c/source/common/bytesinkutil.h
Normal file
53
icu4c/source/common/bytesinkutil.h
Normal file
|
@ -0,0 +1,53 @@
|
|||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// bytesinkutil.h
|
||||
// created: 2017sep14 Markus W. Scherer
|
||||
|
||||
// Include guard: without it, a translation unit that pulls in this header
// twice (directly and via another header) would redefine class ByteSinkUtil.
#ifndef BYTESINKUTIL_H
#define BYTESINKUTIL_H

#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/edits.h"
#include "cmemory.h"
#include "uassert.h"

U_NAMESPACE_BEGIN

class ByteSink;
class Edits;

/**
 * Static helpers for writing UTF-8 output to a ByteSink while optionally
 * recording the corresponding changes in an Edits object.
 * Not instantiable; all members are static.
 */
class U_COMMON_API ByteSinkUtil {
public:
    ByteSinkUtil() = delete;  // all static

    /**
     * (length) bytes were mapped to valid (s16, s16Length).
     * The UTF-16 text is converted to UTF-8 and appended to the sink.
     * @return FALSE if errorCode indicates a failure (on entry or newly set).
     */
    static UBool appendChange(int32_t length,
                              const char16_t *s16, int32_t s16Length,
                              ByteSink &sink, Edits *edits, UErrorCode &errorCode);

    /**
     * The bytes at [s, limit[ were mapped to valid (s16, s16Length).
     * @return FALSE if errorCode indicates a failure (on entry or newly set).
     */
    static UBool appendChange(const uint8_t *s, const uint8_t *limit,
                              const char16_t *s16, int32_t s16Length,
                              ByteSink &sink, Edits *edits, UErrorCode &errorCode);

    /** (length) bytes were mapped/changed to valid code point c. */
    static void appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits = nullptr);

    /** The few bytes at [src, nextSrc[ were mapped/changed to valid code point c. */
    static inline void appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c,
                                       ByteSink &sink, Edits *edits = nullptr) {
        appendCodePoint((int32_t)(nextSrc - src), c, sink, edits);
    }

    /** Append the two-byte character (U+0080..U+07FF). */
    static void appendTwoBytes(UChar32 c, ByteSink &sink);

    /**
     * (length) bytes starting at s were not changed.
     * Takes the Edits pointer and the options bit set (such as
     * U_OMIT_UNCHANGED_TEXT) that govern how the run is recorded and copied.
     * @return FALSE if errorCode indicates a failure.
     */
    static UBool appendUnchanged(const uint8_t *s, int32_t length,
                                 ByteSink &sink, uint32_t options, Edits *edits,
                                 UErrorCode &errorCode);

    /**
     * The bytes at [s, limit[ were not changed.
     * Pointer-range variant of the length-based appendUnchanged().
     * @return FALSE if errorCode indicates a failure (on entry or newly set).
     */
    static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit,
                                 ByteSink &sink, uint32_t options, Edits *edits,
                                 UErrorCode &errorCode);
};

U_NAMESPACE_END

#endif  // BYTESINKUTIL_H
|
|
@ -45,6 +45,12 @@ void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
|
|||
if (n <= 0) {
|
||||
return;
|
||||
}
|
||||
if (n > (INT32_MAX - appended_)) {
|
||||
// TODO: Report as integer overflow, not merely buffer overflow.
|
||||
appended_ = INT32_MAX;
|
||||
overflowed_ = TRUE;
|
||||
return;
|
||||
}
|
||||
appended_ += n;
|
||||
int32_t available = capacity_ - size_;
|
||||
if (n > available) {
|
||||
|
|
|
@ -268,6 +268,8 @@
|
|||
</ClCompile>
|
||||
<ClCompile Include="rbbitblb.cpp">
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbi_cache.cpp">
|
||||
</ClCompile>
|
||||
<ClCompile Include="dictionarydata.cpp" />
|
||||
<ClCompile Include="ubrk.cpp" />
|
||||
<ClCompile Include="ucol_swp.cpp">
|
||||
|
@ -445,6 +447,7 @@
|
|||
</ClCompile>
|
||||
<ClCompile Include="usprep.cpp" />
|
||||
<ClCompile Include="appendable.cpp" />
|
||||
<ClCompile Include="bytesinkutil.cpp" />
|
||||
<ClCompile Include="bytestream.cpp" />
|
||||
<ClCompile Include="bytestrie.cpp" />
|
||||
<ClCompile Include="bytestriebuilder.cpp" />
|
||||
|
@ -572,6 +575,7 @@
|
|||
<ClInclude Include="rbbiscan.h" />
|
||||
<ClInclude Include="rbbisetb.h" />
|
||||
<ClInclude Include="rbbitblb.h" />
|
||||
<ClInclude Include="rbbi_cache.h" />
|
||||
<ClInclude Include="dictionarydata.h" />
|
||||
<CustomBuild Include="unicode\ubrk.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
|
@ -1478,6 +1482,7 @@
|
|||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<ClInclude Include="bytesinkutil.h" />
|
||||
<CustomBuild Include="unicode\bytestream.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
|
|
|
@ -97,6 +97,9 @@
|
|||
<ClCompile Include="rbbitblb.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbi_cache.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="ubrk.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
|
@ -460,6 +463,9 @@
|
|||
<ClCompile Include="usprep.cpp">
|
||||
<Filter>sprep</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="bytesinkutil.cpp">
|
||||
<Filter>strings</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="bytestream.cpp">
|
||||
<Filter>strings</Filter>
|
||||
</ClCompile>
|
||||
|
@ -636,6 +642,9 @@
|
|||
<ClInclude Include="rbbitblb.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="rbbi_cache.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="ubrkimpl.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
|
@ -861,6 +870,9 @@
|
|||
<ClInclude Include="sprpimpl.h">
|
||||
<Filter>sprep</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="bytesinkutil.h">
|
||||
<Filter>strings</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="charstr.h">
|
||||
<Filter>strings</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -299,6 +299,8 @@
|
|||
</ClCompile>
|
||||
<ClCompile Include="rbbitblb.cpp">
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbi_cache.cpp">
|
||||
</ClCompile>
|
||||
<ClCompile Include="dictionarydata.cpp" />
|
||||
<ClCompile Include="ubrk.cpp" />
|
||||
<ClCompile Include="ucol_swp.cpp">
|
||||
|
@ -452,6 +454,7 @@
|
|||
</ClCompile>
|
||||
<ClCompile Include="usprep.cpp" />
|
||||
<ClCompile Include="appendable.cpp" />
|
||||
<ClCompile Include="bytesinkutil.cpp" />
|
||||
<ClCompile Include="bytestream.cpp" />
|
||||
<ClCompile Include="bytestrie.cpp" />
|
||||
<ClCompile Include="bytestriebuilder.cpp" />
|
||||
|
@ -529,6 +532,7 @@
|
|||
<ClInclude Include="rbbiscan.h" />
|
||||
<ClInclude Include="rbbisetb.h" />
|
||||
<ClInclude Include="rbbitblb.h" />
|
||||
<ClInclude Include="rbbi_cache.h" />
|
||||
<ClInclude Include="dictionarydata.h" />
|
||||
<CustomBuild Include="unicode\ubrk.h">
|
||||
<Command>copy "%(FullPath)" ..\..\include\unicode</Command>
|
||||
|
@ -894,6 +898,7 @@
|
|||
</Command>
|
||||
<Outputs>..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<ClInclude Include="bytesinkutil.h" />
|
||||
<CustomBuild Include="unicode\bytestream.h">
|
||||
<Command>copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
|
|
|
@ -46,9 +46,9 @@ int32_t
|
|||
DictionaryBreakEngine::findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const {
|
||||
UVector32 &foundBreaks ) const {
|
||||
(void)startPos; // TODO: remove this param?
|
||||
int32_t result = 0;
|
||||
|
||||
// Find the span of characters included in the set.
|
||||
|
@ -60,34 +60,12 @@ DictionaryBreakEngine::findBreaks( UText *text,
|
|||
int32_t rangeStart;
|
||||
int32_t rangeEnd;
|
||||
UChar32 c = utext_current32(text);
|
||||
if (reverse) {
|
||||
UBool isDict = fSet.contains(c);
|
||||
while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
|
||||
c = utext_previous32(text);
|
||||
isDict = fSet.contains(c);
|
||||
}
|
||||
if (current < startPos) {
|
||||
rangeStart = startPos;
|
||||
} else {
|
||||
rangeStart = current;
|
||||
if (!isDict) {
|
||||
utext_next32(text);
|
||||
rangeStart = (int32_t)utext_getNativeIndex(text);
|
||||
}
|
||||
}
|
||||
// rangeEnd = start + 1;
|
||||
utext_setNativeIndex(text, start);
|
||||
utext_next32(text);
|
||||
rangeEnd = (int32_t)utext_getNativeIndex(text);
|
||||
}
|
||||
else {
|
||||
while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
|
||||
utext_next32(text); // TODO: recast loop for postincrement
|
||||
c = utext_current32(text);
|
||||
}
|
||||
rangeStart = start;
|
||||
rangeEnd = current;
|
||||
while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
|
||||
utext_next32(text); // TODO: recast loop for postincrement
|
||||
c = utext_current32(text);
|
||||
}
|
||||
rangeStart = start;
|
||||
rangeEnd = current;
|
||||
if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
|
||||
utext_setNativeIndex(text, current);
|
||||
|
@ -248,7 +226,7 @@ int32_t
|
|||
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
UVector32 &foundBreaks ) const {
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
|
||||
if (utext_getNativeIndex(text) >= rangeEnd) {
|
||||
|
@ -487,7 +465,7 @@ int32_t
|
|||
LaoBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
UVector32 &foundBreaks ) const {
|
||||
if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
|
||||
return 0; // Not enough characters for two words
|
||||
}
|
||||
|
@ -680,7 +658,7 @@ int32_t
|
|||
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
UVector32 &foundBreaks ) const {
|
||||
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
|
||||
return 0; // Not enough characters for two words
|
||||
}
|
||||
|
@ -885,7 +863,7 @@ int32_t
|
|||
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
UVector32 &foundBreaks ) const {
|
||||
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
|
||||
return 0; // Not enough characters for two words
|
||||
}
|
||||
|
@ -1110,9 +1088,9 @@ static inline uint32_t getKatakanaCost(int32_t wordLength){
|
|||
return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
|
||||
}
|
||||
|
||||
static inline bool isKatakana(uint16_t value) {
|
||||
return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
|
||||
(value >= 0xFF66u && value <= 0xFF9fu);
|
||||
static inline bool isKatakana(UChar32 value) {
|
||||
return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
|
||||
(value >= 0xFF66 && value <= 0xFF9f);
|
||||
}
|
||||
|
||||
|
||||
|
@ -1128,14 +1106,14 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
|
|||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @param foundBreaks vector<int32> to receive the break positions
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
int32_t
|
||||
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
UVector32 &foundBreaks ) const {
|
||||
if (rangeStart >= rangeEnd) {
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "unicode/utext.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -84,21 +85,18 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
*
|
||||
* @param text A UText representing the text. The iterator is left at
|
||||
* the end of the run of characters which the engine is capable of handling
|
||||
* that starts from the first (or last) character in the range.
|
||||
* that starts from the first character in the range.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param reverse Whether the caller is looking for breaks in a reverse
|
||||
* direction.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks An allocated C array of the breaks found, if any
|
||||
* @param foundBreaks vector of int32_t to receive the break positions
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const;
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
protected:
|
||||
|
||||
|
@ -128,7 +126,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const = 0;
|
||||
UVector32 &foundBreaks ) const = 0;
|
||||
|
||||
};
|
||||
|
||||
|
@ -185,7 +183,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
|
|||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const;
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
};
|
||||
|
||||
|
@ -241,7 +239,7 @@ class LaoBreakEngine : public DictionaryBreakEngine {
|
|||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const;
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
};
|
||||
|
||||
|
@ -297,7 +295,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
|
|||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const;
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
};
|
||||
|
||||
|
@ -353,7 +351,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
|
|||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const;
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
};
|
||||
|
||||
|
@ -417,7 +415,7 @@ class CjkBreakEngine : public DictionaryBreakEngine {
|
|||
virtual int32_t divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const;
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -17,10 +17,10 @@ namespace {
|
|||
const int32_t MAX_UNCHANGED_LENGTH = 0x1000;
|
||||
const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
|
||||
|
||||
// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
|
||||
// No length change.
|
||||
const int32_t MAX_SHORT_WIDTH = 6;
|
||||
const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff;
|
||||
// 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
|
||||
const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;
|
||||
const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;
|
||||
const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;
|
||||
const int32_t MAX_SHORT_CHANGE = 0x6fff;
|
||||
|
||||
// 0111mmmmmmnnnnnn records a replacement of m text units with n.
|
||||
|
@ -138,20 +138,6 @@ void Edits::addUnchanged(int32_t unchangedLength) {
|
|||
|
||||
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
|
||||
if(U_FAILURE(errorCode_)) { return; }
|
||||
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
|
||||
// Replacement of short oldLength text units by same-length new text.
|
||||
// Merge into previous short-replacement record, if any.
|
||||
++numChanges;
|
||||
int32_t last = lastUnit();
|
||||
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
|
||||
(last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
|
||||
setLastUnit(last + 1);
|
||||
return;
|
||||
}
|
||||
append(oldLength << 12);
|
||||
return;
|
||||
}
|
||||
|
||||
if(oldLength < 0 || newLength < 0) {
|
||||
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
|
@ -171,6 +157,21 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
|
|||
delta += newDelta;
|
||||
}
|
||||
|
||||
if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
|
||||
newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
|
||||
// Merge into previous same-lengths short-replacement record, if any.
|
||||
int32_t u = (oldLength << 12) | (newLength << 9);
|
||||
int32_t last = lastUnit();
|
||||
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
|
||||
(last & ~SHORT_CHANGE_NUM_MASK) == u &&
|
||||
(last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
|
||||
setLastUnit(last + 1);
|
||||
return;
|
||||
}
|
||||
append(u);
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t head = 0x7000;
|
||||
if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
|
||||
head |= oldLength << 6;
|
||||
|
@ -396,7 +397,7 @@ Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &error
|
|||
/**
 * Creates an iterator positioned before the first unit of the edits array.
 *
 * Each member is initialized exactly once; naming a member twice in the
 * initializer list is ill-formed.
 *
 * @param a   the array of edit units to iterate over
 * @param len the number of units in a
 * @param oc  stored as onlyChanges_
 * @param crs stored as coarse
 */
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
        array(a), index(0), length(len), remaining(0),
        onlyChanges_(oc), coarse(crs),
        // dir starts at 0: neither next() nor previous() has been called yet.
        dir(0), changed(FALSE), oldLength_(0), newLength_(0),
        srcIndex(0), replIndex(0), destIndex(0) {}
|
||||
|
||||
int32_t Edits::Iterator::readLength(int32_t head) {
|
||||
|
@ -418,7 +419,7 @@ int32_t Edits::Iterator::readLength(int32_t head) {
|
|||
}
|
||||
}
|
||||
|
||||
void Edits::Iterator::updateIndexes() {
|
||||
void Edits::Iterator::updateNextIndexes() {
|
||||
srcIndex += oldLength_;
|
||||
if (changed) {
|
||||
replIndex += newLength_;
|
||||
|
@ -426,22 +427,52 @@ void Edits::Iterator::updateIndexes() {
|
|||
destIndex += newLength_;
|
||||
}
|
||||
|
||||
void Edits::Iterator::updatePreviousIndexes() {
|
||||
srcIndex -= oldLength_;
|
||||
if (changed) {
|
||||
replIndex -= newLength_;
|
||||
}
|
||||
destIndex -= newLength_;
|
||||
}
|
||||
|
||||
// Puts the iterator into its "no span" state and reports that nothing was found.
// Used when there is no change before or beyond the string.
UBool Edits::Iterator::noNext() {
    oldLength_ = 0;
    newLength_ = 0;
    changed = FALSE;
    dir = 0;  // No iteration direction is in effect any more.
    return FALSE;
}
|
||||
|
||||
UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
|
||||
// Forward iteration: Update the string indexes to the limit of the current span,
|
||||
// and post-increment-read array units to assemble a new span.
|
||||
// Leaves the array index one after the last unit of that span.
|
||||
if (U_FAILURE(errorCode)) { return FALSE; }
|
||||
// We have an errorCode in case we need to start guarding against integer overflows.
|
||||
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
|
||||
updateIndexes();
|
||||
if (remaining > 0) {
|
||||
// Fine-grained iterator: Continue a sequence of equal-length changes.
|
||||
--remaining;
|
||||
return TRUE;
|
||||
if (dir > 0) {
|
||||
updateNextIndexes();
|
||||
} else {
|
||||
if (dir < 0) {
|
||||
// Turn around from previous() to next().
|
||||
// Post-increment-read the same span again.
|
||||
if (remaining > 0) {
|
||||
// Fine-grained iterator:
|
||||
// Stay on the current one of a sequence of compressed changes.
|
||||
++index; // next() rests on the index after the sequence unit.
|
||||
dir = 1;
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
dir = 1;
|
||||
}
|
||||
if (remaining >= 1) {
|
||||
// Fine-grained iterator: Continue a sequence of compressed changes.
|
||||
if (remaining > 1) {
|
||||
--remaining;
|
||||
return TRUE;
|
||||
}
|
||||
remaining = 0;
|
||||
}
|
||||
if (index >= length) {
|
||||
return noNext();
|
||||
|
@ -457,7 +488,7 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
|
|||
}
|
||||
newLength_ = oldLength_;
|
||||
if (onlyChanges) {
|
||||
updateIndexes();
|
||||
updateNextIndexes();
|
||||
if (index >= length) {
|
||||
return noNext();
|
||||
}
|
||||
|
@ -469,14 +500,19 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
|
|||
}
|
||||
changed = TRUE;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t oldLen = u >> 12;
|
||||
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
if (coarse) {
|
||||
int32_t w = u >> 12;
|
||||
int32_t len = (u & 0xfff) + 1;
|
||||
oldLength_ = newLength_ = len * w;
|
||||
oldLength_ = num * oldLen;
|
||||
newLength_ = num * newLen;
|
||||
} else {
|
||||
// Split a sequence of equal-length changes that was compressed into one unit.
|
||||
oldLength_ = newLength_ = u >> 12;
|
||||
remaining = u & 0xfff;
|
||||
// Split a sequence of changes that was compressed into one unit.
|
||||
oldLength_ = oldLen;
|
||||
newLength_ = newLen;
|
||||
if (num > 1) {
|
||||
remaining = num; // This is the first of two or more changes.
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
} else {
|
||||
|
@ -491,22 +527,127 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
|
|||
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
|
||||
++index;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t w = u >> 12;
|
||||
int32_t len = (u & 0xfff) + 1;
|
||||
len = len * w;
|
||||
oldLength_ += len;
|
||||
newLength_ += len;
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
oldLength_ += (u >> 12) * num;
|
||||
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
|
||||
} else {
|
||||
U_ASSERT(u <= 0x7fff);
|
||||
int32_t oldLen = readLength((u >> 6) & 0x3f);
|
||||
int32_t newLen = readLength(u & 0x3f);
|
||||
oldLength_ += oldLen;
|
||||
newLength_ += newLen;
|
||||
oldLength_ += readLength((u >> 6) & 0x3f);
|
||||
newLength_ += readLength(u & 0x3f);
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// Moves the iterator backward to the span that precedes the current one.
//
// Returns FALSE without moving if errorCode already indicates a failure, and
// returns the result of noNext() (FALSE) when the array index is already at 0.
// Otherwise sets changed, oldLength_ and newLength_ for the new span, moves
// srcIndex/replIndex/destIndex to the start of that span, and returns TRUE.
//
// In fine-grained (non-coarse) mode, a unit that encodes several short changes
// is returned one change at a time; `remaining` tracks the position inside
// such a unit. In coarse mode, adjacent change units are merged into one span.
// This function does not consult onlyChanges_.
UBool Edits::Iterator::previous(UErrorCode &errorCode) {
|
||||
// Backward iteration: Pre-decrement-read array units to assemble a new span,
|
||||
// then update the string indexes to the start of that span.
|
||||
// Leaves the array index on the head unit of that span.
|
||||
if (U_FAILURE(errorCode)) { return FALSE; }
|
||||
// We have an errorCode in case we need to start guarding against integer overflows.
|
||||
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
|
||||
if (dir >= 0) {
|
||||
if (dir > 0) {
|
||||
// Turn around from next() to previous().
|
||||
// Set the string indexes to the span limit and
|
||||
// pre-decrement-read the same span again.
|
||||
if (remaining > 0) {
|
||||
// Fine-grained iterator:
|
||||
// Stay on the current one of a sequence of compressed changes.
|
||||
--index; // previous() rests on the sequence unit.
|
||||
dir = -1;
|
||||
return TRUE;
|
||||
}
|
||||
updateNextIndexes();
|
||||
}
|
||||
dir = -1;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
// Fine-grained iterator: Continue a sequence of compressed changes.
|
||||
int32_t u = array[index];
|
||||
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
|
||||
// The unit holds (u & SHORT_CHANGE_NUM_MASK) + 1 changes, so while remaining
// has not exceeded the masked value there is still an earlier change in it.
if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
|
||||
++remaining;
|
||||
updatePreviousIndexes();
|
||||
return TRUE;
|
||||
}
|
||||
remaining = 0;
|
||||
}
|
||||
if (index <= 0) {
|
||||
return noNext();
|
||||
}
|
||||
int32_t u = array[--index];
|
||||
if (u <= MAX_UNCHANGED) {
|
||||
// Combine adjacent unchanged ranges.
|
||||
changed = FALSE;
|
||||
oldLength_ = u + 1;
|
||||
while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
|
||||
--index;
|
||||
oldLength_ += u + 1;
|
||||
}
|
||||
newLength_ = oldLength_;
|
||||
// No need to handle onlyChanges as long as previous() is called only from findIndex().
|
||||
updatePreviousIndexes();
|
||||
return TRUE;
|
||||
}
|
||||
changed = TRUE;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t oldLen = u >> 12;
|
||||
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
if (coarse) {
|
||||
oldLength_ = num * oldLen;
|
||||
newLength_ = num * newLen;
|
||||
} else {
|
||||
// Split a sequence of changes that was compressed into one unit.
|
||||
oldLength_ = oldLen;
|
||||
newLength_ = newLen;
|
||||
if (num > 1) {
|
||||
remaining = 1; // This is the last of two or more changes.
|
||||
}
|
||||
updatePreviousIndexes();
|
||||
return TRUE;
|
||||
}
|
||||
} else {
|
||||
if (u <= 0x7fff) {
|
||||
// The change is encoded in u alone.
|
||||
oldLength_ = readLength((u >> 6) & 0x3f);
|
||||
newLength_ = readLength(u & 0x3f);
|
||||
} else {
|
||||
// Back up to the head of the change, read the lengths,
|
||||
// and reset the index to the head again.
|
||||
U_ASSERT(index > 0);
|
||||
// Skip backward over units above 0x7fff until the head unit is reached.
while ((u = array[--index]) > 0x7fff) {}
|
||||
U_ASSERT(u > MAX_SHORT_CHANGE);
|
||||
int32_t headIndex = index++;
|
||||
oldLength_ = readLength((u >> 6) & 0x3f);
|
||||
newLength_ = readLength(u & 0x3f);
|
||||
index = headIndex;
|
||||
}
|
||||
if (!coarse) {
|
||||
updatePreviousIndexes();
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
// Combine adjacent changes.
|
||||
while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
|
||||
--index;
|
||||
if (u <= MAX_SHORT_CHANGE) {
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
|
||||
oldLength_ += (u >> 12) * num;
|
||||
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
|
||||
} else if (u <= 0x7fff) {
|
||||
// Read the lengths, and reset the index to the head again.
|
||||
int32_t headIndex = index++;
|
||||
oldLength_ += readLength((u >> 6) & 0x3f);
|
||||
newLength_ += readLength(u & 0x3f);
|
||||
index = headIndex;
|
||||
}
|
||||
}
|
||||
updatePreviousIndexes();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode) || i < 0) { return -1; }
|
||||
int32_t spanStart, spanLength;
|
||||
|
@ -518,7 +659,44 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
|
|||
spanLength = newLength_;
|
||||
}
|
||||
if (i < spanStart) {
|
||||
if (i >= (spanStart / 2)) {
|
||||
// Search backwards.
|
||||
for (;;) {
|
||||
UBool hasPrevious = previous(errorCode);
|
||||
U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0
|
||||
(void)hasPrevious; // avoid unused-variable warning
|
||||
spanStart = findSource ? srcIndex : destIndex;
|
||||
if (i >= spanStart) {
|
||||
// The index is in the current span.
|
||||
return 0;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
// Is the index in one of the remaining compressed edits?
|
||||
// spanStart is the start of the current span, first of the remaining ones.
|
||||
spanLength = findSource ? oldLength_ : newLength_;
|
||||
int32_t u = array[index];
|
||||
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
|
||||
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
|
||||
int32_t len = num * spanLength;
|
||||
if (i >= (spanStart - len)) {
|
||||
int32_t n = ((spanStart - i - 1) / spanLength) + 1;
|
||||
// 1 <= n <= num
|
||||
srcIndex -= n * oldLength_;
|
||||
replIndex -= n * newLength_;
|
||||
destIndex -= n * newLength_;
|
||||
remaining += n;
|
||||
return 0;
|
||||
}
|
||||
// Skip all of these edits at once.
|
||||
srcIndex -= num * oldLength_;
|
||||
replIndex -= num * newLength_;
|
||||
destIndex -= num * newLength_;
|
||||
remaining = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reset the iterator to the start.
|
||||
dir = 0;
|
||||
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
|
||||
} else if (i < (spanStart + spanLength)) {
|
||||
// The index is in the current span.
|
||||
|
@ -536,21 +714,21 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
|
|||
// The index is in the current span.
|
||||
return 0;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
if (remaining > 1) {
|
||||
// Is the index in one of the remaining compressed edits?
|
||||
// spanStart is the start of the current span, before the remaining ones.
|
||||
int32_t len = (remaining + 1) * spanLength;
|
||||
// spanStart is the start of the current span, first of the remaining ones.
|
||||
int32_t len = remaining * spanLength;
|
||||
if (i < (spanStart + len)) {
|
||||
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining
|
||||
len = n * spanLength;
|
||||
srcIndex += len;
|
||||
replIndex += len;
|
||||
destIndex += len;
|
||||
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
|
||||
srcIndex += n * oldLength_;
|
||||
replIndex += n * newLength_;
|
||||
destIndex += n * newLength_;
|
||||
remaining -= n;
|
||||
return 0;
|
||||
}
|
||||
// Make next() skip all of these edits at once.
|
||||
oldLength_ = newLength_ = len;
|
||||
oldLength_ *= remaining;
|
||||
newLength_ *= remaining;
|
||||
remaining = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -694,7 +694,7 @@ FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& st
|
|||
}
|
||||
|
||||
FilteredBreakIteratorBuilder *
|
||||
FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
|
||||
FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
|
||||
if(U_FAILURE(status)) return NULL;
|
||||
LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
|
||||
return (U_SUCCESS(status))? ret.orphan(): NULL;
|
||||
|
|
|
@ -22,11 +22,11 @@
|
|||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cpputils.h"
|
||||
#include "ustr_imp.h" // U_EDITS_NO_RESET
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
|
|
@ -33,6 +33,8 @@ class U_COMMON_API Hashtable : public UMemory {
|
|||
|
||||
inline void init(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status);
|
||||
|
||||
inline void initSize(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t size, UErrorCode& status);
|
||||
|
||||
public:
|
||||
/**
|
||||
* Construct a hashtable
|
||||
|
@ -41,6 +43,14 @@ public:
|
|||
*/
|
||||
Hashtable(UBool ignoreKeyCase, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a hashtable
|
||||
* @param ignoreKeyCase If true, keys are case insensitive.
|
||||
* @param size initial size allocation
|
||||
* @param status Error code
|
||||
*/
|
||||
Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a hashtable
|
||||
* @param keyComp Comparator for comparing the keys
|
||||
|
@ -76,9 +86,9 @@ public:
|
|||
int32_t puti(const UnicodeString& key, int32_t value, UErrorCode& status);
|
||||
|
||||
void* get(const UnicodeString& key) const;
|
||||
|
||||
|
||||
int32_t geti(const UnicodeString& key) const;
|
||||
|
||||
|
||||
void* remove(const UnicodeString& key);
|
||||
|
||||
int32_t removei(const UnicodeString& key);
|
||||
|
@ -92,9 +102,9 @@ public:
|
|||
* @see uhash_nextElement
|
||||
*/
|
||||
const UHashElement* nextElement(int32_t& pos) const;
|
||||
|
||||
|
||||
UKeyComparator* setKeyComparator(UKeyComparator*keyComp);
|
||||
|
||||
|
||||
UValueComparator* setValueComparator(UValueComparator* valueComp);
|
||||
|
||||
UBool equals(const Hashtable& that) const;
|
||||
|
@ -107,7 +117,7 @@ private:
|
|||
* Implementation
|
||||
********************************************************************/
|
||||
|
||||
inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
|
||||
inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp, UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
|
@ -119,10 +129,23 @@ inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
|
|||
}
|
||||
}
|
||||
|
||||
inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp,
|
||||
// Initializes the embedded hashObj through uhash_initSize() with the given
// functions and size. Does nothing if status already indicates a failure.
// On success, hash points at hashObj and uprv_deleteUObject is installed as
// the key deleter; on failure, hash is left untouched.
inline void Hashtable::initSize(UHashFunction *keyHash, UKeyComparator *keyComp,
                                UValueComparator *valueComp, int32_t size, UErrorCode& status) {
    if (U_SUCCESS(status)) {
        uhash_initSize(&hashObj, keyHash, keyComp, valueComp, size, &status);
        if (U_FAILURE(status)) {
            return;
        }
        hash = &hashObj;
        uhash_setKeyDeleter(hash, uprv_deleteUObject);
    }
}
|
||||
|
||||
// Constructs a hashtable that hashes keys with uhash_hashUnicodeString and
// uses the caller-supplied key and value comparators.
inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp,
                            UErrorCode& status)
    : hash(0)
{
    // hash stays 0 unless init() succeeds.
    init(uhash_hashUnicodeString, keyComp, valueComp, status);
}
|
||||
|
||||
inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status)
|
||||
: hash(0)
|
||||
{
|
||||
|
@ -134,6 +157,17 @@ inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status)
|
|||
status);
|
||||
}
|
||||
|
||||
// Constructs a hashtable with the given size passed on to initSize().
// The key hash and key comparator are the caseless UnicodeString variants
// when ignoreKeyCase is true, and the regular ones otherwise.
// No value comparator is set (NULL).
inline Hashtable::Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status)
    : hash(0)
{
    UHashFunction *keyHash = uhash_hashUnicodeString;
    UKeyComparator *keyComp = uhash_compareUnicodeString;
    if (ignoreKeyCase) {
        keyHash = uhash_hashCaselessUnicodeString;
        keyComp = uhash_compareCaselessUnicodeString;
    }
    initSize(keyHash, keyComp, NULL, size, status);
}
|
||||
|
||||
inline Hashtable::Hashtable(UErrorCode& status)
|
||||
: hash(0)
|
||||
{
|
||||
|
@ -200,7 +234,7 @@ inline void Hashtable::removeAll(void) {
|
|||
// Forwards to uhash_setKeyComparator() on the underlying table and hands its
// result back unchanged.
inline UKeyComparator* Hashtable::setKeyComparator(UKeyComparator*keyComp){
    UKeyComparator *result = uhash_setKeyComparator(hash, keyComp);
    return result;
}
|
||||
|
||||
|
||||
// Forwards to uhash_setValueComparator() on the underlying table and hands its
// result back unchanged.
inline UValueComparator* Hashtable::setValueComparator(UValueComparator* valueComp){
    UValueComparator *result = uhash_setValueComparator(hash, valueComp);
    return result;
}
|
||||
|
|
|
@ -542,7 +542,7 @@ uloc_getDisplayName(const char *locale,
|
|||
return 0;
|
||||
}
|
||||
separator = (const UChar *)p0 + subLen;
|
||||
sepLen = p1 - separator;
|
||||
sepLen = static_cast<int32_t>(p1 - separator);
|
||||
}
|
||||
|
||||
if(patLen==0 || (patLen==defaultPatLen && !u_strncmp(pattern, defaultPattern, patLen))) {
|
||||
|
@ -558,8 +558,8 @@ uloc_getDisplayName(const char *locale,
|
|||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
sub0Pos=p0-pattern;
|
||||
sub1Pos=p1-pattern;
|
||||
sub0Pos = static_cast<int32_t>(p0-pattern);
|
||||
sub1Pos = static_cast<int32_t>(p1-pattern);
|
||||
if (sub1Pos < sub0Pos) { /* a very odd pattern */
|
||||
int32_t t=sub0Pos; sub0Pos=sub1Pos; sub1Pos=t;
|
||||
langi=1;
|
||||
|
|
|
@ -54,7 +54,7 @@ static int32_t ncat(char *buffer, uint32_t buflen, ...) {
|
|||
*p = 0;
|
||||
va_end(args);
|
||||
|
||||
return p - buffer;
|
||||
return static_cast<int32_t>(p - buffer);
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
|
|
@ -300,21 +300,21 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
|
|||
1,1,1,1,0x864,0x198d,1,1,1,1,1,1,0x868,0x1993,1,0x86c,
|
||||
0x1999,1,1,1,1,1,1,1,0xfc0e,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,0xffcc,0xffb8,0xffcc,
|
||||
0xffcc,1,1,1,0x29dd,0x29e3,0x29e9,0x29ef,0x29f5,0x29fb,0x2a01,0x2a07,1,1,1,1,
|
||||
0xffcc,1,1,1,0x29dc,0x29e2,0x29e8,0x29ee,0x29f4,0x29fa,0x2a00,0x2a06,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,0xfe0e,1,0xfc00,1,1,1,1,1,
|
||||
1,1,1,0x870,1,1,1,0x199f,0x19a5,0xfe12,1,1,1,1,1,1,
|
||||
1,1,1,0xfc00,1,1,1,1,0x2a0d,0x2a13,1,0x2a19,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1f,
|
||||
1,1,0x2a25,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
|
||||
1,1,1,0xfc00,1,1,1,1,0x2a0c,0x2a12,1,0x2a18,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1e,
|
||||
1,1,0x2a24,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
|
||||
1,1,1,1,1,0x2a2b,0x2a31,0x2a37,1,1,0x2a3d,1,1,1,1,1,
|
||||
1,1,1,1,1,0x2a2a,0x2a30,0x2a36,1,1,0x2a3c,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x878,
|
||||
0x19ab,1,1,0x19b1,0x19b7,0xfe12,1,1,1,1,1,1,1,1,0xfc00,0xfc00,
|
||||
1,1,1,1,0x2a43,0x2a49,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,0x2a42,0x2a48,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,0x884,1,0x19bd,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,0xfc00,1,
|
||||
|
@ -342,7 +342,7 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
|
|||
1,1,1,0x2a4f,1,1,1,1,1,1,1,1,1,0x2a55,1,1,
|
||||
1,1,0x2a5b,1,1,1,1,0x2a61,1,1,1,1,0x2a67,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,0x2a6d,1,1,1,1,1,1,
|
||||
1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a73,1,0x2a79,1,0xff04,0xff04,0xff04,0xff04,1,1,
|
||||
1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a72,1,0x2a78,1,0xff04,0xff04,0xff04,0xff04,1,1,
|
||||
0xff04,0x3c50,0xffcc,0xffcc,0xfe12,1,0xffcc,0xffcc,1,1,1,1,1,1,1,1,
|
||||
1,1,1,0x2a7f,1,1,1,1,1,1,1,1,1,0x2a85,1,1,
|
||||
1,1,0x2a8b,1,1,1,1,0x2a91,1,1,1,1,0x2a97,1,1,1,
|
||||
|
@ -406,15 +406,15 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
|
|||
0x21ef,0x21f9,0x2203,0x220d,0x10d8,0x10e6,0x2217,0x2221,0x222b,0x2235,1,1,0x10f4,0x1102,0x223f,0x2249,
|
||||
0x2253,0x225d,1,1,0x1110,0x1122,0x2267,0x2271,0x227b,0x2285,0x228f,0x2299,1,0x1134,1,0x22a3,
|
||||
1,0x22ad,1,0x22b7,0x1146,0x115c,0x1174,0x1182,0x1190,0x119e,0x11ac,0x11ba,0x11c6,0x11dc,0x11f4,0x1202,
|
||||
0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b97,0x1250,0x3b9e,0x22c5,0x3ba7,0x22cb,0x3baf,0x22d1,0x3bb7,
|
||||
0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b96,0x1250,0x3b9e,0x22c5,0x3ba6,0x22cb,0x3bae,0x22d1,0x3bb6,
|
||||
0x125a,0x3bbe,1,1,0x22d8,0x22e2,0x22f1,0x2301,0x2311,0x2321,0x2331,0x2341,0x234c,0x2356,0x2365,0x2375,
|
||||
0x2385,0x2395,0x23a5,0x23b5,0x23c0,0x23ca,0x23d9,0x23e9,0x23f9,0x2409,0x2419,0x2429,0x2434,0x243e,0x244d,0x245d,
|
||||
0x246d,0x247d,0x248d,0x249d,0x24a8,0x24b2,0x24c1,0x24d1,0x24e1,0x24f1,0x2501,0x2511,0x251c,0x2526,0x2535,0x2545,
|
||||
0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc7,
|
||||
0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bcf,0x2607,0x3bd7,
|
||||
0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be1,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3beb,
|
||||
1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf5,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bff,
|
||||
0x26b3,0x26b9,0x3c07,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0f,0x26e9,0x3c17,
|
||||
0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc6,
|
||||
0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bce,0x2607,0x3bd6,
|
||||
0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be0,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3bea,
|
||||
1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf4,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bfe,
|
||||
0x26b3,0x26b9,0x3c06,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0e,0x26e9,0x3c16,
|
||||
0x26ee,0x2aab,0x8fc,1,0xfa09,0xfa09,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,0xffcc,0xffcc,0xfe02,0xfe02,0xffcc,0xffcc,0xffcc,0xffcc,0xfe02,0xfe02,0xfe02,0xffcc,
|
||||
|
@ -512,10 +512,10 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
|
|||
0x311b,0x3009,0x311f,0x3123,0x3127,0x312b,0x312f,0x3011,0x2f09,0x3133,0x3015,0x3137,0x3019,0x313b,0x2ae1,0x313f,
|
||||
0x3145,0x314b,0x3151,0x3155,0x3159,0x315d,0x3163,0x3169,0x316f,0x3173,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,0x3177,0xfe34,0x317d,1,1,1,1,
|
||||
1,1,1,1,1,1,0x3183,0x3189,0x3191,0x319b,0x31a3,0x31a9,0x31af,0x31b5,0x31bb,0x31c1,
|
||||
0x31c7,0x31cd,0x31d3,1,0x31d9,0x31df,0x31e5,0x31eb,0x31f1,1,0x31f7,1,0x31fd,0x3203,1,0x3209,
|
||||
0x320f,1,0x3215,0x321b,0x3221,0x3227,0x322d,0x3233,0x3239,0x323f,0x3245,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,0x3176,0xfe34,0x317c,1,1,1,1,
|
||||
1,1,1,1,1,1,0x3182,0x3188,0x3190,0x319a,0x31a2,0x31a8,0x31ae,0x31b4,0x31ba,0x31c0,
|
||||
0x31c6,0x31cc,0x31d2,1,0x31d8,0x31de,0x31e4,0x31ea,0x31f0,1,0x31f6,1,0x31fc,0x3202,1,0x3208,
|
||||
0x320e,1,0x3214,0x321a,0x3220,0x3226,0x322c,0x3232,0x3238,0x323e,0x3244,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,
|
||||
0xffcc,0xffcc,0xffcc,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffcc,0xffcc,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
|
@ -560,13 +560,13 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
|
|||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,0xfe02,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,0x324b,0x3255,0x3269,0x3281,0x3299,0x32b1,0x32c9,0xffb0,0xffb0,0xfe02,
|
||||
1,1,1,1,1,1,0x324a,0x3254,0x3268,0x3280,0x3298,0x32b0,0x32c8,0xffb0,0xffb0,0xfe02,
|
||||
0xfe02,0xfe02,1,1,1,0xffc4,0xffb0,0xffb0,0xffb0,0xffb0,0xffb0,1,1,1,1,1,
|
||||
1,1,1,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,1,1,0xffcc,0xffcc,0xffcc,
|
||||
0xffcc,0xffcc,0xffb8,0xffb8,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,0x32d7,0x32e1,0x32f5,0x330d,0x3325,
|
||||
0x333d,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,0x32d6,0x32e0,0x32f4,0x330c,0x3324,
|
||||
0x333c,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,0xffcc,0xffcc,0xffcc,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
|
|
|
@ -20,10 +20,10 @@
|
|||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "cpputils.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "ustr_imp.h" // U_EDITS_NO_RESET
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -226,14 +226,14 @@ public:
|
|||
private:
|
||||
virtual void
|
||||
normalize(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
impl.compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
||||
|
||||
void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override {
|
||||
Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
@ -249,12 +249,12 @@ private:
|
|||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, safeMiddle, buffer, errorCode);
|
||||
}
|
||||
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -271,7 +271,7 @@ private:
|
|||
return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
|
||||
}
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
|
||||
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -279,7 +279,7 @@ private:
|
|||
return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
|
@ -293,20 +293,20 @@ private:
|
|||
return qcResult;
|
||||
}
|
||||
virtual const UChar *
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &) const override {
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &) const U_OVERRIDE {
|
||||
return impl.composeQuickCheck(src, limit, onlyContiguous, NULL);
|
||||
}
|
||||
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const U_OVERRIDE {
|
||||
return impl.getCompQuickCheck(impl.getNorm16(c));
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const override {
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
|
||||
return impl.hasCompBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const override {
|
||||
virtual UBool isInert(UChar32 c) const U_OVERRIDE {
|
||||
return impl.isCompInert(c, onlyContiguous);
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cstring.h"
|
||||
|
@ -30,7 +31,6 @@
|
|||
#include "normalizer2impl.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "ustr_imp.h" // U_EDITS_NO_RESET
|
||||
|
||||
using icu::Normalizer2Impl;
|
||||
|
||||
|
@ -85,7 +85,7 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const override {
|
||||
UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&dest!=&src) {
|
||||
dest=src;
|
||||
|
@ -97,7 +97,7 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
}
|
||||
virtual void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override {
|
||||
Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if (edits != nullptr) {
|
||||
if ((options & U_EDITS_NO_RESET) == 0) {
|
||||
|
@ -115,7 +115,7 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override {
|
||||
UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&first!=&second) {
|
||||
first.append(second);
|
||||
|
@ -128,7 +128,7 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override {
|
||||
UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&first!=&second) {
|
||||
first.append(second);
|
||||
|
@ -139,29 +139,29 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
return first;
|
||||
}
|
||||
virtual UBool
|
||||
getDecomposition(UChar32, UnicodeString &) const override {
|
||||
getDecomposition(UChar32, UnicodeString &) const U_OVERRIDE {
|
||||
return FALSE;
|
||||
}
|
||||
// No need to override the default getRawDecomposition().
|
||||
// No need to override the default getRawDecomposition().
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
|
||||
isNormalized(const UnicodeString &, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
return U_SUCCESS(errorCode);
|
||||
}
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
|
||||
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
return U_SUCCESS(errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &, UErrorCode &) const override {
|
||||
quickCheck(const UnicodeString &, UErrorCode &) const U_OVERRIDE {
|
||||
return UNORM_YES;
|
||||
}
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override {
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const U_OVERRIDE {
|
||||
return s.length();
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32) const override { return TRUE; }
|
||||
virtual UBool hasBoundaryAfter(UChar32) const override { return TRUE; }
|
||||
virtual UBool isInert(UChar32) const override { return TRUE; }
|
||||
virtual UBool hasBoundaryBefore(UChar32) const U_OVERRIDE { return TRUE; }
|
||||
virtual UBool hasBoundaryAfter(UChar32) const U_OVERRIDE { return TRUE; }
|
||||
virtual UBool isInert(UChar32) const U_OVERRIDE { return TRUE; }
|
||||
};
|
||||
|
||||
NoopNormalizer2::~NoopNormalizer2() {}
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "bytesinkutil.h"
|
||||
#include "cmemory.h"
|
||||
#include "mutex.h"
|
||||
#include "normalizer2impl.h"
|
||||
|
@ -129,60 +130,6 @@ int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
|
|||
return -1;
|
||||
}
|
||||
|
||||
/** The bytes at [src, nextSrc[ were mapped to valid (s16, s16Length). */
|
||||
UBool
|
||||
appendChange(const uint8_t *src, const uint8_t *nextSrc,
|
||||
const char16_t *s16, int32_t s16Length,
|
||||
ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
|
||||
U_ASSERT(U_SUCCESS(errorCode));
|
||||
U_ASSERT((nextSrc - src) <= INT32_MAX); // ensured by caller
|
||||
char scratch[200];
|
||||
int32_t s8Length = 0;
|
||||
for (int32_t i = 0; i < s16Length;) {
|
||||
int32_t capacity;
|
||||
int32_t desiredCapacity = s16Length - i;
|
||||
if (desiredCapacity < (INT32_MAX / 3)) {
|
||||
desiredCapacity *= 3; // max 3 UTF-8 bytes per UTF-16 code unit
|
||||
} else if (desiredCapacity < (INT32_MAX / 2)) {
|
||||
desiredCapacity *= 2;
|
||||
} else {
|
||||
desiredCapacity = INT32_MAX;
|
||||
}
|
||||
char *buffer = sink.GetAppendBuffer(U8_MAX_LENGTH, desiredCapacity,
|
||||
scratch, UPRV_LENGTHOF(scratch), &capacity);
|
||||
capacity -= U8_MAX_LENGTH - 1;
|
||||
int32_t j = 0;
|
||||
for (; i < s16Length && j < capacity;) {
|
||||
UChar32 c;
|
||||
U16_NEXT_UNSAFE(s16, i, c);
|
||||
U8_APPEND_UNSAFE(buffer, j, c);
|
||||
}
|
||||
if (j > (INT32_MAX - s8Length)) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
sink.Append(buffer, j);
|
||||
s8Length += j;
|
||||
}
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace((int32_t)(nextSrc - src), s8Length);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/** The few bytes at [src, nextSrc[ were mapped to valid code point c. */
|
||||
void
|
||||
appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c,
|
||||
ByteSink &sink, Edits *edits) {
|
||||
char buffer[U8_MAX_LENGTH];
|
||||
int32_t length = 0;
|
||||
U8_APPEND_UNSAFE(buffer, length, c);
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace((int32_t)(nextSrc - src), length);
|
||||
}
|
||||
sink.Append(buffer, length);
|
||||
}
|
||||
|
||||
void
|
||||
appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
|
||||
ByteSink &sink, Edits *edits) {
|
||||
|
@ -214,27 +161,6 @@ appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t del
|
|||
sink.Append(buffer, length);
|
||||
}
|
||||
|
||||
UBool
|
||||
appendUnchanged(const uint8_t *s, const uint8_t *limit,
|
||||
ByteSink &sink, uint32_t options, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
U_ASSERT(U_SUCCESS(errorCode));
|
||||
if ((limit - s) > INT32_MAX) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
int32_t length = (int32_t)(limit - s);
|
||||
if (length > 0) {
|
||||
if (edits != nullptr) {
|
||||
edits->addUnchanged(length);
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) ==0) {
|
||||
sink.Append(reinterpret_cast<const char *>(s), length);
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// ReorderingBuffer -------------------------------------------------------- ***
|
||||
|
@ -1851,7 +1777,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
for (;;) {
|
||||
if (src == limit) {
|
||||
if (prevBoundary != limit && sink != nullptr) {
|
||||
appendUnchanged(prevBoundary, limit, *sink, options, edits, errorCode);
|
||||
ByteSinkUtil::appendUnchanged(prevBoundary, limit,
|
||||
*sink, options, edits, errorCode);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
@ -1884,7 +1811,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
|
||||
hasCompBoundaryBefore(src, limit)) {
|
||||
if (prevBoundary != prevSrc &&
|
||||
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
|
||||
|
@ -1896,13 +1824,14 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
|
||||
hasCompBoundaryBefore(src, limit)) {
|
||||
if (prevBoundary != prevSrc &&
|
||||
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
const uint16_t *mapping = getMapping(norm16);
|
||||
int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
|
||||
if (!appendChange(prevSrc, src, (const UChar *)mapping, length,
|
||||
*sink, edits, errorCode)) {
|
||||
if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
|
||||
*sink, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
prevBoundary = src;
|
||||
|
@ -1915,7 +1844,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
if (hasCompBoundaryBefore(src, limit) ||
|
||||
hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
|
||||
if (prevBoundary != prevSrc &&
|
||||
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
if (edits != nullptr) {
|
||||
|
@ -1955,10 +1885,11 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
Hangul::JAMO_T_COUNT + t;
|
||||
prevSrc -= 3; // Replace the Jamo L as well.
|
||||
if (prevBoundary != prevSrc &&
|
||||
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
appendCodePoint(prevSrc, src, syllable, *sink, edits);
|
||||
ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
|
||||
prevBoundary = src;
|
||||
continue;
|
||||
}
|
||||
|
@ -1979,10 +1910,11 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
|
||||
prevSrc -= 3; // Replace the Hangul LV as well.
|
||||
if (prevBoundary != prevSrc &&
|
||||
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
appendCodePoint(prevSrc, src, syllable, *sink, edits);
|
||||
ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
|
||||
prevBoundary = src;
|
||||
continue;
|
||||
}
|
||||
|
@ -2006,7 +1938,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
for (;;) {
|
||||
if (src == limit) {
|
||||
if (sink != nullptr) {
|
||||
appendUnchanged(prevBoundary, limit, *sink, options, edits, errorCode);
|
||||
ByteSinkUtil::appendUnchanged(prevBoundary, limit,
|
||||
*sink, options, edits, errorCode);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
@ -2070,11 +2003,12 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
return FALSE;
|
||||
}
|
||||
if (prevBoundary != prevSrc &&
|
||||
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
if (!appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
|
||||
*sink, edits, errorCode)) {
|
||||
if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
|
||||
*sink, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
prevBoundary = src;
|
||||
|
|
|
@ -675,6 +675,16 @@ extern U_IMPORT char *U_TZNAME[];
|
|||
|
||||
#if !UCONFIG_NO_FILE_IO && ((U_PLATFORM_IS_DARWIN_BASED && (U_PLATFORM != U_PF_IPHONE || defined(U_TIMEZONE))) || U_PLATFORM_IS_LINUX_BASED || U_PLATFORM == U_PF_BSD || U_PLATFORM == U_PF_SOLARIS)
|
||||
/* These platforms are likely to use Olson timezone IDs. */
|
||||
/* common targets of the symbolic link at TZDEFAULT are:
|
||||
* "/usr/share/zoneinfo/<olsonID>" default, older Linux distros, macOS to 10.12
|
||||
* "../usr/share/zoneinfo/<olsonID>" newer Linux distros: Red Hat Enterprise Linux 7, Ubuntu, SuSe Linux
|
||||
* "/usr/share/lib/zoneinfo/<olsonID>" Solaris
|
||||
* "../usr/share/lib/zoneinfo/<olsonID>" Solaris
|
||||
* "/var/db/timezone/zoneinfo/<olsonID>" macOS 10.13
|
||||
* To avoid checking lots of paths, just check that the target path
|
||||
* before the <olsonID> ends with "/zoneinfo/", and the <olsonID> is valid.
|
||||
*/
|
||||
|
||||
#define CHECK_LOCALTIME_LINK 1
|
||||
#if U_PLATFORM_IS_DARWIN_BASED
|
||||
#include <tzfile.h>
|
||||
|
@ -682,12 +692,12 @@ extern U_IMPORT char *U_TZNAME[];
|
|||
#elif U_PLATFORM == U_PF_SOLARIS
|
||||
#define TZDEFAULT "/etc/localtime"
|
||||
#define TZZONEINFO "/usr/share/lib/zoneinfo/"
|
||||
#define TZZONEINFO2 "../usr/share/lib/zoneinfo/"
|
||||
#define TZ_ENV_CHECK "localtime"
|
||||
#else
|
||||
#define TZDEFAULT "/etc/localtime"
|
||||
#define TZZONEINFO "/usr/share/zoneinfo/"
|
||||
#endif
|
||||
#define TZZONEINFOTAIL "/zoneinfo/"
|
||||
#if U_HAVE_DIRENT_H
|
||||
#define TZFILE_SKIP "posixrules" /* tz file to skip when searching. */
|
||||
/* Some Linux distributions have 'localtime' in /usr/share/zoneinfo
|
||||
|
@ -1131,24 +1141,15 @@ uprv_tzname(int n)
|
|||
*/
|
||||
int32_t ret = (int32_t)readlink(TZDEFAULT, gTimeZoneBuffer, sizeof(gTimeZoneBuffer)-1);
|
||||
if (0 < ret) {
|
||||
int32_t tzZoneInfoLen = uprv_strlen(TZZONEINFO);
|
||||
int32_t tzZoneInfoTailLen = uprv_strlen(TZZONEINFOTAIL);
|
||||
gTimeZoneBuffer[ret] = 0;
|
||||
if (uprv_strncmp(gTimeZoneBuffer, TZZONEINFO, tzZoneInfoLen) == 0
|
||||
&& isValidOlsonID(gTimeZoneBuffer + tzZoneInfoLen))
|
||||
char * tzZoneInfoTailPtr = uprv_strstr(gTimeZoneBuffer, TZZONEINFOTAIL);
|
||||
|
||||
if (tzZoneInfoTailPtr != NULL
|
||||
&& isValidOlsonID(tzZoneInfoTailPtr + tzZoneInfoTailLen))
|
||||
{
|
||||
return (gTimeZoneBufferPtr = gTimeZoneBuffer + tzZoneInfoLen);
|
||||
return (gTimeZoneBufferPtr = tzZoneInfoTailPtr + tzZoneInfoTailLen);
|
||||
}
|
||||
#if U_PLATFORM == U_PF_SOLARIS
|
||||
else
|
||||
{
|
||||
tzZoneInfoLen = uprv_strlen(TZZONEINFO2);
|
||||
if (uprv_strncmp(gTimeZoneBuffer, TZZONEINFO2, tzZoneInfoLen) == 0
|
||||
&& isValidOlsonID(gTimeZoneBuffer + tzZoneInfoLen))
|
||||
{
|
||||
return (gTimeZoneBufferPtr = gTimeZoneBuffer + tzZoneInfoLen);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
#if defined(SEARCH_TZFILE)
|
||||
DefaultTZInfo* tzInfo = (DefaultTZInfo*)uprv_malloc(sizeof(DefaultTZInfo));
|
||||
|
|
File diff suppressed because it is too large
Load diff
622
icu4c/source/common/rbbi_cache.cpp
Normal file
622
icu4c/source/common/rbbi_cache.cpp
Normal file
|
@ -0,0 +1,622 @@
|
|||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// file: rbbi_cache.cpp
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/rbbi.h"
|
||||
|
||||
#include "rbbi_cache.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "cmemory.h"
|
||||
#include "rbbidata.h"
|
||||
#include "uassert.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* DictionaryCache implementation
|
||||
*/
|
||||
|
||||
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
|
||||
fBI(bi), fBreaks(NULL), fPositionInCache(-1),
|
||||
fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
|
||||
fBreaks = new UVector32(status);
|
||||
}
|
||||
|
||||
RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
|
||||
delete fBreaks;
|
||||
fBreaks = NULL;
|
||||
}
|
||||
|
||||
void RuleBasedBreakIterator::DictionaryCache::reset() {
|
||||
fPositionInCache = -1;
|
||||
fStart = 0;
|
||||
fLimit = 0;
|
||||
fFirstRuleStatusIndex = 0;
|
||||
fOtherRuleStatusIndex = 0;
|
||||
fBreaks->removeAllElements();
|
||||
}
|
||||
|
||||
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
|
||||
if (fromPos >= fLimit || fromPos < fStart) {
|
||||
fPositionInCache = -1;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Sequential iteration, move from previous boundary to the following
|
||||
|
||||
int32_t r = 0;
|
||||
if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
|
||||
++fPositionInCache;
|
||||
if (fPositionInCache >= fBreaks->size()) {
|
||||
fPositionInCache = -1;
|
||||
return FALSE;
|
||||
}
|
||||
r = fBreaks->elementAti(fPositionInCache);
|
||||
U_ASSERT(r > fromPos);
|
||||
*result = r;
|
||||
*statusIndex = fOtherRuleStatusIndex;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// Random indexing. Linear search for the boundary following the given position.
|
||||
|
||||
for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) {
|
||||
r= fBreaks->elementAti(fPositionInCache);
|
||||
if (r > fromPos) {
|
||||
*result = r;
|
||||
*statusIndex = fOtherRuleStatusIndex;
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
U_ASSERT(FALSE);
|
||||
fPositionInCache = -1;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
|
||||
if (fromPos <= fStart || fromPos > fLimit) {
|
||||
fPositionInCache = -1;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (fromPos == fLimit) {
|
||||
fPositionInCache = fBreaks->size() - 1;
|
||||
if (fPositionInCache >= 0) {
|
||||
U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t r;
|
||||
if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
|
||||
--fPositionInCache;
|
||||
r = fBreaks->elementAti(fPositionInCache);
|
||||
U_ASSERT(r < fromPos);
|
||||
*result = r;
|
||||
*statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
if (fPositionInCache == 0) {
|
||||
fPositionInCache = -1;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) {
|
||||
r = fBreaks->elementAti(fPositionInCache);
|
||||
if (r < fromPos) {
|
||||
*result = r;
|
||||
*statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
U_ASSERT(FALSE);
|
||||
fPositionInCache = -1;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
|
||||
int32_t firstRuleStatus, int32_t otherRuleStatus) {
|
||||
if ((endPos - startPos) <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
reset();
|
||||
fFirstRuleStatusIndex = firstRuleStatus;
|
||||
fOtherRuleStatusIndex = otherRuleStatus;
|
||||
|
||||
int32_t rangeStart = startPos;
|
||||
int32_t rangeEnd = endPos;
|
||||
|
||||
uint16_t category;
|
||||
int32_t current;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t foundBreakCount = 0;
|
||||
UText *text = fBI->fText;
|
||||
|
||||
// Loop through the text, looking for ranges of dictionary characters.
|
||||
// For each span, find the appropriate break engine, and ask it to find
|
||||
// any breaks within the span.
|
||||
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
UChar32 c = utext_current32(text);
|
||||
category = UTRIE2_GET16(fBI->fData->fTrie, c);
|
||||
|
||||
while(U_SUCCESS(status)) {
|
||||
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) {
|
||||
utext_next32(text); // TODO: cleaner loop structure.
|
||||
c = utext_current32(text);
|
||||
category = UTRIE2_GET16(fBI->fData->fTrie, c);
|
||||
}
|
||||
if (current >= rangeEnd) {
|
||||
break;
|
||||
}
|
||||
|
||||
// We now have a dictionary character. Get the appropriate language object
|
||||
// to deal with it.
|
||||
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
|
||||
|
||||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != NULL) {
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
c = utext_current32(text);
|
||||
category = UTRIE2_GET16(fBI->fData->fTrie, c);
|
||||
}
|
||||
|
||||
// If we found breaks, ensure that the first and last entries are
|
||||
// the original starting and ending position. And initialize the
|
||||
// cache iteration position to the first entry.
|
||||
|
||||
// printf("foundBreakCount = %d\n", foundBreakCount);
|
||||
if (foundBreakCount > 0) {
|
||||
U_ASSERT(foundBreakCount == fBreaks->size());
|
||||
if (startPos < fBreaks->elementAti(0)) {
|
||||
// The dictionary did not place a boundary at the start of the segment of text.
|
||||
// Add one now. This should not commonly happen, but it would be easy for interactions
|
||||
// of the rules for dictionary segments and the break engine implementations to
|
||||
// inadvertently cause it. Cover it here, just in case.
|
||||
fBreaks->insertElementAt(startPos, 0, status);
|
||||
}
|
||||
if (endPos > fBreaks->peeki()) {
|
||||
fBreaks->push(endPos, status);
|
||||
}
|
||||
fPositionInCache = 0;
|
||||
// Note: Dictionary matching may extend beyond the original limit.
|
||||
fStart = fBreaks->elementAti(0);
|
||||
fLimit = fBreaks->peeki();
|
||||
} else {
|
||||
// there were no language-based breaks, even though the segment contained
|
||||
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
|
||||
// for this range will fail, and the calling code will fall back to the rule based boundaries.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* BreakCache implementation
|
||||
*/
|
||||
|
||||
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
|
||||
fBI(bi), fSideBuffer(status) {
|
||||
reset();
|
||||
}
|
||||
|
||||
|
||||
RuleBasedBreakIterator::BreakCache::~BreakCache() {
|
||||
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
|
||||
fStartBufIdx = 0;
|
||||
fEndBufIdx = 0;
|
||||
fTextIdx = pos;
|
||||
fBufIdx = 0;
|
||||
fBoundaries[0] = pos;
|
||||
fStatuses[0] = (uint16_t)ruleStatus;
|
||||
}
|
||||
|
||||
|
||||
int32_t RuleBasedBreakIterator::BreakCache::current() {
|
||||
fBI->fPosition = fTextIdx;
|
||||
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
|
||||
fBI->fDone = FALSE;
|
||||
return fTextIdx;
|
||||
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
|
||||
// startPos is in the cache. Do a next() from that position.
|
||||
// TODO: an awkward set of interactions with bi->fDone
|
||||
// seek() does not clear it; it can't because of interactions with populateNear().
|
||||
// next() does not clear it in the fast-path case, where everything matters. Maybe it should.
|
||||
// So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
|
||||
fBI->fDone = false;
|
||||
next();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) {
|
||||
if (startPos == fTextIdx) {
|
||||
previous(status);
|
||||
} else {
|
||||
// seek() leaves the BreakCache positioned at the preceding boundary
|
||||
// if the requested position is between two boundaries.
|
||||
// current() pushes the BreakCache position out to the BreakIterator itself.
|
||||
U_ASSERT(startPos > fTextIdx);
|
||||
current();
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Out-of-line code for BreakCache::next().
|
||||
* Cache does not already contain the boundary
|
||||
*/
|
||||
void RuleBasedBreakIterator::BreakCache::nextOL() {
|
||||
fBI->fDone = !populateFollowing();
|
||||
fBI->fPosition = fTextIdx;
|
||||
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
int32_t initialBufIdx = fBufIdx;
|
||||
if (fBufIdx == fStartBufIdx) {
|
||||
// At start of cache. Prepend to it.
|
||||
populatePreceding(status);
|
||||
} else {
|
||||
// Cache already holds the preceding boundary
|
||||
fBufIdx = modChunkSize(fBufIdx - 1);
|
||||
fTextIdx = fBoundaries[fBufIdx];
|
||||
}
|
||||
fBI->fDone = (fBufIdx == initialBufIdx);
|
||||
fBI->fPosition = fTextIdx;
|
||||
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
|
||||
if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
|
||||
return FALSE;
|
||||
}
|
||||
if (pos == fBoundaries[fStartBufIdx]) {
|
||||
// Common case: seek(0), from BreakIterator::first()
|
||||
fBufIdx = fStartBufIdx;
|
||||
fTextIdx = fBoundaries[fBufIdx];
|
||||
return TRUE;
|
||||
}
|
||||
if (pos == fBoundaries[fEndBufIdx]) {
|
||||
fBufIdx = fEndBufIdx;
|
||||
fTextIdx = fBoundaries[fBufIdx];
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int32_t min = fStartBufIdx;
|
||||
int32_t max = fEndBufIdx;
|
||||
while (min != max) {
|
||||
int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
|
||||
probe = modChunkSize(probe);
|
||||
if (fBoundaries[probe] > pos) {
|
||||
max = probe;
|
||||
} else {
|
||||
min = modChunkSize(probe + 1);
|
||||
}
|
||||
}
|
||||
U_ASSERT(fBoundaries[max] > pos);
|
||||
fBufIdx = modChunkSize(max - 1);
|
||||
fTextIdx = fBoundaries[fBufIdx];
|
||||
U_ASSERT(fTextIdx <= pos);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
// Extend (or rebuild) the cache so that it covers `position`.
//
// Precondition (asserted): position lies outside the range of boundaries currently cached.
// On success the iteration position (fBufIdx / fTextIdx) is left on `position` when it is a
// boundary, otherwise on the cached boundary preceding it.
//
// @param position  Text index to bring into the cache.
// @param status    In/out ICU error code. Nothing is done if it already indicates failure.
// @return TRUE on success. FALSE if status indicated failure on entry, or if the cache could
//         not be extended far enough to reach position.
UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

    // Find a boundary somewhere in the vicinity of the requested position.
    // Depending on the safe rules and the text data, it could be either before, at, or after
    // the requested position.

    // If the requested position is not near already cached positions, clear the existing cache,
    // find a near-by boundary and begin new cache contents there.
    if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
        // Defaults restart the cache at the beginning of the text; they are used as-is
        // when the requested position is within the first 20 text units.
        int32_t aBoundary = 0;
        int32_t ruleStatusIndex = 0;
        // TODO: check for position == length of text. Although may still need to back up to get rule status.
        if (position > 20) {
            int32_t backupPos = fBI->handlePrevious(position);
            fBI->fPosition = backupPos;
            aBoundary = fBI->handleNext();        // Ignore dictionary, just finding a rule based boundary.
            ruleStatusIndex = fBI->fRuleStatusIndex;
        }
        reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
    }

    // Fill in boundaries between existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
        // The last position in the cache precedes the requested position.
        // Add following position(s) to the cache.
        while (fBoundaries[fEndBufIdx] < position) {
            if (!populateFollowing()) {
                U_ASSERT(false);
                return FALSE;
            }
        }
        fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
        fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
        while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
            previous(status);
            if (U_FAILURE(status)) {
                // previous() could not back up; fTextIdx would never change,
                // so stop rather than spin forever.
                return FALSE;
            }
        }
        return TRUE;
    }

    if (fBoundaries[fStartBufIdx] > position) {
        // The first position in the cache is beyond the requested position.
        // back up more until we get a boundary <= the requested position.
        while (fBoundaries[fStartBufIdx] > position) {
            if (!populatePreceding(status)) {
                // Nothing was added (populatePreceding returns FALSE when status is a failure
                // or when it has no boundary to add). The loop condition can no longer change,
                // so report the failure instead of looping forever.
                return FALSE;
            }
        }
        fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
        fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
        while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
            next();
        }
        if (fTextIdx > position) {
            // If position is not itself a boundary, the next() loop above will overshoot.
            // Back up one, leaving cache position at the boundary preceding the requested position.
            previous(status);
        }
        return TRUE;
    }

    // The freshly reset cache holds exactly the requested position.
    U_ASSERT(fTextIdx == position);
    return TRUE;
}
|
||||
|
||||
|
||||
|
||||
// Append the boundary that follows the last cached boundary, moving the iteration position
// onto it. A few additional rule-based boundaries may be appended after it as read-ahead;
// those do not move the iteration position.
//
// Returns FALSE, adding nothing, when handleNext() reports UBRK_DONE; TRUE otherwise.
UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
    const int32_t lastCachedPos    = fBoundaries[fEndBufIdx];
    const int32_t lastCachedStatus = fStatuses[fEndBufIdx];
    int32_t boundary  = 0;
    int32_t statusIdx = 0;

    // Fast path: the dictionary cache already knows the boundary after the last cached one.
    if (fBI->fDictionaryCache->following(lastCachedPos, &boundary, &statusIdx)) {
        addFollowing(boundary, statusIdx, UpdateCachePosition);
        return TRUE;
    }

    // Otherwise run the rules forward from the last cached boundary.
    fBI->fPosition = lastCachedPos;
    boundary = fBI->handleNext();
    if (boundary == UBRK_DONE) {
        return FALSE;
    }
    statusIdx = fBI->fRuleStatusIndex;

    UBool fromDictionary = FALSE;
    if (fBI->fDictionaryCharCount > 0) {
        // The text segment obtained from the rules includes dictionary characters.
        // Subdivide it, with subdivided results going into the dictionary cache,
        // then ask the dictionary cache for the first of them.
        fBI->fDictionaryCache->populateDictionary(lastCachedPos, boundary, lastCachedStatus, statusIdx);
        fromDictionary = fBI->fDictionaryCache->following(lastCachedPos, &boundary, &statusIdx);
        // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
        //       But be careful with interactions with populateNear().
    }

    // Either a dictionary boundary, or the end point of the rule-based segment (no dictionary
    // characters, or the dictionary segmenter did not handle them).
    addFollowing(boundary, statusIdx, UpdateCachePosition);
    if (fromDictionary) {
        return TRUE;
    }

    // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
    // (Subsequent calls to BreakIterator::next() will take the fast path, getting cached results.)
    int remaining = 6;
    while (remaining-- > 0) {
        boundary = fBI->handleNext();
        if (boundary == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
            break;
        }
        addFollowing(boundary, fBI->fRuleStatusIndex, RetainCachePosition);
    }

    return TRUE;
}
|
||||
|
||||
|
||||
// Prepend one or more boundaries ahead of the first cached boundary, leaving the iteration
// position on the boundary immediately preceding the old cache start.
//
// Returns FALSE without adding anything when status already indicates failure, when the
// first cached boundary is 0, or when no boundary ended up in the side buffer.
UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return FALSE;
    }

    const int32_t cacheStart = fBoundaries[fStartBufIdx];
    if (cacheStart == 0) {
        return FALSE;
    }

    int32_t boundary = 0;
    int32_t boundaryStatus = 0;

    // Fast path: the dictionary cache already has the preceding boundary.
    if (fBI->fDictionaryCache->preceding(cacheStart, &boundary, &boundaryStatus)) {
        addPreceding(boundary, boundaryStatus, UpdateCachePosition);
        return TRUE;
    }

    // Find a boundary somewhere preceding the first already-cached boundary.
    // Step back 30 text units at a time, let handlePrevious() pick a starting point,
    // then advance once with handleNext(); repeat until the result is before cacheStart.
    int32_t probeFrom = cacheStart;
    do {
        probeFrom -= 30;
        probeFrom = (probeFrom <= 0) ? 0 : fBI->handlePrevious(probeFrom);

        const UBool atTextStart = (probeFrom == UBRK_DONE || probeFrom == 0);
        if (atTextStart) {
            boundary = 0;
            boundaryStatus = 0;
        } else {
            fBI->fPosition = probeFrom;        // TODO: pass starting position in a clearer way.
            boundary = fBI->handleNext();
            boundaryStatus = fBI->fRuleStatusIndex;
        }
    } while (boundary >= cacheStart);

    // Find boundaries between the one we just located and the first already-cached boundary.
    // Put them in a side buffer, because we don't yet know where they will fall in the
    // circular cache buffer. Entries are pushed as (position, status) pairs.
    auto stash = [this, &status](int32_t pos, int32_t statusIdx) {
        fSideBuffer.addElement(pos, status);
        fSideBuffer.addElement(statusIdx, status);
    };

    fSideBuffer.removeAllElements();
    stash(boundary, boundaryStatus);

    for (;;) {
        fBI->fPosition = boundary;
        int32_t segmentStart = boundary;
        const int32_t segmentStartStatus = boundaryStatus;

        boundary = fBI->handleNext();
        boundaryStatus = fBI->fRuleStatusIndex;
        if (boundary == UBRK_DONE) {
            break;
        }

        UBool dictionaryTookSegment = FALSE;
        if (fBI->fDictionaryCharCount != 0) {
            // Segment from the rules includes dictionary characters.
            // Subdivide it, with subdivided results going into the dictionary cache.
            const int32_t segmentEnd = boundary;
            fBI->fDictionaryCache->populateDictionary(segmentStart, segmentEnd, segmentStartStatus, boundaryStatus);
            while (fBI->fDictionaryCache->following(segmentStart, &boundary, &boundaryStatus)) {
                dictionaryTookSegment = TRUE;
                U_ASSERT(boundary > segmentStart);
                if (boundary >= cacheStart) {
                    break;
                }
                U_ASSERT(boundary <= segmentEnd);
                stash(boundary, boundaryStatus);
                segmentStart = boundary;
            }
            U_ASSERT(boundary == segmentEnd || boundary >= cacheStart);
        }

        if (!dictionaryTookSegment && boundary < cacheStart) {
            stash(boundary, boundaryStatus);
        }

        if (boundary >= cacheStart) {
            break;
        }
    }

    // Move boundaries from the side buffer to the main circular buffer, last one first.
    // The first boundary moved (the one nearest the old cache start) becomes the new
    // iteration position; the remaining ones are added without moving it.
    UBool addedAny = FALSE;
    while (!fSideBuffer.isEmpty()) {
        const int32_t poppedStatus = fSideBuffer.popi();
        const int32_t poppedPos = fSideBuffer.popi();
        if (!addedAny) {
            addPreceding(poppedPos, poppedStatus, UpdateCachePosition);
            addedAny = TRUE;
        } else if (!addPreceding(poppedPos, poppedStatus, RetainCachePosition)) {
            // No space in circular buffer to hold a new preceding result while
            // also retaining the current cache (iteration) position.
            // Bailing out is safe; the cache will refill again if needed.
            break;
        }
    }

    return addedAny;
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
|
||||
U_ASSERT(position > fBoundaries[fEndBufIdx]);
|
||||
U_ASSERT(ruleStatusIdx <= UINT16_MAX);
|
||||
int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
|
||||
if (nextIdx == fStartBufIdx) {
|
||||
fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
|
||||
}
|
||||
fBoundaries[nextIdx] = position;
|
||||
fStatuses[nextIdx] = ruleStatusIdx;
|
||||
fEndBufIdx = nextIdx;
|
||||
if (update == UpdateCachePosition) {
|
||||
// Set current position to the newly added boundary.
|
||||
fBufIdx = nextIdx;
|
||||
fTextIdx = position;
|
||||
} else {
|
||||
// Retaining the original cache position.
|
||||
// Check if the added boundary wraps around the buffer, and would over-write the original position.
|
||||
// It's the responsibility of callers of this function to not add too many.
|
||||
U_ASSERT(nextIdx != fBufIdx);
|
||||
}
|
||||
}
|
||||
|
||||
bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
|
||||
U_ASSERT(position < fBoundaries[fStartBufIdx]);
|
||||
U_ASSERT(ruleStatusIdx <= UINT16_MAX);
|
||||
int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
|
||||
if (nextIdx == fEndBufIdx) {
|
||||
if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
|
||||
// Failure. The insertion of the new boundary would claim the buffer position that is the
|
||||
// current iteration position. And we also want to retain the current iteration position.
|
||||
// (The buffer is already completely full of entries that precede the iteration position.)
|
||||
return false;
|
||||
}
|
||||
fEndBufIdx = modChunkSize(fEndBufIdx - 1);
|
||||
}
|
||||
fBoundaries[nextIdx] = position;
|
||||
fStatuses[nextIdx] = ruleStatusIdx;
|
||||
fStartBufIdx = nextIdx;
|
||||
if (update == UpdateCachePosition) {
|
||||
fBufIdx = nextIdx;
|
||||
fTextIdx = position;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::BreakCache::dumpCache() {
|
||||
printf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
|
||||
for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) {
|
||||
printf("%d %d\n", i, fBoundaries[i]);
|
||||
if (i == fEndBufIdx) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
199
icu4c/source/common/rbbi_cache.h
Normal file
199
icu4c/source/common/rbbi_cache.h
Normal file
|
@ -0,0 +1,199 @@
|
|||
// Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// file: rbbi_cache.h
|
||||
//
|
||||
#ifndef RBBI_CACHE_H
|
||||
#define RBBI_CACHE_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
|
||||
* Dictionary boundaries are moved first to this cache, then from here
|
||||
* to the main BreakCache, where they may interleave with non-dictionary
|
||||
* boundaries. The public BreakIterator API always fetches directly
|
||||
* from the main BreakCache, not from here.
|
||||
*
|
||||
* In common situations, the number of boundaries in a single dictionary run
|
||||
* should be quite small; it will be terminated by punctuation, spaces,
|
||||
* or any other non-dictionary characters. The main BreakCache may end
|
||||
* up with boundaries from multiple dictionary based runs.
|
||||
*
|
||||
* The boundaries are stored in a simple ArrayList (vector), with the
|
||||
* assumption that they will be accessed sequentially.
|
||||
*/
|
||||
class RuleBasedBreakIterator::DictionaryCache: public UMemory {
  public:
    DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
    ~DictionaryCache();

    void reset();

    // Look up the cached dictionary boundary following / preceding fromPos.
    // As used by BreakCache: on a TRUE return, *pos and *statusIndex hold the boundary
    // and its rule status index; on FALSE the caller falls back to the rule engine.
    UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
    UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);

    /**
     * Fill the cache with the dictionary-based boundaries found inside a range of text.
     * @param startPos         Start of the text range.
     * @param endPos           End of the text range.
     * @param firstRuleStatus  Rule status index for the break at startPos.
     * @param otherRuleStatus  Rule status index for every boundary other than startPos.
     * @internal
     */
    void populateDictionary(int32_t startPos, int32_t endPos,
                            int32_t firstRuleStatus, int32_t otherRuleStatus);

    RuleBasedBreakIterator *fBI;

    UVector32 *fBreaks;                // The boundaries, held in a vector.
    int32_t fPositionInCache;          // Index into fBreaks of the boundary most recently returned
                                       //   by following() or preceding(); speeds up sequential access.
    int32_t fStart;                    // Text position of the first boundary in the cache.
    int32_t fLimit;                    // Last boundary in the cache, i.e. the limit of the text
                                       //   segment being handled by the dictionary.
    int32_t fFirstRuleStatusIndex;     // Rule status info for the first boundary.
    int32_t fOtherRuleStatusIndex;     // Rule status info for the 2nd through last boundaries.
};
|
||||
|
||||
|
||||
/*
|
||||
* class BreakCache
|
||||
*
|
||||
* Cache of break boundary positions and rule status values.
|
||||
* Break iterator API functions, next(), previous(), etc., will use cached results
|
||||
* when possible, and otherwise cache new results as they are obtained.
|
||||
*
|
||||
* Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
|
||||
*
|
||||
* The cache is implemented as a single circular buffer.
|
||||
*/
|
||||
|
||||
/*
|
||||
* size of the circular cache buffer.
|
||||
*/
|
||||
|
||||
class RuleBasedBreakIterator::BreakCache: public UMemory {
  public:
    BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
    virtual ~BreakCache();

    void reset(int32_t pos = 0, int32_t ruleStatus = 0);

    // Advance to the next boundary. When the iteration position is already on the last
    // cached entry the out-of-line nextOL() takes over; otherwise step to the next cached
    // entry and mirror its position and rule status into the owning break iterator.
    void next() {
        if (fBufIdx == fEndBufIdx) {
            nextOL();
            return;
        }
        fBufIdx = modChunkSize(fBufIdx + 1);
        fTextIdx = fBoundaries[fBufIdx];
        fBI->fPosition = fTextIdx;
        fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    }

    void nextOL();
    void previous(UErrorCode &status);

    // Move the iteration state to the position following the startPosition.
    // Input position must be pinned to the input length.
    void following(int32_t startPosition, UErrorCode &status);

    void preceding(int32_t startPosition, UErrorCode &status);

    /*
     * Update the state of the public BreakIterator (fBI) to reflect the
     * current state of the break iterator cache (this).
     */
    int32_t current();

    /**
     * Add boundaries to the cache near the specified position.
     * The given position need not be a boundary itself.
     * The input position must be within the range of the text, and
     * on a code point boundary.
     * If the requested position is a break boundary, leave the iteration
     * position on it.
     * If the requested position is not a boundary, leave the iteration
     * position on the preceding boundary and include both the
     * preceding and following boundaries in the cache.
     * Additional boundaries, either preceding or following, may be added
     * to the cache as a side effect.
     *
     * Return FALSE if the operation failed.
     */
    UBool populateNear(int32_t position, UErrorCode &status);

    /**
     * Add boundary(s) to the cache following the current last boundary.
     * Return FALSE if at the end of the text, and no more boundaries can be added.
     * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
     */
    UBool populateFollowing();

    /**
     * Add one or more boundaries to the cache preceding the first currently cached boundary.
     * Leave the iteration position on the first added boundary.
     * Return false if no boundaries could be added (if at the start of the text.)
     */
    UBool populatePreceding(UErrorCode &status);

    // Tells addFollowing() / addPreceding() whether the iteration position should stay
    // where it is or move onto the newly added boundary.
    enum UpdatePositionValues {
        RetainCachePosition = 0,
        UpdateCachePosition = 1
    };

    /*
     * Add the boundary following the current position.
     * The current position can be left as it was, or changed to the newly added boundary,
     * as specified by the update parameter.
     */
    void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);

    /*
     * Add the boundary preceding the current position.
     * The current position can be left as it was, or changed to the newly added boundary,
     * as specified by the update parameter.
     */
    bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);

    /**
     * Set the cache position to the specified position, or, if the position
     * falls between two cached boundaries, to the preceding boundary.
     * Fails if the requested position is outside of the range of boundaries currently held by the cache.
     * The startPosition must be on a code point boundary.
     *
     * Return TRUE if successful, FALSE if the specified position is after
     * the last cached boundary or before the first.
     */
    UBool seek(int32_t startPosition);

    void dumpCache();

  private:
    // Reduce an index modulo the buffer size. Masking (rather than %) also maps -1 to
    // CACHE_SIZE - 1, which the callers rely on when stepping backwards.
    static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }

    // Number of entries in the circular cache buffer.
    static constexpr int32_t CACHE_SIZE = 128;
    static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");

    RuleBasedBreakIterator *fBI;
    int32_t fStartBufIdx;              // Buffer index of the first cached boundary.
    int32_t fEndBufIdx;                // Buffer index of the last cached boundary (inclusive).

    int32_t fTextIdx;                  // Text position of the current iteration position.
    int32_t fBufIdx;                   // Buffer index of the current iteration position.

    int32_t fBoundaries[CACHE_SIZE];   // Cached boundary positions.
    uint16_t fStatuses[CACHE_SIZE];    // Rule status index for each cached boundary.

    UVector32 fSideBuffer;             // Scratch storage used by populatePreceding().
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // RBBI_CACHE_H
|
|
@ -14,7 +14,7 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "rbbidata.h"
|
||||
#include "rbbirb.h"
|
||||
#include "utrie.h"
|
||||
#include "utrie2.h"
|
||||
#include "udatamem.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
|
@ -83,11 +83,11 @@ void RBBIDataWrapper::init0() {
|
|||
fReverseTable = NULL;
|
||||
fSafeFwdTable = NULL;
|
||||
fSafeRevTable = NULL;
|
||||
fRuleSource = NULL;
|
||||
fRuleSource = NULL;
|
||||
fRuleStatusTable = NULL;
|
||||
fTrie = NULL;
|
||||
fUDataMem = NULL;
|
||||
fRefCount = 0;
|
||||
fTrie = NULL;
|
||||
fUDataMem = NULL;
|
||||
fRefCount = 0;
|
||||
fDontFreeData = TRUE;
|
||||
}
|
||||
|
||||
|
@ -118,6 +118,14 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
|
||||
}
|
||||
|
||||
// Rule Compatibility Hacks
|
||||
// If a rule set includes reverse rules but does not explicitly include safe reverse rules,
|
||||
// the reverse rules are to be treated as safe reverse rules.
|
||||
|
||||
if (fSafeRevTable == NULL && fReverseTable != NULL) {
|
||||
fSafeRevTable = fReverseTable;
|
||||
fReverseTable = NULL;
|
||||
}
|
||||
|
||||
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
|
|
|
@ -184,11 +184,11 @@ public:
|
|||
/* number of int32_t values in the rule status table. Used to sanity check indexing */
|
||||
int32_t fStatusMaxIdx;
|
||||
|
||||
UTrie2 *fTrie;
|
||||
UTrie2 *fTrie;
|
||||
|
||||
private:
|
||||
u_atomic_int32_t fRefCount;
|
||||
UDataMemory *fUDataMem;
|
||||
UDataMemory *fUDataMem;
|
||||
UnicodeString fRuleString;
|
||||
UBool fDontFreeData;
|
||||
|
||||
|
|
|
@ -24,16 +24,16 @@
|
|||
#include "unicode/uchriter.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
|
||||
#include "rbbirb.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
#include "rbbiscan.h"
|
||||
#include "rbbisetb.h"
|
||||
#include "rbbitblb.h"
|
||||
#include "rbbidata.h"
|
||||
#include "uassert.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -164,8 +164,13 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
|
||||
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
|
||||
|
||||
int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
|
||||
+ safeFwdTableSize + safeRevTableSize
|
||||
(void)safeFwdTableSize;
|
||||
|
||||
int32_t totalSize = headerSize
|
||||
+ forwardTableSize
|
||||
+ /* reverseTableSize */ 0
|
||||
+ /* safeFwdTableSize */ 0
|
||||
+ (safeRevTableSize ? safeRevTableSize : reverseTableSize)
|
||||
+ statusTableSize + trieSize + rulesSize;
|
||||
|
||||
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
|
||||
|
@ -184,16 +189,38 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
data->fLength = totalSize;
|
||||
data->fCatCount = fSetBuilder->getNumCharCategories();
|
||||
|
||||
// Only save the forward table and the safe reverse table,
|
||||
// because these are the only ones used at run-time.
|
||||
//
|
||||
// For the moment, we still build the other tables if they are present in the rule source files,
|
||||
// for backwards compatibility. Old rule files need to work, and this is the simplest approach.
|
||||
//
|
||||
// Additional backwards compatibility consideration: if no safe rules are provided, consider the
|
||||
// reverse rules to actually be the safe reverse rules.
|
||||
|
||||
data->fFTable = headerSize;
|
||||
data->fFTableLen = forwardTableSize;
|
||||
data->fRTable = data->fFTable + forwardTableSize;
|
||||
data->fRTableLen = reverseTableSize;
|
||||
data->fSFTable = data->fRTable + reverseTableSize;
|
||||
data->fSFTableLen = safeFwdTableSize;
|
||||
data->fSRTable = data->fSFTable + safeFwdTableSize;
|
||||
data->fSRTableLen = safeRevTableSize;
|
||||
|
||||
data->fTrie = data->fSRTable + safeRevTableSize;
|
||||
// Do not save Reverse Table.
|
||||
data->fRTable = data->fFTable + forwardTableSize;
|
||||
data->fRTableLen = 0;
|
||||
|
||||
// Do not save the Safe Forward table.
|
||||
data->fSFTable = data->fRTable + 0;
|
||||
data->fSFTableLen = 0;
|
||||
|
||||
data->fSRTable = data->fSFTable + 0;
|
||||
if (safeRevTableSize > 0) {
|
||||
data->fSRTableLen = safeRevTableSize;
|
||||
} else if (reverseTableSize > 0) {
|
||||
data->fSRTableLen = reverseTableSize;
|
||||
} else {
|
||||
U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table
|
||||
// before reaching this point.
|
||||
}
|
||||
|
||||
|
||||
data->fTrie = data->fSRTable + data->fSRTableLen;
|
||||
data->fTrieLen = fSetBuilder->getTrieSize();
|
||||
data->fStatusTable = data->fTrie + trieSize;
|
||||
data->fStatusTableLen= statusTableSize;
|
||||
|
@ -203,9 +230,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
|
||||
fReverseTables->exportTable((uint8_t *)data + data->fRTable);
|
||||
fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
|
||||
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
// fReverseTables->exportTable((uint8_t *)data + data->fRTable);
|
||||
// fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
|
||||
if (safeRevTableSize > 0) {
|
||||
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
} else {
|
||||
fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
}
|
||||
|
||||
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
|
||||
|
||||
int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
|
||||
|
|
|
@ -15,6 +15,9 @@
|
|||
#define RBBIRB_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
@ -207,6 +210,9 @@ struct RBBISetTableEl {
|
|||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@
|
|||
//
|
||||
//------------------------------------------------------------------------------
|
||||
static const UChar gRuleSet_rule_char_pattern[] = {
|
||||
// Characters that may appear as literals in patterns without escaping or quoting.
|
||||
// [ ^ [ \ p { Z } \ u 0 0 2 0
|
||||
0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
|
||||
// - \ u 0 0 7 f ] - [ \ p
|
||||
|
@ -558,6 +559,10 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
|
|||
fRB->fDefaultTree = &fRB->fSafeRevTree;
|
||||
} else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
|
||||
fRB->fLookAheadHardBreak = TRUE;
|
||||
} else if (opt == UNICODE_STRING("quoted_literals_only", 20)) {
|
||||
fRuleSets[kRuleSet_rule_char-128].clear();
|
||||
} else if (opt == UNICODE_STRING("unquoted_literals", 17)) {
|
||||
fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus);
|
||||
} else {
|
||||
error(U_BRK_UNRECOGNIZED_OPTION);
|
||||
}
|
||||
|
|
|
@ -250,12 +250,17 @@ void RBBISetBuilder::build() {
|
|||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number
|
||||
//
|
||||
fTrie = utrie2_open(0, // Initial value for all code points
|
||||
0, // errorValue
|
||||
fTrie = utrie2_open(0, // Initial value for all code points.
|
||||
0, // Error value for out-of-range input.
|
||||
fStatus);
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
utrie2_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar, rlRange->fNum, TRUE, fStatus);
|
||||
for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
|
||||
utrie2_setRange32(fTrie,
|
||||
rlRange->fStartChar, // Range start
|
||||
rlRange->fEndChar, // Range end (inclusive)
|
||||
rlRange->fNum, // value for range
|
||||
TRUE, // Overwrite previously written values
|
||||
fStatus);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -265,7 +270,10 @@ void RBBISetBuilder::build() {
|
|||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RBBISetBuilder::getTrieSize() /*const*/ {
|
||||
int32_t RBBISetBuilder::getTrieSize() {
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return 0;
|
||||
}
|
||||
utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
|
||||
fTrieSize = utrie2_serialize(fTrie,
|
||||
NULL, // Buffer
|
||||
|
|
|
@ -13,6 +13,9 @@
|
|||
#define RBBISETB_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "rbbirb.h"
|
||||
#include "utrie2.h"
|
||||
|
@ -108,8 +111,8 @@ private:
|
|||
|
||||
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
|
||||
|
||||
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
|
||||
uint32_t fTrieSize; // the Unicode Sets.
|
||||
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
|
||||
uint32_t fTrieSize; // the Unicode Sets.
|
||||
|
||||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
|
@ -128,4 +131,7 @@ private:
|
|||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif
|
||||
|
|
|
@ -20,8 +20,11 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
@ -32,6 +35,7 @@
|
|||
#include "unicode/utf.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "bytesinkutil.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
|
@ -39,27 +43,6 @@
|
|||
#include "ucasemap_imp.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO: share with UTF-16? inline in ucasemap_imp.h?
|
||||
int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
|
||||
Edits *edits, UErrorCode &errorCode) {
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
if (destIndex > destCapacity) {
|
||||
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
||||
} else if (edits != NULL) {
|
||||
edits->copyErrorTo(errorCode);
|
||||
}
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/* UCaseMap service object -------------------------------------------------- */
|
||||
|
@ -150,152 +133,39 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
|
|||
|
||||
/* TODO(markus): Move to a new, separate utf8case.cpp file. */
|
||||
|
||||
namespace {
|
||||
|
||||
/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
|
||||
static inline int32_t
|
||||
appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
|
||||
int32_t result, const UChar *s,
|
||||
int32_t cpLength, uint32_t options, icu::Edits *edits) {
|
||||
UChar32 c;
|
||||
int32_t length;
|
||||
UErrorCode errorCode;
|
||||
inline UBool
|
||||
appendResult(int32_t cpLength, int32_t result, const UChar *s,
|
||||
ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
|
||||
U_ASSERT(U_SUCCESS(errorCode));
|
||||
|
||||
/* decode the result */
|
||||
if(result<0) {
|
||||
/* (not) original code point */
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(cpLength);
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
}
|
||||
c=~result;
|
||||
if(destIndex<destCapacity && c<=0x7f) { // ASCII slightly-fastpath
|
||||
dest[destIndex++]=(uint8_t)c;
|
||||
return destIndex;
|
||||
if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
|
||||
}
|
||||
length=cpLength;
|
||||
} else {
|
||||
if(result<=UCASE_MAX_STRING_LENGTH) {
|
||||
// string: "result" is the UTF-16 length
|
||||
if(result==0) {
|
||||
length=0;
|
||||
} else {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
if(destIndex<destCapacity) {
|
||||
u_strToUTF8((char *)(dest+destIndex), destCapacity-destIndex, &length,
|
||||
s, result, &errorCode);
|
||||
} else {
|
||||
u_strToUTF8(NULL, 0, &length, s, result, &errorCode);
|
||||
}
|
||||
if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
|
||||
return -1;
|
||||
}
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
}
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(cpLength, length);
|
||||
}
|
||||
// We might have an overflow, but we know the actual length.
|
||||
return destIndex+length;
|
||||
} else if(destIndex<destCapacity && result<=0x7f) { // ASCII slightly-fastpath
|
||||
dest[destIndex++]=(uint8_t)result;
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(cpLength, 1);
|
||||
}
|
||||
return destIndex;
|
||||
return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
|
||||
} else {
|
||||
c=result;
|
||||
length=U8_LENGTH(c);
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(cpLength, length);
|
||||
}
|
||||
ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
|
||||
}
|
||||
}
|
||||
// c>=0 single code point
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
|
||||
if(destIndex<destCapacity) {
|
||||
/* append the result */
|
||||
UBool isError=FALSE;
|
||||
U8_APPEND(dest, destIndex, destCapacity, c, isError);
|
||||
if(isError) {
|
||||
/* overflow, nothing written */
|
||||
destIndex+=length;
|
||||
}
|
||||
} else {
|
||||
/* preflight */
|
||||
destIndex+=length;
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
static inline int32_t
|
||||
appendASCII(uint8_t *dest, int32_t destIndex, int32_t destCapacity, uint8_t c) {
|
||||
if(destIndex<destCapacity) {
|
||||
dest[destIndex]=c;
|
||||
} else if(destIndex==INT32_MAX) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
return destIndex+1;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// See unicode/utf8.h U8_APPEND_UNSAFE().
|
||||
static inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
|
||||
static inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
|
||||
inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
|
||||
inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
|
||||
|
||||
static inline int32_t
|
||||
appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar32 c) {
|
||||
U_ASSERT(0x370 <= c && c <= 0x3ff); // 2-byte UTF-8, main Greek block
|
||||
if(2>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
int32_t limit=destIndex+2;
|
||||
if(limit<=destCapacity) {
|
||||
dest+=destIndex;
|
||||
dest[0]=getTwoByteLead(c);
|
||||
dest[1]=getTwoByteTrail(c);
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
static inline int32_t
|
||||
appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, const char *s) {
|
||||
if(2>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
int32_t limit=destIndex+2;
|
||||
if(limit<=destCapacity) {
|
||||
dest+=destIndex;
|
||||
dest[0]=(uint8_t)s[0];
|
||||
dest[1]=(uint8_t)s[1];
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
static inline int32_t
|
||||
appendUnchanged(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
|
||||
const uint8_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
|
||||
if(length>0) {
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(length);
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
}
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
if((destIndex+length)<=destCapacity) {
|
||||
uprv_memcpy(dest+destIndex, s, length);
|
||||
}
|
||||
destIndex+=length;
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
static UChar32 U_CALLCONV
|
||||
utf8_caseContextIterator(void *context, int8_t dir) {
|
||||
|
@ -333,17 +203,15 @@ utf8_caseContextIterator(void *context, int8_t dir) {
|
|||
* Case-maps [srcStart..srcLimit[ but takes
|
||||
* context [0..srcLength[ into account.
|
||||
*/
|
||||
static int32_t
|
||||
static void
|
||||
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, UCaseContext *csc,
|
||||
int32_t srcStart, int32_t srcLimit,
|
||||
icu::Edits *edits,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
int32_t srcIndex=srcStart;
|
||||
int32_t destIndex=0;
|
||||
while(srcIndex<srcLimit) {
|
||||
while (U_SUCCESS(errorCode) && srcIndex<srcLimit) {
|
||||
int32_t cpStart;
|
||||
csc->cpStart=cpStart=srcIndex;
|
||||
UChar32 c;
|
||||
|
@ -351,45 +219,32 @@ _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
|
|||
csc->cpLimit=srcIndex;
|
||||
if(c<0) {
|
||||
// Malformed UTF-8.
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+cpStart, srcIndex-cpStart, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
const UChar *s;
|
||||
c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
|
||||
sink, options, edits, errorCode);
|
||||
} else {
|
||||
const UChar *s;
|
||||
c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
|
||||
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
U_CFUNC void U_CALLCONV
|
||||
ucasemap_internalUTF8ToTitle(
|
||||
int32_t caseLocale, uint32_t options, BreakIterator *iter,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
|
||||
return 0;
|
||||
return;
|
||||
}
|
||||
|
||||
/* set up local variables */
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
int32_t destIndex=0;
|
||||
int32_t prev=0;
|
||||
UBool isFirstIndex=TRUE;
|
||||
|
||||
|
@ -434,11 +289,9 @@ ucasemap_internalUTF8ToTitle(
|
|||
U8_NEXT(src, titleLimit, index, c);
|
||||
}
|
||||
if (prev < titleStart) {
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+prev, titleStart-prev, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
|
||||
sink, options, edits, errorCode)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -450,16 +303,15 @@ ucasemap_internalUTF8ToTitle(
|
|||
csc.cpLimit=titleLimit;
|
||||
const UChar *s;
|
||||
c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
|
||||
destIndex=appendResult(dest, destIndex, destCapacity, c, s,
|
||||
titleLimit-titleStart, options, edits);
|
||||
if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// Malformed UTF-8.
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+titleStart, titleLimit-titleStart, options, edits);
|
||||
}
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
|
||||
sink, options, edits, errorCode)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Special case Dutch IJ titlecasing */
|
||||
|
@ -467,22 +319,13 @@ ucasemap_internalUTF8ToTitle(
|
|||
caseLocale == UCASE_LOC_DUTCH &&
|
||||
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
|
||||
if (src[titleStart+1] == 0x006A) {
|
||||
destIndex=appendASCII(dest, destIndex, destCapacity, 0x004A);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
|
||||
titleLimit++;
|
||||
} else if (src[titleStart+1] == 0x004A) {
|
||||
// Keep the capital J from getting lowercased.
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+titleStart+1, 1, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
|
||||
sink, options, edits, errorCode)) {
|
||||
return;
|
||||
}
|
||||
titleLimit++;
|
||||
}
|
||||
|
@ -492,26 +335,18 @@ ucasemap_internalUTF8ToTitle(
|
|||
if(titleLimit<index) {
|
||||
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
|
||||
/* Normal operation: Lowercase the rest of the word. */
|
||||
destIndex+=
|
||||
_caseMap(
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
dest+destIndex, destCapacity-destIndex,
|
||||
src, &csc,
|
||||
titleLimit, index,
|
||||
edits, errorCode);
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
}
|
||||
_caseMap(caseLocale, options, ucase_toFullLower,
|
||||
src, &csc,
|
||||
titleLimit, index,
|
||||
sink, edits, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return destIndex;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
/* Optionally just copy the rest of the word unchanged. */
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+titleLimit, index-titleLimit, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
|
||||
sink, options, edits, errorCode)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -520,8 +355,6 @@ ucasemap_internalUTF8ToTitle(
|
|||
|
||||
prev=index;
|
||||
}
|
||||
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -546,12 +379,10 @@ UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
|
|||
}
|
||||
|
||||
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
|
||||
int32_t toUpper(uint32_t options,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
int32_t destIndex=0;
|
||||
void toUpper(uint32_t options,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
uint32_t state = 0;
|
||||
for (int32_t i = 0; i < srcLength;) {
|
||||
int32_t nextIndex = i;
|
||||
|
@ -627,8 +458,10 @@ int32_t toUpper(uint32_t options,
|
|||
}
|
||||
}
|
||||
|
||||
UBool change = TRUE;
|
||||
if (edits != NULL) {
|
||||
UBool change;
|
||||
if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
change = TRUE; // common, simple usage
|
||||
} else {
|
||||
// Find out first whether we are changing the text.
|
||||
U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block
|
||||
change = (i + 2) > nextIndex ||
|
||||
|
@ -664,143 +497,141 @@ int32_t toUpper(uint32_t options,
|
|||
}
|
||||
|
||||
if (change) {
|
||||
destIndex=appendTwoBytes(dest, destIndex, destCapacity, upper);
|
||||
if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0308"); // restore or add a dialytika
|
||||
ByteSinkUtil::appendTwoBytes(upper, sink);
|
||||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
sink.Append(u8"\u0308", 2); // restore or add a dialytika
|
||||
}
|
||||
if (destIndex >= 0 && addTonos) {
|
||||
destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0301");
|
||||
if (addTonos) {
|
||||
sink.Append(u8"\u0301", 2);
|
||||
}
|
||||
while (destIndex >= 0 && numYpogegrammeni > 0) {
|
||||
destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0399");
|
||||
while (numYpogegrammeni > 0) {
|
||||
sink.Append(u8"\u0399", 2);
|
||||
--numYpogegrammeni;
|
||||
}
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
} else if(c>=0) {
|
||||
const UChar *s;
|
||||
c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
nextIndex - i, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// Malformed UTF-8.
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+i, nextIndex-i, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
|
||||
sink, options, edits, errorCode)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
i = nextIndex;
|
||||
state = nextState;
|
||||
}
|
||||
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
} // namespace GreekUpper
|
||||
U_NAMESPACE_END
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
static void U_CALLCONV
|
||||
ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
int32_t destIndex = _caseMap(
|
||||
_caseMap(
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
edits, errorCode);
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
sink, edits, errorCode);
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
static void U_CALLCONV
|
||||
ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
int32_t destIndex;
|
||||
if (caseLocale == UCASE_LOC_GREEK) {
|
||||
destIndex = GreekUpper::toUpper(options, dest, destCapacity,
|
||||
src, srcLength, edits, errorCode);
|
||||
GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
|
||||
} else {
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
destIndex = _caseMap(
|
||||
_caseMap(
|
||||
caseLocale, options, ucase_toFullUpper,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
edits, errorCode);
|
||||
sink, edits, errorCode);
|
||||
}
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
static void U_CALLCONV
|
||||
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
int32_t srcIndex = 0;
|
||||
int32_t destIndex = 0;
|
||||
while (srcIndex < srcLength) {
|
||||
while (U_SUCCESS(errorCode) && srcIndex < srcLength) {
|
||||
int32_t cpStart = srcIndex;
|
||||
UChar32 c;
|
||||
U8_NEXT(src, srcIndex, srcLength, c);
|
||||
if(c<0) {
|
||||
// Malformed UTF-8.
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+cpStart, srcIndex-cpStart, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
const UChar *s;
|
||||
c = ucase_toFullFolding(c, &s, options);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
|
||||
sink, options, edits, errorCode);
|
||||
} else {
|
||||
const UChar *s;
|
||||
c = ucase_toFullFolding(c, &s, options);
|
||||
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
void
|
||||
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
const char *src, int32_t srcLength,
|
||||
UTF8CaseMapper *stringCaseMapper,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* check argument values */
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if ((src == nullptr && srcLength != 0) || srcLength < -1) {
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the string length.
|
||||
if (srcLength == -1) {
|
||||
srcLength = (int32_t)uprv_strlen((const char *)src);
|
||||
}
|
||||
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
(const uint8_t *)src, srcLength, sink, edits, errorCode);
|
||||
sink.Flush();
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
if (edits != nullptr) {
|
||||
edits->copyErrorTo(errorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int32_t
|
||||
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
char *dest, int32_t destCapacity,
|
||||
const char *src, int32_t srcLength,
|
||||
UTF8CaseMapper *stringCaseMapper,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
int32_t destLength;
|
||||
|
||||
/* check argument values */
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if( destCapacity<0 ||
|
||||
(dest==NULL && destCapacity>0) ||
|
||||
src==NULL ||
|
||||
srcLength<-1
|
||||
(src==NULL && srcLength!=0) || srcLength<-1
|
||||
) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
|
@ -820,12 +651,21 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P
|
|||
return 0;
|
||||
}
|
||||
|
||||
CheckedArrayByteSink sink(dest, destCapacity);
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
dest, destCapacity, src, srcLength, edits, errorCode);
|
||||
return u_terminateChars((char *)dest, destCapacity, destLength, &errorCode);
|
||||
stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
(const uint8_t *)src, srcLength, sink, edits, errorCode);
|
||||
sink.Flush();
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
if (sink.Overflowed()) {
|
||||
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
||||
} else if (edits != nullptr) {
|
||||
edits->copyErrorTo(errorCode);
|
||||
}
|
||||
}
|
||||
return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
|
||||
}
|
||||
|
||||
/* public API functions */
|
||||
|
@ -837,8 +677,8 @@ ucasemap_utf8ToLower(const UCaseMap *csm,
|
|||
UErrorCode *pErrorCode) {
|
||||
return ucasemap_mapUTF8(
|
||||
csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
|
||||
}
|
||||
|
||||
|
@ -849,8 +689,8 @@ ucasemap_utf8ToUpper(const UCaseMap *csm,
|
|||
UErrorCode *pErrorCode) {
|
||||
return ucasemap_mapUTF8(
|
||||
csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
|
||||
}
|
||||
|
||||
|
@ -861,13 +701,43 @@ ucasemap_utf8FoldCase(const UCaseMap *csm,
|
|||
UErrorCode *pErrorCode) {
|
||||
return ucasemap_mapUTF8(
|
||||
UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8Fold, NULL, *pErrorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * ByteSink-based UTF-8 lowercasing entry point.
 * Forwards to ucasemap_mapUTF8() with ucasemap_internalUTF8ToLower as the mapper,
 * using ustrcase_getCaseLocale(locale) as the case locale and
 * UCASEMAP_BREAK_ITERATOR_NULL in the break-iterator slot.
 * There is no return value: output goes to sink and failures are reported via errorCode.
 *
 * @param locale    locale ID; converted with ustrcase_getCaseLocale()
 * @param options   option bits, passed through unchanged
 * @param src       UTF-8 input; its data() and length() are forwarded
 * @param sink      receives the case-mapped bytes
 * @param edits     passed through to the mapper
 *                  (NOTE(review): presumably may be nullptr -- confirm against the API docs)
 * @param errorCode in/out error code, passed through
 */
void CaseMap::utf8ToLower(
|
||||
const char *locale, uint32_t options,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
ucasemap_mapUTF8(
|
||||
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
src.data(), src.length(),
|
||||
ucasemap_internalUTF8ToLower, sink, edits, errorCode);
|
||||
}
|
||||
|
||||
/**
 * ByteSink-based UTF-8 uppercasing entry point.
 * Forwards to ucasemap_mapUTF8() with ucasemap_internalUTF8ToUpper as the mapper,
 * using ustrcase_getCaseLocale(locale) as the case locale and
 * UCASEMAP_BREAK_ITERATOR_NULL in the break-iterator slot.
 * There is no return value: output goes to sink and failures are reported via errorCode.
 *
 * @param locale    locale ID; converted with ustrcase_getCaseLocale()
 * @param options   option bits, passed through unchanged
 * @param src       UTF-8 input; its data() and length() are forwarded
 * @param sink      receives the case-mapped bytes
 * @param edits     passed through to the mapper
 *                  (NOTE(review): presumably may be nullptr -- confirm against the API docs)
 * @param errorCode in/out error code, passed through
 */
void CaseMap::utf8ToUpper(
|
||||
const char *locale, uint32_t options,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
ucasemap_mapUTF8(
|
||||
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
src.data(), src.length(),
|
||||
ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
|
||||
}
|
||||
|
||||
/**
 * ByteSink-based UTF-8 case-folding entry point.
 * Forwards to ucasemap_mapUTF8() with ucasemap_internalUTF8Fold as the mapper.
 * Unlike the lower/upper variants this takes no locale parameter and always
 * passes UCASE_LOC_ROOT as the case locale.
 * There is no return value: output goes to sink and failures are reported via errorCode.
 *
 * @param options   option bits, passed through unchanged
 * @param src       UTF-8 input; its data() and length() are forwarded
 * @param sink      receives the case-folded bytes
 * @param edits     passed through to the mapper
 *                  (NOTE(review): presumably may be nullptr -- confirm against the API docs)
 * @param errorCode in/out error code, passed through
 */
void CaseMap::utf8Fold(
|
||||
uint32_t options,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
ucasemap_mapUTF8(
|
||||
UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
src.data(), src.length(),
|
||||
ucasemap_internalUTF8Fold, sink, edits, errorCode);
|
||||
}
|
||||
|
||||
int32_t CaseMap::utf8ToLower(
|
||||
const char *locale, uint32_t options,
|
||||
const char *src, int32_t srcLength,
|
||||
|
@ -875,8 +745,8 @@ int32_t CaseMap::utf8ToLower(
|
|||
UErrorCode &errorCode) {
|
||||
return ucasemap_mapUTF8(
|
||||
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8ToLower, edits, errorCode);
|
||||
}
|
||||
|
||||
|
@ -887,8 +757,8 @@ int32_t CaseMap::utf8ToUpper(
|
|||
UErrorCode &errorCode) {
|
||||
return ucasemap_mapUTF8(
|
||||
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8ToUpper, edits, errorCode);
|
||||
}
|
||||
|
||||
|
@ -899,8 +769,8 @@ int32_t CaseMap::utf8Fold(
|
|||
UErrorCode &errorCode) {
|
||||
return ucasemap_mapUTF8(
|
||||
UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8Fold, edits, errorCode);
|
||||
}
|
||||
|
||||
|
|
|
@ -73,6 +73,8 @@ uprv_haveProperties(UErrorCode *pErrorCode);
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class ByteSink;
|
||||
|
||||
/** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */
|
||||
inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return FALSE; }
|
||||
|
@ -207,39 +209,43 @@ ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITE
|
|||
* UTF-8 version of UStringCaseMapper.
|
||||
* All error checking must be done.
|
||||
* The UCaseMap must be fully initialized, with locale and/or iter set as needed.
|
||||
* src and dest must not overlap.
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
typedef void U_CALLCONV
|
||||
UTF8CaseMapper(int32_t caseLocale, uint32_t options,
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
icu::BreakIterator *iter,
|
||||
#endif
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/** Implements UTF8CaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
U_CFUNC void U_CALLCONV
|
||||
ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
|
||||
icu::BreakIterator *iter,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#endif
|
||||
|
||||
/**
 * ByteSink variant of the common UTF-8 case-mapping driver.
 * As implemented by the matching definition in this change, it:
 * - returns immediately if errorCode already indicates a failure;
 * - sets U_ILLEGAL_ARGUMENT_ERROR if src is null with a non-zero srcLength,
 *   or if srcLength is less than -1;
 * - treats srcLength == -1 as "NUL-terminated" and computes the length;
 * - resets edits (when non-null) unless U_EDITS_NO_RESET is set in options;
 * - calls stringCaseMapper to write the mapped text to sink, then flushes sink;
 * - on success, copies any error recorded in edits into errorCode.
 * It returns nothing; callers observe the outcome through sink, edits and errorCode.
 */
void
|
||||
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
const char *src, int32_t srcLength,
|
||||
UTF8CaseMapper *stringCaseMapper,
|
||||
icu::ByteSink &sink, icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Implements argument checking and buffer handling
|
||||
* for UTF-8 string case mapping as a common function.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
int32_t
|
||||
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
char *dest, int32_t destCapacity,
|
||||
const char *src, int32_t srcLength,
|
||||
UTF8CaseMapper *stringCaseMapper,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
|
|
@ -31,6 +31,29 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * ByteSink-based UTF-8 titlecasing entry point.
 * Wraps src in a stack UText, obtains the break iterator to use from
 * ustrcase_getTitleBreakIterator(), points it at the text, and then forwards to
 * ucasemap_mapUTF8() with ucasemap_internalUTF8ToTitle as the mapper.
 * There is no return value: output goes to sink and failures are reported via errorCode.
 *
 * @param locale    locale ID; used for the break-iterator lookup and,
 *                  via ustrcase_getCaseLocale(), as the case locale
 * @param options   option bits, passed through unchanged
 * @param iter      caller-supplied break iterator; replaced by whatever
 *                  ustrcase_getTitleBreakIterator() returns
 * @param src       UTF-8 input; its data() and length() are forwarded
 * @param sink      receives the titlecased bytes
 * @param edits     passed through to the mapper
 * @param errorCode in/out error code; nothing is done if it already indicates a failure
 */
void CaseMap::utf8ToTitle(
|
||||
const char *locale, uint32_t options, BreakIterator *iter,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
UText utext = UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&utext, src.data(), src.length(), &errorCode);
|
||||
// NOTE(review): ownedIter presumably takes ownership of an iterator created by the
|
||||
// helper below so that it is released when this function returns -- confirm.
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
|
||||
// No usable iterator: close the UText opened above before bailing out.
|
||||
if (iter == nullptr) {
|
||||
utext_close(&utext);
|
||||
return;
|
||||
}
|
||||
iter->setText(&utext, errorCode);
|
||||
ucasemap_mapUTF8(
|
||||
ustrcase_getCaseLocale(locale), options, iter,
|
||||
src.data(), src.length(),
|
||||
ucasemap_internalUTF8ToTitle, sink, edits, errorCode);
|
||||
// Always close the UText on the normal path as well.
|
||||
utext_close(&utext);
|
||||
}
|
||||
|
||||
int32_t CaseMap::utf8ToTitle(
|
||||
const char *locale, uint32_t options, BreakIterator *iter,
|
||||
const char *src, int32_t srcLength,
|
||||
|
@ -50,8 +73,8 @@ int32_t CaseMap::utf8ToTitle(
|
|||
iter->setText(&utext, errorCode);
|
||||
int32_t length=ucasemap_mapUTF8(
|
||||
ustrcase_getCaseLocale(locale), options, iter,
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8ToTitle, edits, errorCode);
|
||||
utext_close(&utext);
|
||||
return length;
|
||||
|
@ -101,8 +124,8 @@ ucasemap_utf8ToTitle(UCaseMap *csm,
|
|||
csm->iter->setText(&utext, *pErrorCode);
|
||||
int32_t length=ucasemap_mapUTF8(
|
||||
csm->caseLocale, csm->options, csm->iter,
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ucasemap_internalUTF8ToTitle, NULL, *pErrorCode);
|
||||
utext_close(&utext);
|
||||
return length;
|
||||
|
|
|
@ -1323,9 +1323,17 @@ _UTF16GetName(const UConverter *cnv) {
|
|||
U_CDECL_END
|
||||
extern const UConverterSharedData _UTF16Data;
|
||||
|
||||
#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
|
||||
#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
|
||||
#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
|
||||
/** Returns true if cnv's sharedData pointer is the address of _UTF16BEData. */
static inline bool IS_UTF16BE(const UConverter *cnv) {
|
||||
return ((cnv)->sharedData == &_UTF16BEData);
|
||||
}
|
||||
|
||||
/** Returns true if cnv's sharedData pointer is the address of _UTF16LEData. */
static inline bool IS_UTF16LE(const UConverter *cnv) {
|
||||
return ((cnv)->sharedData == &_UTF16LEData);
|
||||
}
|
||||
|
||||
/**
 * Returns true if cnv's sharedData pointer is the address of either
 * _UTF16Data or _UTF16v2Data.
 */
static inline bool IS_UTF16(const UConverter *cnv) {
|
||||
return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
|
||||
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static void U_CALLCONV
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
#include "ucnv_bld.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* Prototypes --------------------------------------------------------------- */
|
||||
|
||||
|
@ -44,51 +45,13 @@ U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args
|
|||
|
||||
/* UTF-8 -------------------------------------------------------------------- */
|
||||
|
||||
/* UTF-8 Conversion DATA
|
||||
* for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
|
||||
*/
|
||||
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
|
||||
#define MAXIMUM_UCS2 0x0000FFFF
|
||||
#define MAXIMUM_UTF 0x0010FFFF
|
||||
#define MAXIMUM_UCS4 0x7FFFFFFF
|
||||
#define HALF_SHIFT 10
|
||||
#define HALF_BASE 0x0010000
|
||||
#define HALF_MASK 0x3FF
|
||||
#define SURROGATE_HIGH_START 0xD800
|
||||
#define SURROGATE_HIGH_END 0xDBFF
|
||||
#define SURROGATE_LOW_START 0xDC00
|
||||
#define SURROGATE_LOW_END 0xDFFF
|
||||
|
||||
/* -SURROGATE_LOW_START + HALF_BASE */
|
||||
#define SURROGATE_LOW_BASE 9216
|
||||
|
||||
static const uint32_t offsetsFromUTF8[7] = {0,
|
||||
static const uint32_t offsetsFromUTF8[5] = {0,
|
||||
(uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
|
||||
(uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
|
||||
(uint32_t) 0x03C82080
|
||||
};
|
||||
|
||||
/* END OF UTF-8 Conversion DATA */
|
||||
|
||||
static const int8_t bytesFromUTF8[256] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
|
||||
};
|
||||
|
||||
/*
|
||||
* Starting with Unicode 3.0.1:
|
||||
* UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
|
||||
* byte sequences with more than 4 bytes are illegal in UTF-8,
|
||||
* which is tested with impossible values for them
|
||||
*/
|
||||
static const uint32_t
|
||||
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
|
||||
|
||||
static UBool hasCESU8Data(const UConverter *cnv)
|
||||
{
|
||||
#if UCONFIG_ONLY_HTML_CONVERSION
|
||||
|
@ -127,7 +90,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
|
|||
while (mySource < sourceLimit && myTarget < targetLimit)
|
||||
{
|
||||
ch = *(mySource++);
|
||||
if (ch < 0x80) /* Simple case */
|
||||
if (U8_IS_SINGLE(ch)) /* Simple case */
|
||||
{
|
||||
*(myTarget++) = (UChar) ch;
|
||||
}
|
||||
|
@ -135,7 +98,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
|
|||
{
|
||||
/* store the first char */
|
||||
toUBytes[0] = (char)ch;
|
||||
inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
|
||||
inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
|
||||
i = 1;
|
||||
|
||||
morebytes:
|
||||
|
@ -144,7 +107,8 @@ morebytes:
|
|||
if (mySource < sourceLimit)
|
||||
{
|
||||
toUBytes[i] = (char) (ch2 = *mySource);
|
||||
if (!U8_IS_TRAIL(ch2))
|
||||
if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
|
||||
!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
|
||||
{
|
||||
break; /* i < inBytes */
|
||||
}
|
||||
|
@ -162,24 +126,12 @@ morebytes:
|
|||
}
|
||||
}
|
||||
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
|
||||
(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
|
||||
// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
if (i == inBytes && (!isCESU8 || i <= 3))
|
||||
{
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= MAXIMUM_UCS2)
|
||||
{
|
||||
|
@ -189,9 +141,8 @@ morebytes:
|
|||
else
|
||||
{
|
||||
/* write out the surrogates */
|
||||
ch -= HALF_BASE;
|
||||
*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
|
||||
ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
|
||||
*(myTarget++) = U16_LEAD(ch);
|
||||
ch = U16_TRAIL(ch);
|
||||
if (myTarget < targetLimit)
|
||||
{
|
||||
*(myTarget++) = (UChar)ch;
|
||||
|
@ -256,7 +207,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
|
|||
while (mySource < sourceLimit && myTarget < targetLimit)
|
||||
{
|
||||
ch = *(mySource++);
|
||||
if (ch < 0x80) /* Simple case */
|
||||
if (U8_IS_SINGLE(ch)) /* Simple case */
|
||||
{
|
||||
*(myTarget++) = (UChar) ch;
|
||||
*(myOffsets++) = offsetNum++;
|
||||
|
@ -264,7 +215,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
|
|||
else
|
||||
{
|
||||
toUBytes[0] = (char)ch;
|
||||
inBytes = bytesFromUTF8[ch];
|
||||
inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
|
||||
i = 1;
|
||||
|
||||
morebytes:
|
||||
|
@ -273,7 +224,8 @@ morebytes:
|
|||
if (mySource < sourceLimit)
|
||||
{
|
||||
toUBytes[i] = (char) (ch2 = *mySource);
|
||||
if (!U8_IS_TRAIL(ch2))
|
||||
if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
|
||||
!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
|
||||
{
|
||||
break; /* i < inBytes */
|
||||
}
|
||||
|
@ -290,24 +242,12 @@ morebytes:
|
|||
}
|
||||
}
|
||||
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
|
||||
(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
|
||||
// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
if (i == inBytes && (!isCESU8 || i <= 3))
|
||||
{
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= offsetsFromUTF8[inBytes];
|
||||
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= MAXIMUM_UCS2)
|
||||
{
|
||||
|
@ -318,10 +258,9 @@ morebytes:
|
|||
else
|
||||
{
|
||||
/* write out the surrogates */
|
||||
ch -= HALF_BASE;
|
||||
*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
|
||||
*(myTarget++) = U16_LEAD(ch);
|
||||
*(myOffsets++) = offsetNum;
|
||||
ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
|
||||
ch = U16_TRAIL(ch);
|
||||
if (myTarget < targetLimit)
|
||||
{
|
||||
*(myTarget++) = (UChar)ch;
|
||||
|
@ -616,10 +555,9 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
|||
UConverter *cnv;
|
||||
const uint8_t *sourceInitial;
|
||||
const uint8_t *source;
|
||||
uint16_t extraBytesToWrite;
|
||||
uint8_t myByte;
|
||||
UChar32 ch;
|
||||
int8_t i, isLegalSequence;
|
||||
int8_t i;
|
||||
|
||||
/* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
|
||||
|
||||
|
@ -633,14 +571,14 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
|||
}
|
||||
|
||||
myByte = (uint8_t)*(source++);
|
||||
if (myByte < 0x80)
|
||||
if (U8_IS_SINGLE(myByte))
|
||||
{
|
||||
args->source = (const char *)source;
|
||||
return (UChar32)myByte;
|
||||
}
|
||||
|
||||
extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
|
||||
if (extraBytesToWrite == 0) {
|
||||
uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
|
||||
if (countTrailBytes == 0) {
|
||||
cnv->toUBytes[0] = myByte;
|
||||
cnv->toULength = 1;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
|
@ -649,15 +587,17 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
|||
}
|
||||
|
||||
/*The byte sequence is longer than the buffer area passed*/
|
||||
if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
|
||||
if (((const char *)source + countTrailBytes) > args->sourceLimit)
|
||||
{
|
||||
/* check if all of the remaining bytes are trail bytes */
|
||||
uint16_t extraBytesToWrite = countTrailBytes + 1;
|
||||
cnv->toUBytes[0] = myByte;
|
||||
i = 1;
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
while(source < (const uint8_t *)args->sourceLimit) {
|
||||
if(U8_IS_TRAIL(myByte = *source)) {
|
||||
cnv->toUBytes[i++] = myByte;
|
||||
uint8_t b = *source;
|
||||
if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
|
||||
cnv->toUBytes[i++] = b;
|
||||
++source;
|
||||
} else {
|
||||
/* error even before we run out of input */
|
||||
|
@ -670,81 +610,28 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
|||
return 0xffff;
|
||||
}
|
||||
|
||||
isLegalSequence = 1;
|
||||
ch = myByte << 6;
|
||||
switch(extraBytesToWrite)
|
||||
{
|
||||
/* note: code falls through cases! (sic)*/
|
||||
case 6:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
if(countTrailBytes == 2) {
|
||||
uint8_t t1 = *source, t2;
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
|
||||
args->source = (const char *)(source + 1);
|
||||
return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 5:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
} else if(countTrailBytes == 1) {
|
||||
uint8_t t1 = *source;
|
||||
if(U8_IS_TRAIL(t1)) {
|
||||
args->source = (const char *)(source + 1);
|
||||
return (ch + t1) - offsetsFromUTF8[2];
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 4:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
} else { // countTrailBytes == 3
|
||||
uint8_t t1 = *source, t2, t3;
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
|
||||
U8_IS_TRAIL(t3 = *++source)) {
|
||||
args->source = (const char *)(source + 1);
|
||||
return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 3:
|
||||
ch += (myByte = *source);
|
||||
ch <<= 6;
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
ch += (myByte = *source);
|
||||
if (!U8_IS_TRAIL(myByte))
|
||||
{
|
||||
isLegalSequence = 0;
|
||||
break;
|
||||
}
|
||||
++source;
|
||||
};
|
||||
ch -= offsetsFromUTF8[extraBytesToWrite];
|
||||
args->source = (const char *)source;
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
*/
|
||||
if (isLegalSequence &&
|
||||
(uint32_t)ch <= MAXIMUM_UTF &&
|
||||
(uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
|
||||
!U_IS_SURROGATE(ch)
|
||||
) {
|
||||
return ch; /* return the code point */
|
||||
}
|
||||
args->source = (const char *)source;
|
||||
|
||||
for(i = 0; sourceInitial < source; ++i) {
|
||||
cnv->toUBytes[i] = *sourceInitial++;
|
||||
|
@ -757,14 +644,6 @@ U_CDECL_END
|
|||
|
||||
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
|
||||
|
||||
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
|
||||
static const UChar32
|
||||
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
|
||||
static const UChar32
|
||||
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
|
||||
|
||||
U_CDECL_BEGIN
|
||||
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
|
||||
static void U_CALLCONV
|
||||
|
@ -812,39 +691,35 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
return;
|
||||
} else {
|
||||
/*
|
||||
* Use a single counter for source and target, counting the minimum of
|
||||
* the source length and the target capacity.
|
||||
* As a result, the source length is checked only once per multi-byte
|
||||
* character instead of twice.
|
||||
*
|
||||
* Make sure that the last byte sequence is complete, or else
|
||||
* stop just before it.
|
||||
* (The longest legal byte sequence has 3 trail bytes.)
|
||||
* Count oldToULength (number of source bytes from a previous buffer)
|
||||
* into the source length but reduce the source index by toULimit
|
||||
* while going back over trail bytes in order to not go back into
|
||||
* the bytes that will be read for finishing a partial
|
||||
* sequence from the previous buffer.
|
||||
* Let the standard converter handle edge cases.
|
||||
*/
|
||||
int32_t i;
|
||||
|
||||
// Use a single counter for source and target, counting the minimum of
|
||||
// the source length and the target capacity.
|
||||
// Let the standard converter handle edge cases.
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(i<3 && i<(count-toULimit)) {
|
||||
b=source[count-oldToULength-i-1];
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++i;
|
||||
} else {
|
||||
if(i<U8_COUNT_TRAIL_BYTES(b)) {
|
||||
/* stop converting before the lead byte if there are not enough trail bytes for it */
|
||||
count-=i+1;
|
||||
// The conversion loop checks count>0 only once per 1/2/3-byte character.
|
||||
// If the buffer ends with a truncated 2- or 3-byte sequence,
|
||||
// then we reduce the count to stop before that,
|
||||
// and collect the remaining bytes after the conversion loop.
|
||||
{
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=count-toULimit;
|
||||
if(length>0) {
|
||||
uint8_t b1=*(sourceLimit-1);
|
||||
if(U8_IS_SINGLE(b1)) {
|
||||
// common ASCII character
|
||||
} else if(U8_IS_TRAIL(b1) && length>=2) {
|
||||
uint8_t b2=*(sourceLimit-2);
|
||||
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
// truncated 3-byte sequence
|
||||
count-=2;
|
||||
}
|
||||
} else if(0xc2<=b1 && b1<0xf0) {
|
||||
// truncated 2- or 3-byte sequence
|
||||
--count;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -859,17 +734,17 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
/* conversion loop */
|
||||
while(count>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
*target++=b;
|
||||
--count;
|
||||
continue;
|
||||
} else {
|
||||
if(b>0xe0) {
|
||||
if( /* handle U+1000..U+D7FF inline */
|
||||
(t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
|
||||
(b==0xed && (t1 <= 0x9f))) &&
|
||||
(t2=source[1]) >= 0x80 && t2 <= 0xbf
|
||||
if(b>=0xe0) {
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
b<0xf0 &&
|
||||
U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
|
||||
U8_IS_TRAIL(t2=source[1])
|
||||
) {
|
||||
source+=2;
|
||||
*target++=b;
|
||||
|
@ -878,10 +753,10 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
count-=3;
|
||||
continue;
|
||||
}
|
||||
} else if(b<0xe0) {
|
||||
} else {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
b>=0xc2 &&
|
||||
(t1=*source) >= 0x80 && t1 <= 0xbf
|
||||
U8_IS_TRAIL(t1=*source)
|
||||
) {
|
||||
++source;
|
||||
*target++=b;
|
||||
|
@ -889,30 +764,18 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
count-=2;
|
||||
continue;
|
||||
}
|
||||
} else if(b==0xe0) {
|
||||
if( /* handle U+0800..U+0FFF inline */
|
||||
(t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
|
||||
(t2=source[1]) >= 0x80 && t2 <= 0xbf
|
||||
) {
|
||||
source+=2;
|
||||
*target++=b;
|
||||
*target++=t1;
|
||||
*target++=t2;
|
||||
count-=3;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle "complicated" and error cases, and continuing partial characters */
|
||||
oldToULength=0;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
|
||||
c=b;
|
||||
moreBytes:
|
||||
while(toULength<toULimit) {
|
||||
if(source<sourceLimit) {
|
||||
b=*source;
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
|
||||
++source;
|
||||
++toULength;
|
||||
c=(c<<6)+b;
|
||||
|
@ -934,18 +797,7 @@ moreBytes:
|
|||
}
|
||||
}
|
||||
|
||||
if( toULength==toULimit && /* consumed all trail bytes */
|
||||
(toULength==3 || toULength==2) && /* BMP */
|
||||
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
|
||||
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
|
||||
) {
|
||||
/* legal byte sequence for BMP code point */
|
||||
} else if(
|
||||
toULength==toULimit && toULength==4 &&
|
||||
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
|
||||
) {
|
||||
/* legal byte sequence for supplementary code point */
|
||||
} else {
|
||||
if(toULength!=toULimit) {
|
||||
/* error handling: illegal UTF-8 byte sequence */
|
||||
source-=(toULength-oldToULength);
|
||||
while(oldToULength<toULength) {
|
||||
|
@ -979,7 +831,7 @@ moreBytes:
|
|||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
} else {
|
||||
b=*source;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES(b);
|
||||
if(toULimit>(sourceLimit-source)) {
|
||||
/* collect a truncated byte sequence */
|
||||
toULength=0;
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "unicode/utf8.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* control optimizations according to the platform */
|
||||
#define LATIN1_UNROLL_FROM_UNICODE 1
|
||||
|
@ -374,7 +375,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
while(source<sourceLimit) {
|
||||
if(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
*target++=(uint8_t)b;
|
||||
--targetCapacity;
|
||||
|
@ -409,7 +410,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
|
||||
utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
|
||||
utf8->toULength=1;
|
||||
utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
utf8->mode=U8_COUNT_BYTES(b);
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
|
|
|
@ -59,6 +59,7 @@
|
|||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "umutex.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* control optimizations according to the platform */
|
||||
#define MBCS_UNROLL_SINGLE_TO_BMP 1
|
||||
|
@ -5011,13 +5012,9 @@ ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
|
|||
|
||||
/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
|
||||
|
||||
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
|
||||
static const UChar32
|
||||
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
|
||||
static const UChar32
|
||||
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
|
||||
utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
|
||||
|
||||
static void U_CALLCONV
|
||||
ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
|
@ -5075,28 +5072,27 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
toULength=oldToULength=toULimit=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the last byte sequence before sourceLimit is complete
|
||||
* or runs into a lead byte.
|
||||
* Do not go back into the bytes that will be read for finishing a partial
|
||||
* sequence from the previous buffer.
|
||||
* In the conversion loop compare source with sourceLimit only once
|
||||
* per multi-byte character.
|
||||
*/
|
||||
// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
|
||||
// If the buffer ends with a truncated 2- or 3-byte sequence,
|
||||
// then we reduce the sourceLimit to before that,
|
||||
// and collect the remaining bytes after the conversion loop.
|
||||
{
|
||||
int32_t i, length;
|
||||
|
||||
length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
for(i=0; i<3 && i<length;) {
|
||||
b=*(sourceLimit-i-1);
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++i;
|
||||
} else {
|
||||
if(i<U8_COUNT_TRAIL_BYTES(b)) {
|
||||
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
|
||||
sourceLimit-=i+1;
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
if(length>0) {
|
||||
uint8_t b1=*(sourceLimit-1);
|
||||
if(U8_IS_SINGLE(b1)) {
|
||||
// common ASCII character
|
||||
} else if(U8_IS_TRAIL(b1) && length>=2) {
|
||||
uint8_t b2=*(sourceLimit-2);
|
||||
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
// truncated 3-byte sequence
|
||||
sourceLimit-=2;
|
||||
}
|
||||
break;
|
||||
} else if(0xc2<=b1 && b1<0xf0) {
|
||||
// truncated 2- or 3-byte sequence
|
||||
--sourceLimit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5130,7 +5126,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
while(source<sourceLimit) {
|
||||
if(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
|
||||
*target++=(uint8_t)b;
|
||||
|
@ -5185,7 +5181,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
/* handle "complicated" and error cases, and continuing partial characters */
|
||||
oldToULength=0;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
|
||||
c=b;
|
||||
moreBytes:
|
||||
while(toULength<toULimit) {
|
||||
|
@ -5198,7 +5194,7 @@ moreBytes:
|
|||
*/
|
||||
if(source<(uint8_t *)pToUArgs->sourceLimit) {
|
||||
b=*source;
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
|
||||
++source;
|
||||
++toULength;
|
||||
c=(c<<6)+b;
|
||||
|
@ -5220,22 +5216,18 @@ moreBytes:
|
|||
}
|
||||
}
|
||||
|
||||
if( toULength==toULimit && /* consumed all trail bytes */
|
||||
(toULength==3 || toULength==2) && /* BMP */
|
||||
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
|
||||
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
|
||||
) {
|
||||
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
|
||||
} else if(
|
||||
toULength==toULimit && toULength==4 &&
|
||||
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
|
||||
) {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
value=0;
|
||||
} else {
|
||||
if(toULength==toULimit) {
|
||||
c-=utf8_offsets[toULength];
|
||||
if(toULength<=3) { /* BMP */
|
||||
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
|
||||
} else {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
value=0;
|
||||
} else {
|
||||
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* error handling: illegal UTF-8 byte sequence */
|
||||
|
@ -5310,7 +5302,7 @@ moreBytes:
|
|||
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
|
||||
c=utf8->toUBytes[0]=b=*source++;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES(b);
|
||||
while(source<sourceLimit) {
|
||||
utf8->toUBytes[toULength++]=b=*source++;
|
||||
c=(c<<6)+b;
|
||||
|
@ -5375,28 +5367,27 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
toULength=oldToULength=toULimit=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the last byte sequence before sourceLimit is complete
|
||||
* or runs into a lead byte.
|
||||
* Do not go back into the bytes that will be read for finishing a partial
|
||||
* sequence from the previous buffer.
|
||||
* In the conversion loop compare source with sourceLimit only once
|
||||
* per multi-byte character.
|
||||
*/
|
||||
// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
|
||||
// If the buffer ends with a truncated 2- or 3-byte sequence,
|
||||
// then we reduce the sourceLimit to before that,
|
||||
// and collect the remaining bytes after the conversion loop.
|
||||
{
|
||||
int32_t i, length;
|
||||
|
||||
length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
for(i=0; i<3 && i<length;) {
|
||||
b=*(sourceLimit-i-1);
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++i;
|
||||
} else {
|
||||
if(i<U8_COUNT_TRAIL_BYTES(b)) {
|
||||
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
|
||||
sourceLimit-=i+1;
|
||||
// Do not go back into the bytes that will be read for finishing a partial
|
||||
// sequence from the previous buffer.
|
||||
int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
|
||||
if(length>0) {
|
||||
uint8_t b1=*(sourceLimit-1);
|
||||
if(U8_IS_SINGLE(b1)) {
|
||||
// common ASCII character
|
||||
} else if(U8_IS_TRAIL(b1) && length>=2) {
|
||||
uint8_t b2=*(sourceLimit-2);
|
||||
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
// truncated 3-byte sequence
|
||||
sourceLimit-=2;
|
||||
}
|
||||
break;
|
||||
} else if(0xc2<=b1 && b1<0xf0) {
|
||||
// truncated 2- or 3-byte sequence
|
||||
--sourceLimit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5412,7 +5403,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
while(source<sourceLimit) {
|
||||
if(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
if(U8_IS_SINGLE(b)) {
|
||||
/* convert ASCII */
|
||||
if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
|
||||
*target++=b;
|
||||
|
@ -5426,13 +5417,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
}
|
||||
}
|
||||
} else {
|
||||
if(b>0xe0) {
|
||||
if( /* handle U+1000..U+D7FF inline */
|
||||
(((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
|
||||
(b==0xed && (t1 <= 0x1f))) &&
|
||||
if(b>=0xe0) {
|
||||
if( /* handle U+0800..U+D7FF inline */
|
||||
b<=0xed && // do not assume maxFastUChar>0xd7ff
|
||||
U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
|
||||
(t2=(uint8_t)(source[1]-0x80)) <= 0x3f
|
||||
) {
|
||||
c=((b&0xf)<<6)|t1;
|
||||
c=((b&0xf)<<6)|(t1&0x3f);
|
||||
source+=2;
|
||||
value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
|
||||
if(value==0) {
|
||||
|
@ -5442,7 +5433,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
} else {
|
||||
c=-1;
|
||||
}
|
||||
} else if(b<0xe0) {
|
||||
} else {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
b>=0xc2 &&
|
||||
(t1=(uint8_t)(*source-0x80)) <= 0x3f
|
||||
|
@ -5457,15 +5448,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
|||
} else {
|
||||
c=-1;
|
||||
}
|
||||
} else {
|
||||
c=-1;
|
||||
}
|
||||
|
||||
if(c<0) {
|
||||
/* handle "complicated" and error cases, and continuing partial characters */
|
||||
oldToULength=0;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
|
||||
c=b;
|
||||
moreBytes:
|
||||
while(toULength<toULimit) {
|
||||
|
@ -5478,7 +5467,7 @@ moreBytes:
|
|||
*/
|
||||
if(source<(uint8_t *)pToUArgs->sourceLimit) {
|
||||
b=*source;
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
|
||||
++source;
|
||||
++toULength;
|
||||
c=(c<<6)+b;
|
||||
|
@ -5500,22 +5489,18 @@ moreBytes:
|
|||
}
|
||||
}
|
||||
|
||||
if( toULength==toULimit && /* consumed all trail bytes */
|
||||
(toULength==3 || toULength==2) && /* BMP */
|
||||
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
|
||||
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
|
||||
) {
|
||||
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
||||
} else if(
|
||||
toULength==toULimit && toULength==4 &&
|
||||
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
|
||||
) {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
stage2Entry=0;
|
||||
} else {
|
||||
if(toULength==toULimit) {
|
||||
c-=utf8_offsets[toULength];
|
||||
if(toULength<=3) { /* BMP */
|
||||
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
||||
} else {
|
||||
/* supplementary code point */
|
||||
if(!hasSupplementary) {
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
stage2Entry=0;
|
||||
} else {
|
||||
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* error handling: illegal UTF-8 byte sequence */
|
||||
|
@ -5620,7 +5605,7 @@ unassigned:
|
|||
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
|
||||
c=utf8->toUBytes[0]=b=*source++;
|
||||
toULength=1;
|
||||
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
|
||||
toULimit=U8_COUNT_BYTES(b);
|
||||
while(source<sourceLimit) {
|
||||
utf8->toUBytes[toULength++]=b=*source++;
|
||||
c=(c<<6)+b;
|
||||
|
|
|
@ -79,14 +79,14 @@
|
|||
* prime number while being less than a power of two.
|
||||
*/
|
||||
static const int32_t PRIMES[] = {
|
||||
13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
|
||||
7, 13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
|
||||
65521, 131071, 262139, 524287, 1048573, 2097143, 4194301, 8388593,
|
||||
16777213, 33554393, 67108859, 134217689, 268435399, 536870909,
|
||||
1073741789, 2147483647 /*, 4294967291 */
|
||||
};
|
||||
|
||||
#define PRIMES_LENGTH UPRV_LENGTHOF(PRIMES)
|
||||
#define DEFAULT_PRIME_INDEX 3
|
||||
#define DEFAULT_PRIME_INDEX 4
|
||||
|
||||
/* These ratios are tuned to the PRIMES array such that a resize
|
||||
* places the table back into the zone of non-resizing. That is,
|
||||
|
@ -231,7 +231,7 @@ _uhash_allocate(UHashtable *hash,
|
|||
|
||||
emptytok.pointer = NULL; /* Only one of these two is needed */
|
||||
emptytok.integer = 0; /* but we don't know which one. */
|
||||
|
||||
|
||||
limit = p + hash->length;
|
||||
while (p < limit) {
|
||||
p->key = emptytok;
|
||||
|
@ -247,7 +247,7 @@ _uhash_allocate(UHashtable *hash,
|
|||
|
||||
static UHashtable*
|
||||
_uhash_init(UHashtable *result,
|
||||
UHashFunction *keyHash,
|
||||
UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
int32_t primeIndex,
|
||||
|
@ -275,7 +275,7 @@ _uhash_init(UHashtable *result,
|
|||
}
|
||||
|
||||
static UHashtable*
|
||||
_uhash_create(UHashFunction *keyHash,
|
||||
_uhash_create(UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
int32_t primeIndex,
|
||||
|
@ -415,7 +415,7 @@ _uhash_rehash(UHashtable *hash, UErrorCode *status) {
|
|||
|
||||
if (U_FAILURE(*status)) {
|
||||
hash->elements = old;
|
||||
hash->length = oldLength;
|
||||
hash->length = oldLength;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -536,7 +536,7 @@ _uhash_put(UHashtable *hash,
|
|||
********************************************************************/
|
||||
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_open(UHashFunction *keyHash,
|
||||
uhash_open(UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
UErrorCode *status) {
|
||||
|
@ -545,7 +545,7 @@ uhash_open(UHashFunction *keyHash,
|
|||
}
|
||||
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_openSize(UHashFunction *keyHash,
|
||||
uhash_openSize(UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
int32_t size,
|
||||
|
@ -562,7 +562,7 @@ uhash_openSize(UHashFunction *keyHash,
|
|||
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_init(UHashtable *fillinResult,
|
||||
UHashFunction *keyHash,
|
||||
UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
UErrorCode *status) {
|
||||
|
@ -570,6 +570,22 @@ uhash_init(UHashtable *fillinResult,
|
|||
return _uhash_init(fillinResult, keyHash, keyComp, valueComp, DEFAULT_PRIME_INDEX, status);
|
||||
}
|
||||
|
||||
/**
 * Initialize a caller-supplied UHashtable, choosing the initial table
 * length from a requested size.
 *
 * This forwards to _uhash_init() exactly as uhash_init() does, except that
 * the prime index is derived from `size` instead of being fixed at
 * DEFAULT_PRIME_INDEX.
 *
 * @param fillinResult  Hashtable object to initialize; passed through to
 *                      _uhash_init() unchanged.
 * @param keyHash       Key hash function, passed through to _uhash_init().
 * @param keyComp       Key comparator, passed through to _uhash_init().
 * @param valueComp     Value comparator, passed through to _uhash_init().
 * @param size          Requested size. The chosen index i is the smallest one
 *                      with PRIMES[i] >= size, or the last index of PRIMES if
 *                      no entry is that large.
 * @param status        Error code, passed through to _uhash_init().
 * @return              Whatever _uhash_init() returns.
 */
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_initSize(UHashtable *fillinResult,
|
||||
UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
int32_t size,
|
||||
UErrorCode *status) {
|
||||
|
||||
// Find the smallest index i for which PRIMES[i] >= size.
// The i<(PRIMES_LENGTH-1) bound keeps i inside the table: if size exceeds
// every entry, the search stops on the last (largest) prime.
|
||||
int32_t i = 0;
|
||||
while (i<(PRIMES_LENGTH-1) && PRIMES[i]<size) {
|
||||
++i;
|
||||
}
|
||||
return _uhash_init(fillinResult, keyHash, keyComp, valueComp, i, status);
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_close(UHashtable *hash) {
|
||||
if (hash == NULL) {
|
||||
|
@ -604,7 +620,7 @@ uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn) {
|
|||
hash->keyComparator = fn;
|
||||
return result;
|
||||
}
|
||||
U_CAPI UValueComparator *U_EXPORT2
|
||||
U_CAPI UValueComparator *U_EXPORT2
|
||||
uhash_setValueComparator(UHashtable *hash, UValueComparator *fn){
|
||||
UValueComparator *result = hash->valueComparator;
|
||||
hash->valueComparator = fn;
|
||||
|
@ -630,7 +646,7 @@ uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy) {
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
_uhash_internalSetResizePolicy(hash, policy);
|
||||
hash->lowWaterMark = (int32_t)(hash->length * hash->lowWaterRatio);
|
||||
hash->highWaterMark = (int32_t)(hash->length * hash->highWaterRatio);
|
||||
hash->highWaterMark = (int32_t)(hash->length * hash->highWaterRatio);
|
||||
_uhash_rehash(hash, &status);
|
||||
}
|
||||
|
||||
|
@ -853,7 +869,7 @@ uhash_hashIChars(const UHashTok key) {
|
|||
return s == NULL ? 0 : ustr_hashICharsN(s, uprv_strlen(s));
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uhash_equals(const UHashtable* hash1, const UHashtable* hash2){
|
||||
int32_t count1, count2, pos, i;
|
||||
|
||||
|
@ -886,14 +902,14 @@ uhash_equals(const UHashtable* hash1, const UHashtable* hash2){
|
|||
if(count1!=count2){
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
pos=UHASH_FIRST;
|
||||
for(i=0; i<count1; i++){
|
||||
const UHashElement* elem1 = uhash_nextElement(hash1, &pos);
|
||||
const UHashTok key1 = elem1->key;
|
||||
const UHashTok val1 = elem1->value;
|
||||
/* here the keys are not compared, instead the key from hash1 is used to fetch
|
||||
* value from hash2. If the hashes are equal then both hashes should
|
||||
* value from hash2. If the hashes are equal then both hashes should
|
||||
* contain equal values for the same key!
|
||||
*/
|
||||
const UHashElement* elem2 = _uhash_find(hash2, key1, hash2->keyHasher(key1));
|
||||
|
|
|
@ -154,7 +154,7 @@ struct UHashtable {
|
|||
* If NULL won't do anything */
|
||||
|
||||
/* Size parameters */
|
||||
|
||||
|
||||
int32_t count; /* The number of key-value pairs in this table.
|
||||
* 0 <= count <= length. In practice we
|
||||
* never let count == length (see code). */
|
||||
|
@ -162,12 +162,12 @@ struct UHashtable {
|
|||
* and values. Must be prime. */
|
||||
|
||||
/* Rehashing thresholds */
|
||||
|
||||
|
||||
int32_t highWaterMark; /* If count > highWaterMark, rehash */
|
||||
int32_t lowWaterMark; /* If count < lowWaterMark, rehash */
|
||||
float highWaterRatio; /* 0..1; high water as a fraction of length */
|
||||
float lowWaterRatio; /* 0..1; low water as a fraction of length */
|
||||
|
||||
|
||||
int8_t primeIndex; /* Index into our prime table for length.
|
||||
* length == PRIMES[primeIndex] */
|
||||
UBool allocated; /* Was this UHashtable allocated? */
|
||||
|
@ -190,7 +190,7 @@ U_CDECL_END
|
|||
* @return A pointer to a UHashtable, or 0 if an error occurred.
|
||||
* @see uhash_openSize
|
||||
*/
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_open(UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
|
@ -207,7 +207,7 @@ uhash_open(UHashFunction *keyHash,
|
|||
* @return A pointer to a UHashtable, or 0 if an error occurred.
|
||||
* @see uhash_open
|
||||
*/
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_openSize(UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
|
@ -224,18 +224,37 @@ uhash_openSize(UHashFunction *keyHash,
|
|||
* @return A pointer to a UHashtable, or 0 if an error occurred.
|
||||
* @see uhash_openSize
|
||||
*/
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_init(UHashtable *hash,
|
||||
UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Initialize an existing UHashtable.
|
||||
* @param keyHash A pointer to the key hashing function. Must not be
|
||||
* NULL.
|
||||
* @param keyComp A pointer to the function that compares keys. Must
|
||||
* not be NULL.
|
||||
* @param size The initial capacity of this hash table.
|
||||
* @param status A pointer to an UErrorCode to receive any errors.
|
||||
* @return A pointer to a UHashtable, or 0 if an error occurred.
|
||||
* @see uhash_openSize
|
||||
*/
|
||||
U_CAPI UHashtable* U_EXPORT2
|
||||
uhash_initSize(UHashtable *hash,
|
||||
UHashFunction *keyHash,
|
||||
UKeyComparator *keyComp,
|
||||
UValueComparator *valueComp,
|
||||
int32_t size,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Close a UHashtable, releasing the memory used.
|
||||
* @param hash The UHashtable to close. If hash is NULL no operation is performed.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_close(UHashtable *hash);
|
||||
|
||||
|
||||
|
@ -246,7 +265,7 @@ uhash_close(UHashtable *hash);
|
|||
* @param fn the function to be used to hash keys; must not be NULL
|
||||
* @return the previous key hasher; non-NULL
|
||||
*/
|
||||
U_CAPI UHashFunction *U_EXPORT2
|
||||
U_CAPI UHashFunction *U_EXPORT2
|
||||
uhash_setKeyHasher(UHashtable *hash, UHashFunction *fn);
|
||||
|
||||
/**
|
||||
|
@ -256,7 +275,7 @@ uhash_setKeyHasher(UHashtable *hash, UHashFunction *fn);
|
|||
* @param fn the function to be used to compare keys; must not be NULL
|
||||
* @return the previous key comparator; non-NULL
|
||||
*/
|
||||
U_CAPI UKeyComparator *U_EXPORT2
|
||||
U_CAPI UKeyComparator *U_EXPORT2
|
||||
uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn);
|
||||
|
||||
/**
|
||||
|
@ -266,7 +285,7 @@ uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn);
|
|||
* @param fn the function to be used to compare values; must not be NULL
|
||||
* @return the previous value comparator; non-NULL
|
||||
*/
|
||||
U_CAPI UValueComparator *U_EXPORT2
|
||||
U_CAPI UValueComparator *U_EXPORT2
|
||||
uhash_setValueComparator(UHashtable *hash, UValueComparator *fn);
|
||||
|
||||
/**
|
||||
|
@ -279,7 +298,7 @@ uhash_setValueComparator(UHashtable *hash, UValueComparator *fn);
|
|||
* @param fn the function to be used to delete keys, or NULL
|
||||
* @return the previous key deleter; may be NULL
|
||||
*/
|
||||
U_CAPI UObjectDeleter *U_EXPORT2
|
||||
U_CAPI UObjectDeleter *U_EXPORT2
|
||||
uhash_setKeyDeleter(UHashtable *hash, UObjectDeleter *fn);
|
||||
|
||||
/**
|
||||
|
@ -292,7 +311,7 @@ uhash_setKeyDeleter(UHashtable *hash, UObjectDeleter *fn);
|
|||
* @param fn the function to be used to delete values, or NULL
|
||||
* @return the previous value deleter; may be NULL
|
||||
*/
|
||||
U_CAPI UObjectDeleter *U_EXPORT2
|
||||
U_CAPI UObjectDeleter *U_EXPORT2
|
||||
uhash_setValueDeleter(UHashtable *hash, UObjectDeleter *fn);
|
||||
|
||||
/**
|
||||
|
@ -302,7 +321,7 @@ uhash_setValueDeleter(UHashtable *hash, UObjectDeleter *fn);
|
|||
* @param hash The UHashtable to set
|
||||
* @param policy The way the hashtable resizes itself, {U_GROW, U_GROW_AND_SHRINK, U_FIXED}
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy);
|
||||
|
||||
/**
|
||||
|
@ -310,7 +329,7 @@ uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy);
|
|||
* @param hash The UHashtable to query.
|
||||
* @return The number of key-value pairs stored in hash.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_count(const UHashtable *hash);
|
||||
|
||||
/**
|
||||
|
@ -326,7 +345,7 @@ uhash_count(const UHashtable *hash);
|
|||
* @return The previous value, or NULL if none.
|
||||
* @see uhash_get
|
||||
*/
|
||||
U_CAPI void* U_EXPORT2
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_put(UHashtable *hash,
|
||||
void *key,
|
||||
void *value,
|
||||
|
@ -344,7 +363,7 @@ uhash_put(UHashtable *hash,
|
|||
* @return The previous value, or NULL if none.
|
||||
* @see uhash_get
|
||||
*/
|
||||
U_CAPI void* U_EXPORT2
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_iput(UHashtable *hash,
|
||||
int32_t key,
|
||||
void* value,
|
||||
|
@ -362,7 +381,7 @@ uhash_iput(UHashtable *hash,
|
|||
* @return The previous value, or 0 if none.
|
||||
* @see uhash_get
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_puti(UHashtable *hash,
|
||||
void* key,
|
||||
int32_t value,
|
||||
|
@ -380,7 +399,7 @@ uhash_puti(UHashtable *hash,
|
|||
* @return The previous value, or 0 if none.
|
||||
* @see uhash_get
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_iputi(UHashtable *hash,
|
||||
int32_t key,
|
||||
int32_t value,
|
||||
|
@ -393,8 +412,8 @@ uhash_iputi(UHashtable *hash,
|
|||
* @param key A pointer key stored in a hashtable
|
||||
* @return The requested item, or NULL if not found.
|
||||
*/
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_get(const UHashtable *hash,
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_get(const UHashtable *hash,
|
||||
const void *key);
|
||||
|
||||
/**
|
||||
|
@ -404,7 +423,7 @@ uhash_get(const UHashtable *hash,
|
|||
* @param key An integer key stored in a hashtable
|
||||
* @return The requested item, or NULL if not found.
|
||||
*/
|
||||
U_CAPI void* U_EXPORT2
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_iget(const UHashtable *hash,
|
||||
int32_t key);
|
||||
|
||||
|
@ -415,7 +434,7 @@ uhash_iget(const UHashtable *hash,
|
|||
* @param key A pointer key stored in a hashtable
|
||||
* @return The requested item, or 0 if not found.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_geti(const UHashtable *hash,
|
||||
const void* key);
|
||||
/**
|
||||
|
@ -425,7 +444,7 @@ uhash_geti(const UHashtable *hash,
|
|||
* @param key An integer key stored in a hashtable
|
||||
* @return The requested item, or 0 if not found.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_igeti(const UHashtable *hash,
|
||||
int32_t key);
|
||||
|
||||
|
@ -435,7 +454,7 @@ uhash_igeti(const UHashtable *hash,
|
|||
* @param key A key stored in a hashtable
|
||||
* @return The item removed, or NULL if not found.
|
||||
*/
|
||||
U_CAPI void* U_EXPORT2
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_remove(UHashtable *hash,
|
||||
const void *key);
|
||||
|
||||
|
@ -445,7 +464,7 @@ uhash_remove(UHashtable *hash,
|
|||
* @param key An integer key stored in a hashtable
|
||||
* @return The item removed, or NULL if not found.
|
||||
*/
|
||||
U_CAPI void* U_EXPORT2
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_iremove(UHashtable *hash,
|
||||
int32_t key);
|
||||
|
||||
|
@ -455,7 +474,7 @@ uhash_iremove(UHashtable *hash,
|
|||
* @param key A key stored in a hashtable
|
||||
* @return The item removed, or 0 if not found.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_removei(UHashtable *hash,
|
||||
const void* key);
|
||||
|
||||
|
@ -465,7 +484,7 @@ uhash_removei(UHashtable *hash,
|
|||
* @param key An integer key stored in a hashtable
|
||||
* @return The item removed, or 0 if not found.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_iremovei(UHashtable *hash,
|
||||
int32_t key);
|
||||
|
||||
|
@ -473,7 +492,7 @@ uhash_iremovei(UHashtable *hash,
|
|||
* Remove all items from a UHashtable.
|
||||
* @param hash The target UHashtable.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_removeAll(UHashtable *hash);
|
||||
|
||||
/**
|
||||
|
@ -487,7 +506,7 @@ uhash_removeAll(UHashtable *hash);
|
|||
* @param key A key stored in a hashtable
|
||||
* @return a hash element, or NULL if the key is not found.
|
||||
*/
|
||||
U_CAPI const UHashElement* U_EXPORT2
|
||||
U_CAPI const UHashElement* U_EXPORT2
|
||||
uhash_find(const UHashtable *hash, const void* key);
|
||||
|
||||
/**
|
||||
|
@ -510,7 +529,7 @@ uhash_find(const UHashtable *hash, const void* key);
|
|||
* @return a hash element, or NULL if no further key-value pairs
|
||||
* exist in the table.
|
||||
*/
|
||||
U_CAPI const UHashElement* U_EXPORT2
|
||||
U_CAPI const UHashElement* U_EXPORT2
|
||||
uhash_nextElement(const UHashtable *hash,
|
||||
int32_t *pos);
|
||||
|
||||
|
@ -525,7 +544,7 @@ uhash_nextElement(const UHashtable *hash,
|
|||
* modified.
|
||||
* @return the value that was removed.
|
||||
*/
|
||||
U_CAPI void* U_EXPORT2
|
||||
U_CAPI void* U_EXPORT2
|
||||
uhash_removeElement(UHashtable *hash, const UHashElement* e);
|
||||
|
||||
/********************************************************************
|
||||
|
@ -537,7 +556,7 @@ uhash_removeElement(UHashtable *hash, const UHashElement* e);
|
|||
* @param i The given integer
|
||||
* @return a UHashTok for an integer.
|
||||
*/
|
||||
/*U_CAPI UHashTok U_EXPORT2
|
||||
/*U_CAPI UHashTok U_EXPORT2
|
||||
uhash_toki(int32_t i);*/
|
||||
|
||||
/**
|
||||
|
@ -545,7 +564,7 @@ uhash_toki(int32_t i);*/
|
|||
* @param p The given pointer
|
||||
* @return a UHashTok for a pointer.
|
||||
*/
|
||||
/*U_CAPI UHashTok U_EXPORT2
|
||||
/*U_CAPI UHashTok U_EXPORT2
|
||||
uhash_tokp(void* p);*/
|
||||
|
||||
/********************************************************************
|
||||
|
@ -559,7 +578,7 @@ uhash_tokp(void* p);*/
|
|||
* @param key The string (const UChar*) to hash.
|
||||
* @return A hash code for the key.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashUChars(const UHashTok key);
|
||||
|
||||
/**
|
||||
|
@ -569,7 +588,7 @@ uhash_hashUChars(const UHashTok key);
|
|||
* @param key The string (const char*) to hash.
|
||||
* @return A hash code for the key.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashChars(const UHashTok key);
|
||||
|
||||
/**
|
||||
|
@ -589,7 +608,7 @@ uhash_hashIChars(const UHashTok key);
|
|||
* @param key2 The string for comparison
|
||||
* @return true if key1 and key2 are equal, return false otherwise.
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uhash_compareUChars(const UHashTok key1, const UHashTok key2);
|
||||
|
||||
/**
|
||||
|
@ -599,7 +618,7 @@ uhash_compareUChars(const UHashTok key1, const UHashTok key2);
|
|||
* @param key2 The string for comparison
|
||||
* @return true if key1 and key2 are equal, return false otherwise.
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uhash_compareChars(const UHashTok key1, const UHashTok key2);
|
||||
|
||||
/**
|
||||
|
@ -609,7 +628,7 @@ uhash_compareChars(const UHashTok key1, const UHashTok key2);
|
|||
* @param key2 The string for comparison
|
||||
* @return true if key1 and key2 are equal, return false otherwise.
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uhash_compareIChars(const UHashTok key1, const UHashTok key2);
|
||||
|
||||
/********************************************************************
|
||||
|
@ -621,7 +640,7 @@ uhash_compareIChars(const UHashTok key1, const UHashTok key2);
|
|||
* @param key The string (const char*) to hash.
|
||||
* @return A hash code for the key.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashUnicodeString(const UElement key);
|
||||
|
||||
/**
|
||||
|
@ -630,7 +649,7 @@ uhash_hashUnicodeString(const UElement key);
|
|||
* @param key The string (const char*) to hash.
|
||||
* @return A hash code for the key.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashCaselessUnicodeString(const UElement key);
|
||||
|
||||
/********************************************************************
|
||||
|
@ -642,7 +661,7 @@ uhash_hashCaselessUnicodeString(const UElement key);
|
|||
* @param key The string (const char*) to hash.
|
||||
* @return A hash code for the key.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashLong(const UHashTok key);
|
||||
|
||||
/**
|
||||
|
@ -651,7 +670,7 @@ uhash_hashLong(const UHashTok key);
|
|||
* @param key2 The integer for comparison
|
||||
* @return true if key1 and key2 are equal, return false otherwise
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uhash_compareLong(const UHashTok key1, const UHashTok key2);
|
||||
|
||||
/********************************************************************
|
||||
|
@ -662,7 +681,7 @@ uhash_compareLong(const UHashTok key1, const UHashTok key2);
|
|||
* Deleter for Hashtable objects.
|
||||
* @param obj The object to be deleted
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_deleteHashtable(void *obj);
|
||||
|
||||
/* Use uprv_free() itself as a deleter for any key or value allocated using uprv_malloc. */
|
||||
|
@ -673,7 +692,7 @@ uhash_deleteHashtable(void *obj);
|
|||
* @param hash2
|
||||
* @return true if the hashtables are equal and false if not.
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uhash_equals(const UHashtable* hash1, const UHashtable* hash2);
|
||||
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#define __CASEMAP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
/**
|
||||
|
@ -20,6 +21,7 @@ U_NAMESPACE_BEGIN
|
|||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
class BreakIterator;
|
||||
class ByteSink;
|
||||
class Edits;
|
||||
|
||||
/**
|
||||
|
@ -36,7 +38,7 @@ public:
|
|||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
|
@ -48,7 +50,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
@ -71,7 +74,7 @@ public:
|
|||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
|
@ -83,7 +86,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
@ -112,7 +116,7 @@ public:
|
|||
* all others. (This can be modified with options bits.)
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
|
||||
* U_TITLECASE_NO_LOWERCASE,
|
||||
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
|
||||
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
|
||||
|
@ -132,7 +136,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
@ -161,7 +166,7 @@ public:
|
|||
* The result may be longer or shorter than the original.
|
||||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
|
||||
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
|
@ -174,7 +179,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
@ -190,6 +196,129 @@ public:
|
|||
char16_t *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Lowercases a UTF-8 string and optionally records edits.
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src The original string.
|
||||
* @param sink A ByteSink to which the result string is written.
|
||||
* sink.Flush() is called at the end.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
*
|
||||
* @see ucasemap_utf8ToLower
|
||||
* @draft ICU 60
|
||||
*/
|
||||
static void utf8ToLower(
|
||||
const char *locale, uint32_t options,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Uppercases a UTF-8 string and optionally records edits.
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src The original string.
|
||||
* @param sink A ByteSink to which the result string is written.
|
||||
* sink.Flush() is called at the end.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
*
|
||||
* @see ucasemap_utf8ToUpper
|
||||
* @draft ICU 60
|
||||
*/
|
||||
static void utf8ToUpper(
|
||||
const char *locale, uint32_t options,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* Titlecases a UTF-8 string and optionally records edits.
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
*
|
||||
* Titlecasing uses a break iterator to find the first characters of words
|
||||
* that are to be titlecased. It titlecases those characters and lowercases
|
||||
* all others. (This can be modified with options bits.)
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
|
||||
* U_TITLECASE_NO_LOWERCASE,
|
||||
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
|
||||
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
|
||||
* @param iter A break iterator to find the first characters of words that are to be titlecased.
|
||||
* It is set to the source string (setUText())
|
||||
* and used one or more times for iteration (first() and next()).
|
||||
* If NULL, then a word break iterator for the locale is used
|
||||
* (or something equivalent).
|
||||
* @param src The original string.
|
||||
* @param sink A ByteSink to which the result string is written.
|
||||
* sink.Flush() is called at the end.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
*
|
||||
* @see ucasemap_utf8ToTitle
|
||||
* @draft ICU 60
|
||||
*/
|
||||
static void utf8ToTitle(
|
||||
const char *locale, uint32_t options, BreakIterator *iter,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#endif // UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* Case-folds a UTF-8 string and optionally records edits.
|
||||
*
|
||||
* Case folding is locale-independent and not context-sensitive,
|
||||
* but there is an option for whether to include or exclude mappings for dotted I
|
||||
* and dotless i that are marked with 'T' in CaseFolding.txt.
|
||||
*
|
||||
* The result may be longer or shorter than the original.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src The original string.
|
||||
* @param sink A ByteSink to which the result string is written.
|
||||
* sink.Flush() is called at the end.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
*
|
||||
* @see ucasemap_utf8FoldCase
|
||||
* @draft ICU 60
|
||||
*/
|
||||
static void utf8Fold(
|
||||
uint32_t options,
|
||||
StringPiece src, ByteSink &sink, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Lowercases a UTF-8 string and optionally records edits.
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
|
@ -197,7 +326,7 @@ public:
|
|||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
|
@ -209,7 +338,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
@ -219,7 +349,7 @@ public:
|
|||
* @see ucasemap_utf8ToLower
|
||||
* @draft ICU 59
|
||||
*/
|
||||
static int32_t utf8ToLower(
|
||||
static int32_t utf8ToLower(
|
||||
const char *locale, uint32_t options,
|
||||
const char *src, int32_t srcLength,
|
||||
char *dest, int32_t destCapacity, Edits *edits,
|
||||
|
@ -232,7 +362,7 @@ public:
|
|||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
|
@ -244,7 +374,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
@ -273,7 +404,7 @@ public:
|
|||
* all others. (This can be modified with options bits.)
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
|
||||
* U_TITLECASE_NO_LOWERCASE,
|
||||
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
|
||||
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
|
||||
|
@ -293,7 +424,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
@ -321,7 +453,7 @@ public:
|
|||
* The result may be longer or shorter than the original.
|
||||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
|
||||
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
|
@ -334,7 +466,8 @@ public:
|
|||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
|
|
|
@ -148,7 +148,7 @@ public:
|
|||
Iterator() :
|
||||
array(nullptr), index(0), length(0),
|
||||
remaining(0), onlyChanges_(FALSE), coarse(FALSE),
|
||||
changed(FALSE), oldLength_(0), newLength_(0),
|
||||
dir(0), changed(FALSE), oldLength_(0), newLength_(0),
|
||||
srcIndex(0), replIndex(0), destIndex(0) {}
|
||||
/**
|
||||
* Copy constructor.
|
||||
|
@ -306,17 +306,22 @@ public:
|
|||
Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
|
||||
|
||||
int32_t readLength(int32_t head);
|
||||
void updateIndexes();
|
||||
void updateNextIndexes();
|
||||
void updatePreviousIndexes();
|
||||
UBool noNext();
|
||||
UBool next(UBool onlyChanges, UErrorCode &errorCode);
|
||||
UBool previous(UErrorCode &errorCode);
|
||||
/** @return -1: error or i<0; 0: found; 1: i>=string length */
|
||||
int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
|
||||
|
||||
const uint16_t *array;
|
||||
int32_t index, length;
|
||||
// 0 if we are not within compressed equal-length changes.
|
||||
// Otherwise the number of remaining changes, including the current one.
|
||||
int32_t remaining;
|
||||
UBool onlyChanges_, coarse;
|
||||
|
||||
int8_t dir; // iteration direction: back(<0), initial(0), forward(>0)
|
||||
UBool changed;
|
||||
int32_t oldLength_, newLength_;
|
||||
int32_t srcIndex, replIndex, destIndex;
|
||||
|
|
|
@ -55,14 +55,26 @@ class U_COMMON_API FilteredBreakIteratorBuilder : public UObject {
|
|||
*/
|
||||
static FilteredBreakIteratorBuilder *createInstance(const Locale& where, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* This function has been deprecated in favor of createEmptyInstance, which has
|
||||
* identical behavior.
|
||||
* @param status The error code.
|
||||
* @return the new builder
|
||||
* @deprecated ICU 60 use createEmptyInstance instead
|
||||
* @see createEmptyInstance()
|
||||
*/
|
||||
static inline FilteredBreakIteratorBuilder *createInstance(UErrorCode &status) {
|
||||
return createEmptyInstance(status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct an empty FilteredBreakIteratorBuilder.
|
||||
* In this state, it will not suppress any segment boundaries.
|
||||
* @param status The error code.
|
||||
* @return the new builder
|
||||
* @stable ICU 56
|
||||
* @draft ICU 60
|
||||
*/
|
||||
static FilteredBreakIteratorBuilder *createInstance(UErrorCode &status);
|
||||
static FilteredBreakIteratorBuilder *createEmptyInstance(UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Suppress a certain string from being the end of a segment.
|
||||
|
@ -89,6 +101,17 @@ class U_COMMON_API FilteredBreakIteratorBuilder : public UObject {
|
|||
*/
|
||||
virtual UBool unsuppressBreakAfter(const UnicodeString& string, UErrorCode& status) = 0;
|
||||
|
||||
/**
|
||||
* This function has been deprecated in favor of wrapIteratorWithFilter().
|
||||
* The behavior is identical.
|
||||
* @param adoptBreakIterator the break iterator to adopt
|
||||
* @param status error code
|
||||
* @return the new BreakIterator, owned by the caller.
|
||||
* @deprecated ICU 60 use wrapIteratorWithFilter() instead
|
||||
* @see wrapIteratorWithFilter()
|
||||
*/
|
||||
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) = 0;
|
||||
|
||||
/**
|
||||
* Wrap (adopt) an existing break iterator in a new filtered instance.
|
||||
* The resulting BreakIterator is owned by the caller.
|
||||
|
@ -96,12 +119,15 @@ class U_COMMON_API FilteredBreakIteratorBuilder : public UObject {
|
|||
* Note that the adoptBreakIterator is adopted by the new BreakIterator
|
||||
* and should no longer be used by the caller.
|
||||
* The FilteredBreakIteratorBuilder may be reused.
|
||||
* This function is an alias for build().
|
||||
* @param adoptBreakIterator the break iterator to adopt
|
||||
* @param status error code
|
||||
* @return the new BreakIterator, owned by the caller.
|
||||
* @stable ICU 56
|
||||
* @draft ICU 60
|
||||
*/
|
||||
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) = 0;
|
||||
inline BreakIterator *wrapIteratorWithFilter(BreakIterator* adoptBreakIterator, UErrorCode& status) {
|
||||
return build(adoptBreakIterator, status);
|
||||
}
|
||||
|
||||
protected:
|
||||
/**
|
||||
|
|
|
@ -88,7 +88,7 @@ class UnicodeString;
|
|||
* <P>
|
||||
* The third constructor requires a third argument--the <STRONG>Variant.</STRONG>
|
||||
* The Variant codes are vendor and browser-specific.
|
||||
* For example, use REVISED for a langauge's revised script orthography, and POSIX for POSIX.
|
||||
* For example, use REVISED for a language's revised script orthography, and POSIX for POSIX.
|
||||
* Where there are two variants, separate them with an underscore, and
|
||||
* put the most important one first. For
|
||||
* example, a Traditional Spanish collation might be referenced, with
|
||||
|
|
|
@ -228,14 +228,15 @@ public:
|
|||
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
|
||||
* Otherwise currently converts to & from UTF-16 and does not support edits.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src Source UTF-8 string.
|
||||
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
|
||||
* sink.Flush() is called at the end.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be nullptr.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be nullptr.
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
|
@ -534,7 +535,7 @@ public:
|
|||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const override;
|
||||
UErrorCode &errorCode) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Normalizes a UTF-8 string and optionally records how source substrings
|
||||
|
@ -545,14 +546,15 @@ public:
|
|||
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
|
||||
* Otherwise currently converts to & from UTF-16 and does not support edits.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src Source UTF-8 string.
|
||||
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
|
||||
* sink.Flush() is called at the end.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents is undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be nullptr.
|
||||
* This function calls edits->reset() first unless
|
||||
* options includes U_EDITS_NO_RESET. edits can be nullptr.
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
|
@ -561,7 +563,7 @@ public:
|
|||
*/
|
||||
virtual void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override;
|
||||
Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Appends the normalized form of the second string to the first string
|
||||
|
@ -580,7 +582,7 @@ public:
|
|||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override;
|
||||
UErrorCode &errorCode) const U_OVERRIDE;
|
||||
/**
|
||||
* Appends the second string to the first string
|
||||
* (merging them at the boundary) and returns the first string.
|
||||
|
@ -598,7 +600,7 @@ public:
|
|||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const override;
|
||||
UErrorCode &errorCode) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c.
|
||||
|
@ -612,7 +614,7 @@ public:
|
|||
* @stable ICU 4.6
|
||||
*/
|
||||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Gets the raw decomposition mapping of c.
|
||||
|
@ -626,7 +628,7 @@ public:
|
|||
* @stable ICU 49
|
||||
*/
|
||||
virtual UBool
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Performs pairwise composition of a & b and returns the composite if there is one.
|
||||
|
@ -639,7 +641,7 @@ public:
|
|||
* @stable ICU 49
|
||||
*/
|
||||
virtual UChar32
|
||||
composePair(UChar32 a, UChar32 b) const override;
|
||||
composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Gets the combining class of c.
|
||||
|
@ -650,7 +652,7 @@ public:
|
|||
* @stable ICU 49
|
||||
*/
|
||||
virtual uint8_t
|
||||
getCombiningClass(UChar32 c) const override;
|
||||
getCombiningClass(UChar32 c) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
|
@ -664,7 +666,7 @@ public:
|
|||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
|
||||
/**
|
||||
* Tests if the UTF-8 string is normalized.
|
||||
* Internally, in cases where the quickCheck() method would return "maybe"
|
||||
|
@ -687,7 +689,7 @@ public:
|
|||
* @draft ICU 60
|
||||
*/
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
|
||||
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
|
@ -700,7 +702,7 @@ public:
|
|||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
|
||||
/**
|
||||
* Returns the end of the normalized substring of the input string.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
|
@ -713,7 +715,7 @@ public:
|
|||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Tests if the character always has a normalization boundary before it,
|
||||
|
@ -723,7 +725,7 @@ public:
|
|||
* @return TRUE if c has a normalization boundary before it
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const override;
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Tests if the character always has a normalization boundary after it,
|
||||
|
@ -733,7 +735,7 @@ public:
|
|||
* @return TRUE if c has a normalization boundary after it
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const override;
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
|
||||
|
||||
/**
|
||||
* Tests if the character is normalization-inert.
|
||||
|
@ -742,7 +744,7 @@ public:
|
|||
* @return TRUE if c is normalization-inert
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool isInert(UChar32 c) const override;
|
||||
virtual UBool isInert(UChar32 c) const U_OVERRIDE;
|
||||
private:
|
||||
UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
|
|
|
@ -830,6 +830,16 @@ namespace std {
|
|||
# define U_CALLCONV U_EXPORT2
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def U_CALLCONV_FPTR
|
||||
* Similar to U_CALLCONV, but only used on function pointers.
|
||||
* @internal
|
||||
*/
|
||||
#if U_PLATFORM == U_PF_OS390 && defined(__cplusplus)
|
||||
# define U_CALLCONV_FPTR U_CALLCONV
|
||||
#else
|
||||
# define U_CALLCONV_FPTR
|
||||
#endif
|
||||
/* @} */
|
||||
|
||||
#endif
|
||||
|
|
|
@ -31,21 +31,14 @@
|
|||
#include "unicode/schriter.h"
|
||||
#include "unicode/uchriter.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/** @internal */
|
||||
struct RBBIDataHeader;
|
||||
class RuleBasedBreakIteratorTables;
|
||||
class BreakIterator;
|
||||
class RBBIDataWrapper;
|
||||
class UStack;
|
||||
class LanguageBreakEngine;
|
||||
struct RBBIDataHeader;
|
||||
class RBBIDataWrapper;
|
||||
class UnhandledEngine;
|
||||
struct RBBIStateTable;
|
||||
|
||||
|
||||
|
||||
class UStack;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -94,19 +87,36 @@ private:
|
|||
*/
|
||||
RBBIDataWrapper *fData;
|
||||
|
||||
/** Index of the Rule {tag} values for the most recent match.
|
||||
/**
|
||||
* The iteration state - current position, rule status for the current position,
|
||||
* and whether the iterator ran off the end, yielding UBRK_DONE.
|
||||
* Current position is pinned to be 0 <= position <= text.length.
|
||||
* Current position is always set to a boundary.
|
||||
* @internal
|
||||
*/
|
||||
int32_t fLastRuleStatusIndex;
|
||||
/**
|
||||
* The current position of the iterator. Pinned, 0 <= fPosition <= text.length.
|
||||
* Never has the value UBRK_DONE (-1).
|
||||
*/
|
||||
int32_t fPosition;
|
||||
|
||||
/**
|
||||
* Rule tag value valid flag.
|
||||
* Some iterator operations don't intrinsically set the correct tag value.
|
||||
* This flag lets us lazily compute the value if we are ever asked for it.
|
||||
* @internal
|
||||
*/
|
||||
UBool fLastStatusIndexValid;
|
||||
* TODO:
|
||||
*/
|
||||
int32_t fRuleStatusIndex;
|
||||
|
||||
/**
|
||||
* True when iteration has run off the end, and iterator functions should return UBRK_DONE.
|
||||
*/
|
||||
UBool fDone;
|
||||
|
||||
/**
|
||||
* Cache of previously determined boundary positions.
|
||||
*/
|
||||
public: // TODO: debug, return to private.
|
||||
class BreakCache;
|
||||
BreakCache *fBreakCache;
|
||||
private:
|
||||
/**
|
||||
* Counter for the number of characters encountered with the "dictionary"
|
||||
* flag set.
|
||||
|
@ -115,26 +125,11 @@ private:
|
|||
uint32_t fDictionaryCharCount;
|
||||
|
||||
/**
|
||||
* When a range of characters is divided up using the dictionary, the break
|
||||
* positions that are discovered are stored here, preventing us from having
|
||||
* to use either the dictionary or the state table again until the iterator
|
||||
* leaves this range of text. Has the most impact for line breaking.
|
||||
* @internal
|
||||
* Cache of boundary positions within a region of text that has been
|
||||
* sub-divided by dictionary based breaking.
|
||||
*/
|
||||
int32_t* fCachedBreakPositions;
|
||||
|
||||
/**
|
||||
* The number of elements in fCachedBreakPositions
|
||||
* @internal
|
||||
*/
|
||||
int32_t fNumCachedBreakPositions;
|
||||
|
||||
/**
|
||||
* if fCachedBreakPositions is not null, this indicates which item in the
|
||||
* cache the current iteration position refers to
|
||||
* @internal
|
||||
*/
|
||||
int32_t fPositionInCache;
|
||||
class DictionaryCache;
|
||||
DictionaryCache *fDictionaryCache;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -177,13 +172,11 @@ private:
|
|||
*/
|
||||
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
|
||||
|
||||
|
||||
/** @internal */
|
||||
friend class RBBIRuleBuilder;
|
||||
/** @internal */
|
||||
friend class BreakIterator;
|
||||
|
||||
|
||||
|
||||
public:
|
||||
|
||||
/** Default constructor. Creates an empty shell of an iterator, with no
|
||||
|
@ -467,7 +460,10 @@ public:
|
|||
virtual UBool isBoundary(int32_t offset);
|
||||
|
||||
/**
|
||||
* Returns the current iteration position.
|
||||
* Returns the current iteration position. Note that UBRK_DONE is never
|
||||
* returned from this function; if iteration has run to the end of a
|
||||
* string, current() will return the length of the string while
|
||||
* next() will return UBRK_DONE.
|
||||
* @return The current iteration position.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
|
@ -499,6 +495,7 @@ public:
|
|||
* Note: this function is not thread safe. It should not have been
|
||||
* declared const, and the const remains only for compatibility
|
||||
* reasons. (The function is logically const, but not bit-wise const).
|
||||
* TODO: check this. Probably thread safe now.
|
||||
* <p>
|
||||
* @return the status from the break rule that determined the most recently
|
||||
* returned break position.
|
||||
|
@ -658,46 +655,31 @@ private:
|
|||
* Common initialization function, used by constructors and bufferClone.
|
||||
* @internal
|
||||
*/
|
||||
void init();
|
||||
void init(UErrorCode &status);
|
||||
|
||||
/**
|
||||
* This method backs the iterator back up to a "safe position" in the text.
|
||||
* This is a position that we know, without any context, must be a break position.
|
||||
* The various calling methods then iterate forward from this safe position to
|
||||
* the appropriate position to return. (For more information, see the description
|
||||
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
|
||||
* @param statetable state table used of moving backwards
|
||||
* Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
|
||||
* This locates a "Safe Position" from which the forward break rules
|
||||
* will operate correctly. A Safe Position is not necessarily a boundary itself.
|
||||
*
|
||||
* @param fromPosition the position in the input text to begin the iteration.
|
||||
* @internal
|
||||
*/
|
||||
int32_t handlePrevious(const RBBIStateTable *statetable);
|
||||
int32_t handlePrevious(int32_t fromPosition);
|
||||
|
||||
/**
|
||||
* This method is the actual implementation of the next() method. All iteration
|
||||
* vectors through here. This method initializes the state machine to state 1
|
||||
* and advances through the text character by character until we reach the end
|
||||
* of the text or the state machine transitions to state 0. We update our return
|
||||
* value every time the state machine passes through a possible end state.
|
||||
* @param statetable state table used of moving forwards
|
||||
* Find a rule-based boundary by running the state machine.
|
||||
* Input
|
||||
* fPosition, the position in the text to begin from.
|
||||
* Output
|
||||
* fPosition: the boundary following the starting position.
|
||||
* fDictionaryCharCount the number of dictionary characters encountered.
|
||||
* If > 0, the segment will be further subdivided
|
||||
* fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
int32_t handleNext(const RBBIStateTable *statetable);
|
||||
|
||||
|
||||
/**
|
||||
* This is the function that actually implements dictionary-based
|
||||
* breaking. Covering at least the range from startPos to endPos,
|
||||
* it checks for dictionary characters, and if it finds them determines
|
||||
* the appropriate object to deal with them. It may cache found breaks in
|
||||
* fCachedBreakPositions as it goes. It may well also look at text outside
|
||||
* the range startPos to endPos.
|
||||
* If going forward, endPos is the normal Unicode break result, and
|
||||
* if goind in reverse, startPos is the normal Unicode break result
|
||||
* @param startPos The start position of a range of text
|
||||
* @param endPos The end position of a range of text
|
||||
* @param reverse The call is for the reverse direction
|
||||
* @internal
|
||||
*/
|
||||
int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
|
||||
int32_t handleNext();
|
||||
|
||||
|
||||
/**
|
||||
|
@ -708,11 +690,12 @@ private:
|
|||
*/
|
||||
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
|
||||
|
||||
public:
|
||||
/**
|
||||
* @internal
|
||||
* Debugging function only.
|
||||
* @internal
|
||||
*/
|
||||
void makeRuleStatusValid();
|
||||
|
||||
void dumpCache();
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
|
@ -134,6 +134,17 @@
|
|||
*/
|
||||
#define U_TITLECASE_ADJUST_TO_CASED 0x400
|
||||
|
||||
/**
|
||||
* Option for string transformation functions to not first reset the Edits object.
|
||||
* Used for example in some case-mapping and normalization functions.
|
||||
*
|
||||
* @see CaseMap
|
||||
* @see Edits
|
||||
* @see Normalizer2
|
||||
* @draft ICU 60
|
||||
*/
|
||||
#define U_EDITS_NO_RESET 0x2000
|
||||
|
||||
/**
|
||||
* Omit unchanged text when recording how source substrings
|
||||
* relate to changed and unchanged result substrings.
|
||||
|
@ -182,7 +193,6 @@
|
|||
// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
|
||||
// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
|
||||
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
|
||||
// ustr_imp.h #define U_EDITS_NO_RESET 0x2000
|
||||
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000
|
||||
|
||||
#endif // __STRINGOPTIONS_H__
|
||||
|
|
|
@ -230,7 +230,8 @@ typedef enum USentenceBreakTag {
|
|||
* @param locale The locale specifying the text-breaking conventions. Note that
|
||||
* locale keys such as "lb" and "ss" may be used to modify text break behavior,
|
||||
* see general discussion of BreakIterator C API.
|
||||
* @param text The text to be iterated over.
|
||||
* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
|
||||
* used to specify the text to be iterated.
|
||||
* @param textLength The number of characters in text, or -1 if null-terminated.
|
||||
* @param status A UErrorCode to receive any errors.
|
||||
* @return A UBreakIterator for the specified locale.
|
||||
|
|
|
@ -149,7 +149,7 @@ typedef void U_CALLCONV UMemFreeFn (const void *context, void *mem);
|
|||
* @system
|
||||
*/
|
||||
U_STABLE void U_EXPORT2
|
||||
u_setMemoryFunctions(const void *context, UMemAllocFn * U_CALLCONV a, UMemReallocFn * U_CALLCONV r, UMemFreeFn * U_CALLCONV f,
|
||||
u_setMemoryFunctions(const void *context, UMemAllocFn * U_CALLCONV_FPTR a, UMemReallocFn * U_CALLCONV_FPTR r, UMemFreeFn * U_CALLCONV_FPTR f,
|
||||
UErrorCode *status);
|
||||
|
||||
U_CDECL_END
|
||||
|
|
|
@ -768,7 +768,7 @@ utext_extract(UText *ut,
|
|||
*/
|
||||
#define UTEXT_SETNATIVEINDEX(ut, ix) \
|
||||
{ int64_t __offset = (ix) - (ut)->chunkNativeStart; \
|
||||
if (__offset>=0 && __offset<=(int64_t)(ut)->nativeIndexingLimit) { \
|
||||
if (__offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit && (ut)->chunkContents[__offset]<0xdc00) { \
|
||||
(ut)->chunkOffset=(int32_t)__offset; \
|
||||
} else { \
|
||||
utext_setNativeIndex((ut), (ix)); } }
|
||||
|
|
|
@ -23,9 +23,6 @@
|
|||
* This file defines macros for checking whether a code point is
|
||||
* a surrogate or a non-character etc.
|
||||
*
|
||||
* The UChar and UChar32 data types for Unicode code units and code points
|
||||
* are defined in umachine.h because they can be machine-dependent.
|
||||
*
|
||||
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
|
||||
* and itself includes utf8.h and utf16.h after some
|
||||
* common definitions.
|
||||
|
@ -50,11 +47,11 @@
|
|||
* but are optimized for the much more frequently occurring BMP code points.
|
||||
*
|
||||
* umachine.h defines UChar to be an unsigned 16-bit integer.
|
||||
* Where available, UChar is defined to be a char16_t
|
||||
* or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t.
|
||||
* Since ICU 59, ICU uses char16_t in C++, UChar only in C,
|
||||
* and defines UChar=char16_t by default. See the UChar API docs for details.
|
||||
*
|
||||
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
|
||||
* Unicode code point (Unicode scalar value, 0..0x10ffff).
|
||||
* Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
|
||||
* Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
|
||||
* the definition of UChar. For details see the documentation for UChar32 itself.
|
||||
*
|
||||
|
@ -63,11 +60,20 @@
|
|||
* For actual Unicode character properties see uchar.h.
|
||||
*
|
||||
* By default, string operations must be done with error checking in case
|
||||
* a string is not well-formed UTF-16.
|
||||
* The macros will detect if a surrogate code unit is unpaired
|
||||
* a string is not well-formed UTF-16 or UTF-8.
|
||||
*
|
||||
* The U16_ macros detect if a surrogate code unit is unpaired
|
||||
* (lead unit without trail unit or vice versa) and just return the unit itself
|
||||
* as the code point.
|
||||
*
|
||||
* The U8_ macros detect illegal byte sequences and return a negative value.
|
||||
* Starting with ICU 60, the observable length of a single illegal byte sequence
|
||||
* skipped by one of these macros follows the Unicode 6+ recommendation
|
||||
* which is consistent with the W3C Encoding Standard.
|
||||
*
|
||||
* There are ..._OR_FFFD versions of both U16_ and U8_ macros
|
||||
* that return U+FFFD for illegal code unit sequences.
|
||||
*
|
||||
* The regular "safe" macros require that the initial, passed-in string index
|
||||
* is within bounds. They only check the index when they read more than one
|
||||
* code unit. This is usually done with code similar to the following loop:
|
||||
|
@ -91,10 +97,7 @@
|
|||
* The performance differences are much larger here because UTF-8 provides so
|
||||
* many opportunities for malformed sequences.
|
||||
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
|
||||
* and are fast, while the safe UTF-8 macros call functions for all but the
|
||||
* trivial (ASCII) cases.
|
||||
* (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
|
||||
* characters inline as well.)
|
||||
* and are fast, while the safe UTF-8 macros call functions for some complicated cases.
|
||||
*
|
||||
* Unlike with UTF-16, malformed sequences cannot be expressed with distinct
|
||||
* code point values (0..U+10ffff). They are indicated with negative values instead.
|
||||
|
@ -126,8 +129,7 @@
|
|||
*/
|
||||
#define U_IS_UNICODE_NONCHAR(c) \
|
||||
((c)>=0xfdd0 && \
|
||||
((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
|
||||
(uint32_t)(c)<=0x10ffff)
|
||||
((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)
|
||||
|
||||
/**
|
||||
* Is c a Unicode code point value (0..U+10ffff)
|
||||
|
@ -148,9 +150,7 @@
|
|||
*/
|
||||
#define U_IS_UNICODE_CHAR(c) \
|
||||
((uint32_t)(c)<0xd800 || \
|
||||
((uint32_t)(c)>0xdfff && \
|
||||
(uint32_t)(c)<=0x10ffff && \
|
||||
!U_IS_UNICODE_NONCHAR(c)))
|
||||
(0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
|
||||
|
||||
/**
|
||||
* Is this code point a BMP code point (U+0000..U+ffff)?
|
||||
|
|
|
@ -185,8 +185,8 @@
|
|||
*
|
||||
* The length can be negative for a NUL-terminated string.
|
||||
*
|
||||
* If the offset points to a single, unpaired surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* If the offset points to a single, unpaired surrogate, then
|
||||
* c is set to that unpaired surrogate.
|
||||
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
|
@ -213,6 +213,53 @@
|
|||
} \
|
||||
}
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Get a code point from a string at a random-access offset,
|
||||
* without changing the offset.
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* The offset may point to either the lead or trail surrogate unit
|
||||
* for a supplementary code point, in which case the macro will read
|
||||
* the adjacent matching surrogate as well.
|
||||
*
|
||||
* The length can be negative for a NUL-terminated string.
|
||||
*
|
||||
* If the offset points to a single, unpaired surrogate, then
|
||||
* c is set to U+FFFD.
|
||||
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, must be start<=i<length
|
||||
* @param length string length
|
||||
* @param c output UChar32 variable
|
||||
* @see U16_GET_UNSAFE
|
||||
* @draft ICU 60
|
||||
*/
|
||||
#define U16_GET_OR_FFFD(s, start, i, length, c) { \
|
||||
(c)=(s)[i]; \
|
||||
if(U16_IS_SURROGATE(c)) { \
|
||||
uint16_t __c2; \
|
||||
if(U16_IS_SURROGATE_LEAD(c)) { \
|
||||
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
|
||||
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||
} else { \
|
||||
(c)=0xfffd; \
|
||||
} \
|
||||
} else { \
|
||||
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
|
||||
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
|
||||
} else { \
|
||||
(c)=0xfffd; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/* definitions with forward iteration --------------------------------------- */
|
||||
|
||||
/**
|
||||
|
@ -253,8 +300,7 @@
|
|||
* for a supplementary code point, in which case the macro will read
|
||||
* the following trail surrogate as well.
|
||||
* If the offset points to a trail surrogate or
|
||||
* to a single, unpaired lead surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset, must be i<length
|
||||
|
@ -274,6 +320,44 @@
|
|||
} \
|
||||
}
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
 * Safe, post-incrementing forward iteration over UTF-16 that never yields
 * a surrogate code point.
 *
 * Reads the code unit at offset i (which must be on a code point boundary),
 * advances i past it, and stores the result in c.
 * If that unit is a lead surrogate which is directly followed, before the
 * end of the string, by a trail surrogate, then both units are consumed and
 * c receives the supplementary code point they encode.
 * Any other surrogate (a trail surrogate at i, or a lead surrogate without a
 * following trail surrogate) is unpaired: only that single unit is consumed
 * and c is set to U+FFFD.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * s, i, length and c may be evaluated more than once.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @draft ICU 60
 */
#define U16_NEXT_OR_FFFD(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(!U16_IS_SURROGATE_LEAD(c) || (i)==(length) || !U16_IS_TRAIL(__c2=(s)[(i)])) { \
            /* unpaired surrogate: consume only this one unit */ \
            (c)=0xfffd; \
        } else { \
            /* well-formed pair: combine it and step over the trail unit */ \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            ++(i); \
        } \
    } \
}
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Append a code point to a string, overwriting 1 or 2 code units.
|
||||
* The offset points to the current end of the string contents
|
||||
|
@ -481,8 +565,7 @@
|
|||
* for a supplementary code point, then the macro will read
|
||||
* the preceding lead surrogate as well.
|
||||
* If the offset is behind a lead surrogate or behind a single, unpaired
|
||||
* trail surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* trail surrogate, then c is set to that unpaired surrogate.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
|
@ -502,6 +585,43 @@
|
|||
} \
|
||||
}
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
 * Safe, pre-decrementing backward iteration over UTF-16 that never yields
 * a surrogate code point.
 *
 * Moves i back from one code point boundary to the previous one and stores
 * the code point between them in c.
 * The input offset may be the same as the string length.
 * If the unit just before i is a trail surrogate which is directly preceded,
 * at or after start, by a lead surrogate, then both units are consumed and
 * c receives the supplementary code point they encode.
 * Any other surrogate (a lead surrogate just before i, or a trail surrogate
 * without a preceding lead surrogate) is unpaired: i moves back by one unit
 * only and c is set to U+FFFD.
 *
 * s, start, i and c may be evaluated more than once.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @draft ICU 60
 */
#define U16_PREV_OR_FFFD(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(!U16_IS_SURROGATE_TRAIL(c) || (i)<=(start) || !U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            /* unpaired surrogate: step back over this one unit only */ \
            (c)=0xfffd; \
        } else { \
            /* well-formed pair: combine it and step back over the lead unit */ \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            --(i); \
        } \
    } \
}
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
|
|
|
@ -41,34 +41,24 @@
|
|||
|
||||
/* internal definitions ----------------------------------------------------- */
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Counts the trail bytes for a UTF-8 lead byte.
|
||||
* Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
|
||||
* Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
|
||||
* leadByte might be evaluated multiple times.
|
||||
*
|
||||
* This is internal since it is not meant to be called directly by external clients;
|
||||
* however it is called by public macros in this file and thus must remain stable.
|
||||
*
|
||||
* Note: Beginning with ICU 50, the implementation uses a multi-condition expression
|
||||
* which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
|
||||
* leadByte is evaluated multiple times.
|
||||
*
|
||||
* The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
|
||||
* #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
|
||||
* leadByte was evaluated exactly once.
|
||||
*
|
||||
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
|
||||
* @internal
|
||||
*/
|
||||
#define U8_COUNT_TRAIL_BYTES(leadByte) \
|
||||
((uint8_t)(leadByte)<0xf0 ? \
|
||||
((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
|
||||
(uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
|
||||
(U8_IS_LEAD(leadByte) ? \
|
||||
((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
|
||||
|
||||
/**
|
||||
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
|
||||
* The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
|
||||
* Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
|
||||
* leadByte might be evaluated multiple times.
|
||||
*
|
||||
* This is internal since it is not meant to be called directly by external clients;
|
||||
|
@ -78,7 +68,7 @@
|
|||
* @internal
|
||||
*/
|
||||
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
|
||||
(((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
|
||||
(((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
|
||||
|
||||
/**
|
||||
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
|
||||
|
@ -89,6 +79,40 @@
|
|||
*/
|
||||
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
|
||||
|
||||
/**
 * Internal bit vector for the 3-byte UTF-8 validity check done by
 * U8_IS_VALID_LEAD3_AND_T1.
 * There is one bit per (lead byte, first trail byte) pair; a set bit means
 * that the pair starts a valid sequence.
 * Bits 3..0 of the lead byte E0..EF select the byte of the vector,
 * bits 7..5 of the first trail byte select the bit inside that byte.
 *
 * Resulting entries:
 * - E0: 0x20, only trail bytes A0..BF are accepted
 * - ED: 0x10, only trail bytes 80..9F are accepted
 * - all other lead bytes: 0x30, trail bytes 80..BF are accepted
 *
 * @see U8_IS_VALID_LEAD3_AND_T1
 * @internal
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"

/**
 * Internal 3-byte UTF-8 validity check.
 * Looks up the (lead, t1) pair in U8_LEAD3_T1_BITS.
 * The result is non-zero (the selected bit itself, not necessarily 1)
 * if lead byte E0..EF and first trail byte 00..FF start a valid sequence,
 * and 0 otherwise.
 *
 * @param lead The lead byte E0..EF; only its low 4 bits are used.
 * @param t1 The byte following the lead byte; it is cast to uint8_t.
 * @internal
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) ((1<<((uint8_t)(t1)>>5))&U8_LEAD3_T1_BITS[(lead)&0xf])
|
||||
|
||||
/**
 * Internal bit vector for the 4-byte UTF-8 validity check done by
 * U8_IS_VALID_LEAD4_AND_T1.
 * There is one bit per (lead byte, first trail byte) pair; a set bit means
 * that the pair starts a valid sequence.
 * Bits 7..4 of the first trail byte select the byte of the vector,
 * bits 2..0 of the lead byte F0..F4 select the bit inside that byte.
 *
 * Resulting entries:
 * - trail bytes 80..8F: 0x1E, accepted after lead bytes F1..F4
 * - trail bytes 90..BF: 0x0F, accepted after lead bytes F0..F3
 * - all other byte values: 0x00, never accepted
 *
 * @see U8_IS_VALID_LEAD4_AND_T1
 * @internal
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"

/**
 * Internal 4-byte UTF-8 validity check.
 * Looks up the (lead, t1) pair in U8_LEAD4_T1_BITS.
 * The result is non-zero (the selected bit itself, not necessarily 1)
 * if lead byte F0..F4 and first trail byte 00..FF start a valid sequence,
 * and 0 otherwise.
 *
 * @param lead The lead byte F0..F4; only its low 3 bits are used.
 * @param t1 The byte following the lead byte; it is cast to uint8_t.
 * @internal
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) ((1<<((lead)&7))&U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])
|
||||
|
||||
/**
|
||||
* Function for handling "next code point" with error-checking.
|
||||
*
|
||||
|
@ -148,20 +172,21 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 lead byte?
|
||||
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
|
||||
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
|
||||
// 0x32=0xf4-0xc2
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 trail byte?
|
||||
* Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
|
||||
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
|
||||
|
||||
/**
|
||||
* How many code units (bytes) are used for the UTF-8 encoding
|
||||
|
@ -289,7 +314,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
*/
|
||||
#define U8_NEXT_UNSAFE(s, i, c) { \
|
||||
(c)=(uint8_t)(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
if((c)<0xe0) { \
|
||||
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
|
||||
} else if((c)<0xf0) { \
|
||||
|
@ -325,22 +350,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
*/
|
||||
#define U8_NEXT(s, i, length, c) { \
|
||||
(c)=(uint8_t)(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
uint8_t __t1, __t2; \
|
||||
if( /* handle U+1000..U+CFFF inline */ \
|
||||
(0xe0<(c) && (c)<=0xec) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
|
||||
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
|
||||
) { \
|
||||
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
|
||||
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
|
||||
if( /* handle U+0800..U+FFFF inline */ \
|
||||
(0xe0<=(c) && (c)<0xf0) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
|
||||
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
|
||||
(i)+=2; \
|
||||
} else if( /* handle U+0080..U+07FF inline */ \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
|
||||
) { \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(s)[i]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0x1f)<<6)|__t1; \
|
||||
++(i); \
|
||||
} else { \
|
||||
|
@ -376,22 +398,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
*/
|
||||
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
|
||||
(c)=(uint8_t)(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
uint8_t __t1, __t2; \
|
||||
if( /* handle U+1000..U+CFFF inline */ \
|
||||
(0xe0<(c) && (c)<=0xec) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
|
||||
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
|
||||
) { \
|
||||
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
|
||||
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
|
||||
if( /* handle U+0800..U+FFFF inline */ \
|
||||
(0xe0<=(c) && (c)<0xf0) && \
|
||||
(((i)+1)<(length) || (length)<0) && \
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
|
||||
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
|
||||
(i)+=2; \
|
||||
} else if( /* handle U+0080..U+07FF inline */ \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
|
||||
) { \
|
||||
((c)<0xe0 && (c)>=0xc2) && \
|
||||
((i)!=(length)) && \
|
||||
(__t1=(s)[i]-0x80)<=0x3f) { \
|
||||
(c)=(((c)&0x1f)<<6)|__t1; \
|
||||
++(i); \
|
||||
} else { \
|
||||
|
@ -476,7 +495,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_1_UNSAFE(s, i) { \
|
||||
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
|
||||
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -493,15 +512,24 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
* @stable ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_1(s, i, length) { \
|
||||
uint8_t __b=(uint8_t)(s)[(i)++]; \
|
||||
if(U8_IS_LEAD(__b)) { \
|
||||
uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
|
||||
if((i)+__count>(length) && (length)>=0) { \
|
||||
__count=(uint8_t)((length)-(i)); \
|
||||
} \
|
||||
while(__count>0 && U8_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
--__count; \
|
||||
uint8_t __b=(s)[(i)++]; \
|
||||
if(U8_IS_LEAD(__b) && (i)!=(length)) { \
|
||||
uint8_t __t1=(s)[i]; \
|
||||
if((0xe0<=__b && __b<0xf0)) { \
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
} \
|
||||
} else if(__b<0xe0) { \
|
||||
if(U8_IS_TRAIL(__t1)) { \
|
||||
++(i); \
|
||||
} \
|
||||
} else /* c>=0xf0 */ { \
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
|
||||
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
@ -615,7 +643,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
/* c is a trail byte */ \
|
||||
(c)&=0x3f; \
|
||||
for(;;) { \
|
||||
__b=(uint8_t)(s)[--(i)]; \
|
||||
__b=(s)[--(i)]; \
|
||||
if(__b>=0xc0) { \
|
||||
U8_MASK_LEAD_BYTE(__b, __count); \
|
||||
(c)|=(UChar32)__b<<__shift; \
|
||||
|
@ -651,7 +679,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
*/
|
||||
#define U8_PREV(s, start, i, c) { \
|
||||
(c)=(uint8_t)(s)[--(i)]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
|
||||
} \
|
||||
}
|
||||
|
@ -682,7 +710,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
*/
|
||||
#define U8_PREV_OR_FFFD(s, start, i, c) { \
|
||||
(c)=(uint8_t)(s)[--(i)]; \
|
||||
if((c)>=0x80) { \
|
||||
if(!U8_IS_SINGLE(c)) { \
|
||||
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
|
||||
} \
|
||||
}
|
||||
|
|
|
@ -145,7 +145,22 @@
|
|||
#ifndef __UTF_OLD_H__
|
||||
#define __UTF_OLD_H__
|
||||
|
||||
#ifndef U_HIDE_DEPRECATED_API
|
||||
/**
|
||||
* \def U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
*
|
||||
* Hides the obsolete definitions in unicode/utf_old.h.
|
||||
* Recommended to be set to 1 at compile time to make sure
|
||||
* the long-deprecated macros are no longer used.
|
||||
*
|
||||
* For reasons for the deprecation see the utf_old.h file comments.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
#ifndef U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
# define U_HIDE_OBSOLETE_UTF_OLD_H 0
|
||||
#endif
|
||||
|
||||
#if !defined(U_HIDE_DEPRECATED_API) && !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
|
||||
#include "unicode/utf.h"
|
||||
#include "unicode/utf8.h"
|
||||
|
@ -1184,7 +1199,6 @@ U_CFUNC U_IMPORT const uint8_t utf8_countTrailBytes[]; /* U_IMPORT2? */ /*U_I
|
|||
*/
|
||||
#define UTF_SET_CHAR_LIMIT(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length)
|
||||
|
||||
#endif /* U_HIDE_DEPRECATED_API */
|
||||
#endif // !U_HIDE_DEPRECATED_API && !U_HIDE_OBSOLETE_UTF_OLD_H
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -502,7 +502,7 @@ spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
|
|||
static inline int32_t
|
||||
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||||
UChar32 c=*s;
|
||||
if((int8_t)c>=0) {
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
return set.contains(c) ? 1 : -1;
|
||||
}
|
||||
// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
|
||||
|
@ -514,7 +514,7 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
|||
static inline int32_t
|
||||
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
|
||||
UChar32 c=s[length-1];
|
||||
if((int8_t)c>=0) {
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
return set.contains(c) ? 1 : -1;
|
||||
}
|
||||
int32_t i=length-1;
|
||||
|
@ -1006,11 +1006,9 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
|
|||
// Try to match if the increment is not listed already.
|
||||
// Match at code point boundaries. (The UTF-8 strings were converted
|
||||
// from UTF-16 and are guaranteed to be well-formed.)
|
||||
if( !U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
!offsets.containsOffset(inc) &&
|
||||
matches8(s+pos-overlap, s8, length8)
|
||||
|
||||
) {
|
||||
if(!U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
!offsets.containsOffset(inc) &&
|
||||
matches8(s+pos-overlap, s8, length8)) {
|
||||
if(inc==rest) {
|
||||
return length; // Reached the end of the string.
|
||||
}
|
||||
|
@ -1052,11 +1050,10 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
|
|||
// Try to match if the string is longer or starts earlier.
|
||||
// Match at code point boundaries. (The UTF-8 strings were converted
|
||||
// from UTF-16 and are guaranteed to be well-formed.)
|
||||
if( !U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
(overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
|
||||
matches8(s+pos-overlap, s8, length8)
|
||||
|
||||
) {
|
||||
if(!U8_IS_TRAIL(s[pos-overlap]) &&
|
||||
(overlap>maxOverlap ||
|
||||
/* redundant overlap==maxOverlap && */ inc>maxInc) &&
|
||||
matches8(s+pos-overlap, s8, length8)) {
|
||||
maxInc=inc; // Longest match from earliest start.
|
||||
maxOverlap=overlap;
|
||||
break;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define __USTR_IMP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/utf8.h"
|
||||
|
||||
/**
|
||||
* Internal option for unorm_cmpEquivFold() for strncmp style.
|
||||
|
@ -25,11 +26,6 @@
|
|||
*/
|
||||
#define _STRNCMP_STYLE 0x1000
|
||||
|
||||
/**
|
||||
* Internal option for string transformation functions to not first reset the Edits object.
|
||||
*/
|
||||
#define U_EDITS_NO_RESET 0x2000
|
||||
|
||||
/**
|
||||
* Compare two strings in code point order or code unit order.
|
||||
* Works in strcmp style (both lengths -1),
|
||||
|
@ -86,4 +82,62 @@ u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorC
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
|
||||
|
||||
/**
 * Returns the total length, in bytes, of a whole valid UTF-8 sequence
 * that starts with the given lead byte.
 * ASCII bytes 0..0x7f count as 1; every other byte is delegated to
 * U8_COUNT_BYTES_NON_ASCII.
 * Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 0..4
 */
#define U8_COUNT_BYTES(leadByte) \
    (!U8_IS_SINGLE(leadByte) ? U8_COUNT_BYTES_NON_ASCII(leadByte) : 1)
|
||||
|
||||
/**
 * Returns the total length, in bytes, of a whole valid multi-byte UTF-8
 * sequence that starts with the given lead byte.
 * A byte accepted by U8_IS_LEAD counts as 2, plus 1 if it is >=0xe0,
 * plus 1 more if it is >=0xf0; any other byte yields 0.
 * Returns 0 for 0x00..0xc1 as well as for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @return 0 or 2..4
 */
#define U8_COUNT_BYTES_NON_ASCII(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : 2+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UTF8 {
|
||||
public:
|
||||
UTF8() = delete; // all static
|
||||
|
||||
/**
|
||||
* Is t a valid UTF-8 trail byte?
|
||||
*
|
||||
* @param prev Must be the preceding lead byte if i==1 and length>=3;
|
||||
* otherwise ignored.
|
||||
* @param t The i-th byte following the lead byte.
|
||||
* @param i The index (1..3) of byte t in the byte sequence. 0<i<length
|
||||
* @param length The length (2..4) of the byte sequence according to the lead byte.
|
||||
* @return TRUE if t is a valid trail byte in this context.
|
||||
*/
|
||||
static inline UBool isValidTrail(int32_t prev, uint8_t t, int32_t i, int32_t length) {
|
||||
// The first trail byte after a 3- or 4-byte lead byte
|
||||
// needs to be validated together with its lead byte.
|
||||
if (length <= 2 || i > 1) {
|
||||
return U8_IS_TRAIL(t);
|
||||
} else if (length == 3) {
|
||||
return U8_IS_VALID_LEAD3_AND_T1(prev, t);
|
||||
} else { // length == 4
|
||||
return U8_IS_VALID_LEAD4_AND_T1(prev, t);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif
|
||||
|
|
|
@ -43,28 +43,28 @@ U_NAMESPACE_BEGIN
|
|||
class WholeStringBreakIterator : public BreakIterator {
|
||||
public:
|
||||
WholeStringBreakIterator() : BreakIterator(), length(0) {}
|
||||
~WholeStringBreakIterator() override;
|
||||
UBool operator==(const BreakIterator&) const override;
|
||||
BreakIterator *clone() const override;
|
||||
~WholeStringBreakIterator() U_OVERRIDE;
|
||||
UBool operator==(const BreakIterator&) const U_OVERRIDE;
|
||||
BreakIterator *clone() const U_OVERRIDE;
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
UClassID getDynamicClassID() const override;
|
||||
CharacterIterator &getText() const override;
|
||||
UText *getUText(UText *fillIn, UErrorCode &errorCode) const override;
|
||||
void setText(const UnicodeString &text) override;
|
||||
void setText(UText *text, UErrorCode &errorCode) override;
|
||||
void adoptText(CharacterIterator* it) override;
|
||||
int32_t first() override;
|
||||
int32_t last() override;
|
||||
int32_t previous() override;
|
||||
int32_t next() override;
|
||||
int32_t current() const override;
|
||||
int32_t following(int32_t offset) override;
|
||||
int32_t preceding(int32_t offset) override;
|
||||
UBool isBoundary(int32_t offset) override;
|
||||
int32_t next(int32_t n) override;
|
||||
UClassID getDynamicClassID() const U_OVERRIDE;
|
||||
CharacterIterator &getText() const U_OVERRIDE;
|
||||
UText *getUText(UText *fillIn, UErrorCode &errorCode) const U_OVERRIDE;
|
||||
void setText(const UnicodeString &text) U_OVERRIDE;
|
||||
void setText(UText *text, UErrorCode &errorCode) U_OVERRIDE;
|
||||
void adoptText(CharacterIterator* it) U_OVERRIDE;
|
||||
int32_t first() U_OVERRIDE;
|
||||
int32_t last() U_OVERRIDE;
|
||||
int32_t previous() U_OVERRIDE;
|
||||
int32_t next() U_OVERRIDE;
|
||||
int32_t current() const U_OVERRIDE;
|
||||
int32_t following(int32_t offset) U_OVERRIDE;
|
||||
int32_t preceding(int32_t offset) U_OVERRIDE;
|
||||
UBool isBoundary(int32_t offset) U_OVERRIDE;
|
||||
int32_t next(int32_t n) U_OVERRIDE;
|
||||
BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize,
|
||||
UErrorCode &errorCode) override;
|
||||
BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) override;
|
||||
UErrorCode &errorCode) U_OVERRIDE;
|
||||
BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) U_OVERRIDE;
|
||||
|
||||
private:
|
||||
int32_t length;
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include "unicode/brkiter.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/stringoptions.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/ubrk.h"
|
||||
|
@ -72,9 +73,9 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
|||
/* (not) original code point */
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(cpLength);
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
}
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
c=~result;
|
||||
if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
|
||||
|
@ -149,9 +150,9 @@ appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
|||
if(length>0) {
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(length);
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
}
|
||||
if(options & U_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
|
@ -933,8 +934,10 @@ int32_t toUpper(uint32_t options,
|
|||
}
|
||||
}
|
||||
|
||||
UBool change = TRUE;
|
||||
if (edits != NULL) {
|
||||
UBool change;
|
||||
if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
change = TRUE; // common, simple usage
|
||||
} else {
|
||||
// Find out first whether we are changing the text.
|
||||
change = src[i] != upper || numYpogegrammeni > 0;
|
||||
int32_t i2 = i + 1;
|
||||
|
|
|
@ -256,152 +256,6 @@ u_strToUTF32(UChar32 *dest,
|
|||
pErrorCode);
|
||||
}
|
||||
|
||||
/* for utf8_nextCharSafeBodyTerminated() */
|
||||
static const UChar32
|
||||
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
/*
|
||||
* Version of utf8_nextCharSafeBody() with the following differences:
|
||||
* - checks for NUL termination instead of length
|
||||
* - works with pointers instead of indexes
|
||||
* - always strict (strict==-1)
|
||||
*
|
||||
* *ps points to after the lead byte and will be moved to after the last trail byte.
|
||||
* c is the lead byte.
|
||||
* @return the code point, or U_SENTINEL
|
||||
*/
|
||||
static UChar32
|
||||
utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
|
||||
const uint8_t *s=*ps;
|
||||
uint8_t trail, illegal=0;
|
||||
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
|
||||
U_ASSERT(count<6);
|
||||
U8_MASK_LEAD_BYTE((c), count);
|
||||
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
||||
switch(count) {
|
||||
/* each branch falls through to the next one */
|
||||
case 5:
|
||||
case 4:
|
||||
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
||||
illegal=1;
|
||||
break;
|
||||
case 3:
|
||||
trail=(uint8_t)(*s++ - 0x80);
|
||||
c=(c<<6)|trail;
|
||||
if(trail>0x3f || c>=0x110) {
|
||||
/* not a trail byte, or code point>0x10ffff (outside Unicode) */
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
trail=(uint8_t)(*s++ - 0x80);
|
||||
if(trail>0x3f) {
|
||||
/* not a trail byte */
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
c=(c<<6)|trail;
|
||||
U_FALLTHROUGH;
|
||||
case 1:
|
||||
trail=(uint8_t)(*s++ - 0x80);
|
||||
if(trail>0x3f) {
|
||||
/* not a trail byte */
|
||||
illegal=1;
|
||||
}
|
||||
c=(c<<6)|trail;
|
||||
break;
|
||||
case 0:
|
||||
return U_SENTINEL;
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
||||
/* illegal is also set if count>=4 */
|
||||
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
|
||||
/* error handling */
|
||||
/* don't go beyond this sequence */
|
||||
s=*ps;
|
||||
while(count>0 && U8_IS_TRAIL(*s)) {
|
||||
++s;
|
||||
--count;
|
||||
}
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
*ps=s;
|
||||
return c;
|
||||
}
|
||||
|
||||
/*
|
||||
* Version of utf8_nextCharSafeBody() with the following differences:
|
||||
* - works with pointers instead of indexes
|
||||
* - always strict (strict==-1)
|
||||
*
|
||||
* *ps points to after the lead byte and will be moved to after the last trail byte.
|
||||
* c is the lead byte.
|
||||
* @return the code point, or U_SENTINEL
|
||||
*/
|
||||
static UChar32
|
||||
utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
|
||||
const uint8_t *s=*ps;
|
||||
uint8_t trail, illegal=0;
|
||||
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
|
||||
if((limit-s)>=count) {
|
||||
U8_MASK_LEAD_BYTE((c), count);
|
||||
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
||||
switch(count) {
|
||||
/* each branch falls through to the next one */
|
||||
case 5:
|
||||
case 4:
|
||||
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
||||
illegal=1;
|
||||
break;
|
||||
case 3:
|
||||
trail=*s++;
|
||||
c=(c<<6)|(trail&0x3f);
|
||||
if(c<0x110) {
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
} else {
|
||||
/* code point>0x10ffff, outside Unicode */
|
||||
illegal=1;
|
||||
break;
|
||||
}
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
trail=*s++;
|
||||
c=(c<<6)|(trail&0x3f);
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
U_FALLTHROUGH;
|
||||
case 1:
|
||||
trail=*s++;
|
||||
c=(c<<6)|(trail&0x3f);
|
||||
illegal|=(trail&0xc0)^0x80;
|
||||
break;
|
||||
case 0:
|
||||
return U_SENTINEL;
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
} else {
|
||||
illegal=1; /* too few bytes left */
|
||||
}
|
||||
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
||||
/* illegal is also set if count>=4 */
|
||||
U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
|
||||
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
|
||||
/* error handling */
|
||||
/* don't go beyond this sequence */
|
||||
s=*ps;
|
||||
while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
|
||||
++s;
|
||||
--count;
|
||||
}
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
*ps=s;
|
||||
return c;
|
||||
}
|
||||
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_strFromUTF8WithSub(UChar *dest,
|
||||
int32_t destCapacity,
|
||||
|
@ -410,19 +264,10 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode){
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
UChar32 ch;
|
||||
int32_t reqLength = 0;
|
||||
const uint8_t* pSrc = (const uint8_t*) src;
|
||||
uint8_t t1, t2; /* trail bytes */
|
||||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
|
||||
(destCapacity<0) || (dest == NULL && destCapacity > 0) ||
|
||||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
|
||||
|
@ -434,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
if(pNumSubstitutions!=NULL) {
|
||||
*pNumSubstitutions=0;
|
||||
}
|
||||
numSubstitutions=0;
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
int32_t reqLength = 0;
|
||||
int32_t numSubstitutions=0;
|
||||
|
||||
/*
|
||||
* Inline processing of UTF-8 byte sequences:
|
||||
|
@ -455,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
* The code explicitly checks for NULs only in the lead byte position.
|
||||
* A NUL byte in the trail byte position fails the trail byte range check anyway.
|
||||
*/
|
||||
while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
|
||||
if(ch <= 0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
int32_t i;
|
||||
UChar32 c;
|
||||
for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
|
||||
// modified copy of U8_NEXT()
|
||||
++i;
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
*pDest++=(UChar)c;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(ch<=0xFFFF) {
|
||||
*(pDest++)=(UChar)ch;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0x1f)<<6)|__t1;
|
||||
++(i);
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(ch);
|
||||
if(pDest<pDestLimit) {
|
||||
*(pDest++)=U16_TRAIL(ch);
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(c<=0xFFFF) {
|
||||
*(pDest++)=(UChar)c;
|
||||
} else {
|
||||
reqLength++;
|
||||
break;
|
||||
*(pDest++)=U16_LEAD(c);
|
||||
if(pDest<pDestLimit) {
|
||||
*(pDest++)=U16_TRAIL(c);
|
||||
} else {
|
||||
reqLength++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Pre-flight the rest of the string. */
|
||||
while((ch = *pSrc) != 0) {
|
||||
if(ch <= 0x7f){
|
||||
while((c = (uint8_t)src[i]) != 0) {
|
||||
// modified copy of U8_NEXT()
|
||||
++i;
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
++reqLength;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
|
||||
) {
|
||||
++reqLength;
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
|
||||
) {
|
||||
++reqLength;
|
||||
pSrc += 2;
|
||||
continue;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
++(i);
|
||||
} else {
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
reqLength += U16_LENGTH(c);
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
reqLength += U16_LENGTH(ch);
|
||||
}
|
||||
}
|
||||
} else /* srcLength >= 0 */ {
|
||||
const uint8_t *pSrcLimit = pSrc + srcLength;
|
||||
int32_t count;
|
||||
|
||||
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
|
||||
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
|
||||
int32_t i = 0;
|
||||
UChar32 c;
|
||||
for(;;) {
|
||||
/*
|
||||
* Each iteration of the inner loop progresses by at most 3 UTF-8
|
||||
|
@ -551,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
* For supplementary code points (4 & 2), which are rare,
|
||||
* there is an additional adjustment.
|
||||
*/
|
||||
count = (int32_t)(pDestLimit - pDest);
|
||||
srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
|
||||
if(count > srcLength) {
|
||||
count = srcLength; /* min(remaining dest, remaining src/3) */
|
||||
int32_t count = (int32_t)(pDestLimit - pDest);
|
||||
int32_t count2 = (srcLength - i) / 3;
|
||||
if(count > count2) {
|
||||
count = count2; /* min(remaining dest, remaining src/3) */
|
||||
}
|
||||
if(count < 3) {
|
||||
/*
|
||||
|
@ -565,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
}
|
||||
|
||||
do {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
// modified copy of U8_NEXT()
|
||||
c = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
*pDest++=(UChar)c;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
continue;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
((i)+1)<srcLength &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
((i)!=srcLength) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0x1f)<<6)|__t1;
|
||||
++(i);
|
||||
} else {
|
||||
if(c >= 0xf0 || subchar > 0xffff) {
|
||||
// We may read up to four bytes and write up to two UChars,
|
||||
// which we didn't account for with computing count,
|
||||
// so we adjust it here.
|
||||
if(--count == 0) {
|
||||
--i; // back out byte c
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if(ch >= 0xf0 || subchar > 0xffff) {
|
||||
/*
|
||||
* We may read up to six bytes and write up to two UChars,
|
||||
* which we didn't account for with computing count,
|
||||
* so we adjust it here.
|
||||
*/
|
||||
if(--count == 0) {
|
||||
break;
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(c<=0xFFFF) {
|
||||
*(pDest++)=(UChar)c;
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(c);
|
||||
*(pDest++)=U16_TRAIL(c);
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}else if(ch<=0xFFFF){
|
||||
*(pDest++)=(UChar)ch;
|
||||
}else{
|
||||
*(pDest++)=U16_LEAD(ch);
|
||||
*(pDest++)=U16_TRAIL(ch);
|
||||
}
|
||||
}
|
||||
} while(--count > 0);
|
||||
}
|
||||
|
||||
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
while(i < srcLength && (pDest < pDestLimit)) {
|
||||
// modified copy of U8_NEXT()
|
||||
c = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
*pDest++=(UChar)c;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}else if(ch<=0xFFFF){
|
||||
*(pDest++)=(UChar)ch;
|
||||
}else{
|
||||
*(pDest++)=U16_LEAD(ch);
|
||||
if(pDest<pDestLimit){
|
||||
*(pDest++)=U16_TRAIL(ch);
|
||||
}else{
|
||||
reqLength++;
|
||||
break;
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
((i)+1)<srcLength &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
((i)!=srcLength) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
*pDest++ = (((c)&0x1f)<<6)|__t1;
|
||||
++(i);
|
||||
} else {
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else if(c<=0xFFFF) {
|
||||
*(pDest++)=(UChar)c;
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(c);
|
||||
if(pDest<pDestLimit) {
|
||||
*(pDest++)=U16_TRAIL(c);
|
||||
} else {
|
||||
reqLength++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* do not fill the dest buffer just count the UChars needed */
|
||||
while(pSrc < pSrcLimit){
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
reqLength++;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch > 0xe0) {
|
||||
if( /* handle U+1000..U+CFFF inline */
|
||||
ch <= 0xec &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 3;
|
||||
continue;
|
||||
}
|
||||
} else if(ch < 0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
ch >= 0xc2 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 2;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* function call for "complicated" and error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
/* Pre-flight the rest of the string. */
|
||||
while(i < srcLength) {
|
||||
// modified copy of U8_NEXT()
|
||||
c = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(c)) {
|
||||
++reqLength;
|
||||
} else {
|
||||
uint8_t __t1, __t2;
|
||||
if( /* handle U+0800..U+FFFF inline */
|
||||
(0xe0<=(c) && (c)<0xf0) &&
|
||||
((i)+1)<srcLength &&
|
||||
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
|
||||
(__t2=src[(i)+1]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
i+=2;
|
||||
} else if( /* handle U+0080..U+07FF inline */
|
||||
((c)<0xe0 && (c)>=0xc2) &&
|
||||
((i)!=srcLength) &&
|
||||
(__t1=src[i]-0x80)<=0x3f) {
|
||||
++reqLength;
|
||||
++(i);
|
||||
} else {
|
||||
/* function call for "complicated" and error cases */
|
||||
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
|
||||
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
reqLength += U16_LENGTH(c);
|
||||
}
|
||||
reqLength+=U16_LENGTH(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -753,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest,
|
|||
uint8_t* pSrc = (uint8_t*) src;
|
||||
|
||||
/* args check */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -994,7 +804,7 @@ u_strToUTF8WithSub(char *dest,
|
|||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -1266,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
UChar32 ch;
|
||||
int32_t reqLength = 0;
|
||||
const uint8_t* pSrc = (const uint8_t*) src;
|
||||
const uint8_t *pSrcLimit;
|
||||
int32_t count;
|
||||
uint8_t t1, t2; /* trail bytes */
|
||||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
|
||||
|
@ -1291,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
if(pNumSubstitutions!=NULL) {
|
||||
*pNumSubstitutions=0;
|
||||
}
|
||||
numSubstitutions=0;
|
||||
UChar *pDest = dest;
|
||||
UChar *pDestLimit = dest+destCapacity;
|
||||
int32_t reqLength = 0;
|
||||
int32_t numSubstitutions=0;
|
||||
|
||||
if(srcLength < 0) {
|
||||
/*
|
||||
* Transform a NUL-terminated ASCII string.
|
||||
* Handle non-ASCII strings with slower code.
|
||||
*/
|
||||
while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
UChar32 c;
|
||||
while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
|
||||
*pDest++=(UChar)c;
|
||||
++src;
|
||||
}
|
||||
if(ch == 0) {
|
||||
if(c == 0) {
|
||||
reqLength=(int32_t)(pDest - dest);
|
||||
if(pDestLength) {
|
||||
*pDestLength = reqLength;
|
||||
|
@ -1312,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
|
||||
return dest;
|
||||
}
|
||||
srcLength = static_cast<int32_t>(uprv_strlen((const char *)pSrc));
|
||||
srcLength = static_cast<int32_t>(uprv_strlen(src));
|
||||
}
|
||||
|
||||
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
|
||||
pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
|
||||
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
|
||||
UChar32 ch;
|
||||
uint8_t t1, t2;
|
||||
int32_t i = 0;
|
||||
for(;;) {
|
||||
count = (int32_t)(pDestLimit - pDest);
|
||||
srcLength = (int32_t)(pSrcLimit - pSrc);
|
||||
if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
|
||||
int32_t count = (int32_t)(pDestLimit - pDest);
|
||||
int32_t count2 = srcLength - i;
|
||||
if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
|
||||
/* fast ASCII loop */
|
||||
const uint8_t *prevSrc = pSrc;
|
||||
int32_t delta;
|
||||
while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
int32_t start = i;
|
||||
uint8_t b;
|
||||
while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
|
||||
*pDest++=b;
|
||||
++i;
|
||||
}
|
||||
delta = (int32_t)(pSrc - prevSrc);
|
||||
int32_t delta = i - start;
|
||||
count -= delta;
|
||||
srcLength -= delta;
|
||||
count2 -= delta;
|
||||
}
|
||||
/*
|
||||
* Each iteration of the inner loop progresses by at most 3 UTF-8
|
||||
* bytes and one UChar.
|
||||
*/
|
||||
srcLength /= 3;
|
||||
if(count > srcLength) {
|
||||
count = srcLength; /* min(remaining dest, remaining src/3) */
|
||||
if(subchar > 0xFFFF) {
|
||||
break;
|
||||
}
|
||||
count2 /= 3;
|
||||
if(count > count2) {
|
||||
count = count2; /* min(remaining dest, remaining src/3) */
|
||||
}
|
||||
if(count < 3) {
|
||||
/*
|
||||
|
@ -1348,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
break;
|
||||
}
|
||||
do {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
ch = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(ch)) {
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch >= 0xe0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
ch <= 0xef &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
ch >= 0xc0 &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -1383,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
* We need to write two UChars, adjusted count for that,
|
||||
* and ran out of space.
|
||||
*/
|
||||
--i; // back out byte ch
|
||||
break;
|
||||
} else {
|
||||
/* function call for error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
|
||||
++numSubstitutions;
|
||||
if(subchar<=0xFFFF) {
|
||||
*(pDest++)=(UChar)subchar;
|
||||
} else {
|
||||
*(pDest++)=U16_LEAD(subchar);
|
||||
*(pDest++)=U16_TRAIL(subchar);
|
||||
}
|
||||
*(pDest++)=(UChar)subchar;
|
||||
}
|
||||
}
|
||||
} while(--count > 0);
|
||||
}
|
||||
|
||||
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f){
|
||||
while(i < srcLength && (pDest < pDestLimit)) {
|
||||
ch = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(ch)){
|
||||
*pDest++=(UChar)ch;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch >= 0xe0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
ch <= 0xef &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
|
||||
(i+1) < srcLength &&
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
|
||||
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
|
||||
) {
|
||||
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
|
||||
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
|
||||
pSrc += 3;
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
ch >= 0xc0 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
|
||||
i < srcLength &&
|
||||
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
|
||||
) {
|
||||
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
|
||||
pSrc += 2;
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -1435,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
return NULL;
|
||||
} else {
|
||||
/* function call for error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
|
||||
++numSubstitutions;
|
||||
if(subchar<=0xFFFF) {
|
||||
*(pDest++)=(UChar)subchar;
|
||||
|
@ -1453,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
}
|
||||
}
|
||||
|
||||
/* do not fill the dest buffer just count the UChars needed */
|
||||
while(pSrc < pSrcLimit){
|
||||
ch = *pSrc;
|
||||
if(ch <= 0x7f) {
|
||||
/* Pre-flight the rest of the string. */
|
||||
while(i < srcLength) {
|
||||
ch = (uint8_t)src[i++];
|
||||
if(U8_IS_SINGLE(ch)) {
|
||||
reqLength++;
|
||||
++pSrc;
|
||||
} else {
|
||||
if(ch >= 0xe0) {
|
||||
if( /* handle U+0000..U+FFFF inline */
|
||||
ch <= 0xef &&
|
||||
((pSrcLimit - pSrc) >= 3) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
|
||||
(i+1) < srcLength &&
|
||||
(uint8_t)(src[i] - 0x80) <= 0x3f &&
|
||||
(uint8_t)(src[i+1] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 3;
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( /* handle U+0000..U+07FF inline */
|
||||
ch >= 0xc0 &&
|
||||
((pSrcLimit - pSrc) >= 2) &&
|
||||
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
|
||||
i < srcLength &&
|
||||
(uint8_t)(src[i] - 0x80) <= 0x3f
|
||||
) {
|
||||
reqLength++;
|
||||
pSrc += 2;
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -1488,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub(
|
|||
return NULL;
|
||||
} else {
|
||||
/* function call for error cases */
|
||||
++pSrc; /* continue after the lead byte */
|
||||
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
|
||||
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
|
||||
++numSubstitutions;
|
||||
reqLength+=U16_LENGTH(ch);
|
||||
}
|
||||
|
|
|
@ -847,15 +847,11 @@ U_CDECL_END
|
|||
//------------------------------------------------------------------------------
|
||||
|
||||
// Chunk size.
|
||||
// Must be less than 42 (256/6), because of byte mapping from UChar indexes to native indexes.
|
||||
// Worst case there are six UTF-8 bytes per UChar.
|
||||
// obsolete 6 byte form fd + 5 trails maps to fffd
|
||||
// obsolete 5 byte form fc + 4 trails maps to fffd
|
||||
// non-shortest 4 byte forms maps to fffd
|
||||
// normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
|
||||
// mapToUChars array size must allow for the worst case, 6.
|
||||
// This could be brought down to 4, by treating fd and fc as pure illegal,
|
||||
// rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
|
||||
// Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
|
||||
// Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
|
||||
// to two UChars.)
|
||||
// The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
|
||||
// is a three-byte sequence (truncated four-byte sequence).
|
||||
//
|
||||
enum { UTF8_TEXT_CHUNK_SIZE=32 };
|
||||
|
||||
|
@ -895,7 +891,7 @@ struct UTF8Buf {
|
|||
// Requires two extra slots,
|
||||
// one for a supplementary starting in the last normal position,
|
||||
// and one for an entry for the buffer limit position.
|
||||
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
|
||||
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
|
||||
// corresponding offset in filled part of buf.
|
||||
int32_t align;
|
||||
};
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: utf_impl.c
|
||||
* file name: utf_impl.cpp
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
|
@ -27,7 +27,6 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/utf.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf_old.h"
|
||||
#include "uassert.h"
|
||||
|
||||
/*
|
||||
|
@ -55,10 +54,6 @@
|
|||
* - SUB AX, BX (result)
|
||||
* -finish:
|
||||
* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
|
||||
*
|
||||
* In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
|
||||
* lead bytes above 0xf4 are illegal.
|
||||
* We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
|
||||
*/
|
||||
extern "C" U_EXPORT const uint8_t
|
||||
utf8_countTrailBytes[256]={
|
||||
|
@ -77,24 +72,24 @@ utf8_countTrailBytes[256]={
|
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
// illegal C0 & C1
|
||||
// 2-byte lead bytes C2..DF
|
||||
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
|
||||
// 3-byte lead bytes E0..EF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3,
|
||||
3, 3, 3, /* illegal in Unicode */
|
||||
4, 4, 4, 4, /* illegal in Unicode */
|
||||
5, 5, /* illegal in Unicode */
|
||||
0, 0 /* illegal bytes 0xfe and 0xff */
|
||||
// 4-byte lead bytes F0..F4
|
||||
// illegal F5..FF
|
||||
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
static const UChar32
|
||||
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
static const UChar32
|
||||
utf8_errorValue[6]={
|
||||
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
|
||||
0x3ffffff, 0x7fffffff
|
||||
// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
|
||||
// but without relying on the obsolete unicode/utf_old.h.
|
||||
0x15, 0x9f, 0xffff,
|
||||
0x10ffff
|
||||
};
|
||||
|
||||
static UChar32
|
||||
|
@ -134,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
|
|||
*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
|
||||
// *pi is one after byte c.
|
||||
int32_t i=*pi;
|
||||
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
|
||||
U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
|
||||
if(i+count<=length || length<0) {
|
||||
uint8_t trail;
|
||||
|
||||
U8_MASK_LEAD_BYTE(c, count);
|
||||
/* support NUL-terminated strings: do not read beyond the first non-trail byte */
|
||||
switch(count) {
|
||||
/* each branch falls through to the next one */
|
||||
case 0:
|
||||
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
|
||||
case 5:
|
||||
case 4:
|
||||
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
|
||||
break;
|
||||
case 3:
|
||||
trail=s[i++]-0x80;
|
||||
c=(c<<6)|trail;
|
||||
/* c>=0x110 would result in code point>0x10ffff, outside Unicode */
|
||||
if(c>=0x110 || trail>0x3f) { break; }
|
||||
U_FALLTHROUGH;
|
||||
case 2:
|
||||
trail=s[i++]-0x80;
|
||||
c=(c<<6)|trail;
|
||||
/*
|
||||
* test for a surrogate d800..dfff unless we are lenient:
|
||||
* before the last (c<<6), a surrogate is c=360..37f
|
||||
*/
|
||||
if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
|
||||
U_FALLTHROUGH;
|
||||
case 1:
|
||||
trail=s[i++]-0x80;
|
||||
c=(c<<6)|trail;
|
||||
if(trail>0x3f) { break; }
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10) */
|
||||
if(c>=utf8_minLegal[count] &&
|
||||
/* strict: forbid non-characters like U+fffe */
|
||||
(strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
|
||||
// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
|
||||
if(i==length || c>0xf4) {
|
||||
// end of string, or not a lead byte
|
||||
} else if(c>=0xf0) {
|
||||
// Test for 4-byte sequences first because
|
||||
// U8_NEXT() handles shorter valid sequences inline.
|
||||
uint8_t t1=s[i], t2, t3;
|
||||
c&=7;
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
|
||||
++i!=length && (t2=s[i]-0x80)<=0x3f &&
|
||||
++i!=length && (t3=s[i]-0x80)<=0x3f) {
|
||||
++i;
|
||||
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
|
||||
// strict: forbid non-characters like U+fffe
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
*pi=i;
|
||||
return c;
|
||||
}
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
} else {
|
||||
/* too few bytes left */
|
||||
count=length-i;
|
||||
}
|
||||
} else if(c>=0xe0) {
|
||||
c&=0xf;
|
||||
if(strict!=-2) {
|
||||
uint8_t t1=s[i], t2;
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
|
||||
++i!=length && (t2=s[i]-0x80)<=0x3f) {
|
||||
++i;
|
||||
c=(c<<12)|((t1&0x3f)<<6)|t2;
|
||||
// strict: forbid non-characters like U+fffe
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
*pi=i;
|
||||
return c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// strict=-2 -> lenient: allow surrogates
|
||||
uint8_t t1=s[i]-0x80, t2;
|
||||
if(t1<=0x3f && (c>0 || t1>=0x20) &&
|
||||
++i!=length && (t2=s[i]-0x80)<=0x3f) {
|
||||
*pi=i+1;
|
||||
return (c<<12)|(t1<<6)|t2;
|
||||
}
|
||||
}
|
||||
} else if(c>=0xc2) {
|
||||
uint8_t t1=s[i]-0x80;
|
||||
if(t1<=0x3f) {
|
||||
*pi=i+1;
|
||||
return ((c-0xc0)<<6)|t1;
|
||||
}
|
||||
} // else 0x80<=c<0xc2 is not a lead byte
|
||||
|
||||
/* error handling */
|
||||
i=*pi;
|
||||
while(count>0 && U8_IS_TRAIL(s[i])) {
|
||||
++i;
|
||||
--count;
|
||||
}
|
||||
c=errorValue(i-*pi, strict);
|
||||
*pi=i;
|
||||
return c;
|
||||
|
@ -232,7 +225,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
|
|||
s+=i;
|
||||
offset=0;
|
||||
c=utf8_errorValue[length-1];
|
||||
UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
|
||||
U8_APPEND_UNSAFE(s, offset, c);
|
||||
i=i+offset;
|
||||
}
|
||||
}
|
||||
|
@ -241,99 +234,99 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
|
|||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
|
||||
// *pi is the index of byte c.
|
||||
int32_t i=*pi;
|
||||
uint8_t b, count=1, shift=6;
|
||||
|
||||
if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
|
||||
|
||||
/* extract value bits from the last trail byte */
|
||||
c&=0x3f;
|
||||
|
||||
for(;;) {
|
||||
if(i<=start) {
|
||||
/* no lead byte at all */
|
||||
return errorValue(0, strict);
|
||||
}
|
||||
|
||||
/* read another previous byte */
|
||||
b=s[--i];
|
||||
if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
|
||||
if(b&0x40) {
|
||||
/* lead byte, this will always end the loop */
|
||||
uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
|
||||
|
||||
if(count==shouldCount) {
|
||||
/* set the new position */
|
||||
*pi=i;
|
||||
U8_MASK_LEAD_BYTE(b, count);
|
||||
c|=(UChar32)b<<shift;
|
||||
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
|
||||
/* illegal sequence or (strict and non-character) */
|
||||
if(count>=4) {
|
||||
count=3;
|
||||
if(U8_IS_TRAIL(c) && i>start) {
|
||||
uint8_t b1=s[--i];
|
||||
if(0xc2<=b1 && b1<0xe0) {
|
||||
*pi=i;
|
||||
return ((b1-0xc0)<<6)|(c&0x3f);
|
||||
} else if(U8_IS_TRAIL(b1) && i>start) {
|
||||
// Extract the value bits from the last trail byte.
|
||||
c&=0x3f;
|
||||
uint8_t b2=s[--i];
|
||||
if(0xe0<=b2 && b2<0xf0) {
|
||||
b2&=0xf;
|
||||
if(strict!=-2) {
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
*pi=i;
|
||||
c=(b2<<12)|((b1&0x3f)<<6)|c;
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
return c;
|
||||
} else {
|
||||
// strict: forbid non-characters like U+fffe
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
c=errorValue(count, strict);
|
||||
} else {
|
||||
/* exit with correct c */
|
||||
}
|
||||
} else {
|
||||
/* the lead byte does not match the number of trail bytes */
|
||||
/* only set the position to the lead byte if it would
|
||||
include the trail byte that we started with */
|
||||
if(count<shouldCount) {
|
||||
// strict=-2 -> lenient: allow surrogates
|
||||
b1-=0x80;
|
||||
if((b2>0 || b1>=0x20)) {
|
||||
*pi=i;
|
||||
c=errorValue(count, strict);
|
||||
} else {
|
||||
c=errorValue(0, strict);
|
||||
return (b2<<12)|(b1<<6)|c;
|
||||
}
|
||||
}
|
||||
break;
|
||||
} else if(count<5) {
|
||||
/* trail byte */
|
||||
c|=(UChar32)(b&0x3f)<<shift;
|
||||
++count;
|
||||
shift+=6;
|
||||
} else {
|
||||
/* more than 5 trail bytes is illegal */
|
||||
c=errorValue(0, strict);
|
||||
break;
|
||||
} else if(U8_IS_TRAIL(b2) && i>start) {
|
||||
uint8_t b3=s[--i];
|
||||
if(0xf0<=b3 && b3<=0xf4) {
|
||||
b3&=7;
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||||
*pi=i;
|
||||
c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
|
||||
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
|
||||
return c;
|
||||
} else {
|
||||
// strict: forbid non-characters like U+fffe
|
||||
return errorValue(3, strict);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(2, strict);
|
||||
}
|
||||
} else {
|
||||
/* single-byte character precedes trailing bytes */
|
||||
c=errorValue(0, strict);
|
||||
break;
|
||||
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
*pi=i;
|
||||
return errorValue(1, strict);
|
||||
}
|
||||
}
|
||||
return c;
|
||||
return errorValue(0, strict);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
|
||||
/* i had been decremented once before the function call */
|
||||
int32_t I=i, Z;
|
||||
uint8_t b;
|
||||
|
||||
/* read at most the 6 bytes s[Z] to s[i], inclusively */
|
||||
if(I-5>start) {
|
||||
Z=I-5;
|
||||
} else {
|
||||
Z=start;
|
||||
}
|
||||
|
||||
/* return I if the sequence starting there is long enough to include i */
|
||||
do {
|
||||
b=s[I];
|
||||
if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
|
||||
break;
|
||||
} else if(b>=0xc0) {
|
||||
if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
|
||||
return I;
|
||||
} else {
|
||||
break;
|
||||
// Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
|
||||
int32_t orig_i=i;
|
||||
uint8_t c=s[i];
|
||||
if(U8_IS_TRAIL(c) && i>start) {
|
||||
uint8_t b1=s[--i];
|
||||
if(0xc2<=b1 && b1<0xe0) {
|
||||
return i;
|
||||
} else if(U8_IS_TRAIL(b1) && i>start) {
|
||||
uint8_t b2=s[--i];
|
||||
if(0xe0<=b2 && b2<0xf0) {
|
||||
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
|
||||
return i;
|
||||
}
|
||||
} else if(U8_IS_TRAIL(b2) && i>start) {
|
||||
uint8_t b3=s[--i];
|
||||
if(0xf0<=b3 && b3<=0xf4) {
|
||||
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
|
||||
// Truncated 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
|
||||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
|
||||
// Truncated 3- or 4-byte sequence.
|
||||
return i;
|
||||
}
|
||||
} while(Z<=--I);
|
||||
|
||||
/* return i itself to be consistent with the FWD_1 macro */
|
||||
return i;
|
||||
}
|
||||
return orig_i;
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#define __UTRIE2_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/utf8.h"
|
||||
#include "putilimp.h"
|
||||
#include "udataswp.h"
|
||||
|
||||
|
@ -54,6 +55,8 @@ typedef struct UTrie UTrie;
|
|||
* is truncated, omitting both the BMP portion and the high range.
|
||||
* - There is a special small index for 2-byte UTF-8, and the initial data
|
||||
* entries are designed for fast 1/2-byte UTF-8 lookup.
|
||||
* Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
|
||||
* and the associated 2-byte indexes are unused.
|
||||
*/
|
||||
|
||||
/**
|
||||
|
@ -933,29 +936,29 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
|
|||
/** Internal UTF-8 next-post-increment: get the next code point's data. */
|
||||
#define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
|
||||
uint8_t __lead=(uint8_t)*(src)++; \
|
||||
if(__lead<0xc0) { \
|
||||
if(U8_IS_SINGLE(__lead)) { \
|
||||
(result)=(trie)->ascii[__lead]; \
|
||||
} else { \
|
||||
uint8_t __t1, __t2; \
|
||||
if( /* handle U+0000..U+07FF inline */ \
|
||||
__lead<0xe0 && (src)<(limit) && \
|
||||
if( /* handle U+0800..U+FFFF inline */ \
|
||||
0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
|
||||
U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
|
||||
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
|
||||
) { \
|
||||
(src)+=2; \
|
||||
(result)=(trie)->data[ \
|
||||
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
|
||||
((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
|
||||
<<UTRIE2_INDEX_SHIFT)+ \
|
||||
(__t2&UTRIE2_DATA_MASK)]; \
|
||||
} else if( /* handle U+0080..U+07FF inline */ \
|
||||
__lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
|
||||
(__t1=(uint8_t)(*(src)-0x80))<=0x3f \
|
||||
) { \
|
||||
++(src); \
|
||||
(result)=(trie)->data[ \
|
||||
(trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
|
||||
__t1]; \
|
||||
} else if( /* handle U+0000..U+CFFF inline */ \
|
||||
__lead<0xed && ((src)+1)<(limit) && \
|
||||
(__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \
|
||||
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
|
||||
) { \
|
||||
(src)+=2; \
|
||||
(result)=(trie)->data[ \
|
||||
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
|
||||
(__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
|
||||
<<UTRIE2_INDEX_SHIFT)+ \
|
||||
(__t2&UTRIE2_DATA_MASK)]; \
|
||||
} else { \
|
||||
int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
|
||||
(const uint8_t *)(limit)); \
|
||||
|
@ -968,7 +971,7 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
|
|||
/** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
|
||||
#define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
|
||||
uint8_t __b=(uint8_t)*--(src); \
|
||||
if(__b<0x80) { \
|
||||
if(U8_IS_SINGLE(__b)) { \
|
||||
(result)=(trie)->ascii[__b]; \
|
||||
} else { \
|
||||
int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
||||
|
@ -12,6 +12,8 @@
|
|||
# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
|
||||
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
|
||||
!!quoted_literals_only;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
#
|
||||
|
@ -78,42 +80,6 @@ $Prepend [^$Control $CR $LF];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
$LF $CR;
|
||||
($L | $V | $LV | $LVT) $L;
|
||||
($V | $T) ($LV | $V);
|
||||
$T ($LVT | $T);
|
||||
|
||||
# GB 9
|
||||
($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed.
|
||||
|
||||
# GB 9a
|
||||
$SpacingMark [^$Control $CR $LF];
|
||||
|
||||
# GB 9b
|
||||
[^$Control $CR $LF] $Prepend;
|
||||
|
||||
# GB 10
|
||||
$E_Modifier $Extend* ($E_Base | $E_Base_GAZ);
|
||||
|
||||
# GB 11 Don't break between ZWJ and Glue_After_ZWJ
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $Extend* ($Extended_Pict | $EmojiNRK);
|
||||
|
||||
# GB 12-13. Going backwards, we must scan through any number of regional indicators as pairs.
|
||||
#
|
||||
[{bof} $Extend $ZWJ $SpacingMark] $Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)+ [{eof}[^$Regional_Indicator]];
|
||||
[{bof} $Extend $ZWJ $SpacingMark] $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)+ [{eof}[^$Regional_Indicator]];
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
$Regional_Indicator $Prepend;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -25,6 +25,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -334,209 +335,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -544,7 +342,6 @@ $EM $CM* $EB;
|
|||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -571,19 +368,3 @@ $CM* ($HY | $BA) $CM* $HL;
|
|||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -30,6 +30,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -343,220 +344,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 20.09 added rule for Finnish tailoring
|
||||
$AL ($HY | $HH) / $SP;
|
||||
|
||||
# LB 21
|
||||
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -582,20 +375,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
|
|||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -32,6 +33,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -345,212 +347,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
# Don't include $NSX here
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
# Line Loose tailoring: Don't include NSX here.
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -558,7 +354,6 @@ $EM $CM* $EB;
|
|||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -584,20 +379,3 @@ $CM* ($HY | $BA) $CM* $HL;
|
|||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -39,6 +39,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -360,226 +361,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
# Do not include $EXX here
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
# Don't include $BAX or $NSX here
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a Don't break after Hebrew + Hyphen.
|
||||
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
# Do not include $POX here
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
# Do not include $PRX here
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
# Do not include $PRX here
|
||||
($ALPlus | $HL) $CM* ($PR | $PO | $POX);
|
||||
($PR | $PO | $POX) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
# Here do not include $POX at the beginning or $PRX at the end
|
||||
($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
# Do not include $POX or $PRX here
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
# Line Loose tailoring: Don't include NSX here.
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -605,20 +392,3 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
|
|||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -28,6 +28,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -345,215 +346,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 20.09 added rule for Finnish tailoring
|
||||
$AL ($HY | $HH) / $SP;
|
||||
|
||||
# LB 21
|
||||
# Don't include $NSX here
|
||||
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
# Line Loose tailoring: Don't include NSX here.
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -561,7 +353,6 @@ $EM $CM* $EB;
|
|||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -587,20 +378,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
|
|||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -29,6 +29,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -338,217 +339,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -574,20 +370,3 @@ $CM* ($HY | $BA) $CM* $HL;
|
|||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -30,6 +30,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -344,219 +345,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
# Don't include $NSX here
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
# Don't include $BAX or $NSX here
|
||||
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a Don't break after Hebrew + Hyphen.
|
||||
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
@ -582,20 +376,3 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
|
|||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -28,6 +28,7 @@
|
|||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -341,213 +342,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
|||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
# LB 9 Combining Marks.
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
|
||||
^$CM+ $CAN_CM?;
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 ZW SP* <break>
|
||||
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
|
||||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
^$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
#
|
||||
$WJ $CM* $CAN_CM;
|
||||
$WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12a
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
|
||||
|
||||
# LB 12
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$CP $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$CP [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
. $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
|
||||
|
||||
|
||||
# LB 15
|
||||
$OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$NS $SP* $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$QU $CM* $CAN_CM; # . x QU
|
||||
$QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 20.09 added rule for Finnish tailoring
|
||||
$AL ($HY | $HH) / $SP;
|
||||
|
||||
# LB 21
|
||||
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB21b (reverse)
|
||||
$HL $CM* $SY;
|
||||
|
||||
# LB 22
|
||||
$IN $CM* ($ALPlus | $HL);
|
||||
$IN $CM* $EX;
|
||||
$IN $CM* ($ID | $EB | $EM);
|
||||
$IN $CM* $IN;
|
||||
$IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB23a
|
||||
($ID | $EB | $EM) $CM* $PR;
|
||||
$PO $CM* ($ID | $EB | $EM);
|
||||
|
||||
# LB 24
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
($JT | $JV) $CM* ($H2 | $JV);
|
||||
$JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
|
||||
# LB 29
|
||||
($ALPlus | $HL) $CM* $IS;
|
||||
|
||||
# LB 30
|
||||
$OP $CM* ($ALPlus | $HL | $NU);
|
||||
($ALPlus | $HL | $NU) $CM* $CP;
|
||||
|
||||
# LB 30a
|
||||
# Pairs of Regional Indicators.
|
||||
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
|
||||
# the second with an even number. Stripping away the cruft they look like
|
||||
# [^RI] RI / (RI RI)+ ^RI;
|
||||
# [^RI] RI RI / (RI RI)+ ^RI;
|
||||
#
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
|
||||
|
||||
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
|
||||
$RI $CM* $RI;
|
||||
|
||||
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
|
||||
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
|
||||
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EM $CM* $EB;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
@ -580,20 +374,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
|
|||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
|
@ -12,6 +11,7 @@
|
|||
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
|
||||
#
|
||||
|
||||
!!quoted_literals_only;
|
||||
|
||||
#
|
||||
# Character categories as defined in TR 29
|
||||
|
@ -85,22 +85,13 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
!!safe_reverse;
|
||||
|
||||
$SpEx_R = ($Extend | $Format)* $Sp;
|
||||
$ATermEx_R = ($Extend | $Format)* $ATerm;
|
||||
$STermEx_R = ($Extend | $Format)* $STerm;
|
||||
$CloseEx_R = ($Extend | $Format)* $Close;
|
||||
|
||||
#
|
||||
# Reverse rules.
|
||||
# For now, use the old style inexact reverse rules, which are easier
|
||||
# to write, but less efficient.
|
||||
# TODO: exact reverse rules. It appears that exact reverse rules
|
||||
# may require improving support for look-ahead breaks in the
|
||||
# builder. Needs more investigation.
|
||||
#
|
||||
|
||||
[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
|
||||
#.*;
|
||||
|
||||
|
@ -112,9 +103,9 @@ $CloseEx_R = ($Extend | $Format)* $Close;
|
|||
# The preceding $Sep, which will be the second one that the rule matches.
|
||||
# Any immediately preceding STerm or ATerm sequences. We need to see these
|
||||
# to get the correct rule status when moving forwards again.
|
||||
#
|
||||
#
|
||||
# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
|
||||
# the entire string.
|
||||
# the entire string. TODO: can bof be replaced with ^
|
||||
#
|
||||
# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be
|
||||
# at the beginning of the string at this point, and we don't want to fail.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
#
|
||||
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
|
@ -12,6 +12,7 @@
|
|||
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
|
||||
#
|
||||
|
||||
!!quoted_literals_only;
|
||||
|
||||
#
|
||||
# Character categories as defined in TR 29
|
||||
|
@ -85,7 +86,7 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
!!safe_reverse;
|
||||
|
||||
$SpEx_R = ($Extend | $Format)* $Sp;
|
||||
$ATermEx_R = ($Extend | $Format)* $ATerm;
|
||||
|
@ -102,7 +103,6 @@ $CloseEx_R = ($Extend | $Format)* $Close;
|
|||
#
|
||||
|
||||
[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
|
||||
#.*;
|
||||
|
||||
# Explanation for this rule:
|
||||
#
|
||||
|
@ -112,7 +112,7 @@ $CloseEx_R = ($Extend | $Format)* $Close;
|
|||
# The preceding $Sep, which will be the second one that the rule matches.
|
||||
# Any immediately preceding STerm or ATerm sequences. We need to see these
|
||||
# to get the correct rule status when moving forwards again.
|
||||
#
|
||||
#
|
||||
# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
|
||||
# the entire string.
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
# Copyright (c) 2002-2015, International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
|
@ -7,6 +7,7 @@
|
|||
# Title Casing Break Rules
|
||||
#
|
||||
|
||||
!!quoted_literals_only;
|
||||
|
||||
$CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
|
||||
$Cased = [[:Upper_Case:][:Lower_Case:][:Lt:] - $CaseIgnorable];
|
||||
|
@ -27,19 +28,6 @@ $NotCased = [[^ $Cased] - $CaseIgnorable];
|
|||
$Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
|
||||
|
||||
|
||||
# Reverse Rules
|
||||
!!reverse;
|
||||
|
||||
# Normal Rule, will work nearly universally, so long as there is a
|
||||
# start-of-word preceding the current iteration position.
|
||||
|
||||
($NotCased | $CaseIgnorable)* ($Cased | $CaseIgnorable)* $Cased;
|
||||
|
||||
# Short rule, will be effective only when moving to the start of text,
|
||||
# with no word (cased character) preceding the current iteration position.
|
||||
|
||||
($NotCased | $CaseIgnorable)*;
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# Safe Reverse: the exact forward rule must not start in the middle
|
||||
|
@ -47,10 +35,3 @@ $Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
|
|||
# leaving it just before the start of a word.
|
||||
|
||||
($Cased | $CaseIgnorable)*;
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Safe Forward, nothing needs to be done, the exact Reverse rules will
|
||||
# always find valid boundaries from any starting position.
|
||||
# Still, some rule is needed, so '.', a one character movement.
|
||||
.;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
|
@ -22,6 +22,7 @@
|
|||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
|
||||
#
|
||||
|
@ -194,95 +195,6 @@ $HangulSyllable $HangulSyllable {200};
|
|||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
|
||||
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
|
||||
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
|
||||
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
|
||||
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
|
||||
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
|
||||
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
|
||||
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
|
||||
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
|
||||
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# rule 3
|
||||
$LF $CR;
|
||||
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ;
|
||||
|
||||
# rule 4
|
||||
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
|
||||
|
||||
# rule 5
|
||||
|
||||
($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
||||
# rule 7a
|
||||
$BackSingle_QuoteEx $BackHebrew_LetterEx;
|
||||
|
||||
# Rule 7b and 7c
|
||||
$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
|
||||
|
||||
# rule 8
|
||||
|
||||
$BackNumericEx $BackNumericEx;
|
||||
|
||||
# rule 9
|
||||
|
||||
$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
||||
# rule 10
|
||||
|
||||
($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
|
||||
|
||||
# rule 13
|
||||
|
||||
$BackKatakanaEx $BackKatakanaEx;
|
||||
|
||||
# rules 13 a/b
|
||||
#
|
||||
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
||||
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable;
|
||||
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
||||
|
||||
# rule 14
|
||||
|
||||
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
|
||||
|
||||
# rule 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
|
||||
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
@ -291,39 +203,17 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
|
|||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
|
||||
|
||||
# rule 7b
|
||||
$Double_Quote $BackHebrew_LetterEx;
|
||||
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
|
||||
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
|
||||
# rule 13c
|
||||
$BackRegional_IndicatorEx*;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# rule 4
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
|
||||
|
||||
# rule 7b
|
||||
$Double_QuoteEx $Hebrew_LetterEx;
|
||||
|
||||
# rule 11
|
||||
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
|
||||
|
||||
# rule 13c
|
||||
$Regional_IndicatorEx*;
|
||||
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word_POSIX.txt
|
||||
|
@ -22,6 +22,7 @@
|
|||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
|
||||
#
|
||||
|
@ -62,7 +63,7 @@ $Hiragana = [:Hiragana:];
|
|||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
|
@ -74,7 +75,7 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
|||
|
||||
|
||||
#
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
|
@ -154,7 +155,7 @@ $NumericEx $NumericEx {100};
|
|||
|
||||
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 11 and 12
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
|
||||
|
||||
|
@ -191,96 +192,7 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
|||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
|
||||
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
|
||||
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
|
||||
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
|
||||
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
|
||||
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
|
||||
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
|
||||
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
|
||||
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
|
||||
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# rule 3
|
||||
$LF $CR;
|
||||
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ;
|
||||
|
||||
# rule 4
|
||||
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
|
||||
|
||||
# rule 5
|
||||
|
||||
($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
||||
# rule 7a
|
||||
$BackSingle_QuoteEx $BackHebrew_LetterEx;
|
||||
|
||||
# Rule 7b and 7c
|
||||
$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
|
||||
|
||||
# rule 8
|
||||
|
||||
$BackNumericEx $BackNumericEx;
|
||||
|
||||
# rule 9
|
||||
|
||||
$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
|
||||
# rule 10
|
||||
|
||||
($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
|
||||
|
||||
# rule 13
|
||||
|
||||
$BackKatakanaEx $BackKatakanaEx;
|
||||
|
||||
# rules 13 a/b
|
||||
#
|
||||
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
||||
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable;
|
||||
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
||||
|
||||
# rule 14
|
||||
|
||||
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
|
||||
|
||||
# rule 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
|
||||
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
@ -291,39 +203,17 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
|
|||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
|
||||
|
||||
# rule 7b
|
||||
$Double_Quote $BackHebrew_LetterEx;
|
||||
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
|
||||
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
|
||||
# rule 13c
|
||||
$BackRegional_IndicatorEx*;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# rule 4
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
|
||||
|
||||
# rule 7b
|
||||
$Double_QuoteEx $Hebrew_LetterEx;
|
||||
|
||||
# rule 11
|
||||
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
|
||||
|
||||
# rule 13c
|
||||
$Regional_IndicatorEx*;
|
||||
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -201,7 +201,7 @@ tzdbNames{
|
|||
"meta:China"{
|
||||
sd{"CDT"}
|
||||
ss{"CST"}
|
||||
parseRegions{"CN", "MO", "TW"}
|
||||
parseRegions{"CN", "MO"}
|
||||
}
|
||||
"meta:Choibalsan"{
|
||||
sd{"CHOST"}
|
||||
|
@ -562,6 +562,10 @@ tzdbNames{
|
|||
"meta:Ponape"{
|
||||
ss{"PONT"}
|
||||
}
|
||||
"meta:Pyongyang"{
|
||||
ss{"KST"}
|
||||
parseRegions{"KP"}
|
||||
}
|
||||
"meta:Qyzylorda"{
|
||||
sd{"QYZST"}
|
||||
ss{"QYZT"}
|
||||
|
@ -617,6 +621,7 @@ tzdbNames{
|
|||
"meta:Taipei"{
|
||||
sd{"CDT"}
|
||||
ss{"CST"}
|
||||
parseRegions{"TW"}
|
||||
}
|
||||
"meta:Tajikistan"{
|
||||
ss{"TJT"}
|
||||
|
|
|
@ -31,9 +31,13 @@
|
|||
|
||||
static const UChar TARGET_SEP = 45; // '-'
|
||||
static const UChar VARIANT_SEP = 47; // '/'
|
||||
static const UChar ANY[] = {65,110,121,0}; // "Any"
|
||||
static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any"
|
||||
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
|
||||
static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
|
||||
static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"
|
||||
|
||||
// initial size for an Any-XXXX transform's cache of script-XXXX transforms
|
||||
// (will grow as necessary, but we don't expect to have source text with more than 7 scripts)
|
||||
#define ANY_TRANS_CACHE_INIT_SIZE 7
|
||||
|
||||
//------------------------------------------------------------
|
||||
|
||||
|
@ -186,7 +190,7 @@ AnyTransliterator::AnyTransliterator(const UnicodeString& id,
|
|||
Transliterator(id, NULL),
|
||||
targetScript(theTargetScript)
|
||||
{
|
||||
cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
|
||||
cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
return;
|
||||
}
|
||||
|
@ -212,7 +216,7 @@ AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
|
|||
{
|
||||
// Don't copy the cache contents
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
|
||||
cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
return;
|
||||
}
|
||||
|
@ -286,7 +290,7 @@ Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
|
|||
}
|
||||
if (t == NULL) {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UnicodeString sourceName(uscript_getName(source), -1, US_INV);
|
||||
UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
|
||||
UnicodeString id(sourceName);
|
||||
id.append(TARGET_SEP).append(target);
|
||||
|
||||
|
|
|
@ -8,12 +8,12 @@
|
|||
*
|
||||
* File CALENDAR.CPP
|
||||
*
|
||||
* Modification History:
|
||||
* Modification History:
|
||||
*
|
||||
* Date Name Description
|
||||
* 02/03/97 clhuang Creation.
|
||||
* 04/22/97 aliu Cleaned up, fixed memory leak, made
|
||||
* setWeekCountData() more robust.
|
||||
* 04/22/97 aliu Cleaned up, fixed memory leak, made
|
||||
* setWeekCountData() more robust.
|
||||
* Moved platform code to TPlatformUtilities.
|
||||
* 05/01/97 aliu Made equals(), before(), after() arguments const.
|
||||
* 05/20/97 aliu Changed logic of when to compute fields and time
|
||||
|
@ -26,7 +26,7 @@
|
|||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#include "utypeinfo.h" // for 'typeid' to work
|
||||
#include "utypeinfo.h" // for 'typeid' to work
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
|
@ -66,10 +66,8 @@
|
|||
#if !UCONFIG_NO_SERVICE
|
||||
static icu::ICULocaleService* gService = NULL;
|
||||
static icu::UInitOnce gServiceInitOnce = U_INITONCE_INITIALIZER;
|
||||
#endif
|
||||
|
||||
// INTERNAL - for cleanup
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool calendar_cleanup(void) {
|
||||
#if !UCONFIG_NO_SERVICE
|
||||
|
@ -82,6 +80,7 @@ static UBool calendar_cleanup(void) {
|
|||
return TRUE;
|
||||
}
|
||||
U_CDECL_END
|
||||
#endif
|
||||
|
||||
// ------------------------------------------
|
||||
//
|
||||
|
@ -93,9 +92,9 @@ U_CDECL_END
|
|||
|
||||
#if defined( U_DEBUG_CALSVC ) || defined (U_DEBUG_CAL)
|
||||
|
||||
/**
|
||||
* fldName was removed as a duplicate implementation.
|
||||
* use udbg_ services instead,
|
||||
/**
|
||||
* fldName was removed as a duplicate implementation.
|
||||
* use udbg_ services instead,
|
||||
* which depend on include files and library from ../tools/toolutil, the following circular link:
|
||||
* CPPFLAGS+=-I$(top_srcdir)/tools/toolutil
|
||||
* LIBS+=$(LIBICUTOOLUTIL)
|
||||
|
@ -123,7 +122,7 @@ void ucal_dump(const Calendar &cal) {
|
|||
void Calendar::dump() const {
|
||||
int i;
|
||||
fprintf(stderr, "@calendar=%s, timeset=%c, fieldset=%c, allfields=%c, virtualset=%c, t=%.2f",
|
||||
getType(), fIsTimeSet?'y':'n', fAreFieldsSet?'y':'n', fAreAllFieldsSet?'y':'n',
|
||||
getType(), fIsTimeSet?'y':'n', fAreFieldsSet?'y':'n', fAreAllFieldsSet?'y':'n',
|
||||
fAreFieldsVirtuallySet?'y':'n',
|
||||
fTime);
|
||||
|
||||
|
@ -135,9 +134,9 @@ void Calendar::dump() const {
|
|||
fprintf(stderr, " %25s: %-11ld", f, fFields[i]);
|
||||
if(fStamp[i] == kUnset) {
|
||||
fprintf(stderr, " (unset) ");
|
||||
} else if(fStamp[i] == kInternallySet) {
|
||||
} else if(fStamp[i] == kInternallySet) {
|
||||
fprintf(stderr, " (internally set) ");
|
||||
//} else if(fStamp[i] == kInternalDefault) {
|
||||
//} else if(fStamp[i] == kInternalDefault) {
|
||||
// fprintf(stderr, " (internal default) ");
|
||||
} else {
|
||||
fprintf(stderr, " %%%d ", fStamp[i]);
|
||||
|
@ -213,7 +212,7 @@ const SharedCalendar *LocaleCacheKey<SharedCalendar>::createObject(
|
|||
const void * /*unusedCreationContext*/, UErrorCode &status) const {
|
||||
Calendar *calendar = Calendar::makeInstance(fLoc, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
return NULL;
|
||||
}
|
||||
SharedCalendar *shared = new SharedCalendar(calendar);
|
||||
if (shared == NULL) {
|
||||
|
@ -234,7 +233,9 @@ static ECalType getCalendarType(const char *s) {
|
|||
return CALTYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
static UBool isStandardSupportedKeyword(const char *keyword, UErrorCode& status) {
|
||||
#if !UCONFIG_NO_SERVICE
|
||||
// Only used with service registration.
|
||||
static UBool isStandardSupportedKeyword(const char *keyword, UErrorCode& status) {
|
||||
if(U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -242,6 +243,7 @@ static UBool isStandardSupportedKeyword(const char *keyword, UErrorCode& status)
|
|||
return (calType != CALTYPE_UNKNOWN);
|
||||
}
|
||||
|
||||
// only used with service registration.
|
||||
static void getCalendarKeyword(const UnicodeString &id, char *targetBuffer, int32_t targetBufferSize) {
|
||||
UnicodeString calendarKeyword = UNICODE_STRING_SIMPLE("calendar=");
|
||||
int32_t calKeyLen = calendarKeyword.length();
|
||||
|
@ -255,6 +257,7 @@ static void getCalendarKeyword(const UnicodeString &id, char *targetBuffer, int3
|
|||
}
|
||||
targetBuffer[keyLen] = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static ECalType getCalendarTypeForLocale(const char *locid) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
@ -291,7 +294,7 @@ static ECalType getCalendarTypeForLocale(const char *locid) {
|
|||
if (U_FAILURE(status)) {
|
||||
return CALTYPE_GREGORIAN;
|
||||
}
|
||||
|
||||
|
||||
// Read preferred calendar values from supplementalData calendarPreference
|
||||
UResourceBundle *rb = ures_openDirect(NULL, "supplementalData", &status);
|
||||
ures_getByKey(rb, "calendarPreferenceData", rb, &status);
|
||||
|
@ -394,7 +397,7 @@ static Calendar *createStandardCalendar(ECalType calType, const Locale &loc, UEr
|
|||
// -------------------------------------
|
||||
|
||||
/**
|
||||
* a Calendar Factory which creates the "basic" calendar types, that is, those
|
||||
* a Calendar Factory which creates the "basic" calendar types, that is, those
|
||||
* shipped with ICU.
|
||||
*/
|
||||
class BasicCalendarFactory : public LocaleKeyFactory {
|
||||
|
@ -408,7 +411,7 @@ public:
|
|||
virtual ~BasicCalendarFactory();
|
||||
|
||||
protected:
|
||||
//virtual UBool isSupportedID( const UnicodeString& id, UErrorCode& status) const {
|
||||
//virtual UBool isSupportedID( const UnicodeString& id, UErrorCode& status) const {
|
||||
// if(U_FAILURE(status)) {
|
||||
// return FALSE;
|
||||
// }
|
||||
|
@ -466,7 +469,7 @@ protected:
|
|||
|
||||
BasicCalendarFactory::~BasicCalendarFactory() {}
|
||||
|
||||
/**
|
||||
/**
|
||||
* A factory which looks up the DefaultCalendar resource to determine which class of calendar to use
|
||||
*/
|
||||
|
||||
|
@ -510,7 +513,7 @@ public:
|
|||
virtual UObject* cloneInstance(UObject* instance) const {
|
||||
UnicodeString *s = dynamic_cast<UnicodeString *>(instance);
|
||||
if(s != NULL) {
|
||||
return s->clone();
|
||||
return s->clone();
|
||||
} else {
|
||||
#ifdef U_DEBUG_CALSVC_F
|
||||
UErrorCode status2 = U_ZERO_ERROR;
|
||||
|
@ -573,7 +576,7 @@ initCalendarService(UErrorCode &status)
|
|||
fprintf(stderr, "Registering classes..\n");
|
||||
#endif
|
||||
|
||||
// Register all basic instances.
|
||||
// Register all basic instances.
|
||||
gService->registerFactory(new BasicCalendarFactory(),status);
|
||||
|
||||
#ifdef U_DEBUG_CALSVC
|
||||
|
@ -589,7 +592,7 @@ initCalendarService(UErrorCode &status)
|
|||
}
|
||||
}
|
||||
|
||||
static ICULocaleService*
|
||||
static ICULocaleService*
|
||||
getCalendarService(UErrorCode &status)
|
||||
{
|
||||
umtx_initOnce(gServiceInitOnce, &initCalendarService, status);
|
||||
|
@ -743,7 +746,7 @@ fSkippedWallTime(UCAL_WALLTIME_LAST)
|
|||
return;
|
||||
}
|
||||
|
||||
clear();
|
||||
clear();
|
||||
fZone = zone;
|
||||
setWeekData(aLocale, NULL, success);
|
||||
}
|
||||
|
@ -850,7 +853,7 @@ Calendar::createInstance(const Locale& aLocale, UErrorCode& success)
|
|||
return createInstance(TimeZone::createDefault(), aLocale, success);
|
||||
}
|
||||
|
||||
// ------------------------------------- Adopting
|
||||
// ------------------------------------- Adopting
|
||||
|
||||
// Note: this is the bottleneck that actually calls the service routines.
|
||||
|
||||
|
@ -903,7 +906,7 @@ Calendar::makeInstance(const Locale& aLocale, UErrorCode& success) {
|
|||
c = (Calendar*)getCalendarService(success)->get(l, LocaleKey::KIND_ANY, &actualLoc2, success);
|
||||
|
||||
if(U_FAILURE(success) || !c) {
|
||||
if(U_SUCCESS(success)) {
|
||||
if(U_SUCCESS(success)) {
|
||||
success = U_INTERNAL_PROGRAM_ERROR; // Propagate some err
|
||||
}
|
||||
return NULL;
|
||||
|
@ -911,7 +914,7 @@ Calendar::makeInstance(const Locale& aLocale, UErrorCode& success) {
|
|||
|
||||
str = dynamic_cast<const UnicodeString*>(c);
|
||||
if(str != NULL) {
|
||||
// recursed! Second lookup returned a UnicodeString.
|
||||
// recursed! Second lookup returned a UnicodeString.
|
||||
// Perhaps DefaultCalendar{} was set to another locale.
|
||||
#ifdef U_DEBUG_CALSVC
|
||||
char tmp[200];
|
||||
|
@ -985,7 +988,7 @@ Calendar::createInstance(const TimeZone& zone, const Locale& aLocale, UErrorCode
|
|||
if(U_SUCCESS(success) && c) {
|
||||
c->setTimeZone(zone);
|
||||
}
|
||||
return c;
|
||||
return c;
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
@ -1017,7 +1020,7 @@ Calendar::operator==(const Calendar& that) const
|
|||
U_SUCCESS(status);
|
||||
}
|
||||
|
||||
UBool
|
||||
UBool
|
||||
Calendar::isEquivalentTo(const Calendar& other) const
|
||||
{
|
||||
return typeid(*this) == typeid(other) &&
|
||||
|
@ -1099,13 +1102,13 @@ Calendar::getNow()
|
|||
* Gets this Calendar's current time as a long.
|
||||
* @return the current time as UTC milliseconds from the epoch.
|
||||
*/
|
||||
double
|
||||
double
|
||||
Calendar::getTimeInMillis(UErrorCode& status) const
|
||||
{
|
||||
if(U_FAILURE(status))
|
||||
if(U_FAILURE(status))
|
||||
return 0.0;
|
||||
|
||||
if ( ! fIsTimeSet)
|
||||
if ( ! fIsTimeSet)
|
||||
((Calendar*)this)->updateTime(status);
|
||||
|
||||
/* Test for buffer overflows */
|
||||
|
@ -1124,9 +1127,9 @@ Calendar::getTimeInMillis(UErrorCode& status) const
|
|||
* when in lenient mode the out of range values are pinned to their respective min/max.
|
||||
* @param date the new time in UTC milliseconds from the epoch.
|
||||
*/
|
||||
void
|
||||
void
|
||||
Calendar::setTimeInMillis( double millis, UErrorCode& status ) {
|
||||
if(U_FAILURE(status))
|
||||
if(U_FAILURE(status))
|
||||
return;
|
||||
|
||||
if (millis > MAX_MILLIS) {
|
||||
|
@ -1154,7 +1157,7 @@ Calendar::setTimeInMillis( double millis, UErrorCode& status ) {
|
|||
fStamp[i] = kUnset;
|
||||
fIsSet[i] = FALSE;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -1479,7 +1482,7 @@ void Calendar::computeFields(UErrorCode &ec)
|
|||
double localMillis = internalGetTime();
|
||||
int32_t rawOffset, dstOffset;
|
||||
getTimeZone().getOffset(localMillis, FALSE, rawOffset, dstOffset, ec);
|
||||
localMillis += (rawOffset + dstOffset);
|
||||
localMillis += (rawOffset + dstOffset);
|
||||
|
||||
// Mark fields as set. Do this before calling handleComputeFields().
|
||||
uint32_t mask = //fInternalSetMask;
|
||||
|
@ -1488,7 +1491,7 @@ void Calendar::computeFields(UErrorCode &ec)
|
|||
(1 << UCAL_MONTH) |
|
||||
(1 << UCAL_DAY_OF_MONTH) | // = UCAL_DATE
|
||||
(1 << UCAL_DAY_OF_YEAR) |
|
||||
(1 << UCAL_EXTENDED_YEAR);
|
||||
(1 << UCAL_EXTENDED_YEAR);
|
||||
|
||||
for (int32_t i=0; i<UCAL_FIELD_COUNT; ++i) {
|
||||
if ((mask & 1) == 0) {
|
||||
|
@ -1517,7 +1520,7 @@ void Calendar::computeFields(UErrorCode &ec)
|
|||
#if defined (U_DEBUG_CAL)
|
||||
//fprintf(stderr, "%s:%d- Hmm! Jules @ %d, as per %.0lf millis\n",
|
||||
//__FILE__, __LINE__, fFields[UCAL_JULIAN_DAY], localMillis);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
computeGregorianAndDOWFields(fFields[UCAL_JULIAN_DAY], ec);
|
||||
|
||||
|
@ -1615,7 +1618,7 @@ void Calendar::computeGregorianFields(int32_t julianDay, UErrorCode & /* ec */)
|
|||
* proleptic Gregorian calendar, which has no field larger than a year.
|
||||
*/
|
||||
void Calendar::computeWeekFields(UErrorCode &ec) {
|
||||
if(U_FAILURE(ec)) {
|
||||
if(U_FAILURE(ec)) {
|
||||
return;
|
||||
}
|
||||
int32_t eyear = fFields[UCAL_EXTENDED_YEAR];
|
||||
|
@ -1678,7 +1681,7 @@ void Calendar::computeWeekFields(UErrorCode &ec) {
|
|||
fFields[UCAL_WEEK_OF_MONTH] = weekNumber(dayOfMonth, dayOfWeek);
|
||||
fFields[UCAL_DAY_OF_WEEK_IN_MONTH] = (dayOfMonth-1) / 7 + 1;
|
||||
#if defined (U_DEBUG_CAL)
|
||||
if(fFields[UCAL_DAY_OF_WEEK_IN_MONTH]==0) fprintf(stderr, "%s:%d: DOWIM %d on %g\n",
|
||||
if(fFields[UCAL_DAY_OF_WEEK_IN_MONTH]==0) fprintf(stderr, "%s:%d: DOWIM %d on %g\n",
|
||||
__FILE__, __LINE__,fFields[UCAL_DAY_OF_WEEK_IN_MONTH], fTime);
|
||||
#endif
|
||||
}
|
||||
|
@ -1723,7 +1726,7 @@ void Calendar::handleComputeFields(int32_t /* julianDay */, UErrorCode &/* statu
|
|||
// -------------------------------------
|
||||
|
||||
|
||||
void Calendar::roll(EDateFields field, int32_t amount, UErrorCode& status)
|
||||
void Calendar::roll(EDateFields field, int32_t amount, UErrorCode& status)
|
||||
{
|
||||
roll((UCalendarDateFields)field, amount, status);
|
||||
}
|
||||
|
@ -2061,7 +2064,7 @@ void Calendar::roll(UCalendarDateFields field, int32_t amount, UErrorCode& statu
|
|||
default:
|
||||
// Other fields cannot be rolled by this method
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d: ILLEGAL ARG because of roll on non-rollable field %s\n",
|
||||
fprintf(stderr, "%s:%d: ILLEGAL ARG because of roll on non-rollable field %s\n",
|
||||
__FILE__, __LINE__,fldName(field));
|
||||
#endif
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -2252,7 +2255,7 @@ void Calendar::add(UCalendarDateFields field, int32_t amount, UErrorCode& status
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
|
@ -2617,7 +2620,7 @@ Calendar::isWeekend(void) const
|
|||
|
||||
// ------------------------------------- limits
|
||||
|
||||
int32_t
|
||||
int32_t
|
||||
Calendar::getMinimum(EDateFields field) const {
|
||||
return getLimit((UCalendarDateFields) field,UCAL_LIMIT_MINIMUM);
|
||||
}
|
||||
|
@ -2668,7 +2671,7 @@ Calendar::getLeastMaximum(UCalendarDateFields field) const
|
|||
}
|
||||
|
||||
// -------------------------------------
|
||||
int32_t
|
||||
int32_t
|
||||
Calendar::getActualMinimum(EDateFields field, UErrorCode& status) const
|
||||
{
|
||||
return getActualMinimum((UCalendarDateFields) field, status);
|
||||
|
@ -2744,7 +2747,7 @@ Calendar::getActualMinimum(UCalendarDateFields field, UErrorCode& status) const
|
|||
work->set(field, fieldValue);
|
||||
if (work->get(field, status) != fieldValue) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
result = fieldValue;
|
||||
fieldValue--;
|
||||
|
@ -2800,7 +2803,7 @@ void Calendar::validateField(UCalendarDateFields field, UErrorCode &status) {
|
|||
case UCAL_DAY_OF_WEEK_IN_MONTH:
|
||||
if (internalGet(field) == 0) {
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d: ILLEGAL ARG because DOW in month cannot be 0\n",
|
||||
fprintf(stderr, "%s:%d: ILLEGAL ARG because DOW in month cannot be 0\n",
|
||||
__FILE__, __LINE__);
|
||||
#endif
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR; // "DAY_OF_WEEK_IN_MONTH cannot be zero"
|
||||
|
@ -2826,7 +2829,7 @@ void Calendar::validateField(UCalendarDateFields field, int32_t min, int32_t max
|
|||
int32_t value = fFields[field];
|
||||
if (value < min || value > max) {
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d: ILLEGAL ARG because of field %s out of range %d..%d at %d\n",
|
||||
fprintf(stderr, "%s:%d: ILLEGAL ARG because of field %s out of range %d..%d at %d\n",
|
||||
__FILE__, __LINE__,fldName(field),min,max,value);
|
||||
#endif
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -2892,7 +2895,7 @@ linesInGroup:
|
|||
}
|
||||
|
||||
const UFieldResolutionTable Calendar::kDatePrecedence[] =
|
||||
{
|
||||
{
|
||||
{
|
||||
{ UCAL_DAY_OF_MONTH, kResolveSTOP },
|
||||
{ UCAL_WEEK_OF_YEAR, UCAL_DAY_OF_WEEK, kResolveSTOP },
|
||||
|
@ -2913,12 +2916,12 @@ const UFieldResolutionTable Calendar::kDatePrecedence[] =
|
|||
{ kResolveRemap | UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DAY_OF_WEEK, kResolveSTOP },
|
||||
{ kResolveRemap | UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DOW_LOCAL, kResolveSTOP },
|
||||
{ kResolveSTOP }
|
||||
},
|
||||
},
|
||||
{{kResolveSTOP}}
|
||||
};
|
||||
|
||||
|
||||
const UFieldResolutionTable Calendar::kDOWPrecedence[] =
|
||||
const UFieldResolutionTable Calendar::kDOWPrecedence[] =
|
||||
{
|
||||
{
|
||||
{ UCAL_DAY_OF_WEEK,kResolveSTOP, kResolveSTOP },
|
||||
|
@ -2929,7 +2932,7 @@ const UFieldResolutionTable Calendar::kDOWPrecedence[] =
|
|||
};
|
||||
|
||||
// precedence for calculating a year
|
||||
const UFieldResolutionTable Calendar::kYearPrecedence[] =
|
||||
const UFieldResolutionTable Calendar::kYearPrecedence[] =
|
||||
{
|
||||
{
|
||||
{ UCAL_YEAR, kResolveSTOP },
|
||||
|
@ -2966,7 +2969,7 @@ void Calendar::computeTime(UErrorCode& status) {
|
|||
// }
|
||||
#endif
|
||||
|
||||
int32_t millisInDay;
|
||||
double millisInDay;
|
||||
|
||||
// We only use MILLISECONDS_IN_DAY if it has been set by the user.
|
||||
// This makes it possible for the caller to set the calendar to a
|
||||
|
@ -3086,10 +3089,10 @@ UBool Calendar::getImmediatePreviousZoneTransition(UDate base, UDate *transition
|
|||
* reflects local zone wall time.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
int32_t Calendar::computeMillisInDay() {
|
||||
double Calendar::computeMillisInDay() {
|
||||
// Do the time portion of the conversion.
|
||||
|
||||
int32_t millisInDay = 0;
|
||||
double millisInDay = 0;
|
||||
|
||||
// Find the best set of fields specifying the time of day. There
|
||||
// are only two possibilities here; the HOUR_OF_DAY or the
|
||||
|
@ -3131,7 +3134,7 @@ int32_t Calendar::computeMillisInDay() {
|
|||
* or range.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
int32_t Calendar::computeZoneOffset(double millis, int32_t millisInDay, UErrorCode &ec) {
|
||||
int32_t Calendar::computeZoneOffset(double millis, double millisInDay, UErrorCode &ec) {
|
||||
int32_t rawOffset, dstOffset;
|
||||
UDate wall = millis + millisInDay;
|
||||
BasicTimeZone* btz = getBasicTimeZone();
|
||||
|
@ -3178,7 +3181,7 @@ int32_t Calendar::computeZoneOffset(double millis, int32_t millisInDay, UErrorCo
|
|||
return rawOffset + dstOffset;
|
||||
}
|
||||
|
||||
int32_t Calendar::computeJulianDay()
|
||||
int32_t Calendar::computeJulianDay()
|
||||
{
|
||||
// We want to see if any of the date fields is newer than the
|
||||
// JULIAN_DAY. If not, then we use JULIAN_DAY. If so, then we do
|
||||
|
@ -3220,9 +3223,9 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
|
|||
internalSet(UCAL_EXTENDED_YEAR, year);
|
||||
}
|
||||
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d: bestField= %s - y=%d\n", __FILE__, __LINE__, fldName(bestField), year);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Get the Julian day of the day BEFORE the start of this year.
|
||||
// If useMonth is true, get the day before the start of the month.
|
||||
|
@ -3304,9 +3307,9 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
|
|||
date += ((monthLength - date) / 7 + dim + 1) * 7;
|
||||
}
|
||||
} else {
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d - bf= %s\n", __FILE__, __LINE__, fldName(bestField));
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if(bestField == UCAL_WEEK_OF_YEAR) { // ------------------------------------- WOY -------------
|
||||
if(!isSet(UCAL_YEAR_WOY) || // YWOY not set at all or
|
||||
|
@ -3317,30 +3320,30 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
|
|||
int32_t woy = internalGet(bestField);
|
||||
|
||||
int32_t nextJulianDay = handleComputeMonthStart(year+1, 0, FALSE); // jd of day before jan 1
|
||||
int32_t nextFirst = julianDayToDayOfWeek(nextJulianDay + 1) - firstDayOfWeek;
|
||||
int32_t nextFirst = julianDayToDayOfWeek(nextJulianDay + 1) - firstDayOfWeek;
|
||||
|
||||
if (nextFirst < 0) { // 0..6 ldow of Jan 1
|
||||
nextFirst += 7;
|
||||
}
|
||||
|
||||
if(woy==1) { // FIRST WEEK ---------------------------------
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d - woy=%d, yp=%d, nj(%d)=%d, nf=%d", __FILE__, __LINE__,
|
||||
internalGet(bestField), resolveFields(kYearPrecedence), year+1,
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d - woy=%d, yp=%d, nj(%d)=%d, nf=%d", __FILE__, __LINE__,
|
||||
internalGet(bestField), resolveFields(kYearPrecedence), year+1,
|
||||
nextJulianDay, nextFirst);
|
||||
|
||||
fprintf(stderr, " next: %d DFW, min=%d \n", (7-nextFirst), getMinimalDaysInFirstWeek() );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// nextFirst is now the localized DOW of Jan 1 of y-woy+1
|
||||
if((nextFirst > 0) && // Jan 1 starts on FDOW
|
||||
(7-nextFirst) >= getMinimalDaysInFirstWeek()) // or enough days in the week
|
||||
{
|
||||
// Jan 1 of (yearWoy+1) is in yearWoy+1 - recalculate JD to next year
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d - was going to move JD from %d to %d [d%d]\n", __FILE__, __LINE__,
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d - was going to move JD from %d to %d [d%d]\n", __FILE__, __LINE__,
|
||||
julianDay, nextJulianDay, (nextJulianDay-julianDay));
|
||||
#endif
|
||||
#endif
|
||||
julianDay = nextJulianDay;
|
||||
|
||||
// recalculate 'first' [0-based local dow of jan 1]
|
||||
|
@ -3351,7 +3354,7 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
|
|||
// recalculate date.
|
||||
date = 1 - first + dowLocal;
|
||||
}
|
||||
} else if(woy>=getLeastMaximum(bestField)) {
|
||||
} else if(woy>=getLeastMaximum(bestField)) {
|
||||
// could be in the last week- find out if this JD would overstep
|
||||
int32_t testDate = date;
|
||||
if ((7 - first) < getMinimalDaysInFirstWeek()) {
|
||||
|
@ -3361,7 +3364,7 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
|
|||
// Now adjust for the week number.
|
||||
testDate += 7 * (woy - 1);
|
||||
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d - y=%d, y-1=%d doy%d, njd%d (C.F. %d)\n",
|
||||
__FILE__, __LINE__, year, year-1, testDate, julianDay+testDate, nextJulianDay);
|
||||
#endif
|
||||
|
@ -3375,7 +3378,7 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
|
|||
}
|
||||
date = 1 - first + dowLocal;
|
||||
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "%s:%d - date now %d, jd%d, ywoy%d\n",
|
||||
__FILE__, __LINE__, date, julianDay, year-1);
|
||||
#endif
|
||||
|
@ -3400,13 +3403,13 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
|
|||
}
|
||||
|
||||
int32_t
|
||||
Calendar::getDefaultMonthInYear(int32_t /*eyear*/)
|
||||
Calendar::getDefaultMonthInYear(int32_t /*eyear*/)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t
|
||||
Calendar::getDefaultDayInMonth(int32_t /*eyear*/, int32_t /*month*/)
|
||||
Calendar::getDefaultDayInMonth(int32_t /*eyear*/, int32_t /*month*/)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
@ -3436,13 +3439,13 @@ int32_t Calendar::getLocalDOW()
|
|||
|
||||
int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t woy)
|
||||
{
|
||||
// We have UCAL_YEAR_WOY and UCAL_WEEK_OF_YEAR - from those, determine
|
||||
// We have UCAL_YEAR_WOY and UCAL_WEEK_OF_YEAR - from those, determine
|
||||
// what year we fall in, so that other code can set it properly.
|
||||
// (code borrowed from computeWeekFields and handleComputeJulianDay)
|
||||
//return yearWoy;
|
||||
|
||||
// First, we need a reliable DOW.
|
||||
UCalendarDateFields bestField = resolveFields(kDatePrecedence); // !! Note: if subclasses have a different table, they should override handleGetExtendedYearFromWeekFields
|
||||
UCalendarDateFields bestField = resolveFields(kDatePrecedence); // !! Note: if subclasses have a different table, they should override handleGetExtendedYearFromWeekFields
|
||||
|
||||
// Now, a local DOW
|
||||
int32_t dowLocal = getLocalDOW(); // 0..6
|
||||
|
@ -3475,9 +3478,9 @@ int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t w
|
|||
|
||||
int32_t minDays = getMinimalDaysInFirstWeek();
|
||||
UBool jan1InPrevYear = FALSE; // January 1st in the year of WOY is the 1st week? (i.e. first week is < minimal )
|
||||
//UBool nextJan1InPrevYear = FALSE; // January 1st of Year of WOY + 1 is in the first week?
|
||||
//UBool nextJan1InPrevYear = FALSE; // January 1st of Year of WOY + 1 is in the first week?
|
||||
|
||||
if((7 - first) < minDays) {
|
||||
if((7 - first) < minDays) {
|
||||
jan1InPrevYear = TRUE;
|
||||
}
|
||||
|
||||
|
@ -3500,8 +3503,8 @@ int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t w
|
|||
return yearWoy; // in this year
|
||||
}
|
||||
}
|
||||
} else if(woy >= getLeastMaximum(bestField)) {
|
||||
// we _might_ be in the last week..
|
||||
} else if(woy >= getLeastMaximum(bestField)) {
|
||||
// we _might_ be in the last week..
|
||||
int32_t jd = // Calculate JD of our target day:
|
||||
jan1Start + // JD of Jan 1
|
||||
(7-first) + // days in the first week (Jan 1.. )
|
||||
|
@ -3538,7 +3541,7 @@ int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t w
|
|||
}
|
||||
|
||||
//(internalGet(UCAL_DATE) <= (7-first)) /* && in minDow */ ) {
|
||||
//within 1st week and in this month..
|
||||
//within 1st week and in this month..
|
||||
//return yearWoy+1;
|
||||
return yearWoy;
|
||||
|
||||
|
@ -3671,7 +3674,7 @@ void Calendar::prepareGetActual(UCalendarDateFields field, UBool isMinimum, UErr
|
|||
dow += 7;
|
||||
}
|
||||
}
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "prepareGetActualHelper(WOM/WOY) - dow=%d\n", dow);
|
||||
#endif
|
||||
set(UCAL_DAY_OF_WEEK, dow);
|
||||
|
@ -3687,7 +3690,7 @@ void Calendar::prepareGetActual(UCalendarDateFields field, UBool isMinimum, UErr
|
|||
|
||||
int32_t Calendar::getActualHelper(UCalendarDateFields field, int32_t startValue, int32_t endValue, UErrorCode &status) const
|
||||
{
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "getActualHelper(%d,%d .. %d, %s)\n", field, startValue, endValue, u_errorName(status));
|
||||
#endif
|
||||
if (startValue == endValue) {
|
||||
|
@ -3723,7 +3726,7 @@ int32_t Calendar::getActualHelper(UCalendarDateFields field, int32_t startValue,
|
|||
int32_t result = startValue;
|
||||
if ((work->get(field, status) != startValue
|
||||
&& field != UCAL_WEEK_OF_MONTH && delta > 0 ) || U_FAILURE(status)) {
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "getActualHelper(fld %d) - got %d (not %d) - %s\n", field, work->get(field,status), startValue, u_errorName(status));
|
||||
#endif
|
||||
} else {
|
||||
|
@ -3740,7 +3743,7 @@ int32_t Calendar::getActualHelper(UCalendarDateFields field, int32_t startValue,
|
|||
} while (startValue != endValue);
|
||||
}
|
||||
delete work;
|
||||
#if defined (U_DEBUG_CAL)
|
||||
#if defined (U_DEBUG_CAL)
|
||||
fprintf(stderr, "getActualHelper(%d) = %d\n", field, result);
|
||||
#endif
|
||||
return result;
|
||||
|
@ -3767,18 +3770,18 @@ Calendar::setWeekData(const Locale& desiredLocale, const char *type, UErrorCode&
|
|||
// Since week and weekend data is territory based instead of language based,
|
||||
// we may need to tweak the locale that we are using to try to get the appropriate
|
||||
// values, using the following logic:
|
||||
// 1). If the locale has a language but no territory, use the territory as defined by
|
||||
// 1). If the locale has a language but no territory, use the territory as defined by
|
||||
// the likely subtags.
|
||||
// 2). If the locale has a script designation then we ignore it,
|
||||
// then remove it ( i.e. "en_Latn_US" becomes "en_US" )
|
||||
|
||||
|
||||
char minLocaleID[ULOC_FULLNAME_CAPACITY] = { 0 };
|
||||
UErrorCode myStatus = U_ZERO_ERROR;
|
||||
|
||||
uloc_minimizeSubtags(desiredLocale.getName(),minLocaleID,ULOC_FULLNAME_CAPACITY,&myStatus);
|
||||
Locale min = Locale::createFromName(minLocaleID);
|
||||
Locale useLocale;
|
||||
if ( uprv_strlen(desiredLocale.getCountry()) == 0 ||
|
||||
if ( uprv_strlen(desiredLocale.getCountry()) == 0 ||
|
||||
(uprv_strlen(desiredLocale.getScript()) > 0 && uprv_strlen(min.getScript()) == 0) ) {
|
||||
char maxLocaleID[ULOC_FULLNAME_CAPACITY] = { 0 };
|
||||
myStatus = U_ZERO_ERROR;
|
||||
|
@ -3788,8 +3791,8 @@ Calendar::setWeekData(const Locale& desiredLocale, const char *type, UErrorCode&
|
|||
} else {
|
||||
useLocale = Locale(desiredLocale);
|
||||
}
|
||||
|
||||
/* The code here is somewhat of a hack, since week data and weekend data aren't really tied to
|
||||
|
||||
/* The code here is somewhat of a hack, since week data and weekend data aren't really tied to
|
||||
a specific calendar, they aren't truly locale data. But this is the only place where valid and
|
||||
actual locale can be set, so we take a shot at it here by loading a representative resource
|
||||
from the calendar data. The code used to use the dateTimeElements resource to get first day
|
||||
|
@ -3865,8 +3868,8 @@ Calendar::setWeekData(const Locale& desiredLocale, const char *type, UErrorCode&
|
|||
* and areFieldsSet. Callers should check isTimeSet and only
|
||||
* call this method if isTimeSet is false.
|
||||
*/
|
||||
void
|
||||
Calendar::updateTime(UErrorCode& status)
|
||||
void
|
||||
Calendar::updateTime(UErrorCode& status)
|
||||
{
|
||||
computeTime(status);
|
||||
if(U_FAILURE(status))
|
||||
|
@ -3875,14 +3878,14 @@ Calendar::updateTime(UErrorCode& status)
|
|||
// If we are lenient, we need to recompute the fields to normalize
|
||||
// the values. Also, if we haven't set all the fields yet (i.e.,
|
||||
// in a newly-created object), we need to fill in the fields. [LIU]
|
||||
if (isLenient() || ! fAreAllFieldsSet)
|
||||
if (isLenient() || ! fAreAllFieldsSet)
|
||||
fAreFieldsSet = FALSE;
|
||||
|
||||
fIsTimeSet = TRUE;
|
||||
fAreFieldsVirtuallySet = FALSE;
|
||||
}
|
||||
|
||||
Locale
|
||||
Locale
|
||||
Calendar::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
|
||||
U_LOCALE_BASED(locBased, *this);
|
||||
return locBased.getLocale(type, status);
|
||||
|
@ -3945,4 +3948,3 @@ U_NAMESPACE_END
|
|||
|
||||
|
||||
//eof
|
||||
|
||||
|
|
|
@ -63,8 +63,10 @@
|
|||
|
||||
static icu::Locale* availableLocaleList = NULL;
|
||||
static int32_t availableLocaleListCount;
|
||||
#if !UCONFIG_NO_SERVICE
|
||||
static icu::ICULocaleService* gService = NULL;
|
||||
static icu::UInitOnce gServiceInitOnce = U_INITONCE_INITIALIZER;
|
||||
#endif
|
||||
static icu::UInitOnce gAvailableLocaleListInitOnce;
|
||||
|
||||
/**
|
||||
|
|
|
@ -224,7 +224,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
|
|||
int32_t totalSize = indexesLength * 4;
|
||||
|
||||
if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
|
||||
indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
|
||||
indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
|
||||
} else {
|
||||
indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
|
||||
}
|
||||
|
|
|
@ -607,7 +607,7 @@ CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) {
|
|||
}
|
||||
UBool firstTriple = TRUE;
|
||||
for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) {
|
||||
int32_t x = contractionCEs.elementAti(index);
|
||||
int32_t x = static_cast<int32_t>(contractionCEs.elementAti(index));
|
||||
if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; }
|
||||
int64_t cce0 = contractionCEs.elementAti(index + 1);
|
||||
int64_t cce1 = contractionCEs.elementAti(index + 2);
|
||||
|
|
|
@ -739,7 +739,7 @@ DateFormat::setBooleanAttribute(UDateFormatBooleanAttribute attr,
|
|||
UBool
|
||||
DateFormat::getBooleanAttribute(UDateFormatBooleanAttribute attr, UErrorCode &/*status*/) const {
|
||||
|
||||
return fBoolFlags.get(attr);
|
||||
return static_cast<UBool>(fBoolFlags.get(attr));
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -386,7 +386,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromUInt32(decNumber *dn, uInt uin) {
|
|||
*up=(Unit)(uin%(DECDPUNMAX+1));
|
||||
uin=uin/(DECDPUNMAX+1);
|
||||
}
|
||||
dn->digits=decGetDigits(dn->lsu, up-dn->lsu);
|
||||
dn->digits=decGetDigits(dn->lsu, static_cast<int32_t>(up - dn->lsu));
|
||||
return dn;
|
||||
} /* decNumberFromUInt32 */
|
||||
|
||||
|
@ -666,7 +666,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromString(decNumber *dn, const char
|
|||
|
||||
/* Handle decimal point... */
|
||||
if (dotchar!=NULL && dotchar<last) /* non-trailing '.' found? */
|
||||
exponent-=(last-dotchar); /* adjust exponent */
|
||||
exponent -= static_cast<int32_t>(last-dotchar); /* adjust exponent */
|
||||
/* [we can now ignore the .] */
|
||||
|
||||
/* OK, the digits string is good. Assemble in the decNumber, or in */
|
||||
|
@ -866,7 +866,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberAnd(decNumber *res, const decNumber *
|
|||
} /* both OK */
|
||||
} /* each unit */
|
||||
/* [here uc-1 is the msu of the result] */
|
||||
res->digits=decGetDigits(res->lsu, uc-res->lsu);
|
||||
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc - res->lsu));
|
||||
res->exponent=0; /* integer */
|
||||
res->bits=0; /* sign=0 */
|
||||
return res; /* [no status to set] */
|
||||
|
@ -1253,7 +1253,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberInvert(decNumber *res, const decNumbe
|
|||
} /* each digit */
|
||||
} /* each unit */
|
||||
/* [here uc-1 is the msu of the result] */
|
||||
res->digits=decGetDigits(res->lsu, uc-res->lsu);
|
||||
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc - res->lsu));
|
||||
res->exponent=0; /* integer */
|
||||
res->bits=0; /* sign=0 */
|
||||
return res; /* [no status to set] */
|
||||
|
@ -1880,7 +1880,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberOr(decNumber *res, const decNumber *l
|
|||
} /* non-zero */
|
||||
} /* each unit */
|
||||
/* [here uc-1 is the msu of the result] */
|
||||
res->digits=decGetDigits(res->lsu, uc-res->lsu);
|
||||
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc-res->lsu));
|
||||
res->exponent=0; /* integer */
|
||||
res->bits=0; /* sign=0 */
|
||||
return res; /* [no status to set] */
|
||||
|
@ -2586,7 +2586,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberRotate(decNumber *res, const decNumbe
|
|||
} /* whole units to rotate */
|
||||
/* the rotation may have left an undetermined number of zeros */
|
||||
/* on the left, so true length needs to be calculated */
|
||||
res->digits=decGetDigits(res->lsu, msumax-res->lsu+1);
|
||||
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(msumax-res->lsu+1));
|
||||
} /* rotate needed */
|
||||
} /* rhs OK */
|
||||
} /* numerics */
|
||||
|
@ -3310,7 +3310,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberXor(decNumber *res, const decNumber *
|
|||
} /* non-zero */
|
||||
} /* each unit */
|
||||
/* [here uc-1 is the msu of the result] */
|
||||
res->digits=decGetDigits(res->lsu, uc-res->lsu);
|
||||
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc-res->lsu));
|
||||
res->exponent=0; /* integer */
|
||||
res->bits=0; /* sign=0 */
|
||||
return res; /* [no status to set] */
|
||||
|
@ -5101,7 +5101,7 @@ static decNumber * decMultiplyOp(decNumber *res, const decNumber *lhs,
|
|||
} /* p */
|
||||
*up=(Unit)item; up++; /* [final needs no division] */
|
||||
} /* lp */
|
||||
accunits=up-acc; /* count of units */
|
||||
accunits = static_cast<int32_t>(up-acc); /* count of units */
|
||||
}
|
||||
else { /* here to use units directly, without chunking ['old code'] */
|
||||
#endif
|
||||
|
@ -6587,11 +6587,11 @@ static Int decUnitAddSub(const Unit *a, Int alength,
|
|||
|
||||
/* OK, all A and B processed; might still have carry or borrow */
|
||||
/* return number of Units in the result, negated if a borrow */
|
||||
if (carry==0) return c-clsu; /* no carry, so no more to do */
|
||||
if (carry==0) return static_cast<int32_t>(c-clsu); /* no carry, so no more to do */
|
||||
if (carry>0) { /* positive carry */
|
||||
*c=(Unit)carry; /* place as new unit */
|
||||
c++; /* .. */
|
||||
return c-clsu;
|
||||
return static_cast<int32_t>(c-clsu);
|
||||
}
|
||||
/* -ve carry: it's a borrow; complement needed */
|
||||
add=1; /* temporary carry... */
|
||||
|
@ -6614,7 +6614,7 @@ static Int decUnitAddSub(const Unit *a, Int alength,
|
|||
*c=(Unit)(add-carry-1);
|
||||
c++; /* interesting, include it */
|
||||
}
|
||||
return clsu-c; /* -ve result indicates borrowed */
|
||||
return static_cast<int32_t>(clsu-c); /* -ve result indicates borrowed */
|
||||
} /* decUnitAddSub */
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
@ -6798,7 +6798,7 @@ static Int decShiftToLeast(Unit *uar, Int units, Int shift) {
|
|||
if (cut==DECDPUN) { /* unit-boundary case; easy */
|
||||
up=uar+D2U(shift);
|
||||
for (; up<uar+units; target++, up++) *target=*up;
|
||||
return target-uar;
|
||||
return static_cast<int32_t>(target-uar);
|
||||
}
|
||||
|
||||
/* messier */
|
||||
|
@ -6826,7 +6826,7 @@ static Int decShiftToLeast(Unit *uar, Int units, Int shift) {
|
|||
count-=cut;
|
||||
if (count<=0) break;
|
||||
}
|
||||
return target-uar+1;
|
||||
return static_cast<int32_t>(target-uar+1);
|
||||
} /* decShiftToLeast */
|
||||
|
||||
#if DECSUBSET
|
||||
|
@ -7690,7 +7690,7 @@ static decNumber *decDecap(decNumber *dn, Int drop) {
|
|||
cut=MSUDIGITS(dn->digits-drop); /* digits to be in use in msu */
|
||||
if (cut!=DECDPUN) *msu%=powers[cut]; /* clear left digits */
|
||||
/* that may have left leading zero digits, so do a proper count... */
|
||||
dn->digits=decGetDigits(dn->lsu, msu-dn->lsu+1);
|
||||
dn->digits=decGetDigits(dn->lsu, static_cast<int32_t>(msu-dn->lsu+1));
|
||||
return dn;
|
||||
} /* decDecap */
|
||||
|
||||
|
|
|
@ -2543,7 +2543,7 @@ UnicodeString DecimalFormat::getPadCharacterString() const {
|
|||
}
|
||||
|
||||
void DecimalFormat::setPadCharacter(const UnicodeString &padChar) {
|
||||
UChar pad;
|
||||
UChar32 pad;
|
||||
if (padChar.length() > 0) {
|
||||
pad = padChar.char32At(0);
|
||||
}
|
||||
|
@ -2792,7 +2792,7 @@ DecimalFormat::setDecimalSeparatorAlwaysShown(UBool newValue)
|
|||
UBool
|
||||
DecimalFormat::isDecimalPatternMatchRequired(void) const
|
||||
{
|
||||
return fBoolFlags.contains(UNUM_PARSE_DECIMAL_MARK_REQUIRED);
|
||||
return static_cast<UBool>(fBoolFlags.contains(UNUM_PARSE_DECIMAL_MARK_REQUIRED));
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
|
@ -1200,12 +1200,11 @@ RuleBasedNumberFormat::format(double number,
|
|||
UnicodeString& toAppendTo,
|
||||
FieldPosition& /* pos */) const
|
||||
{
|
||||
int32_t startPos = toAppendTo.length();
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
if (defaultRuleSet) {
|
||||
format(number, *defaultRuleSet, toAppendTo, status);
|
||||
}
|
||||
return adjustForCapitalizationContext(startPos, toAppendTo, status);
|
||||
return toAppendTo;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -46,11 +46,29 @@ static const UChar LOCALE_SEP = 95; // '_'
|
|||
//static const UChar VARIANT_SEP = 0x002F; // '/'
|
||||
|
||||
// String constants
|
||||
static const UChar ANY[] = { 65, 110, 121, 0 }; // Any
|
||||
static const UChar ANY[] = { 0x41, 0x6E, 0x79, 0 }; // Any
|
||||
static const UChar LAT[] = { 0x4C, 0x61, 0x74, 0 }; // Lat
|
||||
|
||||
// empty string
|
||||
#define NO_VARIANT UnicodeString()
|
||||
|
||||
// initial estimate for specDAG size
|
||||
// ICU 60 Transliterator::countAvailableSources()
|
||||
#define SPECDAG_INIT_SIZE 149
|
||||
|
||||
// initial estimate for number of variant names
|
||||
#define VARIANT_LIST_INIT_SIZE 11
|
||||
#define VARIANT_LIST_MAX_SIZE 31
|
||||
|
||||
// initial estimate for availableIDs count (default estimate is 8 => multiple reallocs)
|
||||
// ICU 60 Transliterator::countAvailableIDs()
|
||||
#define AVAILABLE_IDS_INIT_SIZE 641
|
||||
|
||||
// initial estimate for number of targets for source "Any", "Lat"
|
||||
// ICU 60 Transliterator::countAvailableTargets("Any")/("Latn")
|
||||
#define ANY_TARGETS_INIT_SIZE 125
|
||||
#define LAT_TARGETS_INIT_SIZE 23
|
||||
|
||||
/**
|
||||
* Resource bundle key for the RuleBasedTransliterator rule.
|
||||
*/
|
||||
|
@ -517,10 +535,17 @@ U_CDECL_END
|
|||
|
||||
TransliteratorRegistry::TransliteratorRegistry(UErrorCode& status) :
|
||||
registry(TRUE, status),
|
||||
specDAG(TRUE, status),
|
||||
availableIDs(status)
|
||||
specDAG(TRUE, SPECDAG_INIT_SIZE, status),
|
||||
variantList(VARIANT_LIST_INIT_SIZE, status),
|
||||
availableIDs(AVAILABLE_IDS_INIT_SIZE, status)
|
||||
{
|
||||
registry.setValueDeleter(deleteEntry);
|
||||
variantList.setDeleter(uprv_deleteUObject);
|
||||
variantList.setComparer(uhash_compareCaselessUnicodeString);
|
||||
UnicodeString *emptyString = new UnicodeString();
|
||||
if (emptyString != NULL) {
|
||||
variantList.addElement(emptyString, status);
|
||||
}
|
||||
availableIDs.setDeleter(uprv_deleteUObject);
|
||||
availableIDs.setComparer(uhash_compareCaselessUnicodeString);
|
||||
specDAG.setValueDeleter(uhash_deleteHashtable);
|
||||
|
@ -781,9 +806,15 @@ int32_t TransliteratorRegistry::countAvailableVariants(const UnicodeString& sour
|
|||
if (targets == 0) {
|
||||
return 0;
|
||||
}
|
||||
UVector *variants = (UVector*) targets->get(target);
|
||||
// variants may be 0 if the source/target are invalid
|
||||
return (variants == 0) ? 0 : variants->size();
|
||||
int32_t varMask = targets->geti(target);
|
||||
int32_t varCount = 0;
|
||||
while (varMask > 0) {
|
||||
if (varMask & 1) {
|
||||
varCount++;
|
||||
}
|
||||
varMask >>= 1;
|
||||
}
|
||||
return varCount;
|
||||
}
|
||||
|
||||
UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index,
|
||||
|
@ -795,17 +826,25 @@ UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index,
|
|||
result.truncate(0); // invalid source
|
||||
return result;
|
||||
}
|
||||
UVector *variants = (UVector*) targets->get(target);
|
||||
if (variants == 0) {
|
||||
result.truncate(0); // invalid target
|
||||
return result;
|
||||
}
|
||||
UnicodeString *v = (UnicodeString*) variants->elementAt(index);
|
||||
if (v == 0) {
|
||||
result.truncate(0); // invalid index
|
||||
} else {
|
||||
result = *v;
|
||||
int32_t varMask = targets->geti(target);
|
||||
int32_t varCount = 0;
|
||||
int32_t varListIndex = 0;
|
||||
while (varMask > 0) {
|
||||
if (varMask & 1) {
|
||||
if (varCount == index) {
|
||||
UnicodeString *v = (UnicodeString*) variantList.elementAt(varListIndex);
|
||||
if (v != NULL) {
|
||||
result = *v;
|
||||
return result;
|
||||
}
|
||||
break;
|
||||
}
|
||||
varCount++;
|
||||
}
|
||||
varMask >>= 1;
|
||||
varListIndex++;
|
||||
}
|
||||
result.truncate(0); // invalid target or index
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -911,9 +950,9 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& ID,
|
|||
UnicodeString *newID = (UnicodeString *)ID.clone();
|
||||
// Check to make sure newID was created.
|
||||
if (newID != NULL) {
|
||||
// NUL-terminate the ID string
|
||||
newID->getTerminatedBuffer();
|
||||
availableIDs.addElement(newID, status);
|
||||
// NUL-terminate the ID string
|
||||
newID->getTerminatedBuffer();
|
||||
availableIDs.addElement(newID, status);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -924,9 +963,7 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& ID,
|
|||
|
||||
/**
|
||||
* Register a source-target/variant in the specDAG. Variant may be
|
||||
* empty, but source and target must not be. If variant is empty then
|
||||
* the special variant NO_VARIANT is stored in slot zero of the
|
||||
* UVector of variants.
|
||||
* empty, but source and target must not be.
|
||||
*/
|
||||
void TransliteratorRegistry::registerSTV(const UnicodeString& source,
|
||||
const UnicodeString& target,
|
||||
|
@ -936,39 +973,38 @@ void TransliteratorRegistry::registerSTV(const UnicodeString& source,
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
Hashtable *targets = (Hashtable*) specDAG.get(source);
|
||||
if (targets == 0) {
|
||||
targets = new Hashtable(TRUE, status);
|
||||
if (U_FAILURE(status) || targets == 0) {
|
||||
int32_t size = 3;
|
||||
if (source.compare(ANY,3) == 0) {
|
||||
size = ANY_TARGETS_INIT_SIZE;
|
||||
} else if (source.compare(LAT,3) == 0) {
|
||||
size = LAT_TARGETS_INIT_SIZE;
|
||||
}
|
||||
targets = new Hashtable(TRUE, size, status);
|
||||
if (U_FAILURE(status) || targets == NULL) {
|
||||
return;
|
||||
}
|
||||
targets->setValueDeleter(uprv_deleteUObject);
|
||||
specDAG.put(source, targets, status);
|
||||
}
|
||||
UVector *variants = (UVector*) targets->get(target);
|
||||
if (variants == 0) {
|
||||
variants = new UVector(uprv_deleteUObject,
|
||||
uhash_compareCaselessUnicodeString, status);
|
||||
if (variants == 0) {
|
||||
int32_t variantListIndex = variantList.indexOf((void*) &variant, 0);
|
||||
if (variantListIndex < 0) {
|
||||
if (variantList.size() >= VARIANT_LIST_MAX_SIZE) {
|
||||
// can't handle any more variants
|
||||
return;
|
||||
}
|
||||
targets->put(target, variants, status);
|
||||
}
|
||||
// assert(NO_VARIANT == "");
|
||||
// We add the variant string. If it is the special "no variant"
|
||||
// string, that is, the empty string, we add it at position zero.
|
||||
if (!variants->contains((void*) &variant)) {
|
||||
UnicodeString *tempus; // Used for null pointer check.
|
||||
if (variant.length() > 0) {
|
||||
tempus = new UnicodeString(variant);
|
||||
if (tempus != NULL) {
|
||||
variants->addElement(tempus, status);
|
||||
}
|
||||
} else {
|
||||
tempus = new UnicodeString(); // = NO_VARIANT
|
||||
if (tempus != NULL) {
|
||||
variants->insertElementAt(tempus, 0, status);
|
||||
}
|
||||
UnicodeString *variantEntry = new UnicodeString(variant);
|
||||
if (variantEntry != NULL) {
|
||||
variantList.addElement(variantEntry, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
variantListIndex = variantList.size() - 1;
|
||||
}
|
||||
}
|
||||
if (variantListIndex < 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
int32_t addMask = 1 << variantListIndex;
|
||||
int32_t varMask = targets->geti(target);
|
||||
targets->puti(target, varMask | addMask, status);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -979,17 +1015,24 @@ void TransliteratorRegistry::removeSTV(const UnicodeString& source,
|
|||
const UnicodeString& variant) {
|
||||
// assert(source.length() > 0);
|
||||
// assert(target.length() > 0);
|
||||
// UErrorCode status = U_ZERO_ERROR;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Hashtable *targets = (Hashtable*) specDAG.get(source);
|
||||
if (targets == 0) {
|
||||
if (targets == NULL) {
|
||||
return; // should never happen for valid s-t/v
|
||||
}
|
||||
UVector *variants = (UVector*) targets->get(target);
|
||||
if (variants == 0) {
|
||||
int32_t varMask = targets->geti(target);
|
||||
if (varMask == 0) {
|
||||
return; // should never happen for valid s-t/v
|
||||
}
|
||||
variants->removeElement((void*) &variant);
|
||||
if (variants->size() == 0) {
|
||||
int32_t variantListIndex = variantList.indexOf((void*) &variant, 0);
|
||||
if (variantListIndex < 0) {
|
||||
return; // should never happen for valid s-t/v
|
||||
}
|
||||
int32_t remMask = 1 << variantListIndex;
|
||||
varMask &= (~remMask);
|
||||
if (varMask != 0) {
|
||||
targets->puti(target, varMask, status);
|
||||
} else {
|
||||
targets->remove(target); // should delete variants
|
||||
if (targets->count() == 0) {
|
||||
specDAG.remove(source); // should delete targets
|
||||
|
@ -1281,8 +1324,8 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
|
|||
UVector* rbts = new UVector(entry->u.dataVector->size(), status);
|
||||
// Check for null pointer
|
||||
if (rbts == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
int32_t passNumber = 1;
|
||||
for (int32_t i = 0; U_SUCCESS(status) && i < entry->u.dataVector->size(); i++) {
|
||||
|
|
|
@ -440,13 +440,15 @@ class TransliteratorRegistry : public UMemory {
|
|||
|
||||
/**
|
||||
* DAG of visible IDs by spec. Hashtable: source => (Hashtable:
|
||||
* target => (UVector: variant)) The UVector of variants is never
|
||||
* empty. For a source-target with no variant, the special
|
||||
* variant NO_VARIANT (the empty string) is stored in slot zero of
|
||||
* the UVector.
|
||||
* target => variant bitmask)
|
||||
*/
|
||||
Hashtable specDAG;
|
||||
|
||||
/**
|
||||
* Vector of all variant names
|
||||
*/
|
||||
UVector variantList;
|
||||
|
||||
/**
|
||||
* Vector of public full IDs.
|
||||
*/
|
||||
|
|
|
@ -2056,6 +2056,9 @@ static void U_CALLCONV prepareFind(UErrorCode &status) {
|
|||
if (U_SUCCESS(status)) {
|
||||
while ((mzID = mzIDs->snext(status)) && U_SUCCESS(status)) {
|
||||
const TZDBNames *names = TZDBTimeZoneNames::getMetaZoneNames(*mzID, status);
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
}
|
||||
if (names == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
@ -2187,9 +2190,11 @@ TZDBTimeZoneNames::getMetaZoneDisplayName(const UnicodeString& mzID,
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
const TZDBNames *tzdbNames = TZDBTimeZoneNames::getMetaZoneNames(mzID, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
const UChar *s = tzdbNames->getName(type);
|
||||
if (s != NULL) {
|
||||
name.setTo(TRUE, s, -1);
|
||||
if (tzdbNames != NULL) {
|
||||
const UChar *s = tzdbNames->getName(type);
|
||||
if (s != NULL) {
|
||||
name.setTo(TRUE, s, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue