ICU-13177 Merging trunk to branch

X-SVN-Rev: 40460
This commit is contained in:
Shane Carr 2017-09-26 09:33:44 +00:00
commit c09ca5d6b9
407 changed files with 13072 additions and 12173 deletions

View file

@ -1,3 +1,6 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
Copyright © 1991-2017 Unicode, Inc. All rights reserved.
@ -131,7 +134,7 @@ property of their respective owners.
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyrighy (c) 1999 TaBE Project.
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *

View file

@ -630,6 +630,14 @@
(via -D or uconfig.h, as above)
and include those header files explicitly that you actually need.<br />
Note: The ICU test suites cannot be compiled with this setting.</li>
<li><b>utf_old.h:</b>
All of utf_old.h is deprecated or obsolete.<br />
Beginning with ICU 60,
you should define <code>U_HIDE_OBSOLETE_UTF_OLD_H</code> to 1
(via -D or uconfig.h, as above).
Use of any of these macros should be replaced as noted
in the comments for the obsolete macro.<br />
Note: The ICU test suites <i>can</i> be compiled with this setting.</li>
<li><b>.dat file:</b> By default, the ICU data is built into
a shared library (DLL). This is convenient because it requires no
install-time or runtime configuration,

View file

@ -194,7 +194,7 @@ EXPAND_ONLY_PREDEF = YES
SEARCH_INCLUDES = YES
INCLUDE_PATH =
INCLUDE_FILE_PATTERNS =
PREDEFINED = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END= U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE=override U_FINAL=final UCONFIG_ENABLE_PLUGINS=1 U_CHAR16_IS_TYPEDEF=0 U_CPLUSPLUS_VERSION=11 U_WCHAR_IS_UTF16
PREDEFINED = U_EXPORT2= U_STABLE= U_DRAFT= U_INTERNAL= U_SYSTEM= U_DEPRECATED= U_OBSOLETE= U_CALLCONV_FPTR= U_CALLCONV= U_CDECL_BEGIN= U_CDECL_END= U_NO_THROW=\ "U_NAMESPACE_BEGIN=namespace icu{" "U_NAMESPACE_END=}" U_SHOW_CPLUSPLUS_API=1 U_DEFINE_LOCAL_OPEN_POINTER()= U_IN_DOXYGEN=1 U_OVERRIDE= U_FINAL=final UCONFIG_ENABLE_PLUGINS=1 U_CHAR16_IS_TYPEDEF=0 U_CPLUSPLUS_VERSION=11 U_WCHAR_IS_UTF16 U_NOEXCEPT=
EXPAND_AS_DEFINED =
SKIP_FUNCTION_MACROS = YES
#---------------------------------------------------------------------------

View file

@ -89,7 +89,7 @@ ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_
resource.o uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \
ucurr.o \
messagepattern.o ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o locdspnm.o loclikely.o locresdata.o \
bytestream.o stringpiece.o \
bytestream.o stringpiece.o bytesinkutil.o \
stringtriebuilder.o bytestriebuilder.o \
bytestrie.o bytestrieiterator.o \
ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \
@ -104,7 +104,7 @@ patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwr
uscript.o uscript_props.o usc_impl.o unames.o \
utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o filteredbrk.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o rbbi_cache.o \
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \
uidna.o usprep.o uts46.o punycode.o \
util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o dtintrv.o ucnvsel.o propsvec.o \

View file

@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list(parentList), listLength(parentListLength) {
uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
uprv_memset(table7FF, 0, sizeof(table7FF));
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
}
list4kStarts[0x11]=listLength-1;
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
initBits();
overrideIllegal();
}
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
containsFFFD(otherBMPSet.containsFFFD),
list(newParentList), listLength(newParentListLength) {
uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
@ -120,7 +122,7 @@ void BMPSet::initBits() {
UChar32 start, limit;
int32_t listIndex=0;
// Set asciiBytes[].
// Set latin1Contains[].
do {
start=list[listIndex++];
if(listIndex<listLength) {
@ -128,13 +130,30 @@ void BMPSet::initBits() {
} else {
limit=0x110000;
}
if(start>=0x80) {
if(start>=0x100) {
break;
}
do {
asciiBytes[start++]=1;
} while(start<limit && start<0x80);
} while(limit<=0x80);
latin1Contains[start++]=1;
} while(start<limit && start<0x100);
} while(limit<=0x100);
// Find the first range overlapping with (or after) 80..FF again,
// to include them in table7FF as well.
for(listIndex=0;;) {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(limit>0x80) {
if(start<0x80) {
start=0x80;
}
break;
}
}
// Set table7FF[].
while(start<0x800) {
@ -204,19 +223,14 @@ void BMPSet::initBits() {
* for faster validity checking at runtime.
* No need to set 0 values where they were reset to 0 in the constructor
* and not modified by initBits().
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* Need to set 0 values for surrogates D800..DFFF.
*/
void BMPSet::overrideIllegal() {
uint32_t bits, mask;
int32_t i;
if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
// contains(FFFD)==TRUE
for(i=0x80; i<0xc0; ++i) {
asciiBytes[i]=1;
}
if(containsFFFD) {
bits=3; // Lead bytes 0xC0 and 0xC1.
for(i=0; i<64; ++i) {
table7FF[i]|=bits;
@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
}
} else {
// contains(FFFD)==FALSE
mask=~(0x10001<<0xd); // Lead byte 0xED.
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]&=mask;
@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
UBool
BMPSet::contains(UChar32 c) const {
if((uint32_t)c<=0x7f) {
return (UBool)asciiBytes[c];
if((uint32_t)c<=0xff) {
return (UBool)latin1Contains[c];
} else if((uint32_t)c<=0x7ff) {
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span
do {
c=*s;
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span not
do {
c=*s;
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span not
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@ -497,22 +510,22 @@ const uint8_t *
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// Initial all-ASCII span.
if(spanCondition) {
do {
if(!asciiBytes[b] || ++s==limit) {
if(!latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b] || ++s==limit) {
if(latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
length=(int32_t)(limit-s);
}
@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
while(s<limit) {
b=*s;
if(b<0xc0) {
// ASCII; or trail bytes with the result of contains(FFFD).
if(U8_IS_SINGLE(b)) {
// ASCII
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
}
}
++s; // Advance past the lead byte.
@ -619,7 +632,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
asciiBytes[0x80]
containsFFFD
) != spanCondition
) {
return s-1;
@ -627,8 +640,9 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
s+=3;
continue;
}
} else /* 0xc0<=b<0xe0 */ {
} else {
if( /* handle U+0000..U+07FF inline */
b>=0xc0 &&
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
return s-1;
}
}
@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
do {
b=s[--length];
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// ASCII sub-span
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
}

View file

@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
* Helper class for frozen UnicodeSets, implements contains() and span()
* optimized for BMP code points. Structured to be UTF-8-friendly.
*
* ASCII: Look up bytes.
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
* with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
* Supplementary characters: Binary search over
* the supplementary part of the parent set's inversion list.
*/
class BMPSet : public UMemory {
public:
@ -96,12 +97,12 @@ private:
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
/*
* One byte per ASCII character, or trail byte in lead position.
* 0 or 1 for ASCII characters.
* The value for trail bytes is the result of contains(FFFD)
* for faster validity checking at runtime.
* One byte 0 or 1 per Latin-1 character.
*/
UBool asciiBytes[0xc0];
UBool latin1Contains[0x100];
/* TRUE if contains(U+FFFD). */
UBool containsFFFD;
/*
* One bit per code point from U+0000..U+07FF.

View file

@ -11,9 +11,6 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "brkeng.h"
#include "cmemory.h"
#include "dictbe.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
@ -24,6 +21,10 @@
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "brkeng.h"
#include "cmemory.h"
#include "dictbe.h"
#include "charstr.h"
#include "dictionarydata.h"
#include "mutex.h"
@ -80,23 +81,15 @@ UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
int32_t
UnhandledEngine::findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &/*foundBreaks*/ ) const {
int32_t /* startPos */,
int32_t endPos,
int32_t breakType,
UVector32 &/*foundBreaks*/ ) const {
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
UChar32 c = utext_current32(text);
if (reverse) {
while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
c = utext_previous32(text);
}
}
else {
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
}
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
}
}
return 0;

View file

@ -19,6 +19,7 @@ U_NAMESPACE_BEGIN
class UnicodeSet;
class UStack;
class UVector32;
class DictionaryMatcher;
/*******************************************************************
@ -67,18 +68,15 @@ class LanguageBreakEngine : public UMemory {
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks An allocated C array of the breaks found, if any
* @param foundBreaks A Vector of int32_t to receive the breaks.
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &foundBreaks ) const = 0;
UVector32 &foundBreaks ) const = 0;
};
@ -192,8 +190,6 @@ class UnhandledEngine : public LanguageBreakEngine {
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks A Vector of int32_t to receive the breaks.
* @return The number of breaks found.
@ -201,9 +197,8 @@ class UnhandledEngine : public LanguageBreakEngine {
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &foundBreaks ) const;
UVector32 &foundBreaks ) const;
/**
* <p>Tell the engine to handle a particular character and break type.</p>

View file

@ -0,0 +1,123 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// bytesinkutil.cpp
// created: 2017sep14 Markus W. Scherer
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/edits.h"
#include "unicode/stringoptions.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "bytesinkutil.h"
#include "cmemory.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
/**
 * Converts the UTF-16 text (s16, s16Length) to UTF-8, appends it to the sink
 * chunk by chunk, and, if edits is not nullptr, records that (length) source
 * units were replaced by the number of UTF-8 bytes written.
 *
 * The UTF-16 input is read with U16_NEXT_UNSAFE and written with
 * U8_APPEND_UNSAFE, so it is not validated here.
 *
 * @return FALSE if errorCode was already a failure on entry, or if the total
 *         number of output bytes would exceed INT32_MAX (errorCode is then set
 *         to U_INDEX_OUTOFBOUNDS_ERROR); TRUE otherwise.
 */
UBool
ByteSinkUtil::appendChange(int32_t length, const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    char stackBuffer[200];
    int32_t totalBytes = 0;
    int32_t srcIndex = 0;
    while (srcIndex < s16Length) {
        // Capacity hint for the sink: up to 3 UTF-8 bytes per remaining
        // UTF-16 code unit, scaled down where the product would overflow.
        int32_t hint = s16Length - srcIndex;
        if (hint >= (INT32_MAX / 2)) {
            hint = INT32_MAX;
        } else if (hint >= (INT32_MAX / 3)) {
            hint *= 2;
        } else {
            hint *= 3;  // max 3 UTF-8 bytes per UTF-16 code unit
        }
        int32_t available;
        char *dest = sink.GetAppendBuffer(U8_MAX_LENGTH, hint,
                                          stackBuffer, UPRV_LENGTHOF(stackBuffer), &available);
        // Stop early enough that one more code point (up to U8_MAX_LENGTH bytes)
        // always fits once the loop condition has been checked.
        available -= U8_MAX_LENGTH - 1;
        int32_t written = 0;
        while (srcIndex < s16Length && written < available) {
            UChar32 c;
            U16_NEXT_UNSAFE(s16, srcIndex, c);
            U8_APPEND_UNSAFE(dest, written, c);
        }
        // Guard the running byte total against int32_t overflow.
        if (written > (INT32_MAX - totalBytes)) {
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return FALSE;
        }
        sink.Append(dest, written);
        totalBytes += written;
    }
    if (edits != nullptr) {
        edits->addReplace(length, totalBytes);
    }
    return TRUE;
}
/**
 * Pointer-range convenience overload: the bytes at [s, limit[ were mapped to
 * (s16, s16Length). Delegates to the length-based appendChange().
 *
 * @return FALSE if errorCode was already a failure, or if (limit - s) exceeds
 *         INT32_MAX (errorCode is then set to U_INDEX_OUTOFBOUNDS_ERROR);
 *         otherwise the result of the length-based overload.
 */
UBool
ByteSinkUtil::appendChange(const uint8_t *s, const uint8_t *limit,
                           const char16_t *s16, int32_t s16Length,
                           ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    const auto sourceLength = limit - s;
    if (sourceLength <= INT32_MAX) {
        return appendChange((int32_t)sourceLength, s16, s16Length, sink, edits, errorCode);
    }
    // The source span does not fit into the int32_t length parameter.
    errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    return FALSE;
}
void
ByteSinkUtil::appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits) {
char s8[U8_MAX_LENGTH];
int32_t s8Length = 0;
U8_APPEND_UNSAFE(s8, s8Length, c);
if (edits != nullptr) {
edits->addReplace(length, s8Length);
}
sink.Append(s8, s8Length);
}
namespace {

// Two-byte UTF-8 encoding helpers; see unicode/utf8.h U8_APPEND_UNSAFE().

/** Lead byte: 0xC0 combined with the bits of c above the low six. */
inline uint8_t getTwoByteLead(UChar32 c) {
    return (uint8_t)(0xc0 | (c >> 6));
}

/** Trail byte: 0x80 combined with the low six bits of c. */
inline uint8_t getTwoByteTrail(UChar32 c) {
    return (uint8_t)(0x80 | (c & 0x3f));
}

}  // namespace
/**
 * Appends the two-byte UTF-8 form of c to the sink.
 * c must be in U+0080..U+07FF; this is only checked by U_ASSERT.
 */
void
ByteSinkUtil::appendTwoBytes(UChar32 c, ByteSink &sink) {
    U_ASSERT(0x80 <= c && c <= 0x7ff);  // 2-byte UTF-8
    char encoded[2];
    encoded[0] = (char)getTwoByteLead(c);
    encoded[1] = (char)getTwoByteTrail(c);
    sink.Append(encoded, 2);
}
/**
 * Handles (length) source bytes at s that are passed through unchanged.
 * For length > 0: records the unchanged span in edits (if not nullptr) and,
 * unless options contains U_OMIT_UNCHANGED_TEXT, appends the bytes to the sink.
 * A length of zero or less is a no-op.
 *
 * @return FALSE if errorCode was already a failure on entry, TRUE otherwise.
 */
UBool
ByteSinkUtil::appendUnchanged(const uint8_t *s, int32_t length,
                              ByteSink &sink, uint32_t options, Edits *edits,
                              UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    if (length <= 0) {
        return TRUE;  // nothing to record or copy
    }
    if (edits != nullptr) {
        edits->addUnchanged(length);
    }
    const UBool omitUnchanged = (options & U_OMIT_UNCHANGED_TEXT) != 0;
    if (!omitUnchanged) {
        sink.Append(reinterpret_cast<const char *>(s), length);
    }
    return TRUE;
}
/**
 * Pointer-range convenience overload for unchanged bytes at [s, limit[.
 * Delegates to the length-based appendUnchanged().
 *
 * @return FALSE if errorCode was already a failure, or if (limit - s) exceeds
 *         INT32_MAX (errorCode is then set to U_INDEX_OUTOFBOUNDS_ERROR);
 *         otherwise the result of the length-based overload.
 */
UBool
ByteSinkUtil::appendUnchanged(const uint8_t *s, const uint8_t *limit,
                              ByteSink &sink, uint32_t options, Edits *edits,
                              UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    const auto sourceLength = limit - s;
    if (sourceLength <= INT32_MAX) {
        return appendUnchanged(s, (int32_t)sourceLength, sink, options, edits, errorCode);
    }
    // The source span does not fit into the int32_t length parameter.
    errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    return FALSE;
}
U_NAMESPACE_END

View file

@ -0,0 +1,53 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// bytesinkutil.h
// created: 2017sep14 Markus W. Scherer

// Include guard: this header defines a class, so it must be safe to include
// more than once in the same translation unit.
#ifndef BYTESINKUTIL_H
#define BYTESINKUTIL_H

#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/edits.h"
#include "cmemory.h"
#include "uassert.h"

U_NAMESPACE_BEGIN

class ByteSink;
class Edits;

/**
 * Static-only helpers for appending UTF-8 output to a ByteSink while
 * optionally recording the corresponding changes in an Edits object.
 * Every function that takes an Edits pointer accepts nullptr to skip recording.
 */
class U_COMMON_API ByteSinkUtil {
public:
    ByteSinkUtil() = delete;  // all static

    /**
     * (length) bytes were mapped to valid (s16, s16Length).
     * The UTF-16 text is converted to UTF-8 and appended to the sink.
     * @return FALSE if errorCode is already a failure or the output length
     *         would exceed INT32_MAX (U_INDEX_OUTOFBOUNDS_ERROR); else TRUE.
     */
    static UBool appendChange(int32_t length,
                              const char16_t *s16, int32_t s16Length,
                              ByteSink &sink, Edits *edits, UErrorCode &errorCode);

    /**
     * The bytes at [s, limit[ were mapped to valid (s16, s16Length).
     * Sets U_INDEX_OUTOFBOUNDS_ERROR and returns FALSE if (limit - s)
     * exceeds INT32_MAX.
     */
    static UBool appendChange(const uint8_t *s, const uint8_t *limit,
                              const char16_t *s16, int32_t s16Length,
                              ByteSink &sink, Edits *edits, UErrorCode &errorCode);

    /** (length) bytes were mapped/changed to valid code point c. */
    static void appendCodePoint(int32_t length, UChar32 c, ByteSink &sink, Edits *edits = nullptr);

    /** The few bytes at [src, nextSrc[ were mapped/changed to valid code point c. */
    static inline void appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c,
                                       ByteSink &sink, Edits *edits = nullptr) {
        appendCodePoint((int32_t)(nextSrc - src), c, sink, edits);
    }

    /** Append the two-byte character (U+0080..U+07FF). */
    static void appendTwoBytes(UChar32 c, ByteSink &sink);

    /**
     * (length) bytes at s are unchanged.
     * For length > 0, records the unchanged span in edits and appends the
     * bytes to the sink unless options contains U_OMIT_UNCHANGED_TEXT.
     * @return FALSE if errorCode is already a failure; else TRUE.
     */
    static UBool appendUnchanged(const uint8_t *s, int32_t length,
                                 ByteSink &sink, uint32_t options, Edits *edits,
                                 UErrorCode &errorCode);

    /**
     * The bytes at [s, limit[ are unchanged.
     * Sets U_INDEX_OUTOFBOUNDS_ERROR and returns FALSE if (limit - s)
     * exceeds INT32_MAX; otherwise behaves like the length-based overload.
     */
    static UBool appendUnchanged(const uint8_t *s, const uint8_t *limit,
                                 ByteSink &sink, uint32_t options, Edits *edits,
                                 UErrorCode &errorCode);
};

U_NAMESPACE_END

#endif  // BYTESINKUTIL_H

View file

@ -45,6 +45,12 @@ void CheckedArrayByteSink::Append(const char* bytes, int32_t n) {
if (n <= 0) {
return;
}
if (n > (INT32_MAX - appended_)) {
// TODO: Report as integer overflow, not merely buffer overflow.
appended_ = INT32_MAX;
overflowed_ = TRUE;
return;
}
appended_ += n;
int32_t available = capacity_ - size_;
if (n > available) {

View file

@ -268,6 +268,8 @@
</ClCompile>
<ClCompile Include="rbbitblb.cpp">
</ClCompile>
<ClCompile Include="rbbi_cache.cpp">
</ClCompile>
<ClCompile Include="dictionarydata.cpp" />
<ClCompile Include="ubrk.cpp" />
<ClCompile Include="ucol_swp.cpp">
@ -445,6 +447,7 @@
</ClCompile>
<ClCompile Include="usprep.cpp" />
<ClCompile Include="appendable.cpp" />
<ClCompile Include="bytesinkutil.cpp" />
<ClCompile Include="bytestream.cpp" />
<ClCompile Include="bytestrie.cpp" />
<ClCompile Include="bytestriebuilder.cpp" />
@ -572,6 +575,7 @@
<ClInclude Include="rbbiscan.h" />
<ClInclude Include="rbbisetb.h" />
<ClInclude Include="rbbitblb.h" />
<ClInclude Include="rbbi_cache.h" />
<ClInclude Include="dictionarydata.h" />
<CustomBuild Include="unicode\ubrk.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
@ -1478,6 +1482,7 @@
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<ClInclude Include="bytesinkutil.h" />
<CustomBuild Include="unicode\bytestream.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>

View file

@ -97,6 +97,9 @@
<ClCompile Include="rbbitblb.cpp">
<Filter>break iteration</Filter>
</ClCompile>
<ClCompile Include="rbbi_cache.cpp">
<Filter>break iteration</Filter>
</ClCompile>
<ClCompile Include="ubrk.cpp">
<Filter>break iteration</Filter>
</ClCompile>
@ -460,6 +463,9 @@
<ClCompile Include="usprep.cpp">
<Filter>sprep</Filter>
</ClCompile>
<ClCompile Include="bytesinkutil.cpp">
<Filter>strings</Filter>
</ClCompile>
<ClCompile Include="bytestream.cpp">
<Filter>strings</Filter>
</ClCompile>
@ -636,6 +642,9 @@
<ClInclude Include="rbbitblb.h">
<Filter>break iteration</Filter>
</ClInclude>
<ClInclude Include="rbbi_cache.h">
<Filter>break iteration</Filter>
</ClInclude>
<ClInclude Include="ubrkimpl.h">
<Filter>break iteration</Filter>
</ClInclude>
@ -861,6 +870,9 @@
<ClInclude Include="sprpimpl.h">
<Filter>sprep</Filter>
</ClInclude>
<ClInclude Include="bytesinkutil.h">
<Filter>strings</Filter>
</ClInclude>
<ClInclude Include="charstr.h">
<Filter>strings</Filter>
</ClInclude>

View file

@ -299,6 +299,8 @@
</ClCompile>
<ClCompile Include="rbbitblb.cpp">
</ClCompile>
<ClCompile Include="rbbi_cache.cpp">
</ClCompile>
<ClCompile Include="dictionarydata.cpp" />
<ClCompile Include="ubrk.cpp" />
<ClCompile Include="ucol_swp.cpp">
@ -452,6 +454,7 @@
</ClCompile>
<ClCompile Include="usprep.cpp" />
<ClCompile Include="appendable.cpp" />
<ClCompile Include="bytesinkutil.cpp" />
<ClCompile Include="bytestream.cpp" />
<ClCompile Include="bytestrie.cpp" />
<ClCompile Include="bytestriebuilder.cpp" />
@ -529,6 +532,7 @@
<ClInclude Include="rbbiscan.h" />
<ClInclude Include="rbbisetb.h" />
<ClInclude Include="rbbitblb.h" />
<ClInclude Include="rbbi_cache.h" />
<ClInclude Include="dictionarydata.h" />
<CustomBuild Include="unicode\ubrk.h">
<Command>copy "%(FullPath)" ..\..\include\unicode</Command>
@ -894,6 +898,7 @@
</Command>
<Outputs>..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<ClInclude Include="bytesinkutil.h" />
<CustomBuild Include="unicode\bytestream.h">
<Command>copy "%(FullPath)" ..\..\include\unicode
</Command>

View file

@ -46,9 +46,9 @@ int32_t
DictionaryBreakEngine::findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &foundBreaks ) const {
UVector32 &foundBreaks ) const {
(void)startPos; // TODO: remove this param?
int32_t result = 0;
// Find the span of characters included in the set.
@ -60,34 +60,12 @@ DictionaryBreakEngine::findBreaks( UText *text,
int32_t rangeStart;
int32_t rangeEnd;
UChar32 c = utext_current32(text);
if (reverse) {
UBool isDict = fSet.contains(c);
while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
c = utext_previous32(text);
isDict = fSet.contains(c);
}
if (current < startPos) {
rangeStart = startPos;
} else {
rangeStart = current;
if (!isDict) {
utext_next32(text);
rangeStart = (int32_t)utext_getNativeIndex(text);
}
}
// rangeEnd = start + 1;
utext_setNativeIndex(text, start);
utext_next32(text);
rangeEnd = (int32_t)utext_getNativeIndex(text);
}
else {
while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
utext_next32(text); // TODO: recast loop for postincrement
c = utext_current32(text);
}
rangeStart = start;
rangeEnd = current;
while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
utext_next32(text); // TODO: recast loop for postincrement
c = utext_current32(text);
}
rangeStart = start;
rangeEnd = current;
if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
utext_setNativeIndex(text, current);
@ -248,7 +226,7 @@ int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
UVector32 &foundBreaks ) const {
utext_setNativeIndex(text, rangeStart);
utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
if (utext_getNativeIndex(text) >= rangeEnd) {
@ -487,7 +465,7 @@ int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
UVector32 &foundBreaks ) const {
if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
return 0; // Not enough characters for two words
}
@ -680,7 +658,7 @@ int32_t
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
UVector32 &foundBreaks ) const {
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
return 0; // Not enough characters for two words
}
@ -885,7 +863,7 @@ int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
UVector32 &foundBreaks ) const {
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
return 0; // Not enough characters for two words
}
@ -1110,9 +1088,9 @@ static inline uint32_t getKatakanaCost(int32_t wordLength){
return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}
static inline bool isKatakana(uint16_t value) {
return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
(value >= 0xFF66u && value <= 0xFF9fu);
static inline bool isKatakana(UChar32 value) {
return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
(value >= 0xFF66 && value <= 0xFF9f);
}
@ -1128,14 +1106,14 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
* @param text A UText representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output of C array of int32_t break positions, or 0
* @param foundBreaks vector<int32> to receive the break positions
* @return The number of breaks found
*/
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
UVector32 &foundBreaks ) const {
if (rangeStart >= rangeEnd) {
return 0;
}

View file

@ -15,6 +15,7 @@
#include "unicode/utext.h"
#include "brkeng.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
@ -84,21 +85,18 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
*
* @param text A UText representing the text. The iterator is left at
* the end of the run of characters which the engine is capable of handling
* that starts from the first (or last) character in the range.
* that starts from the first character in the range.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks An allocated C array of the breaks found, if any
* @param foundBreaks vector of int32_t to receive the break positions
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &foundBreaks ) const;
UVector32 &foundBreaks ) const;
protected:
@ -128,7 +126,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const = 0;
UVector32 &foundBreaks ) const = 0;
};
@ -185,7 +183,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;
UVector32 &foundBreaks ) const;
};
@ -241,7 +239,7 @@ class LaoBreakEngine : public DictionaryBreakEngine {
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;
UVector32 &foundBreaks ) const;
};
@ -297,7 +295,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;
UVector32 &foundBreaks ) const;
};
@ -353,7 +351,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;
UVector32 &foundBreaks ) const;
};
@ -417,7 +415,7 @@ class CjkBreakEngine : public DictionaryBreakEngine {
virtual int32_t divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;
UVector32 &foundBreaks ) const;
};

View file

@ -17,10 +17,10 @@ namespace {
const int32_t MAX_UNCHANGED_LENGTH = 0x1000;
const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
// No length change.
const int32_t MAX_SHORT_WIDTH = 6;
const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff;
// 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;
const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;
const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;
const int32_t MAX_SHORT_CHANGE = 0x6fff;
// 0111mmmmmmnnnnnn records a replacement of m text units with n.
@ -138,20 +138,6 @@ void Edits::addUnchanged(int32_t unchangedLength) {
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
if(U_FAILURE(errorCode_)) { return; }
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
// Replacement of short oldLength text units by same-length new text.
// Merge into previous short-replacement record, if any.
++numChanges;
int32_t last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
setLastUnit(last + 1);
return;
}
append(oldLength << 12);
return;
}
if(oldLength < 0 || newLength < 0) {
errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
return;
@ -171,6 +157,21 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
delta += newDelta;
}
if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
// Merge into previous same-lengths short-replacement record, if any.
int32_t u = (oldLength << 12) | (newLength << 9);
int32_t last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last & ~SHORT_CHANGE_NUM_MASK) == u &&
(last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
setLastUnit(last + 1);
return;
}
append(u);
return;
}
int32_t head = 0x7000;
if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
head |= oldLength << 6;
@ -396,7 +397,7 @@ Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &error
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
array(a), index(0), length(len), remaining(0),
onlyChanges_(oc), coarse(crs),
changed(FALSE), oldLength_(0), newLength_(0),
dir(0), changed(FALSE), oldLength_(0), newLength_(0),
srcIndex(0), replIndex(0), destIndex(0) {}
int32_t Edits::Iterator::readLength(int32_t head) {
@ -418,7 +419,7 @@ int32_t Edits::Iterator::readLength(int32_t head) {
}
}
void Edits::Iterator::updateIndexes() {
void Edits::Iterator::updateNextIndexes() {
srcIndex += oldLength_;
if (changed) {
replIndex += newLength_;
@ -426,22 +427,52 @@ void Edits::Iterator::updateIndexes() {
destIndex += newLength_;
}
// Steps the string indexes backward by the lengths of the current span:
// the source index by oldLength_, the destination index by newLength_,
// and the replacement index by newLength_ only when the span is a change.
void Edits::Iterator::updatePreviousIndexes() {
    destIndex -= newLength_;
    srcIndex -= oldLength_;
    replIndex -= changed ? newLength_ : 0;
}
// Puts the iterator into its "no span" state: clears the iteration direction,
// the changed flag and both span lengths.
// Always returns FALSE, so next() and previous() can return its result directly.
UBool Edits::Iterator::noNext() {
// No change beyond the string.
// No change before or beyond the string.
dir = 0;
changed = FALSE;
oldLength_ = newLength_ = 0;
return FALSE;
}
UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
// Forward iteration: Update the string indexes to the limit of the current span,
// and post-increment-read array units to assemble a new span.
// Leaves the array index one after the last unit of that span.
if (U_FAILURE(errorCode)) { return FALSE; }
// We have an errorCode in case we need to start guarding against integer overflows.
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
updateIndexes();
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of equal-length changes.
--remaining;
return TRUE;
if (dir > 0) {
updateNextIndexes();
} else {
if (dir < 0) {
// Turn around from previous() to next().
// Post-increment-read the same span again.
if (remaining > 0) {
// Fine-grained iterator:
// Stay on the current one of a sequence of compressed changes.
++index; // next() rests on the index after the sequence unit.
dir = 1;
return TRUE;
}
}
dir = 1;
}
if (remaining >= 1) {
// Fine-grained iterator: Continue a sequence of compressed changes.
if (remaining > 1) {
--remaining;
return TRUE;
}
remaining = 0;
}
if (index >= length) {
return noNext();
@ -457,7 +488,7 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
}
newLength_ = oldLength_;
if (onlyChanges) {
updateIndexes();
updateNextIndexes();
if (index >= length) {
return noNext();
}
@ -469,14 +500,19 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
}
changed = TRUE;
if (u <= MAX_SHORT_CHANGE) {
int32_t oldLen = u >> 12;
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
if (coarse) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
oldLength_ = newLength_ = len * w;
oldLength_ = num * oldLen;
newLength_ = num * newLen;
} else {
// Split a sequence of equal-length changes that was compressed into one unit.
oldLength_ = newLength_ = u >> 12;
remaining = u & 0xfff;
// Split a sequence of changes that was compressed into one unit.
oldLength_ = oldLen;
newLength_ = newLen;
if (num > 1) {
remaining = num; // This is the first of two or more changes.
}
return TRUE;
}
} else {
@ -491,22 +527,127 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
++index;
if (u <= MAX_SHORT_CHANGE) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
len = len * w;
oldLength_ += len;
newLength_ += len;
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
oldLength_ += (u >> 12) * num;
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
} else {
U_ASSERT(u <= 0x7fff);
int32_t oldLen = readLength((u >> 6) & 0x3f);
int32_t newLen = readLength(u & 0x3f);
oldLength_ += oldLen;
newLength_ += newLen;
oldLength_ += readLength((u >> 6) & 0x3f);
newLength_ += readLength(u & 0x3f);
}
}
return TRUE;
}
// Moves the iterator one span backward.
//
// Returns FALSE without doing anything if errorCode already indicates a failure.
// Returns the result of noNext() (FALSE) when the array index is already at 0.
// Otherwise sets changed/oldLength_/newLength_ for the span that was read,
// moves srcIndex/replIndex/destIndex back to the start of that span,
// and returns TRUE.
//
// In fine-grained mode (!coarse), a compressed short-change unit that stands for
// several changes is stepped through one change per call; "remaining" tracks
// the position inside such a unit. In coarse mode, adjacent changes are combined
// into one span.
UBool Edits::Iterator::previous(UErrorCode &errorCode) {
// Backward iteration: Pre-decrement-read array units to assemble a new span,
// then update the string indexes to the start of that span.
// Leaves the array index on the head unit of that span.
if (U_FAILURE(errorCode)) { return FALSE; }
// We have an errorCode in case we need to start guarding against integer overflows.
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
if (dir >= 0) {
if (dir > 0) {
// Turn around from next() to previous().
// Set the string indexes to the span limit and
// pre-decrement-read the same span again.
if (remaining > 0) {
// Fine-grained iterator:
// Stay on the current one of a sequence of compressed changes.
--index; // previous() rests on the sequence unit.
dir = -1;
return TRUE;
}
updateNextIndexes();
}
dir = -1;
}
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of compressed changes.
int32_t u = array[index];
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
// (u & SHORT_CHANGE_NUM_MASK) is the number of changes in this unit minus one.
// While remaining does not exceed it, there is still an earlier change
// in the same unit to step back onto.
if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
++remaining;
updatePreviousIndexes();
return TRUE;
}
// All changes of this unit have been visited; continue with the preceding unit.
remaining = 0;
}
if (index <= 0) {
return noNext();
}
int32_t u = array[--index];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
changed = FALSE;
oldLength_ = u + 1;
while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
--index;
oldLength_ += u + 1;
}
newLength_ = oldLength_;
// No need to handle onlyChanges as long as previous() is called only from findIndex().
updatePreviousIndexes();
return TRUE;
}
changed = TRUE;
if (u <= MAX_SHORT_CHANGE) {
// Short-change unit: old length in bits 14..12, new length in bits 11..9,
// and (number of changes - 1) in the low 9 bits.
int32_t oldLen = u >> 12;
int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
if (coarse) {
oldLength_ = num * oldLen;
newLength_ = num * newLen;
} else {
// Split a sequence of changes that was compressed into one unit.
oldLength_ = oldLen;
newLength_ = newLen;
if (num > 1) {
remaining = 1; // This is the last of two or more changes.
}
updatePreviousIndexes();
return TRUE;
}
} else {
if (u <= 0x7fff) {
// The change is encoded in u alone.
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
} else {
// Back up to the head of the change, read the lengths,
// and reset the index to the head again.
U_ASSERT(index > 0);
while ((u = array[--index]) > 0x7fff) {}
U_ASSERT(u > MAX_SHORT_CHANGE);
// Remember the head position; index is moved past the head for readLength()
// and restored afterwards.
int32_t headIndex = index++;
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
index = headIndex;
}
if (!coarse) {
updatePreviousIndexes();
return TRUE;
}
}
// Combine adjacent changes.
// Only reached in coarse mode: every fine-grained path has returned above.
while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
--index;
if (u <= MAX_SHORT_CHANGE) {
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
oldLength_ += (u >> 12) * num;
newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
} else if (u <= 0x7fff) {
// Read the lengths, and reset the index to the head again.
int32_t headIndex = index++;
oldLength_ += readLength((u >> 6) & 0x3f);
newLength_ += readLength(u & 0x3f);
index = headIndex;
}
// Units above 0x7fff add nothing here; the loop just steps over them.
}
updatePreviousIndexes();
return TRUE;
}
int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || i < 0) { return -1; }
int32_t spanStart, spanLength;
@ -518,7 +659,44 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
spanLength = newLength_;
}
if (i < spanStart) {
if (i >= (spanStart / 2)) {
// Search backwards.
for (;;) {
UBool hasPrevious = previous(errorCode);
U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0
(void)hasPrevious; // avoid unused-variable warning
spanStart = findSource ? srcIndex : destIndex;
if (i >= spanStart) {
// The index is in the current span.
return 0;
}
if (remaining > 0) {
// Is the index in one of the remaining compressed edits?
// spanStart is the start of the current span, first of the remaining ones.
spanLength = findSource ? oldLength_ : newLength_;
int32_t u = array[index];
U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
int32_t len = num * spanLength;
if (i >= (spanStart - len)) {
int32_t n = ((spanStart - i - 1) / spanLength) + 1;
// 1 <= n <= num
srcIndex -= n * oldLength_;
replIndex -= n * newLength_;
destIndex -= n * newLength_;
remaining += n;
return 0;
}
// Skip all of these edits at once.
srcIndex -= num * oldLength_;
replIndex -= num * newLength_;
destIndex -= num * newLength_;
remaining = 0;
}
}
}
// Reset the iterator to the start.
dir = 0;
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} else if (i < (spanStart + spanLength)) {
// The index is in the current span.
@ -536,21 +714,21 @@ int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &erro
// The index is in the current span.
return 0;
}
if (remaining > 0) {
if (remaining > 1) {
// Is the index in one of the remaining compressed edits?
// spanStart is the start of the current span, before the remaining ones.
int32_t len = (remaining + 1) * spanLength;
// spanStart is the start of the current span, first of the remaining ones.
int32_t len = remaining * spanLength;
if (i < (spanStart + len)) {
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining
len = n * spanLength;
srcIndex += len;
replIndex += len;
destIndex += len;
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
srcIndex += n * oldLength_;
replIndex += n * newLength_;
destIndex += n * newLength_;
remaining -= n;
return 0;
}
// Make next() skip all of these edits at once.
oldLength_ = newLength_ = len;
oldLength_ *= remaining;
newLength_ *= remaining;
remaining = 0;
}
}

View file

@ -694,7 +694,7 @@ FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& st
}
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
if(U_FAILURE(status)) return NULL;
LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
return (U_SUCCESS(status))? ret.orphan(): NULL;

View file

@ -22,11 +22,11 @@
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringoptions.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cpputils.h"
#include "ustr_imp.h" // U_EDITS_NO_RESET
U_NAMESPACE_BEGIN

View file

@ -33,6 +33,8 @@ class U_COMMON_API Hashtable : public UMemory {
inline void init(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, UErrorCode& status);
inline void initSize(UHashFunction *keyHash, UKeyComparator *keyComp, UValueComparator *valueComp, int32_t size, UErrorCode& status);
public:
/**
* Construct a hashtable
@ -41,6 +43,14 @@ public:
*/
Hashtable(UBool ignoreKeyCase, UErrorCode& status);
/**
* Construct a hashtable
* @param ignoreKeyCase If true, keys are case insensitive.
* @param size initial size allocation
* @param status Error code
*/
Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status);
/**
* Construct a hashtable
* @param keyComp Comparator for comparing the keys
@ -76,9 +86,9 @@ public:
int32_t puti(const UnicodeString& key, int32_t value, UErrorCode& status);
void* get(const UnicodeString& key) const;
int32_t geti(const UnicodeString& key) const;
void* remove(const UnicodeString& key);
int32_t removei(const UnicodeString& key);
@ -92,9 +102,9 @@ public:
* @see uhash_nextElement
*/
const UHashElement* nextElement(int32_t& pos) const;
UKeyComparator* setKeyComparator(UKeyComparator*keyComp);
UValueComparator* setValueComparator(UValueComparator* valueComp);
UBool equals(const Hashtable& that) const;
@ -107,7 +117,7 @@ private:
* Implementation
********************************************************************/
inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
UValueComparator *valueComp, UErrorCode& status) {
if (U_FAILURE(status)) {
return;
@ -119,10 +129,23 @@ inline void Hashtable::init(UHashFunction *keyHash, UKeyComparator *keyComp,
}
}
inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp,
/**
 * Initializes the embedded hash object via uhash_initSize(), passing the
 * requested size through.
 * Does nothing if status already indicates a failure on entry.
 * On success, points "hash" at the embedded object and installs
 * uprv_deleteUObject as the key deleter.
 */
inline void Hashtable::initSize(UHashFunction *keyHash, UKeyComparator *keyComp,
                                UValueComparator *valueComp, int32_t size, UErrorCode& status) {
    if (U_SUCCESS(status)) {
        uhash_initSize(&hashObj, keyHash, keyComp, valueComp, size, &status);
    }
    if (U_FAILURE(status)) {
        // Either the caller passed in a failure or uhash_initSize() reported one.
        return;
    }
    hash = &hashObj;
    uhash_setKeyDeleter(hash, uprv_deleteUObject);
}
// Constructs a hashtable that hashes keys with uhash_hashUnicodeString and
// uses the caller-supplied key and value comparators.
inline Hashtable::Hashtable(UKeyComparator *keyComp, UValueComparator *valueComp,
                            UErrorCode& status)
    : hash(NULL)
{
    init(uhash_hashUnicodeString, keyComp, valueComp, status);
}
inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status)
: hash(0)
{
@ -134,6 +157,17 @@ inline Hashtable::Hashtable(UBool ignoreKeyCase, UErrorCode& status)
status);
}
// Constructs a hashtable with an initial size allocation.
// ignoreKeyCase selects the caseless hash/compare function pair for the keys;
// otherwise the regular UnicodeString pair is used. No value comparator is set.
inline Hashtable::Hashtable(UBool ignoreKeyCase, int32_t size, UErrorCode& status)
    : hash(0)
{
    if (ignoreKeyCase) {
        initSize(uhash_hashCaselessUnicodeString,
                 uhash_compareCaselessUnicodeString,
                 NULL, size,
                 status);
    } else {
        initSize(uhash_hashUnicodeString,
                 uhash_compareUnicodeString,
                 NULL, size,
                 status);
    }
}
inline Hashtable::Hashtable(UErrorCode& status)
: hash(0)
{
@ -200,7 +234,7 @@ inline void Hashtable::removeAll(void) {
// Forwards to uhash_setKeyComparator() on the underlying hash and returns its result.
inline UKeyComparator* Hashtable::setKeyComparator(UKeyComparator*keyComp){
    UKeyComparator *result = uhash_setKeyComparator(hash, keyComp);
    return result;
}
// Forwards to uhash_setValueComparator() on the underlying hash and returns its result.
inline UValueComparator* Hashtable::setValueComparator(UValueComparator* valueComp){
    UValueComparator *result = uhash_setValueComparator(hash, valueComp);
    return result;
}

View file

@ -542,7 +542,7 @@ uloc_getDisplayName(const char *locale,
return 0;
}
separator = (const UChar *)p0 + subLen;
sepLen = p1 - separator;
sepLen = static_cast<int32_t>(p1 - separator);
}
if(patLen==0 || (patLen==defaultPatLen && !u_strncmp(pattern, defaultPattern, patLen))) {
@ -558,8 +558,8 @@ uloc_getDisplayName(const char *locale,
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
sub0Pos=p0-pattern;
sub1Pos=p1-pattern;
sub0Pos = static_cast<int32_t>(p0-pattern);
sub1Pos = static_cast<int32_t>(p1-pattern);
if (sub1Pos < sub0Pos) { /* a very odd pattern */
int32_t t=sub0Pos; sub0Pos=sub1Pos; sub1Pos=t;
langi=1;

View file

@ -54,7 +54,7 @@ static int32_t ncat(char *buffer, uint32_t buflen, ...) {
*p = 0;
va_end(args);
return p - buffer;
return static_cast<int32_t>(p - buffer);
}
U_NAMESPACE_BEGIN

View file

@ -300,21 +300,21 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
1,1,1,1,0x864,0x198d,1,1,1,1,1,1,0x868,0x1993,1,0x86c,
0x1999,1,1,1,1,1,1,1,0xfc0e,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,0xffcc,0xffb8,0xffcc,
0xffcc,1,1,1,0x29dd,0x29e3,0x29e9,0x29ef,0x29f5,0x29fb,0x2a01,0x2a07,1,1,1,1,
0xffcc,1,1,1,0x29dc,0x29e2,0x29e8,0x29ee,0x29f4,0x29fa,0x2a00,0x2a06,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,0xfe0e,1,0xfc00,1,1,1,1,1,
1,1,1,0x870,1,1,1,0x199f,0x19a5,0xfe12,1,1,1,1,1,1,
1,1,1,0xfc00,1,1,1,1,0x2a0d,0x2a13,1,0x2a19,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1f,
1,1,0x2a25,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
1,1,1,0xfc00,1,1,1,1,0x2a0c,0x2a12,1,0x2a18,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1e,
1,1,0x2a24,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
1,1,1,1,1,0x2a2b,0x2a31,0x2a37,1,1,0x2a3d,1,1,1,1,1,
1,1,1,1,1,0x2a2a,0x2a30,0x2a36,1,1,0x2a3c,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x878,
0x19ab,1,1,0x19b1,0x19b7,0xfe12,1,1,1,1,1,1,1,1,0xfc00,0xfc00,
1,1,1,1,0x2a43,0x2a49,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,0x2a42,0x2a48,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,0x884,1,0x19bd,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,0xfc00,1,
@ -342,7 +342,7 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
1,1,1,0x2a4f,1,1,1,1,1,1,1,1,1,0x2a55,1,1,
1,1,0x2a5b,1,1,1,1,0x2a61,1,1,1,1,0x2a67,1,1,1,
1,1,1,1,1,1,1,1,1,0x2a6d,1,1,1,1,1,1,
1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a73,1,0x2a79,1,0xff04,0xff04,0xff04,0xff04,1,1,
1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a72,1,0x2a78,1,0xff04,0xff04,0xff04,0xff04,1,1,
0xff04,0x3c50,0xffcc,0xffcc,0xfe12,1,0xffcc,0xffcc,1,1,1,1,1,1,1,1,
1,1,1,0x2a7f,1,1,1,1,1,1,1,1,1,0x2a85,1,1,
1,1,0x2a8b,1,1,1,1,0x2a91,1,1,1,1,0x2a97,1,1,1,
@ -406,15 +406,15 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
0x21ef,0x21f9,0x2203,0x220d,0x10d8,0x10e6,0x2217,0x2221,0x222b,0x2235,1,1,0x10f4,0x1102,0x223f,0x2249,
0x2253,0x225d,1,1,0x1110,0x1122,0x2267,0x2271,0x227b,0x2285,0x228f,0x2299,1,0x1134,1,0x22a3,
1,0x22ad,1,0x22b7,0x1146,0x115c,0x1174,0x1182,0x1190,0x119e,0x11ac,0x11ba,0x11c6,0x11dc,0x11f4,0x1202,
0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b97,0x1250,0x3b9e,0x22c5,0x3ba7,0x22cb,0x3baf,0x22d1,0x3bb7,
0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b96,0x1250,0x3b9e,0x22c5,0x3ba6,0x22cb,0x3bae,0x22d1,0x3bb6,
0x125a,0x3bbe,1,1,0x22d8,0x22e2,0x22f1,0x2301,0x2311,0x2321,0x2331,0x2341,0x234c,0x2356,0x2365,0x2375,
0x2385,0x2395,0x23a5,0x23b5,0x23c0,0x23ca,0x23d9,0x23e9,0x23f9,0x2409,0x2419,0x2429,0x2434,0x243e,0x244d,0x245d,
0x246d,0x247d,0x248d,0x249d,0x24a8,0x24b2,0x24c1,0x24d1,0x24e1,0x24f1,0x2501,0x2511,0x251c,0x2526,0x2535,0x2545,
0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc7,
0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bcf,0x2607,0x3bd7,
0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be1,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3beb,
1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf5,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bff,
0x26b3,0x26b9,0x3c07,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0f,0x26e9,0x3c17,
0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc6,
0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bce,0x2607,0x3bd6,
0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be0,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3bea,
1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf4,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bfe,
0x26b3,0x26b9,0x3c06,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0e,0x26e9,0x3c16,
0x26ee,0x2aab,0x8fc,1,0xfa09,0xfa09,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,0xffcc,0xffcc,0xfe02,0xfe02,0xffcc,0xffcc,0xffcc,0xffcc,0xfe02,0xfe02,0xfe02,0xffcc,
@ -512,10 +512,10 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
0x311b,0x3009,0x311f,0x3123,0x3127,0x312b,0x312f,0x3011,0x2f09,0x3133,0x3015,0x3137,0x3019,0x313b,0x2ae1,0x313f,
0x3145,0x314b,0x3151,0x3155,0x3159,0x315d,0x3163,0x3169,0x316f,0x3173,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0x3177,0xfe34,0x317d,1,1,1,1,
1,1,1,1,1,1,0x3183,0x3189,0x3191,0x319b,0x31a3,0x31a9,0x31af,0x31b5,0x31bb,0x31c1,
0x31c7,0x31cd,0x31d3,1,0x31d9,0x31df,0x31e5,0x31eb,0x31f1,1,0x31f7,1,0x31fd,0x3203,1,0x3209,
0x320f,1,0x3215,0x321b,0x3221,0x3227,0x322d,0x3233,0x3239,0x323f,0x3245,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0x3176,0xfe34,0x317c,1,1,1,1,
1,1,1,1,1,1,0x3182,0x3188,0x3190,0x319a,0x31a2,0x31a8,0x31ae,0x31b4,0x31ba,0x31c0,
0x31c6,0x31cc,0x31d2,1,0x31d8,0x31de,0x31e4,0x31ea,0x31f0,1,0x31f6,1,0x31fc,0x3202,1,0x3208,
0x320e,1,0x3214,0x321a,0x3220,0x3226,0x322c,0x3232,0x3238,0x323e,0x3244,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,
0xffcc,0xffcc,0xffcc,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffcc,0xffcc,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@ -560,13 +560,13 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,0xfe02,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,0x324b,0x3255,0x3269,0x3281,0x3299,0x32b1,0x32c9,0xffb0,0xffb0,0xfe02,
1,1,1,1,1,1,0x324a,0x3254,0x3268,0x3280,0x3298,0x32b0,0x32c8,0xffb0,0xffb0,0xfe02,
0xfe02,0xfe02,1,1,1,0xffc4,0xffb0,0xffb0,0xffb0,0xffb0,0xffb0,1,1,1,1,1,
1,1,1,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,1,1,0xffcc,0xffcc,0xffcc,
0xffcc,0xffcc,0xffb8,0xffb8,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,1,1,
1,1,1,1,1,1,1,1,1,1,1,0x32d7,0x32e1,0x32f5,0x330d,0x3325,
0x333d,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,0x32d6,0x32e0,0x32f4,0x330c,0x3324,
0x333c,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,0xffcc,0xffcc,0xffcc,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

View file

@ -20,10 +20,10 @@
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringoptions.h"
#include "unicode/unistr.h"
#include "cpputils.h"
#include "normalizer2impl.h"
#include "ustr_imp.h" // U_EDITS_NO_RESET
U_NAMESPACE_BEGIN
@ -226,14 +226,14 @@ public:
private:
virtual void
normalize(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
impl.compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
}
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override {
Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
if (U_FAILURE(errorCode)) {
return;
}
@ -249,12 +249,12 @@ private:
virtual void
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
UnicodeString &safeMiddle,
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, safeMiddle, buffer, errorCode);
}
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
if(U_FAILURE(errorCode)) {
return FALSE;
}
@ -271,7 +271,7 @@ private:
return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
}
virtual UBool
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const U_OVERRIDE {
if(U_FAILURE(errorCode)) {
return FALSE;
}
@ -279,7 +279,7 @@ private:
return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode);
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
if(U_FAILURE(errorCode)) {
return UNORM_MAYBE;
}
@ -293,20 +293,20 @@ private:
return qcResult;
}
virtual const UChar *
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &) const override {
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &) const U_OVERRIDE {
return impl.composeQuickCheck(src, limit, onlyContiguous, NULL);
}
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const U_OVERRIDE {
return impl.getCompQuickCheck(impl.getNorm16(c));
}
virtual UBool hasBoundaryBefore(UChar32 c) const override {
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
return impl.hasCompBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const override {
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
return impl.hasCompBoundaryAfter(c, onlyContiguous);
}
virtual UBool isInert(UChar32 c) const override {
virtual UBool isInert(UChar32 c) const U_OVERRIDE {
return impl.isCompInert(c, onlyContiguous);
}

View file

@ -22,6 +22,7 @@
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringoptions.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cstring.h"
@ -30,7 +31,6 @@
#include "normalizer2impl.h"
#include "uassert.h"
#include "ucln_cmn.h"
#include "ustr_imp.h" // U_EDITS_NO_RESET
using icu::Normalizer2Impl;
@ -85,7 +85,7 @@ class NoopNormalizer2 : public Normalizer2 {
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const override {
UErrorCode &errorCode) const U_OVERRIDE {
if(U_SUCCESS(errorCode)) {
if(&dest!=&src) {
dest=src;
@ -97,7 +97,7 @@ class NoopNormalizer2 : public Normalizer2 {
}
virtual void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override {
Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
if(U_SUCCESS(errorCode)) {
if (edits != nullptr) {
if ((options & U_EDITS_NO_RESET) == 0) {
@ -115,7 +115,7 @@ class NoopNormalizer2 : public Normalizer2 {
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override {
UErrorCode &errorCode) const U_OVERRIDE {
if(U_SUCCESS(errorCode)) {
if(&first!=&second) {
first.append(second);
@ -128,7 +128,7 @@ class NoopNormalizer2 : public Normalizer2 {
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override {
UErrorCode &errorCode) const U_OVERRIDE {
if(U_SUCCESS(errorCode)) {
if(&first!=&second) {
first.append(second);
@ -139,29 +139,29 @@ class NoopNormalizer2 : public Normalizer2 {
return first;
}
virtual UBool
getDecomposition(UChar32, UnicodeString &) const override {
getDecomposition(UChar32, UnicodeString &) const U_OVERRIDE {
return FALSE;
}
// No need to override the default getRawDecomposition().
// No need to override the default getRawDecomposition().
virtual UBool
isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
isNormalized(const UnicodeString &, UErrorCode &errorCode) const U_OVERRIDE {
return U_SUCCESS(errorCode);
}
virtual UBool
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const U_OVERRIDE {
return U_SUCCESS(errorCode);
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &, UErrorCode &) const override {
quickCheck(const UnicodeString &, UErrorCode &) const U_OVERRIDE {
return UNORM_YES;
}
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override {
spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const U_OVERRIDE {
return s.length();
}
virtual UBool hasBoundaryBefore(UChar32) const override { return TRUE; }
virtual UBool hasBoundaryAfter(UChar32) const override { return TRUE; }
virtual UBool isInert(UChar32) const override { return TRUE; }
virtual UBool hasBoundaryBefore(UChar32) const U_OVERRIDE { return TRUE; }
virtual UBool hasBoundaryAfter(UChar32) const U_OVERRIDE { return TRUE; }
virtual UBool isInert(UChar32) const U_OVERRIDE { return TRUE; }
};
NoopNormalizer2::~NoopNormalizer2() {}

View file

@ -28,6 +28,7 @@
#include "unicode/ustring.h"
#include "unicode/utf16.h"
#include "unicode/utf8.h"
#include "bytesinkutil.h"
#include "cmemory.h"
#include "mutex.h"
#include "normalizer2impl.h"
@ -129,60 +130,6 @@ int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
return -1;
}
/**
 * The bytes at [src, nextSrc[ were mapped to valid (s16, s16Length).
 *
 * Converts the UTF-16 text to UTF-8, appends it to sink piece by piece, and,
 * if edits is not nullptr, records one replacement of (nextSrc - src) source
 * bytes by the total number of UTF-8 bytes written.
 *
 * @return TRUE on success; FALSE with errorCode set to U_INDEX_OUTOFBOUNDS_ERROR
 *         if the total output length would exceed INT32_MAX.
 */
UBool
appendChange(const uint8_t *src, const uint8_t *nextSrc,
const char16_t *s16, int32_t s16Length,
ByteSink &sink, Edits *edits, UErrorCode &errorCode) {
U_ASSERT(U_SUCCESS(errorCode));
U_ASSERT((nextSrc - src) <= INT32_MAX); // ensured by caller
// Fallback buffer handed to GetAppendBuffer().
char scratch[200];
// Total number of UTF-8 bytes appended so far.
int32_t s8Length = 0;
for (int32_t i = 0; i < s16Length;) {
int32_t capacity;
// Estimate the space for the rest of the text without overflowing int32_t.
int32_t desiredCapacity = s16Length - i;
if (desiredCapacity < (INT32_MAX / 3)) {
desiredCapacity *= 3; // max 3 UTF-8 bytes per UTF-16 code unit
} else if (desiredCapacity < (INT32_MAX / 2)) {
desiredCapacity *= 2;
} else {
desiredCapacity = INT32_MAX;
}
char *buffer = sink.GetAppendBuffer(U8_MAX_LENGTH, desiredCapacity,
scratch, UPRV_LENGTHOF(scratch), &capacity);
// Keep U8_MAX_LENGTH - 1 bytes in reserve so that a whole code point
// still fits whenever the loop below sees j < capacity.
capacity -= U8_MAX_LENGTH - 1;
int32_t j = 0;
for (; i < s16Length && j < capacity;) {
UChar32 c;
U16_NEXT_UNSAFE(s16, i, c);
U8_APPEND_UNSAFE(buffer, j, c);
}
// Guard the running total against int32_t overflow before adding j.
if (j > (INT32_MAX - s8Length)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
sink.Append(buffer, j);
s8Length += j;
}
// The replacement is recorded only after all output has been appended.
if (edits != nullptr) {
edits->addReplace((int32_t)(nextSrc - src), s8Length);
}
return TRUE;
}
/** The few bytes at [src, nextSrc[ were mapped to valid code point c. */
void
appendCodePoint(const uint8_t *src, const uint8_t *nextSrc, UChar32 c,
ByteSink &sink, Edits *edits) {
char buffer[U8_MAX_LENGTH];
int32_t length = 0;
U8_APPEND_UNSAFE(buffer, length, c);
if (edits != nullptr) {
edits->addReplace((int32_t)(nextSrc - src), length);
}
sink.Append(buffer, length);
}
void
appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
ByteSink &sink, Edits *edits) {
@ -214,27 +161,6 @@ appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t del
sink.Append(buffer, length);
}
/**
 * Handles the unchanged bytes at [s, limit[.
 * Records them in edits (if not nullptr) and appends them to sink, unless
 * options contains U_OMIT_UNCHANGED_TEXT. An empty range is a no-op.
 *
 * @return TRUE on success; FALSE with errorCode set to U_INDEX_OUTOFBOUNDS_ERROR
 *         if the range is longer than INT32_MAX.
 */
UBool
appendUnchanged(const uint8_t *s, const uint8_t *limit,
                ByteSink &sink, uint32_t options, Edits *edits,
                UErrorCode &errorCode) {
    U_ASSERT(U_SUCCESS(errorCode));
    if ((limit - s) > INT32_MAX) {
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return FALSE;
    }
    const int32_t numBytes = static_cast<int32_t>(limit - s);
    if (numBytes <= 0) {
        return TRUE;
    }
    if (edits != nullptr) {
        edits->addUnchanged(numBytes);
    }
    const bool omitUnchanged = (options & U_OMIT_UNCHANGED_TEXT) != 0;
    if (!omitUnchanged) {
        sink.Append(reinterpret_cast<const char *>(s), numBytes);
    }
    return TRUE;
}
} // namespace
// ReorderingBuffer -------------------------------------------------------- ***
@ -1851,7 +1777,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
for (;;) {
if (src == limit) {
if (prevBoundary != limit && sink != nullptr) {
appendUnchanged(prevBoundary, limit, *sink, options, edits, errorCode);
ByteSinkUtil::appendUnchanged(prevBoundary, limit,
*sink, options, edits, errorCode);
}
return TRUE;
}
@ -1884,7 +1811,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
hasCompBoundaryBefore(src, limit)) {
if (prevBoundary != prevSrc &&
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) {
break;
}
appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
@ -1896,13 +1824,14 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
hasCompBoundaryBefore(src, limit)) {
if (prevBoundary != prevSrc &&
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) {
break;
}
const uint16_t *mapping = getMapping(norm16);
int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
if (!appendChange(prevSrc, src, (const UChar *)mapping, length,
*sink, edits, errorCode)) {
if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
*sink, edits, errorCode)) {
break;
}
prevBoundary = src;
@ -1915,7 +1844,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
if (hasCompBoundaryBefore(src, limit) ||
hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
if (prevBoundary != prevSrc &&
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) {
break;
}
if (edits != nullptr) {
@ -1955,10 +1885,11 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
Hangul::JAMO_T_COUNT + t;
prevSrc -= 3; // Replace the Jamo L as well.
if (prevBoundary != prevSrc &&
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) {
break;
}
appendCodePoint(prevSrc, src, syllable, *sink, edits);
ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
prevBoundary = src;
continue;
}
@ -1979,10 +1910,11 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
prevSrc -= 3; // Replace the Hangul LV as well.
if (prevBoundary != prevSrc &&
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) {
break;
}
appendCodePoint(prevSrc, src, syllable, *sink, edits);
ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
prevBoundary = src;
continue;
}
@ -2006,7 +1938,8 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
for (;;) {
if (src == limit) {
if (sink != nullptr) {
appendUnchanged(prevBoundary, limit, *sink, options, edits, errorCode);
ByteSinkUtil::appendUnchanged(prevBoundary, limit,
*sink, options, edits, errorCode);
}
return TRUE;
}
@ -2070,11 +2003,12 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
return FALSE;
}
if (prevBoundary != prevSrc &&
!appendUnchanged(prevBoundary, prevSrc, *sink, options, edits, errorCode)) {
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
*sink, options, edits, errorCode)) {
break;
}
if (!appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
*sink, edits, errorCode)) {
if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
*sink, edits, errorCode)) {
break;
}
prevBoundary = src;

View file

@ -675,6 +675,16 @@ extern U_IMPORT char *U_TZNAME[];
#if !UCONFIG_NO_FILE_IO && ((U_PLATFORM_IS_DARWIN_BASED && (U_PLATFORM != U_PF_IPHONE || defined(U_TIMEZONE))) || U_PLATFORM_IS_LINUX_BASED || U_PLATFORM == U_PF_BSD || U_PLATFORM == U_PF_SOLARIS)
/* These platforms are likely to use Olson timezone IDs. */
/* common targets of the symbolic link at TZDEFAULT are:
* "/usr/share/zoneinfo/<olsonID>" default, older Linux distros, macOS to 10.12
* "../usr/share/zoneinfo/<olsonID>" newer Linux distros: Red Hat Enterprise Linux 7, Ubuntu, SuSe Linux
* "/usr/share/lib/zoneinfo/<olsonID>" Solaris
* "../usr/share/lib/zoneinfo/<olsonID>" Solaris
* "/var/db/timezone/zoneinfo/<olsonID>" macOS 10.13
* To avoid checking lots of paths, just check that the target path
* before the <olsonID> ends with "/zoneinfo/", and the <olsonID> is valid.
*/
#define CHECK_LOCALTIME_LINK 1
#if U_PLATFORM_IS_DARWIN_BASED
#include <tzfile.h>
@ -682,12 +692,12 @@ extern U_IMPORT char *U_TZNAME[];
#elif U_PLATFORM == U_PF_SOLARIS
#define TZDEFAULT "/etc/localtime"
#define TZZONEINFO "/usr/share/lib/zoneinfo/"
#define TZZONEINFO2 "../usr/share/lib/zoneinfo/"
#define TZ_ENV_CHECK "localtime"
#else
#define TZDEFAULT "/etc/localtime"
#define TZZONEINFO "/usr/share/zoneinfo/"
#endif
#define TZZONEINFOTAIL "/zoneinfo/"
#if U_HAVE_DIRENT_H
#define TZFILE_SKIP "posixrules" /* tz file to skip when searching. */
/* Some Linux distributions have 'localtime' in /usr/share/zoneinfo
@ -1131,24 +1141,15 @@ uprv_tzname(int n)
*/
int32_t ret = (int32_t)readlink(TZDEFAULT, gTimeZoneBuffer, sizeof(gTimeZoneBuffer)-1);
if (0 < ret) {
int32_t tzZoneInfoLen = uprv_strlen(TZZONEINFO);
int32_t tzZoneInfoTailLen = uprv_strlen(TZZONEINFOTAIL);
gTimeZoneBuffer[ret] = 0;
if (uprv_strncmp(gTimeZoneBuffer, TZZONEINFO, tzZoneInfoLen) == 0
&& isValidOlsonID(gTimeZoneBuffer + tzZoneInfoLen))
char * tzZoneInfoTailPtr = uprv_strstr(gTimeZoneBuffer, TZZONEINFOTAIL);
if (tzZoneInfoTailPtr != NULL
&& isValidOlsonID(tzZoneInfoTailPtr + tzZoneInfoTailLen))
{
return (gTimeZoneBufferPtr = gTimeZoneBuffer + tzZoneInfoLen);
return (gTimeZoneBufferPtr = tzZoneInfoTailPtr + tzZoneInfoTailLen);
}
#if U_PLATFORM == U_PF_SOLARIS
else
{
tzZoneInfoLen = uprv_strlen(TZZONEINFO2);
if (uprv_strncmp(gTimeZoneBuffer, TZZONEINFO2, tzZoneInfoLen) == 0
&& isValidOlsonID(gTimeZoneBuffer + tzZoneInfoLen))
{
return (gTimeZoneBufferPtr = gTimeZoneBuffer + tzZoneInfoLen);
}
}
#endif
} else {
#if defined(SEARCH_TZFILE)
DefaultTZInfo* tzInfo = (DefaultTZInfo*)uprv_malloc(sizeof(DefaultTZInfo));

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,622 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// file: rbbi_cache.cpp
#include "unicode/utypes.h"
#include "unicode/ubrk.h"
#include "unicode/rbbi.h"
#include "rbbi_cache.h"
#include "brkeng.h"
#include "cmemory.h"
#include "rbbidata.h"
#include "uassert.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/*
* DictionaryCache implementation
*/
/**
 * Constructs an empty dictionary cache that belongs to the break iterator bi.
 *
 * @param bi     The owning break iterator; stored, not owned.
 * @param status Receives any error from creating the boundary vector. If the vector
 *               could not be allocated at all, it is set to U_MEMORY_ALLOCATION_ERROR
 *               so that the caller can detect the unusable cache.
 */
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
        fBI(bi), fBreaks(NULL), fPositionInCache(-1),
        fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
    fBreaks = new UVector32(status);
    if (fBreaks == NULL && U_SUCCESS(status)) {
        // The allocation itself failed, so no UVector32 constructor ran to report it.
        // Without this, status would stay successful and later calls would dereference NULL.
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
/**
 * Releases the boundary vector owned by this cache.
 */
RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
    // Deleting a null pointer is a no-op, so no guard is needed.
    delete fBreaks;
    fBreaks = nullptr;
}
/**
 * Empties the cache: removes all stored boundaries, and clears the covered
 * text range, the rule status indexes and the sequential-access position.
 */
void RuleBasedBreakIterator::DictionaryCache::reset() {
    fBreaks->removeAllElements();
    fFirstRuleStatusIndex = fOtherRuleStatusIndex = 0;
    fStart = fLimit = 0;
    fPositionInCache = -1;   // no boundary has been returned yet
}
/**
 * Looks up the cached dictionary boundary that follows fromPos.
 *
 * @param fromPos     Text position to search from.
 * @param result      Receives the following boundary on success.
 * @param statusIndex Receives the rule status index for that boundary on success.
 * @return TRUE if a boundary was found. FALSE if fromPos is outside [fStart, fLimit)
 *         or no later boundary is cached; fPositionInCache is then set to -1.
 */
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    if (!(fStart <= fromPos && fromPos < fLimit)) {
        fPositionInCache = -1;
        return FALSE;
    }

    const int32_t count = fBreaks->size();

    // Fast path for sequential iteration: the last boundary handed out is fromPos,
    // so the answer is simply the next element.
    if (fPositionInCache >= 0 && fPositionInCache < count
            && fBreaks->elementAti(fPositionInCache) == fromPos) {
        const int32_t nextIdx = fPositionInCache + 1;
        if (nextIdx >= count) {
            fPositionInCache = -1;
            return FALSE;
        }
        fPositionInCache = nextIdx;
        const int32_t boundary = fBreaks->elementAti(nextIdx);
        U_ASSERT(boundary > fromPos);
        *result = boundary;
        *statusIndex = fOtherRuleStatusIndex;
        return TRUE;
    }

    // Random access: scan from the front for the first boundary beyond fromPos.
    for (int32_t idx = 0; idx < count; ++idx) {
        const int32_t boundary = fBreaks->elementAti(idx);
        if (boundary > fromPos) {
            fPositionInCache = idx;
            *result = boundary;
            *statusIndex = fOtherRuleStatusIndex;
            return TRUE;
        }
    }

    // Not expected: fromPos is below fLimit, so some boundary should have followed it.
    U_ASSERT(FALSE);
    fPositionInCache = -1;
    return FALSE;
}
/**
 * Looks up the cached dictionary boundary that precedes fromPos.
 *
 * @param fromPos     Text position to search from.
 * @param result      Receives the preceding boundary on success.
 * @param statusIndex Receives the rule status index for that boundary on success:
 *                    fFirstRuleStatusIndex when the boundary is fStart,
 *                    fOtherRuleStatusIndex otherwise.
 * @return TRUE if a boundary was found. FALSE if fromPos is outside (fStart, fLimit]
 *         or no earlier boundary is cached; fPositionInCache is then set to -1.
 */
UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
    if (!(fStart < fromPos && fromPos <= fLimit)) {
        fPositionInCache = -1;
        return FALSE;
    }

    const int32_t count = fBreaks->size();

    if (fromPos == fLimit) {
        // Starting from the end of the range: position on the last cached boundary.
        fPositionInCache = count - 1;
        if (fPositionInCache >= 0) {
            U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
        }
    }

    // Fast path for sequential iteration: step back one element.
    if (fPositionInCache > 0 && fPositionInCache < count
            && fBreaks->elementAti(fPositionInCache) == fromPos) {
        const int32_t boundary = fBreaks->elementAti(--fPositionInCache);
        U_ASSERT(boundary < fromPos);
        *result = boundary;
        if (boundary == fStart) {
            *statusIndex = fFirstRuleStatusIndex;
        } else {
            *statusIndex = fOtherRuleStatusIndex;
        }
        return TRUE;
    }

    if (fPositionInCache == 0) {
        // Already at the first cached boundary; nothing precedes it here.
        fPositionInCache = -1;
        return FALSE;
    }

    // Random access: scan from the back for the last boundary before fromPos.
    for (int32_t idx = count - 1; idx >= 0; --idx) {
        const int32_t boundary = fBreaks->elementAti(idx);
        if (boundary < fromPos) {
            fPositionInCache = idx;
            *result = boundary;
            if (boundary == fStart) {
                *statusIndex = fFirstRuleStatusIndex;
            } else {
                *statusIndex = fOtherRuleStatusIndex;
            }
            return TRUE;
        }
    }

    // Not expected: fromPos is above fStart, so some boundary should have preceded it.
    U_ASSERT(FALSE);
    fPositionInCache = -1;
    return FALSE;
}
// Fill the dictionary cache with the boundaries that the language break engines
// find within the text range [startPos, endPos).
//
// startPos, endPos:  the text range, as produced by the rule based iteration.
// firstRuleStatus:   rule status index recorded for the boundary at the start of the range.
// otherRuleStatus:   rule status index recorded for all other boundaries.
//
// On return the cache either holds a list of boundaries whose first and last entries
// are fStart and fLimit, or (when no breaks were found) is left empty after reset().
// Errors from the vector operations go into a local UErrorCode and are not reported
// to the caller.
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
int32_t firstRuleStatus, int32_t otherRuleStatus) {
// Ranges spanning at most one position are ignored. Note that this early return
// happens before reset(), so any previous cache contents are left in place.
if ((endPos - startPos) <= 1) {
return;
}
reset();
fFirstRuleStatusIndex = firstRuleStatus;
fOtherRuleStatusIndex = otherRuleStatus;
int32_t rangeStart = startPos;
int32_t rangeEnd = endPos;
uint16_t category;
int32_t current;
UErrorCode status = U_ZERO_ERROR;
int32_t foundBreakCount = 0;
UText *text = fBI->fText;
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
// any breaks within the span.
utext_setNativeIndex(text, rangeStart);
UChar32 c = utext_current32(text);
category = UTRIE2_GET16(fBI->fData->fTrie, c);
while(U_SUCCESS(status)) {
// Skip forward over characters whose trie value does not have the 0x4000 bit set,
// stopping at the end of the range.
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) {
utext_next32(text); // TODO: cleaner loop structure.
c = utext_current32(text);
category = UTRIE2_GET16(fBI->fData->fTrie, c);
}
if (current >= rangeEnd) {
break;
}
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
// NOTE(review): if lbe were NULL the text index would not advance and this loop would
// not make progress; presumably getLanguageBreakEngine() never returns NULL here - confirm.
if (lbe != NULL) {
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
}
// Reload the loop variables for the next go-round
c = utext_current32(text);
category = UTRIE2_GET16(fBI->fData->fTrie, c);
}
// If we found breaks, ensure that the first and last entries are
// the original starting and ending position. And initialize the
// cache iteration position to the first entry.
// printf("foundBreakCount = %d\n", foundBreakCount);
if (foundBreakCount > 0) {
U_ASSERT(foundBreakCount == fBreaks->size());
if (startPos < fBreaks->elementAti(0)) {
// The dictionary did not place a boundary at the start of the segment of text.
// Add one now. This should not commonly happen, but it would be easy for interactions
// of the rules for dictionary segments and the break engine implementations to
// inadvertently cause it. Cover it here, just in case.
fBreaks->insertElementAt(startPos, 0, status);
}
// Likewise make sure the end of the segment is present as the last boundary.
if (endPos > fBreaks->peeki()) {
fBreaks->push(endPos, status);
}
fPositionInCache = 0;
// Note: Dictionary matching may extend beyond the original limit.
fStart = fBreaks->elementAti(0);
fLimit = fBreaks->peeki();
} else {
// there were no language-based breaks, even though the segment contained
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
// for this range will fail, and the calling code will fall back to the rule based boundaries.
}
}
/*
* BreakCache implementation
*/
/**
 * Constructs a break cache for the break iterator bi.
 * The cache starts out holding the single boundary 0 with rule status index 0.
 *
 * @param bi     The owning break iterator; stored, not owned.
 * @param status Passed to the constructor of the side buffer vector.
 */
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
        fBI(bi), fSideBuffer(status) {
    // Same as reset() with its default arguments; spelled out for clarity.
    reset(0, 0);
}

/**
 * Destructor. All members clean up after themselves.
 */
RuleBasedBreakIterator::BreakCache::~BreakCache() {
    // Nothing to release explicitly.
}
/**
 * Discards the cache contents and restarts it with a single boundary.
 *
 * @param pos        The text position of the one boundary the cache will hold.
 * @param ruleStatus The rule status index of that boundary.
 */
void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) {
    // Start, end and current indexes all point at slot 0.
    fStartBufIdx = fEndBufIdx = fBufIdx = 0;
    fBoundaries[0] = pos;
    fStatuses[0] = static_cast<uint16_t>(ruleStatus);
    fTextIdx = pos;
}
/**
 * Pushes the current cache position out to the owning break iterator:
 * its position, its rule status index, and a cleared "done" flag.
 *
 * @return The current text position of the cache.
 */
int32_t RuleBasedBreakIterator::BreakCache::current() {
    fBI->fDone = FALSE;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    fBI->fPosition = fTextIdx;
    return fTextIdx;
}
/**
 * Moves the iteration state to the boundary following startPos.
 * Does nothing if status is already a failure, or if startPos could neither be
 * found in the cache nor brought into it.
 */
void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    // Cheapest check first: already there, then a cache lookup, then refilling the cache.
    const UBool positioned =
            startPos == fTextIdx || seek(startPos) || populateNear(startPos, status);
    if (!positioned) {
        return;
    }
    // startPos is in the cache. Do a next() from that position.
    // TODO: an awkward set of interactions with bi->fDone
    //   seek() does not clear it; it can't because of interactions with populateNear().
    //   next() does not clear it in the fast-path case, where everything matters. Maybe it should.
    //   So clear it here, for the case where seek() succeeded on an iterator
    //   that had previously run off the end.
    fBI->fDone = FALSE;
    next();
}
/**
 * Moves the iteration state to the boundary preceding startPos.
 * Does nothing if status is already a failure, or if startPos could neither be
 * found in the cache nor brought into it.
 */
void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (!(startPos == fTextIdx || seek(startPos) || populateNear(startPos, status))) {
        return;
    }
    if (startPos != fTextIdx) {
        // startPos lies between two boundaries, and seek() has left the cache
        // positioned on the preceding one. That is already the answer;
        // current() publishes it to the BreakIterator itself.
        U_ASSERT(startPos > fTextIdx);
        current();
        return;
    }
    // startPos is itself a boundary: step back one.
    previous(status);
}
/*
* Out-of-line code for BreakCache::next().
* Cache does not already contain the boundary
*/
void RuleBasedBreakIterator::BreakCache::nextOL() {
fBI->fDone = !populateFollowing();
fBI->fPosition = fTextIdx;
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
return;
}
/**
 * Moves the iteration position to the previous boundary and publishes the result
 * to the break iterator. If the cache position did not move, the iterator is
 * marked as done. Does nothing if status is already a failure.
 */
void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    const int32_t startingBufIdx = fBufIdx;
    if (fBufIdx != fStartBufIdx) {
        // The preceding boundary is already cached; just step back to it.
        fBufIdx = modChunkSize(fBufIdx - 1);
        fTextIdx = fBoundaries[fBufIdx];
    } else {
        // At the start of the cache. Prepend to it.
        populatePreceding(status);
    }
    fBI->fDone = (fBufIdx == startingBufIdx);
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
}
// Position the cache at pos, or at the cached boundary preceding pos when pos
// is not itself a cached boundary.
// Returns FALSE, leaving the cache position unchanged, when pos lies outside the
// range of boundaries currently held; returns TRUE otherwise.
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
return FALSE;
}
if (pos == fBoundaries[fStartBufIdx]) {
// Common case: seek(0), from BreakIterator::first()
fBufIdx = fStartBufIdx;
fTextIdx = fBoundaries[fBufIdx];
return TRUE;
}
if (pos == fBoundaries[fEndBufIdx]) {
fBufIdx = fEndBufIdx;
fTextIdx = fBoundaries[fBufIdx];
return TRUE;
}
// Here pos is strictly between the first and last cached boundaries.
// Binary search the circular buffer for the first boundary greater than pos.
// min and max are buffer indexes; when the live region wraps around the end of
// the array min is numerically greater than max.
int32_t min = fStartBufIdx;
int32_t max = fEndBufIdx;
while (min != max) {
// For a wrapped range, add CACHE_SIZE so the midpoint is computed on the
// unwrapped distance, then fold it back into the buffer.
int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
probe = modChunkSize(probe);
if (fBoundaries[probe] > pos) {
max = probe;
} else {
min = modChunkSize(probe + 1);
}
}
// max is now the first boundary beyond pos; the entry just before it is the
// boundary at or preceding pos.
U_ASSERT(fBoundaries[max] > pos);
fBufIdx = modChunkSize(max - 1);
fTextIdx = fBoundaries[fBufIdx];
U_ASSERT(fTextIdx <= pos);
return TRUE;
}
/**
 * Adds boundaries to the cache so that it covers the requested position, which must
 * currently lie outside the cached range.
 *
 * If position is a boundary, the iteration position is left on it; otherwise it is
 * left on the boundary preceding position.
 *
 * @param position The text position to cover.
 * @param status   In/out error code. Nothing is done if it is already a failure.
 * @return TRUE on success. FALSE if status is or becomes a failure, or if the cache
 *         could not be extended far enough.
 */
UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

    // Find a boundary somewhere in the vicinity of the requested position.
    // Depending on the safe rules and the text data, it could be either before, at, or after
    // the requested position.

    // If the requested position is not near already cached positions, clear the existing cache,
    // find a near-by boundary and begin new cache contents there.
    if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
        int32_t aBoundary = 0;
        int32_t ruleStatusIndex = 0;
        // TODO: check for position == length of text. Although may still need to back up to get rule status.
        if (position > 20) {
            int32_t backupPos = fBI->handlePrevious(position);
            fBI->fPosition = backupPos;
            aBoundary = fBI->handleNext();   // Ignore dictionary, just finding a rule based boundary.
            ruleStatusIndex = fBI->fRuleStatusIndex;
        }
        reset(aBoundary, ruleStatusIndex);   // Reset cache to hold aBoundary as a single starting point.
    }

    // Fill in boundaries between existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
        // The last position in the cache precedes the requested position.
        // Add following position(s) to the cache.
        while (fBoundaries[fEndBufIdx] < position) {
            if (!populateFollowing()) {
                U_ASSERT(false);
                return false;
            }
        }
        fBufIdx = fEndBufIdx;                // Set iterator position to the end of the buffer.
        fTextIdx = fBoundaries[fBufIdx];     // Required because populateFollowing may add extra boundaries.
        while (fTextIdx > position) {        // Move backwards to a position at or preceding the requested pos.
            previous(status);
            if (U_FAILURE(status)) {
                // previous() does nothing once status has failed; without this
                // check the loop could never make progress again.
                return FALSE;
            }
        }
        return true;
    }

    if (fBoundaries[fStartBufIdx] > position) {
        // The first position in the cache is beyond the requested position.
        // back up more until we get a boundary <= the requested position.
        while (fBoundaries[fStartBufIdx] > position) {
            if (!populatePreceding(status)) {
                // Nothing was prepended (for example status turned into a failure).
                // Retrying would repeat the same no-op forever, so give up.
                return FALSE;
            }
        }
        fBufIdx = fStartBufIdx;              // Set iterator position to the start of the buffer.
        fTextIdx = fBoundaries[fBufIdx];     // Required because populatePreceding may add extra boundaries.
        while (fTextIdx < position) {        // Move forwards to a position at or following the requested pos.
            next();
        }
        if (fTextIdx > position) {
            // If position is not itself a boundary, the next() loop above will overshoot.
            // Back up one, leaving cache position at the boundary preceding the requested position.
            previous(status);
        }
        return true;
    }

    U_ASSERT(fTextIdx == position);
    return true;
}
// Add one or more boundaries to the cache, following the last boundary it currently holds.
// The first boundary added becomes the current cache position; any extra boundaries
// are appended without moving the position.
// Returns FALSE, adding nothing, when the rule based iteration reports UBRK_DONE.
UBool RuleBasedBreakIterator::BreakCache::populateFollowing() {
int32_t fromPosition = fBoundaries[fEndBufIdx];
int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx];
int32_t pos = 0;
int32_t ruleStatusIdx = 0;
// First choice: the dictionary cache may already hold the next boundary.
if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
return TRUE;
}
// Otherwise run the rules forward from the last cached boundary.
fBI->fPosition = fromPosition;
pos = fBI->handleNext();
if (pos == UBRK_DONE) {
return FALSE;
}
ruleStatusIdx = fBI->fRuleStatusIndex;
if (fBI->fDictionaryCharCount > 0) {
// The text segment obtained from the rules includes dictionary characters.
// Subdivide it, with subdivided results going into the dictionary cache.
fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) {
addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
return TRUE;
// TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
// But be careful with interactions with populateNear().
}
}
// Rule based segment did not include dictionary characters.
// Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
// meaning that we didn't take the return, above.
// Add its end point to the cache.
addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
// Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
// (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
//
// The look-ahead stops at the end of the text, or at the first segment that contains
// dictionary characters; that segment's end point is not added here.
for (int count=0; count<6; ++count) {
pos = fBI->handleNext();
if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) {
break;
}
addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition);
}
return TRUE;
}
// Add one or more boundaries to the cache, preceding the first boundary it currently holds.
// The boundary immediately preceding the old first boundary becomes the current cache
// position; earlier boundaries found along the way are prepended while there is room.
// Returns FALSE if status is already a failure, if the first cached boundary is 0,
// or if nothing could be added.
UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
int32_t fromPosition = fBoundaries[fStartBufIdx];
if (fromPosition == 0) {
return FALSE;
}
int32_t position = 0;
int32_t positionStatusIdx = 0;
// First choice: the dictionary cache may already hold the preceding boundary.
if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) {
addPreceding(position, positionStatusIdx, UpdateCachePosition);
return TRUE;
}
int32_t backupPosition = fromPosition;
// Find a boundary somewhere preceding the first already-cached boundary
// Back up in steps of 30 until a boundary strictly before fromPosition is found;
// reaching the start of the text yields boundary 0 with status index 0.
do {
backupPosition = backupPosition - 30;
if (backupPosition <= 0) {
backupPosition = 0;
} else {
backupPosition = fBI->handlePrevious(backupPosition);
}
if (backupPosition == UBRK_DONE || backupPosition == 0) {
position = 0;
positionStatusIdx = 0;
} else {
fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
position = fBI->handleNext();
positionStatusIdx = fBI->fRuleStatusIndex;
}
} while (position >= fromPosition);
// Find boundaries between the one we just located and the first already-cached boundary
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
// The side buffer holds (position, status index) pairs, pushed in that order.
fSideBuffer.removeAllElements();
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
do {
int32_t prevPosition = fBI->fPosition = position;
int32_t prevStatusIdx = positionStatusIdx;
position = fBI->handleNext();
positionStatusIdx = fBI->fRuleStatusIndex;
if (position == UBRK_DONE) {
break;
}
UBool segmentHandledByDictionary = FALSE;
if (fBI->fDictionaryCharCount != 0) {
// Segment from the rules includes dictionary characters.
// Subdivide it, with subdivided results going into the dictionary cache.
int32_t dictSegEndPosition = position;
fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
// Copy the dictionary boundaries that fall before fromPosition into the side buffer.
while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) {
segmentHandledByDictionary = true;
U_ASSERT(position > prevPosition);
if (position >= fromPosition) {
break;
}
U_ASSERT(position <= dictSegEndPosition);
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
prevPosition = position;
}
U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
}
// A plain rule based boundary: record it unless it has reached fromPosition.
if (!segmentHandledByDictionary && position < fromPosition) {
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
}
} while (position < fromPosition);
// Move boundaries from the side buffer to the main circular buffer.
// Entries are popped in reverse order of insertion, so the first pair popped is the
// boundary closest to fromPosition; it becomes the new cache position.
UBool success = FALSE;
if (!fSideBuffer.isEmpty()) {
positionStatusIdx = fSideBuffer.popi();
position = fSideBuffer.popi();
addPreceding(position, positionStatusIdx, UpdateCachePosition);
success = TRUE;
}
while (!fSideBuffer.isEmpty()) {
positionStatusIdx = fSideBuffer.popi();
position = fSideBuffer.popi();
if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
// No space in circular buffer to hold a new preceding result while
// also retaining the current cache (iteration) position.
// Bailing out is safe; the cache will refill again if needed.
break;
}
}
return success;
}
void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
U_ASSERT(position > fBoundaries[fEndBufIdx]);
U_ASSERT(ruleStatusIdx <= UINT16_MAX);
int32_t nextIdx = modChunkSize(fEndBufIdx + 1);
if (nextIdx == fStartBufIdx) {
fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
}
fBoundaries[nextIdx] = position;
fStatuses[nextIdx] = ruleStatusIdx;
fEndBufIdx = nextIdx;
if (update == UpdateCachePosition) {
// Set current position to the newly added boundary.
fBufIdx = nextIdx;
fTextIdx = position;
} else {
// Retaining the original cache position.
// Check if the added boundary wraps around the buffer, and would over-write the original position.
// It's the responsibility of callers of this function to not add too many.
U_ASSERT(nextIdx != fBufIdx);
}
}
bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) {
U_ASSERT(position < fBoundaries[fStartBufIdx]);
U_ASSERT(ruleStatusIdx <= UINT16_MAX);
int32_t nextIdx = modChunkSize(fStartBufIdx - 1);
if (nextIdx == fEndBufIdx) {
if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
// Failure. The insertion of the new boundary would claim the buffer position that is the
// current iteration position. And we also want to retain the current iteration position.
// (The buffer is already completely full of entries that precede the iteration position.)
return false;
}
fEndBufIdx = modChunkSize(fEndBufIdx - 1);
}
fBoundaries[nextIdx] = position;
fStatuses[nextIdx] = ruleStatusIdx;
fStartBufIdx = nextIdx;
if (update == UpdateCachePosition) {
fBufIdx = nextIdx;
fTextIdx = position;
}
return true;
}
/**
 * Debugging aid: prints the current position, followed by every cached boundary
 * from the start of the circular buffer to its end, one "index boundary" pair per line.
 */
void RuleBasedBreakIterator::BreakCache::dumpCache() {
    printf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx);
    int32_t idx = fStartBufIdx;
    for (;;) {
        printf("%d %d\n", idx, fBoundaries[idx]);
        if (idx == fEndBufIdx) {
            break;    // the end index is inclusive, so stop after printing it
        }
        idx = modChunkSize(idx + 1);
    }
}
U_NAMESPACE_END

View file

@ -0,0 +1,199 @@
// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// file: rbbi_cache.h
//
#ifndef RBBI_CACHE_H
#define RBBI_CACHE_H
#include "unicode/utypes.h"
#include "unicode/rbbi.h"
#include "unicode/uobject.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
* Dictionary boundaries are moved first to this cache, then from here
* to the main BreakCache, where they may inter-leave with non-dictionary
* boundaries. The public BreakIterator API always fetches directly
* from the main BreakCache, not from here.
*
* In common situations, the number of boundaries in a single dictionary run
* should be quite small, it will be terminated by punctuation, spaces,
* or any other non-dictionary characters. The main BreakCache may end
* up with boundaries from multiple dictionary based runs.
*
* The boundaries are stored in a simple ArrayList (vector), with the
* assumption that they will be accessed sequentially.
*/
// Cache of the boundaries found by dictionary based break engines within one
// range of text. Owned by, and holding a pointer back to, a RuleBasedBreakIterator.
class RuleBasedBreakIterator::DictionaryCache: public UMemory {
public:
// Creates an empty cache for the break iterator bi; allocates the boundary vector.
DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);
// Deletes the boundary vector.
~DictionaryCache();
// Removes all boundaries and clears the range, status indexes and iteration position.
void reset();
// Get the boundary following / preceding fromPos, if it is in this cache.
// On success returns TRUE and sets *pos and *statusIndex; otherwise returns FALSE.
UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);
/**
* Populate the cache with the dictionary based boundaries within a region of text.
* @param startPos The start position of a range of text
* @param endPos The end position of a range of text
* @param firstRuleStatus The rule status index that applies to the break at startPos
* @param otherRuleStatus The rule status index that applies to boundaries other than startPos
* @internal
*/
void populateDictionary(int32_t startPos, int32_t endPos,
int32_t firstRuleStatus, int32_t otherRuleStatus);
RuleBasedBreakIterator *fBI; // The break iterator this cache belongs to. Not owned.
UVector32 *fBreaks; // A vector containing the boundaries. Owned; deleted by the destructor.
int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following()
// or preceding(). Optimizes sequential access.
// -1 when there is no such position.
int32_t fStart; // Text position of first boundary in cache.
int32_t fLimit; // Last boundary in cache. Which is the limit of the
// text segment being handled by the dictionary.
int32_t fFirstRuleStatusIndex; // Rule status info for first boundary.
int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
};
/*
* class BreakCache
*
* Cache of break boundary positions and rule status values.
* Break iterator API functions, next(), previous(), etc., will use cached results
* when possible, and otherwise cache new results as they are obtained.
*
* Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
*
* The cache is implemented as a single circular buffer.
*/
/*
* size of the circular cache buffer.
*/
// Circular-buffer cache of boundary positions and their rule status indexes.
// Owned by, and holding a pointer back to, a RuleBasedBreakIterator, whose
// fPosition, fRuleStatusIndex and fDone fields it updates.
class RuleBasedBreakIterator::BreakCache: public UMemory {
public:
// Creates a cache for the break iterator bi, initially holding only boundary 0.
BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);
virtual ~BreakCache();
// Discard the cache contents and restart with the single boundary pos,
// which has the rule status index ruleStatus.
void reset(int32_t pos = 0, int32_t ruleStatus = 0);
// Advance to the next boundary. The inline part handles the case where the
// boundary is already cached; otherwise nextOL() extends the cache.
void next() { if (fBufIdx == fEndBufIdx) {
nextOL();
} else {
fBufIdx = modChunkSize(fBufIdx + 1);
fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
}
};
// Out-of-line part of next(), for when the cache does not hold the next boundary.
void nextOL();
// Move to the previous boundary, extending the cache backwards if needed.
void previous(UErrorCode &status);
// Move the iteration state to the position following the startPosition.
// Input position must be pinned to the input length.
void following(int32_t startPosition, UErrorCode &status);
// Move the iteration state to the position preceding the startPosition.
void preceding(int32_t startPosition, UErrorCode &status);
/*
* Update the state of the public BreakIterator (fBI) to reflect the
* current state of the break iterator cache (this).
*/
int32_t current();
/**
* Add boundaries to the cache near the specified position.
* The given position need not be a boundary itself.
* The input position must be within the range of the text, and
* on a code point boundary.
* If the requested position is a break boundary, leave the iteration
* position on it.
* If the requested position is not a boundary, leave the iteration
* position on the preceding boundary and include both the
* preceding and following boundaries in the cache.
* Additional boundaries, either preceding or following, may be added
* to the cache as a side effect.
*
* Return FALSE if the operation failed.
*/
UBool populateNear(int32_t position, UErrorCode &status);
/**
* Add boundary(s) to the cache following the current last boundary.
* Return FALSE if at the end of the text, and no more boundaries can be added.
* Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
*/
UBool populateFollowing();
/**
* Add one or more boundaries to the cache preceding the first currently cached boundary.
* Leave the iteration position on the first added boundary.
* Return false if no boundaries could be added (if at the start of the text.)
*/
UBool populatePreceding(UErrorCode &status);
// Whether adding a boundary also moves the current cache position to it.
enum UpdatePositionValues {
RetainCachePosition = 0,
UpdateCachePosition = 1
};
/*
* Add the boundary following the current position.
* The current position can be left as it was, or changed to the newly added boundary,
* as specified by the update parameter.
*/
void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
/*
* Add the boundary preceding the current position.
* The current position can be left as it was, or changed to the newly added boundary,
* as specified by the update parameter.
*/
bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);
/**
* Set the cache position to the specified position, or, if the position
* falls between two cached boundaries, to the preceding boundary.
* Fails if the requested position is outside of the range of boundaries currently held by the cache.
* The startPosition must be on a code point boundary.
*
* Return TRUE if successful, FALSE if the specified position is after
* the last cached boundary or before the first.
*/
UBool seek(int32_t startPosition);
// Debugging aid: print the cache contents.
void dumpCache();
private:
// Wrap a buffer index into the range [0, CACHE_SIZE). Relies on CACHE_SIZE being a power of two.
static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
// Number of slots in the circular buffer.
static constexpr int32_t CACHE_SIZE = 128;
static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
RuleBasedBreakIterator *fBI; // The break iterator this cache belongs to.
int32_t fStartBufIdx; // Buffer index of the first cached boundary.
int32_t fEndBufIdx; // inclusive
int32_t fTextIdx; // Text position of the current boundary, fBoundaries[fBufIdx].
int32_t fBufIdx; // Buffer index of the current boundary.
int32_t fBoundaries[CACHE_SIZE]; // Cached boundary positions.
uint16_t fStatuses[CACHE_SIZE]; // Rule status index for each cached boundary.
UVector32 fSideBuffer; // Scratch storage used by populatePreceding().
};
U_NAMESPACE_END
#endif // RBBI_CACHE_H

View file

@ -14,7 +14,7 @@
#include "unicode/utypes.h"
#include "rbbidata.h"
#include "rbbirb.h"
#include "utrie.h"
#include "utrie2.h"
#include "udatamem.h"
#include "cmemory.h"
#include "cstring.h"
@ -83,11 +83,11 @@ void RBBIDataWrapper::init0() {
fReverseTable = NULL;
fSafeFwdTable = NULL;
fSafeRevTable = NULL;
fRuleSource = NULL;
fRuleSource = NULL;
fRuleStatusTable = NULL;
fTrie = NULL;
fUDataMem = NULL;
fRefCount = 0;
fTrie = NULL;
fUDataMem = NULL;
fRefCount = 0;
fDontFreeData = TRUE;
}
@ -118,6 +118,14 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
}
// Rule Compatibility Hacks
// If a rule set includes reverse rules but does not explicitly include safe reverse rules,
// the reverse rules are to be treated as safe reverse rules.
if (fSafeRevTable == NULL && fReverseTable != NULL) {
fSafeRevTable = fReverseTable;
fReverseTable = NULL;
}
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(uint8_t *)data + fHeader->fTrie,

View file

@ -184,11 +184,11 @@ public:
/* number of int32_t values in the rule status table. Used to sanity check indexing */
int32_t fStatusMaxIdx;
UTrie2 *fTrie;
UTrie2 *fTrie;
private:
u_atomic_int32_t fRefCount;
UDataMemory *fUDataMem;
UDataMemory *fUDataMem;
UnicodeString fRuleString;
UBool fDontFreeData;

View file

@ -24,16 +24,16 @@
#include "unicode/uchriter.h"
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "cmemory.h"
#include "cstring.h"
#include "rbbirb.h"
#include "rbbinode.h"
#include "rbbiscan.h"
#include "rbbisetb.h"
#include "rbbitblb.h"
#include "rbbidata.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
@ -164,8 +164,13 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
+ safeFwdTableSize + safeRevTableSize
(void)safeFwdTableSize;
int32_t totalSize = headerSize
+ forwardTableSize
+ /* reverseTableSize */ 0
+ /* safeFwdTableSize */ 0
+ (safeRevTableSize ? safeRevTableSize : reverseTableSize)
+ statusTableSize + trieSize + rulesSize;
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
@ -184,16 +189,38 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fLength = totalSize;
data->fCatCount = fSetBuilder->getNumCharCategories();
// Only save the forward table and the safe reverse table,
// because these are the only ones used at run-time.
//
// For the moment, we still build the other tables if they are present in the rule source files,
// for backwards compatibility. Old rule files need to work, and this is the simplest approach.
//
// Additional backwards compatibility consideration: if no safe rules are provided, consider the
// reverse rules to actually be the safe reverse rules.
data->fFTable = headerSize;
data->fFTableLen = forwardTableSize;
data->fRTable = data->fFTable + forwardTableSize;
data->fRTableLen = reverseTableSize;
data->fSFTable = data->fRTable + reverseTableSize;
data->fSFTableLen = safeFwdTableSize;
data->fSRTable = data->fSFTable + safeFwdTableSize;
data->fSRTableLen = safeRevTableSize;
data->fTrie = data->fSRTable + safeRevTableSize;
// Do not save Reverse Table.
data->fRTable = data->fFTable + forwardTableSize;
data->fRTableLen = 0;
// Do not save the Safe Forward table.
data->fSFTable = data->fRTable + 0;
data->fSFTableLen = 0;
data->fSRTable = data->fSFTable + 0;
if (safeRevTableSize > 0) {
data->fSRTableLen = safeRevTableSize;
} else if (reverseTableSize > 0) {
data->fSRTableLen = reverseTableSize;
} else {
U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table
// before reaching this point.
}
data->fTrie = data->fSRTable + data->fSRTableLen;
data->fTrieLen = fSetBuilder->getTrieSize();
data->fStatusTable = data->fTrie + trieSize;
data->fStatusTableLen= statusTableSize;
@ -203,9 +230,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
fReverseTables->exportTable((uint8_t *)data + data->fRTable);
fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
// fReverseTables->exportTable((uint8_t *)data + data->fRTable);
// fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
if (safeRevTableSize > 0) {
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
} else {
fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
}
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);

View file

@ -15,6 +15,9 @@
#define RBBIRB_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
@ -207,6 +210,9 @@ struct RBBISetTableEl {
#endif
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif

View file

@ -47,6 +47,7 @@
//
//------------------------------------------------------------------------------
static const UChar gRuleSet_rule_char_pattern[] = {
// Characters that may appear as literals in patterns without escaping or quoting.
// [ ^ [ \ p { Z } \ u 0 0 2 0
0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
// - \ u 0 0 7 f ] - [ \ p
@ -558,6 +559,10 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
fRB->fDefaultTree = &fRB->fSafeRevTree;
} else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
fRB->fLookAheadHardBreak = TRUE;
} else if (opt == UNICODE_STRING("quoted_literals_only", 20)) {
fRuleSets[kRuleSet_rule_char-128].clear();
} else if (opt == UNICODE_STRING("unquoted_literals", 17)) {
fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus);
} else {
error(U_BRK_UNRECOGNIZED_OPTION);
}

View file

@ -250,12 +250,17 @@ void RBBISetBuilder::build() {
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number
//
fTrie = utrie2_open(0, // Initial value for all code points
0, // errorValue
fTrie = utrie2_open(0, // Initial value for all code points.
0, // Error value for out-of-range input.
fStatus);
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
utrie2_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar, rlRange->fNum, TRUE, fStatus);
for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
utrie2_setRange32(fTrie,
rlRange->fStartChar, // Range start
rlRange->fEndChar, // Range end (inclusive)
rlRange->fNum, // value for range
TRUE, // Overwrite previously written values
fStatus);
}
}
@ -265,7 +270,10 @@ void RBBISetBuilder::build() {
// getTrieSize() Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int32_t RBBISetBuilder::getTrieSize() /*const*/ {
int32_t RBBISetBuilder::getTrieSize() {
if (U_FAILURE(*fStatus)) {
return 0;
}
utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
fTrieSize = utrie2_serialize(fTrie,
NULL, // Buffer

View file

@ -13,6 +13,9 @@
#define RBBISETB_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uobject.h"
#include "rbbirb.h"
#include "utrie2.h"
@ -108,8 +111,8 @@ private:
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
uint32_t fTrieSize; // the Unicode Sets.
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
uint32_t fTrieSize; // the Unicode Sets.
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
@ -128,4 +131,7 @@ private:
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif

View file

@ -20,8 +20,11 @@
#include "unicode/utypes.h"
#include "unicode/brkiter.h"
#include "unicode/bytestream.h"
#include "unicode/casemap.h"
#include "unicode/edits.h"
#include "unicode/stringoptions.h"
#include "unicode/stringpiece.h"
#include "unicode/ubrk.h"
#include "unicode/uloc.h"
#include "unicode/ustring.h"
@ -32,6 +35,7 @@
#include "unicode/utf.h"
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "bytesinkutil.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
@ -39,27 +43,6 @@
#include "ucasemap_imp.h"
#include "ustr_imp.h"
U_NAMESPACE_BEGIN
namespace {
// TODO: share with UTF-16? inline in ucasemap_imp.h?
int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
Edits *edits, UErrorCode &errorCode) {
if (U_SUCCESS(errorCode)) {
if (destIndex > destCapacity) {
errorCode = U_BUFFER_OVERFLOW_ERROR;
} else if (edits != NULL) {
edits->copyErrorTo(errorCode);
}
}
return destIndex;
}
} // namespace
U_NAMESPACE_END
U_NAMESPACE_USE
/* UCaseMap service object -------------------------------------------------- */
@ -150,152 +133,39 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
/* TODO(markus): Move to a new, separate utf8case.cpp file. */
namespace {
/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
static inline int32_t
appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
int32_t result, const UChar *s,
int32_t cpLength, uint32_t options, icu::Edits *edits) {
UChar32 c;
int32_t length;
UErrorCode errorCode;
inline UBool
appendResult(int32_t cpLength, int32_t result, const UChar *s,
ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
U_ASSERT(U_SUCCESS(errorCode));
/* decode the result */
if(result<0) {
/* (not) original code point */
if(edits!=NULL) {
edits->addUnchanged(cpLength);
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
}
c=~result;
if(destIndex<destCapacity && c<=0x7f) { // ASCII slightly-fastpath
dest[destIndex++]=(uint8_t)c;
return destIndex;
if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
}
length=cpLength;
} else {
if(result<=UCASE_MAX_STRING_LENGTH) {
// string: "result" is the UTF-16 length
if(result==0) {
length=0;
} else {
errorCode=U_ZERO_ERROR;
if(destIndex<destCapacity) {
u_strToUTF8((char *)(dest+destIndex), destCapacity-destIndex, &length,
s, result, &errorCode);
} else {
u_strToUTF8(NULL, 0, &length, s, result, &errorCode);
}
if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
return -1;
}
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
}
if(edits!=NULL) {
edits->addReplace(cpLength, length);
}
// We might have an overflow, but we know the actual length.
return destIndex+length;
} else if(destIndex<destCapacity && result<=0x7f) { // ASCII slightly-fastpath
dest[destIndex++]=(uint8_t)result;
if(edits!=NULL) {
edits->addReplace(cpLength, 1);
}
return destIndex;
return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
} else {
c=result;
length=U8_LENGTH(c);
if(edits!=NULL) {
edits->addReplace(cpLength, length);
}
ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
}
}
// c>=0 single code point
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
if(destIndex<destCapacity) {
/* append the result */
UBool isError=FALSE;
U8_APPEND(dest, destIndex, destCapacity, c, isError);
if(isError) {
/* overflow, nothing written */
destIndex+=length;
}
} else {
/* preflight */
destIndex+=length;
}
return destIndex;
}
static inline int32_t
appendASCII(uint8_t *dest, int32_t destIndex, int32_t destCapacity, uint8_t c) {
if(destIndex<destCapacity) {
dest[destIndex]=c;
} else if(destIndex==INT32_MAX) {
return -1; // integer overflow
}
return destIndex+1;
return TRUE;
}
// See unicode/utf8.h U8_APPEND_UNSAFE().
static inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
static inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
// Lead byte of a two-byte UTF-8 sequence: the 0xc0 marker combined with the bits of c above the low six.
inline uint8_t getTwoByteLead(UChar32 c) {
    return static_cast<uint8_t>(0xc0 | (c >> 6));
}
// Trail byte of a two-byte UTF-8 sequence: the 0x80 marker combined with the low six bits of c.
inline uint8_t getTwoByteTrail(UChar32 c) {
    return static_cast<uint8_t>(0x80 | (c & 0x3f));
}
static inline int32_t
appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar32 c) {
U_ASSERT(0x370 <= c && c <= 0x3ff); // 2-byte UTF-8, main Greek block
if(2>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
int32_t limit=destIndex+2;
if(limit<=destCapacity) {
dest+=destIndex;
dest[0]=getTwoByteLead(c);
dest[1]=getTwoByteTrail(c);
}
return limit;
}
static inline int32_t
appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, const char *s) {
if(2>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
int32_t limit=destIndex+2;
if(limit<=destCapacity) {
dest+=destIndex;
dest[0]=(uint8_t)s[0];
dest[1]=(uint8_t)s[1];
}
return limit;
}
static inline int32_t
appendUnchanged(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
const uint8_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
if(length>0) {
if(edits!=NULL) {
edits->addUnchanged(length);
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
}
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
if((destIndex+length)<=destCapacity) {
uprv_memcpy(dest+destIndex, s, length);
}
destIndex+=length;
}
return destIndex;
}
} // namespace
static UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
@ -333,17 +203,15 @@ utf8_caseContextIterator(void *context, int8_t dir) {
* Case-maps [srcStart..srcLimit[ but takes
* context [0..srcLength[ into account.
*/
static int32_t
static void
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, UCaseContext *csc,
int32_t srcStart, int32_t srcLimit,
icu::Edits *edits,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
/* case mapping loop */
int32_t srcIndex=srcStart;
int32_t destIndex=0;
while(srcIndex<srcLimit) {
while (U_SUCCESS(errorCode) && srcIndex<srcLimit) {
int32_t cpStart;
csc->cpStart=cpStart=srcIndex;
UChar32 c;
@ -351,45 +219,32 @@ _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
csc->cpLimit=srcIndex;
if(c<0) {
// Malformed UTF-8.
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+cpStart, srcIndex-cpStart, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
continue;
}
const UChar *s;
c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
srcIndex - cpStart, options, edits);
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
sink, options, edits, errorCode);
} else {
const UChar *s;
c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
}
}
return destIndex;
}
#if !UCONFIG_NO_BREAK_ITERATION
U_CFUNC int32_t U_CALLCONV
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
int32_t caseLocale, uint32_t options, BreakIterator *iter,
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
return 0;
return;
}
/* set up local variables */
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
int32_t destIndex=0;
int32_t prev=0;
UBool isFirstIndex=TRUE;
@ -434,11 +289,9 @@ ucasemap_internalUTF8ToTitle(
U8_NEXT(src, titleLimit, index, c);
}
if (prev < titleStart) {
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+prev, titleStart-prev, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
sink, options, edits, errorCode)) {
return;
}
}
}
@ -450,16 +303,15 @@ ucasemap_internalUTF8ToTitle(
csc.cpLimit=titleLimit;
const UChar *s;
c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
destIndex=appendResult(dest, destIndex, destCapacity, c, s,
titleLimit-titleStart, options, edits);
if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
return;
}
} else {
// Malformed UTF-8.
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+titleStart, titleLimit-titleStart, options, edits);
}
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
sink, options, edits, errorCode)) {
return;
}
}
/* Special case Dutch IJ titlecasing */
@ -467,22 +319,13 @@ ucasemap_internalUTF8ToTitle(
caseLocale == UCASE_LOC_DUTCH &&
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
if (src[titleStart+1] == 0x006A) {
destIndex=appendASCII(dest, destIndex, destCapacity, 0x004A);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
if(edits!=NULL) {
edits->addReplace(1, 1);
}
ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
titleLimit++;
} else if (src[titleStart+1] == 0x004A) {
// Keep the capital J from getting lowercased.
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+titleStart+1, 1, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
sink, options, edits, errorCode)) {
return;
}
titleLimit++;
}
@ -492,26 +335,18 @@ ucasemap_internalUTF8ToTitle(
if(titleLimit<index) {
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
/* Normal operation: Lowercase the rest of the word. */
destIndex+=
_caseMap(
caseLocale, options, ucase_toFullLower,
dest+destIndex, destCapacity-destIndex,
src, &csc,
titleLimit, index,
edits, errorCode);
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
errorCode=U_ZERO_ERROR;
}
_caseMap(caseLocale, options, ucase_toFullLower,
src, &csc,
titleLimit, index,
sink, edits, errorCode);
if(U_FAILURE(errorCode)) {
return destIndex;
return;
}
} else {
/* Optionally just copy the rest of the word unchanged. */
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+titleLimit, index-titleLimit, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
sink, options, edits, errorCode)) {
return;
}
}
}
@ -520,8 +355,6 @@ ucasemap_internalUTF8ToTitle(
prev=index;
}
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
}
#endif
@ -546,12 +379,10 @@ UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
}
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
int32_t toUpper(uint32_t options,
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
Edits *edits,
UErrorCode &errorCode) {
int32_t destIndex=0;
void toUpper(uint32_t options,
const uint8_t *src, int32_t srcLength,
ByteSink &sink, Edits *edits,
UErrorCode &errorCode) {
uint32_t state = 0;
for (int32_t i = 0; i < srcLength;) {
int32_t nextIndex = i;
@ -627,8 +458,10 @@ int32_t toUpper(uint32_t options,
}
}
UBool change = TRUE;
if (edits != NULL) {
UBool change;
if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
change = TRUE; // common, simple usage
} else {
// Find out first whether we are changing the text.
U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block
change = (i + 2) > nextIndex ||
@ -664,143 +497,141 @@ int32_t toUpper(uint32_t options,
}
if (change) {
destIndex=appendTwoBytes(dest, destIndex, destCapacity, upper);
if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0308"); // restore or add a dialytika
ByteSinkUtil::appendTwoBytes(upper, sink);
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
sink.Append(u8"\u0308", 2); // restore or add a dialytika
}
if (destIndex >= 0 && addTonos) {
destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0301");
if (addTonos) {
sink.Append(u8"\u0301", 2);
}
while (destIndex >= 0 && numYpogegrammeni > 0) {
destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0399");
while (numYpogegrammeni > 0) {
sink.Append(u8"\u0399", 2);
--numYpogegrammeni;
}
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
}
} else if(c>=0) {
const UChar *s;
c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
nextIndex - i, options, edits);
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
return;
}
} else {
// Malformed UTF-8.
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+i, nextIndex-i, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
sink, options, edits, errorCode)) {
return;
}
}
i = nextIndex;
state = nextState;
}
return destIndex;
}
} // namespace GreekUpper
U_NAMESPACE_END
static int32_t U_CALLCONV
static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
int32_t destIndex = _caseMap(
_caseMap(
caseLocale, options, ucase_toFullLower,
dest, destCapacity,
src, &csc, 0, srcLength,
edits, errorCode);
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
sink, edits, errorCode);
}
static int32_t U_CALLCONV
static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
int32_t destIndex;
if (caseLocale == UCASE_LOC_GREEK) {
destIndex = GreekUpper::toUpper(options, dest, destCapacity,
src, srcLength, edits, errorCode);
GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
} else {
UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src;
csc.limit=srcLength;
destIndex = _caseMap(
_caseMap(
caseLocale, options, ucase_toFullUpper,
dest, destCapacity,
src, &csc, 0, srcLength,
edits, errorCode);
sink, edits, errorCode);
}
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
}
static int32_t U_CALLCONV
static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
/* case mapping loop */
int32_t srcIndex = 0;
int32_t destIndex = 0;
while (srcIndex < srcLength) {
while (U_SUCCESS(errorCode) && srcIndex < srcLength) {
int32_t cpStart = srcIndex;
UChar32 c;
U8_NEXT(src, srcIndex, srcLength, c);
if(c<0) {
// Malformed UTF-8.
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+cpStart, srcIndex-cpStart, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
continue;
}
const UChar *s;
c = ucase_toFullFolding(c, &s, options);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
srcIndex - cpStart, options, edits);
if (destIndex < 0) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
sink, options, edits, errorCode);
} else {
const UChar *s;
c = ucase_toFullFolding(c, &s, options);
appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
}
}
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
}
U_CFUNC int32_t
void
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
const char *src, int32_t srcLength,
UTF8CaseMapper *stringCaseMapper,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode) {
/* check argument values */
if (U_FAILURE(errorCode)) {
return;
}
if ((src == nullptr && srcLength != 0) || srcLength < -1) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Get the string length.
if (srcLength == -1) {
srcLength = (int32_t)uprv_strlen((const char *)src);
}
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
(const uint8_t *)src, srcLength, sink, edits, errorCode);
sink.Flush();
if (U_SUCCESS(errorCode)) {
if (edits != nullptr) {
edits->copyErrorTo(errorCode);
}
}
}
int32_t
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UTF8CaseMapper *stringCaseMapper,
icu::Edits *edits,
UErrorCode &errorCode) {
int32_t destLength;
/* check argument values */
if(U_FAILURE(errorCode)) {
return 0;
}
if( destCapacity<0 ||
(dest==NULL && destCapacity>0) ||
src==NULL ||
srcLength<-1
(src==NULL && srcLength!=0) || srcLength<-1
) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
@ -820,12 +651,21 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P
return 0;
}
CheckedArrayByteSink sink(dest, destCapacity);
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
dest, destCapacity, src, srcLength, edits, errorCode);
return u_terminateChars((char *)dest, destCapacity, destLength, &errorCode);
stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
(const uint8_t *)src, srcLength, sink, edits, errorCode);
sink.Flush();
if (U_SUCCESS(errorCode)) {
if (sink.Overflowed()) {
errorCode = U_BUFFER_OVERFLOW_ERROR;
} else if (edits != nullptr) {
edits->copyErrorTo(errorCode);
}
}
return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
}
/* public API functions */
@ -837,8 +677,8 @@ ucasemap_utf8ToLower(const UCaseMap *csm,
UErrorCode *pErrorCode) {
return ucasemap_mapUTF8(
csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
}
@ -849,8 +689,8 @@ ucasemap_utf8ToUpper(const UCaseMap *csm,
UErrorCode *pErrorCode) {
return ucasemap_mapUTF8(
csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
}
@ -861,13 +701,43 @@ ucasemap_utf8FoldCase(const UCaseMap *csm,
UErrorCode *pErrorCode) {
return ucasemap_mapUTF8(
UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8Fold, NULL, *pErrorCode);
}
U_NAMESPACE_BEGIN
void CaseMap::utf8ToLower(
const char *locale, uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode) {
ucasemap_mapUTF8(
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
src.data(), src.length(),
ucasemap_internalUTF8ToLower, sink, edits, errorCode);
}
void CaseMap::utf8ToUpper(
const char *locale, uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode) {
ucasemap_mapUTF8(
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
src.data(), src.length(),
ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
}
void CaseMap::utf8Fold(
uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode) {
ucasemap_mapUTF8(
UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
src.data(), src.length(),
ucasemap_internalUTF8Fold, sink, edits, errorCode);
}
int32_t CaseMap::utf8ToLower(
const char *locale, uint32_t options,
const char *src, int32_t srcLength,
@ -875,8 +745,8 @@ int32_t CaseMap::utf8ToLower(
UErrorCode &errorCode) {
return ucasemap_mapUTF8(
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8ToLower, edits, errorCode);
}
@ -887,8 +757,8 @@ int32_t CaseMap::utf8ToUpper(
UErrorCode &errorCode) {
return ucasemap_mapUTF8(
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8ToUpper, edits, errorCode);
}
@ -899,8 +769,8 @@ int32_t CaseMap::utf8Fold(
UErrorCode &errorCode) {
return ucasemap_mapUTF8(
UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8Fold, edits, errorCode);
}

View file

@ -73,6 +73,8 @@ uprv_haveProperties(UErrorCode *pErrorCode);
U_NAMESPACE_BEGIN
class ByteSink;
/** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */
inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return FALSE; }
@ -207,39 +209,43 @@ ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITE
* UTF-8 version of UStringCaseMapper.
* All error checking must be done.
* The UCaseMap must be fully initialized, with locale and/or iter set as needed.
* src and dest must not overlap.
*/
typedef int32_t U_CALLCONV
typedef void U_CALLCONV
UTF8CaseMapper(int32_t caseLocale, uint32_t options,
#if !UCONFIG_NO_BREAK_ITERATION
icu::BreakIterator *iter,
#endif
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/** Implements UTF8CaseMapper. */
U_CFUNC int32_t U_CALLCONV
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
icu::BreakIterator *iter,
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
icu::Edits *edits,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode);
#endif
void
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
const char *src, int32_t srcLength,
UTF8CaseMapper *stringCaseMapper,
icu::ByteSink &sink, icu::Edits *edits,
UErrorCode &errorCode);
/**
* Implements argument checking and buffer handling
* for UTF-8 string case mapping as a common function.
*/
U_CFUNC int32_t
int32_t
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
uint8_t *dest, int32_t destCapacity,
const uint8_t *src, int32_t srcLength,
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UTF8CaseMapper *stringCaseMapper,
icu::Edits *edits,
UErrorCode &errorCode);

View file

@ -31,6 +31,29 @@
U_NAMESPACE_BEGIN
// ByteSink overload: titlecases the UTF-8 text in src and appends the result to sink.
//
// The source bytes are wrapped in a stack-allocated UText so that the break iterator
// can be positioned on them. If the caller passes no iterator, one is obtained via
// ustrcase_getTitleBreakIterator() and owned by ownedIter for the duration of the call.
// Does nothing if errorCode already indicates a failure on entry.
void CaseMap::utf8ToTitle(
        const char *locale, uint32_t options, BreakIterator *iter,
        StringPiece src, ByteSink &sink, Edits *edits,
        UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }

    UText utext = UTEXT_INITIALIZER;
    utext_openUTF8(&utext, src.data(), src.length(), &errorCode);

    LocalPointer<BreakIterator> ownedIter;
    iter = ustrcase_getTitleBreakIterator(nullptr, locale, options, iter, ownedIter, errorCode);
    if (iter != nullptr) {
        // Only map when an iterator is available; a nullptr result skips straight to cleanup.
        iter->setText(&utext, errorCode);
        ucasemap_mapUTF8(
            ustrcase_getCaseLocale(locale), options, iter,
            src.data(), src.length(),
            ucasemap_internalUTF8ToTitle, sink, edits, errorCode);
    }

    // Single cleanup point for the UText, reached on both the success and the no-iterator path.
    utext_close(&utext);
}
int32_t CaseMap::utf8ToTitle(
const char *locale, uint32_t options, BreakIterator *iter,
const char *src, int32_t srcLength,
@ -50,8 +73,8 @@ int32_t CaseMap::utf8ToTitle(
iter->setText(&utext, errorCode);
int32_t length=ucasemap_mapUTF8(
ustrcase_getCaseLocale(locale), options, iter,
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8ToTitle, edits, errorCode);
utext_close(&utext);
return length;
@ -101,8 +124,8 @@ ucasemap_utf8ToTitle(UCaseMap *csm,
csm->iter->setText(&utext, *pErrorCode);
int32_t length=ucasemap_mapUTF8(
csm->caseLocale, csm->options, csm->iter,
(uint8_t *)dest, destCapacity,
(const uint8_t *)src, srcLength,
dest, destCapacity,
src, srcLength,
ucasemap_internalUTF8ToTitle, NULL, *pErrorCode);
utext_close(&utext);
return length;

View file

@ -1323,9 +1323,17 @@ _UTF16GetName(const UConverter *cnv) {
U_CDECL_END
extern const UConverterSharedData _UTF16Data;
#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
// True if the converter's shared data is the UTF-16BE data object.
static inline bool IS_UTF16BE(const UConverter *cnv) {
    return cnv->sharedData == &_UTF16BEData;
}
// True if the converter's shared data is the UTF-16LE data object.
static inline bool IS_UTF16LE(const UConverter *cnv) {
    return cnv->sharedData == &_UTF16LEData;
}
static inline bool IS_UTF16(const UConverter *cnv) {
return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
}
U_CDECL_BEGIN
static void U_CALLCONV

View file

@ -31,6 +31,7 @@
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "cmemory.h"
#include "ustr_imp.h"
/* Prototypes --------------------------------------------------------------- */
@ -44,51 +45,13 @@ U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args
/* UTF-8 -------------------------------------------------------------------- */
/* UTF-8 Conversion DATA
* for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
*/
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
#define MAXIMUM_UCS2 0x0000FFFF
#define MAXIMUM_UTF 0x0010FFFF
#define MAXIMUM_UCS4 0x7FFFFFFF
#define HALF_SHIFT 10
#define HALF_BASE 0x0010000
#define HALF_MASK 0x3FF
#define SURROGATE_HIGH_START 0xD800
#define SURROGATE_HIGH_END 0xDBFF
#define SURROGATE_LOW_START 0xDC00
#define SURROGATE_LOW_END 0xDFFF
/* -SURROGATE_LOW_START + HALF_BASE */
#define SURROGATE_LOW_BASE 9216
static const uint32_t offsetsFromUTF8[7] = {0,
static const uint32_t offsetsFromUTF8[5] = {0,
(uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
(uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
(uint32_t) 0x03C82080
};
/* END OF UTF-8 Conversion DATA */
static const int8_t bytesFromUTF8[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
/*
* Starting with Unicode 3.0.1:
* UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
* byte sequences with more than 4 bytes are illegal in UTF-8,
* which is tested with impossible values for them
*/
static const uint32_t
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
static UBool hasCESU8Data(const UConverter *cnv)
{
#if UCONFIG_ONLY_HTML_CONVERSION
@ -127,7 +90,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
while (mySource < sourceLimit && myTarget < targetLimit)
{
ch = *(mySource++);
if (ch < 0x80) /* Simple case */
if (U8_IS_SINGLE(ch)) /* Simple case */
{
*(myTarget++) = (UChar) ch;
}
@ -135,7 +98,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
{
/* store the first char */
toUBytes[0] = (char)ch;
inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
i = 1;
morebytes:
@ -144,7 +107,8 @@ morebytes:
if (mySource < sourceLimit)
{
toUBytes[i] = (char) (ch2 = *mySource);
if (!U8_IS_TRAIL(ch2))
if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
{
break; /* i < inBytes */
}
@ -162,24 +126,12 @@ morebytes:
}
}
/* Remove the accumulated high bits */
ch -= offsetsFromUTF8[inBytes];
/*
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
* - use only trail bytes after a lead byte (checked above)
* - use the right number of trail bytes for a given lead byte
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
*/
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
if (i == inBytes && (!isCESU8 || i <= 3))
{
/* Remove the accumulated high bits */
ch -= offsetsFromUTF8[inBytes];
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= MAXIMUM_UCS2)
{
@ -189,9 +141,8 @@ morebytes:
else
{
/* write out the surrogates */
ch -= HALF_BASE;
*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
*(myTarget++) = U16_LEAD(ch);
ch = U16_TRAIL(ch);
if (myTarget < targetLimit)
{
*(myTarget++) = (UChar)ch;
@ -256,7 +207,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
while (mySource < sourceLimit && myTarget < targetLimit)
{
ch = *(mySource++);
if (ch < 0x80) /* Simple case */
if (U8_IS_SINGLE(ch)) /* Simple case */
{
*(myTarget++) = (UChar) ch;
*(myOffsets++) = offsetNum++;
@ -264,7 +215,7 @@ static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeAr
else
{
toUBytes[0] = (char)ch;
inBytes = bytesFromUTF8[ch];
inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
i = 1;
morebytes:
@ -273,7 +224,8 @@ morebytes:
if (mySource < sourceLimit)
{
toUBytes[i] = (char) (ch2 = *mySource);
if (!U8_IS_TRAIL(ch2))
if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
!(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
{
break; /* i < inBytes */
}
@ -290,24 +242,12 @@ morebytes:
}
}
/* Remove the accumulated high bits */
ch -= offsetsFromUTF8[inBytes];
/*
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
* - use only trail bytes after a lead byte (checked above)
* - use the right number of trail bytes for a given lead byte
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
*/
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
(isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
// In CESU-8, only surrogates, not supplementary code points, are encoded directly.
if (i == inBytes && (!isCESU8 || i <= 3))
{
/* Remove the accumulated high bits */
ch -= offsetsFromUTF8[inBytes];
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
if (ch <= MAXIMUM_UCS2)
{
@ -318,10 +258,9 @@ morebytes:
else
{
/* write out the surrogates */
ch -= HALF_BASE;
*(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
*(myTarget++) = U16_LEAD(ch);
*(myOffsets++) = offsetNum;
ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
ch = U16_TRAIL(ch);
if (myTarget < targetLimit)
{
*(myTarget++) = (UChar)ch;
@ -616,10 +555,9 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
UConverter *cnv;
const uint8_t *sourceInitial;
const uint8_t *source;
uint16_t extraBytesToWrite;
uint8_t myByte;
UChar32 ch;
int8_t i, isLegalSequence;
int8_t i;
/* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
@ -633,14 +571,14 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
}
myByte = (uint8_t)*(source++);
if (myByte < 0x80)
if (U8_IS_SINGLE(myByte))
{
args->source = (const char *)source;
return (UChar32)myByte;
}
extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
if (extraBytesToWrite == 0) {
uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
if (countTrailBytes == 0) {
cnv->toUBytes[0] = myByte;
cnv->toULength = 1;
*err = U_ILLEGAL_CHAR_FOUND;
@ -649,15 +587,17 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
}
/*The byte sequence is longer than the buffer area passed*/
if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
if (((const char *)source + countTrailBytes) > args->sourceLimit)
{
/* check if all of the remaining bytes are trail bytes */
uint16_t extraBytesToWrite = countTrailBytes + 1;
cnv->toUBytes[0] = myByte;
i = 1;
*err = U_TRUNCATED_CHAR_FOUND;
while(source < (const uint8_t *)args->sourceLimit) {
if(U8_IS_TRAIL(myByte = *source)) {
cnv->toUBytes[i++] = myByte;
uint8_t b = *source;
if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
cnv->toUBytes[i++] = b;
++source;
} else {
/* error even before we run out of input */
@ -670,81 +610,28 @@ static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
return 0xffff;
}
isLegalSequence = 1;
ch = myByte << 6;
switch(extraBytesToWrite)
{
/* note: code falls through cases! (sic)*/
case 6:
ch += (myByte = *source);
ch <<= 6;
if (!U8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
if(countTrailBytes == 2) {
uint8_t t1 = *source, t2;
if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
args->source = (const char *)(source + 1);
return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
}
++source;
U_FALLTHROUGH;
case 5:
ch += (myByte = *source);
ch <<= 6;
if (!U8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
} else if(countTrailBytes == 1) {
uint8_t t1 = *source;
if(U8_IS_TRAIL(t1)) {
args->source = (const char *)(source + 1);
return (ch + t1) - offsetsFromUTF8[2];
}
++source;
U_FALLTHROUGH;
case 4:
ch += (myByte = *source);
ch <<= 6;
if (!U8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
} else { // countTrailBytes == 3
uint8_t t1 = *source, t2, t3;
if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
U8_IS_TRAIL(t3 = *++source)) {
args->source = (const char *)(source + 1);
return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
}
++source;
U_FALLTHROUGH;
case 3:
ch += (myByte = *source);
ch <<= 6;
if (!U8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
++source;
U_FALLTHROUGH;
case 2:
ch += (myByte = *source);
if (!U8_IS_TRAIL(myByte))
{
isLegalSequence = 0;
break;
}
++source;
};
ch -= offsetsFromUTF8[extraBytesToWrite];
args->source = (const char *)source;
/*
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
* - use only trail bytes after a lead byte (checked above)
* - use the right number of trail bytes for a given lead byte
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
*/
if (isLegalSequence &&
(uint32_t)ch <= MAXIMUM_UTF &&
(uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
!U_IS_SURROGATE(ch)
) {
return ch; /* return the code point */
}
args->source = (const char *)source;
for(i = 0; sourceInitial < source; ++i) {
cnv->toUBytes[i] = *sourceInitial++;
@ -757,14 +644,6 @@ U_CDECL_END
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
static const UChar32
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
static const UChar32
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
U_CDECL_BEGIN
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
static void U_CALLCONV
@ -812,39 +691,35 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
*pErrorCode=U_USING_DEFAULT_WARNING;
return;
} else {
/*
* Use a single counter for source and target, counting the minimum of
* the source length and the target capacity.
* As a result, the source length is checked only once per multi-byte
* character instead of twice.
*
* Make sure that the last byte sequence is complete, or else
* stop just before it.
* (The longest legal byte sequence has 3 trail bytes.)
* Count oldToULength (number of source bytes from a previous buffer)
* into the source length but reduce the source index by toULimit
* while going back over trail bytes in order to not go back into
* the bytes that will be read for finishing a partial
* sequence from the previous buffer.
* Let the standard converter handle edge cases.
*/
int32_t i;
// Use a single counter for source and target, counting the minimum of
// the source length and the target capacity.
// Let the standard converter handle edge cases.
if(count>targetCapacity) {
count=targetCapacity;
}
i=0;
while(i<3 && i<(count-toULimit)) {
b=source[count-oldToULength-i-1];
if(U8_IS_TRAIL(b)) {
++i;
} else {
if(i<U8_COUNT_TRAIL_BYTES(b)) {
/* stop converting before the lead byte if there are not enough trail bytes for it */
count-=i+1;
// The conversion loop checks count>0 only once per 1/2/3-byte character.
// If the buffer ends with a truncated 2- or 3-byte sequence,
// then we reduce the count to stop before that,
// and collect the remaining bytes after the conversion loop.
{
// Do not go back into the bytes that will be read for finishing a partial
// sequence from the previous buffer.
int32_t length=count-toULimit;
if(length>0) {
uint8_t b1=*(sourceLimit-1);
if(U8_IS_SINGLE(b1)) {
// common ASCII character
} else if(U8_IS_TRAIL(b1) && length>=2) {
uint8_t b2=*(sourceLimit-2);
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
// truncated 3-byte sequence
count-=2;
}
} else if(0xc2<=b1 && b1<0xf0) {
// truncated 2- or 3-byte sequence
--count;
}
break;
}
}
}
@ -859,17 +734,17 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
/* conversion loop */
while(count>0) {
b=*source++;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
/* convert ASCII */
*target++=b;
--count;
continue;
} else {
if(b>0xe0) {
if( /* handle U+1000..U+D7FF inline */
(t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
(b==0xed && (t1 <= 0x9f))) &&
(t2=source[1]) >= 0x80 && t2 <= 0xbf
if(b>=0xe0) {
if( /* handle U+0800..U+FFFF inline */
b<0xf0 &&
U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
U8_IS_TRAIL(t2=source[1])
) {
source+=2;
*target++=b;
@ -878,10 +753,10 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
count-=3;
continue;
}
} else if(b<0xe0) {
} else {
if( /* handle U+0080..U+07FF inline */
b>=0xc2 &&
(t1=*source) >= 0x80 && t1 <= 0xbf
U8_IS_TRAIL(t1=*source)
) {
++source;
*target++=b;
@ -889,30 +764,18 @@ ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
count-=2;
continue;
}
} else if(b==0xe0) {
if( /* handle U+0800..U+0FFF inline */
(t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
(t2=source[1]) >= 0x80 && t2 <= 0xbf
) {
source+=2;
*target++=b;
*target++=t1;
*target++=t2;
count-=3;
continue;
}
}
/* handle "complicated" and error cases, and continuing partial characters */
oldToULength=0;
toULength=1;
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
c=b;
moreBytes:
while(toULength<toULimit) {
if(source<sourceLimit) {
b=*source;
if(U8_IS_TRAIL(b)) {
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
++source;
++toULength;
c=(c<<6)+b;
@ -934,18 +797,7 @@ moreBytes:
}
}
if( toULength==toULimit && /* consumed all trail bytes */
(toULength==3 || toULength==2) && /* BMP */
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
) {
/* legal byte sequence for BMP code point */
} else if(
toULength==toULimit && toULength==4 &&
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
) {
/* legal byte sequence for supplementary code point */
} else {
if(toULength!=toULimit) {
/* error handling: illegal UTF-8 byte sequence */
source-=(toULength-oldToULength);
while(oldToULength<toULength) {
@ -979,7 +831,7 @@ moreBytes:
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
} else {
b=*source;
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
toULimit=U8_COUNT_BYTES(b);
if(toULimit>(sourceLimit-source)) {
/* collect a truncated byte sequence */
toULength=0;

View file

@ -23,6 +23,7 @@
#include "unicode/utf8.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ustr_imp.h"
/* control optimizations according to the platform */
#define LATIN1_UNROLL_FROM_UNICODE 1
@ -374,7 +375,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
while(source<sourceLimit) {
if(targetCapacity>0) {
b=*source++;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
/* convert ASCII */
*target++=(uint8_t)b;
--targetCapacity;
@ -409,7 +410,7 @@ ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
utf8->toULength=1;
utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
utf8->mode=U8_COUNT_BYTES(b);
}
/* write back the updated pointers */

View file

@ -59,6 +59,7 @@
#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"
#include "ustr_imp.h"
/* control optimizations according to the platform */
#define MBCS_UNROLL_SINGLE_TO_BMP 1
@ -5011,13 +5012,9 @@ ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
/* MBCS-from-UTF-8 conversion functions ------------------------------------- */
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
static const UChar32
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
static const UChar32
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
utf8_offsets[5]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
static void U_CALLCONV
ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
@ -5075,28 +5072,27 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
toULength=oldToULength=toULimit=0;
}
/*
* Make sure that the last byte sequence before sourceLimit is complete
* or runs into a lead byte.
* Do not go back into the bytes that will be read for finishing a partial
* sequence from the previous buffer.
* In the conversion loop compare source with sourceLimit only once
* per multi-byte character.
*/
// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
// If the buffer ends with a truncated 2- or 3-byte sequence,
// then we reduce the sourceLimit to before that,
// and collect the remaining bytes after the conversion loop.
{
int32_t i, length;
length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
for(i=0; i<3 && i<length;) {
b=*(sourceLimit-i-1);
if(U8_IS_TRAIL(b)) {
++i;
} else {
if(i<U8_COUNT_TRAIL_BYTES(b)) {
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
sourceLimit-=i+1;
// Do not go back into the bytes that will be read for finishing a partial
// sequence from the previous buffer.
int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
if(length>0) {
uint8_t b1=*(sourceLimit-1);
if(U8_IS_SINGLE(b1)) {
// common ASCII character
} else if(U8_IS_TRAIL(b1) && length>=2) {
uint8_t b2=*(sourceLimit-2);
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
// truncated 3-byte sequence
sourceLimit-=2;
}
break;
} else if(0xc2<=b1 && b1<0xf0) {
// truncated 2- or 3-byte sequence
--sourceLimit;
}
}
}
@ -5130,7 +5126,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
while(source<sourceLimit) {
if(targetCapacity>0) {
b=*source++;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
/* convert ASCII */
if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
*target++=(uint8_t)b;
@ -5185,7 +5181,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
/* handle "complicated" and error cases, and continuing partial characters */
oldToULength=0;
toULength=1;
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
c=b;
moreBytes:
while(toULength<toULimit) {
@ -5198,7 +5194,7 @@ moreBytes:
*/
if(source<(uint8_t *)pToUArgs->sourceLimit) {
b=*source;
if(U8_IS_TRAIL(b)) {
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
++source;
++toULength;
c=(c<<6)+b;
@ -5220,22 +5216,18 @@ moreBytes:
}
}
if( toULength==toULimit && /* consumed all trail bytes */
(toULength==3 || toULength==2) && /* BMP */
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
) {
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
} else if(
toULength==toULimit && toULength==4 &&
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
) {
/* supplementary code point */
if(!hasSupplementary) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
value=0;
} else {
if(toULength==toULimit) {
c-=utf8_offsets[toULength];
if(toULength<=3) { /* BMP */
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
} else {
/* supplementary code point */
if(!hasSupplementary) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
value=0;
} else {
value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
}
}
} else {
/* error handling: illegal UTF-8 byte sequence */
@ -5310,7 +5302,7 @@ moreBytes:
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
toULimit=U8_COUNT_BYTES(b);
while(source<sourceLimit) {
utf8->toUBytes[toULength++]=b=*source++;
c=(c<<6)+b;
@ -5375,28 +5367,27 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
toULength=oldToULength=toULimit=0;
}
/*
* Make sure that the last byte sequence before sourceLimit is complete
* or runs into a lead byte.
* Do not go back into the bytes that will be read for finishing a partial
* sequence from the previous buffer.
* In the conversion loop compare source with sourceLimit only once
* per multi-byte character.
*/
// The conversion loop checks source<sourceLimit only once per 1/2/3-byte character.
// If the buffer ends with a truncated 2- or 3-byte sequence,
// then we reduce the sourceLimit to before that,
// and collect the remaining bytes after the conversion loop.
{
int32_t i, length;
length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
for(i=0; i<3 && i<length;) {
b=*(sourceLimit-i-1);
if(U8_IS_TRAIL(b)) {
++i;
} else {
if(i<U8_COUNT_TRAIL_BYTES(b)) {
/* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
sourceLimit-=i+1;
// Do not go back into the bytes that will be read for finishing a partial
// sequence from the previous buffer.
int32_t length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
if(length>0) {
uint8_t b1=*(sourceLimit-1);
if(U8_IS_SINGLE(b1)) {
// common ASCII character
} else if(U8_IS_TRAIL(b1) && length>=2) {
uint8_t b2=*(sourceLimit-2);
if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
// truncated 3-byte sequence
sourceLimit-=2;
}
break;
} else if(0xc2<=b1 && b1<0xf0) {
// truncated 2- or 3-byte sequence
--sourceLimit;
}
}
}
@ -5412,7 +5403,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
while(source<sourceLimit) {
if(targetCapacity>0) {
b=*source++;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
/* convert ASCII */
if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
*target++=b;
@ -5426,13 +5417,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
}
}
} else {
if(b>0xe0) {
if( /* handle U+1000..U+D7FF inline */
(((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
(b==0xed && (t1 <= 0x1f))) &&
if(b>=0xe0) {
if( /* handle U+0800..U+D7FF inline */
b<=0xed && // do not assume maxFastUChar>0xd7ff
U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
(t2=(uint8_t)(source[1]-0x80)) <= 0x3f
) {
c=((b&0xf)<<6)|t1;
c=((b&0xf)<<6)|(t1&0x3f);
source+=2;
value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
if(value==0) {
@ -5442,7 +5433,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
} else {
c=-1;
}
} else if(b<0xe0) {
} else {
if( /* handle U+0080..U+07FF inline */
b>=0xc2 &&
(t1=(uint8_t)(*source-0x80)) <= 0x3f
@ -5457,15 +5448,13 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
} else {
c=-1;
}
} else {
c=-1;
}
if(c<0) {
/* handle "complicated" and error cases, and continuing partial characters */
oldToULength=0;
toULength=1;
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
toULimit=U8_COUNT_BYTES_NON_ASCII(b);
c=b;
moreBytes:
while(toULength<toULimit) {
@ -5478,7 +5467,7 @@ moreBytes:
*/
if(source<(uint8_t *)pToUArgs->sourceLimit) {
b=*source;
if(U8_IS_TRAIL(b)) {
if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
++source;
++toULength;
c=(c<<6)+b;
@ -5500,22 +5489,18 @@ moreBytes:
}
}
if( toULength==toULimit && /* consumed all trail bytes */
(toULength==3 || toULength==2) && /* BMP */
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
) {
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
} else if(
toULength==toULimit && toULength==4 &&
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
) {
/* supplementary code point */
if(!hasSupplementary) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
stage2Entry=0;
} else {
if(toULength==toULimit) {
c-=utf8_offsets[toULength];
if(toULength<=3) { /* BMP */
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
} else {
/* supplementary code point */
if(!hasSupplementary) {
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
stage2Entry=0;
} else {
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
}
}
} else {
/* error handling: illegal UTF-8 byte sequence */
@ -5620,7 +5605,7 @@ unassigned:
source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
c=utf8->toUBytes[0]=b=*source++;
toULength=1;
toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
toULimit=U8_COUNT_BYTES(b);
while(source<sourceLimit) {
utf8->toUBytes[toULength++]=b=*source++;
c=(c<<6)+b;

View file

@ -79,14 +79,14 @@
* prime number while being less than a power of two.
*/
static const int32_t PRIMES[] = {
13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
7, 13, 31, 61, 127, 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
65521, 131071, 262139, 524287, 1048573, 2097143, 4194301, 8388593,
16777213, 33554393, 67108859, 134217689, 268435399, 536870909,
1073741789, 2147483647 /*, 4294967291 */
};
#define PRIMES_LENGTH UPRV_LENGTHOF(PRIMES)
#define DEFAULT_PRIME_INDEX 3
#define DEFAULT_PRIME_INDEX 4
/* These ratios are tuned to the PRIMES array such that a resize
* places the table back into the zone of non-resizing. That is,
@ -231,7 +231,7 @@ _uhash_allocate(UHashtable *hash,
emptytok.pointer = NULL; /* Only one of these two is needed */
emptytok.integer = 0; /* but we don't know which one. */
limit = p + hash->length;
while (p < limit) {
p->key = emptytok;
@ -247,7 +247,7 @@ _uhash_allocate(UHashtable *hash,
static UHashtable*
_uhash_init(UHashtable *result,
UHashFunction *keyHash,
UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
int32_t primeIndex,
@ -275,7 +275,7 @@ _uhash_init(UHashtable *result,
}
static UHashtable*
_uhash_create(UHashFunction *keyHash,
_uhash_create(UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
int32_t primeIndex,
@ -415,7 +415,7 @@ _uhash_rehash(UHashtable *hash, UErrorCode *status) {
if (U_FAILURE(*status)) {
hash->elements = old;
hash->length = oldLength;
hash->length = oldLength;
return;
}
@ -536,7 +536,7 @@ _uhash_put(UHashtable *hash,
********************************************************************/
U_CAPI UHashtable* U_EXPORT2
uhash_open(UHashFunction *keyHash,
uhash_open(UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
UErrorCode *status) {
@ -545,7 +545,7 @@ uhash_open(UHashFunction *keyHash,
}
U_CAPI UHashtable* U_EXPORT2
uhash_openSize(UHashFunction *keyHash,
uhash_openSize(UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
int32_t size,
@ -562,7 +562,7 @@ uhash_openSize(UHashFunction *keyHash,
U_CAPI UHashtable* U_EXPORT2
uhash_init(UHashtable *fillinResult,
UHashFunction *keyHash,
UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
UErrorCode *status) {
@ -570,6 +570,22 @@ uhash_init(UHashtable *fillinResult,
return _uhash_init(fillinResult, keyHash, keyComp, valueComp, DEFAULT_PRIME_INDEX, status);
}
/*
 * Initializes the caller-supplied hashtable, sizing it from the PRIMES table:
 * the first entry that is >= size is used, and if no entry is large enough
 * the last entry of the table is used instead.
 */
U_CAPI UHashtable* U_EXPORT2
uhash_initSize(UHashtable *fillinResult,
               UHashFunction *keyHash,
               UKeyComparator *keyComp,
               UValueComparator *valueComp,
               int32_t size,
               UErrorCode *status) {
    int32_t primeIndex;
    /* Stop at the last index even if that prime is still smaller than size. */
    for (primeIndex = 0; primeIndex<(PRIMES_LENGTH-1); ++primeIndex) {
        if (PRIMES[primeIndex] >= size) {
            break;
        }
    }
    return _uhash_init(fillinResult, keyHash, keyComp, valueComp, primeIndex, status);
}
U_CAPI void U_EXPORT2
uhash_close(UHashtable *hash) {
if (hash == NULL) {
@ -604,7 +620,7 @@ uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn) {
hash->keyComparator = fn;
return result;
}
U_CAPI UValueComparator *U_EXPORT2
U_CAPI UValueComparator *U_EXPORT2
uhash_setValueComparator(UHashtable *hash, UValueComparator *fn){
UValueComparator *result = hash->valueComparator;
hash->valueComparator = fn;
@ -630,7 +646,7 @@ uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy) {
UErrorCode status = U_ZERO_ERROR;
_uhash_internalSetResizePolicy(hash, policy);
hash->lowWaterMark = (int32_t)(hash->length * hash->lowWaterRatio);
hash->highWaterMark = (int32_t)(hash->length * hash->highWaterRatio);
hash->highWaterMark = (int32_t)(hash->length * hash->highWaterRatio);
_uhash_rehash(hash, &status);
}
@ -853,7 +869,7 @@ uhash_hashIChars(const UHashTok key) {
return s == NULL ? 0 : ustr_hashICharsN(s, uprv_strlen(s));
}
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
uhash_equals(const UHashtable* hash1, const UHashtable* hash2){
int32_t count1, count2, pos, i;
@ -886,14 +902,14 @@ uhash_equals(const UHashtable* hash1, const UHashtable* hash2){
if(count1!=count2){
return FALSE;
}
pos=UHASH_FIRST;
for(i=0; i<count1; i++){
const UHashElement* elem1 = uhash_nextElement(hash1, &pos);
const UHashTok key1 = elem1->key;
const UHashTok val1 = elem1->value;
/* here the keys are not compared, instead the key from hash1 is used to fetch
* value from hash2. If the hashes are equal then both hashes should
* value from hash2. If the hashes are equal then both hashes should
* contain equal values for the same key!
*/
const UHashElement* elem2 = _uhash_find(hash2, key1, hash2->keyHasher(key1));

View file

@ -154,7 +154,7 @@ struct UHashtable {
* If NULL won't do anything */
/* Size parameters */
int32_t count; /* The number of key-value pairs in this table.
* 0 <= count <= length. In practice we
* never let count == length (see code). */
@ -162,12 +162,12 @@ struct UHashtable {
* and values. Must be prime. */
/* Rehashing thresholds */
int32_t highWaterMark; /* If count > highWaterMark, rehash */
int32_t lowWaterMark; /* If count < lowWaterMark, rehash */
float highWaterRatio; /* 0..1; high water as a fraction of length */
float lowWaterRatio; /* 0..1; low water as a fraction of length */
int8_t primeIndex; /* Index into our prime table for length.
* length == PRIMES[primeIndex] */
UBool allocated; /* Was this UHashtable allocated? */
@ -190,7 +190,7 @@ U_CDECL_END
* @return A pointer to a UHashtable, or 0 if an error occurred.
* @see uhash_openSize
*/
U_CAPI UHashtable* U_EXPORT2
U_CAPI UHashtable* U_EXPORT2
uhash_open(UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
@ -207,7 +207,7 @@ uhash_open(UHashFunction *keyHash,
* @return A pointer to a UHashtable, or 0 if an error occurred.
* @see uhash_open
*/
U_CAPI UHashtable* U_EXPORT2
U_CAPI UHashtable* U_EXPORT2
uhash_openSize(UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
@ -224,18 +224,37 @@ uhash_openSize(UHashFunction *keyHash,
* @return A pointer to a UHashtable, or 0 if an error occurred.
* @see uhash_openSize
*/
U_CAPI UHashtable* U_EXPORT2
U_CAPI UHashtable* U_EXPORT2
uhash_init(UHashtable *hash,
UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
UErrorCode *status);
/**
* Initialize an existing UHashtable, using the requested size to choose
* its initial capacity.
* @param hash The caller-provided UHashtable to initialize.
* @param keyHash A pointer to the key hashing function. Must not be
* NULL.
* @param keyComp A pointer to the function that compares keys. Must
* not be NULL.
* @param valueComp A pointer to the function that compares values
* (a UValueComparator).
* @param size The initial capacity of this hash table.
* @param status A pointer to an UErrorCode to receive any errors.
* @return A pointer to a UHashtable, or 0 if an error occurred.
* @see uhash_init
* @see uhash_openSize
*/
U_CAPI UHashtable* U_EXPORT2
uhash_initSize(UHashtable *hash,
UHashFunction *keyHash,
UKeyComparator *keyComp,
UValueComparator *valueComp,
int32_t size,
UErrorCode *status);
/**
* Close a UHashtable, releasing the memory used.
* @param hash The UHashtable to close. If hash is NULL no operation is performed.
*/
U_CAPI void U_EXPORT2
U_CAPI void U_EXPORT2
uhash_close(UHashtable *hash);
@ -246,7 +265,7 @@ uhash_close(UHashtable *hash);
* @param fn the function to be used to hash keys; must not be NULL
* @return the previous key hasher; non-NULL
*/
U_CAPI UHashFunction *U_EXPORT2
U_CAPI UHashFunction *U_EXPORT2
uhash_setKeyHasher(UHashtable *hash, UHashFunction *fn);
/**
@ -256,7 +275,7 @@ uhash_setKeyHasher(UHashtable *hash, UHashFunction *fn);
* @param fn the function to be used to compare keys; must not be NULL
* @return the previous key comparator; non-NULL
*/
U_CAPI UKeyComparator *U_EXPORT2
U_CAPI UKeyComparator *U_EXPORT2
uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn);
/**
@ -266,7 +285,7 @@ uhash_setKeyComparator(UHashtable *hash, UKeyComparator *fn);
* @param fn the function to be used to compare values; must not be NULL
* @return the previous value comparator; non-NULL
*/
U_CAPI UValueComparator *U_EXPORT2
U_CAPI UValueComparator *U_EXPORT2
uhash_setValueComparator(UHashtable *hash, UValueComparator *fn);
/**
@ -279,7 +298,7 @@ uhash_setValueComparator(UHashtable *hash, UValueComparator *fn);
* @param fn the function to be used to delete keys, or NULL
* @return the previous key deleter; may be NULL
*/
U_CAPI UObjectDeleter *U_EXPORT2
U_CAPI UObjectDeleter *U_EXPORT2
uhash_setKeyDeleter(UHashtable *hash, UObjectDeleter *fn);
/**
@ -292,7 +311,7 @@ uhash_setKeyDeleter(UHashtable *hash, UObjectDeleter *fn);
* @param fn the function to be used to delete values, or NULL
* @return the previous value deleter; may be NULL
*/
U_CAPI UObjectDeleter *U_EXPORT2
U_CAPI UObjectDeleter *U_EXPORT2
uhash_setValueDeleter(UHashtable *hash, UObjectDeleter *fn);
/**
@ -302,7 +321,7 @@ uhash_setValueDeleter(UHashtable *hash, UObjectDeleter *fn);
* @param hash The UHashtable to set
* @param policy The way the hashtable resizes itself, {U_GROW, U_GROW_AND_SHRINK, U_FIXED}
*/
U_CAPI void U_EXPORT2
U_CAPI void U_EXPORT2
uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy);
/**
@ -310,7 +329,7 @@ uhash_setResizePolicy(UHashtable *hash, enum UHashResizePolicy policy);
* @param hash The UHashtable to query.
* @return The number of key-value pairs stored in hash.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_count(const UHashtable *hash);
/**
@ -326,7 +345,7 @@ uhash_count(const UHashtable *hash);
* @return The previous value, or NULL if none.
* @see uhash_get
*/
U_CAPI void* U_EXPORT2
U_CAPI void* U_EXPORT2
uhash_put(UHashtable *hash,
void *key,
void *value,
@ -344,7 +363,7 @@ uhash_put(UHashtable *hash,
* @return The previous value, or NULL if none.
* @see uhash_get
*/
U_CAPI void* U_EXPORT2
U_CAPI void* U_EXPORT2
uhash_iput(UHashtable *hash,
int32_t key,
void* value,
@ -362,7 +381,7 @@ uhash_iput(UHashtable *hash,
* @return The previous value, or 0 if none.
* @see uhash_get
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_puti(UHashtable *hash,
void* key,
int32_t value,
@ -380,7 +399,7 @@ uhash_puti(UHashtable *hash,
* @return The previous value, or 0 if none.
* @see uhash_get
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_iputi(UHashtable *hash,
int32_t key,
int32_t value,
@ -393,8 +412,8 @@ uhash_iputi(UHashtable *hash,
* @param key A pointer key stored in a hashtable
* @return The requested item, or NULL if not found.
*/
U_CAPI void* U_EXPORT2
uhash_get(const UHashtable *hash,
U_CAPI void* U_EXPORT2
uhash_get(const UHashtable *hash,
const void *key);
/**
@ -404,7 +423,7 @@ uhash_get(const UHashtable *hash,
* @param key An integer key stored in a hashtable
* @return The requested item, or NULL if not found.
*/
U_CAPI void* U_EXPORT2
U_CAPI void* U_EXPORT2
uhash_iget(const UHashtable *hash,
int32_t key);
@ -415,7 +434,7 @@ uhash_iget(const UHashtable *hash,
* @param key A pointer key stored in a hashtable
* @return The requested item, or 0 if not found.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_geti(const UHashtable *hash,
const void* key);
/**
@ -425,7 +444,7 @@ uhash_geti(const UHashtable *hash,
* @param key An integer key stored in a hashtable
* @return The requested item, or 0 if not found.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_igeti(const UHashtable *hash,
int32_t key);
@ -435,7 +454,7 @@ uhash_igeti(const UHashtable *hash,
* @param key A key stored in a hashtable
* @return The item removed, or NULL if not found.
*/
U_CAPI void* U_EXPORT2
U_CAPI void* U_EXPORT2
uhash_remove(UHashtable *hash,
const void *key);
@ -445,7 +464,7 @@ uhash_remove(UHashtable *hash,
* @param key An integer key stored in a hashtable
* @return The item removed, or NULL if not found.
*/
U_CAPI void* U_EXPORT2
U_CAPI void* U_EXPORT2
uhash_iremove(UHashtable *hash,
int32_t key);
@ -455,7 +474,7 @@ uhash_iremove(UHashtable *hash,
* @param key A key stored in a hashtable
* @return The item removed, or 0 if not found.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_removei(UHashtable *hash,
const void* key);
@ -465,7 +484,7 @@ uhash_removei(UHashtable *hash,
* @param key An integer key stored in a hashtable
* @return The item removed, or 0 if not found.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_iremovei(UHashtable *hash,
int32_t key);
@ -473,7 +492,7 @@ uhash_iremovei(UHashtable *hash,
* Remove all items from a UHashtable.
* @param hash The target UHashtable.
*/
U_CAPI void U_EXPORT2
U_CAPI void U_EXPORT2
uhash_removeAll(UHashtable *hash);
/**
@ -487,7 +506,7 @@ uhash_removeAll(UHashtable *hash);
* @param key A key stored in a hashtable
* @return a hash element, or NULL if the key is not found.
*/
U_CAPI const UHashElement* U_EXPORT2
U_CAPI const UHashElement* U_EXPORT2
uhash_find(const UHashtable *hash, const void* key);
/**
@ -510,7 +529,7 @@ uhash_find(const UHashtable *hash, const void* key);
* @return a hash element, or NULL if no further key-value pairs
* exist in the table.
*/
U_CAPI const UHashElement* U_EXPORT2
U_CAPI const UHashElement* U_EXPORT2
uhash_nextElement(const UHashtable *hash,
int32_t *pos);
@ -525,7 +544,7 @@ uhash_nextElement(const UHashtable *hash,
* modified.
* @return the value that was removed.
*/
U_CAPI void* U_EXPORT2
U_CAPI void* U_EXPORT2
uhash_removeElement(UHashtable *hash, const UHashElement* e);
/********************************************************************
@ -537,7 +556,7 @@ uhash_removeElement(UHashtable *hash, const UHashElement* e);
* @param i The given integer
* @return a UHashTok for an integer.
*/
/*U_CAPI UHashTok U_EXPORT2
/*U_CAPI UHashTok U_EXPORT2
uhash_toki(int32_t i);*/
/**
@ -545,7 +564,7 @@ uhash_toki(int32_t i);*/
* @param p The given pointer
* @return a UHashTok for a pointer.
*/
/*U_CAPI UHashTok U_EXPORT2
/*U_CAPI UHashTok U_EXPORT2
uhash_tokp(void* p);*/
/********************************************************************
@ -559,7 +578,7 @@ uhash_tokp(void* p);*/
* @param key The string (const UChar*) to hash.
* @return A hash code for the key.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_hashUChars(const UHashTok key);
/**
@ -569,7 +588,7 @@ uhash_hashUChars(const UHashTok key);
* @param key The string (const char*) to hash.
* @return A hash code for the key.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_hashChars(const UHashTok key);
/**
@ -589,7 +608,7 @@ uhash_hashIChars(const UHashTok key);
* @param key2 The string for comparison
* @return true if key1 and key2 are equal, return false otherwise.
*/
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
uhash_compareUChars(const UHashTok key1, const UHashTok key2);
/**
@ -599,7 +618,7 @@ uhash_compareUChars(const UHashTok key1, const UHashTok key2);
* @param key2 The string for comparison
* @return true if key1 and key2 are equal, return false otherwise.
*/
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
uhash_compareChars(const UHashTok key1, const UHashTok key2);
/**
@ -609,7 +628,7 @@ uhash_compareChars(const UHashTok key1, const UHashTok key2);
* @param key2 The string for comparison
* @return true if key1 and key2 are equal, return false otherwise.
*/
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
uhash_compareIChars(const UHashTok key1, const UHashTok key2);
/********************************************************************
@ -621,7 +640,7 @@ uhash_compareIChars(const UHashTok key1, const UHashTok key2);
* @param key The string (const UnicodeString*) to hash.
* @return A hash code for the key.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key);
/**
@ -630,7 +649,7 @@ uhash_hashUnicodeString(const UElement key);
* @param key The string (const UnicodeString*) to hash.
* @return A hash code for the key.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_hashCaselessUnicodeString(const UElement key);
/********************************************************************
@ -642,7 +661,7 @@ uhash_hashCaselessUnicodeString(const UElement key);
* @param key The integer key to hash.
* @return A hash code for the key.
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
uhash_hashLong(const UHashTok key);
/**
@ -651,7 +670,7 @@ uhash_hashLong(const UHashTok key);
* @param key2 The integer for comparison
* @return true if key1 and key2 are equal, return false otherwise
*/
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
uhash_compareLong(const UHashTok key1, const UHashTok key2);
/********************************************************************
@ -662,7 +681,7 @@ uhash_compareLong(const UHashTok key1, const UHashTok key2);
* Deleter for Hashtable objects.
* @param obj The object to be deleted
*/
U_CAPI void U_EXPORT2
U_CAPI void U_EXPORT2
uhash_deleteHashtable(void *obj);
/* Use uprv_free() itself as a deleter for any key or value allocated using uprv_malloc. */
@ -673,7 +692,7 @@ uhash_deleteHashtable(void *obj);
* @param hash2
* @return true if the hashtables are equal and false if not.
*/
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
uhash_equals(const UHashtable* hash1, const UHashtable* hash2);

View file

@ -8,6 +8,7 @@
#define __CASEMAP_H__
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
/**
@ -20,6 +21,7 @@ U_NAMESPACE_BEGIN
#ifndef U_HIDE_DRAFT_API
class BreakIterator;
class ByteSink;
class Edits;
/**
@ -36,7 +38,7 @@ public:
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
@ -48,7 +50,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
@ -71,7 +74,7 @@ public:
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
@ -83,7 +86,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
@ -112,7 +116,7 @@ public:
* all others. (This can be modified with options bits.)
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
@ -132,7 +136,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
@ -161,7 +166,7 @@ public:
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
@ -174,7 +179,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
@ -190,6 +196,129 @@ public:
char16_t *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
/**
* Lowercases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8ToLower
* @draft ICU 60
*/
static void utf8ToLower(
const char *locale, uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
/**
* Uppercases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8ToUpper
* @draft ICU 60
*/
static void utf8ToUpper(
const char *locale, uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Titlecases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
*
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with options bits.)
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
* It is set to the source string (setUText())
* and used one or more times for iteration (first() and next()).
* If NULL, then a word break iterator for the locale is used
* (or something equivalent).
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8ToTitle
* @draft ICU 60
*/
static void utf8ToTitle(
const char *locale, uint32_t options, BreakIterator *iter,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
#endif // UCONFIG_NO_BREAK_ITERATION
/**
* Case-folds a UTF-8 string and optionally records edits.
*
* Case folding is locale-independent and not context-sensitive,
* but there is an option for whether to include or exclude mappings for dotted I
* and dotless i that are marked with 'T' in CaseFolding.txt.
*
* The result may be longer or shorter than the original.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8FoldCase
* @draft ICU 60
*/
static void utf8Fold(
uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
/**
* Lowercases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
@ -197,7 +326,7 @@ public:
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
@ -209,7 +338,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
@ -219,7 +349,7 @@ public:
* @see ucasemap_utf8ToLower
* @draft ICU 59
*/
static int32_t utf8ToLower(
static int32_t utf8ToLower(
const char *locale, uint32_t options,
const char *src, int32_t srcLength,
char *dest, int32_t destCapacity, Edits *edits,
@ -232,7 +362,7 @@ public:
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
@ -244,7 +374,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
@ -273,7 +404,7 @@ public:
* all others. (This can be modified with options bits.)
*
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
@ -293,7 +424,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
@ -321,7 +453,7 @@ public:
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT,
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
@ -334,7 +466,8 @@ public:
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be NULL.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be NULL.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.

View file

@ -148,7 +148,7 @@ public:
Iterator() :
array(nullptr), index(0), length(0),
remaining(0), onlyChanges_(FALSE), coarse(FALSE),
changed(FALSE), oldLength_(0), newLength_(0),
dir(0), changed(FALSE), oldLength_(0), newLength_(0),
srcIndex(0), replIndex(0), destIndex(0) {}
/**
* Copy constructor.
@ -306,17 +306,22 @@ public:
Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
int32_t readLength(int32_t head);
void updateIndexes();
void updateNextIndexes();
void updatePreviousIndexes();
UBool noNext();
UBool next(UBool onlyChanges, UErrorCode &errorCode);
UBool previous(UErrorCode &errorCode);
/** @return -1: error or i<0; 0: found; 1: i>=string length */
int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
const uint16_t *array;
int32_t index, length;
// 0 if we are not within compressed equal-length changes.
// Otherwise the number of remaining changes, including the current one.
int32_t remaining;
UBool onlyChanges_, coarse;
int8_t dir; // iteration direction: back(<0), initial(0), forward(>0)
UBool changed;
int32_t oldLength_, newLength_;
int32_t srcIndex, replIndex, destIndex;

View file

@ -55,14 +55,26 @@ class U_COMMON_API FilteredBreakIteratorBuilder : public UObject {
*/
static FilteredBreakIteratorBuilder *createInstance(const Locale& where, UErrorCode& status);
/**
* This function has been deprecated in favor of createEmptyInstance, which has
* identical behavior.
* @param status The error code.
* @return the new builder
* @deprecated ICU 60 use createEmptyInstance instead
* @see createEmptyInstance()
*/
static inline FilteredBreakIteratorBuilder *createInstance(UErrorCode &status) {
// Deprecated spelling kept for source compatibility: forwards the caller's
// status straight to createEmptyInstance() and returns its result unchanged.
return createEmptyInstance(status);
}
/**
* Construct an empty FilteredBreakIteratorBuilder.
* In this state, it will not suppress any segment boundaries.
* @param status The error code.
* @return the new builder
* @stable ICU 56
* @draft ICU 60
*/
static FilteredBreakIteratorBuilder *createInstance(UErrorCode &status);
static FilteredBreakIteratorBuilder *createEmptyInstance(UErrorCode &status);
/**
* Suppress a certain string from being the end of a segment.
@ -89,6 +101,17 @@ class U_COMMON_API FilteredBreakIteratorBuilder : public UObject {
*/
virtual UBool unsuppressBreakAfter(const UnicodeString& string, UErrorCode& status) = 0;
/**
* This function has been deprecated in favor of wrapIteratorWithFilter().
* The behavior is identical.
* @param adoptBreakIterator the break iterator to adopt
* @param status error code
* @return the new BreakIterator, owned by the caller.
* @deprecated ICU 60 use wrapIteratorWithFilter() instead
* @see wrapIteratorWithFilter()
*/
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) = 0;
/**
* Wrap (adopt) an existing break iterator in a new filtered instance.
* The resulting BreakIterator is owned by the caller.
@ -96,12 +119,15 @@ class U_COMMON_API FilteredBreakIteratorBuilder : public UObject {
* Note that the adoptBreakIterator is adopted by the new BreakIterator
* and should no longer be used by the caller.
* The FilteredBreakIteratorBuilder may be reused.
* This function is an alias for build()
* @param adoptBreakIterator the break iterator to adopt
* @param status error code
* @return the new BreakIterator, owned by the caller.
* @stable ICU 56
* @draft ICU 60
*/
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) = 0;
inline BreakIterator *wrapIteratorWithFilter(BreakIterator* adoptBreakIterator, UErrorCode& status) {
// Non-virtual alias: delegates to the virtual build(), so subclasses only
// need to implement build() and both names return the same result.
return build(adoptBreakIterator, status);
}
protected:
/**

View file

@ -88,7 +88,7 @@ class UnicodeString;
* <P>
* The third constructor requires a third argument--the <STRONG>Variant.</STRONG>
* The Variant codes are vendor and browser-specific.
* For example, use REVISED for a langauge's revised script orthography, and POSIX for POSIX.
* For example, use REVISED for a language's revised script orthography, and POSIX for POSIX.
* Where there are two variants, separate them with an underscore, and
* put the most important one first. For
* example, a Traditional Spanish collation might be referenced, with

View file

@ -228,14 +228,15 @@ public:
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
* Otherwise currently converts to & from UTF-16 and does not support edits.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src Source UTF-8 string.
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be nullptr.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
@ -534,7 +535,7 @@ public:
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const override;
UErrorCode &errorCode) const U_OVERRIDE;
/**
* Normalizes a UTF-8 string and optionally records how source substrings
@ -545,14 +546,15 @@ public:
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
* Otherwise currently converts to & from UTF-16 and does not support edits.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src Source UTF-8 string.
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first. edits can be nullptr.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
@ -561,7 +563,7 @@ public:
*/
virtual void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override;
Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
/**
* Appends the normalized form of the second string to the first string
@ -580,7 +582,7 @@ public:
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override;
UErrorCode &errorCode) const U_OVERRIDE;
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
@ -598,7 +600,7 @@ public:
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override;
UErrorCode &errorCode) const U_OVERRIDE;
/**
* Gets the decomposition mapping of c.
@ -612,7 +614,7 @@ public:
* @stable ICU 4.6
*/
virtual UBool
getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
/**
* Gets the raw decomposition mapping of c.
@ -626,7 +628,7 @@ public:
* @stable ICU 49
*/
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
@ -639,7 +641,7 @@ public:
* @stable ICU 49
*/
virtual UChar32
composePair(UChar32 a, UChar32 b) const override;
composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
/**
* Gets the combining class of c.
@ -650,7 +652,7 @@ public:
* @stable ICU 49
*/
virtual uint8_t
getCombiningClass(UChar32 c) const override;
getCombiningClass(UChar32 c) const U_OVERRIDE;
/**
* Tests if the string is normalized.
@ -664,7 +666,7 @@ public:
* @stable ICU 4.4
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
/**
* Tests if the UTF-8 string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
@ -687,7 +689,7 @@ public:
* @draft ICU 60
*/
virtual UBool
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.
@ -700,7 +702,7 @@ public:
* @stable ICU 4.4
*/
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
/**
* Returns the end of the normalized substring of the input string.
* For details see the Normalizer2 base class documentation.
@ -713,7 +715,7 @@ public:
* @stable ICU 4.4
*/
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
/**
* Tests if the character always has a normalization boundary before it,
@ -723,7 +725,7 @@ public:
* @return TRUE if c has a normalization boundary before it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryBefore(UChar32 c) const override;
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
/**
* Tests if the character always has a normalization boundary after it,
@ -733,7 +735,7 @@ public:
* @return TRUE if c has a normalization boundary after it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryAfter(UChar32 c) const override;
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
/**
* Tests if the character is normalization-inert.
@ -742,7 +744,7 @@ public:
* @return TRUE if c is normalization-inert
* @stable ICU 4.4
*/
virtual UBool isInert(UChar32 c) const override;
virtual UBool isInert(UChar32 c) const U_OVERRIDE;
private:
UnicodeString &
normalize(const UnicodeString &src,

View file

@ -830,6 +830,16 @@ namespace std {
# define U_CALLCONV U_EXPORT2
#endif
/**
* \def U_CALLCONV_FPTR
* Similar to U_CALLCONV, but only used on function pointers.
*
* Expands to U_CALLCONV only when U_PLATFORM is U_PF_OS390 and the
* translation unit is compiled as C++; in every other configuration
* it expands to nothing, so it can be written unconditionally in
* function-pointer declarations.
* @internal
*/
#if U_PLATFORM == U_PF_OS390 && defined(__cplusplus)
# define U_CALLCONV_FPTR U_CALLCONV
#else
# define U_CALLCONV_FPTR
#endif
/* @} */
#endif

View file

@ -31,21 +31,14 @@
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
U_NAMESPACE_BEGIN
/** @internal */
struct RBBIDataHeader;
class RuleBasedBreakIteratorTables;
class BreakIterator;
class RBBIDataWrapper;
class UStack;
class LanguageBreakEngine;
struct RBBIDataHeader;
class RBBIDataWrapper;
class UnhandledEngine;
struct RBBIStateTable;
class UStack;
/**
*
@ -94,19 +87,36 @@ private:
*/
RBBIDataWrapper *fData;
/** Index of the Rule {tag} values for the most recent match.
/**
* The iteration state - current position, rule status for the current position,
* and whether the iterator ran off the end, yielding UBRK_DONE.
* Current position is pinned to be 0 <= position <= text.length.
* Current position is always set to a boundary.
* @internal
*/
int32_t fLastRuleStatusIndex;
/**
* The current position of the iterator. Pinned, 0 <= fPosition <= text.length.
* Never has the value UBRK_DONE (-1).
*/
int32_t fPosition;
/**
* Rule tag value valid flag.
* Some iterator operations don't intrinsically set the correct tag value.
* This flag lets us lazily compute the value if we are ever asked for it.
* @internal
*/
UBool fLastStatusIndexValid;
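* Info from the state table indicating which rules caused the boundary
* at the current position. TODO: document fully.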
*/
int32_t fRuleStatusIndex;
/**
* True when iteration has run off the end, and iterator functions should return UBRK_DONE.
*/
UBool fDone;
/**
* Cache of previously determined boundary positions.
*/
public: // TODO: debug, return to private.
class BreakCache;
BreakCache *fBreakCache;
private:
/**
* Counter for the number of characters encountered with the "dictionary"
* flag set.
@ -115,26 +125,11 @@ private:
uint32_t fDictionaryCharCount;
/**
* When a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text. Has the most impact for line breaking.
* @internal
* Cache of boundary positions within a region of text that has been
* sub-divided by dictionary based breaking.
*/
int32_t* fCachedBreakPositions;
/**
* The number of elements in fCachedBreakPositions
* @internal
*/
int32_t fNumCachedBreakPositions;
/**
* if fCachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
* @internal
*/
int32_t fPositionInCache;
class DictionaryCache;
DictionaryCache *fDictionaryCache;
/**
*
@ -177,13 +172,11 @@ private:
*/
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
/** @internal */
friend class RBBIRuleBuilder;
/** @internal */
friend class BreakIterator;
public:
/** Default constructor. Creates an empty shell of an iterator, with no
@ -467,7 +460,10 @@ public:
virtual UBool isBoundary(int32_t offset);
/**
* Returns the current iteration position.
* Returns the current iteration position. Note that UBRK_DONE is never
* returned from this function; if iteration has run to the end of a
* string, current() will return the length of the string while
* next() will return UBRK_DONE.
* @return The current iteration position.
* @stable ICU 2.0
*/
@ -499,6 +495,7 @@ public:
* Note: this function is not thread safe. It should not have been
* declared const, and the const remains only for compatibility
* reasons. (The function is logically const, but not bit-wise const).
* TODO: check this. Probably thread safe now.
* <p>
* @return the status from the break rule that determined the most recently
* returned break position.
@ -658,46 +655,31 @@ private:
* Common initialization function, used by constructors and bufferClone.
* @internal
*/
void init();
void init(UErrorCode &status);
/**
* This method backs the iterator back up to a "safe position" in the text.
* This is a position that we know, without any context, must be a break position.
* The various calling methods then iterate forward from this safe position to
* the appropriate position to return. (For more information, see the description
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
* @param statetable state table used for moving backwards
* Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
* This locates a "Safe Position" from which the forward break rules
* will operate correctly. A Safe Position is not necessarily a boundary itself.
*
* @param fromPosition the position in the input text to begin the iteration.
* @internal
*/
int32_t handlePrevious(const RBBIStateTable *statetable);
int32_t handlePrevious(int32_t fromPosition);
/**
* This method is the actual implementation of the next() method. All iteration
* vectors through here. This method initializes the state machine to state 1
* and advances through the text character by character until we reach the end
* of the text or the state machine transitions to state 0. We update our return
* value every time the state machine passes through a possible end state.
* @param statetable state table used for moving forwards
* Find a rule-based boundary by running the state machine.
* Input
* fPosition, the position in the text to begin from.
* Output
* fPosition: the boundary following the starting position.
* fDictionaryCharCount the number of dictionary characters encountered.
* If > 0, the segment will be further subdivided
* fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
*
* @internal
*/
int32_t handleNext(const RBBIStateTable *statetable);
/**
* This is the function that actually implements dictionary-based
* breaking. Covering at least the range from startPos to endPos,
* it checks for dictionary characters, and if it finds them determines
* the appropriate object to deal with them. It may cache found breaks in
* fCachedBreakPositions as it goes. It may well also look at text outside
* the range startPos to endPos.
* If going forward, endPos is the normal Unicode break result, and
* if going in reverse, startPos is the normal Unicode break result
* @param startPos The start position of a range of text
* @param endPos The end position of a range of text
* @param reverse The call is for the reverse direction
* @internal
*/
int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
int32_t handleNext();
/**
@ -708,11 +690,12 @@ private:
*/
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
public:
/**
* @internal
* Debugging function only.
* @internal
*/
void makeRuleStatusValid();
void dumpCache();
};
//------------------------------------------------------------------------------

View file

@ -134,6 +134,17 @@
*/
#define U_TITLECASE_ADJUST_TO_CASED 0x400
/**
* Option for string transformation functions to not first reset the Edits object.
* Used for example in some case-mapping and normalization functions.
*
* @see CaseMap
* @see Edits
* @see Normalizer2
* @draft ICU 60
*/
#define U_EDITS_NO_RESET 0x2000
/**
* Omit unchanged text when recording how source substrings
* relate to changed and unchanged result substrings.
@ -182,7 +193,6 @@
// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
// ustr_imp.h #define U_EDITS_NO_RESET 0x2000
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000
#endif // __STRINGOPTIONS_H__

View file

@ -230,7 +230,8 @@ typedef enum USentenceBreakTag {
* @param locale The locale specifying the text-breaking conventions. Note that
* locale keys such as "lb" and "ss" may be used to modify text break behavior,
* see general discussion of BreakIterator C API.
* @param text The text to be iterated over.
* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
* used to specify the text to be iterated.
* @param textLength The number of characters in text, or -1 if null-terminated.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified locale.

View file

@ -149,7 +149,7 @@ typedef void U_CALLCONV UMemFreeFn (const void *context, void *mem);
* @system
*/
U_STABLE void U_EXPORT2
u_setMemoryFunctions(const void *context, UMemAllocFn * U_CALLCONV a, UMemReallocFn * U_CALLCONV r, UMemFreeFn * U_CALLCONV f,
u_setMemoryFunctions(const void *context, UMemAllocFn * U_CALLCONV_FPTR a, UMemReallocFn * U_CALLCONV_FPTR r, UMemFreeFn * U_CALLCONV_FPTR f,
UErrorCode *status);
U_CDECL_END

View file

@ -768,7 +768,7 @@ utext_extract(UText *ut,
*/
#define UTEXT_SETNATIVEINDEX(ut, ix) \
{ int64_t __offset = (ix) - (ut)->chunkNativeStart; \
if (__offset>=0 && __offset<=(int64_t)(ut)->nativeIndexingLimit) { \
if (__offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit && (ut)->chunkContents[__offset]<0xdc00) { \
(ut)->chunkOffset=(int32_t)__offset; \
} else { \
utext_setNativeIndex((ut), (ix)); } }

View file

@ -23,9 +23,6 @@
* This file defines macros for checking whether a code point is
* a surrogate or a non-character etc.
*
* The UChar and UChar32 data types for Unicode code units and code points
* are defined in umachine.h because they can be machine-dependent.
*
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
* and itself includes utf8.h and utf16.h after some
* common definitions.
@ -50,11 +47,11 @@
* but are optimized for the much more frequently occurring BMP code points.
*
* umachine.h defines UChar to be an unsigned 16-bit integer.
* Where available, UChar is defined to be a char16_t
* or a wchar_t (if that is an unsigned 16-bit type), otherwise uint16_t.
* Since ICU 59, ICU uses char16_t in C++, UChar only in C,
* and defines UChar=char16_t by default. See the UChar API docs for details.
*
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
* Unicode code point (Unicode scalar value, 0..0x10ffff).
* Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
* Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
* the definition of UChar. For details see the documentation for UChar32 itself.
*
@ -63,11 +60,20 @@
* For actual Unicode character properties see uchar.h.
*
* By default, string operations must be done with error checking in case
* a string is not well-formed UTF-16.
* The macros will detect if a surrogate code unit is unpaired
* a string is not well-formed UTF-16 or UTF-8.
*
* The U16_ macros detect if a surrogate code unit is unpaired
* (lead unit without trail unit or vice versa) and just return the unit itself
* as the code point.
*
* The U8_ macros detect illegal byte sequences and return a negative value.
* Starting with ICU 60, the observable length of a single illegal byte sequence
* skipped by one of these macros follows the Unicode 6+ recommendation
* which is consistent with the W3C Encoding Standard.
*
* There are ..._OR_FFFD versions of both U16_ and U8_ macros
* that return U+FFFD for illegal code unit sequences.
*
* The regular "safe" macros require that the initial, passed-in string index
* is within bounds. They only check the index when they read more than one
* code unit. This is usually done with code similar to the following loop:
@ -91,10 +97,7 @@
* The performance differences are much larger here because UTF-8 provides so
* many opportunities for malformed sequences.
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
* and are fast, while the safe UTF-8 macros call functions for all but the
* trivial (ASCII) cases.
* (ICU 3.6 optimizes U8_NEXT() and U8_APPEND() to handle most other common
* characters inline as well.)
* and are fast, while the safe UTF-8 macros call functions for some complicated cases.
*
* Unlike with UTF-16, malformed sequences cannot be expressed with distinct
* code point values (0..U+10ffff). They are indicated with negative values instead.
@ -126,8 +129,7 @@
*/
#define U_IS_UNICODE_NONCHAR(c) \
((c)>=0xfdd0 && \
((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
(uint32_t)(c)<=0x10ffff)
((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)
/**
* Is c a Unicode code point value (0..U+10ffff)
@ -148,9 +150,7 @@
*/
#define U_IS_UNICODE_CHAR(c) \
((uint32_t)(c)<0xd800 || \
((uint32_t)(c)>0xdfff && \
(uint32_t)(c)<=0x10ffff && \
!U_IS_UNICODE_NONCHAR(c)))
(0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
/**
* Is this code point a BMP code point (U+0000..U+ffff)?

View file

@ -185,8 +185,8 @@
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then that itself
* will be returned as the code point.
* If the offset points to a single, unpaired surrogate, then
* c is set to that unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
@ -213,6 +213,53 @@
} \
}
#ifndef U_HIDE_DRAFT_API
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @draft ICU 60
*/
#define U16_GET_OR_FFFD(s, start, i, length, c) { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} else { \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
} \
}
#endif // U_HIDE_DRAFT_API
/* definitions with forward iteration --------------------------------------- */
/**
@ -253,8 +300,7 @@
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then that itself
* will be returned as the code point.
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param i string offset, must be i<length
@ -274,6 +320,44 @@
} \
}
#ifndef U_HIDE_DRAFT_API
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to U+FFFD.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @draft ICU 60
*/
#define U16_NEXT_OR_FFFD(s, i, length, c) { \
(c)=(s)[(i)++]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} \
}
#endif // U_HIDE_DRAFT_API
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
@ -481,8 +565,7 @@
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then that itself
* will be returned as the code point.
* trail surrogate, then c is set to that unpaired surrogate.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
@ -502,6 +585,43 @@
} \
}
#ifndef U_HIDE_DRAFT_API
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to U+FFFD.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @draft ICU 60
*/
#define U16_PREV_OR_FFFD(s, start, i, c) { \
(c)=(s)[--(i)]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
}
#endif // U_HIDE_DRAFT_API
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)

View file

@ -41,34 +41,24 @@
/* internal definitions ----------------------------------------------------- */
/**
* Counts the trail bytes for a UTF-8 lead byte.
* Returns 0 for 0..0xbf as well as for 0xfe and 0xff.
* Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* Note: Beginning with ICU 50, the implementation uses a multi-condition expression
* which was shown in 2012 (on x86-64) to compile to fast, branch-free code.
* leadByte is evaluated multiple times.
*
* The pre-ICU 50 implementation used the exported array utf8_countTrailBytes:
* #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte])
* leadByte was evaluated exactly once.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES(leadByte) \
((uint8_t)(leadByte)<0xf0 ? \
((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \
(uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0)
(U8_IS_LEAD(leadByte) ? \
((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
/**
* Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
* The maximum supported lead byte is 0xf4 corresponding to U+10FFFF.
* Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
@ -78,7 +68,7 @@
* @internal
*/
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
(((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0))
(((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
/**
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
@ -89,6 +79,40 @@
*/
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
/**
* Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* Lead byte E0..EF bits 3..0 are used as byte index,
* first trail byte bits 7..5 are used as bit index into that byte.
* @see U8_IS_VALID_LEAD3_AND_T1
* @internal
*/
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
/**
* Internal 3-byte UTF-8 validity check.
* Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
/**
* Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* First trail byte bits 7..4 are used as byte index,
* lead byte F0..F4 bits 2..0 are used as bit index into that byte.
* @see U8_IS_VALID_LEAD4_AND_T1
* @internal
*/
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
/**
* Internal 4-byte UTF-8 validity check.
* Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
/**
* Function for handling "next code point" with error-checking.
*
@ -148,20 +172,21 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
/**
* Is this code unit (byte) a UTF-8 lead byte?
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
// 0x32=0xf4-0xc2
/**
* Is this code unit (byte) a UTF-8 trail byte?
* Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @stable ICU 2.4
*/
#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
/**
* How many code units (bytes) are used for the UTF-8 encoding
@ -289,7 +314,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_NEXT_UNSAFE(s, i, c) { \
(c)=(uint8_t)(s)[(i)++]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
if((c)<0xe0) { \
(c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
} else if((c)<0xf0) { \
@ -325,22 +350,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_NEXT(s, i, length, c) { \
(c)=(uint8_t)(s)[(i)++]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t1, __t2; \
if( /* handle U+1000..U+CFFF inline */ \
(0xe0<(c) && (c)<=0xec) && \
(((i)+1)<(length) || (length)<0) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
) { \
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
if( /* handle U+0800..U+FFFF inline */ \
(0xe0<=(c) && (c)<0xf0) && \
(((i)+1)<(length) || (length)<0) && \
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
(i)+=2; \
} else if( /* handle U+0080..U+07FF inline */ \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
) { \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(s)[i]-0x80)<=0x3f) { \
(c)=(((c)&0x1f)<<6)|__t1; \
++(i); \
} else { \
@ -376,22 +398,19 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_NEXT_OR_FFFD(s, i, length, c) { \
(c)=(uint8_t)(s)[(i)++]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t1, __t2; \
if( /* handle U+1000..U+CFFF inline */ \
(0xe0<(c) && (c)<=0xec) && \
(((i)+1)<(length) || (length)<0) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
(__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
) { \
/* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
(c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
if( /* handle U+0800..U+FFFF inline */ \
(0xe0<=(c) && (c)<0xf0) && \
(((i)+1)<(length) || (length)<0) && \
U8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
(__t2=(s)[(i)+1]-0x80)<=0x3f) { \
(c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
(i)+=2; \
} else if( /* handle U+0080..U+07FF inline */ \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
) { \
((c)<0xe0 && (c)>=0xc2) && \
((i)!=(length)) && \
(__t1=(s)[i]-0x80)<=0x3f) { \
(c)=(((c)&0x1f)<<6)|__t1; \
++(i); \
} else { \
@ -476,7 +495,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* @stable ICU 2.4
*/
#define U8_FWD_1_UNSAFE(s, i) { \
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \
(i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
}
/**
@ -493,15 +512,24 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
* @stable ICU 2.4
*/
#define U8_FWD_1(s, i, length) { \
uint8_t __b=(uint8_t)(s)[(i)++]; \
if(U8_IS_LEAD(__b)) { \
uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
if((i)+__count>(length) && (length)>=0) { \
__count=(uint8_t)((length)-(i)); \
} \
while(__count>0 && U8_IS_TRAIL((s)[i])) { \
++(i); \
--__count; \
uint8_t __b=(s)[(i)++]; \
if(U8_IS_LEAD(__b) && (i)!=(length)) { \
uint8_t __t1=(s)[i]; \
if((0xe0<=__b && __b<0xf0)) { \
if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} else if(__b<0xe0) { \
if(U8_IS_TRAIL(__t1)) { \
++(i); \
} \
} else /* c>=0xf0 */ { \
if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
++(i); \
} \
} \
} \
}
@ -615,7 +643,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
/* c is a trail byte */ \
(c)&=0x3f; \
for(;;) { \
__b=(uint8_t)(s)[--(i)]; \
__b=(s)[--(i)]; \
if(__b>=0xc0) { \
U8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
@ -651,7 +679,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_PREV(s, start, i, c) { \
(c)=(uint8_t)(s)[--(i)]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
} \
}
@ -682,7 +710,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
*/
#define U8_PREV_OR_FFFD(s, start, i, c) { \
(c)=(uint8_t)(s)[--(i)]; \
if((c)>=0x80) { \
if(!U8_IS_SINGLE(c)) { \
(c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
} \
}

View file

@ -145,7 +145,22 @@
#ifndef __UTF_OLD_H__
#define __UTF_OLD_H__
#ifndef U_HIDE_DEPRECATED_API
/**
* \def U_HIDE_OBSOLETE_UTF_OLD_H
*
* Hides the obsolete definitions in unicode/utf_old.h.
* Recommended to be set to 1 at compile time to make sure
* the long-deprecated macros are no longer used.
*
* For reasons for the deprecation see the utf_old.h file comments.
*
* @internal
*/
#ifndef U_HIDE_OBSOLETE_UTF_OLD_H
# define U_HIDE_OBSOLETE_UTF_OLD_H 0
#endif
#if !defined(U_HIDE_DEPRECATED_API) && !U_HIDE_OBSOLETE_UTF_OLD_H
#include "unicode/utf.h"
#include "unicode/utf8.h"
@ -1184,7 +1199,6 @@ U_CFUNC U_IMPORT const uint8_t utf8_countTrailBytes[]; /* U_IMPORT2? */ /*U_I
*/
#define UTF_SET_CHAR_LIMIT(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length)
#endif /* U_HIDE_DEPRECATED_API */
#endif // !U_HIDE_DEPRECATED_API && !U_HIDE_OBSOLETE_UTF_OLD_H
#endif

View file

@ -502,7 +502,7 @@ spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
static inline int32_t
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
UChar32 c=*s;
if((int8_t)c>=0) {
if(U8_IS_SINGLE(c)) {
return set.contains(c) ? 1 : -1;
}
// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
@ -514,7 +514,7 @@ spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
static inline int32_t
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
UChar32 c=s[length-1];
if((int8_t)c>=0) {
if(U8_IS_SINGLE(c)) {
return set.contains(c) ? 1 : -1;
}
int32_t i=length-1;
@ -1006,11 +1006,9 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
// Try to match if the increment is not listed already.
// Match at code point boundaries. (The UTF-8 strings were converted
// from UTF-16 and are guaranteed to be well-formed.)
if( !U8_IS_TRAIL(s[pos-overlap]) &&
!offsets.containsOffset(inc) &&
matches8(s+pos-overlap, s8, length8)
) {
if(!U8_IS_TRAIL(s[pos-overlap]) &&
!offsets.containsOffset(inc) &&
matches8(s+pos-overlap, s8, length8)) {
if(inc==rest) {
return length; // Reached the end of the string.
}
@ -1052,11 +1050,10 @@ int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpa
// Try to match if the string is longer or starts earlier.
// Match at code point boundaries. (The UTF-8 strings were converted
// from UTF-16 and are guaranteed to be well-formed.)
if( !U8_IS_TRAIL(s[pos-overlap]) &&
(overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
matches8(s+pos-overlap, s8, length8)
) {
if(!U8_IS_TRAIL(s[pos-overlap]) &&
(overlap>maxOverlap ||
/* redundant overlap==maxOverlap && */ inc>maxInc) &&
matches8(s+pos-overlap, s8, length8)) {
maxInc=inc; // Longest match from earliest start.
maxOverlap=overlap;
break;

View file

@ -18,6 +18,7 @@
#define __USTR_IMP_H__
#include "unicode/utypes.h"
#include "unicode/utf8.h"
/**
* Internal option for unorm_cmpEquivFold() for strncmp style.
@ -25,11 +26,6 @@
*/
#define _STRNCMP_STYLE 0x1000
/**
* Internal option for string transformation functions to not first reset the Edits object.
*/
#define U_EDITS_NO_RESET 0x2000
/**
* Compare two strings in code point order or code unit order.
* Works in strcmp style (both lengths -1),
@ -86,4 +82,62 @@ u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorC
U_CAPI int32_t U_EXPORT2
u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
/**
* Length in bytes of the whole valid UTF-8 sequence that a lead byte announces.
* An ASCII byte (0..0x7f) is a complete one-byte sequence and yields 1.
* A byte that cannot start a valid sequence (0x80..0xc1, 0xf5..0xff) yields 0.
* The argument may be evaluated more than once; do not pass an expression
* with side effects.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @return 0..4
*/
#define U8_COUNT_BYTES(leadByte) \
    (!U8_IS_SINGLE(leadByte) ? U8_COUNT_BYTES_NON_ASCII(leadByte) : 1)
/**
* Length in bytes of the whole valid multi-byte UTF-8 sequence that a lead
* byte announces. Unlike U8_COUNT_BYTES(), ASCII bytes are not recognized here:
* every byte that is not a multi-byte lead byte (0x00..0xc1, 0xf5..0xff) yields 0.
* The argument may be evaluated more than once; do not pass an expression
* with side effects.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @return 0 or 2..4
*/
#define U8_COUNT_BYTES_NON_ASCII(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : \
        2+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
#ifdef __cplusplus
U_NAMESPACE_BEGIN
class UTF8 {
public:
    UTF8() = delete;  // static helpers only; never instantiated

    /**
     * Checks whether t is acceptable as the i-th byte after the lead byte
     * of a UTF-8 sequence of the given length.
     *
     * Only the first trail byte of a 3- or 4-byte sequence depends on the
     * lead byte; every other trail byte just has to be in the trail byte range.
     *
     * @param prev Must be the preceding lead byte if i==1 and length>=3;
     *             otherwise ignored.
     * @param t The i-th byte following the lead byte.
     * @param i The index (1..3) of byte t in the byte sequence. 0<i<length
     * @param length The length (2..4) of the byte sequence according to the lead byte.
     * @return TRUE if t is a valid trail byte in this context.
     */
    static inline UBool isValidTrail(int32_t prev, uint8_t t, int32_t i, int32_t length) {
        // Second or later trail byte, or the only trail byte of a 2-byte
        // sequence: no dependency on the lead byte.
        if (i > 1 || length <= 2) {
            return U8_IS_TRAIL(t);
        }
        // First trail byte of a 3- or 4-byte sequence:
        // validate it together with its lead byte.
        return length == 3 ? U8_IS_VALID_LEAD3_AND_T1(prev, t)
                           : U8_IS_VALID_LEAD4_AND_T1(prev, t);
    }
};
U_NAMESPACE_END
#endif // __cplusplus
#endif

View file

@ -43,28 +43,28 @@ U_NAMESPACE_BEGIN
class WholeStringBreakIterator : public BreakIterator {
public:
WholeStringBreakIterator() : BreakIterator(), length(0) {}
~WholeStringBreakIterator() override;
UBool operator==(const BreakIterator&) const override;
BreakIterator *clone() const override;
~WholeStringBreakIterator() U_OVERRIDE;
UBool operator==(const BreakIterator&) const U_OVERRIDE;
BreakIterator *clone() const U_OVERRIDE;
static UClassID U_EXPORT2 getStaticClassID();
UClassID getDynamicClassID() const override;
CharacterIterator &getText() const override;
UText *getUText(UText *fillIn, UErrorCode &errorCode) const override;
void setText(const UnicodeString &text) override;
void setText(UText *text, UErrorCode &errorCode) override;
void adoptText(CharacterIterator* it) override;
int32_t first() override;
int32_t last() override;
int32_t previous() override;
int32_t next() override;
int32_t current() const override;
int32_t following(int32_t offset) override;
int32_t preceding(int32_t offset) override;
UBool isBoundary(int32_t offset) override;
int32_t next(int32_t n) override;
UClassID getDynamicClassID() const U_OVERRIDE;
CharacterIterator &getText() const U_OVERRIDE;
UText *getUText(UText *fillIn, UErrorCode &errorCode) const U_OVERRIDE;
void setText(const UnicodeString &text) U_OVERRIDE;
void setText(UText *text, UErrorCode &errorCode) U_OVERRIDE;
void adoptText(CharacterIterator* it) U_OVERRIDE;
int32_t first() U_OVERRIDE;
int32_t last() U_OVERRIDE;
int32_t previous() U_OVERRIDE;
int32_t next() U_OVERRIDE;
int32_t current() const U_OVERRIDE;
int32_t following(int32_t offset) U_OVERRIDE;
int32_t preceding(int32_t offset) U_OVERRIDE;
UBool isBoundary(int32_t offset) U_OVERRIDE;
int32_t next(int32_t n) U_OVERRIDE;
BreakIterator *createBufferClone(void *stackBuffer, int32_t &BufferSize,
UErrorCode &errorCode) override;
BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) override;
UErrorCode &errorCode) U_OVERRIDE;
BreakIterator &refreshInputText(UText *input, UErrorCode &errorCode) U_OVERRIDE;
private:
int32_t length;

View file

@ -24,6 +24,7 @@
#include "unicode/brkiter.h"
#include "unicode/casemap.h"
#include "unicode/edits.h"
#include "unicode/stringoptions.h"
#include "unicode/ustring.h"
#include "unicode/ucasemap.h"
#include "unicode/ubrk.h"
@ -72,9 +73,9 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
/* (not) original code point */
if(edits!=NULL) {
edits->addUnchanged(cpLength);
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
}
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
c=~result;
if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
@ -149,9 +150,9 @@ appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
if(length>0) {
if(edits!=NULL) {
edits->addUnchanged(length);
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
}
if(options & U_OMIT_UNCHANGED_TEXT) {
return destIndex;
}
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
@ -933,8 +934,10 @@ int32_t toUpper(uint32_t options,
}
}
UBool change = TRUE;
if (edits != NULL) {
UBool change;
if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
change = TRUE; // common, simple usage
} else {
// Find out first whether we are changing the text.
change = src[i] != upper || numYpogegrammeni > 0;
int32_t i2 = i + 1;

View file

@ -256,152 +256,6 @@ u_strToUTF32(UChar32 *dest,
pErrorCode);
}
/*
* Smallest code point value accepted for a sequence with a given number of
* trail bytes; indexed by the trail byte count (0..3).
* utf8_nextCharSafeBodyTerminated() and utf8_nextCharSafeBodyPointer()
* treat a decoded value below the entry for its trail byte count as illegal.
*/
static const UChar32
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
/*
* Version of utf8_nextCharSafeBody() with the following differences:
* - checks for NUL termination instead of length
* - works with pointers instead of indexes
* - always strict (strict==-1)
*
* *ps points to after the lead byte and will be moved to after the last trail byte.
* c is the lead byte.
*
* A NUL (or any other non-trail byte) in a trail byte position fails the
* trail byte range check, so no byte after it is read.
* For an ill-formed sequence, *ps is moved past at most `count` consecutive
* trail bytes following the lead byte. For count==0 the function returns
* early and leaves *ps unchanged.
*
* @param ps in/out: read position, just after the lead byte
* @param c the lead byte
* @return the code point, or U_SENTINEL
*/
static UChar32
utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
const uint8_t *s=*ps;
uint8_t trail, illegal=0;
/* number of trail bytes that the lead byte calls for */
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
U_ASSERT(count<6);
/* NOTE(review): presumably reduces c to the payload bits of the lead byte -- confirm in utf8.h */
U8_MASK_LEAD_BYTE((c), count);
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
switch(count) {
/* cases 3, 2, 1 fall through to the next one unless an error is detected */
case 5:
case 4:
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
illegal=1;
break;
case 3:
/* a trail byte 0x80..0xbf becomes 0..0x3f; any other byte, including NUL, becomes >0x3f */
trail=(uint8_t)(*s++ - 0x80);
c=(c<<6)|trail;
if(trail>0x3f || c>=0x110) {
/* not a trail byte, or code point>0x10ffff (outside Unicode) */
illegal=1;
break;
}
U_FALLTHROUGH;
case 2:
trail=(uint8_t)(*s++ - 0x80);
if(trail>0x3f) {
/* not a trail byte */
illegal=1;
break;
}
c=(c<<6)|trail;
U_FALLTHROUGH;
case 1:
trail=(uint8_t)(*s++ - 0x80);
if(trail>0x3f) {
/* not a trail byte */
illegal=1;
}
/* harmless if illegal: c is replaced with U_SENTINEL below */
c=(c<<6)|trail;
break;
case 0:
/* not a lead byte: report the error without consuming anything */
return U_SENTINEL;
/* no default branch to optimize switch() - all values are covered */
}
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
/* illegal is also set if count>=4 */
/* also reject values below the minimum for this trail byte count, and surrogate code points */
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
/* error handling */
/* don't go beyond this sequence */
/* rescan from just after the lead byte; stops at the first non-trail byte (e.g. the NUL) */
s=*ps;
while(count>0 && U8_IS_TRAIL(*s)) {
++s;
--count;
}
c=U_SENTINEL;
}
*ps=s;
return c;
}
/*
* Version of utf8_nextCharSafeBody() with the following differences:
* - works with pointers instead of indexes
* - always strict (strict==-1)
*
* *ps points to after the lead byte and will be moved to after the last trail byte.
* c is the lead byte.
*
* Trail bytes are only read if all of the bytes that the lead byte calls for
* are available before limit; otherwise the sequence is treated as illegal.
* For an ill-formed sequence, *ps is moved past at most `count` consecutive
* trail bytes following the lead byte, never beyond limit. For count==0 the
* function returns early and leaves *ps unchanged.
*
* @param ps in/out: read position, just after the lead byte
* @param limit end of the input (exclusive)
* @param c the lead byte
* @return the code point, or U_SENTINEL
*/
static UChar32
utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
const uint8_t *s=*ps;
uint8_t trail, illegal=0;
/* number of trail bytes that the lead byte calls for */
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
/* enough input left for all of the expected trail bytes? */
if((limit-s)>=count) {
/* NOTE(review): presumably reduces c to the payload bits of the lead byte -- confirm in utf8.h */
U8_MASK_LEAD_BYTE((c), count);
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
switch(count) {
/* cases 3, 2, 1 fall through to the next one unless an error is detected */
case 5:
case 4:
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
illegal=1;
break;
case 3:
trail=*s++;
c=(c<<6)|(trail&0x3f);
if(c<0x110) {
/* nonzero unless the top two bits of trail are 10, i.e. unless it is a trail byte */
illegal|=(trail&0xc0)^0x80;
} else {
/* code point>0x10ffff, outside Unicode */
illegal=1;
break;
}
U_FALLTHROUGH;
case 2:
/* errors are accumulated in illegal; the remaining bytes are still read (they are before limit) */
trail=*s++;
c=(c<<6)|(trail&0x3f);
illegal|=(trail&0xc0)^0x80;
U_FALLTHROUGH;
case 1:
trail=*s++;
c=(c<<6)|(trail&0x3f);
illegal|=(trail&0xc0)^0x80;
break;
case 0:
/* not a lead byte: report the error without consuming anything */
return U_SENTINEL;
/* no default branch to optimize switch() - all values are covered */
}
} else {
illegal=1; /* too few bytes left */
}
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
/* illegal is also set if count>=4 */
/* guards the utf8_minLegal[count] lookup below: it is only reached when illegal==0 */
U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
/* also reject values below the minimum for this trail byte count, and surrogate code points */
if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
/* error handling */
/* don't go beyond this sequence */
/* rescan from just after the lead byte; stops at limit or at the first non-trail byte */
s=*ps;
while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
++s;
--count;
}
c=U_SENTINEL;
}
*ps=s;
return c;
}
U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar *dest,
int32_t destCapacity,
@ -410,19 +264,10 @@ u_strFromUTF8WithSub(UChar *dest,
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode){
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
UChar32 ch;
int32_t reqLength = 0;
const uint8_t* pSrc = (const uint8_t*) src;
uint8_t t1, t2; /* trail bytes */
int32_t numSubstitutions;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
(destCapacity<0) || (dest == NULL && destCapacity > 0) ||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
@ -434,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest,
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
numSubstitutions=0;
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
int32_t reqLength = 0;
int32_t numSubstitutions=0;
/*
* Inline processing of UTF-8 byte sequences:
@ -455,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest,
* The code explicitly checks for NULs only in the lead byte position.
* A NUL byte in the trail byte position fails the trail byte range check anyway.
*/
while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
if(ch <= 0x7f){
*pDest++=(UChar)ch;
++pSrc;
int32_t i;
UChar32 c;
for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
// modified copy of U8_NEXT()
++i;
if(U8_IS_SINGLE(c)) {
*pDest++=(UChar)c;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
continue;
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(ch<=0xFFFF) {
*(pDest++)=(UChar)ch;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
(__t1=src[i]-0x80)<=0x3f) {
*pDest++ = (((c)&0x1f)<<6)|__t1;
++(i);
} else {
*(pDest++)=U16_LEAD(ch);
if(pDest<pDestLimit) {
*(pDest++)=U16_TRAIL(ch);
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(c<=0xFFFF) {
*(pDest++)=(UChar)c;
} else {
reqLength++;
break;
*(pDest++)=U16_LEAD(c);
if(pDest<pDestLimit) {
*(pDest++)=U16_TRAIL(c);
} else {
reqLength++;
break;
}
}
}
}
}
/* Pre-flight the rest of the string. */
while((ch = *pSrc) != 0) {
if(ch <= 0x7f){
while((c = (uint8_t)src[i]) != 0) {
// modified copy of U8_NEXT()
++i;
if(U8_IS_SINGLE(c)) {
++reqLength;
++pSrc;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
) {
++reqLength;
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
) {
++reqLength;
pSrc += 2;
continue;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
++reqLength;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
(__t1=src[i]-0x80)<=0x3f) {
++reqLength;
++(i);
} else {
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}
reqLength += U16_LENGTH(c);
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}
reqLength += U16_LENGTH(ch);
}
}
} else /* srcLength >= 0 */ {
const uint8_t *pSrcLimit = pSrc + srcLength;
int32_t count;
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
int32_t i = 0;
UChar32 c;
for(;;) {
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
@ -551,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest,
* For supplementary code points (4 & 2), which are rare,
* there is an additional adjustment.
*/
count = (int32_t)(pDestLimit - pDest);
srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
if(count > srcLength) {
count = srcLength; /* min(remaining dest, remaining src/3) */
int32_t count = (int32_t)(pDestLimit - pDest);
int32_t count2 = (srcLength - i) / 3;
if(count > count2) {
count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
@ -565,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest,
}
do {
ch = *pSrc;
if(ch <= 0x7f){
*pDest++=(UChar)ch;
++pSrc;
// modified copy of U8_NEXT()
c = (uint8_t)src[i++];
if(U8_IS_SINGLE(c)) {
*pDest++=(UChar)c;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
continue;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
((i)+1)<srcLength &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
((i)!=srcLength) &&
(__t1=src[i]-0x80)<=0x3f) {
*pDest++ = (((c)&0x1f)<<6)|__t1;
++(i);
} else {
if(c >= 0xf0 || subchar > 0xffff) {
// We may read up to four bytes and write up to two UChars,
// which we didn't account for with computing count,
// so we adjust it here.
if(--count == 0) {
--i; // back out byte c
break;
}
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
continue;
}
}
if(ch >= 0xf0 || subchar > 0xffff) {
/*
* We may read up to six bytes and write up to two UChars,
* which we didn't account for with computing count,
* so we adjust it here.
*/
if(--count == 0) {
break;
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(c<=0xFFFF) {
*(pDest++)=(UChar)c;
} else {
*(pDest++)=U16_LEAD(c);
*(pDest++)=U16_TRAIL(c);
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}else if(ch<=0xFFFF){
*(pDest++)=(UChar)ch;
}else{
*(pDest++)=U16_LEAD(ch);
*(pDest++)=U16_TRAIL(ch);
}
}
} while(--count > 0);
}
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
ch = *pSrc;
if(ch <= 0x7f){
*pDest++=(UChar)ch;
++pSrc;
while(i < srcLength && (pDest < pDestLimit)) {
// modified copy of U8_NEXT()
c = (uint8_t)src[i++];
if(U8_IS_SINGLE(c)) {
*pDest++=(UChar)c;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
((pSrcLimit - pSrc) >= 3) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
((pSrcLimit - pSrc) >= 2) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
continue;
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}else if(ch<=0xFFFF){
*(pDest++)=(UChar)ch;
}else{
*(pDest++)=U16_LEAD(ch);
if(pDest<pDestLimit){
*(pDest++)=U16_TRAIL(ch);
}else{
reqLength++;
break;
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
((i)+1)<srcLength &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
*pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
((i)!=srcLength) &&
(__t1=src[i]-0x80)<=0x3f) {
*pDest++ = (((c)&0x1f)<<6)|__t1;
++(i);
} else {
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
} else if(c<=0xFFFF) {
*(pDest++)=(UChar)c;
} else {
*(pDest++)=U16_LEAD(c);
if(pDest<pDestLimit) {
*(pDest++)=U16_TRAIL(c);
} else {
reqLength++;
break;
}
}
}
}
}
/* do not fill the dest buffer just count the UChars needed */
while(pSrc < pSrcLimit){
ch = *pSrc;
if(ch <= 0x7f){
reqLength++;
++pSrc;
} else {
if(ch > 0xe0) {
if( /* handle U+1000..U+CFFF inline */
ch <= 0xec &&
((pSrcLimit - pSrc) >= 3) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 3;
continue;
}
} else if(ch < 0xe0) {
if( /* handle U+0080..U+07FF inline */
ch >= 0xc2 &&
((pSrcLimit - pSrc) >= 2) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 2;
continue;
}
}
/* function call for "complicated" and error cases */
++pSrc; /* continue after the lead byte */
ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
/* Pre-flight the rest of the string. */
while(i < srcLength) {
// modified copy of U8_NEXT()
c = (uint8_t)src[i++];
if(U8_IS_SINGLE(c)) {
++reqLength;
} else {
uint8_t __t1, __t2;
if( /* handle U+0800..U+FFFF inline */
(0xe0<=(c) && (c)<0xf0) &&
((i)+1)<srcLength &&
U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
(__t2=src[(i)+1]-0x80)<=0x3f) {
++reqLength;
i+=2;
} else if( /* handle U+0080..U+07FF inline */
((c)<0xe0 && (c)>=0xc2) &&
((i)!=srcLength) &&
(__t1=src[i]-0x80)<=0x3f) {
++reqLength;
++(i);
} else {
/* function call for "complicated" and error cases */
(c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
if(c<0 && (++numSubstitutions, c = subchar) < 0) {
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}
reqLength += U16_LENGTH(c);
}
reqLength+=U16_LENGTH(ch);
}
}
}
@ -753,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest,
uint8_t* pSrc = (uint8_t*) src;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)){
return NULL;
}
@ -994,7 +804,7 @@ u_strToUTF8WithSub(char *dest,
int32_t numSubstitutions;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)){
return NULL;
}
@ -1266,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub(
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode) {
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
UChar32 ch;
int32_t reqLength = 0;
const uint8_t* pSrc = (const uint8_t*) src;
const uint8_t *pSrcLimit;
int32_t count;
uint8_t t1, t2; /* trail bytes */
int32_t numSubstitutions;
/* args check */
if(U_FAILURE(*pErrorCode)){
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
@ -1291,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub(
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
numSubstitutions=0;
UChar *pDest = dest;
UChar *pDestLimit = dest+destCapacity;
int32_t reqLength = 0;
int32_t numSubstitutions=0;
if(srcLength < 0) {
/*
* Transform a NUL-terminated ASCII string.
* Handle non-ASCII strings with slower code.
*/
while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
*pDest++=(UChar)ch;
++pSrc;
UChar32 c;
while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
*pDest++=(UChar)c;
++src;
}
if(ch == 0) {
if(c == 0) {
reqLength=(int32_t)(pDest - dest);
if(pDestLength) {
*pDestLength = reqLength;
@ -1312,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub(
u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
return dest;
}
srcLength = static_cast<int32_t>(uprv_strlen((const char *)pSrc));
srcLength = static_cast<int32_t>(uprv_strlen(src));
}
/* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
/* Faster loop without ongoing checking for srcLength and pDestLimit. */
UChar32 ch;
uint8_t t1, t2;
int32_t i = 0;
for(;;) {
count = (int32_t)(pDestLimit - pDest);
srcLength = (int32_t)(pSrcLimit - pSrc);
if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
int32_t count = (int32_t)(pDestLimit - pDest);
int32_t count2 = srcLength - i;
if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
/* fast ASCII loop */
const uint8_t *prevSrc = pSrc;
int32_t delta;
while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
*pDest++=(UChar)ch;
++pSrc;
int32_t start = i;
uint8_t b;
while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
*pDest++=b;
++i;
}
delta = (int32_t)(pSrc - prevSrc);
int32_t delta = i - start;
count -= delta;
srcLength -= delta;
count2 -= delta;
}
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
* bytes and one UChar.
*/
srcLength /= 3;
if(count > srcLength) {
count = srcLength; /* min(remaining dest, remaining src/3) */
if(subchar > 0xFFFF) {
break;
}
count2 /= 3;
if(count > count2) {
count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
@ -1348,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub(
break;
}
do {
ch = *pSrc;
if(ch <= 0x7f){
ch = (uint8_t)src[i++];
if(U8_IS_SINGLE(ch)) {
*pDest++=(UChar)ch;
++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
++i;
continue;
}
}
@ -1383,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub(
* We need to write two UChars, adjusted count for that,
* and ran out of space.
*/
--i; // back out byte ch
break;
} else {
/* function call for error cases */
++pSrc; /* continue after the lead byte */
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
if(subchar<=0xFFFF) {
*(pDest++)=(UChar)subchar;
} else {
*(pDest++)=U16_LEAD(subchar);
*(pDest++)=U16_TRAIL(subchar);
}
*(pDest++)=(UChar)subchar;
}
}
} while(--count > 0);
}
while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
ch = *pSrc;
if(ch <= 0x7f){
while(i < srcLength && (pDest < pDestLimit)) {
ch = (uint8_t)src[i++];
if(U8_IS_SINGLE(ch)){
*pDest++=(UChar)ch;
++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
((pSrcLimit - pSrc) >= 3) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
(i+1) < srcLength &&
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
(t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
pSrc += 3;
i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
((pSrcLimit - pSrc) >= 2) &&
(t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
i < srcLength &&
(t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
pSrc += 2;
++i;
continue;
}
}
@ -1435,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub(
return NULL;
} else {
/* function call for error cases */
++pSrc; /* continue after the lead byte */
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
if(subchar<=0xFFFF) {
*(pDest++)=(UChar)subchar;
@ -1453,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub(
}
}
/* do not fill the dest buffer just count the UChars needed */
while(pSrc < pSrcLimit){
ch = *pSrc;
if(ch <= 0x7f) {
/* Pre-flight the rest of the string. */
while(i < srcLength) {
ch = (uint8_t)src[i++];
if(U8_IS_SINGLE(ch)) {
reqLength++;
++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
((pSrcLimit - pSrc) >= 3) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
(uint8_t)(pSrc[2] - 0x80) <= 0x3f
(i+1) < srcLength &&
(uint8_t)(src[i] - 0x80) <= 0x3f &&
(uint8_t)(src[i+1] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 3;
i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
((pSrcLimit - pSrc) >= 2) &&
(uint8_t)(pSrc[1] - 0x80) <= 0x3f
i < srcLength &&
(uint8_t)(src[i] - 0x80) <= 0x3f
) {
reqLength++;
pSrc += 2;
++i;
continue;
}
}
@ -1488,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub(
return NULL;
} else {
/* function call for error cases */
++pSrc; /* continue after the lead byte */
utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
reqLength+=U16_LENGTH(ch);
}

View file

@ -847,15 +847,11 @@ U_CDECL_END
//------------------------------------------------------------------------------
// Chunk size.
// Must be less than 42 (256/6), because of byte mapping from UChar indexes to native indexes.
// Worst case there are six UTF-8 bytes per UChar.
// obsolete 6 byte form fd + 5 trails maps to fffd
// obsolete 5 byte form fc + 4 trails maps to fffd
// non-shortest 4 byte forms maps to fffd
// normal supplementaries map to a pair of utf-16, two utf8 bytes per utf-16 unit
// mapToUChars array size must allow for the worst case, 6.
// This could be brought down to 4, by treating fd and fc as pure illegal,
// rather than obsolete lead bytes. But that is not compatible with the utf-8 access macros.
// Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
// Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
// to two UChars.)
// The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
// is a three-byte sequence (truncated four-byte sequence).
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
@ -895,7 +891,7 @@ struct UTF8Buf {
// Requires two extra slots,
// one for a supplementary starting in the last normal position,
// and one for an entry for the buffer limit position.
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*6+6]; // Map native offset from bufNativeStart to
uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
// corresponding offset in filled part of buf.
int32_t align;
};

View file

@ -7,7 +7,7 @@
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utf_impl.c
* file name: utf_impl.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
@ -27,7 +27,6 @@
#include "unicode/utypes.h"
#include "unicode/utf.h"
#include "unicode/utf8.h"
#include "unicode/utf_old.h"
#include "uassert.h"
/*
@ -55,10 +54,6 @@
* - SUB AX, BX (result)
* -finish:
* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
*
* In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
* lead bytes above 0xf4 are illegal.
* We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
*/
extern "C" U_EXPORT const uint8_t
utf8_countTrailBytes[256]={
@ -77,24 +72,24 @@ utf8_countTrailBytes[256]={
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// illegal C0 & C1
// 2-byte lead bytes C2..DF
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// 3-byte lead bytes E0..EF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3,
3, 3, 3, /* illegal in Unicode */
4, 4, 4, 4, /* illegal in Unicode */
5, 5, /* illegal in Unicode */
0, 0 /* illegal bytes 0xfe and 0xff */
// 4-byte lead bytes F0..F4
// illegal F5..FF
3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static const UChar32
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
static const UChar32
utf8_errorValue[6]={
UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
0x3ffffff, 0x7fffffff
// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
// but without relying on the obsolete unicode/utf_old.h.
0x15, 0x9f, 0xffff,
0x10ffff
};
static UChar32
@ -134,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
*/
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
// *pi is one after byte c.
int32_t i=*pi;
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
if(i+count<=length || length<0) {
uint8_t trail;
U8_MASK_LEAD_BYTE(c, count);
/* support NUL-terminated strings: do not read beyond the first non-trail byte */
switch(count) {
/* each branch falls through to the next one */
case 0:
/* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
case 5:
case 4:
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
break;
case 3:
trail=s[i++]-0x80;
c=(c<<6)|trail;
/* c>=0x110 would result in code point>0x10ffff, outside Unicode */
if(c>=0x110 || trail>0x3f) { break; }
U_FALLTHROUGH;
case 2:
trail=s[i++]-0x80;
c=(c<<6)|trail;
/*
* test for a surrogate d800..dfff unless we are lenient:
* before the last (c<<6), a surrogate is c=360..37f
*/
if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
U_FALLTHROUGH;
case 1:
trail=s[i++]-0x80;
c=(c<<6)|trail;
if(trail>0x3f) { break; }
/* correct sequence - all trail bytes have (b7..b6)==(10) */
if(c>=utf8_minLegal[count] &&
/* strict: forbid non-characters like U+fffe */
(strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
// length can be negative for NUL-terminated strings: Read and validate one byte at a time.
if(i==length || c>0xf4) {
// end of string, or not a lead byte
} else if(c>=0xf0) {
// Test for 4-byte sequences first because
// U8_NEXT() handles shorter valid sequences inline.
uint8_t t1=s[i], t2, t3;
c&=7;
if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f &&
++i!=length && (t3=s[i]-0x80)<=0x3f) {
++i;
c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
/* no default branch to optimize switch() - all values are covered */
}
} else {
/* too few bytes left */
count=length-i;
}
} else if(c>=0xe0) {
c&=0xf;
if(strict!=-2) {
uint8_t t1=s[i], t2;
if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
++i;
c=(c<<12)|((t1&0x3f)<<6)|t2;
// strict: forbid non-characters like U+fffe
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
}
} else {
// strict=-2 -> lenient: allow surrogates
uint8_t t1=s[i]-0x80, t2;
if(t1<=0x3f && (c>0 || t1>=0x20) &&
++i!=length && (t2=s[i]-0x80)<=0x3f) {
*pi=i+1;
return (c<<12)|(t1<<6)|t2;
}
}
} else if(c>=0xc2) {
uint8_t t1=s[i]-0x80;
if(t1<=0x3f) {
*pi=i+1;
return ((c-0xc0)<<6)|t1;
}
} // else 0x80<=c<0xc2 is not a lead byte
/* error handling */
i=*pi;
while(count>0 && U8_IS_TRAIL(s[i])) {
++i;
--count;
}
c=errorValue(i-*pi, strict);
*pi=i;
return c;
@ -232,7 +225,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
s+=i;
offset=0;
c=utf8_errorValue[length-1];
UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
U8_APPEND_UNSAFE(s, offset, c);
i=i+offset;
}
}
@ -241,99 +234,99 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
// *pi is the index of byte c.
int32_t i=*pi;
uint8_t b, count=1, shift=6;
if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
/* extract value bits from the last trail byte */
c&=0x3f;
for(;;) {
if(i<=start) {
/* no lead byte at all */
return errorValue(0, strict);
}
/* read another previous byte */
b=s[--i];
if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
if(b&0x40) {
/* lead byte, this will always end the loop */
uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
if(count==shouldCount) {
/* set the new position */
*pi=i;
U8_MASK_LEAD_BYTE(b, count);
c|=(UChar32)b<<shift;
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
/* illegal sequence or (strict and non-character) */
if(count>=4) {
count=3;
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
if(0xc2<=b1 && b1<0xe0) {
*pi=i;
return ((b1-0xc0)<<6)|(c&0x3f);
} else if(U8_IS_TRAIL(b1) && i>start) {
// Extract the value bits from the last trail byte.
c&=0x3f;
uint8_t b2=s[--i];
if(0xe0<=b2 && b2<0xf0) {
b2&=0xf;
if(strict!=-2) {
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
*pi=i;
c=(b2<<12)|((b1&0x3f)<<6)|c;
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
return c;
} else {
// strict: forbid non-characters like U+fffe
return errorValue(2, strict);
}
c=errorValue(count, strict);
} else {
/* exit with correct c */
}
} else {
/* the lead byte does not match the number of trail bytes */
/* only set the position to the lead byte if it would
include the trail byte that we started with */
if(count<shouldCount) {
// strict=-2 -> lenient: allow surrogates
b1-=0x80;
if((b2>0 || b1>=0x20)) {
*pi=i;
c=errorValue(count, strict);
} else {
c=errorValue(0, strict);
return (b2<<12)|(b1<<6)|c;
}
}
break;
} else if(count<5) {
/* trail byte */
c|=(UChar32)(b&0x3f)<<shift;
++count;
shift+=6;
} else {
/* more than 5 trail bytes is illegal */
c=errorValue(0, strict);
break;
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
if(0xf0<=b3 && b3<=0xf4) {
b3&=7;
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
*pi=i;
c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
return c;
} else {
// strict: forbid non-characters like U+fffe
return errorValue(3, strict);
}
}
}
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
// Truncated 4-byte sequence.
*pi=i;
return errorValue(2, strict);
}
} else {
/* single-byte character precedes trailing bytes */
c=errorValue(0, strict);
break;
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
// Truncated 3- or 4-byte sequence.
*pi=i;
return errorValue(1, strict);
}
}
return c;
return errorValue(0, strict);
}
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
/* i had been decremented once before the function call */
int32_t I=i, Z;
uint8_t b;
/* read at most the 6 bytes s[Z] to s[i], inclusively */
if(I-5>start) {
Z=I-5;
} else {
Z=start;
}
/* return I if the sequence starting there is long enough to include i */
do {
b=s[I];
if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
break;
} else if(b>=0xc0) {
if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
return I;
} else {
break;
// Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
int32_t orig_i=i;
uint8_t c=s[i];
if(U8_IS_TRAIL(c) && i>start) {
uint8_t b1=s[--i];
if(0xc2<=b1 && b1<0xe0) {
return i;
} else if(U8_IS_TRAIL(b1) && i>start) {
uint8_t b2=s[--i];
if(0xe0<=b2 && b2<0xf0) {
if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
return i;
}
} else if(U8_IS_TRAIL(b2) && i>start) {
uint8_t b3=s[--i];
if(0xf0<=b3 && b3<=0xf4) {
if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
return i;
}
}
} else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
// Truncated 4-byte sequence.
return i;
}
} else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
(0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
// Truncated 3- or 4-byte sequence.
return i;
}
} while(Z<=--I);
/* return i itself to be consistent with the FWD_1 macro */
return i;
}
return orig_i;
}

View file

@ -20,6 +20,7 @@
#define __UTRIE2_H__
#include "unicode/utypes.h"
#include "unicode/utf8.h"
#include "putilimp.h"
#include "udataswp.h"
@ -54,6 +55,8 @@ typedef struct UTrie UTrie;
* is truncated, omitting both the BMP portion and the high range.
* - There is a special small index for 2-byte UTF-8, and the initial data
* entries are designed for fast 1/2-byte UTF-8 lookup.
* Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,
* and the associated 2-byte indexes are unused.
*/
/**
@ -933,29 +936,29 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
/** Internal UTF-8 next-post-increment: get the next code point's data. */
#define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) { \
uint8_t __lead=(uint8_t)*(src)++; \
if(__lead<0xc0) { \
if(U8_IS_SINGLE(__lead)) { \
(result)=(trie)->ascii[__lead]; \
} else { \
uint8_t __t1, __t2; \
if( /* handle U+0000..U+07FF inline */ \
__lead<0xe0 && (src)<(limit) && \
if( /* handle U+0800..U+FFFF inline */ \
0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \
U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
) { \
(src)+=2; \
(result)=(trie)->data[ \
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
<<UTRIE2_INDEX_SHIFT)+ \
(__t2&UTRIE2_DATA_MASK)]; \
} else if( /* handle U+0080..U+07FF inline */ \
__lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \
(__t1=(uint8_t)(*(src)-0x80))<=0x3f \
) { \
++(src); \
(result)=(trie)->data[ \
(trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \
__t1]; \
} else if( /* handle U+0000..U+CFFF inline */ \
__lead<0xed && ((src)+1)<(limit) && \
(__t1=(uint8_t)(*(src)-0x80))<=0x3f && (__lead>0xe0 || __t1>=0x20) && \
(__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \
) { \
(src)+=2; \
(result)=(trie)->data[ \
((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \
(__t1<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \
<<UTRIE2_INDEX_SHIFT)+ \
(__t2&UTRIE2_DATA_MASK)]; \
} else { \
int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \
(const uint8_t *)(limit)); \
@ -968,7 +971,7 @@ utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,
/** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */
#define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) { \
uint8_t __b=(uint8_t)*--(src); \
if(__b<0x80) { \
if(U8_IS_SINGLE(__b)) { \
(result)=(trie)->ascii[__b]; \
} else { \
int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \

View file

@ -1,4 +1,4 @@
#
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
@ -12,6 +12,8 @@
# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
!!quoted_literals_only;
#
# Character Class Definitions.
#
@ -78,42 +80,6 @@ $Prepend [^$Control $CR $LF];
## -------------------------------------------------
!!reverse;
$LF $CR;
($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);
# GB 9
($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed.
# GB 9a
$SpacingMark [^$Control $CR $LF];
# GB 9b
[^$Control $CR $LF] $Prepend;
# GB 10
$E_Modifier $Extend* ($E_Base | $E_Base_GAZ);
# GB 11 Don't break between ZWJ and Glue_After_ZWJ
($Extended_Pict | $EmojiNRK) $ZWJ $Extend* ($Extended_Pict | $EmojiNRK);
# GB 12-13. Going backwards, we must scan through any number of regional indicators as pairs.
#
[{bof} $Extend $ZWJ $SpacingMark] $Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)+ [{eof}[^$Regional_Indicator]];
[{bof} $Extend $ZWJ $SpacingMark] $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)+ [{eof}[^$Regional_Indicator]];
$Regional_Indicator $Regional_Indicator;
$Regional_Indicator $Prepend;
## -------------------------------------------------
!!safe_reverse;
$Regional_Indicator $Regional_Indicator;
($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
## -------------------------------------------------
!!safe_forward;
$Regional_Indicator $Regional_Indicator;
($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;

View file

@ -1,4 +1,4 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
@ -25,6 +25,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -334,209 +335,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
($ALPlus | $HL) $CM* ($PR | $PO);
($PR | $PO) $CM* ($ALPlus | $HL);
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
@ -544,7 +342,6 @@ $EM $CM* $EB;
# LB 9
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -571,19 +368,3 @@ $CM* ($HY | $BA) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,4 +1,4 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
@ -30,6 +30,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -343,220 +344,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 20.09 added rule for Finnish tailoring
$AL ($HY | $HH) / $SP;
# LB 21
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
($ALPlus | $HL) $CM* ($PR | $PO);
($PR | $PO) $CM* ($ALPlus | $HL);
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -582,20 +375,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,5 +1,6 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
#
@ -32,6 +33,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -345,212 +347,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
# Don't include $NSX here
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
# $IN $CM* $IN; # delete this rule for CSS loose
$IN $CM* $NU;
# LB 23
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
($ALPlus | $HL) $CM* ($PR | $PO);
($PR | $PO) $CM* ($ALPlus | $HL);
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
# Line Loose tailoring: Don't include NSX here.
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
@ -558,7 +354,6 @@ $EM $CM* $EB;
# LB 9
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -584,20 +379,3 @@ $CM* ($HY | $BA) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,4 +1,4 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
@ -39,6 +39,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -360,226 +361,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
# Do not include $EXX here
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
# Don't include $BAX or $NSX here
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a Don't break after Hebrew + Hyphen.
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
# $IN $CM* $IN; # delete this rule for CSS loose
$IN $CM* $NU;
# LB 23
# Do not include $POX here
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
# Do not include $PRX here
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
# Do not include $PRX here
($ALPlus | $HL) $CM* ($PR | $PO | $POX);
($PR | $PO | $POX) $CM* ($ALPlus | $HL);
# LB 25
# Here do not include $POX at the beginning or $PRX at the end
($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
# Do not include $POX or $PRX here
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
# Line Loose tailoring: Don't include NSX here.
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -605,20 +392,3 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,4 +1,4 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
@ -28,6 +28,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -345,215 +346,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 20.09 added rule for Finnish tailoring
$AL ($HY | $HH) / $SP;
# LB 21
# Don't include $NSX here
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
# $IN $CM* $IN; # delete this rule for CSS loose
$IN $CM* $NU;
# LB 23
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
($ALPlus | $HL) $CM* ($PR | $PO);
($PR | $PO) $CM* ($ALPlus | $HL);
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
# Line Loose tailoring: Don't include NSX here.
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
@ -561,7 +353,6 @@ $EM $CM* $EB;
# LB 9
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -587,20 +378,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,4 +1,4 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
@ -29,6 +29,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -338,217 +339,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
($ALPlus | $HL) $CM* ($PR | $PO);
($PR | $PO) $CM* ($ALPlus | $HL);
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -574,20 +370,3 @@ $CM* ($HY | $BA) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,4 +1,4 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
@ -30,6 +30,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -344,219 +345,12 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
# Don't include $NSX here
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 21
# Don't include $BAX or $NSX here
($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a Don't break after Hebrew + Hyphen.
([^$CB] $CM*)? ($HY | $BA | $BAX) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
($ALPlus | $HL) $CM* ($PR | $PO);
($PR | $PO) $CM* ($ALPlus | $HL);
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
# LB 9
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
^$CM+ $SP / .;
# LB 14
$SP+ $CM* $OP;
@ -582,20 +376,3 @@ $CM* ($HY | $BA | $BAX) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $BAX $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,4 +1,4 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2002-2016 International Business Machines Corporation and
# others. All Rights Reserved.
@ -28,6 +28,7 @@
#
!!chain;
!!quoted_literals_only;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -341,213 +342,6 @@ $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;
#
# Reverse Rules.
#
## -------------------------------------------------
!!reverse;
# LB 9 Combining Marks.
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM?;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
# LB14 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
# LB 4, 5, 6
$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 7 x SP
# x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 8 ZW SP* <break>
# TODO: to implement this, we need more than one look-ahead hard break in play at a time.
# Requires an engine enhancement.
# / $SP* $ZW
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
#
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
# LB 9,10 Combining marks.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
^$CM+ $CAN_CM;
# LB 11
#
$WJ $CM* $CAN_CM;
$WJ [$LB8NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CAN_CM $CM* $WJ;
# LB 12a
# [^SP BA HY] x GL
#
$GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HH $HY]];
# LB 12
# GL x
#
$CANT_CM $CM* $GL;
$CAN_CM $CM* $GL;
# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];
# LB 14 OP SP* x
#
. $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
# LB 15
$OP $SP* $CM* $QU;
# LB 16
$NS $SP* $CM* ($CL | $CP);
# LB 17
$B2 $SP* $CM* $B2;
# LB 18 break after spaces
# Nothing explicit needed here.
#
# LB 19
#
$QU $CM* $CAN_CM; # . x QU
$QU $LB18NonBreaks;
$CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
#
# LB 20 Break before and after CB.
# nothing needed here.
#
# LB 20.09 added rule for Finnish tailoring
$AL ($HY | $HH) / $SP;
# LB 21
($BA | $HH | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
[$LB20NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB21a
[^$CB] $CM* ($HY | $BA | $HH) $CM* $HL;
# LB21b (reverse)
$HL $CM* $SY;
# LB 22
$IN $CM* ($ALPlus | $HL);
$IN $CM* $EX;
$IN $CM* ($ID | $EB | $EM);
$IN $CM* $IN;
$IN $CM* $NU;
# LB 23
$NU $CM* ($ALPlus | $HL);
($ALPlus | $HL) $CM* $NU;
# LB23a
($ID | $EB | $EM) $CM* $PR;
$PO $CM* ($ID | $EB | $EM);
# LB 24
($ALPlus | $HL) $CM* ($PR | $PO);
($PR | $PO) $CM* ($ALPlus | $HL);
# LB 25
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
# LB 26
($H3 | $H2 | $JV | $JL) $CM* $JL;
($JT | $JV) $CM* ($H2 | $JV);
$JT $CM* ($H3 | $JT);
# LB 27
$IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 28
($ALPlus | $HL) $CM* ($ALPlus | $HL);
# LB 29
($ALPlus | $HL) $CM* $IS;
# LB 30
$OP $CM* ($ALPlus | $HL | $NU);
($ALPlus | $HL | $NU) $CM* $CP;
# LB 30a
# Pairs of Regional Indicators.
# The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
# the second with an even number. Stripping away the cruft they look like
# [^RI] RI / (RI RI)+ ^RI;
# [^RI] RI RI / (RI RI)+ ^RI;
#
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
[{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
# In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
$RI $CM* $RI;
# WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
$RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EM $CM* $EB;
## -------------------------------------------------
!!safe_reverse;
@ -580,20 +374,3 @@ $CM* ($HY | $BA | $HH) $CM* $HL;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
$dictionary $dictionary;

View file

@ -1,6 +1,5 @@
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
@ -12,6 +11,7 @@
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
#
!!quoted_literals_only;
#
# Character categories as defined in TR 29
@ -85,22 +85,13 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
## -------------------------------------------------
!!reverse;
!!safe_reverse;
$SpEx_R = ($Extend | $Format)* $Sp;
$ATermEx_R = ($Extend | $Format)* $ATerm;
$STermEx_R = ($Extend | $Format)* $STerm;
$CloseEx_R = ($Extend | $Format)* $Close;
#
# Reverse rules.
# For now, use the old style inexact reverse rules, which are easier
# to write, but less efficient.
# TODO: exact reverse rules. It appears that exact reverse rules
# may require improving support for look-ahead breaks in the
# builder. Needs more investigation.
#
[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
#.*;
@ -112,9 +103,9 @@ $CloseEx_R = ($Extend | $Format)* $Close;
# The preceding $Sep, which will be the second one that the rule matches.
# Any immediately preceding STerm or ATerm sequences. We need to see these
# to get the correct rule status when moving forwards again.
#
#
# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
# the entire string.
# the entire string. TODO: can bof be replaced with ^
#
# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be
# at the beginning of the string at this point, and we don't want to fail.

View file

@ -1,6 +1,6 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
#
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
@ -12,6 +12,7 @@
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
#
!!quoted_literals_only;
#
# Character categories as defined in TR 29
@ -85,7 +86,7 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
## -------------------------------------------------
!!reverse;
!!safe_reverse;
$SpEx_R = ($Extend | $Format)* $Sp;
$ATermEx_R = ($Extend | $Format)* $ATerm;
@ -102,7 +103,6 @@ $CloseEx_R = ($Extend | $Format)* $Close;
#
[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
#.*;
# Explanation for this rule:
#
@ -112,7 +112,7 @@ $CloseEx_R = ($Extend | $Format)* $Close;
# The preceding $Sep, which will be the second one that the rule matches.
# Any immediately preceding STerm or ATerm sequences. We need to see these
# to get the correct rule status when moving forwards again.
#
#
# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
# the entire string.
#

View file

@ -1,5 +1,5 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2002-2015, International Business Machines Corporation and
# others. All Rights Reserved.
@ -7,6 +7,7 @@
# Title Casing Break Rules
#
!!quoted_literals_only;
$CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
$Cased = [[:Upper_Case:][:Lower_Case:][:Lt:] - $CaseIgnorable];
@ -27,19 +28,6 @@ $NotCased = [[^ $Cased] - $CaseIgnorable];
$Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
# Reverse Rules
!!reverse;
# Normal Rule, will work nearly universally, so long as there is a
# start-of-word preceding the current iteration position.
($NotCased | $CaseIgnorable)* ($Cased | $CaseIgnorable)* $Cased;
# Short rule, will be effective only when moving to the start of text,
# with no word (cased character) preceding the current iteration position.
($NotCased | $CaseIgnorable)*;
!!safe_reverse;
# Safe Reverse: the exact forward rule must not start in the middle
@ -47,10 +35,3 @@ $Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
# leaving it just before the start of a word.
($Cased | $CaseIgnorable)*;
!!safe_forward;
# Safe Forward, nothing needs to be done, the exact Reverse rules will
# always find valid boundaries from any starting position.
# Still, some rule is needed, so '.', a one character movement.
.;

View file

@ -1,7 +1,7 @@
#
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
@ -22,6 +22,7 @@
##############################################################################
!!chain;
!!quoted_literals_only;
#
@ -194,95 +195,6 @@ $HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
## -------------------------------------------------
!!reverse;
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# rule 3
$LF $CR;
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
($Extended_Pict | $EmojiNRK) $ZWJ;
# rule 4
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
# rule 5
($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
# rule 6 and 7
($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
# rule 7a
$BackSingle_QuoteEx $BackHebrew_LetterEx;
# Rule 7b and 7c
$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
# rule 8
$BackNumericEx $BackNumericEx;
# rule 9
$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
# rule 10
($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
# rule 11 and 12
$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
# rule 13
$BackKatakanaEx $BackKatakanaEx;
# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
# rule 14
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
# rule 15 - 17
# Pairs of Regional Indicators stay together.
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
## -------------------------------------------------
!!safe_reverse;
@ -291,39 +203,17 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
# rule 7b
$Double_Quote $BackHebrew_LetterEx;
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
# rule 11
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
# rule 13c
$BackRegional_IndicatorEx*;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# rule 4
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
# rule 7b
$Double_QuoteEx $Hebrew_LetterEx;
# rule 11
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
# rule 13c
$Regional_IndicatorEx*;
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# For dictionary-based break
$dictionary $dictionary;

View file

@ -1,7 +1,7 @@
#
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word_POSIX.txt
@ -22,6 +22,7 @@
##############################################################################
!!chain;
!!quoted_literals_only;
#
@ -62,7 +63,7 @@ $Hiragana = [:Hiragana:];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$Control = [\p{Grapheme_Cluster_Break = Control}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
@ -74,7 +75,7 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rules 4 Ignore Format and Extend characters,
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
@ -154,7 +155,7 @@ $NumericEx $NumericEx {100};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
@ -191,96 +192,7 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b)
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
## -------------------------------------------------
!!reverse;
$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# rule 3
$LF $CR;
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
($Extended_Pict | $EmojiNRK) $ZWJ;
# rule 4
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
# rule 5
($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
# rule 6 and 7
($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
# rule 7a
$BackSingle_QuoteEx $BackHebrew_LetterEx;
# Rule 7b and 7c
$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
# rule 8
$BackNumericEx $BackNumericEx;
# rule 9
$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
# rule 10
($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
# rule 11 and 12
$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
# rule 13
$BackKatakanaEx $BackKatakanaEx;
# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
# rule 14
$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
# rule 15 - 17
# Pairs of Regional Indicators stay together.
^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
## -------------------------------------------------
@ -291,39 +203,17 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
# rule 7b
$Double_Quote $BackHebrew_LetterEx;
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
# rule 11
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
# rule 13c
$BackRegional_IndicatorEx*;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
# rule 4
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
# rule 7b
$Double_QuoteEx $Hebrew_LetterEx;
# rule 11
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
# rule 13c
$Regional_IndicatorEx*;
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
# For dictionary-based break
$dictionary $dictionary;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -201,7 +201,7 @@ tzdbNames{
"meta:China"{
sd{"CDT"}
ss{"CST"}
parseRegions{"CN", "MO", "TW"}
parseRegions{"CN", "MO"}
}
"meta:Choibalsan"{
sd{"CHOST"}
@ -562,6 +562,10 @@ tzdbNames{
"meta:Ponape"{
ss{"PONT"}
}
"meta:Pyongyang"{
ss{"KST"}
parseRegions{"KP"}
}
"meta:Qyzylorda"{
sd{"QYZST"}
ss{"QYZT"}
@ -617,6 +621,7 @@ tzdbNames{
"meta:Taipei"{
sd{"CDT"}
ss{"CST"}
parseRegions{"TW"}
}
"meta:Tajikistan"{
ss{"TJT"}

View file

@ -31,9 +31,13 @@
static const UChar TARGET_SEP = 45; // '-'
static const UChar VARIANT_SEP = 47; // '/'
static const UChar ANY[] = {65,110,121,0}; // "Any"
static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any"
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-"
// initial size for an Any-XXXX transform's cache of script-XXXX transforms
// (will grow as necessary, but we don't expect to have source text with more than 7 scripts)
#define ANY_TRANS_CACHE_INIT_SIZE 7
//------------------------------------------------------------
@ -186,7 +190,7 @@ AnyTransliterator::AnyTransliterator(const UnicodeString& id,
Transliterator(id, NULL),
targetScript(theTargetScript)
{
cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
if (U_FAILURE(ec)) {
return;
}
@ -212,7 +216,7 @@ AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
{
// Don't copy the cache contents
UErrorCode ec = U_ZERO_ERROR;
cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec);
if (U_FAILURE(ec)) {
return;
}
@ -286,7 +290,7 @@ Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
}
if (t == NULL) {
UErrorCode ec = U_ZERO_ERROR;
UnicodeString sourceName(uscript_getName(source), -1, US_INV);
UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
UnicodeString id(sourceName);
id.append(TARGET_SEP).append(target);

View file

@ -8,12 +8,12 @@
*
* File CALENDAR.CPP
*
* Modification History:
* Modification History:
*
* Date Name Description
* 02/03/97 clhuang Creation.
* 04/22/97 aliu Cleaned up, fixed memory leak, made
* setWeekCountData() more robust.
* 04/22/97 aliu Cleaned up, fixed memory leak, made
* setWeekCountData() more robust.
* Moved platform code to TPlatformUtilities.
* 05/01/97 aliu Made equals(), before(), after() arguments const.
* 05/20/97 aliu Changed logic of when to compute fields and time
@ -26,7 +26,7 @@
*******************************************************************************
*/
#include "utypeinfo.h" // for 'typeid' to work
#include "utypeinfo.h" // for 'typeid' to work
#include "unicode/utypes.h"
@ -66,10 +66,8 @@
#if !UCONFIG_NO_SERVICE
static icu::ICULocaleService* gService = NULL;
static icu::UInitOnce gServiceInitOnce = U_INITONCE_INITIALIZER;
#endif
// INTERNAL - for cleanup
U_CDECL_BEGIN
static UBool calendar_cleanup(void) {
#if !UCONFIG_NO_SERVICE
@ -82,6 +80,7 @@ static UBool calendar_cleanup(void) {
return TRUE;
}
U_CDECL_END
#endif
// ------------------------------------------
//
@ -93,9 +92,9 @@ U_CDECL_END
#if defined( U_DEBUG_CALSVC ) || defined (U_DEBUG_CAL)
/**
* fldName was removed as a duplicate implementation.
* use udbg_ services instead,
/**
* fldName was removed as a duplicate implementation.
* use udbg_ services instead,
* which depend on include files and library from ../tools/toolutil, the following circular link:
* CPPFLAGS+=-I$(top_srcdir)/tools/toolutil
* LIBS+=$(LIBICUTOOLUTIL)
@ -123,7 +122,7 @@ void ucal_dump(const Calendar &cal) {
void Calendar::dump() const {
int i;
fprintf(stderr, "@calendar=%s, timeset=%c, fieldset=%c, allfields=%c, virtualset=%c, t=%.2f",
getType(), fIsTimeSet?'y':'n', fAreFieldsSet?'y':'n', fAreAllFieldsSet?'y':'n',
getType(), fIsTimeSet?'y':'n', fAreFieldsSet?'y':'n', fAreAllFieldsSet?'y':'n',
fAreFieldsVirtuallySet?'y':'n',
fTime);
@ -135,9 +134,9 @@ void Calendar::dump() const {
fprintf(stderr, " %25s: %-11ld", f, fFields[i]);
if(fStamp[i] == kUnset) {
fprintf(stderr, " (unset) ");
} else if(fStamp[i] == kInternallySet) {
} else if(fStamp[i] == kInternallySet) {
fprintf(stderr, " (internally set) ");
//} else if(fStamp[i] == kInternalDefault) {
//} else if(fStamp[i] == kInternalDefault) {
// fprintf(stderr, " (internal default) ");
} else {
fprintf(stderr, " %%%d ", fStamp[i]);
@ -213,7 +212,7 @@ const SharedCalendar *LocaleCacheKey<SharedCalendar>::createObject(
const void * /*unusedCreationContext*/, UErrorCode &status) const {
Calendar *calendar = Calendar::makeInstance(fLoc, status);
if (U_FAILURE(status)) {
return NULL;
return NULL;
}
SharedCalendar *shared = new SharedCalendar(calendar);
if (shared == NULL) {
@ -234,7 +233,9 @@ static ECalType getCalendarType(const char *s) {
return CALTYPE_UNKNOWN;
}
static UBool isStandardSupportedKeyword(const char *keyword, UErrorCode& status) {
#if !UCONFIG_NO_SERVICE
// Only used with service registration.
static UBool isStandardSupportedKeyword(const char *keyword, UErrorCode& status) {
if(U_FAILURE(status)) {
return FALSE;
}
@ -242,6 +243,7 @@ static UBool isStandardSupportedKeyword(const char *keyword, UErrorCode& status)
return (calType != CALTYPE_UNKNOWN);
}
// only used with service registration.
static void getCalendarKeyword(const UnicodeString &id, char *targetBuffer, int32_t targetBufferSize) {
UnicodeString calendarKeyword = UNICODE_STRING_SIMPLE("calendar=");
int32_t calKeyLen = calendarKeyword.length();
@ -255,6 +257,7 @@ static void getCalendarKeyword(const UnicodeString &id, char *targetBuffer, int3
}
targetBuffer[keyLen] = 0;
}
#endif
static ECalType getCalendarTypeForLocale(const char *locid) {
UErrorCode status = U_ZERO_ERROR;
@ -291,7 +294,7 @@ static ECalType getCalendarTypeForLocale(const char *locid) {
if (U_FAILURE(status)) {
return CALTYPE_GREGORIAN;
}
// Read preferred calendar values from supplementalData calendarPreference
UResourceBundle *rb = ures_openDirect(NULL, "supplementalData", &status);
ures_getByKey(rb, "calendarPreferenceData", rb, &status);
@ -394,7 +397,7 @@ static Calendar *createStandardCalendar(ECalType calType, const Locale &loc, UEr
// -------------------------------------
/**
* a Calendar Factory which creates the "basic" calendar types, that is, those
* a Calendar Factory which creates the "basic" calendar types, that is, those
* shipped with ICU.
*/
class BasicCalendarFactory : public LocaleKeyFactory {
@ -408,7 +411,7 @@ public:
virtual ~BasicCalendarFactory();
protected:
//virtual UBool isSupportedID( const UnicodeString& id, UErrorCode& status) const {
//virtual UBool isSupportedID( const UnicodeString& id, UErrorCode& status) const {
// if(U_FAILURE(status)) {
// return FALSE;
// }
@ -466,7 +469,7 @@ protected:
BasicCalendarFactory::~BasicCalendarFactory() {}
/**
/**
* A factory which looks up the DefaultCalendar resource to determine which class of calendar to use
*/
@ -510,7 +513,7 @@ public:
virtual UObject* cloneInstance(UObject* instance) const {
UnicodeString *s = dynamic_cast<UnicodeString *>(instance);
if(s != NULL) {
return s->clone();
return s->clone();
} else {
#ifdef U_DEBUG_CALSVC_F
UErrorCode status2 = U_ZERO_ERROR;
@ -573,7 +576,7 @@ initCalendarService(UErrorCode &status)
fprintf(stderr, "Registering classes..\n");
#endif
// Register all basic instances.
// Register all basic instances.
gService->registerFactory(new BasicCalendarFactory(),status);
#ifdef U_DEBUG_CALSVC
@ -589,7 +592,7 @@ initCalendarService(UErrorCode &status)
}
}
static ICULocaleService*
static ICULocaleService*
getCalendarService(UErrorCode &status)
{
umtx_initOnce(gServiceInitOnce, &initCalendarService, status);
@ -743,7 +746,7 @@ fSkippedWallTime(UCAL_WALLTIME_LAST)
return;
}
clear();
clear();
fZone = zone;
setWeekData(aLocale, NULL, success);
}
@ -850,7 +853,7 @@ Calendar::createInstance(const Locale& aLocale, UErrorCode& success)
return createInstance(TimeZone::createDefault(), aLocale, success);
}
// ------------------------------------- Adopting
// ------------------------------------- Adopting
// Note: this is the bottleneck that actually calls the service routines.
@ -903,7 +906,7 @@ Calendar::makeInstance(const Locale& aLocale, UErrorCode& success) {
c = (Calendar*)getCalendarService(success)->get(l, LocaleKey::KIND_ANY, &actualLoc2, success);
if(U_FAILURE(success) || !c) {
if(U_SUCCESS(success)) {
if(U_SUCCESS(success)) {
success = U_INTERNAL_PROGRAM_ERROR; // Propagate some err
}
return NULL;
@ -911,7 +914,7 @@ Calendar::makeInstance(const Locale& aLocale, UErrorCode& success) {
str = dynamic_cast<const UnicodeString*>(c);
if(str != NULL) {
// recursed! Second lookup returned a UnicodeString.
// recursed! Second lookup returned a UnicodeString.
// Perhaps DefaultCalendar{} was set to another locale.
#ifdef U_DEBUG_CALSVC
char tmp[200];
@ -985,7 +988,7 @@ Calendar::createInstance(const TimeZone& zone, const Locale& aLocale, UErrorCode
if(U_SUCCESS(success) && c) {
c->setTimeZone(zone);
}
return c;
return c;
}
// -------------------------------------
@ -1017,7 +1020,7 @@ Calendar::operator==(const Calendar& that) const
U_SUCCESS(status);
}
UBool
UBool
Calendar::isEquivalentTo(const Calendar& other) const
{
return typeid(*this) == typeid(other) &&
@ -1099,13 +1102,13 @@ Calendar::getNow()
* Gets this Calendar's current time as a double.
* @return the current time as UTC milliseconds from the epoch.
*/
double
double
Calendar::getTimeInMillis(UErrorCode& status) const
{
if(U_FAILURE(status))
if(U_FAILURE(status))
return 0.0;
if ( ! fIsTimeSet)
if ( ! fIsTimeSet)
((Calendar*)this)->updateTime(status);
/* Test for buffer overflows */
@ -1124,9 +1127,9 @@ Calendar::getTimeInMillis(UErrorCode& status) const
* when in lenient mode the out of range values are pinned to their respective min/max.
* @param millis the new time in UTC milliseconds from the epoch.
*/
void
void
Calendar::setTimeInMillis( double millis, UErrorCode& status ) {
if(U_FAILURE(status))
if(U_FAILURE(status))
return;
if (millis > MAX_MILLIS) {
@ -1154,7 +1157,7 @@ Calendar::setTimeInMillis( double millis, UErrorCode& status ) {
fStamp[i] = kUnset;
fIsSet[i] = FALSE;
}
}
@ -1479,7 +1482,7 @@ void Calendar::computeFields(UErrorCode &ec)
double localMillis = internalGetTime();
int32_t rawOffset, dstOffset;
getTimeZone().getOffset(localMillis, FALSE, rawOffset, dstOffset, ec);
localMillis += (rawOffset + dstOffset);
localMillis += (rawOffset + dstOffset);
// Mark fields as set. Do this before calling handleComputeFields().
uint32_t mask = //fInternalSetMask;
@ -1488,7 +1491,7 @@ void Calendar::computeFields(UErrorCode &ec)
(1 << UCAL_MONTH) |
(1 << UCAL_DAY_OF_MONTH) | // = UCAL_DATE
(1 << UCAL_DAY_OF_YEAR) |
(1 << UCAL_EXTENDED_YEAR);
(1 << UCAL_EXTENDED_YEAR);
for (int32_t i=0; i<UCAL_FIELD_COUNT; ++i) {
if ((mask & 1) == 0) {
@ -1517,7 +1520,7 @@ void Calendar::computeFields(UErrorCode &ec)
#if defined (U_DEBUG_CAL)
//fprintf(stderr, "%s:%d- Hmm! Jules @ %d, as per %.0lf millis\n",
//__FILE__, __LINE__, fFields[UCAL_JULIAN_DAY], localMillis);
#endif
#endif
computeGregorianAndDOWFields(fFields[UCAL_JULIAN_DAY], ec);
@ -1615,7 +1618,7 @@ void Calendar::computeGregorianFields(int32_t julianDay, UErrorCode & /* ec */)
* proleptic Gregorian calendar, which has no field larger than a year.
*/
void Calendar::computeWeekFields(UErrorCode &ec) {
if(U_FAILURE(ec)) {
if(U_FAILURE(ec)) {
return;
}
int32_t eyear = fFields[UCAL_EXTENDED_YEAR];
@ -1678,7 +1681,7 @@ void Calendar::computeWeekFields(UErrorCode &ec) {
fFields[UCAL_WEEK_OF_MONTH] = weekNumber(dayOfMonth, dayOfWeek);
fFields[UCAL_DAY_OF_WEEK_IN_MONTH] = (dayOfMonth-1) / 7 + 1;
#if defined (U_DEBUG_CAL)
if(fFields[UCAL_DAY_OF_WEEK_IN_MONTH]==0) fprintf(stderr, "%s:%d: DOWIM %d on %g\n",
if(fFields[UCAL_DAY_OF_WEEK_IN_MONTH]==0) fprintf(stderr, "%s:%d: DOWIM %d on %g\n",
__FILE__, __LINE__,fFields[UCAL_DAY_OF_WEEK_IN_MONTH], fTime);
#endif
}
@ -1723,7 +1726,7 @@ void Calendar::handleComputeFields(int32_t /* julianDay */, UErrorCode &/* statu
// -------------------------------------
// Overload taking an EDateFields value: casts the field to
// UCalendarDateFields and forwards to the UCalendarDateFields overload of
// roll(), passing amount and status through unchanged.
void Calendar::roll(EDateFields field, int32_t amount, UErrorCode& status)
void Calendar::roll(EDateFields field, int32_t amount, UErrorCode& status)
{
roll((UCalendarDateFields)field, amount, status);
}
@ -2061,7 +2064,7 @@ void Calendar::roll(UCalendarDateFields field, int32_t amount, UErrorCode& statu
default:
// Other fields cannot be rolled by this method
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d: ILLEGAL ARG because of roll on non-rollable field %s\n",
fprintf(stderr, "%s:%d: ILLEGAL ARG because of roll on non-rollable field %s\n",
__FILE__, __LINE__,fldName(field));
#endif
status = U_ILLEGAL_ARGUMENT_ERROR;
@ -2252,7 +2255,7 @@ void Calendar::add(UCalendarDateFields field, int32_t amount, UErrorCode& status
}
}
}
}
}
}
// -------------------------------------
@ -2617,7 +2620,7 @@ Calendar::isWeekend(void) const
// ------------------------------------- limits
// Overload taking an EDateFields value: casts the field to
// UCalendarDateFields and returns getLimit() for UCAL_LIMIT_MINIMUM.
int32_t
int32_t
Calendar::getMinimum(EDateFields field) const {
return getLimit((UCalendarDateFields) field,UCAL_LIMIT_MINIMUM);
}
@ -2668,7 +2671,7 @@ Calendar::getLeastMaximum(UCalendarDateFields field) const
}
// -------------------------------------
int32_t
int32_t
Calendar::getActualMinimum(EDateFields field, UErrorCode& status) const
{
return getActualMinimum((UCalendarDateFields) field, status);
@ -2744,7 +2747,7 @@ Calendar::getActualMinimum(UCalendarDateFields field, UErrorCode& status) const
work->set(field, fieldValue);
if (work->get(field, status) != fieldValue) {
break;
}
}
else {
result = fieldValue;
fieldValue--;
@ -2800,7 +2803,7 @@ void Calendar::validateField(UCalendarDateFields field, UErrorCode &status) {
case UCAL_DAY_OF_WEEK_IN_MONTH:
if (internalGet(field) == 0) {
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d: ILLEGAL ARG because DOW in month cannot be 0\n",
fprintf(stderr, "%s:%d: ILLEGAL ARG because DOW in month cannot be 0\n",
__FILE__, __LINE__);
#endif
status = U_ILLEGAL_ARGUMENT_ERROR; // "DAY_OF_WEEK_IN_MONTH cannot be zero"
@ -2826,7 +2829,7 @@ void Calendar::validateField(UCalendarDateFields field, int32_t min, int32_t max
int32_t value = fFields[field];
if (value < min || value > max) {
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d: ILLEGAL ARG because of field %s out of range %d..%d at %d\n",
fprintf(stderr, "%s:%d: ILLEGAL ARG because of field %s out of range %d..%d at %d\n",
__FILE__, __LINE__,fldName(field),min,max,value);
#endif
status = U_ILLEGAL_ARGUMENT_ERROR;
@ -2892,7 +2895,7 @@ linesInGroup:
}
const UFieldResolutionTable Calendar::kDatePrecedence[] =
{
{
{
{ UCAL_DAY_OF_MONTH, kResolveSTOP },
{ UCAL_WEEK_OF_YEAR, UCAL_DAY_OF_WEEK, kResolveSTOP },
@ -2913,12 +2916,12 @@ const UFieldResolutionTable Calendar::kDatePrecedence[] =
{ kResolveRemap | UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DAY_OF_WEEK, kResolveSTOP },
{ kResolveRemap | UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DOW_LOCAL, kResolveSTOP },
{ kResolveSTOP }
},
},
{{kResolveSTOP}}
};
const UFieldResolutionTable Calendar::kDOWPrecedence[] =
const UFieldResolutionTable Calendar::kDOWPrecedence[] =
{
{
{ UCAL_DAY_OF_WEEK,kResolveSTOP, kResolveSTOP },
@ -2929,7 +2932,7 @@ const UFieldResolutionTable Calendar::kDOWPrecedence[] =
};
// precedence for calculating a year
const UFieldResolutionTable Calendar::kYearPrecedence[] =
const UFieldResolutionTable Calendar::kYearPrecedence[] =
{
{
{ UCAL_YEAR, kResolveSTOP },
@ -2966,7 +2969,7 @@ void Calendar::computeTime(UErrorCode& status) {
// }
#endif
int32_t millisInDay;
double millisInDay;
// We only use MILLISECONDS_IN_DAY if it has been set by the user.
// This makes it possible for the caller to set the calendar to a
@ -3086,10 +3089,10 @@ UBool Calendar::getImmediatePreviousZoneTransition(UDate base, UDate *transition
* reflects local zone wall time.
* @stable ICU 2.0
*/
int32_t Calendar::computeMillisInDay() {
double Calendar::computeMillisInDay() {
// Do the time portion of the conversion.
int32_t millisInDay = 0;
double millisInDay = 0;
// Find the best set of fields specifying the time of day. There
// are only two possibilities here; the HOUR_OF_DAY or the
@ -3131,7 +3134,7 @@ int32_t Calendar::computeMillisInDay() {
* or range.
* @stable ICU 2.0
*/
int32_t Calendar::computeZoneOffset(double millis, int32_t millisInDay, UErrorCode &ec) {
int32_t Calendar::computeZoneOffset(double millis, double millisInDay, UErrorCode &ec) {
int32_t rawOffset, dstOffset;
UDate wall = millis + millisInDay;
BasicTimeZone* btz = getBasicTimeZone();
@ -3178,7 +3181,7 @@ int32_t Calendar::computeZoneOffset(double millis, int32_t millisInDay, UErrorCo
return rawOffset + dstOffset;
}
int32_t Calendar::computeJulianDay()
int32_t Calendar::computeJulianDay()
{
// We want to see if any of the date fields is newer than the
// JULIAN_DAY. If not, then we use JULIAN_DAY. If so, then we do
@ -3220,9 +3223,9 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
internalSet(UCAL_EXTENDED_YEAR, year);
}
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d: bestField= %s - y=%d\n", __FILE__, __LINE__, fldName(bestField), year);
#endif
#endif
// Get the Julian day of the day BEFORE the start of this year.
// If useMonth is true, get the day before the start of the month.
@ -3304,9 +3307,9 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
date += ((monthLength - date) / 7 + dim + 1) * 7;
}
} else {
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d - bf= %s\n", __FILE__, __LINE__, fldName(bestField));
#endif
#endif
if(bestField == UCAL_WEEK_OF_YEAR) { // ------------------------------------- WOY -------------
if(!isSet(UCAL_YEAR_WOY) || // YWOY not set at all or
@ -3317,30 +3320,30 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
int32_t woy = internalGet(bestField);
int32_t nextJulianDay = handleComputeMonthStart(year+1, 0, FALSE); // jd of day before jan 1
int32_t nextFirst = julianDayToDayOfWeek(nextJulianDay + 1) - firstDayOfWeek;
int32_t nextFirst = julianDayToDayOfWeek(nextJulianDay + 1) - firstDayOfWeek;
if (nextFirst < 0) { // 0..6 ldow of Jan 1
nextFirst += 7;
}
if(woy==1) { // FIRST WEEK ---------------------------------
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d - woy=%d, yp=%d, nj(%d)=%d, nf=%d", __FILE__, __LINE__,
internalGet(bestField), resolveFields(kYearPrecedence), year+1,
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d - woy=%d, yp=%d, nj(%d)=%d, nf=%d", __FILE__, __LINE__,
internalGet(bestField), resolveFields(kYearPrecedence), year+1,
nextJulianDay, nextFirst);
fprintf(stderr, " next: %d DFW, min=%d \n", (7-nextFirst), getMinimalDaysInFirstWeek() );
#endif
#endif
// nextFirst is now the localized DOW of Jan 1 of y-woy+1
if((nextFirst > 0) && // Jan 1 starts on FDOW
(7-nextFirst) >= getMinimalDaysInFirstWeek()) // or enough days in the week
{
// Jan 1 of (yearWoy+1) is in yearWoy+1 - recalculate JD to next year
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d - was going to move JD from %d to %d [d%d]\n", __FILE__, __LINE__,
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d - was going to move JD from %d to %d [d%d]\n", __FILE__, __LINE__,
julianDay, nextJulianDay, (nextJulianDay-julianDay));
#endif
#endif
julianDay = nextJulianDay;
// recalculate 'first' [0-based local dow of jan 1]
@ -3351,7 +3354,7 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
// recalculate date.
date = 1 - first + dowLocal;
}
} else if(woy>=getLeastMaximum(bestField)) {
} else if(woy>=getLeastMaximum(bestField)) {
// could be in the last week- find out if this JD would overstep
int32_t testDate = date;
if ((7 - first) < getMinimalDaysInFirstWeek()) {
@ -3361,7 +3364,7 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
// Now adjust for the week number.
testDate += 7 * (woy - 1);
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d - y=%d, y-1=%d doy%d, njd%d (C.F. %d)\n",
__FILE__, __LINE__, year, year-1, testDate, julianDay+testDate, nextJulianDay);
#endif
@ -3375,7 +3378,7 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
}
date = 1 - first + dowLocal;
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "%s:%d - date now %d, jd%d, ywoy%d\n",
__FILE__, __LINE__, date, julianDay, year-1);
#endif
@ -3400,13 +3403,13 @@ int32_t Calendar::handleComputeJulianDay(UCalendarDateFields bestField) {
}
// Always returns 0 in this implementation; the extended-year argument is
// unused (its name is commented out in the signature).
int32_t
Calendar::getDefaultMonthInYear(int32_t /*eyear*/)
Calendar::getDefaultMonthInYear(int32_t /*eyear*/)
{
return 0;
}
// Always returns 1 in this implementation; both the extended-year and month
// arguments are unused (their names are commented out in the signature).
int32_t
Calendar::getDefaultDayInMonth(int32_t /*eyear*/, int32_t /*month*/)
Calendar::getDefaultDayInMonth(int32_t /*eyear*/, int32_t /*month*/)
{
return 1;
}
@ -3436,13 +3439,13 @@ int32_t Calendar::getLocalDOW()
int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t woy)
{
// We have UCAL_YEAR_WOY and UCAL_WEEK_OF_YEAR - from those, determine
// We have UCAL_YEAR_WOY and UCAL_WEEK_OF_YEAR - from those, determine
// what year we fall in, so that other code can set it properly.
// (code borrowed from computeWeekFields and handleComputeJulianDay)
//return yearWoy;
// First, we need a reliable DOW.
UCalendarDateFields bestField = resolveFields(kDatePrecedence); // !! Note: if subclasses have a different table, they should override handleGetExtendedYearFromWeekFields
UCalendarDateFields bestField = resolveFields(kDatePrecedence); // !! Note: if subclasses have a different table, they should override handleGetExtendedYearFromWeekFields
// Now, a local DOW
int32_t dowLocal = getLocalDOW(); // 0..6
@ -3475,9 +3478,9 @@ int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t w
int32_t minDays = getMinimalDaysInFirstWeek();
UBool jan1InPrevYear = FALSE; // January 1st in the year of WOY is the 1st week? (i.e. first week is < minimal )
//UBool nextJan1InPrevYear = FALSE; // January 1st of Year of WOY + 1 is in the first week?
//UBool nextJan1InPrevYear = FALSE; // January 1st of Year of WOY + 1 is in the first week?
if((7 - first) < minDays) {
if((7 - first) < minDays) {
jan1InPrevYear = TRUE;
}
@ -3500,8 +3503,8 @@ int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t w
return yearWoy; // in this year
}
}
} else if(woy >= getLeastMaximum(bestField)) {
// we _might_ be in the last week..
} else if(woy >= getLeastMaximum(bestField)) {
// we _might_ be in the last week..
int32_t jd = // Calculate JD of our target day:
jan1Start + // JD of Jan 1
(7-first) + // days in the first week (Jan 1.. )
@ -3538,7 +3541,7 @@ int32_t Calendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t w
}
//(internalGet(UCAL_DATE) <= (7-first)) /* && in minDow */ ) {
//within 1st week and in this month..
//within 1st week and in this month..
//return yearWoy+1;
return yearWoy;
@ -3671,7 +3674,7 @@ void Calendar::prepareGetActual(UCalendarDateFields field, UBool isMinimum, UErr
dow += 7;
}
}
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "prepareGetActualHelper(WOM/WOY) - dow=%d\n", dow);
#endif
set(UCAL_DAY_OF_WEEK, dow);
@ -3687,7 +3690,7 @@ void Calendar::prepareGetActual(UCalendarDateFields field, UBool isMinimum, UErr
int32_t Calendar::getActualHelper(UCalendarDateFields field, int32_t startValue, int32_t endValue, UErrorCode &status) const
{
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "getActualHelper(%d,%d .. %d, %s)\n", field, startValue, endValue, u_errorName(status));
#endif
if (startValue == endValue) {
@ -3723,7 +3726,7 @@ int32_t Calendar::getActualHelper(UCalendarDateFields field, int32_t startValue,
int32_t result = startValue;
if ((work->get(field, status) != startValue
&& field != UCAL_WEEK_OF_MONTH && delta > 0 ) || U_FAILURE(status)) {
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "getActualHelper(fld %d) - got %d (not %d) - %s\n", field, work->get(field,status), startValue, u_errorName(status));
#endif
} else {
@ -3740,7 +3743,7 @@ int32_t Calendar::getActualHelper(UCalendarDateFields field, int32_t startValue,
} while (startValue != endValue);
}
delete work;
#if defined (U_DEBUG_CAL)
#if defined (U_DEBUG_CAL)
fprintf(stderr, "getActualHelper(%d) = %d\n", field, result);
#endif
return result;
@ -3767,18 +3770,18 @@ Calendar::setWeekData(const Locale& desiredLocale, const char *type, UErrorCode&
// Since week and weekend data is territory based instead of language based,
// we may need to tweak the locale that we are using to try to get the appropriate
// values, using the following logic:
// 1). If the locale has a language but no territory, use the territory as defined by
// 1). If the locale has a language but no territory, use the territory as defined by
// the likely subtags.
// 2). If the locale has a script designation then we ignore it,
// then remove it ( i.e. "en_Latn_US" becomes "en_US" )
char minLocaleID[ULOC_FULLNAME_CAPACITY] = { 0 };
UErrorCode myStatus = U_ZERO_ERROR;
uloc_minimizeSubtags(desiredLocale.getName(),minLocaleID,ULOC_FULLNAME_CAPACITY,&myStatus);
Locale min = Locale::createFromName(minLocaleID);
Locale useLocale;
if ( uprv_strlen(desiredLocale.getCountry()) == 0 ||
if ( uprv_strlen(desiredLocale.getCountry()) == 0 ||
(uprv_strlen(desiredLocale.getScript()) > 0 && uprv_strlen(min.getScript()) == 0) ) {
char maxLocaleID[ULOC_FULLNAME_CAPACITY] = { 0 };
myStatus = U_ZERO_ERROR;
@ -3788,8 +3791,8 @@ Calendar::setWeekData(const Locale& desiredLocale, const char *type, UErrorCode&
} else {
useLocale = Locale(desiredLocale);
}
/* The code here is somewhat of a hack, since week data and weekend data aren't really tied to
/* The code here is somewhat of a hack, since week data and weekend data aren't really tied to
a specific calendar, they aren't truly locale data. But this is the only place where valid and
actual locale can be set, so we take a shot at it here by loading a representative resource
from the calendar data. The code used to use the dateTimeElements resource to get first day
@ -3865,8 +3868,8 @@ Calendar::setWeekData(const Locale& desiredLocale, const char *type, UErrorCode&
* and areFieldsSet. Callers should check isTimeSet and only
* call this method if isTimeSet is false.
*/
void
Calendar::updateTime(UErrorCode& status)
void
Calendar::updateTime(UErrorCode& status)
{
computeTime(status);
if(U_FAILURE(status))
@ -3875,14 +3878,14 @@ Calendar::updateTime(UErrorCode& status)
// If we are lenient, we need to recompute the fields to normalize
// the values. Also, if we haven't set all the fields yet (i.e.,
// in a newly-created object), we need to fill in the fields. [LIU]
if (isLenient() || ! fAreAllFieldsSet)
if (isLenient() || ! fAreAllFieldsSet)
fAreFieldsSet = FALSE;
fIsTimeSet = TRUE;
fAreFieldsVirtuallySet = FALSE;
}
Locale
Locale
Calendar::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
U_LOCALE_BASED(locBased, *this);
return locBased.getLocale(type, status);
@ -3945,4 +3948,3 @@ U_NAMESPACE_END
//eof

View file

@ -63,8 +63,10 @@
static icu::Locale* availableLocaleList = NULL;
static int32_t availableLocaleListCount;
#if !UCONFIG_NO_SERVICE
static icu::ICULocaleService* gService = NULL;
static icu::UInitOnce gServiceInitOnce = U_INITONCE_INITIALIZER;
#endif
static icu::UInitOnce gAvailableLocaleListInitOnce;
/**

View file

@ -224,7 +224,7 @@ CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
int32_t totalSize = indexesLength * 4;
if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
} else {
indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
}

View file

@ -607,7 +607,7 @@ CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) {
}
UBool firstTriple = TRUE;
for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) {
int32_t x = contractionCEs.elementAti(index);
int32_t x = static_cast<int32_t>(contractionCEs.elementAti(index));
if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; }
int64_t cce0 = contractionCEs.elementAti(index + 1);
int64_t cce1 = contractionCEs.elementAti(index + 2);

View file

@ -739,7 +739,7 @@ DateFormat::setBooleanAttribute(UDateFormatBooleanAttribute attr,
UBool
DateFormat::getBooleanAttribute(UDateFormatBooleanAttribute attr, UErrorCode &/*status*/) const {
return fBoolFlags.get(attr);
return static_cast<UBool>(fBoolFlags.get(attr));
}
U_NAMESPACE_END

View file

@ -386,7 +386,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromUInt32(decNumber *dn, uInt uin) {
*up=(Unit)(uin%(DECDPUNMAX+1));
uin=uin/(DECDPUNMAX+1);
}
dn->digits=decGetDigits(dn->lsu, up-dn->lsu);
dn->digits=decGetDigits(dn->lsu, static_cast<int32_t>(up - dn->lsu));
return dn;
} /* decNumberFromUInt32 */
@ -666,7 +666,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromString(decNumber *dn, const char
/* Handle decimal point... */
if (dotchar!=NULL && dotchar<last) /* non-trailing '.' found? */
exponent-=(last-dotchar); /* adjust exponent */
exponent -= static_cast<int32_t>(last-dotchar); /* adjust exponent */
/* [we can now ignore the .] */
/* OK, the digits string is good. Assemble in the decNumber, or in */
@ -866,7 +866,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberAnd(decNumber *res, const decNumber *
} /* both OK */
} /* each unit */
/* [here uc-1 is the msu of the result] */
res->digits=decGetDigits(res->lsu, uc-res->lsu);
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc - res->lsu));
res->exponent=0; /* integer */
res->bits=0; /* sign=0 */
return res; /* [no status to set] */
@ -1253,7 +1253,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberInvert(decNumber *res, const decNumbe
} /* each digit */
} /* each unit */
/* [here uc-1 is the msu of the result] */
res->digits=decGetDigits(res->lsu, uc-res->lsu);
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc - res->lsu));
res->exponent=0; /* integer */
res->bits=0; /* sign=0 */
return res; /* [no status to set] */
@ -1880,7 +1880,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberOr(decNumber *res, const decNumber *l
} /* non-zero */
} /* each unit */
/* [here uc-1 is the msu of the result] */
res->digits=decGetDigits(res->lsu, uc-res->lsu);
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc-res->lsu));
res->exponent=0; /* integer */
res->bits=0; /* sign=0 */
return res; /* [no status to set] */
@ -2586,7 +2586,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberRotate(decNumber *res, const decNumbe
} /* whole units to rotate */
/* the rotation may have left an undetermined number of zeros */
/* on the left, so true length needs to be calculated */
res->digits=decGetDigits(res->lsu, msumax-res->lsu+1);
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(msumax-res->lsu+1));
} /* rotate needed */
} /* rhs OK */
} /* numerics */
@ -3310,7 +3310,7 @@ U_CAPI decNumber * U_EXPORT2 uprv_decNumberXor(decNumber *res, const decNumber *
} /* non-zero */
} /* each unit */
/* [here uc-1 is the msu of the result] */
res->digits=decGetDigits(res->lsu, uc-res->lsu);
res->digits=decGetDigits(res->lsu, static_cast<int32_t>(uc-res->lsu));
res->exponent=0; /* integer */
res->bits=0; /* sign=0 */
return res; /* [no status to set] */
@ -5101,7 +5101,7 @@ static decNumber * decMultiplyOp(decNumber *res, const decNumber *lhs,
} /* p */
*up=(Unit)item; up++; /* [final needs no division] */
} /* lp */
accunits=up-acc; /* count of units */
accunits = static_cast<int32_t>(up-acc); /* count of units */
}
else { /* here to use units directly, without chunking ['old code'] */
#endif
@ -6587,11 +6587,11 @@ static Int decUnitAddSub(const Unit *a, Int alength,
/* OK, all A and B processed; might still have carry or borrow */
/* return number of Units in the result, negated if a borrow */
if (carry==0) return c-clsu; /* no carry, so no more to do */
if (carry==0) return static_cast<int32_t>(c-clsu); /* no carry, so no more to do */
if (carry>0) { /* positive carry */
*c=(Unit)carry; /* place as new unit */
c++; /* .. */
return c-clsu;
return static_cast<int32_t>(c-clsu);
}
/* -ve carry: it's a borrow; complement needed */
add=1; /* temporary carry... */
@ -6614,7 +6614,7 @@ static Int decUnitAddSub(const Unit *a, Int alength,
*c=(Unit)(add-carry-1);
c++; /* interesting, include it */
}
return clsu-c; /* -ve result indicates borrowed */
return static_cast<int32_t>(clsu-c); /* -ve result indicates borrowed */
} /* decUnitAddSub */
/* ------------------------------------------------------------------ */
@ -6798,7 +6798,7 @@ static Int decShiftToLeast(Unit *uar, Int units, Int shift) {
if (cut==DECDPUN) { /* unit-boundary case; easy */
up=uar+D2U(shift);
for (; up<uar+units; target++, up++) *target=*up;
return target-uar;
return static_cast<int32_t>(target-uar);
}
/* messier */
@ -6826,7 +6826,7 @@ static Int decShiftToLeast(Unit *uar, Int units, Int shift) {
count-=cut;
if (count<=0) break;
}
return target-uar+1;
return static_cast<int32_t>(target-uar+1);
} /* decShiftToLeast */
#if DECSUBSET
@ -7690,7 +7690,7 @@ static decNumber *decDecap(decNumber *dn, Int drop) {
cut=MSUDIGITS(dn->digits-drop); /* digits to be in use in msu */
if (cut!=DECDPUN) *msu%=powers[cut]; /* clear left digits */
/* that may have left leading zero digits, so do a proper count... */
dn->digits=decGetDigits(dn->lsu, msu-dn->lsu+1);
dn->digits=decGetDigits(dn->lsu, static_cast<int32_t>(msu-dn->lsu+1));
return dn;
} /* decDecap */

View file

@ -2543,7 +2543,7 @@ UnicodeString DecimalFormat::getPadCharacterString() const {
}
void DecimalFormat::setPadCharacter(const UnicodeString &padChar) {
UChar pad;
UChar32 pad;
if (padChar.length() > 0) {
pad = padChar.char32At(0);
}
@ -2792,7 +2792,7 @@ DecimalFormat::setDecimalSeparatorAlwaysShown(UBool newValue)
UBool
DecimalFormat::isDecimalPatternMatchRequired(void) const
{
return fBoolFlags.contains(UNUM_PARSE_DECIMAL_MARK_REQUIRED);
return static_cast<UBool>(fBoolFlags.contains(UNUM_PARSE_DECIMAL_MARK_REQUIRED));
}
//------------------------------------------------------------------------------

View file

@ -1200,12 +1200,11 @@ RuleBasedNumberFormat::format(double number,
UnicodeString& toAppendTo,
FieldPosition& /* pos */) const
{
int32_t startPos = toAppendTo.length();
UErrorCode status = U_ZERO_ERROR;
if (defaultRuleSet) {
format(number, *defaultRuleSet, toAppendTo, status);
}
return adjustForCapitalizationContext(startPos, toAppendTo, status);
return toAppendTo;
}

View file

@ -46,11 +46,29 @@ static const UChar LOCALE_SEP = 95; // '_'
//static const UChar VARIANT_SEP = 0x002F; // '/'
// String constants
static const UChar ANY[] = { 65, 110, 121, 0 }; // Any
static const UChar ANY[] = { 0x41, 0x6E, 0x79, 0 }; // Any
static const UChar LAT[] = { 0x4C, 0x61, 0x74, 0 }; // Lat
// empty string
#define NO_VARIANT UnicodeString()
// initial estimate for specDAG size
// ICU 60 Transliterator::countAvailableSources()
#define SPECDAG_INIT_SIZE 149
// initial estimate for number of variant names
#define VARIANT_LIST_INIT_SIZE 11
#define VARIANT_LIST_MAX_SIZE 31
// initial estimate for availableIDs count (default estimate is 8 => multiple reallocs)
// ICU 60 Transliterator::countAvailableIDs()
#define AVAILABLE_IDS_INIT_SIZE 641
// initial estimate for number of targets for source "Any", "Lat"
// ICU 60 Transliterator::countAvailableTargets("Any")/("Latn")
#define ANY_TARGETS_INIT_SIZE 125
#define LAT_TARGETS_INIT_SIZE 23
/**
* Resource bundle key for the RuleBasedTransliterator rule.
*/
@ -517,10 +535,17 @@ U_CDECL_END
TransliteratorRegistry::TransliteratorRegistry(UErrorCode& status) :
registry(TRUE, status),
specDAG(TRUE, status),
availableIDs(status)
specDAG(TRUE, SPECDAG_INIT_SIZE, status),
variantList(VARIANT_LIST_INIT_SIZE, status),
availableIDs(AVAILABLE_IDS_INIT_SIZE, status)
{
registry.setValueDeleter(deleteEntry);
variantList.setDeleter(uprv_deleteUObject);
variantList.setComparer(uhash_compareCaselessUnicodeString);
UnicodeString *emptyString = new UnicodeString();
if (emptyString != NULL) {
variantList.addElement(emptyString, status);
}
availableIDs.setDeleter(uprv_deleteUObject);
availableIDs.setComparer(uhash_compareCaselessUnicodeString);
specDAG.setValueDeleter(uhash_deleteHashtable);
@ -781,9 +806,15 @@ int32_t TransliteratorRegistry::countAvailableVariants(const UnicodeString& sour
if (targets == 0) {
return 0;
}
UVector *variants = (UVector*) targets->get(target);
// variants may be 0 if the source/target are invalid
return (variants == 0) ? 0 : variants->size();
int32_t varMask = targets->geti(target);
int32_t varCount = 0;
while (varMask > 0) {
if (varMask & 1) {
varCount++;
}
varMask >>= 1;
}
return varCount;
}
UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index,
@ -795,17 +826,25 @@ UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index,
result.truncate(0); // invalid source
return result;
}
UVector *variants = (UVector*) targets->get(target);
if (variants == 0) {
result.truncate(0); // invalid target
return result;
}
UnicodeString *v = (UnicodeString*) variants->elementAt(index);
if (v == 0) {
result.truncate(0); // invalid index
} else {
result = *v;
int32_t varMask = targets->geti(target);
int32_t varCount = 0;
int32_t varListIndex = 0;
while (varMask > 0) {
if (varMask & 1) {
if (varCount == index) {
UnicodeString *v = (UnicodeString*) variantList.elementAt(varListIndex);
if (v != NULL) {
result = *v;
return result;
}
break;
}
varCount++;
}
varMask >>= 1;
varListIndex++;
}
result.truncate(0); // invalid target or index
return result;
}
@ -911,9 +950,9 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& ID,
UnicodeString *newID = (UnicodeString *)ID.clone();
// Check to make sure newID was created.
if (newID != NULL) {
// NUL-terminate the ID string
newID->getTerminatedBuffer();
availableIDs.addElement(newID, status);
// NUL-terminate the ID string
newID->getTerminatedBuffer();
availableIDs.addElement(newID, status);
}
}
} else {
@ -924,9 +963,7 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& ID,
/**
* Register a source-target/variant in the specDAG. Variant may be
* empty, but source and target must not be. If variant is empty then
* the special variant NO_VARIANT is stored in slot zero of the
* UVector of variants.
* empty, but source and target must not be.
*/
void TransliteratorRegistry::registerSTV(const UnicodeString& source,
const UnicodeString& target,
@ -936,39 +973,38 @@ void TransliteratorRegistry::registerSTV(const UnicodeString& source,
UErrorCode status = U_ZERO_ERROR;
Hashtable *targets = (Hashtable*) specDAG.get(source);
if (targets == 0) {
targets = new Hashtable(TRUE, status);
if (U_FAILURE(status) || targets == 0) {
int32_t size = 3;
if (source.compare(ANY,3) == 0) {
size = ANY_TARGETS_INIT_SIZE;
} else if (source.compare(LAT,3) == 0) {
size = LAT_TARGETS_INIT_SIZE;
}
targets = new Hashtable(TRUE, size, status);
if (U_FAILURE(status) || targets == NULL) {
return;
}
targets->setValueDeleter(uprv_deleteUObject);
specDAG.put(source, targets, status);
}
UVector *variants = (UVector*) targets->get(target);
if (variants == 0) {
variants = new UVector(uprv_deleteUObject,
uhash_compareCaselessUnicodeString, status);
if (variants == 0) {
int32_t variantListIndex = variantList.indexOf((void*) &variant, 0);
if (variantListIndex < 0) {
if (variantList.size() >= VARIANT_LIST_MAX_SIZE) {
// can't handle any more variants
return;
}
targets->put(target, variants, status);
}
// assert(NO_VARIANT == "");
// We add the variant string. If it is the special "no variant"
// string, that is, the empty string, we add it at position zero.
if (!variants->contains((void*) &variant)) {
UnicodeString *tempus; // Used for null pointer check.
if (variant.length() > 0) {
tempus = new UnicodeString(variant);
if (tempus != NULL) {
variants->addElement(tempus, status);
}
} else {
tempus = new UnicodeString(); // = NO_VARIANT
if (tempus != NULL) {
variants->insertElementAt(tempus, 0, status);
}
UnicodeString *variantEntry = new UnicodeString(variant);
if (variantEntry != NULL) {
variantList.addElement(variantEntry, status);
if (U_SUCCESS(status)) {
variantListIndex = variantList.size() - 1;
}
}
if (variantListIndex < 0) {
return;
}
}
int32_t addMask = 1 << variantListIndex;
int32_t varMask = targets->geti(target);
targets->puti(target, varMask | addMask, status);
}
/**
@ -979,17 +1015,24 @@ void TransliteratorRegistry::removeSTV(const UnicodeString& source,
const UnicodeString& variant) {
// assert(source.length() > 0);
// assert(target.length() > 0);
// UErrorCode status = U_ZERO_ERROR;
UErrorCode status = U_ZERO_ERROR;
Hashtable *targets = (Hashtable*) specDAG.get(source);
if (targets == 0) {
if (targets == NULL) {
return; // should never happen for valid s-t/v
}
UVector *variants = (UVector*) targets->get(target);
if (variants == 0) {
int32_t varMask = targets->geti(target);
if (varMask == 0) {
return; // should never happen for valid s-t/v
}
variants->removeElement((void*) &variant);
if (variants->size() == 0) {
int32_t variantListIndex = variantList.indexOf((void*) &variant, 0);
if (variantListIndex < 0) {
return; // should never happen for valid s-t/v
}
int32_t remMask = 1 << variantListIndex;
varMask &= (~remMask);
if (varMask != 0) {
targets->puti(target, varMask, status);
} else {
targets->remove(target); // should delete variants
if (targets->count() == 0) {
specDAG.remove(source); // should delete targets
@ -1281,8 +1324,8 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
UVector* rbts = new UVector(entry->u.dataVector->size(), status);
// Check for null pointer
if (rbts == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
int32_t passNumber = 1;
for (int32_t i = 0; U_SUCCESS(status) && i < entry->u.dataVector->size(); i++) {

View file

@ -440,13 +440,15 @@ class TransliteratorRegistry : public UMemory {
/**
* DAG of visible IDs by spec. Hashtable: source => (Hashtable:
* target => (UVector: variant)) The UVector of variants is never
* empty. For a source-target with no variant, the special
* variant NO_VARIANT (the empty string) is stored in slot zero of
* the UVector.
* target => variant bitmask)
*/
Hashtable specDAG;
/**
* Vector of all variant names
*/
UVector variantList;
/**
* Vector of public full IDs.
*/

View file

@ -2056,6 +2056,9 @@ static void U_CALLCONV prepareFind(UErrorCode &status) {
if (U_SUCCESS(status)) {
while ((mzID = mzIDs->snext(status)) && U_SUCCESS(status)) {
const TZDBNames *names = TZDBTimeZoneNames::getMetaZoneNames(*mzID, status);
if (U_FAILURE(status)) {
break;
}
if (names == NULL) {
continue;
}
@ -2187,9 +2190,11 @@ TZDBTimeZoneNames::getMetaZoneDisplayName(const UnicodeString& mzID,
UErrorCode status = U_ZERO_ERROR;
const TZDBNames *tzdbNames = TZDBTimeZoneNames::getMetaZoneNames(mzID, status);
if (U_SUCCESS(status)) {
const UChar *s = tzdbNames->getName(type);
if (s != NULL) {
name.setTo(TRUE, s, -1);
if (tzdbNames != NULL) {
const UChar *s = tzdbNames->getName(type);
if (s != NULL) {
name.setTo(TRUE, s, -1);
}
}
}

Some files were not shown because too many files have changed in this diff Show more