ICU-7264 merge Unicode 6.0 into trunk from branches/markus/uni60 -r 28339:28657

X-SVN-Rev: 28661
This commit is contained in:
Markus Scherer 2010-09-21 00:12:49 +00:00
parent 3c7ba0c2e4
commit b5e1330176
87 changed files with 21109 additions and 17841 deletions

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -145,11 +145,7 @@ ubidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode *pErrorCode)
uprv_memset(pBiDi, 0, sizeof(UBiDi));
/* get BiDi properties */
pBiDi->bdp=ubidi_getSingleton(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
uprv_free(pBiDi);
return NULL;
}
pBiDi->bdp=ubidi_getSingleton();
/* allocate memory for arrays as requested */
if(maxLength>0) {

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2008, International Business Machines
* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -38,286 +38,16 @@ struct UBiDiProps {
uint8_t formatVersion[4];
};
/* data loading etc. -------------------------------------------------------- */
#if UBIDI_HARDCODE_DATA
/* ubidi_props_data.c is machine-generated by genbidi --csource */
#include "ubidi_props_data.c"
#else
/*
 * Acceptance callback for the bidi properties data item.
 * Accepts the item only if its UDataInfo is at least 20 bytes, matches this
 * platform's endianness and charset family, has the UBIDI_FMT_0..3 data format,
 * format version 1, and UTRIE_SHIFT/UTRIE_INDEX_SHIFT in formatVersion[2..3].
 * On acceptance the four formatVersion bytes are copied into the UBiDiProps
 * passed as context. type and name are not examined.
 */
static UBool U_CALLCONV
isAcceptable(void *context,
             const char *type, const char *name,
             const UDataInfo *pInfo) {
    UBiDiProps *props;

    /* reject on the first mismatch, testing in the same order as before */
    if(
        pInfo->size<20 ||
        pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
        pInfo->charsetFamily!=U_CHARSET_FAMILY ||
        pInfo->dataFormat[0]!=UBIDI_FMT_0 ||    /* dataFormat="BiDi" */
        pInfo->dataFormat[1]!=UBIDI_FMT_1 ||
        pInfo->dataFormat[2]!=UBIDI_FMT_2 ||
        pInfo->dataFormat[3]!=UBIDI_FMT_3 ||
        pInfo->formatVersion[0]!=1 ||
        pInfo->formatVersion[2]!=UTRIE_SHIFT ||
        pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT
    ) {
        return FALSE;
    }

    props=(UBiDiProps *)context;
    uprv_memcpy(props->formatVersion, pInfo->formatVersion, 4);
    return TRUE;
}
/*
 * Set up bdpProto from the serialized bidi properties at bin and return a
 * uprv_malloc()ed copy of it.
 * bin is read in order as indexes[], the serialized trie, mirrors[], jgArray[].
 * A negative length disables the length checks below.
 * The copy is shallow: indexes, mirrors and jgArray still point into bin.
 * Returns NULL and sets *pErrorCode to U_INVALID_FORMAT_ERROR when the data is
 * too short, or to U_MEMORY_ALLOCATION_ERROR when the copy cannot be allocated;
 * a failure reported by utrie_unserialize() is passed through unchanged.
 */
static UBiDiProps *
ubidi_openData(UBiDiProps *bdpProto,
const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
UBiDiProps *bdp;
int32_t size;
/* the data begins with the int32_t indexes[] array */
bdpProto->indexes=(const int32_t *)bin;
if( (length>=0 && length<16*4) ||
bdpProto->indexes[UBIDI_IX_INDEX_TOP]<16
) {
/* length or indexes[] too short for minimum indexes[] length of 16 */
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
/* byte size of indexes[], whose entry count is stored at UBIDI_IX_INDEX_TOP */
size=bdpProto->indexes[UBIDI_IX_INDEX_TOP]*4;
if(length>=0) {
if(length>=size && length>=bdpProto->indexes[UBIDI_IX_LENGTH]) {
length-=size;
} else {
/* length too short for indexes[] or for the whole data length */
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
}
/* skip indexes[] */
bin+=size;
/* from here on, assume that the sizes of the items fit into the total length */
/* unserialize the trie, after indexes[] */
size=bdpProto->indexes[UBIDI_IX_TRIE_SIZE];
utrie_unserialize(&bdpProto->trie, bin, size, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
bin+=size;
/* get mirrors[] */
/* UBIDI_IX_MIRROR_LENGTH counts 32-bit entries, hence the factor 4 */
size=4*bdpProto->indexes[UBIDI_IX_MIRROR_LENGTH];
bdpProto->mirrors=(const uint32_t *)bin;
bin+=size;
/* get jgArray[] */
/* one byte per value in the range [UBIDI_IX_JG_START, UBIDI_IX_JG_LIMIT) */
size=bdpProto->indexes[UBIDI_IX_JG_LIMIT]-bdpProto->indexes[UBIDI_IX_JG_START];
bdpProto->jgArray=bin;
/* bin is not read again after this final advance */
bin+=size;
/* allocate, copy, and return the new UBiDiProps */
bdp=(UBiDiProps *)uprv_malloc(sizeof(UBiDiProps));
if(bdp==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
} else {
uprv_memcpy(bdp, bdpProto, sizeof(UBiDiProps));
return bdp;
}
}
/*
 * Open the bidi properties data item via udata_openChoice() and build a
 * UBiDiProps object from it.
 * Returns the new object, or NULL with *pErrorCode set on failure. When the
 * data item was opened but could not be parsed, it is closed again before
 * returning.
 */
U_CFUNC UBiDiProps *
ubidi_openProps(UErrorCode *pErrorCode) {
    UBiDiProps proto={ NULL };
    UBiDiProps *result;
    const uint8_t *bytes;
    int32_t byteCount;

    /* isAcceptable() records the format version in proto while checking the header */
    proto.mem=udata_openChoice(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, isAcceptable, &proto, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }

    bytes=(const uint8_t *)udata_getMemory(proto.mem);
    byteCount=udata_getLength(proto.mem);
    result=ubidi_openData(&proto, bytes, byteCount, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        udata_close(proto.mem);
        return NULL;
    }
    return result;
}
/*
 * Build a UBiDiProps object from an in-memory data item that starts with an
 * ICU data header.
 *
 * bin     start of the data item (header first); must not be NULL
 * length  number of bytes at bin, or negative if unknown (no length checks)
 *
 * Returns the new object, or NULL with *pErrorCode set:
 * U_ILLEGAL_ARGUMENT_ERROR for bin==NULL, U_INVALID_FORMAT_ERROR for a bad or
 * truncated header, otherwise whatever ubidi_openData() reports.
 */
U_CFUNC UBiDiProps *
ubidi_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
    UBiDiProps bdpProto={ NULL };
    const DataHeader *hdr;

    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if(bin==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* check the header */
    if(length>=0 && length<20) {
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    hdr=(const DataHeader *)bin;
    if(
        !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
          hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
          isAcceptable(&bdpProto, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &hdr->info))
    ) {
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    if(length>=0) {
        /*
         * The declared header must fit into the supplied bytes. Without this
         * check the subtraction below could make length negative, which
         * ubidi_openData() interprets as "length unknown" and then skips all
         * of its size validation.
         */
        if(length<(int32_t)hdr->dataHeader.headerSize) {
            *pErrorCode=U_INVALID_FORMAT_ERROR;
            return NULL;
        }
        length-=hdr->dataHeader.headerSize;
    }
    bin+=hdr->dataHeader.headerSize;
    return ubidi_openData(&bdpProto, bin, length, pErrorCode);
}
#endif
/*
 * Release a UBiDiProps object. A NULL argument is ignored.
 * When the data is not hardcoded, the object's data memory is closed first.
 */
U_CFUNC void
ubidi_closeProps(UBiDiProps *bdp) {
    if(bdp==NULL) {
        return;
    }
#if !UBIDI_HARDCODE_DATA
    udata_close(bdp->mem);
#endif
    uprv_free(bdp);
}
/* UBiDiProps singleton ----------------------------------------------------- */
#if !UBIDI_HARDCODE_DATA
static UBiDiProps *gBdpDummy=NULL;
static UBiDiProps *gBdp=NULL;
static UErrorCode gErrorCode=U_ZERO_ERROR;
static int8_t gHaveData=0;
/*
 * Cleanup callback for this file's lazily created objects: closes the dummy
 * and the real singleton and resets the load-state variables to their
 * initial values. Always returns TRUE.
 */
static UBool U_CALLCONV
ubidi_cleanup(void) {
    /* dummy object */
    ubidi_closeProps(gBdpDummy);
    gBdpDummy=NULL;

    /* real singleton */
    ubidi_closeProps(gBdp);
    gBdp=NULL;

    /* back to "not loaded, no error" */
    gErrorCode=U_ZERO_ERROR;
    gHaveData=0;

    return TRUE;
}
#endif
U_CFUNC const UBiDiProps *
ubidi_getSingleton(UErrorCode *pErrorCode) {
#if UBIDI_HARDCODE_DATA
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
ubidi_getSingleton() {
return &ubidi_props_singleton;
#else
int8_t haveData;
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
UMTX_CHECK(NULL, gHaveData, haveData);
if(haveData>0) {
/* data was loaded */
return gBdp;
} else if(haveData<0) {
/* data loading failed */
*pErrorCode=gErrorCode;
return NULL;
} else /* haveData==0 */ {
/* load the data */
UBiDiProps *bdp=ubidi_openProps(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
gHaveData=-1;
gErrorCode=*pErrorCode;
return NULL;
}
/* set the static variables */
umtx_lock(NULL);
if(gBdp==NULL) {
gBdp=bdp;
bdp=NULL;
gHaveData=1;
ucln_common_registerCleanup(UCLN_COMMON_UBIDI, ubidi_cleanup);
}
umtx_unlock(NULL);
ubidi_closeProps(bdp);
return gBdp;
}
#endif
}
#if !UBIDI_HARDCODE_DATA
/*
 * Return the dummy UBiDiProps singleton, creating it on first use.
 * The dummy is one allocation: the UBiDiProps struct, followed by
 * UBIDI_IX_TOP int32_t indexes, followed by UTRIE_DUMMY_SIZE bytes that
 * utrie_unserializeDummy() fills with a dummy trie.
 * Returns NULL with *pErrorCode set if it was already a failure on entry, on
 * allocation failure (U_MEMORY_ALLOCATION_ERROR), or when the dummy trie
 * cannot be built.
 */
U_CAPI const UBiDiProps *
ubidi_getDummy(UErrorCode *pErrorCode) {
UBiDiProps *bdp;
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
/* read the current value of gBdpDummy into bdp */
UMTX_CHECK(NULL, gBdpDummy, bdp);
if(bdp!=NULL) {
/* the dummy object was already created */
return bdp;
} else /* bdp==NULL */ {
/* create the dummy object */
int32_t *indexes;
bdp=(UBiDiProps *)uprv_malloc(sizeof(UBiDiProps)+UBIDI_IX_TOP*4+UTRIE_DUMMY_SIZE);
if(bdp==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
/* zero the struct and the indexes; the trie bytes are written by utrie_unserializeDummy() */
uprv_memset(bdp, 0, sizeof(UBiDiProps)+UBIDI_IX_TOP*4);
/* the indexes start immediately after the struct */
bdp->indexes=indexes=(int32_t *)(bdp+1);
indexes[UBIDI_IX_INDEX_TOP]=UBIDI_IX_TOP;
indexes[UBIDI_IX_TRIE_SIZE]=
utrie_unserializeDummy(&bdp->trie, indexes+UBIDI_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
uprv_free(bdp);
return NULL;
}
/* the same format version values that isAcceptable() requires of real data */
bdp->formatVersion[0]=1;
bdp->formatVersion[2]=UTRIE_SHIFT;
bdp->formatVersion[3]=UTRIE_INDEX_SHIFT;
/* set the static variables */
umtx_lock(NULL);
if(gBdpDummy==NULL) {
gBdpDummy=bdp;
bdp=NULL;
ucln_common_registerCleanup(UCLN_COMMON_UBIDI, ubidi_cleanup);
}
umtx_unlock(NULL);
/* bdp is NULL if it was installed above; otherwise gBdpDummy was already set and this copy is discarded */
uprv_free(bdp);
return gBdpDummy;
}
}
#endif
/* set of property starts for UnicodeSet ------------------------------------ */
static UBool U_CALLCONV
@ -476,29 +206,15 @@ ubidi_getJoiningGroup(const UBiDiProps *bdp, UChar32 c) {
U_CFUNC UCharDirection
u_charDirection(UChar32 c) {
UErrorCode errorCode=U_ZERO_ERROR;
const UBiDiProps *bdp=ubidi_getSingleton(&errorCode);
if(bdp!=NULL) {
return ubidi_getClass(bdp, c);
} else {
return U_LEFT_TO_RIGHT;
}
return ubidi_getClass(&ubidi_props_singleton, c);
}
U_CFUNC UBool
u_isMirrored(UChar32 c) {
UErrorCode errorCode=U_ZERO_ERROR;
const UBiDiProps *bdp=ubidi_getSingleton(&errorCode);
return (UBool)(bdp!=NULL && ubidi_isMirrored(bdp, c));
return ubidi_isMirrored(&ubidi_props_singleton, c);
}
U_CFUNC UChar32
u_charMirror(UChar32 c) {
UErrorCode errorCode=U_ZERO_ERROR;
const UBiDiProps *bdp=ubidi_getSingleton(&errorCode);
if(bdp!=NULL) {
return ubidi_getMirror(bdp, c);
} else {
return c;
}
return ubidi_getMirror(&ubidi_props_singleton, c);
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2008, International Business Machines
* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -24,8 +24,6 @@
#include "uset_imp.h"
#include "udataswp.h"
#define UBIDI_HARDCODE_DATA 1
U_CDECL_BEGIN
/* library API -------------------------------------------------------------- */
@ -33,28 +31,8 @@ U_CDECL_BEGIN
struct UBiDiProps;
typedef struct UBiDiProps UBiDiProps;
U_CFUNC UBiDiProps *
ubidi_openProps(UErrorCode *pErrorCode);
U_CFUNC UBiDiProps *
ubidi_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode);
U_CFUNC void
ubidi_closeProps(UBiDiProps *bdp);
U_CFUNC const UBiDiProps *
ubidi_getSingleton(UErrorCode *pErrorCode);
#if !UBIDI_HARDCODE_DATA
/**
* Get a singleton dummy object, one that works with no real data.
* This can be used when the real data is not available.
* Using the dummy can reduce checks for available data after an initial failure.
*/
U_CAPI const UBiDiProps *
ubidi_getDummy(UErrorCode *pErrorCode);
#endif
ubidi_getSingleton(void);
U_CAPI int32_t
ubidi_swap(const UDataSwapper *ds,

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2009, International Business Machines
* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -39,291 +39,16 @@ struct UCaseProps {
uint8_t formatVersion[4];
};
/* data loading etc. -------------------------------------------------------- */
#if UCASE_HARDCODE_DATA
/* ucase_props_data.c is machine-generated by gencase --csource */
#include "ucase_props_data.c"
#else
/*
 * Acceptance callback for the case properties data item.
 * Accepts the item only if its UDataInfo is at least 20 bytes, matches this
 * platform's endianness and charset family, has the UCASE_FMT_0..3 data format,
 * format version 1, and UTRIE_SHIFT/UTRIE_INDEX_SHIFT in formatVersion[2..3].
 * On acceptance the four formatVersion bytes are copied into the UCaseProps
 * passed as context. type and name are not examined.
 */
static UBool U_CALLCONV
isAcceptable(void *context,
             const char *type, const char *name,
             const UDataInfo *pInfo) {
    UCaseProps *props;

    /* reject on the first mismatch, testing in the same order as before */
    if(
        pInfo->size<20 ||
        pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
        pInfo->charsetFamily!=U_CHARSET_FAMILY ||
        pInfo->dataFormat[0]!=UCASE_FMT_0 ||    /* dataFormat="cAsE" */
        pInfo->dataFormat[1]!=UCASE_FMT_1 ||
        pInfo->dataFormat[2]!=UCASE_FMT_2 ||
        pInfo->dataFormat[3]!=UCASE_FMT_3 ||
        pInfo->formatVersion[0]!=1 ||
        pInfo->formatVersion[2]!=UTRIE_SHIFT ||
        pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT
    ) {
        return FALSE;
    }

    props=(UCaseProps *)context;
    uprv_memcpy(props->formatVersion, pInfo->formatVersion, 4);
    return TRUE;
}
/*
 * Set up cspProto from the serialized case properties at bin and return a
 * uprv_malloc()ed copy of it.
 * bin is read in order as indexes[], the serialized trie, exceptions[], unfold[].
 * A negative length disables the length checks below.
 * The copy is shallow: indexes, exceptions and unfold still point into bin.
 * Returns NULL and sets *pErrorCode to U_INVALID_FORMAT_ERROR when the data is
 * too short, or to U_MEMORY_ALLOCATION_ERROR when the copy cannot be allocated;
 * a failure reported by utrie_unserialize() is passed through unchanged.
 */
static UCaseProps *
ucase_openData(UCaseProps *cspProto,
const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
UCaseProps *csp;
int32_t size;
/* the data begins with the int32_t indexes[] array */
cspProto->indexes=(const int32_t *)bin;
if( (length>=0 && length<16*4) ||
cspProto->indexes[UCASE_IX_INDEX_TOP]<16
) {
/* length or indexes[] too short for minimum indexes[] length of 16 */
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
/* byte size of indexes[], whose entry count is stored at UCASE_IX_INDEX_TOP */
size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
if(length>=0) {
if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
length-=size;
} else {
/* length too short for indexes[] or for the whole data length */
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
}
/* skip indexes[] */
bin+=size;
/* from here on, assume that the sizes of the items fit into the total length */
/* unserialize the trie, after indexes[] */
size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
bin+=size;
/* get exceptions[] */
/* UCASE_IX_EXC_LENGTH counts 16-bit units, hence the factor 2 */
size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
cspProto->exceptions=(const uint16_t *)bin;
bin+=size;
/* get unfold[] */
/* UCASE_IX_UNFOLD_LENGTH counts 16-bit units; unfold stays NULL when it is 0 */
size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
if(size!=0) {
cspProto->unfold=(const UChar *)bin;
bin+=size;
} else {
cspProto->unfold=NULL;
}
/* allocate, copy, and return the new UCaseProps */
csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
if(csp==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
} else {
uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
return csp;
}
}
/*
 * Open the case properties data item via udata_openChoice() and build a
 * UCaseProps object from it.
 * Returns the new object, or NULL with *pErrorCode set on failure. When the
 * data item was opened but could not be parsed, it is closed again before
 * returning.
 */
U_CAPI UCaseProps * U_EXPORT2
ucase_open(UErrorCode *pErrorCode) {
    UCaseProps proto={ NULL };
    UCaseProps *result;
    const uint8_t *bytes;
    int32_t byteCount;

    /* isAcceptable() records the format version in proto while checking the header */
    proto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &proto, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }

    bytes=(const uint8_t *)udata_getMemory(proto.mem);
    byteCount=udata_getLength(proto.mem);
    result=ucase_openData(&proto, bytes, byteCount, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        udata_close(proto.mem);
        return NULL;
    }
    return result;
}
/*
 * Build a UCaseProps object from an in-memory data item that starts with an
 * ICU data header.
 *
 * bin     start of the data item (header first); must not be NULL
 * length  number of bytes at bin, or negative if unknown (no length checks)
 *
 * Returns the new object, or NULL with *pErrorCode set:
 * U_ILLEGAL_ARGUMENT_ERROR for bin==NULL, U_INVALID_FORMAT_ERROR for a bad or
 * truncated header, otherwise whatever ucase_openData() reports.
 */
U_CAPI UCaseProps * U_EXPORT2
ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
    UCaseProps cspProto={ NULL };
    const DataHeader *hdr;

    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if(bin==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* check the header */
    if(length>=0 && length<20) {
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    hdr=(const DataHeader *)bin;
    if(
        !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
          hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
          isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
    ) {
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    if(length>=0) {
        /*
         * The declared header must fit into the supplied bytes. Without this
         * check the subtraction below could make length negative, which
         * ucase_openData() interprets as "length unknown" and then skips all
         * of its size validation.
         */
        if(length<(int32_t)hdr->dataHeader.headerSize) {
            *pErrorCode=U_INVALID_FORMAT_ERROR;
            return NULL;
        }
        length-=hdr->dataHeader.headerSize;
    }
    bin+=hdr->dataHeader.headerSize;
    return ucase_openData(&cspProto, bin, length, pErrorCode);
}
#endif
/*
 * Release a UCaseProps object. A NULL argument is ignored.
 * When the data is not hardcoded, the object's data memory is closed first.
 */
U_CAPI void U_EXPORT2
ucase_close(UCaseProps *csp) {
    if(csp==NULL) {
        return;
    }
#if !UCASE_HARDCODE_DATA
    udata_close(csp->mem);
#endif
    uprv_free(csp);
}
/* UCaseProps singleton ----------------------------------------------------- */
#if !UCASE_HARDCODE_DATA
static UCaseProps *gCsp=NULL;
static UCaseProps *gCspDummy=NULL;
static UErrorCode gErrorCode=U_ZERO_ERROR;
static int8_t gHaveData=0;
#endif
#if !UCASE_HARDCODE_DATA
/*
 * Cleanup callback for this file's lazily created objects: closes the real
 * singleton and the dummy and resets the load-state variables to their
 * initial values. Always returns TRUE.
 */
static UBool U_CALLCONV
ucase_cleanup(void) {
    /* real singleton */
    ucase_close(gCsp);
    gCsp=NULL;

    /* dummy object */
    ucase_close(gCspDummy);
    gCspDummy=NULL;

    /* back to "not loaded, no error" */
    gErrorCode=U_ZERO_ERROR;
    gHaveData=0;

    return TRUE;
}
#endif
U_CAPI const UCaseProps * U_EXPORT2
ucase_getSingleton(UErrorCode *pErrorCode) {
#if UCASE_HARDCODE_DATA
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
ucase_getSingleton() {
return &ucase_props_singleton;
#else
int8_t haveData;
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
UMTX_CHECK(NULL, gHaveData, haveData);
if(haveData>0) {
/* data was loaded */
return gCsp;
} else if(haveData<0) {
/* data loading failed */
*pErrorCode=gErrorCode;
return NULL;
} else /* haveData==0 */ {
/* load the data */
UCaseProps *csp=ucase_open(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
gHaveData=-1;
gErrorCode=*pErrorCode;
return NULL;
}
/* set the static variables */
umtx_lock(NULL);
if(gCsp==NULL) {
gCsp=csp;
csp=NULL;
gHaveData=1;
ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
}
umtx_unlock(NULL);
ucase_close(csp);
return gCsp;
}
#endif
}
#if !UCASE_HARDCODE_DATA
/*
 * Return the dummy UCaseProps singleton, creating it on first use.
 * The dummy is one allocation: the UCaseProps struct, followed by
 * UCASE_IX_TOP int32_t indexes, followed by UTRIE_DUMMY_SIZE bytes that
 * utrie_unserializeDummy() fills with a dummy trie.
 * Returns NULL with *pErrorCode set if it was already a failure on entry, on
 * allocation failure (U_MEMORY_ALLOCATION_ERROR), or when the dummy trie
 * cannot be built.
 */
U_CAPI const UCaseProps * U_EXPORT2
ucase_getDummy(UErrorCode *pErrorCode) {
UCaseProps *csp;
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
/* read the current value of gCspDummy into csp */
UMTX_CHECK(NULL, gCspDummy, csp);
if(csp!=NULL) {
/* the dummy object was already created */
return csp;
} else /* csp==NULL */ {
/* create the dummy object */
int32_t *indexes;
csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)+UCASE_IX_TOP*4+UTRIE_DUMMY_SIZE);
if(csp==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
/* zero the struct and the indexes; the trie bytes are written by utrie_unserializeDummy() */
uprv_memset(csp, 0, sizeof(UCaseProps)+UCASE_IX_TOP*4);
/* the indexes start immediately after the struct */
csp->indexes=indexes=(int32_t *)(csp+1);
indexes[UCASE_IX_INDEX_TOP]=UCASE_IX_TOP;
indexes[UCASE_IX_TRIE_SIZE]=
utrie_unserializeDummy(&csp->trie, indexes+UCASE_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
uprv_free(csp);
return NULL;
}
/* the same format version values that isAcceptable() requires of real data */
csp->formatVersion[0]=1;
csp->formatVersion[2]=UTRIE_SHIFT;
csp->formatVersion[3]=UTRIE_INDEX_SHIFT;
/* set the static variables */
umtx_lock(NULL);
if(gCspDummy==NULL) {
gCspDummy=csp;
csp=NULL;
ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
}
umtx_unlock(NULL);
/* csp is NULL if it was installed above; otherwise gCspDummy was already set and this copy is discarded */
uprv_free(csp);
return gCspDummy;
}
}
#endif
/* set of property starts for UnicodeSet ------------------------------------ */
static UBool U_CALLCONV
@ -1475,69 +1200,7 @@ ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
/* case mapping properties API ---------------------------------------------- */
/* get the UCaseProps singleton, or else its dummy, once and for all */
#if !UCASE_HARDCODE_DATA
/*
 * Slow path behind GET_CASE_PROPS(): obtain the real UCaseProps singleton,
 * or the dummy object if the singleton cannot be obtained.
 * Returns NULL only when the dummy cannot be obtained either.
 *
 * The macro performs the initial gCsp check without a mutex; this lazy
 * initialization is transiently unsafe under certain circumstances.
 * Check the readme and use u_init() if necessary.
 */
static const UCaseProps *
getCaseProps() {
    UErrorCode status=U_ZERO_ERROR;
    const UCaseProps *props=ucase_getSingleton(&status);

    if(!U_FAILURE(status)) {
        return props;
    }

    /* no real data: start over with a clean error code and use the dummy */
    status=U_ZERO_ERROR;
    props=ucase_getDummy(&status);
    return U_FAILURE(status) ? NULL : props;
}
#endif
/*
* In ICU 3.0, most Unicode properties were loaded from uprops.icu.
* ICU 3.2 adds ucase.icu for case mapping properties.
* ICU 3.4 adds ubidi.icu for bidi/shaping properties and
* removes case/bidi/shaping properties from uprops.icu.
*
* Loading of uprops.icu was never mutex-protected and required u_init()
* for thread safety.
* In order to maintain performance for all such properties,
* ucase.icu and ubidi.icu are loaded lazily, without mutexing.
* u_init() will try to load them for thread safety,
* but u_init() will not fail if they are missing.
*
* uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
* and an error code for load failure.
* Instead, here we try to load at most once.
* If it works, we use the resulting singleton object.
* If it fails, then we get a dummy object, which always works unless
* we are seriously out of memory.
* After the first try, we have a never-changing pointer to either the
* real singleton or the dummy.
*
* This method is used in Unicode properties APIs (uchar.h) that
* do not have a service object and also do not have an error code parameter.
* Other API implementations get the singleton themselves
* (with mutexing), store it in the service object, and report errors.
*
* TODO: Remove this support for non-hardcoded data. u_init() is publicly
* advertised as not being required for thread safety, we cannot
* revert to unsafe data loading.
*/
#if !UCASE_HARDCODE_DATA
#define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
#else
#define GET_CASE_PROPS() &ucase_props_singleton
#endif
/* public API (see uchar.h) */

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2009, International Business Machines
* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -31,31 +31,8 @@ U_CDECL_BEGIN
struct UCaseProps;
typedef struct UCaseProps UCaseProps;
U_CAPI UCaseProps * U_EXPORT2
ucase_open(UErrorCode *pErrorCode);
U_CAPI UCaseProps * U_EXPORT2
ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode);
U_CAPI void U_EXPORT2
ucase_close(UCaseProps *csp);
U_CAPI const UCaseProps * U_EXPORT2
ucase_getSingleton(UErrorCode *pErrorCode);
#define UCASE_HARDCODE_DATA 1
#if !UCASE_HARDCODE_DATA
/**
* Get a singleton dummy object, one that works with no real data.
* This can be used when the real data is not available.
* Using the dummy can reduce checks for available data after an initial failure.
*/
U_CAPI const UCaseProps * U_EXPORT2
ucase_getDummy(UErrorCode *pErrorCode);
#endif
ucase_getSingleton(void);
U_CAPI int32_t U_EXPORT2
ucase_swap(const UDataSwapper *ds,

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005-2009, International Business Machines
* Copyright (C) 2005-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -45,7 +45,7 @@ ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
}
uprv_memset(csm, 0, sizeof(UCaseMap));
csm->csp=ucase_getSingleton(pErrorCode);
csm->csp=ucase_getSingleton();
ucasemap_setLocale(csm, locale, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
uprv_free(csm);

View file

@ -33,228 +33,19 @@
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/* dynamically loaded Unicode character properties -------------------------- */
#define UCHAR_HARDCODE_DATA 1
#if UCHAR_HARDCODE_DATA
/* uchar_props_data.c is machine-generated by genprops --csource */
#include "uchar_props_data.c"
#else
/*
* loaded uprops.dat -
* for a description of the file format, see icu/source/tools/genprops/store.c
*/
static const char DATA_NAME[] = "uprops";
static const char DATA_TYPE[] = "icu";
static UDataMemory *propsData=NULL;
static UErrorCode dataErrorCode=U_ZERO_ERROR;
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
static UVersionInfo dataVersion={ 0, 0, 0, 0 };
static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 };
static const uint32_t *pData32=NULL, *propsVectors=NULL;
static int32_t countPropsVectors=0, propsVectorsColumns=0;
static int8_t havePropsData=0; /* == 0 -> Data has not been loaded.
* < 0 -> Error occurred attempting to load data.
* > 0 -> Data has been successfully loaded.
*/
/* index values loaded from uprops.dat */
static int32_t indexes[UPROPS_INDEX_COUNT];
/*
 * Acceptance callback for the uprops data item.
 * Accepts the item only if its UDataInfo is at least 20 bytes, matches this
 * platform's endianness and charset family, has data format "UPro",
 * format version 4, and UTRIE_SHIFT/UTRIE_INDEX_SHIFT in formatVersion[2..3].
 * On acceptance the item's format and data versions are copied into the
 * file-level formatVersion and dataVersion arrays.
 * context, type and name are not examined.
 */
static UBool U_CALLCONV
isAcceptable(void *context,
             const char *type, const char *name,
             const UDataInfo *pInfo) {
    /* reject on the first mismatch, testing in the same order as before */
    if(
        pInfo->size<20 ||
        pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
        pInfo->charsetFamily!=U_CHARSET_FAMILY ||
        pInfo->dataFormat[0]!=0x55 ||   /* dataFormat="UPro" */
        pInfo->dataFormat[1]!=0x50 ||
        pInfo->dataFormat[2]!=0x72 ||
        pInfo->dataFormat[3]!=0x6f ||
        pInfo->formatVersion[0]!=4 ||
        pInfo->formatVersion[2]!=UTRIE_SHIFT ||
        pInfo->formatVersion[3]!=UTRIE_INDEX_SHIFT
    ) {
        return FALSE;
    }

    uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
    uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
    return TRUE;
}
/*
 * Cleanup callback for the dynamically loaded properties data: closes the
 * data item if one is open and resets the related file-level variables.
 * Always returns TRUE.
 */
static UBool U_CALLCONV uchar_cleanup(void)
{
    if(propsData!=NULL) {
        udata_close(propsData);
        propsData=NULL;
    }

    /* these pointed into the data item that was just closed */
    pData32=NULL;
    propsVectors=NULL;
    countPropsVectors=0;

    uprv_memset(dataVersion, 0, U_MAX_VERSION_LENGTH);

    /* back to "not loaded, no error" */
    dataErrorCode=U_ZERO_ERROR;
    havePropsData=0;

    return TRUE;
}
/*
 * Temporary holder that _openProps() fills in; uprv_loadPropsData() copies
 * its contents into the file-level variables under the global mutex.
 */
struct UCharProps {
/* the opened data item */
UDataMemory *propsData;
/* tries unserialized from the data item: main properties, properties vectors */
UTrie propsTrie, propsVectorsTrie;
/* start of the data item's memory, viewed as 32-bit words */
const uint32_t *pData32;
};
typedef struct UCharProps UCharProps;
/* open uprops.icu */
/*
 * Open the uprops data item and unserialize its two tries into ucp.
 * On failure *pErrorCode is set and the function returns early; if the item
 * itself was opened, ucp->propsData still refers to it and the caller is
 * responsible for closing it.
 */
static void
_openProps(UCharProps *ucp, UErrorCode *pErrorCode) {
const uint32_t *p;
int32_t length;
ucp->propsData=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
ucp->pData32=p=(const uint32_t *)udata_getMemory(ucp->propsData);
/* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */
/* p[UPROPS_PROPS32_INDEX] is a 32-bit-word offset; *4 converts it to bytes */
length=(int32_t)p[UPROPS_PROPS32_INDEX]*4;
/* NOTE(review): the 64 subtracted here is presumably the byte size of indexes[] (UPROPS_INDEX_COUNT*4) - confirm */
length=utrie_unserialize(&ucp->propsTrie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
/* unserialize the properties vectors trie */
/* its byte length is the distance between two word offsets from the indexes */
length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4;
if(length>0) {
length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode);
}
if(length<=0 || U_FAILURE(*pErrorCode)) {
/*
* length==0:
* Allow the properties vectors trie to be missing -
* also requires propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]
* to be zero so that this trie is never accessed.
*/
/* also reached when unserializing failed; *pErrorCode then remains a failure */
uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie));
}
}
#endif
#if !UCHAR_HARDCODE_DATA
/*
 * Load the properties data once and publish it in the file-level variables.
 * Returns havePropsData: >0 when data is available, <0 when loading failed,
 * 0 only when nothing was attempted because *pErrorCode was already a failure.
 * A load failure is stored in dataErrorCode and also left in *pErrorCode.
 */
static int8_t
uprv_loadPropsData(UErrorCode *pErrorCode) {
/* load Unicode character properties data from file if necessary */
/*
* This lazy initialization with double-checked locking (without mutex protection for
* havePropsData==0) is transiently unsafe under certain circumstances.
* Check the readme and use u_init() if necessary.
*/
if(havePropsData==0) {
UCharProps ucp={ NULL };
if(U_FAILURE(*pErrorCode)) {
return havePropsData;
}
/* open the data outside the mutex block */
_openProps(&ucp, pErrorCode);
if(U_SUCCESS(*pErrorCode)) {
/* in the mutex block, set the data for this process */
umtx_lock(NULL);
if(propsData==NULL) {
/* take ownership of the data item; ucp.propsData becomes NULL so it is not closed below */
propsData=ucp.propsData;
ucp.propsData=NULL;
pData32=ucp.pData32;
ucp.pData32=NULL;
uprv_memcpy(&propsTrie, &ucp.propsTrie, sizeof(propsTrie));
uprv_memcpy(&propsVectorsTrie, &ucp.propsVectorsTrie, sizeof(propsVectorsTrie));
}
/* initialize some variables */
uprv_memcpy(indexes, pData32, sizeof(indexes));
/* additional properties */
if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) {
propsVectors=pData32+indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
countPropsVectors=indexes[UPROPS_RESERVED_INDEX]-indexes[UPROPS_ADDITIONAL_VECTORS_INDEX];
propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX];
}
havePropsData=1;
umtx_unlock(NULL);
} else {
/* NOTE(review): unlike the success path, these globals are written without holding the mutex */
dataErrorCode=*pErrorCode;
havePropsData=-1;
}
/* the cleanup function is registered after both success and failure */
ucln_common_registerCleanup(UCLN_COMMON_UCHAR, uchar_cleanup);
/* if a different thread set it first, then close the extra data */
udata_close(ucp.propsData); /* NULL if it was set correctly */
}
return havePropsData;
}
/*
 * Convenience wrapper around uprv_loadPropsData() for callers that have no
 * error code: starts from U_ZERO_ERROR, discards the resulting error code,
 * and returns the load state.
 */
static int8_t
loadPropsData(void) {
    UErrorCode status=U_ZERO_ERROR;
    return uprv_loadPropsData(&status);
}
#endif
/* constants and macros for access to the data ------------------------------ */
/* getting a uint32_t properties word from the data */
#if UCHAR_HARDCODE_DATA
/* hardcoded data: read the 16-bit trie value for c directly; note the trailing semicolon in the expansion */
#define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c));
#else
/* true if the data is already loaded, or if loading it now succeeds */
#define HAVE_DATA (havePropsData>0 || loadPropsData()>0)
/* read the trie value for c without checking that the data is loaded */
#define GET_PROPS_UNSAFE(c, result) \
UTRIE_GET16(&propsTrie, c, result);
/* read the trie value for c, or set result to 0 when no data is available; expands to an if/else statement */
#define GET_PROPS(c, result) \
if(HAVE_DATA) { \
GET_PROPS_UNSAFE(c, result); \
} else { \
(result)=0; \
}
#endif
/*
 * Report whether the character properties data is available.
 * Returns FALSE if *pErrorCode is already a failure on entry.
 * With hardcoded data (UCHAR_HARDCODE_DATA) it otherwise always returns TRUE.
 * Without hardcoded data it triggers loading if that has not been attempted
 * yet, and returns FALSE with *pErrorCode set to the stored load error when
 * loading failed.
 */
U_CFUNC UBool
uprv_haveProperties(UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
return FALSE;
}
#if !UCHAR_HARDCODE_DATA
/* havePropsData: 0 = not loaded yet, <0 = load failed, >0 = loaded */
if(havePropsData==0) {
uprv_loadPropsData(pErrorCode);
}
if(havePropsData<0) {
*pErrorCode=dataErrorCode;
return FALSE;
}
#endif
return TRUE;
}
@ -291,11 +82,7 @@ U_CAPI void U_EXPORT2
u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) {
struct _EnumTypeCallback callback;
if(enumRange==NULL
#if !UCHAR_HARDCODE_DATA
|| !HAVE_DATA
#endif
) {
if(enumRange==NULL) {
return;
}
@ -706,9 +493,6 @@ u_getUnicodeProperties(UChar32 c, int32_t column) {
GET_PROPS(c, props);
return props;
} else if(
#if !UCHAR_HARDCODE_DATA
!HAVE_DATA || countPropsVectors==0 ||
#endif
column<0 || column>=propsVectorsColumns
) {
return 0;
@ -720,22 +504,14 @@ u_getUnicodeProperties(UChar32 c, int32_t column) {
U_CFUNC int32_t
uprv_getMaxValues(int32_t column) {
#if !UCHAR_HARDCODE_DATA
if(HAVE_DATA) {
#endif
switch(column) {
case 0:
return indexes[UPROPS_MAX_VALUES_INDEX];
case 2:
return indexes[UPROPS_MAX_VALUES_2_INDEX];
default:
return 0;
}
#if !UCHAR_HARDCODE_DATA
} else {
switch(column) {
case 0:
return indexes[UPROPS_MAX_VALUES_INDEX];
case 2:
return indexes[UPROPS_MAX_VALUES_2_INDEX];
default:
return 0;
}
#endif
}
U_CAPI void U_EXPORT2
@ -750,6 +526,7 @@ u_charAge(UChar32 c, UVersionInfo versionArray) {
U_CAPI UScriptCode U_EXPORT2
uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
uint32_t scriptX;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return USCRIPT_INVALID_CODE;
}
@ -757,8 +534,81 @@ uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return USCRIPT_INVALID_CODE;
}
scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
return (UScriptCode)scriptX;
} else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
return USCRIPT_COMMON;
} else if(scriptX<UPROPS_SCRIPT_X_WITH_OTHER) {
return USCRIPT_INHERITED;
} else {
return (UScriptCode)scriptExtensions[scriptX&UPROPS_SCRIPT_MASK];
}
}
return (UScriptCode)(u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_MASK);
/*
 * Test whether code point c has script sc, either as its single script value
 * or as a member of its script extensions list.
 *
 * c   code point to look up
 * sc  script code to test for
 *
 * Returns TRUE if sc equals the script of c or occurs in the extensions list
 * of c, otherwise FALSE. Script codes outside 0..0x7fff cannot be stored in
 * an extensions list and always yield FALSE for that part of the test.
 */
U_DRAFT UBool U_EXPORT2
uscript_hasScript(UChar32 c, UScriptCode sc) {
    UScriptCode script;
    const uint16_t *scx;
    uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;

    if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
        /* no extensions: the value is the script code itself */
        return sc==(UScriptCode)scriptX;
    }

    /* the low bits index into scriptExtensions[] */
    scx=scriptExtensions+(scriptX&UPROPS_SCRIPT_MASK);
    if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
        script=USCRIPT_COMMON;
    } else if(scriptX<UPROPS_SCRIPT_X_WITH_OTHER) {
        script=USCRIPT_INHERITED;
    } else {
        /* indirect entry: scx[0] is the script, scx[1] the offset of the list */
        script=(UScriptCode)scx[0];
        scx=scriptExtensions+scx[1];
    }
    if(sc==script) {
        return TRUE;
    }

    /*
     * The search below stops at the first list entry that is >= sc and relies
     * on the final entry having bit 15 (0x8000) set. A bogus sc above 0x7fff
     * (or a negative one, if the enum is represented as unsigned) could compare
     * greater than that terminator and walk past the end of the list.
     */
    if((int32_t)sc<0 || (int32_t)sc>0x7fff) {
        return FALSE;
    }
    while(sc>*scx) {
        ++scx;
    }
    /* mask off the terminator bit before comparing */
    return sc==(*scx&0x7fff);
}
/*
 * Write the script extensions list of code point c into scripts[].
 *
 * c           code point to look up
 * scripts     output array; may be NULL only if capacity is 0
 * capacity    number of elements available in scripts; must not be negative
 * pErrorCode  in/out error code; nothing is done if it is NULL or already a failure
 *
 * Returns the number of script codes in the list, which may exceed capacity;
 * in that case only the first capacity entries are written and *pErrorCode is
 * set to U_BUFFER_OVERFLOW_ERROR. Returns 0 for a code point without an
 * extensions list, and 0 with U_ILLEGAL_ARGUMENT_ERROR for bad arguments.
 */
U_DRAFT int32_t U_EXPORT2
uscript_getScriptExtensions(UChar32 c,
                            UScriptCode *scripts, int32_t capacity,
                            UErrorCode *pErrorCode) {
    const uint16_t *list;
    uint32_t scriptX;
    uint16_t entry;
    int32_t count;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(capacity<0 || (capacity>0 && scripts==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
    if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
        /* plain script code, no extensions */
        return 0;
    }

    list=scriptExtensions+(scriptX&UPROPS_SCRIPT_MASK);
    if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
        /* indirect entry: the second unit is the offset of the actual list */
        list=scriptExtensions+list[1];
    }

    /* copy entries up to and including the one with the 0x8000 terminator bit */
    count=0;
    for(;;) {
        entry=*list++;
        if(count<capacity) {
            scripts[count]=entry&0x7fff;
        }
        ++count;
        if(entry>=0x8000) {
            break;
        }
    }

    if(count>capacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return count;
}
U_CAPI UBlockCode U_EXPORT2
@ -784,13 +634,6 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
return;
}
#if !UCHAR_HARDCODE_DATA
if(!HAVE_DATA) {
*pErrorCode=dataErrorCode;
return;
}
#endif
/* add the start code point of each same-value range of the main trie */
utrie2_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa);
@ -851,13 +694,6 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
return;
}
#if !UCHAR_HARDCODE_DATA
if(!HAVE_DATA) {
*pErrorCode=dataErrorCode;
return;
}
#endif
/* add the start code point of each same-value range of the properties vectors trie */
if(propsVectorsColumns>0) {
/* if propsVectorsColumns==0 then the properties vectors trie may not be there at all */

File diff suppressed because it is too large Load diff

View file

@ -46,9 +46,6 @@ typedef enum ECleanupCommonType {
UCLN_COMMON_UNAMES,
UCLN_COMMON_PNAME,
UCLN_COMMON_UPROPS,
UCLN_COMMON_UBIDI,
UCLN_COMMON_UCASE,
UCLN_COMMON_UCHAR,
UCLN_COMMON_UCNV,
UCLN_COMMON_UCNV_IO,
UCLN_COMMON_UDATA,

View file

@ -39,7 +39,7 @@ U_CDECL_BEGIN
* @see u_getUnicodeVersion
* @stable ICU 2.0
*/
#define U_UNICODE_VERSION "5.2"
#define U_UNICODE_VERSION "6.0"
/**
* \file
@ -392,21 +392,21 @@ typedef enum UProperty {
See the uchar.h file documentation.
@stable ICU 3.4 */
UCHAR_POSIX_XDIGIT=48,
/** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @draft ICU 4.4 */
/** Binary property Cased. For Lowercase, Uppercase and Titlecase characters. @stable ICU 4.4 */
UCHAR_CASED=49,
/** Binary property Case_Ignorable. Used in context-sensitive case mappings. @draft ICU 4.4 */
/** Binary property Case_Ignorable. Used in context-sensitive case mappings. @stable ICU 4.4 */
UCHAR_CASE_IGNORABLE=50,
/** Binary property Changes_When_Lowercased. @draft ICU 4.4 */
/** Binary property Changes_When_Lowercased. @stable ICU 4.4 */
UCHAR_CHANGES_WHEN_LOWERCASED=51,
/** Binary property Changes_When_Uppercased. @draft ICU 4.4 */
/** Binary property Changes_When_Uppercased. @stable ICU 4.4 */
UCHAR_CHANGES_WHEN_UPPERCASED=52,
/** Binary property Changes_When_Titlecased. @draft ICU 4.4 */
/** Binary property Changes_When_Titlecased. @stable ICU 4.4 */
UCHAR_CHANGES_WHEN_TITLECASED=53,
/** Binary property Changes_When_Casefolded. @draft ICU 4.4 */
/** Binary property Changes_When_Casefolded. @stable ICU 4.4 */
UCHAR_CHANGES_WHEN_CASEFOLDED=54,
/** Binary property Changes_When_Casemapped. @draft ICU 4.4 */
/** Binary property Changes_When_Casemapped. @stable ICU 4.4 */
UCHAR_CHANGES_WHEN_CASEMAPPED=55,
/** Binary property Changes_When_NFKC_Casefolded. @draft ICU 4.4 */
/** Binary property Changes_When_NFKC_Casefolded. @stable ICU 4.4 */
UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED=56,
/** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */
UCHAR_BINARY_LIMIT=57,
@ -560,6 +560,20 @@ typedef enum UProperty {
/** One more than the last constant for string Unicode properties. @stable ICU 2.4 */
UCHAR_STRING_LIMIT=0x400D,
/** Provisional property Script_Extensions (new in Unicode 6.0).
As a provisional property, it may be modified or removed
in future versions of the Unicode Standard, and thus in ICU.
Some characters are commonly used in multiple scripts.
For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
Corresponds to uscript_hasScript and uscript_getScriptExtensions in uscript.h.
@draft ICU 4.6 */
UCHAR_SCRIPT_EXTENSIONS=0x7000,
/** First constant for Unicode properties with unusual value types. @draft ICU 4.6 */
UCHAR_OTHER_PROPERTY_START=UCHAR_SCRIPT_EXTENSIONS,
/** One more than the last constant for Unicode properties with unusual value types.
* @draft ICU 4.6 */
UCHAR_OTHER_PROPERTY_LIMIT=0x7001,
/** Represents a nonexistent or invalid property or property value. @stable ICU 2.4 */
UCHAR_INVALID_CODE = -1
} UProperty;
@ -1287,61 +1301,88 @@ enum UBlockCode {
/* New blocks in Unicode 5.2 */
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_SAMARITAN = 172, /*[0800]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173, /*[18B0]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_TAI_THAM = 174, /*[1A20]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_VEDIC_EXTENSIONS = 175, /*[1CD0]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_LISU = 176, /*[A4D0]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_BAMUM = 177, /*[A6A0]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_COMMON_INDIC_NUMBER_FORMS = 178, /*[A830]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_DEVANAGARI_EXTENDED = 179, /*[A8E0]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_HANGUL_JAMO_EXTENDED_A = 180, /*[A960]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_JAVANESE = 181, /*[A980]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_MYANMAR_EXTENDED_A = 182, /*[AA60]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_TAI_VIET = 183, /*[AA80]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_MEETEI_MAYEK = 184, /*[ABC0]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_HANGUL_JAMO_EXTENDED_B = 185, /*[D7B0]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_IMPERIAL_ARAMAIC = 186, /*[10840]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_OLD_SOUTH_ARABIAN = 187, /*[10A60]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_AVESTAN = 188, /*[10B00]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_INSCRIPTIONAL_PARTHIAN = 189, /*[10B40]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_INSCRIPTIONAL_PAHLAVI = 190, /*[10B60]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_OLD_TURKIC = 191, /*[10C00]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_RUMI_NUMERAL_SYMBOLS = 192, /*[10E60]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_KAITHI = 193, /*[11080]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_EGYPTIAN_HIEROGLYPHS = 194, /*[13000]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195, /*[1F100]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196, /*[1F200]*/
/** @draft ICU 4.4 */
/** @stable ICU 4.4 */
UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197, /*[2A700]*/
/* New blocks in Unicode 6.0 */
/** @stable ICU 4.6 */
UBLOCK_MANDAIC = 198, /*[0840]*/
/** @stable ICU 4.6 */
UBLOCK_BATAK = 199, /*[1BC0]*/
/** @stable ICU 4.6 */
UBLOCK_ETHIOPIC_EXTENDED_A = 200, /*[AB00]*/
/** @stable ICU 4.6 */
UBLOCK_BRAHMI = 201, /*[11000]*/
/** @stable ICU 4.6 */
UBLOCK_BAMUM_SUPPLEMENT = 202, /*[16800]*/
/** @stable ICU 4.6 */
UBLOCK_KANA_SUPPLEMENT = 203, /*[1B000]*/
/** @stable ICU 4.6 */
UBLOCK_PLAYING_CARDS = 204, /*[1F0A0]*/
/** @stable ICU 4.6 */
UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205, /*[1F300]*/
/** @stable ICU 4.6 */
UBLOCK_EMOTICONS = 206, /*[1F600]*/
/** @stable ICU 4.6 */
UBLOCK_TRANSPORT_AND_MAP_SYMBOLS = 207, /*[1F680]*/
/** @stable ICU 4.6 */
UBLOCK_ALCHEMICAL_SYMBOLS = 208, /*[1F700]*/
/** @stable ICU 4.6 */
UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209, /*[2B740]*/
/** @stable ICU 2.0 */
UBLOCK_COUNT = 198,
UBLOCK_COUNT = 210,
/** @stable ICU 2.0 */
UBLOCK_INVALID_CODE=-1
@ -1386,7 +1427,7 @@ typedef enum UCharNameChoice {
U_UNICODE_CHAR_NAME,
U_UNICODE_10_CHAR_NAME,
U_EXTENDED_CHAR_NAME,
U_CHAR_NAME_ALIAS, /**< Corrected name from NameAliases.txt. @draft ICU 4.4 */
U_CHAR_NAME_ALIAS, /**< Corrected name from NameAliases.txt. @stable ICU 4.4 */
U_CHAR_NAME_CHOICE_COUNT
} UCharNameChoice;
@ -1474,7 +1515,8 @@ typedef enum UJoiningGroup {
U_JG_GAF,
U_JG_GAMAL,
U_JG_HAH,
U_JG_HAMZA_ON_HEH_GOAL,
U_JG_TEH_MARBUTA_GOAL, /**< @stable ICU 4.6 */
U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
U_JG_HE,
U_JG_HEH,
U_JG_HEH_GOAL,
@ -1515,8 +1557,8 @@ typedef enum UJoiningGroup {
U_JG_KHAPH, /**< @stable ICU 2.6 */
U_JG_ZHAIN, /**< @stable ICU 2.6 */
U_JG_BURUSHASKI_YEH_BARREE, /**< @stable ICU 4.0 */
U_JG_FARSI_YEH, /**< @draft ICU 4.4 */
U_JG_NYA, /**< @draft ICU 4.4 */
U_JG_FARSI_YEH, /**< @stable ICU 4.4 */
U_JG_NYA, /**< @stable ICU 4.4 */
U_JG_COUNT
} UJoiningGroup;

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1997-2009, International Business Machines
* Copyright (C) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -45,7 +45,7 @@
*/
typedef enum UScriptCode {
USCRIPT_INVALID_CODE = -1,
USCRIPT_COMMON = 0 , /* Zyyy */
USCRIPT_COMMON = 0, /* Zyyy */
USCRIPT_INHERITED = 1, /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */
USCRIPT_ARABIC = 2, /* Arab */
USCRIPT_ARMENIAN = 3, /* Armn */
@ -107,7 +107,7 @@ typedef enum UScriptCode {
/** New script code in Unicode 4.0.1 @stable ICU 3.0 */
USCRIPT_KATAKANA_OR_HIRAGANA = 54,/*Hrkt */
/* New scripts in Unicode 4.1 @stable ICU 3.4 */
USCRIPT_BUGINESE = 55, /* Bugi */
USCRIPT_GLAGOLITIC = 56, /* Glag */
@ -140,9 +140,15 @@ typedef enum UScriptCode {
USCRIPT_LATIN_GAELIC = 81, /* Latg */
USCRIPT_LEPCHA = 82, /* Lepc */
USCRIPT_LINEAR_A = 83, /* Lina */
USCRIPT_MANDAEAN = 84, /* Mand */
/** @stable ICU 4.6 */
USCRIPT_MANDAIC = 84, /* Mand */
/** @stable ICU 3.6 */
USCRIPT_MANDAEAN = USCRIPT_MANDAIC,
USCRIPT_MAYAN_HIEROGLYPHS = 85, /* Maya */
USCRIPT_MEROITIC = 86, /* Mero */
/** @stable ICU 4.6 */
USCRIPT_MEROITIC_HIEROGLYPHS = 86, /* Mero */
/** @stable ICU 3.6 */
USCRIPT_MEROITIC = USCRIPT_MEROITIC_HIEROGLYPHS,
USCRIPT_NKO = 87, /* Nkoo */
USCRIPT_ORKHON = 88, /* Orkh */
USCRIPT_OLD_PERMIC = 89, /* Perm */
@ -191,14 +197,29 @@ typedef enum UScriptCode {
USCRIPT_MATHEMATICAL_NOTATION = 128,/* Zmth */
USCRIPT_SYMBOLS = 129,/* Zsym */
/* New script codes from ISO 15924 @draft ICU 4.4 */
/* New script codes from ISO 15924 @stable ICU 4.4 */
USCRIPT_BAMUM = 130,/* Bamu */
USCRIPT_LISU = 131,/* Lisu */
USCRIPT_NAKHI_GEBA = 132,/* Nkgb */
USCRIPT_OLD_SOUTH_ARABIAN = 133,/* Sarb */
/* Private use codes from Qaaa - Qabx are not supported*/
USCRIPT_CODE_LIMIT = 134
/* New script codes from ISO 15924 @stable ICU 4.6 */
USCRIPT_BASSA_VAH = 134,/* Bass */
USCRIPT_DUPLOYAN_SHORTAND = 135,/* Dupl */
USCRIPT_ELBASAN = 136,/* Elba */
USCRIPT_GRANTHA = 137,/* Gran */
USCRIPT_KPELLE = 138,/* Kpel */
USCRIPT_LOMA = 139,/* Loma */
USCRIPT_MENDE = 140,/* Mend */
USCRIPT_MEROITIC_CURSIVE = 141,/* Merc */
USCRIPT_OLD_NORTH_ARABIAN = 142,/* Narb */
USCRIPT_NABATAEAN = 143,/* Nbat */
USCRIPT_PALMYRENE = 144,/* Palm */
USCRIPT_SINDHI = 145,/* Sind */
USCRIPT_WARANG_CITI = 146,/* Wara */
/* Private use codes from Qaaa - Qabx are not supported */
USCRIPT_CODE_LIMIT = 147
} UScriptCode;
/**
@ -244,7 +265,7 @@ uscript_getName(UScriptCode scriptCode);
U_STABLE const char* U_EXPORT2
uscript_getShortName(UScriptCode scriptCode);
/**
/**
* Gets the script code associated with the given codepoint.
* Returns USCRIPT_MALAYALAM given 0x0D02
* @param codepoint UChar32 codepoint
@ -255,6 +276,51 @@ uscript_getShortName(UScriptCode scriptCode);
U_STABLE UScriptCode U_EXPORT2
uscript_getScript(UChar32 codepoint, UErrorCode *err);
/**
* Is code point c used in script sc?
* That is, does code point c have the Script property value sc,
* or do code point c's Script_Extensions include script code sc?
*
* Some characters are commonly used in multiple scripts.
* For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
*
* The Script_Extensions property is provisional. It may be modified or removed
* in future versions of the Unicode Standard, and thus in ICU.
* @param c code point
* @param sc script code
* @return TRUE if Script(c)==sc or sc is in Script_Extensions(c)
* @draft ICU 4.6
*/
U_DRAFT UBool U_EXPORT2
uscript_hasScript(UChar32 c, UScriptCode sc);
/**
* Writes code point c's Script_Extensions as a list of UScriptCode values
* to the output scripts array.
*
* Some characters are commonly used in multiple scripts.
* For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
*
* If there are more than capacity script codes to be written, then
* U_BUFFER_OVERFLOW_ERROR is set and the number of Script_Extensions is returned.
* (Usual ICU buffer handling behavior.)
*
* The Script_Extensions property is provisional. It may be modified or removed
* in future versions of the Unicode Standard, and thus in ICU.
* @param c code point
* @param scripts output script code array
* @param capacity capacity of the scripts array
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return number of script codes in c's Script_Extensions,
* written to scripts unless U_BUFFER_OVERFLOW_ERROR indicates insufficient capacity
* @draft ICU 4.6
*/
U_DRAFT int32_t U_EXPORT2
uscript_getScriptExtensions(UChar32 c,
UScriptCode *scripts, int32_t capacity,
UErrorCode *pErrorCode);
#endif

View file

@ -210,7 +210,7 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
if(U_SUCCESS(status)) {
impl->addPropertyStarts(&sa, status);
}
ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
break;
}
case UPROPS_SRC_NFC: {
@ -243,10 +243,10 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
}
#endif
case UPROPS_SRC_CASE:
ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
break;
case UPROPS_SRC_BIDI:
ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
break;
default:
status = U_INTERNAL_PROGRAM_ERROR;
@ -929,9 +929,10 @@ static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
}
static UBool versionFilter(UChar32 ch, void* context) {
UVersionInfo v, none = { 0, 0, 0, 0};
UVersionInfo* version = (UVersionInfo*)context;
static const UVersionInfo none = { 0, 0, 0, 0 };
UVersionInfo v;
u_charAge(ch, v);
UVersionInfo* version = (UVersionInfo*)context;
return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}
@ -945,6 +946,9 @@ static UBool intPropertyFilter(UChar32 ch, void* context) {
return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
}
/**
 * Filter callback for the Script_Extensions property.
 * context must point to the UScriptCode being tested; the result is
 * whatever uscript_hasScript() reports for ch and that script.
 */
static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
    const UScriptCode *wanted = (const UScriptCode *)context;
    return uscript_hasScript(ch, *wanted);
}
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
@ -953,20 +957,17 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
void* context,
int32_t src,
UErrorCode &status) {
// Walk through all Unicode characters, noting the start
if (U_FAILURE(status)) return;
// Logically, walk through all Unicode characters, noting the start
// and end of each range for which filter.contain(c) is
// true. Add each range to a set.
//
// To improve performance, use the INCLUSIONS set, which
// To improve performance, use an inclusions set which
// encodes information about character ranges that are known
// to have identical properties. INCLUSIONS contains
// only the first characters of such ranges.
//
// TODO Where possible, instead of scanning over code points,
// use internal property data to initialize UnicodeSets for
// those properties. Scanning code points is slow.
if (U_FAILURE(status)) return;
// to have identical properties.
// getInclusions(src) contains exactly the first characters of
// same-value ranges for the given properties "source".
const UnicodeSet* inclusions = getInclusions(src, status);
if (U_FAILURE(status)) {
return;
@ -1034,6 +1035,9 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec)
if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
} else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
UScriptCode script = (UScriptCode)value;
applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
} else {
IntPropertyContext c = {prop, value};
applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
@ -1146,6 +1150,13 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
return *this;
}
break;
case UCHAR_SCRIPT_EXTENSIONS:
v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
if (v == UCHAR_INVALID_CODE) {
FAIL(ec);
}
// fall through to calling applyIntPropertyValue()
break;
default:
// p is a non-binary, non-enumerated property that we
// don't support (yet).
@ -1183,7 +1194,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
}
}
}
applyIntPropertyValue(p, v, ec);
if(invert) {
complement();
@ -1395,9 +1406,8 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
return *this;
}
if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
UErrorCode status = U_ZERO_ERROR;
const UCaseProps *csp = ucase_getSingleton(&status);
if (U_SUCCESS(status)) {
const UCaseProps *csp = ucase_getSingleton();
{
UnicodeSet foldSet(*this);
UnicodeString str;
USetAdder sa = {
@ -1460,6 +1470,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
} else {
Locale root("");
#if !UCONFIG_NO_BREAK_ITERATION
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
#endif
if (U_SUCCESS(status)) {

View file

@ -99,14 +99,7 @@ UnicodeString::caseMap(BreakIterator *titleIter,
return *this;
}
UErrorCode errorCode;
errorCode = U_ZERO_ERROR;
const UCaseProps *csp=ucase_getSingleton(&errorCode);
if(U_FAILURE(errorCode)) {
setToBogus();
return *this;
}
const UCaseProps *csp=ucase_getSingleton();
// We need to allocate a new buffer for the internal string case mapping function.
// This is very similar to how doReplace() keeps the old array pointer
@ -138,6 +131,7 @@ UnicodeString::caseMap(BreakIterator *titleIter,
}
// Case-map, and if the result is too long, then reallocate and repeat.
UErrorCode errorCode;
int32_t newLength;
do {
errorCode = U_ZERO_ERROR;

View file

@ -184,7 +184,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
nfcImpl=NULL;
}
if((options&U_COMPARE_IGNORE_CASE)!=0) {
csp=ucase_getSingleton(pErrorCode);
csp=ucase_getSingleton();
} else {
csp=NULL;
}

View file

@ -38,125 +38,231 @@
U_NAMESPACE_USE
/* cleanup ------------------------------------------------------------------ */
static const UBiDiProps *gBdp=NULL;
/*
 * Cleanup callback for this file: drops the cached UBiDiProps pointer
 * by resetting gBdp to NULL. Always returns TRUE.
 */
static UBool U_CALLCONV uprops_cleanup(void) {
gBdp=NULL;
return TRUE;
}
/* bidi/shaping properties API ---------------------------------------------- */
/*
 * Get the UBiDiProps singleton, or else its dummy, once and for all.
 *
 * The pointer is cached in gBdp, and uprops_cleanup() is registered via
 * ucln_common_registerCleanup() the first time the cache is filled.
 *
 * Returns the cached pointer. When UBIDI_HARDCODE_DATA is off and neither
 * ubidi_getSingleton() nor ubidi_getDummy() succeeds, returns NULL without
 * touching the cache.
 */
static const UBiDiProps *
getBiDiProps() {
/*
* This lazy initialization with double-checked locking (without mutex protection for
* the initial check) is transiently unsafe under certain circumstances.
* Check the readme and use u_init() if necessary.
*/
/* the initial check is performed by the GET_BIDI_PROPS() macro */
const UBiDiProps *bdp;
UErrorCode errorCode=U_ZERO_ERROR;
bdp=ubidi_getSingleton(&errorCode);
#if !UBIDI_HARDCODE_DATA
/* data is loaded at runtime and may be missing: fall back to the dummy */
if(U_FAILURE(errorCode)) {
errorCode=U_ZERO_ERROR;
bdp=ubidi_getDummy(&errorCode);
if(U_FAILURE(errorCode)) {
return NULL;
}
}
#endif
/* publish the pointer under the global mutex; only the first caller stores it */
umtx_lock(NULL);
if(gBdp==NULL) {
gBdp=bdp;
ucln_common_registerCleanup(UCLN_COMMON_UPROPS, uprops_cleanup);
}
umtx_unlock(NULL);
return gBdp;
}
/* see comment for GET_CASE_PROPS() */
#define GET_BIDI_PROPS() (gBdp!=NULL ? gBdp : getBiDiProps())
#define GET_BIDI_PROPS() ubidi_getSingleton()
/* general properties API functions ----------------------------------------- */
static const struct {
int32_t column;
struct BinaryProperty;
typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which);
struct BinaryProperty {
int32_t column; // SRC_PROPSVEC column, or "source" if mask==0
uint32_t mask;
} binProps[UCHAR_BINARY_LIMIT]={
BinaryPropertyContains *contains;
};
/**
 * Default test for binary properties that are stored as a single bit:
 * reads the word in prop.column for code point c and checks whether any
 * bit selected by prop.mask is set.
 */
static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) {
    if((u_getUnicodeProperties(c, prop.column)&prop.mask)!=0) {
        return TRUE;
    }
    return FALSE;
}
/*
 * Binary properties answered by the case-mapping code: forwards c and which
 * to ucase_hasBinaryProperty(). The table entry itself is not consulted.
 */
static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) {
return ucase_hasBinaryProperty(c, which);
}
/* Returns ubidi_isBidiControl() for c, using the properties from GET_BIDI_PROPS(). */
static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return ubidi_isBidiControl(GET_BIDI_PROPS(), c);
}
/* Returns ubidi_isMirrored() for c, using the properties from GET_BIDI_PROPS(). */
static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return ubidi_isMirrored(GET_BIDI_PROPS(), c);
}
/* Returns ubidi_isJoinControl() for c, using the properties from GET_BIDI_PROPS(). */
static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return ubidi_isJoinControl(GET_BIDI_PROPS(), c);
}
/**
 * Full_Composition_Exclusion test, answered from the NFC implementation data.
 * Returns FALSE when normalization is compiled out or the NFC data
 * cannot be obtained.
 */
static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
#if !UCONFIG_NO_NORMALIZATION
    // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
    UErrorCode status=U_ZERO_ERROR;
    const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(status);
    if(U_FAILURE(status)) {
        return FALSE;
    }
    return nfcImpl->isCompNo(nfcImpl->getNorm16(c)) ? TRUE : FALSE;
#else
    return FALSE;
#endif
}
/**
 * Shared test for the UCHAR_NF*_INERT properties.
 * The normalization mode is derived from the property's offset relative to
 * UCHAR_NFD_INERT; the matching Normalizer2 instance then decides whether c
 * is inert. Returns FALSE when normalization is compiled out or the
 * instance cannot be obtained.
 */
static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) {
#if !UCONFIG_NO_NORMALIZATION
    UErrorCode status=U_ZERO_ERROR;
    const UNormalizationMode mode=(UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD);
    const Normalizer2 *normalizer=Normalizer2Factory::getInstance(mode, status);
    if(U_FAILURE(status)) {
        return FALSE;
    }
    return normalizer->isInert(c) ? TRUE : FALSE;
#else
    return FALSE;
#endif
}
/*
 * Test for the property whose table entry uses UPROPS_SRC_CASE_AND_NORM
 * (Changes_When_Casefolded).
 *
 * Looks up the decomposition of c through the NFC normalizer instance.
 * If the decomposition is a single code point (or c has none), the answer
 * comes from ucase_toFullFolding() on that code point. Otherwise the whole
 * decomposition string is case-folded and compared with the unfolded string.
 *
 * Returns FALSE when normalization is compiled out, when the normalizer
 * cannot be obtained, or when c is negative and has no decomposition.
 */
static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
#if UCONFIG_NO_NORMALIZATION
return FALSE;
#else
UnicodeString nfd;
UErrorCode errorCode=U_ZERO_ERROR;
const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
if(U_FAILURE(errorCode)) {
return FALSE;
}
if(nfcNorm2->getDecomposition(c, nfd)) {
/* c has a decomposition */
if(nfd.length()==1) {
c=nfd[0]; /* single BMP code point */
} else if(nfd.length()<=U16_MAX_LENGTH &&
/* note: c is assigned inside this condition; if the lengths differ, */
/* the final else branch overwrites it with U_SENTINEL */
nfd.length()==U16_LENGTH(c=nfd.char32At(0))
) {
/* single supplementary code point */
} else {
c=U_SENTINEL;
}
} else if(c<0) {
return FALSE; /* protect against bad input */
}
if(c>=0) {
/* single code point */
/* NOTE(review): presumably a negative result means "no folding change" - confirm in ucase.h */
const UCaseProps *csp=ucase_getSingleton();
const UChar *resultString;
return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
} else {
/* multi-code point decomposition: fold the string and see whether it changed */
/* guess some large but stack-friendly capacity */
UChar dest[2*UCASE_MAX_STRING_LENGTH];
int32_t destLength;
destLength=u_strFoldCase(dest, LENGTHOF(dest),
nfd.getBuffer(), nfd.length(),
U_FOLD_CASE_DEFAULT, &errorCode);
/* a folding failure (errorCode set by u_strFoldCase) yields FALSE */
return (UBool)(U_SUCCESS(errorCode) &&
0!=u_strCompare(nfd.getBuffer(), nfd.length(),
dest, destLength, FALSE));
}
#endif
}
/**
 * Changes_When_NFKC_Casefolded test: runs the NFKC_Casefold data's compose()
 * over the one-code-point string for c and reports whether the output
 * differs from the input. Returns FALSE when normalization is compiled out
 * or when any step reports an error.
 */
static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
#if !UCONFIG_NO_NORMALIZATION
    UErrorCode status=U_ZERO_ERROR;
    const Normalizer2Impl *nfkcCF=Normalizer2Factory::getNFKC_CFImpl(status);
    if(U_FAILURE(status)) {
        return FALSE;
    }
    UnicodeString input(c);
    UnicodeString mapped;
    {
        // Keep the ReorderingBuffer in its own scope: its destructor
        // must hand the buffer back to 'mapped' before 'mapped' is inspected.
        ReorderingBuffer reordering(*nfkcCF, mapped);
        // NFKC_CF(c) is short, so a small initial capacity suffices.
        if(reordering.init(5, status)) {
            const UChar *start=input.getBuffer();
            const UChar *limit=start+input.length();
            nfkcCF->compose(start, limit, FALSE,
                            TRUE, reordering, status);
        }
    }
    return U_SUCCESS(status) && mapped!=input;
#else
    return FALSE;
#endif
}
/**
 * Segment-starter test, answered from the NFC implementation's
 * canonical-iterator data. Returns FALSE when normalization is compiled out,
 * when the NFC data cannot be obtained, or when the canonical-iterator data
 * cannot be ensured.
 */
static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
#if !UCONFIG_NO_NORMALIZATION
    UErrorCode status=U_ZERO_ERROR;
    const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(status);
    if(U_FAILURE(status) || !nfcImpl->ensureCanonIterData(status)) {
        return FALSE;
    }
    return nfcImpl->isCanonSegmentStarter(c) ? TRUE : FALSE;
#else
    return FALSE;
#endif
}
/* Table adapter: forwards c to u_isalnumPOSIX(); prop and which are unused. */
static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return u_isalnumPOSIX(c);
}
/* Table adapter: forwards c to u_isblank(); prop and which are unused. */
static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return u_isblank(c);
}
/* Table adapter: forwards c to u_isgraphPOSIX(); prop and which are unused. */
static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return u_isgraphPOSIX(c);
}
/* Table adapter: forwards c to u_isprintPOSIX(); prop and which are unused. */
static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return u_isprintPOSIX(c);
}
/* Table adapter: forwards c to u_isxdigit(); prop and which are unused. */
static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return u_isxdigit(c);
}
static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
/*
* column and mask values for binary properties from u_getUnicodeProperties().
* Must be in order of corresponding UProperty,
* and there must be exactly one entry per binary UProperty.
*
* Properties with mask 0 are handled in code.
* Properties with mask==0 and contains==NULL are handled in code.
* For them, column is the UPropertySource value.
*/
{ 1, U_MASK(UPROPS_ALPHABETIC) },
{ 1, U_MASK(UPROPS_ASCII_HEX_DIGIT) },
{ UPROPS_SRC_BIDI, 0 }, /* UCHAR_BIDI_CONTROL */
{ UPROPS_SRC_BIDI, 0 }, /* UCHAR_BIDI_MIRRORED */
{ 1, U_MASK(UPROPS_DASH) },
{ 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT) },
{ 1, U_MASK(UPROPS_DEPRECATED) },
{ 1, U_MASK(UPROPS_DIACRITIC) },
{ 1, U_MASK(UPROPS_EXTENDER) },
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */
{ 1, U_MASK(UPROPS_GRAPHEME_BASE) },
{ 1, U_MASK(UPROPS_GRAPHEME_EXTEND) },
{ 1, U_MASK(UPROPS_GRAPHEME_LINK) },
{ 1, U_MASK(UPROPS_HEX_DIGIT) },
{ 1, U_MASK(UPROPS_HYPHEN) },
{ 1, U_MASK(UPROPS_ID_CONTINUE) },
{ 1, U_MASK(UPROPS_ID_START) },
{ 1, U_MASK(UPROPS_IDEOGRAPHIC) },
{ 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR) },
{ 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR) },
{ UPROPS_SRC_BIDI, 0 }, /* UCHAR_JOIN_CONTROL */
{ 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION) },
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_LOWERCASE */
{ 1, U_MASK(UPROPS_MATH) },
{ 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT) },
{ 1, U_MASK(UPROPS_QUOTATION_MARK) },
{ 1, U_MASK(UPROPS_RADICAL) },
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_SOFT_DOTTED */
{ 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION) },
{ 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH) },
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_UPPERCASE */
{ 1, U_MASK(UPROPS_WHITE_SPACE) },
{ 1, U_MASK(UPROPS_XID_CONTINUE) },
{ 1, U_MASK(UPROPS_XID_START) },
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CASE_SENSITIVE */
{ 1, U_MASK(UPROPS_S_TERM) },
{ 1, U_MASK(UPROPS_VARIATION_SELECTOR) },
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_NFD_INERT */
{ UPROPS_SRC_NFKC, 0 }, /* UCHAR_NFKD_INERT */
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_NFC_INERT */
{ UPROPS_SRC_NFKC, 0 }, /* UCHAR_NFKC_INERT */
{ UPROPS_SRC_NFC_CANON_ITER, 0 }, /* UCHAR_SEGMENT_STARTER */
{ 1, U_MASK(UPROPS_PATTERN_SYNTAX) },
{ 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE) },
{ UPROPS_SRC_CHAR_AND_PROPSVEC, 0 }, /* UCHAR_POSIX_ALNUM */
{ UPROPS_SRC_CHAR, 0 }, /* UCHAR_POSIX_BLANK */
{ UPROPS_SRC_CHAR, 0 }, /* UCHAR_POSIX_GRAPH */
{ UPROPS_SRC_CHAR, 0 }, /* UCHAR_POSIX_PRINT */
{ UPROPS_SRC_CHAR, 0 }, /* UCHAR_POSIX_XDIGIT */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CASED */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CASE_IGNORABLE */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_LOWERCASED */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_UPPERCASED */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_TITLECASED */
{ UPROPS_SRC_CASE_AND_NORM, 0 }, /* UCHAR_CHANGES_WHEN_CASEFOLDED */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_CASEMAPPED */
{ UPROPS_SRC_NFKC_CF, 0 } /* UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED */
{ 1, U_MASK(UPROPS_ALPHABETIC), defaultContains },
{ 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains },
{ UPROPS_SRC_BIDI, 0, isBidiControl },
{ UPROPS_SRC_BIDI, 0, isMirrored },
{ 1, U_MASK(UPROPS_DASH), defaultContains },
{ 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains },
{ 1, U_MASK(UPROPS_DEPRECATED), defaultContains },
{ 1, U_MASK(UPROPS_DIACRITIC), defaultContains },
{ 1, U_MASK(UPROPS_EXTENDER), defaultContains },
{ UPROPS_SRC_NFC, 0, hasFullCompositionExclusion },
{ 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains },
{ 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains },
{ 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains },
{ 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains },
{ 1, U_MASK(UPROPS_HYPHEN), defaultContains },
{ 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains },
{ 1, U_MASK(UPROPS_ID_START), defaultContains },
{ 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains },
{ 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains },
{ 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains },
{ UPROPS_SRC_BIDI, 0, isJoinControl },
{ 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains },
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_LOWERCASE
{ 1, U_MASK(UPROPS_MATH), defaultContains },
{ 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains },
{ 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains },
{ 1, U_MASK(UPROPS_RADICAL), defaultContains },
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_SOFT_DOTTED
{ 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains },
{ 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains },
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_UPPERCASE
{ 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains },
{ 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains },
{ 1, U_MASK(UPROPS_XID_START), defaultContains },
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_SENSITIVE
{ 1, U_MASK(UPROPS_S_TERM), defaultContains },
{ 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains },
{ UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFD_INERT
{ UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKD_INERT
{ UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFC_INERT
{ UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKC_INERT
{ UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter },
{ 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains },
{ 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains },
{ UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum },
{ UPROPS_SRC_CHAR, 0, isPOSIX_blank },
{ UPROPS_SRC_CHAR, 0, isPOSIX_graph },
{ UPROPS_SRC_CHAR, 0, isPOSIX_print },
{ UPROPS_SRC_CHAR, 0, isPOSIX_xdigit },
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASED
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_IGNORABLE
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_LOWERCASED
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_UPPERCASED
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_TITLECASED
{ UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded },
{ UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED
{ UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded }
};
U_CAPI UBool U_EXPORT2
@ -164,158 +270,11 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
/* c is range-checked in the functions that are called from here */
if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) {
/* not a known binary property */
return FALSE;
} else {
uint32_t mask=binProps[which].mask;
int32_t column=binProps[which].column;
if(mask!=0) {
/* systematic, directly stored properties */
return (u_getUnicodeProperties(c, column)&mask)!=0;
} else {
if(column==UPROPS_SRC_CASE) {
return ucase_hasBinaryProperty(c, which);
} else if(column==UPROPS_SRC_NFC) {
#if !UCONFIG_NO_NORMALIZATION
UErrorCode errorCode=U_ZERO_ERROR;
switch(which) {
case UCHAR_FULL_COMPOSITION_EXCLUSION: {
// By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c));
break;
}
default: {
// UCHAR_NF[CD]_INERT properties
const Normalizer2 *norm2=Normalizer2Factory::getInstance(
(UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
return U_SUCCESS(errorCode) && norm2->isInert(c);
}
}
#endif
} else if(column==UPROPS_SRC_NFKC) {
#if !UCONFIG_NO_NORMALIZATION
// UCHAR_NFK[CD]_INERT properties
UErrorCode errorCode=U_ZERO_ERROR;
const Normalizer2 *norm2=Normalizer2Factory::getInstance(
(UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
return U_SUCCESS(errorCode) && norm2->isInert(c);
#endif
} else if(column==UPROPS_SRC_NFKC_CF) {
// currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
#if !UCONFIG_NO_NORMALIZATION
UErrorCode errorCode=U_ZERO_ERROR;
const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
if(U_SUCCESS(errorCode)) {
UnicodeString src(c);
UnicodeString dest;
{
// The ReorderingBuffer must be in a block because its destructor
// needs to release dest's buffer before we look at its contents.
ReorderingBuffer buffer(*kcf, dest);
// Small destCapacity for NFKC_CF(c).
if(buffer.init(5, errorCode)) {
const UChar *srcArray=src.getBuffer();
kcf->compose(srcArray, srcArray+src.length(), FALSE,
TRUE, buffer, errorCode);
}
}
return U_SUCCESS(errorCode) && dest!=src;
}
#endif
} else if(column==UPROPS_SRC_NFC_CANON_ITER) {
/* normalization properties from nfc.nrm canonical iterator data */
// UCHAR_SEGMENT_STARTER
#if !UCONFIG_NO_NORMALIZATION
UErrorCode errorCode=U_ZERO_ERROR;
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
return
U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) &&
impl->isCanonSegmentStarter(c);
#endif
} else if(column==UPROPS_SRC_BIDI) {
/* bidi/shaping properties */
const UBiDiProps *bdp=GET_BIDI_PROPS();
if(bdp!=NULL) {
switch(which) {
case UCHAR_BIDI_MIRRORED:
return ubidi_isMirrored(bdp, c);
case UCHAR_BIDI_CONTROL:
return ubidi_isBidiControl(bdp, c);
case UCHAR_JOIN_CONTROL:
return ubidi_isJoinControl(bdp, c);
default:
break;
}
}
/* else return FALSE below */
} else if(column==UPROPS_SRC_CHAR) {
switch(which) {
case UCHAR_POSIX_BLANK:
return u_isblank(c);
case UCHAR_POSIX_GRAPH:
return u_isgraphPOSIX(c);
case UCHAR_POSIX_PRINT:
return u_isprintPOSIX(c);
case UCHAR_POSIX_XDIGIT:
return u_isxdigit(c);
default:
break;
}
} else if(column==UPROPS_SRC_CHAR_AND_PROPSVEC) {
switch(which) {
case UCHAR_POSIX_ALNUM:
return u_isalnumPOSIX(c);
default:
break;
}
} else if(column==UPROPS_SRC_CASE_AND_NORM) {
#if !UCONFIG_NO_NORMALIZATION
UnicodeString nfd;
UErrorCode errorCode=U_ZERO_ERROR;
const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
if(U_FAILURE(errorCode)) {
return FALSE;
}
switch(which) {
case UCHAR_CHANGES_WHEN_CASEFOLDED:
if(nfcNorm2->getDecomposition(c, nfd)) {
/* c has a decomposition */
if(nfd.length()==1) {
c=nfd[0]; /* single BMP code point */
} else if(nfd.length()<=U16_MAX_LENGTH &&
nfd.length()==U16_LENGTH(c=nfd.char32At(0))
) {
/* single supplementary code point */
} else {
c=U_SENTINEL;
}
} else if(c<0) {
return FALSE; /* protect against bad input */
}
errorCode=U_ZERO_ERROR;
if(c>=0) {
/* single code point */
const UCaseProps *csp=ucase_getSingleton(&errorCode);
const UChar *resultString;
return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
} else {
/* guess some large but stack-friendly capacity */
UChar dest[2*UCASE_MAX_STRING_LENGTH];
int32_t destLength;
destLength=u_strFoldCase(dest, LENGTHOF(dest),
nfd.getBuffer(), nfd.length(),
U_FOLD_CASE_DEFAULT, &errorCode);
return (UBool)(U_SUCCESS(errorCode) &&
0!=u_strCompare(nfd.getBuffer(), nfd.length(),
dest, destLength, FALSE));
}
default:
break;
}
#endif
}
}
const BinaryProperty &prop=binProps[which];
return prop.contains(prop, c, which);
}
return FALSE;
}
#if !UCONFIG_NO_NORMALIZATION
@ -344,6 +303,70 @@ getFCD16(UChar32 c) {
#endif
// Forward declaration so that the getter function types below can refer to the struct.
struct IntProperty;
// Type of a function that returns the value of int property "which" for code point c;
// prop is the table entry describing that property.
typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which);
// Type of a function that returns the maximum value of int property "which";
// prop is the table entry describing that property.
typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which);
// Describes one int-valued property: where its data lives and which functions
// compute its value and its maximum value.
struct IntProperty {
// Properties-vector column passed to u_getUnicodeProperties()/uprv_getMaxValues()
// by the default getters, or the property "source" if mask==0.
int32_t column; // SRC_PROPSVEC column, or "source" if mask==0
// Bit mask selecting the property's field within the vector word.
uint32_t mask;
// Right-shift applied after masking by the default getters.
int32_t shift; // =maxValue if getMaxValueFromShift() is used
// Returns the property value for a code point.
IntPropertyGetValue *getValue;
// Returns the property's maximum value.
IntPropertyGetMaxValue *getMaxValue;
};
/*
 * Generic getter for systematic, directly stored properties:
 * fetch the word for prop.column, isolate the field with prop.mask,
 * then move it down by prop.shift.
 */
static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) {
    const int32_t maskedBits=(int32_t)(u_getUnicodeProperties(c, prop.column)&prop.mask);
    return maskedBits>>prop.shift;
}
/*
 * Generic maximum for a directly stored property: applies prop.mask and
 * prop.shift to the word that uprv_getMaxValues() returns for prop.column.
 */
static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) {
    return (int32_t)((uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift);
}
/*
 * Maximum-value getter for table entries that keep their fixed maximum
 * in the shift field.
 */
static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) {
    const int32_t storedMaximum=prop.shift;
    return storedMaximum;
}
/* Value getter that forwards to u_charDirection(). */
static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    const int32_t direction=(int32_t)u_charDirection(c);
    return direction;
}
/* Maximum-value getter that asks the bidi properties data for the maximum of "which". */
static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) {
    const int32_t maximum=ubidi_getMaxValue(GET_BIDI_PROPS(), which);
    return maximum;
}
/*
 * Value getter that forwards to u_getCombiningClass();
 * always 0 when normalization is configured out.
 */
static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
#if !UCONFIG_NO_NORMALIZATION
    const int32_t combiningClass=u_getCombiningClass(c);
    return combiningClass;
#else
    return 0;
#endif
}
/* Value getter that forwards to u_charType(). */
static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    const int32_t category=(int32_t)u_charType(c);
    return category;
}
/* Value getter that reads the joining group from the bidi properties data. */
static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    const int32_t joiningGroup=ubidi_getJoiningGroup(GET_BIDI_PROPS(), c);
    return joiningGroup;
}
/* Value getter that reads the joining type from the bidi properties data. */
static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    const int32_t joiningType=ubidi_getJoiningType(GET_BIDI_PROPS(), c);
    return joiningType;
}
/*
 * Value getter for the numeric type: extracts the numeric-type-value field
 * from the main properties word (column -1) and maps it to its type.
 */
static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    // Keep the field in a local: it is handed to a macro below.
    int32_t numericTypeValue=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getUnicodeProperties(c, -1));
    return UPROPS_NTV_GET_TYPE(numericTypeValue);
}
/*
 * Value getter that forwards to uscript_getScript().
 * The error code exists only to satisfy that API and is not inspected.
 */
static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    UErrorCode status=U_ZERO_ERROR;
    const int32_t script=(int32_t)uscript_getScript(c, &status);
    return script;
}
/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
@ -365,79 +388,90 @@ static const UHangulSyllableType gcbToHst[]={
*/
};
/*
 * Value getter for Hangul_Syllable_Type, derived from the
 * Grapheme_Cluster_Break field; see comments on gcbToHst[] above.
 */
static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    const int32_t gcbField=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK);
    const int32_t gcb=gcbField>>UPROPS_GCB_SHIFT;
    if(gcb>=LENGTHOF(gcbToHst)) {
        // GCB values beyond the mapping table have no syllable type.
        return U_HST_NOT_APPLICABLE;
    }
    return gcbToHst[gcb];
}
/*
 * Value getter shared by the four quick-check properties.
 * The normalization mode is derived from the offset of "which" relative to
 * UCHAR_NFD_QUICK_CHECK; always 0 when normalization is configured out.
 */
static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) {
#if !UCONFIG_NO_NORMALIZATION
    const UNormalizationMode mode=(UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD);
    return (int32_t)unorm_getQuickCheck(c, mode);
#else
    return 0;
#endif
}
/*
 * Value getter for the lead combining class: the FCD16 value shifted
 * right by 8; always 0 when normalization is configured out.
 */
static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
#if !UCONFIG_NO_NORMALIZATION
    const int32_t leadCC=getFCD16(c)>>8;
    return leadCC;
#else
    return 0;
#endif
}
/*
 * Value getter for the trail combining class: the low 8 bits of the
 * FCD16 value; always 0 when normalization is configured out.
 */
static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
#if !UCONFIG_NO_NORMALIZATION
    const int32_t trailCC=getFCD16(c)&0xff;
    return trailCC;
#else
    return 0;
#endif
}
static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
/*
* column, mask and shift values for int-value properties from u_getUnicodeProperties().
* Must be in order of corresponding UProperty,
* and there must be exactly one entry per int UProperty.
* The table is indexed with which-UCHAR_INT_START.
*
* Properties with mask==0 are computed by their own getValue function.
* For them, column is the UPropertySource value.
* Entries that use getMaxValueFromShift store their maximum value in the shift field.
*/
// UCHAR_BIDI_CLASS
{ UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
// UCHAR_BLOCK
{ 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue },
// UCHAR_CANONICAL_COMBINING_CLASS
{ UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },
// UCHAR_DECOMPOSITION_TYPE
{ 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },
// UCHAR_EAST_ASIAN_WIDTH
{ 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },
// UCHAR_GENERAL_CATEGORY
{ UPROPS_SRC_CHAR, 0, (int32_t)U_CHAR_CATEGORY_COUNT-1,getGeneralCategory, getMaxValueFromShift },
// UCHAR_JOINING_GROUP
{ UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue },
// UCHAR_JOINING_TYPE
{ UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue },
// UCHAR_LINE_BREAK
{ 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue },
// UCHAR_NUMERIC_TYPE
{ UPROPS_SRC_CHAR, 0, (int32_t)U_NT_COUNT-1, getNumericType, getMaxValueFromShift },
// UCHAR_SCRIPT: custom value getter, but the maximum comes from the masked max-values word
{ 0, UPROPS_SCRIPT_MASK, 0, getScript, defaultGetMaxValue },
// UCHAR_HANGUL_SYLLABLE_TYPE
{ UPROPS_SRC_PROPSVEC, 0, (int32_t)U_HST_COUNT-1, getHangulSyllableType, getMaxValueFromShift },
// UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
{ UPROPS_SRC_NFC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift },
// UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
{ UPROPS_SRC_NFKC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift },
// UCHAR_NFC_QUICK_CHECK: max=2=MAYBE
{ UPROPS_SRC_NFC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift },
// UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE
{ UPROPS_SRC_NFKC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift },
// UCHAR_LEAD_CANONICAL_COMBINING_CLASS
{ UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift },
// UCHAR_TRAIL_CANONICAL_COMBINING_CLASS
{ UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift },
// UCHAR_GRAPHEME_CLUSTER_BREAK
{ 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue },
// UCHAR_SENTENCE_BREAK
{ 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue },
// UCHAR_WORD_BREAK
{ 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue }
};
U_CAPI int32_t U_EXPORT2
u_getIntPropertyValue(UChar32 c, UProperty which) {
UErrorCode errorCode;
if(which<UCHAR_BINARY_START) {
return 0; /* undefined */
} else if(which<UCHAR_BINARY_LIMIT) {
return (int32_t)u_hasBinaryProperty(c, which);
} else if(which<UCHAR_INT_START) {
return 0; /* undefined */
if(which<UCHAR_INT_START) {
if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) {
const BinaryProperty &prop=binProps[which];
return prop.contains(prop, c, which);
}
} else if(which<UCHAR_INT_LIMIT) {
switch(which) {
case UCHAR_BIDI_CLASS:
return (int32_t)u_charDirection(c);
case UCHAR_BLOCK:
return (int32_t)ublock_getCode(c);
#if !UCONFIG_NO_NORMALIZATION
case UCHAR_CANONICAL_COMBINING_CLASS:
return u_getCombiningClass(c);
#endif
case UCHAR_DECOMPOSITION_TYPE:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_DT_MASK);
case UCHAR_EAST_ASIAN_WIDTH:
return (int32_t)(u_getUnicodeProperties(c, 0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
case UCHAR_GENERAL_CATEGORY:
return (int32_t)u_charType(c);
case UCHAR_JOINING_GROUP:
return ubidi_getJoiningGroup(GET_BIDI_PROPS(), c);
case UCHAR_JOINING_TYPE:
return ubidi_getJoiningType(GET_BIDI_PROPS(), c);
case UCHAR_LINE_BREAK:
return (int32_t)(u_getUnicodeProperties(c, UPROPS_LB_VWORD)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
case UCHAR_NUMERIC_TYPE: {
int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getUnicodeProperties(c, -1));
return UPROPS_NTV_GET_TYPE(ntv);
}
case UCHAR_SCRIPT:
errorCode=U_ZERO_ERROR;
return (int32_t)uscript_getScript(c, &errorCode);
case UCHAR_HANGUL_SYLLABLE_TYPE: {
/* see comments on gcbToHst[] above */
int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
if(gcb<LENGTHOF(gcbToHst)) {
return gcbToHst[gcb];
} else {
return U_HST_NOT_APPLICABLE;
}
}
#if !UCONFIG_NO_NORMALIZATION
case UCHAR_NFD_QUICK_CHECK:
case UCHAR_NFKD_QUICK_CHECK:
case UCHAR_NFC_QUICK_CHECK:
case UCHAR_NFKC_QUICK_CHECK:
return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD));
case UCHAR_LEAD_CANONICAL_COMBINING_CLASS:
return getFCD16(c)>>8;
case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS:
return getFCD16(c)&0xff;
#endif
case UCHAR_GRAPHEME_CLUSTER_BREAK:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
case UCHAR_SENTENCE_BREAK:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_SB_MASK)>>UPROPS_SB_SHIFT;
case UCHAR_WORD_BREAK:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_WB_MASK)>>UPROPS_WB_SHIFT;
default:
return 0; /* undefined */
}
const IntProperty &prop=intProps[which-UCHAR_INT_START];
return prop.getValue(prop, c, which);
} else if(which==UCHAR_GENERAL_CATEGORY_MASK) {
return U_MASK(u_charType(c));
} else {
return 0; /* undefined */
}
return 0; // undefined
}
U_CAPI int32_t U_EXPORT2
@ -447,104 +481,36 @@ u_getIntPropertyMinValue(UProperty /*which*/) {
U_CAPI int32_t U_EXPORT2
u_getIntPropertyMaxValue(UProperty which) {
if(which<UCHAR_BINARY_START) {
return -1; /* undefined */
} else if(which<UCHAR_BINARY_LIMIT) {
return 1; /* maximum TRUE for all binary properties */
} else if(which<UCHAR_INT_START) {
return -1; /* undefined */
} else if(which<UCHAR_INT_LIMIT) {
switch(which) {
case UCHAR_BIDI_CLASS:
case UCHAR_JOINING_GROUP:
case UCHAR_JOINING_TYPE:
return ubidi_getMaxValue(GET_BIDI_PROPS(), which);
case UCHAR_BLOCK:
return (uprv_getMaxValues(0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT;
case UCHAR_CANONICAL_COMBINING_CLASS:
case UCHAR_LEAD_CANONICAL_COMBINING_CLASS:
case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS:
return 0xff; /* TODO do we need to be more precise, getting the actual maximum? */
case UCHAR_DECOMPOSITION_TYPE:
return uprv_getMaxValues(2)&UPROPS_DT_MASK;
case UCHAR_EAST_ASIAN_WIDTH:
return (uprv_getMaxValues(0)&UPROPS_EA_MASK)>>UPROPS_EA_SHIFT;
case UCHAR_GENERAL_CATEGORY:
return (int32_t)U_CHAR_CATEGORY_COUNT-1;
case UCHAR_LINE_BREAK:
return (uprv_getMaxValues(UPROPS_LB_VWORD)&UPROPS_LB_MASK)>>UPROPS_LB_SHIFT;
case UCHAR_NUMERIC_TYPE:
return (int32_t)U_NT_COUNT-1;
case UCHAR_SCRIPT:
return uprv_getMaxValues(0)&UPROPS_SCRIPT_MASK;
case UCHAR_HANGUL_SYLLABLE_TYPE:
return (int32_t)U_HST_COUNT-1;
#if !UCONFIG_NO_NORMALIZATION
case UCHAR_NFD_QUICK_CHECK:
case UCHAR_NFKD_QUICK_CHECK:
return (int32_t)UNORM_YES; /* these are never "maybe", only "no" or "yes" */
case UCHAR_NFC_QUICK_CHECK:
case UCHAR_NFKC_QUICK_CHECK:
return (int32_t)UNORM_MAYBE;
#endif
case UCHAR_GRAPHEME_CLUSTER_BREAK:
return (uprv_getMaxValues(2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
case UCHAR_SENTENCE_BREAK:
return (uprv_getMaxValues(2)&UPROPS_SB_MASK)>>UPROPS_SB_SHIFT;
case UCHAR_WORD_BREAK:
return (uprv_getMaxValues(2)&UPROPS_WB_MASK)>>UPROPS_WB_SHIFT;
default:
return -1; /* undefined */
if(which<UCHAR_INT_START) {
if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) {
return 1; // maximum TRUE for all binary properties
}
} else {
return -1; /* undefined */
} else if(which<UCHAR_INT_LIMIT) {
const IntProperty &prop=intProps[which-UCHAR_INT_START];
return prop.getMaxValue(prop, which);
}
return -1; // undefined
}
/*
* TODO: Simplify, similar to binProps[].
* Use an array of column/source, mask, shift values to drive returning simple
* properties and their sources.
*
* TODO: Split the single propsvec into one per column, and have
* upropsvec_addPropertyStarts() pass a trie value function that gets the
* desired column's values.
*/
U_CFUNC UPropertySource U_EXPORT2
uprops_getSource(UProperty which) {
if(which<UCHAR_BINARY_START) {
return UPROPS_SRC_NONE; /* undefined */
} else if(which<UCHAR_BINARY_LIMIT) {
if(binProps[which].mask!=0) {
const BinaryProperty &prop=binProps[which];
if(prop.mask!=0) {
return UPROPS_SRC_PROPSVEC;
} else {
return (UPropertySource)binProps[which].column;
return (UPropertySource)prop.column;
}
} else if(which<UCHAR_INT_START) {
return UPROPS_SRC_NONE; /* undefined */
} else if(which<UCHAR_INT_LIMIT) {
switch(which) {
case UCHAR_GENERAL_CATEGORY:
case UCHAR_NUMERIC_TYPE:
return UPROPS_SRC_CHAR;
case UCHAR_CANONICAL_COMBINING_CLASS:
case UCHAR_NFD_QUICK_CHECK:
case UCHAR_NFC_QUICK_CHECK:
case UCHAR_LEAD_CANONICAL_COMBINING_CLASS:
case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS:
return UPROPS_SRC_NFC;
case UCHAR_NFKD_QUICK_CHECK:
case UCHAR_NFKC_QUICK_CHECK:
return UPROPS_SRC_NFKC;
case UCHAR_BIDI_CLASS:
case UCHAR_JOINING_GROUP:
case UCHAR_JOINING_TYPE:
return UPROPS_SRC_BIDI;
default:
const IntProperty &prop=intProps[which-UCHAR_INT_START];
if(prop.mask!=0) {
return UPROPS_SRC_PROPSVEC;
} else {
return (UPropertySource)prop.column;
}
} else if(which<UCHAR_STRING_START) {
switch(which) {
@ -582,7 +548,12 @@ uprops_getSource(UProperty which) {
return UPROPS_SRC_NONE;
}
} else {
return UPROPS_SRC_NONE; /* undefined */
switch(which) {
case UCHAR_SCRIPT_EXTENSIONS:
return UPROPS_SRC_PROPSVEC;
default:
return UPROPS_SRC_NONE; /* undefined */
}
}
}
@ -604,7 +575,7 @@ u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *p
// case folding and NFKC.)
// For the derivation, see Unicode's DerivedNormalizationProps.txt.
const Normalizer2 *nfkc=Normalizer2Factory::getNFKCInstance(*pErrorCode);
const UCaseProps *csp=ucase_getSingleton(pErrorCode);
const UCaseProps *csp=ucase_getSingleton();
if(U_FAILURE(*pErrorCode)) {
return 0;
}
@ -637,114 +608,3 @@ u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *p
}
}
#endif
/*----------------------------------------------------------------
* Inclusions list
*----------------------------------------------------------------*/
/*
* Return a set of characters for property enumeration.
* The set implicitly contains 0x110000 as well, which is one more than the highest
* Unicode code point.
*
* This set is used as an ordered list - its code points are ordered, and
* consecutive code points (in Unicode code point order) in the set define a range.
* For each two consecutive characters (start, limit) in the set,
* all of the UCD/normalization and related properties for
* all code points start..limit-1 are all the same,
* except for character names and ISO comments.
*
* All Unicode code points U+0000..U+10ffff are covered by these ranges.
* The ranges define a partition of the Unicode code space.
* ICU uses the inclusions set to enumerate properties for generating
* UnicodeSets containing all code points that have a certain property value.
*
* The Inclusion List is generated from the UCD. It is generated
* by enumerating the data tries, and code points for hardcoded properties
* are added as well.
*
* --------------------------------------------------------------------------
*
* The following are ideas for getting properties-unique code point ranges,
* with possible optimizations beyond the current implementation.
* These optimizations would require more code and be more fragile.
* The current implementation generates one single list (set) for all properties.
*
* To enumerate properties efficiently, one needs to know ranges of
* repetitive values, so that the value of only each start code point
* can be applied to the whole range.
* This information is in principle available in the uprops.icu/unorm.icu data.
*
* There are two obstacles:
*
* 1. Some properties are computed from multiple data structures,
* making it necessary to get repetitive ranges by intersecting
* ranges from multiple tries.
*
* 2. It is not economical to write code for getting repetitive ranges
* that are precise for each of some 50 properties.
*
* Compromise ideas:
*
* - Get ranges per trie, not per individual property.
* Each range contains the same values for a whole group of properties.
* This would generate currently five range sets, two for uprops.icu tries
* and three for unorm.icu tries.
*
* - Combine sets of ranges for multiple tries to get sufficient sets
* for properties, e.g., the uprops.icu main and auxiliary tries
* for all non-normalization properties.
*
* Ideas for representing ranges and combining them:
*
* - A UnicodeSet could hold just the start code points of ranges.
* Multiple sets are easily combined by or-ing them together.
*
* - Alternatively, a UnicodeSet could hold each even-numbered range.
* All ranges could be enumerated by using each start code point
* (for the even-numbered ranges) as well as each limit (end+1) code point
* (for the odd-numbered ranges).
* It should be possible to combine two such sets by xor-ing them,
* but no more than two.
*
* The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
* but the first one is certainly simpler and applicable for combining more than
* two range sets.
*
* It is possible to combine all range sets for all uprops/unorm tries into one
* set that can be used for all properties.
* As an optimization, there could be less-combined range sets for certain
* groups of properties.
* The relationship of which less-combined range set to use for which property
* depends on the implementation of the properties and must be hardcoded
* - somewhat error-prone and higher maintenance but can be tested easily
* by building property sets "the simple way" in test code.
*
* ---
*
* Do not use a UnicodeSet pattern because that causes infinite recursion;
* UnicodeSet depends on the inclusions set.
*
* ---
*
* uprv_getInclusions() is commented out starting 2004-sep-13 because
* uniset_props.cpp now calls the uxyz_addPropertyStarts() directly,
* and only for the relevant property source.
*/
#if 0
U_CAPI void U_EXPORT2
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode) {
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
#if !UCONFIG_NO_NORMALIZATION
unorm_addPropertyStarts(sa, pErrorCode);
#endif
uchar_addPropertyStarts(sa, pErrorCode);
ucase_addPropertyStarts(ucase_getSingleton(pErrorCode), sa, pErrorCode);
ubidi_addPropertyStarts(ubidi_getSingleton(pErrorCode), sa, pErrorCode);
}
#endif

View file

@ -35,7 +35,13 @@ enum {
UPROPS_ADDITIONAL_VECTORS_INDEX,
UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX,
UPROPS_RESERVED_INDEX, /* 6 */
UPROPS_SCRIPT_EXTENSIONS_INDEX,
UPROPS_RESERVED_INDEX_7,
UPROPS_RESERVED_INDEX_8,
/* size of the data file (number of 32-bit units after the header) */
UPROPS_DATA_TOP_INDEX,
/* maximum values for code values in vector word 0 */
UPROPS_MAX_VALUES_INDEX=10,
@ -83,16 +89,25 @@ enum {
* Properties in vector word 0
* Bits
* 31..24 DerivedAge version major/minor one nibble each
* 23..20 reserved
* 23..22 3..1: Bits 7..0 = Script_Extensions index
* 3: Script value from Script_Extensions
* 2: Script=Inherited
* 1: Script=Common
* 0: Script=bits 7..0
* 21..20 reserved
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 7.. 0 UScriptCode
* 7.. 0 UScriptCode, or index to Script_Extensions
*/
/* derived age: one nibble each for major and minor version numbers */
#define UPROPS_AGE_MASK 0xff000000
#define UPROPS_AGE_SHIFT 24
/* Script_Extensions: mask includes Script */
#define UPROPS_SCRIPT_X_MASK 0x00c000ff
#define UPROPS_SCRIPT_X_SHIFT 22
#define UPROPS_EA_MASK 0x000e0000
#define UPROPS_EA_SHIFT 17
@ -101,6 +116,11 @@ enum {
#define UPROPS_SCRIPT_MASK 0x000000ff
/* UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
#define UPROPS_SCRIPT_X_WITH_COMMON 0x400000
#define UPROPS_SCRIPT_X_WITH_INHERITED 0x800000
#define UPROPS_SCRIPT_X_WITH_OTHER 0xc00000
/*
* Properties in vector word 1
* Each bit encodes one binary property.
@ -160,7 +180,6 @@ enum {
*/
#define UPROPS_LB_MASK 0x03f00000
#define UPROPS_LB_SHIFT 20
#define UPROPS_LB_VWORD 2
#define UPROPS_SB_MASK 0x000f8000
#define UPROPS_SB_SHIFT 15

View file

@ -316,17 +316,10 @@ _shapeToArabicDigitsWithContext(UChar *s, int32_t length,
UChar digitBase,
UBool isLogical, UBool lastStrongWasAL) {
const UBiDiProps *bdp;
UErrorCode errorCode;
int32_t i;
UChar c;
errorCode=U_ZERO_ERROR;
bdp=ubidi_getSingleton(&errorCode);
if(U_FAILURE(errorCode)) {
return;
}
bdp=ubidi_getSingleton();
digitBase-=0x30;
/* the iteration direction depends on the type of input */

View file

@ -348,17 +348,13 @@ usprep_getProfile(const char* path,
if(!loadData(newProfile.getAlias(), path, name, _SPREP_DATA_TYPE, status) || U_FAILURE(*status) ){
return NULL;
}
/* get the options */
newProfile->doNFKC = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0);
newProfile->checkBiDi = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_CHECK_BIDI_ON) > 0);
if(newProfile->checkBiDi) {
newProfile->bdp = ubidi_getSingleton(status);
if(U_FAILURE(*status)) {
usprep_unload(newProfile.getAlias());
return NULL;
}
newProfile->bdp = ubidi_getSingleton();
}
LocalMemory<UStringPrepKey> key;

View file

@ -191,10 +191,7 @@ setTempCaseMapLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode)
static U_INLINE void
setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
if(csm->csp==NULL) {
csm->csp=ucase_getSingleton(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
csm->csp=ucase_getSingleton();
}
if(locale!=NULL && locale[0]==0) {
csm->locale[0]=0;
@ -622,7 +619,7 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
uint32_t options,
UErrorCode *pErrorCode) {
UCaseMap csm={ NULL };
csm.csp=ucase_getSingleton(pErrorCode);
csm.csp=ucase_getSingleton();
csm.options=options;
return caseMap(&csm,
dest, destCapacity,
@ -680,7 +677,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
* assume that at least the option U_COMPARE_IGNORE_CASE is set
* otherwise this function would have to behave exactly as uprv_strCompare()
*/
csp=ucase_getSingleton(pErrorCode);
csp=ucase_getSingleton();
if(U_FAILURE(*pErrorCode)) {
return 0;
}

View file

@ -583,21 +583,18 @@ utext_caseCompare(UText *s1, int32_t length1,
/* current code points */
UChar32 c1, c2;
uint8_t cLength1, cLength2;
/* argument checking */
if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if(s1==NULL || s2==NULL) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
csp=ucase_getSingleton(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
csp=ucase_getSingleton();
/* for variable-length strings */
if(length1 < 0) {
length1 = INT32_MIN;
@ -709,21 +706,18 @@ utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
/* native indexes into s1 and s2 */
int64_t index1, index2;
/* argument checking */
if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if(s1==NULL || s2==NULL) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
csp=ucase_getSingleton(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
csp=ucase_getSingleton();
/* initialize */
index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);

View file

@ -482,6 +482,9 @@ utrie2_swap(const UDataSwapper *ds,
return size;
}
// utrie2_swapAnyVersion() should be defined here but lives in utrie2_builder.c
// to avoid a dependency from utrie2.cpp on utrie.c.
/* enumeration -------------------------------------------------------------- */
#define MIN(a, b) ((a)<(b) ? (a) : (b))

View file

@ -349,6 +349,15 @@ utrie2_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
 * Swap a serialized UTrie or UTrie2.
 * The format version of the input (as reported by utrie2_getVersion())
 * selects the swap function: version 1 is handled by utrie_swap(),
 * version 2 by utrie2_swap().
 *
 * @param ds         passed unchanged to the selected swap function
 * @param inData     the serialized trie; also used to detect the version
 * @param length     passed unchanged to the version check and the selected swap function
 * @param outData    passed unchanged to the selected swap function
 * @param pErrorCode in/out error code; set to U_INVALID_FORMAT_ERROR
 *                   if the data is neither a version 1 nor a version 2 trie
 * @return the return value of the selected swap function,
 *         or 0 if the version is not recognized
 * @internal
 */
U_CAPI int32_t U_EXPORT2
utrie2_swapAnyVersion(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
* Build a UTrie2 (version 2) from a UTrie (version 1).
* Enumerates all values in the UTrie and builds a UTrie2 with the same values.

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -31,7 +31,7 @@
#include "utrie2.h"
#include "utrie2_impl.h"
#include "utrie.h" /* for utrie2_fromUTrie() */
#include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
@ -1445,3 +1445,25 @@ utrie2_serialize(UTrie2 *trie,
}
return trie->length;
}
/*
* This is here to avoid a dependency from utrie2.cpp on utrie.c.
* This file already depends on utrie.c.
* Otherwise, this should be in utrie2.cpp right after utrie2_swap().
*/
/*
 * Swap a serialized trie of either format, choosing the swap function
 * from the version reported by utrie2_getVersion():
 * version 1 -> utrie_swap(), version 2 -> utrie2_swap().
 *
 * ds, inData, length and outData are passed unchanged to the selected
 * swap function.
 *
 * Returns the result of the selected swap function.
 * Returns 0 if *pErrorCode already indicates a failure on entry, or if the
 * data is neither version; in the latter case *pErrorCode is set to
 * U_INVALID_FORMAT_ERROR.
 */
U_CAPI int32_t U_EXPORT2
utrie2_swapAnyVersion(const UDataSwapper *ds,
                      const void *inData, int32_t length, void *outData,
                      UErrorCode *pErrorCode) {
    /*
     * Return a defined value when the caller passes in a failure.
     * Without this, control would reach the end of a non-void function.
     */
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    switch(utrie2_getVersion(inData, length, TRUE)) {
    case 1:
        return utrie_swap(ds, inData, length, outData, pErrorCode);
    case 2:
        return utrie2_swap(ds, inData, length, outData, pErrorCode);
    default:
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return 0;
    }
}

View file

@ -654,6 +654,14 @@ UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart
return length;
}
// Some non-ASCII characters are equivalent to sequences with
// non-LDH ASCII characters. To find them:
// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
// Tests for the three code points U+2260, U+226E and U+226F.
static inline UBool
isNonASCIIDisallowedSTD3Valid(UChar32 c) {
    // U+226E and U+226F are adjacent, so they are tested as one range.
    return c==0x2260 || (0x226E<=c && c<=0x226F);
}
// Replace the label in dest with the label string, if the label was modified.
// If &label==&dest then the label was modified in-place and labelLength
// is the new label length, different from label.length().
@ -778,9 +786,11 @@ UTS46::processLabel(UnicodeString &dest,
}
} else {
oredChars|=c;
if(c==0xfffd) {
if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
info.labelErrors|=UIDNA_ERROR_DISALLOWED;
*s=0xfffd;
} else if(c==0xfffd) {
info.labelErrors|=UIDNA_ERROR_DISALLOWED;
++s;
}
}
++s;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -13,7 +13,7 @@
U_ICUDATA_NAME=icudt45
##############################################################################
U_ICUDATA_ENDIAN_SUFFIX=l
UNICODE_VERSION=5.2
UNICODE_VERSION=6.0
ICU_LIB_TARGET=$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll
# ICUMAKE

View file

@ -1,18 +1,19 @@
# BidiMirroring-5.2.0.txt
# Date: 2009-05-22, 12:44:00 PDT [KW]
# BidiMirroring-6.0.0.txt
# Date: 2010-06-21, 12:09:00 PDT [KW]
#
# Bidi_Mirroring_Glyph Property
#
# This file is an informative contributory data file in the
# Unicode Character Database.
#
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# This data file lists characters that have the mirrored property
# where there is another Unicode character that typically has a glyph
# This data file lists characters that have the Bidi_Mirrored=True property
# value, for which there is another Unicode character that typically has a glyph
# that is the mirror image of the original character's glyph.
# The repertoire covered by the file is Unicode 5.2.0.
#
# The repertoire covered by the file is Unicode 6.0.0.
#
# The file contains a list of lines with mappings from one code point
# to another one for character-based mirroring.
@ -25,14 +26,22 @@
# variable-length hexadecimal value with 4 to 6 digits.
# A comment indicates where the characters are "BEST FIT" mirroring.
#
# Code points with the "mirrored" property but no appropriate mirrors are
# Code points for which Bidi_Mirrored=True, but for which no appropriate
# characters exist with mirrored glyphs, are
# listed as comments at the end of the file.
#
# Formally, the default value of the Bidi_Mirroring_Glyph property
# for each code point is the code point itself, unless a mapping to
# some other character is specified in this data file. When a code
# point has the default value for the Bidi_Mirroring_Glyph property,
# that means that no other character exists whose glyph is suitable
# for character-based mirroring.
#
# For information on bidi mirroring, see UAX #9: Bidirectional Algorithm,
# at http://www.unicode.org/unicode/reports/tr9/
#
# This file was originally created by Markus Scherer.
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, and 5.2 by Ken Whistler.
# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler.
#
# ############################################################
@ -464,8 +473,8 @@ FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET
# 22FF; Z NOTATION BAG MEMBERSHIP
# 2320; TOP HALF INTEGRAL
# 2321; BOTTOM HALF INTEGRAL
# 27CC; LONG DIVISION
# 27C0; THREE DIMENSIONAL ANGLE
# 27CC; LONG DIVISION
# 27D3; LOWER RIGHT CORNER WITH DOT
# 27D4; UPPER LEFT CORNER WITH DOT
# 27DC; LEFT MULTIMAP

View file

@ -1,8 +1,8 @@
# Blocks-5.2.0.txt
# Date: 2009-05-19, 16:21:00 PDT [KW]
# Blocks-6.0.0.txt
# Date: 2010-06-04, 11:12:00 PDT [KW]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -45,6 +45,7 @@
0780..07BF; Thaana
07C0..07FF; NKo
0800..083F; Samaritan
0840..085F; Mandaic
0900..097F; Devanagari
0980..09FF; Bengali
0A00..0A7F; Gurmukhi
@ -82,6 +83,7 @@
1A20..1AAF; Tai Tham
1B00..1B7F; Balinese
1B80..1BBF; Sundanese
1BC0..1BFF; Batak
1C00..1C4F; Lepcha
1C50..1C7F; Ol Chiki
1CD0..1CFF; Vedic Extensions
@ -159,6 +161,7 @@ A980..A9DF; Javanese
AA00..AA5F; Cham
AA60..AA7F; Myanmar Extended-A
AA80..AADF; Tai Viet
AB00..AB2F; Ethiopic Extended-A
ABC0..ABFF; Meetei Mayek
AC00..D7AF; Hangul Syllables
D7B0..D7FF; Hangul Jamo Extended-B
@ -203,10 +206,13 @@ FFF0..FFFF; Specials
10B60..10B7F; Inscriptional Pahlavi
10C00..10C4F; Old Turkic
10E60..10E7F; Rumi Numeral Symbols
11000..1107F; Brahmi
11080..110CF; Kaithi
12000..123FF; Cuneiform
12400..1247F; Cuneiform Numbers and Punctuation
13000..1342F; Egyptian Hieroglyphs
16800..16A3F; Bamum Supplement
1B000..1B0FF; Kana Supplement
1D000..1D0FF; Byzantine Musical Symbols
1D100..1D1FF; Musical Symbols
1D200..1D24F; Ancient Greek Musical Notation
@ -215,10 +221,16 @@ FFF0..FFFF; Specials
1D400..1D7FF; Mathematical Alphanumeric Symbols
1F000..1F02F; Mahjong Tiles
1F030..1F09F; Domino Tiles
1F0A0..1F0FF; Playing Cards
1F100..1F1FF; Enclosed Alphanumeric Supplement
1F200..1F2FF; Enclosed Ideographic Supplement
1F300..1F5FF; Miscellaneous Symbols And Pictographs
1F600..1F64F; Emoticons
1F680..1F6FF; Transport And Map Symbols
1F700..1F77F; Alchemical Symbols
20000..2A6DF; CJK Unified Ideographs Extension B
2A700..2B73F; CJK Unified Ideographs Extension C
2B740..2B81F; CJK Unified Ideographs Extension D
2F800..2FA1F; CJK Compatibility Ideographs Supplement
E0000..E007F; Tags
E0100..E01EF; Variation Selectors Supplement

View file

@ -1,8 +1,8 @@
# CaseFolding-5.2.0.txt
# Date: 2009-05-28, 23:02:34 GMT [MD]
# CaseFolding-6.0.0.txt
# Date: 2010-05-18, 00:48:57 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -496,6 +496,7 @@
0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK
0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK
0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER
0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER
0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB
0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN
0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM
@ -1057,6 +1058,7 @@ A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS
A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS
A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS
A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN
A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE
A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE
A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL
A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM
@ -1122,6 +1124,13 @@ A782; C; A783; # LATIN CAPITAL LETTER INSULAR R
A784; C; A785; # LATIN CAPITAL LETTER INSULAR S
A786; C; A787; # LATIN CAPITAL LETTER INSULAR T
A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO
A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H
A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER
A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE
A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE
A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE
A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE
A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE
FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF
FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI
FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL

View file

@ -1,8 +1,8 @@
# DerivedAge-5.2.0.txt
# Date: 2009-09-17, 22:52:52 GMT [MD]
# DerivedAge-6.0.0.txt
# Date: 2010-08-19, 00:47:58 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -1061,4 +1061,117 @@ FA6B..FA6D ; 5.2 # [3] CJK COMPATIBILITY IDEOGRAPH-FA6B..CJK COMPATIBILITY
# Total code points: 6648
# ================================================
# Newly assigned in Unicode 6.0.0 (Scheduled September, 2010)
0526..0527 ; 6.0 # [2] CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER..CYRILLIC SMALL LETTER SHHA WITH DESCENDER
0620 ; 6.0 # ARABIC LETTER KASHMIRI YEH
065F ; 6.0 # ARABIC WAVY HAMZA BELOW
0840..085B ; 6.0 # [28] MANDAIC LETTER HALQA..MANDAIC GEMINATION MARK
085E ; 6.0 # MANDAIC PUNCTUATION
093A..093B ; 6.0 # [2] DEVANAGARI VOWEL SIGN OE..DEVANAGARI VOWEL SIGN OOE
094F ; 6.0 # DEVANAGARI VOWEL SIGN AW
0956..0957 ; 6.0 # [2] DEVANAGARI VOWEL SIGN UE..DEVANAGARI VOWEL SIGN UUE
0973..0977 ; 6.0 # [5] DEVANAGARI LETTER OE..DEVANAGARI LETTER UUE
0B72..0B77 ; 6.0 # [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS
0D29 ; 6.0 # MALAYALAM LETTER NNNA
0D3A ; 6.0 # MALAYALAM LETTER TTTA
0D4E ; 6.0 # MALAYALAM LETTER DOT REPH
0F8C..0F8F ; 6.0 # [4] TIBETAN SIGN INVERTED MCHU CAN..TIBETAN SUBJOINED SIGN INVERTED MCHU CAN
0FD9..0FDA ; 6.0 # [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS
135D..135E ; 6.0 # [2] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING VOWEL LENGTH MARK
1BC0..1BF3 ; 6.0 # [52] BATAK LETTER A..BATAK PANONGONAN
1BFC..1BFF ; 6.0 # [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT
1DFC ; 6.0 # COMBINING DOUBLE INVERTED BREVE BELOW
2095..209C ; 6.0 # [8] LATIN SUBSCRIPT SMALL LETTER H..LATIN SUBSCRIPT SMALL LETTER T
20B9 ; 6.0 # INDIAN RUPEE SIGN
23E9..23F3 ; 6.0 # [11] BLACK RIGHT-POINTING DOUBLE TRIANGLE..HOURGLASS WITH FLOWING SAND
26CE ; 6.0 # OPHIUCHUS
26E2 ; 6.0 # ASTRONOMICAL SYMBOL FOR URANUS
26E4..26E7 ; 6.0 # [4] PENTAGRAM..INVERTED PENTAGRAM
2705 ; 6.0 # WHITE HEAVY CHECK MARK
270A..270B ; 6.0 # [2] RAISED FIST..RAISED HAND
2728 ; 6.0 # SPARKLES
274C ; 6.0 # CROSS MARK
274E ; 6.0 # NEGATIVE SQUARED CROSS MARK
2753..2755 ; 6.0 # [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT
275F..2760 ; 6.0 # [2] HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT..HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT
2795..2797 ; 6.0 # [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN
27B0 ; 6.0 # CURLY LOOP
27BF ; 6.0 # DOUBLE CURLY LOOP
27CE..27CF ; 6.0 # [2] SQUARED LOGICAL AND..SQUARED LOGICAL OR
2D70 ; 6.0 # TIFINAGH SEPARATOR MARK
2D7F ; 6.0 # TIFINAGH CONSONANT JOINER
31B8..31BA ; 6.0 # [3] BOPOMOFO LETTER GH..BOPOMOFO LETTER ZY
A660..A661 ; 6.0 # [2] CYRILLIC CAPITAL LETTER REVERSED TSE..CYRILLIC SMALL LETTER REVERSED TSE
A78D..A78E ; 6.0 # [2] LATIN CAPITAL LETTER TURNED H..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
A790..A791 ; 6.0 # [2] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER N WITH DESCENDER
A7A0..A7A9 ; 6.0 # [10] LATIN CAPITAL LETTER G WITH OBLIQUE STROKE..LATIN SMALL LETTER S WITH OBLIQUE STROKE
A7FA ; 6.0 # LATIN LETTER SMALL CAPITAL TURNED M
AB01..AB06 ; 6.0 # [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO
AB09..AB0E ; 6.0 # [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO
AB11..AB16 ; 6.0 # [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
AB20..AB26 ; 6.0 # [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
AB28..AB2E ; 6.0 # [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
FBB2..FBC1 ; 6.0 # [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
11000..1104D ; 6.0 # [78] BRAHMI SIGN CANDRABINDU..BRAHMI PUNCTUATION LOTUS
11052..1106F ; 6.0 # [30] BRAHMI NUMBER ONE..BRAHMI DIGIT NINE
16800..16A38 ; 6.0 # [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
1B000..1B001 ; 6.0 # [2] KATAKANA LETTER ARCHAIC E..HIRAGANA LETTER ARCHAIC YE
1F0A0..1F0AE ; 6.0 # [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES
1F0B1..1F0BE ; 6.0 # [14] PLAYING CARD ACE OF HEARTS..PLAYING CARD KING OF HEARTS
1F0C1..1F0CF ; 6.0 # [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER
1F0D1..1F0DF ; 6.0 # [15] PLAYING CARD ACE OF CLUBS..PLAYING CARD WHITE JOKER
1F130 ; 6.0 # SQUARED LATIN CAPITAL LETTER A
1F132..1F13C ; 6.0 # [11] SQUARED LATIN CAPITAL LETTER C..SQUARED LATIN CAPITAL LETTER M
1F13E ; 6.0 # SQUARED LATIN CAPITAL LETTER O
1F140..1F141 ; 6.0 # [2] SQUARED LATIN CAPITAL LETTER Q..SQUARED LATIN CAPITAL LETTER R
1F143..1F145 ; 6.0 # [3] SQUARED LATIN CAPITAL LETTER T..SQUARED LATIN CAPITAL LETTER V
1F147..1F149 ; 6.0 # [3] SQUARED LATIN CAPITAL LETTER X..SQUARED LATIN CAPITAL LETTER Z
1F14F..1F156 ; 6.0 # [8] SQUARED WC..NEGATIVE CIRCLED LATIN CAPITAL LETTER G
1F158..1F15E ; 6.0 # [7] NEGATIVE CIRCLED LATIN CAPITAL LETTER I..NEGATIVE CIRCLED LATIN CAPITAL LETTER O
1F160..1F169 ; 6.0 # [10] NEGATIVE CIRCLED LATIN CAPITAL LETTER Q..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F178 ; 6.0 # [9] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER I
1F17A ; 6.0 # NEGATIVE SQUARED LATIN CAPITAL LETTER K
1F17D..1F17E ; 6.0 # [2] NEGATIVE SQUARED LATIN CAPITAL LETTER N..NEGATIVE SQUARED LATIN CAPITAL LETTER O
1F180..1F189 ; 6.0 # [10] NEGATIVE SQUARED LATIN CAPITAL LETTER Q..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
1F18E..1F18F ; 6.0 # [2] NEGATIVE SQUARED AB..NEGATIVE SQUARED WC
1F191..1F19A ; 6.0 # [10] SQUARED CL..SQUARED VS
1F1E6..1F1FF ; 6.0 # [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z
1F201..1F202 ; 6.0 # [2] SQUARED KATAKANA KOKO..SQUARED KATAKANA SA
1F232..1F23A ; 6.0 # [9] SQUARED CJK UNIFIED IDEOGRAPH-7981..SQUARED CJK UNIFIED IDEOGRAPH-55B6
1F250..1F251 ; 6.0 # [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
1F300..1F320 ; 6.0 # [33] CYCLONE..SHOOTING STAR
1F330..1F335 ; 6.0 # [6] CHESTNUT..CACTUS
1F337..1F37C ; 6.0 # [70] TULIP..BABY BOTTLE
1F380..1F393 ; 6.0 # [20] RIBBON..GRADUATION CAP
1F3A0..1F3C4 ; 6.0 # [37] CAROUSEL HORSE..SURFER
1F3C6..1F3CA ; 6.0 # [5] TROPHY..SWIMMER
1F3E0..1F3F0 ; 6.0 # [17] HOUSE BUILDING..EUROPEAN CASTLE
1F400..1F43E ; 6.0 # [63] RAT..PAW PRINTS
1F440 ; 6.0 # EYES
1F442..1F4F7 ; 6.0 # [182] EAR..CAMERA
1F4F9..1F4FC ; 6.0 # [4] VIDEO CAMERA..VIDEOCASSETTE
1F500..1F53D ; 6.0 # [62] TWISTED RIGHTWARDS ARROWS..DOWN-POINTING SMALL RED TRIANGLE
1F550..1F567 ; 6.0 # [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
1F5FB..1F5FF ; 6.0 # [5] MOUNT FUJI..MOYAI
1F601..1F610 ; 6.0 # [16] GRINNING FACE WITH SMILING EYES..NEUTRAL FACE
1F612..1F614 ; 6.0 # [3] UNAMUSED FACE..PENSIVE FACE
1F616 ; 6.0 # CONFOUNDED FACE
1F618 ; 6.0 # FACE THROWING A KISS
1F61A ; 6.0 # KISSING FACE WITH CLOSED EYES
1F61C..1F61E ; 6.0 # [3] FACE WITH STUCK-OUT TONGUE AND WINKING EYE..DISAPPOINTED FACE
1F620..1F625 ; 6.0 # [6] ANGRY FACE..DISAPPOINTED BUT RELIEVED FACE
1F628..1F62B ; 6.0 # [4] FEARFUL FACE..TIRED FACE
1F62D ; 6.0 # LOUDLY CRYING FACE
1F630..1F633 ; 6.0 # [4] FACE WITH OPEN MOUTH AND COLD SWEAT..FLUSHED FACE
1F635..1F640 ; 6.0 # [12] DIZZY FACE..WEARY CAT FACE
1F645..1F64F ; 6.0 # [11] FACE WITH NO GOOD GESTURE..PERSON WITH FOLDED HANDS
1F680..1F6C5 ; 6.0 # [70] ROCKET..LEFT LUGGAGE
1F700..1F773 ; 6.0 # [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
2B740..2B81D ; 6.0 # [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
# Total code points: 2088
# EOF

View file

@ -1,8 +1,8 @@
# DerivedBidiClass-5.2.0.txt
# Date: 2009-08-26, 00:50:45 GMT [MD]
# DerivedBidiClass-6.0.0.txt
# Date: 2010-08-19, 00:48:03 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -81,7 +81,7 @@
03A3..03F5 ; L # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL
03F7..0481 ; L # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA
0482 ; L # So CYRILLIC THOUSANDS SIGN
048A..0525 ; L # L& [156] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER PE WITH DESCENDER
048A..0527 ; L # L& [158] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER SHHA WITH DESCENDER
0531..0556 ; L # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH
0559 ; L # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING
055A..055F ; L # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK
@ -89,17 +89,18 @@
0589 ; L # Po ARMENIAN FULL STOP
0903 ; L # Mc DEVANAGARI SIGN VISARGA
0904..0939 ; L # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
093B ; L # Mc DEVANAGARI VOWEL SIGN OOE
093D ; L # Lo DEVANAGARI SIGN AVAGRAHA
093E..0940 ; L # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II
0949..094C ; L # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU
094E ; L # Mc DEVANAGARI VOWEL SIGN PRISHTHAMATRA E
094E..094F ; L # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW
0950 ; L # Lo DEVANAGARI OM
0958..0961 ; L # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL
0964..0965 ; L # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
0966..096F ; L # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
0970 ; L # Po DEVANAGARI ABBREVIATION SIGN
0971 ; L # Lm DEVANAGARI SIGN HIGH SPACING DOT
0972 ; L # Lo DEVANAGARI LETTER CANDRA A
0972..0977 ; L # Lo [6] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER UUE
0979..097F ; L # Lo [7] DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA
0982..0983 ; L # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA
0985..098C ; L # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
@ -165,6 +166,7 @@
0B66..0B6F ; L # Nd [10] ORIYA DIGIT ZERO..ORIYA DIGIT NINE
0B70 ; L # So ORIYA ISSHAR
0B71 ; L # Lo ORIYA LETTER WA
0B72..0B77 ; L # No [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS
0B83 ; L # Lo TAMIL SIGN VISARGA
0B85..0B8A ; L # Lo [6] TAMIL LETTER A..TAMIL LETTER UU
0B8E..0B90 ; L # Lo [3] TAMIL LETTER E..TAMIL LETTER AI
@ -212,15 +214,16 @@
0CDE ; L # Lo KANNADA LETTER FA
0CE0..0CE1 ; L # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
0CE6..0CEF ; L # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; L # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
0D02..0D03 ; L # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D05..0D0C ; L # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L
0D0E..0D10 ; L # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
0D12..0D28 ; L # Lo [23] MALAYALAM LETTER O..MALAYALAM LETTER NA
0D2A..0D39 ; L # Lo [16] MALAYALAM LETTER PA..MALAYALAM LETTER HA
0D12..0D3A ; L # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA
0D3D ; L # Lo MALAYALAM SIGN AVAGRAHA
0D3E..0D40 ; L # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
0D46..0D48 ; L # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI
0D4A..0D4C ; L # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU
0D4E ; L # Lo MALAYALAM LETTER DOT REPH
0D57 ; L # Mc MALAYALAM AU LENGTH MARK
0D60..0D61 ; L # Lo [2] MALAYALAM LETTER VOCALIC RR..MALAYALAM LETTER VOCALIC LL
0D66..0D6F ; L # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE
@ -277,12 +280,13 @@
0F49..0F6C ; L # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA
0F7F ; L # Mc TIBETAN SIGN RNAM BCAD
0F85 ; L # Po TIBETAN MARK PALUTA
0F88..0F8B ; L # Lo [4] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN GRU MED RGYINGS
0F88..0F8C ; L # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN
0FBE..0FC5 ; L # So [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE
0FC7..0FCC ; L # So [6] TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL
0FCE..0FCF ; L # So [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM
0FD0..0FD4 ; L # Po [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA
0FD5..0FD8 ; L # So [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS
0FD9..0FDA ; L # Po [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS
1000..102A ; L # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU
102B..102C ; L # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA
1031 ; L # Mc MYANMAR VOWEL SIGN E
@ -375,7 +379,8 @@
19B0..19C0 ; L # Mc [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY
19C1..19C7 ; L # Lo [7] NEW TAI LUE LETTER FINAL V..NEW TAI LUE LETTER FINAL B
19C8..19C9 ; L # Mc [2] NEW TAI LUE TONE MARK-1..NEW TAI LUE TONE MARK-2
19D0..19DA ; L # Nd [11] NEW TAI LUE DIGIT ZERO..NEW TAI LUE THAM DIGIT ONE
19D0..19D9 ; L # Nd [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE
19DA ; L # No NEW TAI LUE THAM DIGIT ONE
1A00..1A16 ; L # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA
1A19..1A1B ; L # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE
1A1E..1A1F ; L # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION
@ -408,6 +413,12 @@
1BAA ; L # Mc SUNDANESE SIGN PAMAAEH
1BAE..1BAF ; L # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
1BB0..1BB9 ; L # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE
1BC0..1BE5 ; L # Lo [38] BATAK LETTER A..BATAK LETTER U
1BE7 ; L # Mc BATAK VOWEL SIGN E
1BEA..1BEC ; L # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O
1BEE ; L # Mc BATAK VOWEL SIGN U
1BF2..1BF3 ; L # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN
1BFC..1BFF ; L # Po [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT
1C00..1C23 ; L # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A
1C24..1C2B ; L # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU
1C34..1C35 ; L # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG
@ -451,7 +462,7 @@
200E ; L # Cf LEFT-TO-RIGHT MARK
2071 ; L # Lm SUPERSCRIPT LATIN SMALL LETTER I
207F ; L # Lm SUPERSCRIPT LATIN SMALL LETTER N
2090..2094 ; L # Lm [5] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER SCHWA
2090..209C ; L # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T
2102 ; L # L& DOUBLE-STRUCK CAPITAL C
2107 ; L # L& EULER CONSTANT
210A..2113 ; L # L& [10] SCRIPT SMALL G..SCRIPT SMALL L
@ -485,6 +496,7 @@
2D00..2D25 ; L # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE
2D30..2D65 ; L # Lo [54] TIFINAGH LETTER YA..TIFINAGH LETTER YAZZ
2D6F ; L # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK
2D70 ; L # Po TIFINAGH SEPARATOR MARK
2D80..2D96 ; L # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE
2DA0..2DA6 ; L # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO
2DA8..2DAE ; L # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO
@ -513,7 +525,7 @@
3190..3191 ; L # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
3192..3195 ; L # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
3196..319F ; L # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
31A0..31B7 ; L # Lo [24] BOPOMOFO LETTER BU..BOPOMOFO FINAL LETTER H
31A0..31BA ; L # Lo [27] BOPOMOFO LETTER BU..BOPOMOFO LETTER ZY
31F0..31FF ; L # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
3200..321C ; L # So [29] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED HANGUL CIEUC U
3220..3229 ; L # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
@ -540,8 +552,7 @@ A60C ; L # Lm VAI SYLLABLE LENGTHENER
A610..A61F ; L # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG
A620..A629 ; L # Nd [10] VAI DIGIT ZERO..VAI DIGIT NINE
A62A..A62B ; L # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO
A640..A65F ; L # L& [32] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER YN
A662..A66D ; L # L& [12] CYRILLIC CAPITAL LETTER SOFT DE..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O
A640..A66D ; L # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O
A66E ; L # Lo CYRILLIC LETTER MULTIOCULAR O
A680..A697 ; L # L& [24] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER SHWE
A6A0..A6E5 ; L # Lo [70] BAMUM LETTER A..BAMUM LETTER KI
@ -551,7 +562,10 @@ A722..A76F ; L # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMAL
A770 ; L # Lm MODIFIER LETTER US
A771..A787 ; L # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
A789..A78A ; L # Sk [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN
A78B..A78C ; L # L& [2] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER SALTILLO
A78B..A78E ; L # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
A790..A791 ; L # L& [2] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER N WITH DESCENDER
A7A0..A7A9 ; L # L& [10] LATIN CAPITAL LETTER G WITH OBLIQUE STROKE..LATIN SMALL LETTER S WITH OBLIQUE STROKE
A7FA ; L # L& LATIN LETTER SMALL CAPITAL TURNED M
A7FB..A801 ; L # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I
A803..A805 ; L # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O
A807..A80A ; L # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO
@ -608,6 +622,11 @@ AAC2 ; L # Lo TAI VIET TONE MAI SONG
AADB..AADC ; L # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG
AADD ; L # Lm TAI VIET SYMBOL SAM
AADE..AADF ; L # Po [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI
AB01..AB06 ; L # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO
AB09..AB0E ; L # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO
AB11..AB16 ; L # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
AB20..AB26 ; L # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
AB28..AB2E ; L # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
ABC0..ABE2 ; L # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
ABE3..ABE4 ; L # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP
ABE6..ABE7 ; L # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP
@ -664,6 +683,11 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER
10400..1044F ; L # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW
10450..1049D ; L # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO
104A0..104A9 ; L # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE
11000 ; L # Mc BRAHMI SIGN CANDRABINDU
11002 ; L # Mc BRAHMI SIGN VISARGA
11003..11037 ; L # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA
11047..1104D ; L # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
11066..1106F ; L # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
11082 ; L # Mc KAITHI SIGN VISARGA
11083..110AF ; L # Lo [45] KAITHI LETTER A..KAITHI LETTER HA
110B0..110B2 ; L # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II
@ -675,6 +699,8 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER
12400..12462 ; L # Nl [99] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER
12470..12473 ; L # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON
13000..1342E ; L # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
16800..16A38 ; L # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
1B000..1B001 ; L # Lo [2] KATAKANA LETTER ARCHAIC E..HIRAGANA LETTER ARCHAIC YE
1D000..1D0F5 ; L # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
1D100..1D126 ; L # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
1D129..1D164 ; L # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
@ -721,30 +747,23 @@ FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER
1D7AA..1D7C2 ; L # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA
1D7C4..1D7CB ; L # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
1F110..1F12E ; L # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ
1F131 ; L # So SQUARED LATIN CAPITAL LETTER B
1F13D ; L # So SQUARED LATIN CAPITAL LETTER N
1F13F ; L # So SQUARED LATIN CAPITAL LETTER P
1F142 ; L # So SQUARED LATIN CAPITAL LETTER S
1F146 ; L # So SQUARED LATIN CAPITAL LETTER W
1F14A..1F14E ; L # So [5] SQUARED HV..SQUARED PPV
1F157 ; L # So NEGATIVE CIRCLED LATIN CAPITAL LETTER H
1F15F ; L # So NEGATIVE CIRCLED LATIN CAPITAL LETTER P
1F179 ; L # So NEGATIVE SQUARED LATIN CAPITAL LETTER J
1F17B..1F17C ; L # So [2] NEGATIVE SQUARED LATIN CAPITAL LETTER L..NEGATIVE SQUARED LATIN CAPITAL LETTER M
1F17F ; L # So NEGATIVE SQUARED LATIN CAPITAL LETTER P
1F18A..1F18D ; L # So [4] CROSSED NEGATIVE SQUARED LATIN CAPITAL LETTER P..NEGATIVE SQUARED SA
1F190 ; L # So SQUARE DJ
1F200 ; L # So SQUARE HIRAGANA HOKA
1F210..1F231 ; L # So [34] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-6253
1F130..1F169 ; L # So [58] SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
1F170..1F19A ; L # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS
1F1E6..1F202 ; L # So [29] REGIONAL INDICATOR SYMBOL LETTER A..SQUARED KATAKANA SA
1F210..1F23A ; L # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6
1F240..1F248 ; L # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
1F250..1F251 ; L # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
1F48C ; L # So LOVE LETTER
1F524 ; L # So INPUT SYMBOL FOR LATIN LETTERS
20000..2A6D6 ; L # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; L # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; L # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2F800..2FA1D ; L # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
F0000..FFFFD ; L # Co [65534] <private-use-F0000>..<private-use-FFFFD>
100000..10FFFD; L # Co [65534] <private-use-100000>..<private-use-10FFFD>
# The above property value applies to 861492 code points not listed here.
# Total code points: 1099541
# The above property value applies to 859451 code points not listed here.
# Total code points: 1098619
# ================================================
@ -772,7 +791,11 @@ F0000..FFFFD ; L # Co [65534] <private-use-F0000>..<private-use-FFFFD>
0828 ; R # Lm SAMARITAN MODIFIER LETTER I
082E..082F ; R # Cn [2] <reserved-082E>..<reserved-082F>
0830..083E ; R # Po [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU
083F..08FF ; R # Cn [193] <reserved-083F>..<reserved-08FF>
083F ; R # Cn <reserved-083F>
0840..0858 ; R # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
085C..085D ; R # Cn [2] <reserved-085C>..<reserved-085D>
085E ; R # Po MANDAIC PUNCTUATION
085F..08FF ; R # Cn [161] <reserved-085F>..<reserved-08FF>
200F ; R # Cf RIGHT-TO-LEFT MARK
FB1D ; R # Lo HEBREW LETTER YOD WITH HIRIQ
FB1F..FB28 ; R # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV
@ -841,7 +864,7 @@ FB46..FB4F ; R # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATURE AL
10E7F..10FFF ; R # Cn [385] <reserved-10E7F>..<reserved-10FFF>
1E800..1EFFF ; R # Cn [2048] <reserved-1E800>..<reserved-1EFFF>
# Total code points: 4441
# Total code points: 4438
# ================================================
@ -897,7 +920,7 @@ FF0D ; ES # Pd FULLWIDTH HYPHEN-MINUS
0E3F ; ET # Sc THAI CURRENCY SYMBOL BAHT
17DB ; ET # Sc KHMER CURRENCY SYMBOL RIEL
2030..2034 ; ET # Po [5] PER MILLE SIGN..TRIPLE PRIME
20A0..20B8 ; ET # Sc [25] EURO-CURRENCY SIGN..TENGE SIGN
20A0..20B9 ; ET # Sc [26] EURO-CURRENCY SIGN..INDIAN RUPEE SIGN
212E ; ET # So ESTIMATED SYMBOL
2213 ; ET # Sm MINUS-OR-PLUS SIGN
A838 ; ET # Sc NORTH INDIC RUPEE MARK
@ -911,7 +934,7 @@ FF05 ; ET # Po FULLWIDTH PERCENT SIGN
FFE0..FFE1 ; ET # Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN
FFE5..FFE6 ; ET # Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
# Total code points: 63
# Total code points: 64
# ================================================
@ -921,9 +944,10 @@ FFE5..FFE6 ; ET # Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
0660..0669 ; AN # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
066B..066C ; AN # Po [2] ARABIC DECIMAL SEPARATOR..ARABIC THOUSANDS SEPARATOR
06DD ; AN # Cf ARABIC END OF AYAH
070F ; AN # Cf SYRIAC ABBREVIATION MARK
10E60..10E7E ; AN # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
# Total code points: 48
# Total code points: 49
# ================================================
@ -1038,13 +1062,13 @@ FF1A ; CS # Po FULLWIDTH COLON
058A ; ON # Pd ARMENIAN HYPHEN
0606..0607 ; ON # Sm [2] ARABIC-INDIC CUBE ROOT..ARABIC-INDIC FOURTH ROOT
060E..060F ; ON # So [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA
06DE ; ON # So ARABIC START OF RUB EL HIZB
06E9 ; ON # So ARABIC PLACE OF SAJDAH
07F6 ; ON # So NKO SYMBOL OO DENNEN
07F7..07F9 ; ON # Po [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK
0BF3..0BF8 ; ON # So [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN
0BFA ; ON # So TAMIL NUMBER SIGN
0C78..0C7E ; ON # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR
0CF1..0CF2 ; ON # So [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
0F3A ; ON # Ps TIBETAN MARK GUG RTAGS GYON
0F3B ; ON # Pe TIBETAN MARK GUG RTAGS GYAS
0F3C ; ON # Ps TIBETAN MARK ANG KHANG GYON
@ -1059,8 +1083,7 @@ FF1A ; CS # Po FULLWIDTH COLON
1807..180A ; ON # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU
1940 ; ON # So LIMBU SIGN LOO
1944..1945 ; ON # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
19DE..19DF ; ON # Po [2] NEW TAI LUE SIGN LAE..NEW TAI LUE SIGN LAEV
19E0..19FF ; ON # So [32] KHMER SYMBOL PATHAMASAT..KHMER SYMBOL DAP-PRAM ROC
19DE..19FF ; ON # So [34] NEW TAI LUE SIGN LAE..KHMER SYMBOL DAP-PRAM ROC
1FBD ; ON # Sk GREEK KORONIS
1FBF..1FC1 ; ON # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI
1FCD..1FCF ; ON # Sk [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI
@ -1100,7 +1123,8 @@ FF1A ; CS # Po FULLWIDTH COLON
2103..2106 ; ON # So [4] DEGREE CELSIUS..CADA UNA
2108..2109 ; ON # So [2] SCRUPLE..DEGREE FAHRENHEIT
2114 ; ON # So L B BAR SYMBOL
2116..2118 ; ON # So [3] NUMERO SIGN..SCRIPT CAPITAL P
2116..2117 ; ON # So [2] NUMERO SIGN..SOUND RECORDING COPYRIGHT
2118 ; ON # Sm SCRIPT CAPITAL P
211E..2123 ; ON # So [6] PRESCRIPTION TAKE..VERSICLE
2125 ; ON # So OUNCE SIGN
2127 ; ON # So INVERTED OHM SIGN
@ -1147,7 +1171,7 @@ FF1A ; CS # Po FULLWIDTH COLON
239B..23B3 ; ON # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; ON # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; ON # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
23E2..23E8 ; ON # So [7] WHITE TRAPEZIUM..DECIMAL EXPONENT SYMBOL
23E2..23F3 ; ON # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND
2400..2426 ; ON # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
2440..244A ; ON # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
2460..2487 ; ON # No [40] CIRCLED DIGIT ONE..PARENTHESIZED NUMBER TWENTY
@ -1161,18 +1185,8 @@ FF1A ; CS # Po FULLWIDTH COLON
2600..266E ; ON # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN
266F ; ON # Sm MUSIC SHARP SIGN
2670..26AB ; ON # So [60] WEST SYRIAC CROSS..MEDIUM BLACK CIRCLE
26AD..26CD ; ON # So [33] MARRIAGE SYMBOL..DISABLED CAR
26CF..26E1 ; ON # So [19] PICK..RESTRICTED LEFT ENTRY-2
26E3 ; ON # So HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE
26E8..26FF ; ON # So [24] BLACK CROSS ON SHIELD..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
2701..2704 ; ON # So [4] UPPER BLADE SCISSORS..WHITE SCISSORS
2706..2709 ; ON # So [4] TELEPHONE LOCATION SIGN..ENVELOPE
270C..2727 ; ON # So [28] VICTORY HAND..WHITE FOUR POINTED STAR
2729..274B ; ON # So [35] STRESS OUTLINED WHITE STAR..HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK
274D ; ON # So SHADOWED WHITE CIRCLE
274F..2752 ; ON # So [4] LOWER RIGHT DROP-SHADOWED WHITE SQUARE..UPPER RIGHT SHADOWED WHITE SQUARE
2756..275E ; ON # So [9] BLACK DIAMOND MINUS WHITE X..HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT
2761..2767 ; ON # So [7] CURVED STEM PARAGRAPH SIGN ORNAMENT..ROTATED FLORAL HEART BULLET
26AD..26FF ; ON # So [83] MARRIAGE SYMBOL..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
2701..2767 ; ON # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET
2768 ; ON # Ps MEDIUM LEFT PARENTHESIS ORNAMENT
2769 ; ON # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT
276A ; ON # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
@ -1188,15 +1202,13 @@ FF1A ; CS # Po FULLWIDTH COLON
2774 ; ON # Ps MEDIUM LEFT CURLY BRACKET ORNAMENT
2775 ; ON # Pe MEDIUM RIGHT CURLY BRACKET ORNAMENT
2776..2793 ; ON # No [30] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN
2794 ; ON # So HEAVY WIDE-HEADED RIGHTWARDS ARROW
2798..27AF ; ON # So [24] HEAVY SOUTH EAST ARROW..NOTCHED LOWER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW
27B1..27BE ; ON # So [14] NOTCHED UPPER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW..OPEN-OUTLINED RIGHTWARDS ARROW
2794..27BF ; ON # So [44] HEAVY WIDE-HEADED RIGHTWARDS ARROW..DOUBLE CURLY LOOP
27C0..27C4 ; ON # Sm [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET
27C5 ; ON # Ps LEFT S-SHAPED BAG DELIMITER
27C6 ; ON # Pe RIGHT S-SHAPED BAG DELIMITER
27C7..27CA ; ON # Sm [4] OR WITH DOT INSIDE..VERTICAL BAR WITH HORIZONTAL STROKE
27CC ; ON # Sm LONG DIVISION
27D0..27E5 ; ON # Sm [22] WHITE DIAMOND WITH CENTRED DOT..WHITE SQUARE WITH RIGHTWARDS TICK
27CE..27E5 ; ON # Sm [24] SQUARED LOGICAL AND..WHITE SQUARE WITH RIGHTWARDS TICK
27E6 ; ON # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET
27E7 ; ON # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET
27E8 ; ON # Ps MATHEMATICAL LEFT ANGLE BRACKET
@ -1424,6 +1436,7 @@ FFFC..FFFD ; ON # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTE
10190..1019B ; ON # So [12] ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN
1091F ; ON # Po PHOENICIAN WORD SEPARATOR
10B39..10B3F ; ON # Po [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION
11052..11065 ; ON # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND
1D200..1D241 ; ON # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D245 ; ON # So GREEK MUSICAL LEIMMA
1D300..1D356 ; ON # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
@ -1434,8 +1447,42 @@ FFFC..FFFD ; ON # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTE
1D7C3 ; ON # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL
1F000..1F02B ; ON # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK
1F030..1F093 ; ON # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06
1F0A0..1F0AE ; ON # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES
1F0B1..1F0BE ; ON # So [14] PLAYING CARD ACE OF HEARTS..PLAYING CARD KING OF HEARTS
1F0C1..1F0CF ; ON # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER
1F0D1..1F0DF ; ON # So [15] PLAYING CARD ACE OF CLUBS..PLAYING CARD WHITE JOKER
1F300..1F320 ; ON # So [33] CYCLONE..SHOOTING STAR
1F330..1F335 ; ON # So [6] CHESTNUT..CACTUS
1F337..1F37C ; ON # So [70] TULIP..BABY BOTTLE
1F380..1F393 ; ON # So [20] RIBBON..GRADUATION CAP
1F3A0..1F3C4 ; ON # So [37] CAROUSEL HORSE..SURFER
1F3C6..1F3CA ; ON # So [5] TROPHY..SWIMMER
1F3E0..1F3F0 ; ON # So [17] HOUSE BUILDING..EUROPEAN CASTLE
1F400..1F43E ; ON # So [63] RAT..PAW PRINTS
1F440 ; ON # So EYES
1F442..1F48B ; ON # So [74] EAR..KISS MARK
1F48D..1F4F7 ; ON # So [107] RING..CAMERA
1F4F9..1F4FC ; ON # So [4] VIDEO CAMERA..VIDEOCASSETTE
1F500..1F523 ; ON # So [36] TWISTED RIGHTWARDS ARROWS..INPUT SYMBOL FOR SYMBOLS
1F525..1F53D ; ON # So [25] FIRE..DOWN-POINTING SMALL RED TRIANGLE
1F550..1F567 ; ON # So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
1F5FB..1F5FF ; ON # So [5] MOUNT FUJI..MOYAI
1F601..1F610 ; ON # So [16] GRINNING FACE WITH SMILING EYES..NEUTRAL FACE
1F612..1F614 ; ON # So [3] UNAMUSED FACE..PENSIVE FACE
1F616 ; ON # So CONFOUNDED FACE
1F618 ; ON # So FACE THROWING A KISS
1F61A ; ON # So KISSING FACE WITH CLOSED EYES
1F61C..1F61E ; ON # So [3] FACE WITH STUCK-OUT TONGUE AND WINKING EYE..DISAPPOINTED FACE
1F620..1F625 ; ON # So [6] ANGRY FACE..DISAPPOINTED BUT RELIEVED FACE
1F628..1F62B ; ON # So [4] FEARFUL FACE..TIRED FACE
1F62D ; ON # So LOUDLY CRYING FACE
1F630..1F633 ; ON # So [4] FACE WITH OPEN MOUTH AND COLD SWEAT..FLUSHED FACE
1F635..1F640 ; ON # So [12] DIZZY FACE..WEARY CAT FACE
1F645..1F64F ; ON # So [11] FACE WITH NO GOOD GESTURE..PERSON WITH FOLDED HANDS
1F680..1F6C5 ; ON # So [70] ROCKET..LEFT LUGGAGE
1F700..1F773 ; ON # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
# Total code points: 3523
# Total code points: 4412
# ================================================
@ -1446,7 +1493,6 @@ FFFC..FFFD ; ON # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTE
007F..0084 ; BN # Cc [6] <control-007F>..<control-0084>
0086..009F ; BN # Cc [26] <control-0086>..<control-009F>
00AD ; BN # Cf SOFT HYPHEN
070F ; BN # Cf SYRIAC ABBREVIATION MARK
200B..200D ; BN # Cf [3] ZERO WIDTH SPACE..ZERO WIDTH JOINER
2060..2064 ; BN # Cf [5] WORD JOINER..INVISIBLE PLUS
2065..2069 ; BN # Cn [5] <reserved-2065>..<reserved-2069>
@ -1478,7 +1524,7 @@ EFFFE..EFFFF ; BN # Cn [2] <noncharacter-EFFFE>..<noncharacter-EFFFF>
FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
10FFFE..10FFFF; BN # Cn [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>
# Total code points: 4016
# Total code points: 4015
# ================================================
@ -1493,10 +1539,9 @@ FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
05C4..05C5 ; NSM # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
05C7 ; NSM # Mn HEBREW POINT QAMATS QATAN
0610..061A ; NSM # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
064B..065E ; NSM # Mn [20] ARABIC FATHATAN..ARABIC FATHA WITH TWO DOTS
064B..065F ; NSM # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW
0670 ; NSM # Mn ARABIC LETTER SUPERSCRIPT ALEF
06D6..06DC ; NSM # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN
06DE ; NSM # Me ARABIC START OF RUB EL HIZB
06DF..06E4 ; NSM # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA
06E7..06E8 ; NSM # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON
06EA..06ED ; NSM # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM
@ -1508,11 +1553,13 @@ FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
081B..0823 ; NSM # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
0825..0827 ; NSM # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; NSM # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; NSM # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
0900..0902 ; NSM # Mn [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA
093A ; NSM # Mn DEVANAGARI VOWEL SIGN OE
093C ; NSM # Mn DEVANAGARI SIGN NUKTA
0941..0948 ; NSM # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI
094D ; NSM # Mn DEVANAGARI SIGN VIRAMA
0951..0955 ; NSM # Mn [5] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN CANDRA LONG E
0951..0957 ; NSM # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE
0962..0963 ; NSM # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL
0981 ; NSM # Mn BENGALI SIGN CANDRABINDU
09BC ; NSM # Mn BENGALI SIGN NUKTA
@ -1571,7 +1618,7 @@ FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
0F71..0F7E ; NSM # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
0F80..0F84 ; NSM # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA
0F86..0F87 ; NSM # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
0F90..0F97 ; NSM # Mn [8] TIBETAN SUBJOINED LETTER KA..TIBETAN SUBJOINED LETTER JA
0F8D..0F97 ; NSM # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
0F99..0FBC ; NSM # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
0FC6 ; NSM # Mn TIBETAN SYMBOL PADMA GDAN
102D..1030 ; NSM # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU
@ -1585,7 +1632,7 @@ FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
1085..1086 ; NSM # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y
108D ; NSM # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE
109D ; NSM # Mn MYANMAR VOWEL SIGN AITON AI
135F ; NSM # Mn ETHIOPIC COMBINING GEMINATION MARK
135D..135F ; NSM # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
1712..1714 ; NSM # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
1732..1734 ; NSM # Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
1752..1753 ; NSM # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
@ -1617,6 +1664,10 @@ FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
1B80..1B81 ; NSM # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR
1BA2..1BA5 ; NSM # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
1BA8..1BA9 ; NSM # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
1BE6 ; NSM # Mn BATAK SIGN TOMPI
1BE8..1BE9 ; NSM # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
1BED ; NSM # Mn BATAK VOWEL SIGN KARO O
1BEF..1BF1 ; NSM # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H
1C2C..1C33 ; NSM # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
1C36..1C37 ; NSM # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA
1CD0..1CD2 ; NSM # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
@ -1624,13 +1675,14 @@ FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
1CE2..1CE8 ; NSM # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; NSM # Mn VEDIC SIGN TIRYAK
1DC0..1DE6 ; NSM # Mn [39] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER Z
1DFD..1DFF ; NSM # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1DFC..1DFF ; NSM # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
20D0..20DC ; NSM # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
20DD..20E0 ; NSM # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
20E1 ; NSM # Mn COMBINING LEFT RIGHT ARROW ABOVE
20E2..20E4 ; NSM # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE
20E5..20F0 ; NSM # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
2CEF..2CF1 ; NSM # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS
2D7F ; NSM # Mn TIFINAGH CONSONANT JOINER
2DE0..2DFF ; NSM # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
302A..302F ; NSM # Mn [6] IDEOGRAPHIC LEVEL TONE MARK..HANGUL DOUBLE DOT TONE MARK
3099..309A ; NSM # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
@ -1672,6 +1724,8 @@ FE20..FE26 ; NSM # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOININ
10A0C..10A0F ; NSM # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
10A38..10A3A ; NSM # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
10A3F ; NSM # Mn KHAROSHTHI VIRAMA
11001 ; NSM # Mn BRAHMI SIGN ANUSVARA
11038..11046 ; NSM # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
11080..11081 ; NSM # Mn [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA
110B3..110B6 ; NSM # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
110B9..110BA ; NSM # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
@ -1682,7 +1736,7 @@ FE20..FE26 ; NSM # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOININ
1D242..1D244 ; NSM # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
# Total code points: 1173
# Total code points: 1209
# ================================================
@ -1695,11 +1749,9 @@ E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
061B ; AL # Po ARABIC SEMICOLON
061C..061D ; AL # Cn [2] <reserved-061C>..<reserved-061D>
061E..061F ; AL # Po [2] ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC QUESTION MARK
0620 ; AL # Cn <reserved-0620>
0621..063F ; AL # Lo [31] ARABIC LETTER HAMZA..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
0620..063F ; AL # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
0640 ; AL # Lm ARABIC TATWEEL
0641..064A ; AL # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH
065F ; AL # Cn <reserved-065F>
066D ; AL # Po ARABIC FIVE POINTED STAR
066E..066F ; AL # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF
0671..06D3 ; AL # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
@ -1719,7 +1771,8 @@ E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
07B1 ; AL # Lo THAANA LETTER NAA
07B2..07BF ; AL # Cn [14] <reserved-07B2>..<reserved-07BF>
FB50..FBB1 ; AL # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
FBB2..FBD2 ; AL # Cn [33] <reserved-FBB2>..<reserved-FBD2>
FBB2..FBC1 ; AL # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
FBC2..FBD2 ; AL # Cn [17] <reserved-FBC2>..<reserved-FBD2>
FBD3..FD3D ; AL # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
FD40..FD4F ; AL # Cn [16] <reserved-FD40>..<reserved-FD4F>
FD50..FD8F ; AL # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
@ -1734,7 +1787,7 @@ FE75 ; AL # Cn <reserved-FE75>
FE76..FEFC ; AL # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
FEFD..FEFE ; AL # Cn [2] <reserved-FEFD>..<reserved-FEFE>
# Total code points: 1116
# Total code points: 1115
# ================================================

File diff suppressed because it is too large Load diff

View file

@ -1,8 +1,8 @@
# DerivedJoiningGroup-5.2.0.txt
# Date: 2009-05-22, 18:51:25 GMT [MD]
# DerivedJoiningGroup-6.0.0.txt
# Date: 2010-07-17, 22:46:14 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -126,7 +126,7 @@
# ================================================
06C3 ; Hamza_On_Heh_Goal # Lo ARABIC LETTER TEH MARBUTA GOAL
06C3 ; Teh_Marbuta_Goal # Lo ARABIC LETTER TEH MARBUTA GOAL
# Total code points: 1
@ -343,13 +343,14 @@
# ================================================
0620 ; Yeh # Lo ARABIC LETTER KASHMIRI YEH
0626 ; Yeh # Lo ARABIC LETTER YEH WITH HAMZA ABOVE
0649..064A ; Yeh # Lo [2] ARABIC LETTER ALEF MAKSURA..ARABIC LETTER YEH
0678 ; Yeh # Lo ARABIC LETTER HIGH HAMZA YEH
06D0..06D1 ; Yeh # Lo [2] ARABIC LETTER E..ARABIC LETTER YEH WITH THREE DOTS BELOW
0777 ; Yeh # Lo ARABIC LETTER FARSI YEH WITH EXTENDED ARABIC-INDIC DIGIT FOUR BELOW
# Total code points: 7
# Total code points: 8
# ================================================

View file

@ -1,8 +1,8 @@
# DerivedJoiningType-5.2.0.txt
# Date: 2009-05-28, 20:37:39 GMT [MD]
# DerivedJoiningType-6.0.0.txt
# Date: 2010-08-19, 00:48:10 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -29,6 +29,7 @@
# Joining_Type=Dual_Joining
0620 ; D # Lo ARABIC LETTER KASHMIRI YEH
0626 ; D # Lo ARABIC LETTER YEH WITH HAMZA ABOVE
0628 ; D # Lo ARABIC LETTER BEH
062A..062E ; D # Lo [5] ARABIC LETTER TEH..ARABIC LETTER KHAH
@ -58,7 +59,7 @@
077A..077F ; D # Lo [6] ARABIC LETTER YEH BARREE WITH EXTENDED ARABIC-INDIC DIGIT TWO ABOVE..ARABIC LETTER KAF WITH TWO DOTS ABOVE
07CA..07EA ; D # Lo [33] NKO LETTER A..NKO LETTER JONA RA
# Total code points: 188
# Total code points: 189
# ================================================
@ -109,10 +110,9 @@
05C4..05C5 ; T # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
05C7 ; T # Mn HEBREW POINT QAMATS QATAN
0610..061A ; T # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
064B..065E ; T # Mn [20] ARABIC FATHATAN..ARABIC FATHA WITH TWO DOTS
064B..065F ; T # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW
0670 ; T # Mn ARABIC LETTER SUPERSCRIPT ALEF
06D6..06DC ; T # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN
06DE ; T # Me ARABIC START OF RUB EL HIZB
06DF..06E4 ; T # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA
06E7..06E8 ; T # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON
06EA..06ED ; T # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM
@ -125,11 +125,13 @@
081B..0823 ; T # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
0825..0827 ; T # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
0829..082D ; T # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
0859..085B ; T # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
0900..0902 ; T # Mn [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA
093A ; T # Mn DEVANAGARI VOWEL SIGN OE
093C ; T # Mn DEVANAGARI SIGN NUKTA
0941..0948 ; T # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI
094D ; T # Mn DEVANAGARI SIGN VIRAMA
0951..0955 ; T # Mn [5] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN CANDRA LONG E
0951..0957 ; T # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE
0962..0963 ; T # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL
0981 ; T # Mn BENGALI SIGN CANDRABINDU
09BC ; T # Mn BENGALI SIGN NUKTA
@ -190,7 +192,7 @@
0F71..0F7E ; T # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
0F80..0F84 ; T # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA
0F86..0F87 ; T # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
0F90..0F97 ; T # Mn [8] TIBETAN SUBJOINED LETTER KA..TIBETAN SUBJOINED LETTER JA
0F8D..0F97 ; T # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
0F99..0FBC ; T # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
0FC6 ; T # Mn TIBETAN SYMBOL PADMA GDAN
102D..1030 ; T # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU
@ -204,7 +206,7 @@
1085..1086 ; T # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y
108D ; T # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE
109D ; T # Mn MYANMAR VOWEL SIGN AITON AI
135F ; T # Mn ETHIOPIC COMBINING GEMINATION MARK
135D..135F ; T # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
1712..1714 ; T # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
1732..1734 ; T # Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
1752..1753 ; T # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
@ -237,6 +239,10 @@
1B80..1B81 ; T # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR
1BA2..1BA5 ; T # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
1BA8..1BA9 ; T # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
1BE6 ; T # Mn BATAK SIGN TOMPI
1BE8..1BE9 ; T # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
1BED ; T # Mn BATAK VOWEL SIGN KARO O
1BEF..1BF1 ; T # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H
1C2C..1C33 ; T # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
1C36..1C37 ; T # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA
1CD0..1CD2 ; T # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
@ -244,7 +250,7 @@
1CE2..1CE8 ; T # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; T # Mn VEDIC SIGN TIRYAK
1DC0..1DE6 ; T # Mn [39] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER Z
1DFD..1DFF ; T # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
1DFC..1DFF ; T # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
200B ; T # Cf ZERO WIDTH SPACE
200E..200F ; T # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK
202A..202E ; T # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
@ -256,6 +262,7 @@
20E2..20E4 ; T # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE
20E5..20F0 ; T # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
2CEF..2CF1 ; T # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS
2D7F ; T # Mn TIFINAGH CONSONANT JOINER
2DE0..2DFF ; T # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
302A..302F ; T # Mn [6] IDEOGRAPHIC LEVEL TONE MARK..HANGUL DOUBLE DOT TONE MARK
3099..309A ; T # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
@ -299,6 +306,8 @@ FFF9..FFFB ; T # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATI
10A0C..10A0F ; T # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
10A38..10A3A ; T # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
10A3F ; T # Mn KHAROSHTHI VIRAMA
11001 ; T # Mn BRAHMI SIGN ANUSVARA
11038..11046 ; T # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
11080..11081 ; T # Mn [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA
110B3..110B6 ; T # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI
110B9..110BA ; T # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA
@ -313,6 +322,6 @@ E0001 ; T # Cf LANGUAGE TAG
E0020..E007F ; T # Cf [96] TAG SPACE..CANCEL TAG
E0100..E01EF ; T # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
# Total code points: 1308
# Total code points: 1344
# EOF

View file

@ -1,14 +1,14 @@
# DerivedNormalizationProps-5.2.0.txt
# Date: 2009-08-26, 18:18:50 GMT [MD]
# DerivedNormalizationProps-6.0.0.txt
# Date: 2010-05-20, 15:14:12 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# ================================================
# Derived Property: FC_NFKC_Closure
# Derived Property: FC_NFKC_Closure (DEPRECATED as of Unicode 6.0.0)
# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));
# Then if (c != b) add the mapping from a to c to the set of
# mappings that constitute the FC_NFKC_Closure list
@ -611,19 +611,41 @@
1F12C ; FC_NFKC; 0072
1F12D ; FC_NFKC; 0063 0064
1F12E ; FC_NFKC; 0077 007A
1F130 ; FC_NFKC; 0061
1F131 ; FC_NFKC; 0062
1F132 ; FC_NFKC; 0063
1F133 ; FC_NFKC; 0064
1F134 ; FC_NFKC; 0065
1F135 ; FC_NFKC; 0066
1F136 ; FC_NFKC; 0067
1F137 ; FC_NFKC; 0068
1F138 ; FC_NFKC; 0069
1F139 ; FC_NFKC; 006A
1F13A ; FC_NFKC; 006B
1F13B ; FC_NFKC; 006C
1F13C ; FC_NFKC; 006D
1F13D ; FC_NFKC; 006E
1F13E ; FC_NFKC; 006F
1F13F ; FC_NFKC; 0070
1F140 ; FC_NFKC; 0071
1F141 ; FC_NFKC; 0072
1F142 ; FC_NFKC; 0073
1F143 ; FC_NFKC; 0074
1F144 ; FC_NFKC; 0075
1F145 ; FC_NFKC; 0076
1F146 ; FC_NFKC; 0077
1F147 ; FC_NFKC; 0078
1F148 ; FC_NFKC; 0079
1F149 ; FC_NFKC; 007A
1F14A ; FC_NFKC; 0068 0076
1F14B ; FC_NFKC; 006D 0076
1F14C ; FC_NFKC; 0073 0064
1F14D ; FC_NFKC; 0073 0073
1F14E ; FC_NFKC; 0070 0070 0076
1F14F ; FC_NFKC; 0077 0063
1F190 ; FC_NFKC; 0064 006A
# Total code points: 608
# Total code points: 630
# ================================================
@ -1299,7 +1321,7 @@ FB46..FB4E ; NFC_QC; N
208A..208C ; NFKD_QC; N
208D ; NFKD_QC; N
208E ; NFKD_QC; N
2090..2094 ; NFKD_QC; N
2090..209C ; NFKD_QC; N
20A8 ; NFKD_QC; N
2100..2101 ; NFKD_QC; N
2102 ; NFKD_QC; N
@ -1603,19 +1625,15 @@ FFED..FFEE ; NFKD_QC; N
1D7CE..1D7FF ; NFKD_QC; N
1F100..1F10A ; NFKD_QC; N
1F110..1F12E ; NFKD_QC; N
1F131 ; NFKD_QC; N
1F13D ; NFKD_QC; N
1F13F ; NFKD_QC; N
1F142 ; NFKD_QC; N
1F146 ; NFKD_QC; N
1F14A..1F14E ; NFKD_QC; N
1F130..1F14F ; NFKD_QC; N
1F190 ; NFKD_QC; N
1F200 ; NFKD_QC; N
1F210..1F231 ; NFKD_QC; N
1F200..1F202 ; NFKD_QC; N
1F210..1F23A ; NFKD_QC; N
1F240..1F248 ; NFKD_QC; N
1F250..1F251 ; NFKD_QC; N
2F800..2FA1D ; NFKD_QC; N
# Total code points: 16688
# Total code points: 16731
# ================================================
@ -1745,7 +1763,7 @@ FFED..FFEE ; NFKD_QC; N
208A..208C ; NFKC_QC; N
208D ; NFKC_QC; N
208E ; NFKC_QC; N
2090..2094 ; NFKC_QC; N
2090..209C ; NFKC_QC; N
20A8 ; NFKC_QC; N
2100..2101 ; NFKC_QC; N
2102 ; NFKC_QC; N
@ -1976,19 +1994,15 @@ FFED..FFEE ; NFKC_QC; N
1D7CE..1D7FF ; NFKC_QC; N
1F100..1F10A ; NFKC_QC; N
1F110..1F12E ; NFKC_QC; N
1F131 ; NFKC_QC; N
1F13D ; NFKC_QC; N
1F13F ; NFKC_QC; N
1F142 ; NFKC_QC; N
1F146 ; NFKC_QC; N
1F14A..1F14E ; NFKC_QC; N
1F130..1F14F ; NFKC_QC; N
1F190 ; NFKC_QC; N
1F200 ; NFKC_QC; N
1F210..1F231 ; NFKC_QC; N
1F200..1F202 ; NFKC_QC; N
1F210..1F23A ; NFKC_QC; N
1F240..1F248 ; NFKC_QC; N
1F250..1F251 ; NFKC_QC; N
2F800..2FA1D ; NFKC_QC; N
# Total code points: 4597
# Total code points: 4640
# ================================================
@ -2034,7 +2048,7 @@ FFED..FFEE ; NFKC_QC; N
# ================================================
# Derived Property: Expands_On_NFD
# Derived Property: Expands_On_NFD (DEPRECATED as of Unicode 6.0.0)
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
@ -2262,7 +2276,7 @@ FB46..FB4E ; Expands_On_NFD
# ================================================
# Derived Property: Expands_On_NFC
# Derived Property: Expands_On_NFC (DEPRECATED as of Unicode 6.0.0)
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
@ -2309,7 +2323,7 @@ FB46..FB4E ; Expands_On_NFC
# ================================================
# Derived Property: Expands_On_NFKD
# Derived Property: Expands_On_NFKD (DEPRECATED as of Unicode 6.0.0)
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
@ -2608,17 +2622,17 @@ FFE3 ; Expands_On_NFKD
1F100..1F10A ; Expands_On_NFKD
1F110..1F12A ; Expands_On_NFKD
1F12D..1F12E ; Expands_On_NFKD
1F14A..1F14E ; Expands_On_NFKD
1F14A..1F14F ; Expands_On_NFKD
1F190 ; Expands_On_NFKD
1F200 ; Expands_On_NFKD
1F200..1F201 ; Expands_On_NFKD
1F213 ; Expands_On_NFKD
1F240..1F248 ; Expands_On_NFKD
# Total code points: 13374
# Total code points: 13376
# ================================================
# Derived Property: Expands_On_NFKC
# Derived Property: Expands_On_NFKC (DEPRECATED as of Unicode 6.0.0)
# Generated according to UAX #15.
# Characters whose normalized length is not one.
# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact.
@ -2746,12 +2760,12 @@ FFE3 ; Expands_On_NFKC
1F100..1F10A ; Expands_On_NFKC
1F110..1F12A ; Expands_On_NFKC
1F12D..1F12E ; Expands_On_NFKC
1F14A..1F14E ; Expands_On_NFKC
1F14A..1F14F ; Expands_On_NFKC
1F190 ; Expands_On_NFKC
1F200 ; Expands_On_NFKC
1F200..1F201 ; Expands_On_NFKC
1F240..1F248 ; Expands_On_NFKC
# Total code points: 1231
# Total code points: 1233
# ================================================
@ -3251,6 +3265,7 @@ FFE3 ; Expands_On_NFKC
0520 ; NFKC_CF; 0521
0522 ; NFKC_CF; 0523
0524 ; NFKC_CF; 0525
0526 ; NFKC_CF; 0527
0531 ; NFKC_CF; 0561
0532 ; NFKC_CF; 0562
0533 ; NFKC_CF; 0563
@ -3817,6 +3832,14 @@ FFE3 ; Expands_On_NFKC
2092 ; NFKC_CF; 006F
2093 ; NFKC_CF; 0078
2094 ; NFKC_CF; 0259
2095 ; NFKC_CF; 0068
2096 ; NFKC_CF; 006B
2097 ; NFKC_CF; 006C
2098 ; NFKC_CF; 006D
2099 ; NFKC_CF; 006E
209A ; NFKC_CF; 0070
209B ; NFKC_CF; 0073
209C ; NFKC_CF; 0074
20A8 ; NFKC_CF; 0072 0073
2100 ; NFKC_CF; 0061 002F 0063
2101 ; NFKC_CF; 0061 002F 0073
@ -5032,6 +5055,7 @@ A658 ; NFKC_CF; A659
A65A ; NFKC_CF; A65B
A65C ; NFKC_CF; A65D
A65E ; NFKC_CF; A65F
A660 ; NFKC_CF; A661
A662 ; NFKC_CF; A663
A664 ; NFKC_CF; A665
A666 ; NFKC_CF; A667
@ -5098,6 +5122,13 @@ A782 ; NFKC_CF; A783
A784 ; NFKC_CF; A785
A786 ; NFKC_CF; A787
A78B ; NFKC_CF; A78C
A78D ; NFKC_CF; 0265
A790 ; NFKC_CF; A791
A7A0 ; NFKC_CF; A7A1
A7A2 ; NFKC_CF; A7A3
A7A4 ; NFKC_CF; A7A5
A7A6 ; NFKC_CF; A7A7
A7A8 ; NFKC_CF; A7A9
F900 ; NFKC_CF; 8C48
F901 ; NFKC_CF; 66F4
F902 ; NFKC_CF; 8ECA
@ -7518,18 +7549,42 @@ FFF0..FFF8 ; NFKC_CF;
1F12C ; NFKC_CF; 0072
1F12D ; NFKC_CF; 0063 0064
1F12E ; NFKC_CF; 0077 007A
1F130 ; NFKC_CF; 0061
1F131 ; NFKC_CF; 0062
1F132 ; NFKC_CF; 0063
1F133 ; NFKC_CF; 0064
1F134 ; NFKC_CF; 0065
1F135 ; NFKC_CF; 0066
1F136 ; NFKC_CF; 0067
1F137 ; NFKC_CF; 0068
1F138 ; NFKC_CF; 0069
1F139 ; NFKC_CF; 006A
1F13A ; NFKC_CF; 006B
1F13B ; NFKC_CF; 006C
1F13C ; NFKC_CF; 006D
1F13D ; NFKC_CF; 006E
1F13E ; NFKC_CF; 006F
1F13F ; NFKC_CF; 0070
1F140 ; NFKC_CF; 0071
1F141 ; NFKC_CF; 0072
1F142 ; NFKC_CF; 0073
1F143 ; NFKC_CF; 0074
1F144 ; NFKC_CF; 0075
1F145 ; NFKC_CF; 0076
1F146 ; NFKC_CF; 0077
1F147 ; NFKC_CF; 0078
1F148 ; NFKC_CF; 0079
1F149 ; NFKC_CF; 007A
1F14A ; NFKC_CF; 0068 0076
1F14B ; NFKC_CF; 006D 0076
1F14C ; NFKC_CF; 0073 0064
1F14D ; NFKC_CF; 0073 0073
1F14E ; NFKC_CF; 0070 0070 0076
1F14F ; NFKC_CF; 0077 0063
1F190 ; NFKC_CF; 0064 006A
1F200 ; NFKC_CF; 307B 304B
1F201 ; NFKC_CF; 30B3 30B3
1F202 ; NFKC_CF; 30B5
1F210 ; NFKC_CF; 624B
1F211 ; NFKC_CF; 5B57
1F212 ; NFKC_CF; 53CC
@ -7564,6 +7619,15 @@ FFF0..FFF8 ; NFKC_CF;
1F22F ; NFKC_CF; 6307
1F230 ; NFKC_CF; 8D70
1F231 ; NFKC_CF; 6253
1F232 ; NFKC_CF; 7981
1F233 ; NFKC_CF; 7A7A
1F234 ; NFKC_CF; 5408
1F235 ; NFKC_CF; 6E80
1F236 ; NFKC_CF; 6709
1F237 ; NFKC_CF; 6708
1F238 ; NFKC_CF; 7533
1F239 ; NFKC_CF; 5272
1F23A ; NFKC_CF; 55B6
1F240 ; NFKC_CF; 3014 672C 3015
1F241 ; NFKC_CF; 3014 4E09 3015
1F242 ; NFKC_CF; 3014 4E8C 3015
@ -7573,6 +7637,8 @@ FFF0..FFF8 ; NFKC_CF;
1F246 ; NFKC_CF; 3014 76D7 3015
1F247 ; NFKC_CF; 3014 52DD 3015
1F248 ; NFKC_CF; 3014 6557 3015
1F250 ; NFKC_CF; 5F97
1F251 ; NFKC_CF; 53EF
2F800 ; NFKC_CF; 4E3D
2F801 ; NFKC_CF; 4E38
2F802 ; NFKC_CF; 4E41
@ -8113,7 +8179,7 @@ E0080..E00FF ; NFKC_CF;
E0100..E01EF ; NFKC_CF;
E01F0..E0FFF ; NFKC_CF;
# Total code points: 9740
# Total code points: 9792
# ================================================
@ -8405,6 +8471,7 @@ E01F0..E0FFF ; NFKC_CF;
0520 ; Changes_When_NFKC_Casefolded
0522 ; Changes_When_NFKC_Casefolded
0524 ; Changes_When_NFKC_Casefolded
0526 ; Changes_When_NFKC_Casefolded
0531..0556 ; Changes_When_NFKC_Casefolded
0587 ; Changes_When_NFKC_Casefolded
0675..0678 ; Changes_When_NFKC_Casefolded
@ -8635,7 +8702,7 @@ E01F0..E0FFF ; NFKC_CF;
208A..208C ; Changes_When_NFKC_Casefolded
208D ; Changes_When_NFKC_Casefolded
208E ; Changes_When_NFKC_Casefolded
2090..2094 ; Changes_When_NFKC_Casefolded
2090..209C ; Changes_When_NFKC_Casefolded
20A8 ; Changes_When_NFKC_Casefolded
2100..2101 ; Changes_When_NFKC_Casefolded
2102 ; Changes_When_NFKC_Casefolded
@ -8776,6 +8843,7 @@ A658 ; Changes_When_NFKC_Casefolded
A65A ; Changes_When_NFKC_Casefolded
A65C ; Changes_When_NFKC_Casefolded
A65E ; Changes_When_NFKC_Casefolded
A660 ; Changes_When_NFKC_Casefolded
A662 ; Changes_When_NFKC_Casefolded
A664 ; Changes_When_NFKC_Casefolded
A666 ; Changes_When_NFKC_Casefolded
@ -8841,6 +8909,13 @@ A782 ; Changes_When_NFKC_Casefolded
A784 ; Changes_When_NFKC_Casefolded
A786 ; Changes_When_NFKC_Casefolded
A78B ; Changes_When_NFKC_Casefolded
A78D ; Changes_When_NFKC_Casefolded
A790 ; Changes_When_NFKC_Casefolded
A7A0 ; Changes_When_NFKC_Casefolded
A7A2 ; Changes_When_NFKC_Casefolded
A7A4 ; Changes_When_NFKC_Casefolded
A7A6 ; Changes_When_NFKC_Casefolded
A7A8 ; Changes_When_NFKC_Casefolded
F900..FA0D ; Changes_When_NFKC_Casefolded
FA10 ; Changes_When_NFKC_Casefolded
FA12 ; Changes_When_NFKC_Casefolded
@ -9012,16 +9087,12 @@ FFF0..FFF8 ; Changes_When_NFKC_Casefolded
1D7CE..1D7FF ; Changes_When_NFKC_Casefolded
1F100..1F10A ; Changes_When_NFKC_Casefolded
1F110..1F12E ; Changes_When_NFKC_Casefolded
1F131 ; Changes_When_NFKC_Casefolded
1F13D ; Changes_When_NFKC_Casefolded
1F13F ; Changes_When_NFKC_Casefolded
1F142 ; Changes_When_NFKC_Casefolded
1F146 ; Changes_When_NFKC_Casefolded
1F14A..1F14E ; Changes_When_NFKC_Casefolded
1F130..1F14F ; Changes_When_NFKC_Casefolded
1F190 ; Changes_When_NFKC_Casefolded
1F200 ; Changes_When_NFKC_Casefolded
1F210..1F231 ; Changes_When_NFKC_Casefolded
1F200..1F202 ; Changes_When_NFKC_Casefolded
1F210..1F23A ; Changes_When_NFKC_Casefolded
1F240..1F248 ; Changes_When_NFKC_Casefolded
1F250..1F251 ; Changes_When_NFKC_Casefolded
2F800..2FA1D ; Changes_When_NFKC_Casefolded
E0000 ; Changes_When_NFKC_Casefolded
E0001 ; Changes_When_NFKC_Casefolded
@ -9031,6 +9102,6 @@ E0080..E00FF ; Changes_When_NFKC_Casefolded
E0100..E01EF ; Changes_When_NFKC_Casefolded
E01F0..E0FFF ; Changes_When_NFKC_Casefolded
# Total code points: 9740
# Total code points: 9792
# EOF

View file

@ -1,8 +1,8 @@
# DerivedNumericValues-5.2.0.txt
# Date: 2009-08-22, 04:58:28 GMT [MD]
# DerivedNumericValues-6.0.0.txt
# Date: 2010-08-19, 00:48:14 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -76,6 +76,7 @@ F9B2 ; 0.0 ; ; 0 # Lo CJK COMPATIBILITY IDEOGRAPH-F9B2
FF10 ; 0.0 ; ; 0 # Nd FULLWIDTH DIGIT ZERO
1018A ; 0.0 ; ; 0 # No GREEK ZERO SIGN
104A0 ; 0.0 ; ; 0 # Nd OSMANYA DIGIT ZERO
11066 ; 0.0 ; ; 0 # Nd BRAHMI DIGIT ZERO
1D7CE ; 0.0 ; ; 0 # Nd MATHEMATICAL BOLD DIGIT ZERO
1D7D8 ; 0.0 ; ; 0 # Nd MATHEMATICAL DOUBLE-STRUCK DIGIT ZERO
1D7E2 ; 0.0 ; ; 0 # Nd MATHEMATICAL SANS-SERIF DIGIT ZERO
@ -83,14 +84,15 @@ FF10 ; 0.0 ; ; 0 # Nd FULLWIDTH DIGIT ZERO
1D7F6 ; 0.0 ; ; 0 # Nd MATHEMATICAL MONOSPACE DIGIT ZERO
1F100..1F101 ; 0.0 ; ; 0 # No [2] DIGIT ZERO FULL STOP..DIGIT ZERO COMMA
# Total code points: 55
# Total code points: 56
# ================================================
09F4 ; 0.0625 ; ; 1/16 # No BENGALI CURRENCY NUMERATOR ONE
0B75 ; 0.0625 ; ; 1/16 # No ORIYA FRACTION ONE SIXTEENTH
A833 ; 0.0625 ; ; 1/16 # No NORTH INDIC FRACTION ONE SIXTEENTH
# Total code points: 2
# Total code points: 3
# ================================================
@ -107,11 +109,12 @@ A833 ; 0.0625 ; ; 1/16 # No NORTH INDIC FRACTION ONE SIXTEENTH
# ================================================
09F5 ; 0.125 ; ; 1/8 # No BENGALI CURRENCY NUMERATOR TWO
0B76 ; 0.125 ; ; 1/8 # No ORIYA FRACTION ONE EIGHTH
215B ; 0.125 ; ; 1/8 # No VULGAR FRACTION ONE EIGHTH
A834 ; 0.125 ; ; 1/8 # No NORTH INDIC FRACTION ONE EIGHTH
1245F ; 0.125 ; ; 1/8 # Nl CUNEIFORM NUMERIC SIGN ONE EIGHTH ASH
# Total code points: 4
# Total code points: 5
# ================================================
@ -129,9 +132,10 @@ A834 ; 0.125 ; ; 1/8 # No NORTH INDIC FRACTION ONE EIGHTH
# ================================================
09F6 ; 0.1875 ; ; 3/16 # No BENGALI CURRENCY NUMERATOR THREE
0B77 ; 0.1875 ; ; 3/16 # No ORIYA FRACTION THREE SIXTEENTHS
A835 ; 0.1875 ; ; 3/16 # No NORTH INDIC FRACTION THREE SIXTEENTHS
# Total code points: 2
# Total code points: 3
# ================================================
@ -143,6 +147,7 @@ A835 ; 0.1875 ; ; 3/16 # No NORTH INDIC FRACTION THREE SIXTEENTHS
00BC ; 0.25 ; ; 1/4 # No VULGAR FRACTION ONE QUARTER
09F7 ; 0.25 ; ; 1/4 # No BENGALI CURRENCY NUMERATOR FOUR
0B72 ; 0.25 ; ; 1/4 # No ORIYA FRACTION ONE QUARTER
0D73 ; 0.25 ; ; 1/4 # No MALAYALAM FRACTION ONE QUARTER
A830 ; 0.25 ; ; 1/4 # No NORTH INDIC FRACTION ONE QUARTER
10140 ; 0.25 ; ; 1/4 # Nl GREEK ACROPHONIC ATTIC ONE QUARTER
@ -150,7 +155,7 @@ A830 ; 0.25 ; ; 1/4 # No NORTH INDIC FRACTION ONE QUARTER
12460 ; 0.25 ; ; 1/4 # Nl CUNEIFORM NUMERIC SIGN ONE QUARTER ASH
12462 ; 0.25 ; ; 1/4 # Nl CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER
# Total code points: 8
# Total code points: 9
# ================================================
@ -176,6 +181,7 @@ A830 ; 0.25 ; ; 1/4 # No NORTH INDIC FRACTION ONE QUARTER
# ================================================
00BD ; 0.5 ; ; 1/2 # No VULGAR FRACTION ONE HALF
0B73 ; 0.5 ; ; 1/2 # No ORIYA FRACTION ONE HALF
0D74 ; 0.5 ; ; 1/2 # No MALAYALAM FRACTION ONE HALF
0F2A ; 0.5 ; ; 1/2 # No TIBETAN DIGIT HALF ONE
2CFD ; 0.5 ; ; 1/2 # No COPTIC FRACTION ONE HALF
@ -184,7 +190,7 @@ A831 ; 0.5 ; ; 1/2 # No NORTH INDIC FRACTION ONE HALF
10175..10176 ; 0.5 ; ; 1/2 # No [2] GREEK ONE HALF SIGN..GREEK ONE HALF SIGN ALTERNATE FORM
10E7B ; 0.5 ; ; 1/2 # No RUMI FRACTION ONE HALF
# Total code points: 9
# Total code points: 10
# ================================================
@ -212,11 +218,12 @@ A831 ; 0.5 ; ; 1/2 # No NORTH INDIC FRACTION ONE HALF
00BE ; 0.75 ; ; 3/4 # No VULGAR FRACTION THREE QUARTERS
09F8 ; 0.75 ; ; 3/4 # No BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR
0B74 ; 0.75 ; ; 3/4 # No ORIYA FRACTION THREE QUARTERS
0D75 ; 0.75 ; ; 3/4 # No MALAYALAM FRACTION THREE QUARTERS
A832 ; 0.75 ; ; 3/4 # No NORTH INDIC FRACTION THREE QUARTERS
10178 ; 0.75 ; ; 3/4 # No GREEK THREE QUARTERS SIGN
# Total code points: 5
# Total code points: 6
# ================================================
@ -266,7 +273,7 @@ A832 ; 0.75 ; ; 3/4 # No NORTH INDIC FRACTION THREE QUARTERS
1811 ; 1.0 ; ; 1 # Nd MONGOLIAN DIGIT ONE
1947 ; 1.0 ; ; 1 # Nd LIMBU DIGIT ONE
19D1 ; 1.0 ; ; 1 # Nd NEW TAI LUE DIGIT ONE
19DA ; 1.0 ; ; 1 # Nd NEW TAI LUE THAM DIGIT ONE
19DA ; 1.0 ; ; 1 # No NEW TAI LUE THAM DIGIT ONE
1A81 ; 1.0 ; ; 1 # Nd TAI THAM HORA DIGIT ONE
1A91 ; 1.0 ; ; 1 # Nd TAI THAM THAM DIGIT ONE
1B51 ; 1.0 ; ; 1 # Nd BALINESE DIGIT ONE
@ -314,6 +321,8 @@ FF11 ; 1.0 ; ; 1 # Nd FULLWIDTH DIGIT ONE
10B58 ; 1.0 ; ; 1 # No INSCRIPTIONAL PARTHIAN NUMBER ONE
10B78 ; 1.0 ; ; 1 # No INSCRIPTIONAL PAHLAVI NUMBER ONE
10E60 ; 1.0 ; ; 1 # No RUMI DIGIT ONE
11052 ; 1.0 ; ; 1 # No BRAHMI NUMBER ONE
11067 ; 1.0 ; ; 1 # Nd BRAHMI DIGIT ONE
12415 ; 1.0 ; ; 1 # Nl CUNEIFORM NUMERIC SIGN ONE GESH2
1241E ; 1.0 ; ; 1 # Nl CUNEIFORM NUMERIC SIGN ONE GESHU
1242C ; 1.0 ; ; 1 # Nl CUNEIFORM NUMERIC SIGN ONE SHARU
@ -329,7 +338,7 @@ FF11 ; 1.0 ; ; 1 # Nd FULLWIDTH DIGIT ONE
1F102 ; 1.0 ; ; 1 # No DIGIT ONE COMMA
2092A ; 1.0 ; ; 1 # Lo CJK UNIFIED IDEOGRAPH-2092A
# Total code points: 91
# Total code points: 93
# ================================================
@ -413,6 +422,8 @@ FF12 ; 2.0 ; ; 2 # Nd FULLWIDTH DIGIT TWO
10B59 ; 2.0 ; ; 2 # No INSCRIPTIONAL PARTHIAN NUMBER TWO
10B79 ; 2.0 ; ; 2 # No INSCRIPTIONAL PAHLAVI NUMBER TWO
10E61 ; 2.0 ; ; 2 # No RUMI DIGIT TWO
11053 ; 2.0 ; ; 2 # No BRAHMI NUMBER TWO
11068 ; 2.0 ; ; 2 # Nd BRAHMI DIGIT TWO
12400 ; 2.0 ; ; 2 # Nl CUNEIFORM NUMERIC SIGN TWO ASH
12416 ; 2.0 ; ; 2 # Nl CUNEIFORM NUMERIC SIGN TWO GESH2
1241F ; 2.0 ; ; 2 # Nl CUNEIFORM NUMERIC SIGN TWO GESHU
@ -431,7 +442,7 @@ FF12 ; 2.0 ; ; 2 # Nd FULLWIDTH DIGIT TWO
1F103 ; 2.0 ; ; 2 # No DIGIT TWO COMMA
22390 ; 2.0 ; ; 2 # Lo CJK UNIFIED IDEOGRAPH-22390
# Total code points: 94
# Total code points: 96
# ================================================
@ -509,6 +520,8 @@ FF13 ; 3.0 ; ; 3 # Nd FULLWIDTH DIGIT THREE
10B5A ; 3.0 ; ; 3 # No INSCRIPTIONAL PARTHIAN NUMBER THREE
10B7A ; 3.0 ; ; 3 # No INSCRIPTIONAL PAHLAVI NUMBER THREE
10E62 ; 3.0 ; ; 3 # No RUMI DIGIT THREE
11054 ; 3.0 ; ; 3 # No BRAHMI NUMBER THREE
11069 ; 3.0 ; ; 3 # Nd BRAHMI DIGIT THREE
12401 ; 3.0 ; ; 3 # Nl CUNEIFORM NUMERIC SIGN THREE ASH
12408 ; 3.0 ; ; 3 # Nl CUNEIFORM NUMERIC SIGN THREE DISH
12417 ; 3.0 ; ; 3 # Nl CUNEIFORM NUMERIC SIGN THREE GESH2
@ -531,7 +544,7 @@ FF13 ; 3.0 ; ; 3 # Nd FULLWIDTH DIGIT THREE
22998 ; 3.0 ; ; 3 # Lo CJK UNIFIED IDEOGRAPH-22998
23B1B ; 3.0 ; ; 3 # Lo CJK UNIFIED IDEOGRAPH-23B1B
# Total code points: 96
# Total code points: 98
# ================================================
@ -603,6 +616,8 @@ FF14 ; 4.0 ; ; 4 # Nd FULLWIDTH DIGIT FOUR
10B5B ; 4.0 ; ; 4 # No INSCRIPTIONAL PARTHIAN NUMBER FOUR
10B7B ; 4.0 ; ; 4 # No INSCRIPTIONAL PAHLAVI NUMBER FOUR
10E63 ; 4.0 ; ; 4 # No RUMI DIGIT FOUR
11055 ; 4.0 ; ; 4 # No BRAHMI NUMBER FOUR
1106A ; 4.0 ; ; 4 # Nd BRAHMI DIGIT FOUR
12402 ; 4.0 ; ; 4 # Nl CUNEIFORM NUMERIC SIGN FOUR ASH
12409 ; 4.0 ; ; 4 # Nl CUNEIFORM NUMERIC SIGN FOUR DISH
1240F ; 4.0 ; ; 4 # Nl CUNEIFORM NUMERIC SIGN FOUR U
@ -625,7 +640,7 @@ FF14 ; 4.0 ; ; 4 # Nd FULLWIDTH DIGIT FOUR
200E2 ; 4.0 ; ; 4 # Lo CJK UNIFIED IDEOGRAPH-200E2
2626D ; 4.0 ; ; 4 # Lo CJK UNIFIED IDEOGRAPH-2626D
# Total code points: 87
# Total code points: 89
# ================================================
@ -700,6 +715,8 @@ FF15 ; 5.0 ; ; 5 # Nd FULLWIDTH DIGIT FIVE
10321 ; 5.0 ; ; 5 # No OLD ITALIC NUMERAL FIVE
104A5 ; 5.0 ; ; 5 # Nd OSMANYA DIGIT FIVE
10E64 ; 5.0 ; ; 5 # No RUMI DIGIT FIVE
11056 ; 5.0 ; ; 5 # No BRAHMI NUMBER FIVE
1106B ; 5.0 ; ; 5 # Nd BRAHMI DIGIT FIVE
12403 ; 5.0 ; ; 5 # Nl CUNEIFORM NUMERIC SIGN FIVE ASH
1240A ; 5.0 ; ; 5 # Nl CUNEIFORM NUMERIC SIGN FIVE DISH
12410 ; 5.0 ; ; 5 # Nl CUNEIFORM NUMERIC SIGN FIVE U
@ -719,7 +736,7 @@ FF15 ; 5.0 ; ; 5 # Nd FULLWIDTH DIGIT FIVE
1F106 ; 5.0 ; ; 5 # No DIGIT FIVE COMMA
20121 ; 5.0 ; ; 5 # Lo CJK UNIFIED IDEOGRAPH-20121
# Total code points: 84
# Total code points: 86
# ================================================
@ -790,6 +807,8 @@ FF16 ; 6.0 ; ; 6 # Nd FULLWIDTH DIGIT SIX
1010C ; 6.0 ; ; 6 # No AEGEAN NUMBER SIX
104A6 ; 6.0 ; ; 6 # Nd OSMANYA DIGIT SIX
10E65 ; 6.0 ; ; 6 # No RUMI DIGIT SIX
11057 ; 6.0 ; ; 6 # No BRAHMI NUMBER SIX
1106C ; 6.0 ; ; 6 # Nd BRAHMI DIGIT SIX
12404 ; 6.0 ; ; 6 # Nl CUNEIFORM NUMERIC SIGN SIX ASH
1240B ; 6.0 ; ; 6 # Nl CUNEIFORM NUMERIC SIGN SIX DISH
12411 ; 6.0 ; ; 6 # Nl CUNEIFORM NUMERIC SIGN SIX U
@ -806,7 +825,7 @@ FF16 ; 6.0 ; ; 6 # Nd FULLWIDTH DIGIT SIX
1F107 ; 6.0 ; ; 6 # No DIGIT SIX COMMA
20AEA ; 6.0 ; ; 6 # Lo CJK UNIFIED IDEOGRAPH-20AEA
# Total code points: 76
# Total code points: 78
# ================================================
@ -875,6 +894,8 @@ FF17 ; 7.0 ; ; 7 # Nd FULLWIDTH DIGIT SEVEN
1010D ; 7.0 ; ; 7 # No AEGEAN NUMBER SEVEN
104A7 ; 7.0 ; ; 7 # Nd OSMANYA DIGIT SEVEN
10E66 ; 7.0 ; ; 7 # No RUMI DIGIT SEVEN
11058 ; 7.0 ; ; 7 # No BRAHMI NUMBER SEVEN
1106D ; 7.0 ; ; 7 # Nd BRAHMI DIGIT SEVEN
12405 ; 7.0 ; ; 7 # Nl CUNEIFORM NUMERIC SIGN SEVEN ASH
1240C ; 7.0 ; ; 7 # Nl CUNEIFORM NUMERIC SIGN SEVEN DISH
12412 ; 7.0 ; ; 7 # Nl CUNEIFORM NUMERIC SIGN SEVEN U
@ -890,7 +911,7 @@ FF17 ; 7.0 ; ; 7 # Nd FULLWIDTH DIGIT SEVEN
1F108 ; 7.0 ; ; 7 # No DIGIT SEVEN COMMA
20001 ; 7.0 ; ; 7 # Lo CJK UNIFIED IDEOGRAPH-20001
# Total code points: 75
# Total code points: 77
# ================================================
@ -957,6 +978,8 @@ FF18 ; 8.0 ; ; 8 # Nd FULLWIDTH DIGIT EIGHT
1010E ; 8.0 ; ; 8 # No AEGEAN NUMBER EIGHT
104A8 ; 8.0 ; ; 8 # Nd OSMANYA DIGIT EIGHT
10E67 ; 8.0 ; ; 8 # No RUMI DIGIT EIGHT
11059 ; 8.0 ; ; 8 # No BRAHMI NUMBER EIGHT
1106E ; 8.0 ; ; 8 # Nd BRAHMI DIGIT EIGHT
12406 ; 8.0 ; ; 8 # Nl CUNEIFORM NUMERIC SIGN EIGHT ASH
1240D ; 8.0 ; ; 8 # Nl CUNEIFORM NUMERIC SIGN EIGHT DISH
12413 ; 8.0 ; ; 8 # Nl CUNEIFORM NUMERIC SIGN EIGHT U
@ -971,7 +994,7 @@ FF18 ; 8.0 ; ; 8 # Nd FULLWIDTH DIGIT EIGHT
1D7FE ; 8.0 ; ; 8 # Nd MATHEMATICAL MONOSPACE DIGIT EIGHT
1F109 ; 8.0 ; ; 8 # No DIGIT EIGHT COMMA
# Total code points: 71
# Total code points: 73
# ================================================
@ -1039,6 +1062,8 @@ FF19 ; 9.0 ; ; 9 # Nd FULLWIDTH DIGIT NINE
1010F ; 9.0 ; ; 9 # No AEGEAN NUMBER NINE
104A9 ; 9.0 ; ; 9 # Nd OSMANYA DIGIT NINE
10E68 ; 9.0 ; ; 9 # No RUMI DIGIT NINE
1105A ; 9.0 ; ; 9 # No BRAHMI NUMBER NINE
1106F ; 9.0 ; ; 9 # Nd BRAHMI DIGIT NINE
12407 ; 9.0 ; ; 9 # Nl CUNEIFORM NUMERIC SIGN NINE ASH
1240E ; 9.0 ; ; 9 # Nl CUNEIFORM NUMERIC SIGN NINE DISH
12414 ; 9.0 ; ; 9 # Nl CUNEIFORM NUMERIC SIGN NINE U
@ -1054,7 +1079,7 @@ FF19 ; 9.0 ; ; 9 # Nd FULLWIDTH DIGIT NINE
1F10A ; 9.0 ; ; 9 # No DIGIT NINE COMMA
2F890 ; 9.0 ; ; 9 # Lo CJK COMPATIBILITY IDEOGRAPH-2F890
# Total code points: 75
# Total code points: 77
# ================================================
@ -1091,9 +1116,10 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
10B5C ; 10.0 ; ; 10 # No INSCRIPTIONAL PARTHIAN NUMBER TEN
10B7C ; 10.0 ; ; 10 # No INSCRIPTIONAL PAHLAVI NUMBER TEN
10E69 ; 10.0 ; ; 10 # No RUMI NUMBER TEN
1105B ; 10.0 ; ; 10 # No BRAHMI NUMBER TEN
1D369 ; 10.0 ; ; 10 # No COUNTING ROD TENS DIGIT ONE
# Total code points: 38
# Total code points: 39
# ================================================
@ -1202,9 +1228,10 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
10B5D ; 20.0 ; ; 20 # No INSCRIPTIONAL PARTHIAN NUMBER TWENTY
10B7D ; 20.0 ; ; 20 # No INSCRIPTIONAL PAHLAVI NUMBER TWENTY
10E6A ; 20.0 ; ; 20 # No RUMI NUMBER TWENTY
1105C ; 20.0 ; ; 20 # No BRAHMI NUMBER TWENTY
1D36A ; 20.0 ; ; 20 # No COUNTING ROD TENS DIGIT TWO
# Total code points: 17
# Total code points: 18
# ================================================
@ -1269,10 +1296,11 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
10112 ; 30.0 ; ; 30 # No AEGEAN NUMBER THIRTY
10165 ; 30.0 ; ; 30 # Nl GREEK ACROPHONIC THESPIAN THIRTY
10E6B ; 30.0 ; ; 30 # No RUMI NUMBER THIRTY
1105D ; 30.0 ; ; 30 # No BRAHMI NUMBER THIRTY
1D36B ; 30.0 ; ; 30 # No COUNTING ROD TENS DIGIT THREE
20983 ; 30.0 ; ; 30 # Lo CJK UNIFIED IDEOGRAPH-20983
# Total code points: 9
# Total code points: 10
# ================================================
@ -1335,11 +1363,12 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
534C ; 40.0 ; ; 40 # Lo CJK UNIFIED IDEOGRAPH-534C
10113 ; 40.0 ; ; 40 # No AEGEAN NUMBER FORTY
10E6C ; 40.0 ; ; 40 # No RUMI NUMBER FORTY
1105E ; 40.0 ; ; 40 # No BRAHMI NUMBER FORTY
1D36C ; 40.0 ; ; 40 # No COUNTING ROD TENS DIGIT FOUR
2098C ; 40.0 ; ; 40 # Lo CJK UNIFIED IDEOGRAPH-2098C
2099C ; 40.0 ; ; 40 # Lo CJK UNIFIED IDEOGRAPH-2099C
# Total code points: 8
# Total code points: 9
# ================================================
@ -1411,36 +1440,40 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
10323 ; 50.0 ; ; 50 # No OLD ITALIC NUMERAL FIFTY
10A7E ; 50.0 ; ; 50 # No OLD SOUTH ARABIAN NUMBER FIFTY
10E6D ; 50.0 ; ; 50 # No RUMI NUMBER FIFTY
1105F ; 50.0 ; ; 50 # No BRAHMI NUMBER FIFTY
1D36D ; 50.0 ; ; 50 # No COUNTING ROD TENS DIGIT FIVE
# Total code points: 18
# Total code points: 19
# ================================================
1377 ; 60.0 ; ; 60 # No ETHIOPIC NUMBER SIXTY
10115 ; 60.0 ; ; 60 # No AEGEAN NUMBER SIXTY
10E6E ; 60.0 ; ; 60 # No RUMI NUMBER SIXTY
11060 ; 60.0 ; ; 60 # No BRAHMI NUMBER SIXTY
1D36E ; 60.0 ; ; 60 # No COUNTING ROD TENS DIGIT SIX
# Total code points: 4
# Total code points: 5
# ================================================
1378 ; 70.0 ; ; 70 # No ETHIOPIC NUMBER SEVENTY
10116 ; 70.0 ; ; 70 # No AEGEAN NUMBER SEVENTY
10E6F ; 70.0 ; ; 70 # No RUMI NUMBER SEVENTY
11061 ; 70.0 ; ; 70 # No BRAHMI NUMBER SEVENTY
1D36F ; 70.0 ; ; 70 # No COUNTING ROD TENS DIGIT SEVEN
# Total code points: 4
# Total code points: 5
# ================================================
1379 ; 80.0 ; ; 80 # No ETHIOPIC NUMBER EIGHTY
10117 ; 80.0 ; ; 80 # No AEGEAN NUMBER EIGHTY
10E70 ; 80.0 ; ; 80 # No RUMI NUMBER EIGHTY
11062 ; 80.0 ; ; 80 # No BRAHMI NUMBER EIGHTY
1D370 ; 80.0 ; ; 80 # No COUNTING ROD TENS DIGIT EIGHT
# Total code points: 4
# Total code points: 5
# ================================================
@ -1448,9 +1481,10 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
10118 ; 90.0 ; ; 90 # No AEGEAN NUMBER NINETY
10341 ; 90.0 ; ; 90 # Nl GOTHIC LETTER NINETY
10E71 ; 90.0 ; ; 90 # No RUMI NUMBER NINETY
11063 ; 90.0 ; ; 90 # No BRAHMI NUMBER NINETY
1D371 ; 90.0 ; ; 90 # No COUNTING ROD TENS DIGIT NINE
# Total code points: 5
# Total code points: 6
# ================================================
@ -1473,8 +1507,9 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
10B5E ; 100.0 ; ; 100 # No INSCRIPTIONAL PARTHIAN NUMBER ONE HUNDRED
10B7E ; 100.0 ; ; 100 # No INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED
10E72 ; 100.0 ; ; 100 # No RUMI NUMBER ONE HUNDRED
11064 ; 100.0 ; ; 100 # No BRAHMI NUMBER ONE HUNDRED
# Total code points: 19
# Total code points: 20
# ================================================
@ -1557,8 +1592,9 @@ F9FD ; 10.0 ; ; 10 # Lo CJK COMPATIBILITY IDEOGRAPH-F9FD
10A47 ; 1000.0 ; ; 1000 # No KHAROSHTHI NUMBER ONE THOUSAND
10B5F ; 1000.0 ; ; 1000 # No INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND
10B7F ; 1000.0 ; ; 1000 # No INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND
11065 ; 1000.0 ; ; 1000 # No BRAHMI NUMBER ONE THOUSAND
# Total code points: 16
# Total code points: 17
# ================================================

View file

@ -1,12 +1,12 @@
# EastAsianWidth-5.2.0.txt
# Date: 2009-06-09, 17:47:00 PDT [KW]
# EastAsianWidth-6.0.0.txt
# Date: 2010-08-17, 12:17:00 PDT [KW]
#
# East Asian Width Properties
#
# This file is an informative contributory data file in the
# Unicode Character Database.
#
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# The format is two fields separated by a semicolon.
@ -22,6 +22,7 @@
# CJK Compatibility Ideographs: U+F900..U+FAFF
# CJK Unified Ideographs Extension B: U+20000..U+2A6DF
# CJK Unified Ideographs Extension C: U+2A700..U+2B73F
# CJK Unified Ideographs Extension D: U+2B740..U+2B81F
# CJK Compatibility Ideographs Supplement: U+2F800..U+2FA1F
# and any other reserved code points on
# Planes 2 and 3: U+20000..U+2FFFD
@ -163,7 +164,7 @@
0410..044F;A
0450;N
0451;A
0452..0525;N
0452..0527;N
0531..0556;N
0559..055F;N
0561..0587;N
@ -173,18 +174,15 @@
05F0..05F4;N
0600..0603;N
0606..061B;N
061E..061F;N
0621..065E;N
0660..070D;N
061E..070D;N
070F..074A;N
074D..07B1;N
07C0..07FA;N
0800..082D;N
0830..083E;N
0900..0939;N
093C..094E;N
0950..0955;N
0958..0972;N
0840..085B;N
085E;N
0900..0977;N
0979..097F;N
0981..0983;N
0985..098C;N
@ -243,7 +241,7 @@
0B56..0B57;N
0B5C..0B5D;N
0B5F..0B63;N
0B66..0B71;N
0B66..0B77;N
0B82..0B83;N
0B85..0B8A;N
0B8E..0B90;N
@ -291,11 +289,10 @@
0D02..0D03;N
0D05..0D0C;N
0D0E..0D10;N
0D12..0D28;N
0D2A..0D39;N
0D12..0D3A;N
0D3D..0D44;N
0D46..0D48;N
0D4A..0D4D;N
0D4A..0D4E;N
0D57;N
0D60..0D63;N
0D66..0D75;N
@ -333,11 +330,10 @@
0EDC..0EDD;N
0F00..0F47;N
0F49..0F6C;N
0F71..0F8B;N
0F90..0F97;N
0F71..0F97;N
0F99..0FBC;N
0FBE..0FCC;N
0FCE..0FD8;N
0FCE..0FDA;N
1000..10C5;N
10D0..10FC;N
1100..115F;W
@ -361,7 +357,7 @@
12D8..1310;N
1312..1315;N
1318..135A;N
135F..137C;N
135D..137C;N
1380..1399;N
13A0..13F4;N
1400..169C;N
@ -400,12 +396,13 @@
1B50..1B7C;N
1B80..1BAA;N
1BAE..1BB9;N
1C00..1C37;N
1BC0..1BF3;N
1BFC..1C37;N
1C3B..1C49;N
1C4D..1C7F;N
1CD0..1CF2;N
1D00..1DE6;N
1DFD..1F15;N
1DFC..1F15;N
1F18..1F1D;N
1F20..1F45;N
1F48..1F4D;N
@ -451,12 +448,12 @@
2080;N
2081..2084;A
2085..208E;N
2090..2094;N
2090..209C;N
20A0..20A8;N
20A9;H
20AA..20AB;N
20AC;A
20AD..20B8;N
20AD..20B9;N
20D0..20F0;N
2100..2102;N
2103;A
@ -553,7 +550,7 @@
2312;A
2313..2328;N
2329..232A;W
232B..23E8;N
232B..23F3;N
2400..2426;N
2440..244A;N
2460..24E9;A
@ -619,28 +616,21 @@
26BE..26BF;A
26C0..26C3;N
26C4..26CD;A
26CE;N
26CF..26E1;A
26E2;N
26E3;A
26E4..26E7;N
26E8..26FF;A
2701..2704;N
2706..2709;N
270C..2727;N
2729..273C;N
2701..273C;N
273D;A
273E..274B;N
274D;N
274F..2752;N
2756;N
273E..2756;N
2757;A
2758..275E;N
2761..2775;N
2758..2775;N
2776..277F;A
2780..2794;N
2798..27AF;N
27B1..27BE;N
27C0..27CA;N
2780..27CA;N
27CC;N
27D0..27E5;N
27CE..27E5;N
27E6..27ED;Na
27EE..2984;N
2985..2986;Na
@ -652,8 +642,8 @@
2C60..2CF1;N
2CF9..2D25;N
2D30..2D65;N
2D6F;N
2D80..2D96;N
2D6F..2D70;N
2D7F..2D96;N
2DA0..2DA6;N
2DA8..2DAE;N
2DB0..2DB6;N
@ -674,7 +664,7 @@
3099..30FF;W
3105..312D;W
3131..318E;W
3190..31B7;W
3190..31BA;W
31C0..31E3;W
31F0..321E;W
3220..3247;W
@ -689,12 +679,13 @@
A000..A48C;W
A490..A4C6;W
A4D0..A62B;N
A640..A65F;N
A662..A673;N
A640..A673;N
A67C..A697;N
A6A0..A6F7;N
A700..A78C;N
A7FB..A82B;N
A700..A78E;N
A790..A791;N
A7A0..A7A9;N
A7FA..A82B;N
A830..A839;N
A840..A877;N
A880..A8C4;N
@ -712,6 +703,11 @@ AA50..AA59;N
AA5C..AA7B;N
AA80..AAC2;N
AADB..AADF;N
AB01..AB06;N
AB09..AB0E;N
AB11..AB16;N
AB20..AB26;N
AB28..AB2E;N
ABC0..ABED;N
ABF0..ABF9;N
AC00..D7A3;W
@ -734,7 +730,7 @@ FB38..FB3C;N
FB3E;N
FB40..FB41;N
FB43..FB44;N
FB46..FBB1;N
FB46..FBC1;N
FBD3..FD3F;N
FD50..FD8F;N
FD92..FDC7;N
@ -805,11 +801,15 @@ FFFD;A
10B78..10B7F;N
10C00..10C48;N
10E60..10E7E;N
11000..1104D;N
11052..1106F;N
11080..110C1;N
12000..1236E;N
12400..12462;N
12470..12473;N
13000..1342E;N
16800..16A38;N
1B000..1B001;W
1D000..1D0F5;N
1D100..1D126;N
1D129..1D1DD;N
@ -839,29 +839,54 @@ FFFD;A
1D7CE..1D7FF;N
1F000..1F02B;N
1F030..1F093;N
1F0A0..1F0AE;N
1F0B1..1F0BE;N
1F0C1..1F0CF;N
1F0D1..1F0DF;N
1F100..1F10A;A
1F110..1F12D;A
1F12E;N
1F131;A
1F13D;A
1F13F;A
1F142;A
1F146;A
1F14A..1F14E;A
1F157;A
1F15F;A
1F179;A
1F17B..1F17C;A
1F17F;A
1F18A..1F18D;A
1F190;A
1F200;W
1F210..1F231;W
1F130..1F169;A
1F170..1F19A;A
1F1E6..1F1FF;N
1F200..1F202;W
1F210..1F23A;W
1F240..1F248;W
1F250..1F251;W
1F300..1F320;N
1F330..1F335;N
1F337..1F37C;N
1F380..1F393;N
1F3A0..1F3C4;N
1F3C6..1F3CA;N
1F3E0..1F3F0;N
1F400..1F43E;N
1F440;N
1F442..1F4F7;N
1F4F9..1F4FC;N
1F500..1F53D;N
1F550..1F567;N
1F5FB..1F5FF;N
1F601..1F610;N
1F612..1F614;N
1F616;N
1F618;N
1F61A;N
1F61C..1F61E;N
1F620..1F625;N
1F628..1F62B;N
1F62D;N
1F630..1F633;N
1F635..1F640;N
1F645..1F64F;N
1F680..1F6C5;N
1F700..1F773;N
20000..2A6D6;W
2A6D7..2A6FF;W
2A700..2B734;W
2B735..2F7FF;W
2B735..2B73F;W
2B740..2B81D;W
2B81E..2F7FF;W
2F800..2FA1D;W
2FA1E..2FFFD;W
30000..3FFFD;W

View file

@ -1,6 +1,9 @@
# Fractional UCA Table, generated from standard UCA
# 2009-10-24, 00:18:26 GMT [MD]
# VERSION: UCA=5.2.0, UCD=5.2.0
# Markus 2010-08-18: This is really the 5.2 version, but for testing with
# UCD 6.0 I need the version numbers here to be 6.0 as well.
# VERSION: UCA=6.0.0, UCD=6.0.0
# Generated processed version, as described in ICU design document.
# NOTES
@ -17,7 +20,7 @@
# - Differs from previous version in that MAX value was introduced at 1F.
# All tertiary values are shifted down by 1, filling the gap at 7!
[UCA version = 5.2.0]
[UCA version = 6.0.0]
0000; [,,]
0001; [,,]

View file

@ -1,8 +1,8 @@
# GraphemeBreakProperty-5.2.0.txt
# Date: 2009-06-09, 21:40:09 GMT [MD]
# GraphemeBreakProperty-6.0.0.txt
# Date: 2010-09-01, 18:48:17 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -65,10 +65,9 @@ E0020..E007F ; Control
05C4..05C5 ; Extend
05C7 ; Extend
0610..061A ; Extend
064B..065E ; Extend
064B..065F ; Extend
0670 ; Extend
06D6..06DC ; Extend
06DE ; Extend
06DF..06E4 ; Extend
06E7..06E8 ; Extend
06EA..06ED ; Extend
@ -80,11 +79,13 @@ E0020..E007F ; Control
081B..0823 ; Extend
0825..0827 ; Extend
0829..082D ; Extend
0859..085B ; Extend
0900..0902 ; Extend
093A ; Extend
093C ; Extend
0941..0948 ; Extend
094D ; Extend
0951..0955 ; Extend
0951..0957 ; Extend
0962..0963 ; Extend
0981 ; Extend
09BC ; Extend
@ -143,15 +144,10 @@ E0020..E007F ; Control
0DD2..0DD4 ; Extend
0DD6 ; Extend
0DDF ; Extend
0E30 ; Extend
0E31 ; Extend
0E32..0E33 ; Extend
0E34..0E3A ; Extend
0E45 ; Extend
0E47..0E4E ; Extend
0EB0 ; Extend
0EB1 ; Extend
0EB2..0EB3 ; Extend
0EB4..0EB9 ; Extend
0EBB..0EBC ; Extend
0EC8..0ECD ; Extend
@ -162,7 +158,7 @@ E0020..E007F ; Control
0F71..0F7E ; Extend
0F80..0F84 ; Extend
0F86..0F87 ; Extend
0F90..0F97 ; Extend
0F8D..0F97 ; Extend
0F99..0FBC ; Extend
0FC6 ; Extend
102D..1030 ; Extend
@ -176,7 +172,7 @@ E0020..E007F ; Control
1085..1086 ; Extend
108D ; Extend
109D ; Extend
135F ; Extend
135D..135F ; Extend
1712..1714 ; Extend
1732..1734 ; Extend
1752..1753 ; Extend
@ -208,6 +204,10 @@ E0020..E007F ; Control
1B80..1B81 ; Extend
1BA2..1BA5 ; Extend
1BA8..1BA9 ; Extend
1BE6 ; Extend
1BE8..1BE9 ; Extend
1BED ; Extend
1BEF..1BF1 ; Extend
1C2C..1C33 ; Extend
1C36..1C37 ; Extend
1CD0..1CD2 ; Extend
@ -215,7 +215,7 @@ E0020..E007F ; Control
1CE2..1CE8 ; Extend
1CED ; Extend
1DC0..1DE6 ; Extend
1DFD..1DFF ; Extend
1DFC..1DFF ; Extend
200C..200D ; Extend
20D0..20DC ; Extend
20DD..20E0 ; Extend
@ -223,6 +223,7 @@ E0020..E007F ; Control
20E2..20E4 ; Extend
20E5..20F0 ; Extend
2CEF..2CF1 ; Extend
2D7F ; Extend
2DE0..2DFF ; Extend
302A..302F ; Extend
3099..309A ; Extend
@ -265,6 +266,8 @@ FF9E..FF9F ; Extend
10A0C..10A0F ; Extend
10A38..10A3A ; Extend
10A3F ; Extend
11001 ; Extend
11038..11046 ; Extend
11080..11081 ; Extend
110B3..110B6 ; Extend
110B9..110BA ; Extend
@ -277,7 +280,7 @@ FF9E..FF9F ; Extend
1D242..1D244 ; Extend
E0100..E01EF ; Extend
# Total code points: 1205
# Total code points: 1234
# ================================================
@ -292,9 +295,10 @@ AABB..AABC ; Prepend
# ================================================
0903 ; SpacingMark
093B ; SpacingMark
093E..0940 ; SpacingMark
0949..094C ; SpacingMark
094E ; SpacingMark
094E..094F ; SpacingMark
0982..0983 ; SpacingMark
09BF..09C0 ; SpacingMark
09C7..09C8 ; SpacingMark
@ -329,6 +333,11 @@ AABB..AABC ; Prepend
0DD0..0DD1 ; SpacingMark
0DD8..0DDE ; SpacingMark
0DF2..0DF3 ; SpacingMark
0E30 ; SpacingMark
0E32..0E33 ; SpacingMark
0E45 ; SpacingMark
0EB0 ; SpacingMark
0EB2..0EB3 ; SpacingMark
0F3E..0F3F ; SpacingMark
0F7F ; SpacingMark
102B..102C ; SpacingMark
@ -366,6 +375,10 @@ AABB..AABC ; Prepend
1BA1 ; SpacingMark
1BA6..1BA7 ; SpacingMark
1BAA ; SpacingMark
1BE7 ; SpacingMark
1BEA..1BEC ; SpacingMark
1BEE ; SpacingMark
1BF2..1BF3 ; SpacingMark
1C24..1C2B ; SpacingMark
1C34..1C35 ; SpacingMark
1CE1 ; SpacingMark
@ -387,13 +400,15 @@ ABE3..ABE4 ; SpacingMark
ABE6..ABE7 ; SpacingMark
ABE9..ABEA ; SpacingMark
ABEC ; SpacingMark
11000 ; SpacingMark
11002 ; SpacingMark
11082 ; SpacingMark
110B0..110B2 ; SpacingMark
110B7..110B8 ; SpacingMark
1D166 ; SpacingMark
1D16D ; SpacingMark
# Total code points: 257
# Total code points: 275
# ================================================

View file

@ -1,5 +1,5 @@
# LineBreak-5.2.0.txt
# Date: 2009-08-17, 12:21:00 PDT [KW]
# LineBreak-6.0.0.txt
# Date: 2010-08-18, 17:25:00 PDT [KW]
#
# Line Break Properties
#
@ -7,7 +7,7 @@
# Unicode Character Database.
# It contains both normative and informative data.
#
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# The format is two fields separated by a semicolon.
@ -29,6 +29,7 @@
# CJK Compatibility Ideographs: U+F900..U+FAFF
# CJK Unified Ideographs Extension B: U+20000..U+2A6DF
# CJK Unified Ideographs Extension C: U+2A700..U+2B73F
# CJK Unified Ideographs Extension D: U+2B740..U+2B81F
# CJK Compatibility Ideographs Supplement: U+2F800..U+2FA1F
# and any other reserved code points on
# Planes 2 and 3: U+20000..U+2FFFD
@ -133,7 +134,7 @@
038E..03A1;AL
03A3..0482;AL
0483..0489;CM
048A..0525;AL
048A..0527;AL
0531..0556;AL
0559..055F;AL
0561..0587;AL
@ -158,8 +159,8 @@
0610..061A;CM
061B;EX
061E..061F;EX
0621..064A;AL
064B..065E;CM
0620..064A;AL
064B..065F;CM
0660..0669;NU
066A;PO
066B..066C;NU
@ -169,8 +170,8 @@
06D4;EX
06D5;AL
06D6..06DC;CM
06DD;AL
06DE..06E4;CM
06DD..06DE;AL
06DF..06E4;CM
06E5..06E6;AL
06E7..06E8;CM
06E9;AL
@ -201,18 +202,21 @@
0828;AL
0829..082D;CM
0830..083E;AL
0840..0858;AL
0859..085B;CM
085E;AL
0900..0903;CM
0904..0939;AL
093C;CM
093A..093C;CM
093D;AL
093E..094E;CM
093E..094F;CM
0950;AL
0951..0955;CM
0951..0957;CM
0958..0961;AL
0962..0963;CM
0964..0965;BA
0966..096F;NU
0970..0972;AL
0970..0977;AL
0979..097F;AL
0981..0983;CM
0985..098C;AL
@ -291,7 +295,7 @@
0B5F..0B61;AL
0B62..0B63;CM
0B66..0B6F;NU
0B70..0B71;AL
0B70..0B77;AL
0B82;CM
0B83;AL
0B85..0B8A;AL
@ -348,12 +352,12 @@
0D02..0D03;CM
0D05..0D0C;AL
0D0E..0D10;AL
0D12..0D28;AL
0D2A..0D39;AL
0D12..0D3A;AL
0D3D;AL
0D3E..0D44;CM
0D46..0D48;CM
0D4A..0D4D;CM
0D4E;AL
0D57;CM
0D60..0D61;AL
0D62..0D63;CM
@ -432,8 +436,8 @@
0F80..0F84;CM
0F85;BA
0F86..0F87;CM
0F88..0F8B;AL
0F90..0F97;CM
0F88..0F8C;AL
0F8D..0F97;CM
0F99..0FBC;CM
0FBE..0FBF;BA
0FC0..0FC5;AL
@ -444,6 +448,7 @@
0FD2;BA
0FD3;BB
0FD4..0FD8;AL
0FD9..0FDA;GL
1000..103F;SA
1040..1049;NU
104A..104B;BA
@ -472,7 +477,7 @@
12D8..1310;AL
1312..1315;AL
1318..135A;AL
135F;CM
135D..135F;CM
1360;AL
1361;BA
1362..137C;AL
@ -534,7 +539,8 @@
1970..1974;SA
1980..19AB;SA
19B0..19C9;SA
19D0..19DA;NU
19D0..19D9;NU
19DA;SA
19DE..19DF;SA
19E0..1A16;AL
1A17..1A1B;CM
@ -561,7 +567,9 @@
1BA1..1BAA;CM
1BAE..1BAF;AL
1BB0..1BB9;NU
1C00..1C23;AL
1BC0..1BE5;AL
1BE6..1BF3;CM
1BFC..1C23;AL
1C24..1C37;CM
1C3B..1C3F;BA
1C40..1C49;NU
@ -578,7 +586,7 @@
1CF2;CM
1D00..1DBF;AL
1DC0..1DE6;CM
1DFD..1DFF;CM
1DFC..1DFF;CM
1E00..1F15;AL
1F18..1F1D;AL
1F20..1F45;AL
@ -650,12 +658,12 @@
2085..208C;AL
208D;OP
208E;CL
2090..2094;AL
2090..209C;AL
20A0..20A6;PR
20A7;PO
20A8..20B5;PR
20B6;PO
20B7..20B8;PR
20B7..20B9;PR
20D0..20F0;CM
2100..2102;AL
2103;PO
@ -750,7 +758,7 @@
2313..2328;AL
2329;OP
232A;CL
232B..23E8;AL
232B..23F3;AL
2400..2426;AL
2440..244A;AL
2460..24FE;AI
@ -816,20 +824,17 @@
26BE..26BF;AI
26C0..26C3;AL
26C4..26CD;AI
26CE;AL
26CF..26E1;AI
26E2;AL
26E3;AI
26E4..26E7;AL
26E8..26FF;AI
2701..2704;AL
2706..2709;AL
270C..2727;AL
2729..274B;AL
274D;AL
274F..2752;AL
2756;AL
2701..2756;AL
2757;AI
2758..275A;AL
275B..275E;QU
2761;AL
275F..2761;AL
2762..2763;EX
2764..2767;AL
2768;OP
@ -847,15 +852,12 @@
2774;OP
2775;CL
2776..2793;AI
2794;AL
2798..27AF;AL
27B1..27BE;AL
27C0..27C4;AL
2794..27C4;AL
27C5;OP
27C6;CL
27C7..27CA;AL
27CC;AL
27D0..27E5;AL
27CE..27E5;AL
27E6;OP
27E7;CL
27E8;OP
@ -912,6 +914,8 @@
2D00..2D25;AL
2D30..2D65;AL
2D6F;AL
2D70;BA
2D7F;CM
2D80..2D96;AL
2DA0..2DA6;AL
2DA8..2DAE;AL
@ -1030,7 +1034,7 @@
30FF;ID
3105..312D;ID
3131..318E;ID
3190..31B7;ID
3190..31BA;ID
31C0..31E3;ID
31F0..31FF;NS
3200..321E;ID
@ -1056,8 +1060,7 @@ A60F;BA
A610..A61F;AL
A620..A629;NU
A62A..A62B;AL
A640..A65F;AL
A662..A66E;AL
A640..A66E;AL
A66F..A672;CM
A673;AL
A67C..A67D;CM
@ -1066,8 +1069,10 @@ A6A0..A6EF;AL
A6F0..A6F1;CM
A6F2;AL
A6F3..A6F7;BA
A700..A78C;AL
A7FB..A801;AL
A700..A78E;AL
A790..A791;AL
A7A0..A7A9;AL
A7FA..A801;AL
A802;CM
A803..A805;AL
A806;CM
@ -1118,6 +1123,11 @@ AA5D..AA5F;BA
AA60..AA7B;SA
AA80..AAC2;SA
AADB..AADF;SA
AB01..AB06;AL
AB09..AB0E;AL
AB11..AB16;AL
AB20..AB26;AL
AB28..AB2E;AL
ABC0..ABE2;AL
ABE3..ABEA;CM
ABEB;BA
@ -1942,7 +1952,7 @@ FB38..FB3C;AL
FB3E;AL
FB40..FB41;AL
FB43..FB44;AL
FB46..FBB1;AL
FB46..FBC1;AL
FBD3..FD3D;AL
FD3E;OP
FD3F;CL
@ -2105,6 +2115,13 @@ FFFD;AI
10B78..10B7F;AL
10C00..10C48;AL
10E60..10E7E;AL
11000..11002;CM
11003..11037;AL
11038..11046;CM
11047..11048;BA
11049..1104D;AL
11052..11065;AL
11066..1106F;NU
11080..11082;CM
11083..110AF;AL
110B0..110BA;CM
@ -2127,6 +2144,8 @@ FFFD;AI
13379;OP
1337A..1337B;CL
1337C..1342E;AL
16800..16A38;AL
1B000..1B001;ID
1D000..1D0F5;AL
1D100..1D126;AL
1D129..1D164;AL
@ -2166,29 +2185,54 @@ FFFD;AI
1D7CE..1D7FF;NU
1F000..1F02B;AL
1F030..1F093;AL
1F0A0..1F0AE;AL
1F0B1..1F0BE;AL
1F0C1..1F0CF;AL
1F0D1..1F0DF;AL
1F100..1F10A;AI
1F110..1F12D;AI
1F12E;AL
1F131;AI
1F13D;AI
1F13F;AI
1F142;AI
1F146;AI
1F14A..1F14E;AI
1F157;AI
1F15F;AI
1F179;AI
1F17B..1F17C;AI
1F17F;AI
1F18A..1F18D;AI
1F190;AI
1F200;ID
1F210..1F231;ID
1F130..1F169;AI
1F170..1F19A;AI
1F1E6..1F1FF;AL
1F200..1F202;ID
1F210..1F23A;ID
1F240..1F248;ID
1F250..1F251;ID
1F300..1F320;AL
1F330..1F335;AL
1F337..1F37C;AL
1F380..1F393;AL
1F3A0..1F3C4;AL
1F3C6..1F3CA;AL
1F3E0..1F3F0;AL
1F400..1F43E;AL
1F440;AL
1F442..1F4F7;AL
1F4F9..1F4FC;AL
1F500..1F53D;AL
1F550..1F567;AL
1F5FB..1F5FF;AL
1F601..1F610;AL
1F612..1F614;AL
1F616;AL
1F618;AL
1F61A;AL
1F61C..1F61E;AL
1F620..1F625;AL
1F628..1F62B;AL
1F62D;AL
1F630..1F633;AL
1F635..1F640;AL
1F645..1F64F;AL
1F680..1F6C5;AL
1F700..1F773;AL
20000..2A6D6;ID
2A6D7..2A6FF;ID
2A700..2B734;ID
2B735..2F7FF;ID
2B735..2B73F;ID
2B740..2B81D;ID
2B81E..2F7FF;ID
2F800..2FA1D;ID
2FA1E..2FFFD;ID
30000..3FFFD;ID

View file

@ -1,10 +1,10 @@
# NameAliases-5.2.0.txt
# Date: 2009-05-22, 13:05:00 PDT [KW]
# NameAliases-6.0.0.txt
# Date: 2010-05-10, 11:58:00 PDT [KW]
#
# This file is a normative contributory data file in the
# Unicode Character Database.
#
# Copyright (c) 2005-2009 Unicode, Inc.
# Copyright (c) 2005-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# This file defines the formal name aliases for Unicode characters.

View file

@ -1,10 +1,10 @@
# NormalizationCorrections-5.2.0.txt
# Date: 2009-05-22, 13:54:00 PDT [KW]
# NormalizationCorrections-6.0.0.txt
# Date: 2010-05-19, 11:21:00 PDT [KW]
#
# This file is a normative contributory data file in the
# Unicode Character Database.
#
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# The normalization stabilization policy of the Unicode

View file

@ -1,8 +1,8 @@
# NormalizationTest-5.2.0.txt
# Date: 2009-08-22, 04:58:39 GMT [MD]
# NormalizationTest-6.0.0.txt
# Date: 2010-05-18, 00:49:30 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -1196,6 +1196,14 @@
2092;2092;2092;006F;006F;
2093;2093;2093;0078;0078;
2094;2094;2094;0259;0259;
2095;2095;2095;0068;0068;
2096;2096;2096;006B;006B;
2097;2097;2097;006C;006C;
2098;2098;2098;006D;006D;
2099;2099;2099;006E;006E;
209A;209A;209A;0070;0070;
209B;209B;209B;0073;0073;
209C;209C;209C;0074;0074;
20A8;20A8;20A8;0052 0073;0052 0073;
2100;2100;2100;0061 002F 0063;0061 002F 0063;
2101;2101;2101;0061 002F 0073;0061 002F 0073;
@ -16155,18 +16163,42 @@ FFEE;FFEE;FFEE;25CB;25CB;
1F12C;1F12C;1F12C;0052;0052;
1F12D;1F12D;1F12D;0043 0044;0043 0044;
1F12E;1F12E;1F12E;0057 005A;0057 005A;
1F130;1F130;1F130;0041;0041;
1F131;1F131;1F131;0042;0042;
1F132;1F132;1F132;0043;0043;
1F133;1F133;1F133;0044;0044;
1F134;1F134;1F134;0045;0045;
1F135;1F135;1F135;0046;0046;
1F136;1F136;1F136;0047;0047;
1F137;1F137;1F137;0048;0048;
1F138;1F138;1F138;0049;0049;
1F139;1F139;1F139;004A;004A;
1F13A;1F13A;1F13A;004B;004B;
1F13B;1F13B;1F13B;004C;004C;
1F13C;1F13C;1F13C;004D;004D;
1F13D;1F13D;1F13D;004E;004E;
1F13E;1F13E;1F13E;004F;004F;
1F13F;1F13F;1F13F;0050;0050;
1F140;1F140;1F140;0051;0051;
1F141;1F141;1F141;0052;0052;
1F142;1F142;1F142;0053;0053;
1F143;1F143;1F143;0054;0054;
1F144;1F144;1F144;0055;0055;
1F145;1F145;1F145;0056;0056;
1F146;1F146;1F146;0057;0057;
1F147;1F147;1F147;0058;0058;
1F148;1F148;1F148;0059;0059;
1F149;1F149;1F149;005A;005A;
1F14A;1F14A;1F14A;0048 0056;0048 0056;
1F14B;1F14B;1F14B;004D 0056;004D 0056;
1F14C;1F14C;1F14C;0053 0044;0053 0044;
1F14D;1F14D;1F14D;0053 0053;0053 0053;
1F14E;1F14E;1F14E;0050 0050 0056;0050 0050 0056;
1F14F;1F14F;1F14F;0057 0043;0057 0043;
1F190;1F190;1F190;0044 004A;0044 004A;
1F200;1F200;1F200;307B 304B;307B 304B;
1F201;1F201;1F201;30B3 30B3;30B3 30B3;
1F202;1F202;1F202;30B5;30B5;
1F210;1F210;1F210;624B;624B;
1F211;1F211;1F211;5B57;5B57;
1F212;1F212;1F212;53CC;53CC;
@ -16201,6 +16233,15 @@ FFEE;FFEE;FFEE;25CB;25CB;
1F22F;1F22F;1F22F;6307;6307;
1F230;1F230;1F230;8D70;8D70;
1F231;1F231;1F231;6253;6253;
1F232;1F232;1F232;7981;7981;
1F233;1F233;1F233;7A7A;7A7A;
1F234;1F234;1F234;5408;5408;
1F235;1F235;1F235;6E80;6E80;
1F236;1F236;1F236;6709;6709;
1F237;1F237;1F237;6708;6708;
1F238;1F238;1F238;7533;7533;
1F239;1F239;1F239;5272;5272;
1F23A;1F23A;1F23A;55B6;55B6;
1F240;1F240;1F240;3014 672C 3015;3014 672C 3015;
1F241;1F241;1F241;3014 4E09 3015;3014 4E09 3015;
1F242;1F242;1F242;3014 4E8C 3015;3014 4E8C 3015;
@ -16210,6 +16251,8 @@ FFEE;FFEE;FFEE;25CB;25CB;
1F246;1F246;1F246;3014 76D7 3015;3014 76D7 3015;
1F247;1F247;1F247;3014 52DD 3015;3014 52DD 3015;
1F248;1F248;1F248;3014 6557 3015;3014 6557 3015;
1F250;1F250;1F250;5F97;5F97;
1F251;1F251;1F251;53EF;53EF;
2F800;4E3D;4E3D;4E3D;4E3D;
2F801;4E38;4E38;4E38;4E38;
2F802;4E41;4E41;4E41;4E41;
@ -17151,6 +17194,8 @@ FFEE;FFEE;FFEE;25CB;25CB;
0061 065D 0315 0300 05AE 0062;0061 05AE 065D 0300 0315 0062;0061 05AE 065D 0300 0315 0062;0061 05AE 065D 0300 0315 0062;0061 05AE 065D 0300 0315 0062;
0061 0315 0300 05AE 065E 0062;00E0 05AE 065E 0315 0062;0061 05AE 0300 065E 0315 0062;00E0 05AE 065E 0315 0062;0061 05AE 0300 065E 0315 0062;
0061 065E 0315 0300 05AE 0062;0061 05AE 065E 0300 0315 0062;0061 05AE 065E 0300 0315 0062;0061 05AE 065E 0300 0315 0062;0061 05AE 065E 0300 0315 0062;
0061 059A 0316 302A 065F 0062;0061 302A 0316 065F 059A 0062;0061 302A 0316 065F 059A 0062;0061 302A 0316 065F 059A 0062;0061 302A 0316 065F 059A 0062;
0061 065F 059A 0316 302A 0062;0061 302A 065F 0316 059A 0062;0061 302A 065F 0316 059A 0062;0061 302A 065F 0316 059A 0062;0061 302A 065F 0316 059A 0062;
0061 0711 0670 0652 0670 0062;0061 0652 0670 0670 0711 0062;0061 0652 0670 0670 0711 0062;0061 0652 0670 0670 0711 0062;0061 0652 0670 0670 0711 0062;
0061 0670 0711 0670 0652 0062;0061 0652 0670 0670 0711 0062;0061 0652 0670 0670 0711 0062;0061 0652 0670 0670 0711 0062;0061 0652 0670 0670 0711 0062;
0061 0315 0300 05AE 06D6 0062;00E0 05AE 06D6 0315 0062;0061 05AE 0300 06D6 0315 0062;00E0 05AE 06D6 0315 0062;0061 05AE 0300 06D6 0315 0062;
@ -17307,6 +17352,12 @@ FFEE;FFEE;FFEE;25CB;25CB;
0061 082C 0315 0300 05AE 0062;0061 05AE 082C 0300 0315 0062;0061 05AE 082C 0300 0315 0062;0061 05AE 082C 0300 0315 0062;0061 05AE 082C 0300 0315 0062;
0061 0315 0300 05AE 082D 0062;00E0 05AE 082D 0315 0062;0061 05AE 0300 082D 0315 0062;00E0 05AE 082D 0315 0062;0061 05AE 0300 082D 0315 0062;
0061 082D 0315 0300 05AE 0062;0061 05AE 082D 0300 0315 0062;0061 05AE 082D 0300 0315 0062;0061 05AE 082D 0300 0315 0062;0061 05AE 082D 0300 0315 0062;
0061 059A 0316 302A 0859 0062;0061 302A 0316 0859 059A 0062;0061 302A 0316 0859 059A 0062;0061 302A 0316 0859 059A 0062;0061 302A 0316 0859 059A 0062;
0061 0859 059A 0316 302A 0062;0061 302A 0859 0316 059A 0062;0061 302A 0859 0316 059A 0062;0061 302A 0859 0316 059A 0062;0061 302A 0859 0316 059A 0062;
0061 059A 0316 302A 085A 0062;0061 302A 0316 085A 059A 0062;0061 302A 0316 085A 059A 0062;0061 302A 0316 085A 059A 0062;0061 302A 0316 085A 059A 0062;
0061 085A 059A 0316 302A 0062;0061 302A 085A 0316 059A 0062;0061 302A 085A 0316 059A 0062;0061 302A 085A 0316 059A 0062;0061 302A 085A 0316 059A 0062;
0061 059A 0316 302A 085B 0062;0061 302A 0316 085B 059A 0062;0061 302A 0316 085B 059A 0062;0061 302A 0316 085B 059A 0062;0061 302A 0316 085B 059A 0062;
0061 085B 059A 0316 302A 0062;0061 302A 085B 0316 059A 0062;0061 302A 085B 0316 059A 0062;0061 302A 085B 0316 059A 0062;0061 302A 085B 0316 059A 0062;
0061 3099 093C 0334 093C 0062;0061 0334 093C 093C 3099 0062;0061 0334 093C 093C 3099 0062;0061 0334 093C 093C 3099 0062;0061 0334 093C 093C 3099 0062;
0061 093C 3099 093C 0334 0062;0061 0334 093C 093C 3099 0062;0061 0334 093C 093C 3099 0062;0061 0334 093C 093C 3099 0062;0061 0334 093C 093C 3099 0062;
0061 05B0 094D 3099 094D 0062;0061 3099 094D 094D 05B0 0062;0061 3099 094D 094D 05B0 0062;0061 3099 094D 094D 05B0 0062;0061 3099 094D 094D 05B0 0062;
@ -17423,6 +17474,10 @@ FFEE;FFEE;FFEE;25CB;25CB;
0061 103A 05B0 094D 3099 0062;0061 3099 103A 094D 05B0 0062;0061 3099 103A 094D 05B0 0062;0061 3099 103A 094D 05B0 0062;0061 3099 103A 094D 05B0 0062;
0061 059A 0316 302A 108D 0062;0061 302A 0316 108D 059A 0062;0061 302A 0316 108D 059A 0062;0061 302A 0316 108D 059A 0062;0061 302A 0316 108D 059A 0062;
0061 108D 059A 0316 302A 0062;0061 302A 108D 0316 059A 0062;0061 302A 108D 0316 059A 0062;0061 302A 108D 0316 059A 0062;0061 302A 108D 0316 059A 0062;
0061 0315 0300 05AE 135D 0062;00E0 05AE 135D 0315 0062;0061 05AE 0300 135D 0315 0062;00E0 05AE 135D 0315 0062;0061 05AE 0300 135D 0315 0062;
0061 135D 0315 0300 05AE 0062;0061 05AE 135D 0300 0315 0062;0061 05AE 135D 0300 0315 0062;0061 05AE 135D 0300 0315 0062;0061 05AE 135D 0300 0315 0062;
0061 0315 0300 05AE 135E 0062;00E0 05AE 135E 0315 0062;0061 05AE 0300 135E 0315 0062;00E0 05AE 135E 0315 0062;0061 05AE 0300 135E 0315 0062;
0061 135E 0315 0300 05AE 0062;0061 05AE 135E 0300 0315 0062;0061 05AE 135E 0300 0315 0062;0061 05AE 135E 0300 0315 0062;0061 05AE 135E 0300 0315 0062;
0061 0315 0300 05AE 135F 0062;00E0 05AE 135F 0315 0062;0061 05AE 0300 135F 0315 0062;00E0 05AE 135F 0315 0062;0061 05AE 0300 135F 0315 0062;
0061 135F 0315 0300 05AE 0062;0061 05AE 135F 0300 0315 0062;0061 05AE 135F 0300 0315 0062;0061 05AE 135F 0300 0315 0062;0061 05AE 135F 0300 0315 0062;
0061 05B0 094D 3099 1714 0062;0061 3099 094D 1714 05B0 0062;0061 3099 094D 1714 05B0 0062;0061 3099 094D 1714 05B0 0062;0061 3099 094D 1714 05B0 0062;
@ -17489,6 +17544,12 @@ FFEE;FFEE;FFEE;25CB;25CB;
0061 1B73 0315 0300 05AE 0062;0061 05AE 1B73 0300 0315 0062;0061 05AE 1B73 0300 0315 0062;0061 05AE 1B73 0300 0315 0062;0061 05AE 1B73 0300 0315 0062;
0061 05B0 094D 3099 1BAA 0062;0061 3099 094D 1BAA 05B0 0062;0061 3099 094D 1BAA 05B0 0062;0061 3099 094D 1BAA 05B0 0062;0061 3099 094D 1BAA 05B0 0062;
0061 1BAA 05B0 094D 3099 0062;0061 3099 1BAA 094D 05B0 0062;0061 3099 1BAA 094D 05B0 0062;0061 3099 1BAA 094D 05B0 0062;0061 3099 1BAA 094D 05B0 0062;
0061 3099 093C 0334 1BE6 0062;0061 0334 093C 1BE6 3099 0062;0061 0334 093C 1BE6 3099 0062;0061 0334 093C 1BE6 3099 0062;0061 0334 093C 1BE6 3099 0062;
0061 1BE6 3099 093C 0334 0062;0061 0334 1BE6 093C 3099 0062;0061 0334 1BE6 093C 3099 0062;0061 0334 1BE6 093C 3099 0062;0061 0334 1BE6 093C 3099 0062;
0061 05B0 094D 3099 1BF2 0062;0061 3099 094D 1BF2 05B0 0062;0061 3099 094D 1BF2 05B0 0062;0061 3099 094D 1BF2 05B0 0062;0061 3099 094D 1BF2 05B0 0062;
0061 1BF2 05B0 094D 3099 0062;0061 3099 1BF2 094D 05B0 0062;0061 3099 1BF2 094D 05B0 0062;0061 3099 1BF2 094D 05B0 0062;0061 3099 1BF2 094D 05B0 0062;
0061 05B0 094D 3099 1BF3 0062;0061 3099 094D 1BF3 05B0 0062;0061 3099 094D 1BF3 05B0 0062;0061 3099 094D 1BF3 05B0 0062;0061 3099 094D 1BF3 05B0 0062;
0061 1BF3 05B0 094D 3099 0062;0061 3099 1BF3 094D 05B0 0062;0061 3099 1BF3 094D 05B0 0062;0061 3099 1BF3 094D 05B0 0062;0061 3099 1BF3 094D 05B0 0062;
0061 3099 093C 0334 1C37 0062;0061 0334 093C 1C37 3099 0062;0061 0334 093C 1C37 3099 0062;0061 0334 093C 1C37 3099 0062;0061 0334 093C 1C37 3099 0062;
0061 1C37 3099 093C 0334 0062;0061 0334 1C37 093C 3099 0062;0061 0334 1C37 093C 3099 0062;0061 0334 1C37 093C 3099 0062;0061 0334 1C37 093C 3099 0062;
0061 0315 0300 05AE 1CD0 0062;00E0 05AE 1CD0 0315 0062;0061 05AE 0300 1CD0 0315 0062;00E0 05AE 1CD0 0315 0062;0061 05AE 0300 1CD0 0315 0062;
@ -17617,6 +17678,8 @@ FFEE;FFEE;FFEE;25CB;25CB;
0061 1DE5 0315 0300 05AE 0062;0061 05AE 1DE5 0300 0315 0062;0061 05AE 1DE5 0300 0315 0062;0061 05AE 1DE5 0300 0315 0062;0061 05AE 1DE5 0300 0315 0062;
0061 0315 0300 05AE 1DE6 0062;00E0 05AE 1DE6 0315 0062;0061 05AE 0300 1DE6 0315 0062;00E0 05AE 1DE6 0315 0062;0061 05AE 0300 1DE6 0315 0062;
0061 1DE6 0315 0300 05AE 0062;0061 05AE 1DE6 0300 0315 0062;0061 05AE 1DE6 0300 0315 0062;0061 05AE 1DE6 0300 0315 0062;0061 05AE 1DE6 0300 0315 0062;
0061 035D 035C 0315 1DFC 0062;0061 0315 035C 1DFC 035D 0062;0061 0315 035C 1DFC 035D 0062;0061 0315 035C 1DFC 035D 0062;0061 0315 035C 1DFC 035D 0062;
0061 1DFC 035D 035C 0315 0062;0061 0315 1DFC 035C 035D 0062;0061 0315 1DFC 035C 035D 0062;0061 0315 1DFC 035C 035D 0062;0061 0315 1DFC 035C 035D 0062;
0061 059A 0316 302A 1DFD 0062;0061 302A 0316 1DFD 059A 0062;0061 302A 0316 1DFD 059A 0062;0061 302A 0316 1DFD 059A 0062;0061 302A 0316 1DFD 059A 0062;
0061 1DFD 059A 0316 302A 0062;0061 302A 1DFD 0316 059A 0062;0061 302A 1DFD 0316 059A 0062;0061 302A 1DFD 0316 059A 0062;0061 302A 1DFD 0316 059A 0062;
0061 0315 0300 05AE 1DFE 0062;00E0 05AE 1DFE 0315 0062;0061 05AE 0300 1DFE 0315 0062;00E0 05AE 1DFE 0315 0062;0061 05AE 0300 1DFE 0315 0062;
@ -17681,6 +17744,8 @@ FFEE;FFEE;FFEE;25CB;25CB;
0061 2CF0 0315 0300 05AE 0062;0061 05AE 2CF0 0300 0315 0062;0061 05AE 2CF0 0300 0315 0062;0061 05AE 2CF0 0300 0315 0062;0061 05AE 2CF0 0300 0315 0062;
0061 0315 0300 05AE 2CF1 0062;00E0 05AE 2CF1 0315 0062;0061 05AE 0300 2CF1 0315 0062;00E0 05AE 2CF1 0315 0062;0061 05AE 0300 2CF1 0315 0062;
0061 2CF1 0315 0300 05AE 0062;0061 05AE 2CF1 0300 0315 0062;0061 05AE 2CF1 0300 0315 0062;0061 05AE 2CF1 0300 0315 0062;0061 05AE 2CF1 0300 0315 0062;
0061 05B0 094D 3099 2D7F 0062;0061 3099 094D 2D7F 05B0 0062;0061 3099 094D 2D7F 05B0 0062;0061 3099 094D 2D7F 05B0 0062;0061 3099 094D 2D7F 05B0 0062;
0061 2D7F 05B0 094D 3099 0062;0061 3099 2D7F 094D 05B0 0062;0061 3099 2D7F 094D 05B0 0062;0061 3099 2D7F 094D 05B0 0062;0061 3099 2D7F 094D 05B0 0062;
0061 0315 0300 05AE 2DE0 0062;00E0 05AE 2DE0 0315 0062;0061 05AE 0300 2DE0 0315 0062;00E0 05AE 2DE0 0315 0062;0061 05AE 0300 2DE0 0315 0062;
0061 2DE0 0315 0300 05AE 0062;0061 05AE 2DE0 0300 0315 0062;0061 05AE 2DE0 0300 0315 0062;0061 05AE 2DE0 0300 0315 0062;0061 05AE 2DE0 0300 0315 0062;
0061 0315 0300 05AE 2DE1 0062;00E0 05AE 2DE1 0315 0062;0061 05AE 0300 2DE1 0315 0062;00E0 05AE 2DE1 0315 0062;0061 05AE 0300 2DE1 0315 0062;
@ -17873,6 +17938,8 @@ FFEE;FFEE;FFEE;25CB;25CB;
0061 10A3A 059A 0316 302A 0062;0061 302A 10A3A 0316 059A 0062;0061 302A 10A3A 0316 059A 0062;0061 302A 10A3A 0316 059A 0062;0061 302A 10A3A 0316 059A 0062;
0061 05B0 094D 3099 10A3F 0062;0061 3099 094D 10A3F 05B0 0062;0061 3099 094D 10A3F 05B0 0062;0061 3099 094D 10A3F 05B0 0062;0061 3099 094D 10A3F 05B0 0062;
0061 10A3F 05B0 094D 3099 0062;0061 3099 10A3F 094D 05B0 0062;0061 3099 10A3F 094D 05B0 0062;0061 3099 10A3F 094D 05B0 0062;0061 3099 10A3F 094D 05B0 0062;
0061 05B0 094D 3099 11046 0062;0061 3099 094D 11046 05B0 0062;0061 3099 094D 11046 05B0 0062;0061 3099 094D 11046 05B0 0062;0061 3099 094D 11046 05B0 0062;
0061 11046 05B0 094D 3099 0062;0061 3099 11046 094D 05B0 0062;0061 3099 11046 094D 05B0 0062;0061 3099 11046 094D 05B0 0062;0061 3099 11046 094D 05B0 0062;
0061 05B0 094D 3099 110B9 0062;0061 3099 094D 110B9 05B0 0062;0061 3099 094D 110B9 05B0 0062;0061 3099 094D 110B9 05B0 0062;0061 3099 094D 110B9 05B0 0062;
0061 110B9 05B0 094D 3099 0062;0061 3099 110B9 094D 05B0 0062;0061 3099 110B9 094D 05B0 0062;0061 3099 110B9 094D 05B0 0062;0061 3099 110B9 094D 05B0 0062;
0061 3099 093C 0334 110BA 0062;0061 0334 093C 110BA 3099 0062;0061 0334 093C 110BA 3099 0062;0061 0334 093C 110BA 3099 0062;0061 0334 093C 110BA 3099 0062;

View file

@ -1,8 +1,8 @@
# PropList-5.2.0.txt
# Date: 2009-08-22, 04:58:40 GMT [MD]
# PropList-6.0.0.txt
# Date: 2010-08-19, 00:48:28 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -126,6 +126,7 @@ FF63 ; Quotation_Mark
070C ; Terminal_Punctuation
07F8..07F9 ; Terminal_Punctuation
0830..083E ; Terminal_Punctuation
085E ; Terminal_Punctuation
0964..0965 ; Terminal_Punctuation
0E5A..0E5B ; Terminal_Punctuation
0F08 ; Terminal_Punctuation
@ -172,10 +173,11 @@ FF64 ; Terminal_Punctuation
10857 ; Terminal_Punctuation
1091F ; Terminal_Punctuation
10B3A..10B3F ; Terminal_Punctuation
11047..1104D ; Terminal_Punctuation
110BE..110C1 ; Terminal_Punctuation
12470..12473 ; Terminal_Punctuation
# Total code points: 161
# Total code points: 169
# ================================================
@ -197,6 +199,7 @@ FF64 ; Terminal_Punctuation
20E5..20E6 ; Other_Math
20EB..20EF ; Other_Math
2102 ; Other_Math
2107 ; Other_Math
210A..2113 ; Other_Math
2115 ; Other_Math
2119..211D ; Other_Math
@ -318,7 +321,7 @@ FF3E ; Other_Math
1D7C4..1D7CB ; Other_Math
1D7CE..1D7FF ; Other_Math
# Total code points: 1216
# Total code points: 1217
# ================================================
@ -349,7 +352,7 @@ FF41..FF46 ; Hex_Digit
05C7 ; Other_Alphabetic
0610..061A ; Other_Alphabetic
064B..0657 ; Other_Alphabetic
0659..065E ; Other_Alphabetic
0659..065F ; Other_Alphabetic
0670 ; Other_Alphabetic
06D6..06DC ; Other_Alphabetic
06E1..06E4 ; Other_Alphabetic
@ -364,11 +367,13 @@ FF41..FF46 ; Hex_Digit
0829..082C ; Other_Alphabetic
0900..0902 ; Other_Alphabetic
0903 ; Other_Alphabetic
093A ; Other_Alphabetic
093B ; Other_Alphabetic
093E..0940 ; Other_Alphabetic
0941..0948 ; Other_Alphabetic
0949..094C ; Other_Alphabetic
094E ; Other_Alphabetic
0955 ; Other_Alphabetic
094E..094F ; Other_Alphabetic
0955..0957 ; Other_Alphabetic
0962..0963 ; Other_Alphabetic
0981 ; Other_Alphabetic
0982..0983 ; Other_Alphabetic
@ -453,7 +458,7 @@ FF41..FF46 ; Hex_Digit
0F71..0F7E ; Other_Alphabetic
0F7F ; Other_Alphabetic
0F80..0F81 ; Other_Alphabetic
0F90..0F97 ; Other_Alphabetic
0F8D..0F97 ; Other_Alphabetic
0F99..0FBC ; Other_Alphabetic
102B..102C ; Other_Alphabetic
102D..1030 ; Other_Alphabetic
@ -520,6 +525,12 @@ FF41..FF46 ; Hex_Digit
1BA2..1BA5 ; Other_Alphabetic
1BA6..1BA7 ; Other_Alphabetic
1BA8..1BA9 ; Other_Alphabetic
1BE7 ; Other_Alphabetic
1BE8..1BE9 ; Other_Alphabetic
1BEA..1BEC ; Other_Alphabetic
1BED ; Other_Alphabetic
1BEE ; Other_Alphabetic
1BEF..1BF1 ; Other_Alphabetic
1C24..1C2B ; Other_Alphabetic
1C2C..1C33 ; Other_Alphabetic
1C34..1C35 ; Other_Alphabetic
@ -536,7 +547,6 @@ A947..A951 ; Other_Alphabetic
A952 ; Other_Alphabetic
A980..A982 ; Other_Alphabetic
A983 ; Other_Alphabetic
A9B3 ; Other_Alphabetic
A9B4..A9B5 ; Other_Alphabetic
A9B6..A9B9 ; Other_Alphabetic
A9BA..A9BB ; Other_Alphabetic
@ -563,12 +573,16 @@ FB1E ; Other_Alphabetic
10A01..10A03 ; Other_Alphabetic
10A05..10A06 ; Other_Alphabetic
10A0C..10A0F ; Other_Alphabetic
11000 ; Other_Alphabetic
11001 ; Other_Alphabetic
11002 ; Other_Alphabetic
11038..11045 ; Other_Alphabetic
11082 ; Other_Alphabetic
110B0..110B2 ; Other_Alphabetic
110B3..110B6 ; Other_Alphabetic
110B7..110B8 ; Other_Alphabetic
# Total code points: 759
# Total code points: 795
# ================================================
@ -583,9 +597,10 @@ FA30..FA6D ; Ideographic
FA70..FAD9 ; Ideographic
20000..2A6D6 ; Ideographic
2A700..2B734 ; Ideographic
2B740..2B81D ; Ideographic
2F800..2FA1D ; Ideographic
# Total code points: 75408
# Total code points: 75630
# ================================================
@ -863,8 +878,9 @@ FA23..FA24 ; Unified_Ideograph
FA27..FA29 ; Unified_Ideograph
20000..2A6D6 ; Unified_Ideograph
2A700..2B734 ; Unified_Ideograph
2B740..2B81D ; Unified_Ideograph
# Total code points: 74394
# Total code points: 74616
# ================================================
@ -884,6 +900,7 @@ E01F0..E0FFF ; Other_Default_Ignorable_Code_Point
# ================================================
0149 ; Deprecated
0673 ; Deprecated
0F77 ; Deprecated
0F79 ; Deprecated
17A3..17A4 ; Deprecated
@ -893,7 +910,7 @@ E01F0..E0FFF ; Other_Default_Ignorable_Code_Point
E0001 ; Deprecated
E0020..E007F ; Deprecated
# Total code points: 110
# Total code points: 111
# ================================================
@ -954,8 +971,9 @@ AABB..AABC ; Logical_Order_Exception
00B7 ; Other_ID_Continue
0387 ; Other_ID_Continue
1369..1371 ; Other_ID_Continue
19DA ; Other_ID_Continue
# Total code points: 11
# Total code points: 12
# ================================================
@ -974,9 +992,11 @@ AABB..AABC ; Logical_Order_Exception
1362 ; STerm
1367..1368 ; STerm
166E ; STerm
1735..1736 ; STerm
1803 ; STerm
1809 ; STerm
1944..1945 ; STerm
1AA8..1AAB ; STerm
1B5A..1B5B ; STerm
1B5E..1B5F ; STerm
1C3B..1C3C ; STerm
@ -1001,9 +1021,11 @@ FF01 ; STerm
FF0E ; STerm
FF1F ; STerm
FF61 ; STerm
10A56..10A57 ; STerm
11047..11048 ; STerm
110BE..110C1 ; STerm
# Total code points: 66
# Total code points: 76
# ================================================
@ -1116,8 +1138,8 @@ E0100..E01EF ; Variation_Selector
239B..23B3 ; Pattern_Syntax
23B4..23DB ; Pattern_Syntax
23DC..23E1 ; Pattern_Syntax
23E2..23E8 ; Pattern_Syntax
23E9..23FF ; Pattern_Syntax
23E2..23F3 ; Pattern_Syntax
23F4..23FF ; Pattern_Syntax
2400..2426 ; Pattern_Syntax
2427..243F ; Pattern_Syntax
2440..244A ; Pattern_Syntax
@ -1130,29 +1152,9 @@ E0100..E01EF ; Variation_Selector
25F8..25FF ; Pattern_Syntax
2600..266E ; Pattern_Syntax
266F ; Pattern_Syntax
2670..26CD ; Pattern_Syntax
26CE ; Pattern_Syntax
26CF..26E1 ; Pattern_Syntax
26E2 ; Pattern_Syntax
26E3 ; Pattern_Syntax
26E4..26E7 ; Pattern_Syntax
26E8..26FF ; Pattern_Syntax
2670..26FF ; Pattern_Syntax
2700 ; Pattern_Syntax
2701..2704 ; Pattern_Syntax
2705 ; Pattern_Syntax
2706..2709 ; Pattern_Syntax
270A..270B ; Pattern_Syntax
270C..2727 ; Pattern_Syntax
2728 ; Pattern_Syntax
2729..274B ; Pattern_Syntax
274C ; Pattern_Syntax
274D ; Pattern_Syntax
274E ; Pattern_Syntax
274F..2752 ; Pattern_Syntax
2753..2755 ; Pattern_Syntax
2756..275E ; Pattern_Syntax
275F..2760 ; Pattern_Syntax
2761..2767 ; Pattern_Syntax
2701..2767 ; Pattern_Syntax
2768 ; Pattern_Syntax
2769 ; Pattern_Syntax
276A ; Pattern_Syntax
@ -1167,20 +1169,15 @@ E0100..E01EF ; Variation_Selector
2773 ; Pattern_Syntax
2774 ; Pattern_Syntax
2775 ; Pattern_Syntax
2794 ; Pattern_Syntax
2795..2797 ; Pattern_Syntax
2798..27AF ; Pattern_Syntax
27B0 ; Pattern_Syntax
27B1..27BE ; Pattern_Syntax
27BF ; Pattern_Syntax
2794..27BF ; Pattern_Syntax
27C0..27C4 ; Pattern_Syntax
27C5 ; Pattern_Syntax
27C6 ; Pattern_Syntax
27C7..27CA ; Pattern_Syntax
27CB ; Pattern_Syntax
27CC ; Pattern_Syntax
27CD..27CF ; Pattern_Syntax
27D0..27E5 ; Pattern_Syntax
27CD ; Pattern_Syntax
27CE..27E5 ; Pattern_Syntax
27E6 ; Pattern_Syntax
27E7 ; Pattern_Syntax
27E8 ; Pattern_Syntax

View file

@ -1,8 +1,8 @@
# PropertyAliases-5.2.0.txt
# Date: 2009-08-24, 03:26:46 GMT [MD]
# PropertyAliases-6.0.0.txt
# Date: 2010-05-18, 00:49:38 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#

View file

@ -1,8 +1,8 @@
# PropertyValueAliases-5.2.0.txt
# Date: 2009-08-24, 03:27:01 GMT [MD]
# PropertyValueAliases-6.0.0.txt
# Date: 2010-07-17, 22:44:06 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -73,6 +73,7 @@ age; n/a ; 4.1
age; n/a ; 5.0
age; n/a ; 5.1
age; n/a ; 5.2
age; n/a ; 6.0
age; n/a ; unassigned
# Alphabetic (Alpha)
@ -119,6 +120,7 @@ Bidi_M; Y ; Yes ; T
# Block (blk)
blk; n/a ; Aegean_Numbers
blk; n/a ; Alchemical_Symbols
blk; n/a ; Alphabetic_Presentation_Forms
blk; n/a ; Ancient_Greek_Musical_Notation
blk; n/a ; Ancient_Greek_Numbers
@ -132,12 +134,15 @@ blk; n/a ; Arrows
blk; n/a ; Avestan
blk; n/a ; Balinese
blk; n/a ; Bamum
blk; n/a ; Bamum_Supplement
blk; n/a ; Basic_Latin ; ASCII
blk; n/a ; Batak
blk; n/a ; Bengali
blk; n/a ; Block_Elements
blk; n/a ; Bopomofo
blk; n/a ; Bopomofo_Extended
blk; n/a ; Box_Drawing
blk; n/a ; Brahmi
blk; n/a ; Braille_Patterns
blk; n/a ; Buginese
blk; n/a ; Buhid
@ -156,6 +161,7 @@ blk; n/a ; CJK_Unified_Ideographs
blk; n/a ; CJK_Unified_Ideographs_Extension_A
blk; n/a ; CJK_Unified_Ideographs_Extension_B
blk; n/a ; CJK_Unified_Ideographs_Extension_C
blk; n/a ; CJK_Unified_Ideographs_Extension_D
blk; n/a ; Combining_Diacritical_Marks
blk; n/a ; Combining_Diacritical_Marks_For_Symbols; Combining_Marks_For_Symbols
blk; n/a ; Combining_Diacritical_Marks_Supplement
@ -178,12 +184,14 @@ blk; n/a ; Devanagari_Extended
blk; n/a ; Dingbats
blk; n/a ; Domino_Tiles
blk; n/a ; Egyptian_Hieroglyphs
blk; n/a ; Emoticons
blk; n/a ; Enclosed_Alphanumeric_Supplement
blk; n/a ; Enclosed_Alphanumerics
blk; n/a ; Enclosed_CJK_Letters_And_Months
blk; n/a ; Enclosed_Ideographic_Supplement
blk; n/a ; Ethiopic
blk; n/a ; Ethiopic_Extended
blk; n/a ; Ethiopic_Extended_A
blk; n/a ; Ethiopic_Supplement
blk; n/a ; General_Punctuation
blk; n/a ; Geometric_Shapes
@ -213,6 +221,7 @@ blk; n/a ; Inscriptional_Parthian
blk; n/a ; IPA_Extensions
blk; n/a ; Javanese
blk; n/a ; Kaithi
blk; n/a ; Kana_Supplement
blk; n/a ; Kanbun
blk; n/a ; Kangxi_Radicals
blk; n/a ; Kannada
@ -240,6 +249,7 @@ blk; n/a ; Lycian
blk; n/a ; Lydian
blk; n/a ; Mahjong_Tiles
blk; n/a ; Malayalam
blk; n/a ; Mandaic
blk; n/a ; Mathematical_Alphanumeric_Symbols
blk; n/a ; Mathematical_Operators
blk; n/a ; Meetei_Mayek
@ -247,6 +257,7 @@ blk; n/a ; Miscellaneous_Mathematical_Symbols_A
blk; n/a ; Miscellaneous_Mathematical_Symbols_B
blk; n/a ; Miscellaneous_Symbols
blk; n/a ; Miscellaneous_Symbols_And_Arrows
blk; n/a ; Miscellaneous_Symbols_And_Pictographs
blk; n/a ; Miscellaneous_Technical
blk; n/a ; Modifier_Tone_Letters
blk; n/a ; Mongolian
@ -271,6 +282,7 @@ blk; n/a ; Phaistos_Disc
blk; n/a ; Phoenician
blk; n/a ; Phonetic_Extensions
blk; n/a ; Phonetic_Extensions_Supplement
blk; n/a ; Playing_Cards
blk; n/a ; Private_Use_Area ; Private_Use
blk; n/a ; Rejang
blk; n/a ; Rumi_Numeral_Symbols
@ -305,6 +317,7 @@ blk; n/a ; Thaana
blk; n/a ; Thai
blk; n/a ; Tibetan
blk; n/a ; Tifinagh
blk; n/a ; Transport_And_Map_Symbols
blk; n/a ; Ugaritic
blk; n/a ; Unified_Canadian_Aboriginal_Syllabics; Canadian_Syllabics
blk; n/a ; Unified_Canadian_Aboriginal_Syllabics_Extended
@ -675,7 +688,6 @@ jg ; n/a ; Final_Semkath
jg ; n/a ; Gaf
jg ; n/a ; Gamal
jg ; n/a ; Hah
jg ; n/a ; Hamza_On_Heh_Goal
jg ; n/a ; He
jg ; n/a ; Heh
jg ; n/a ; Heh_Goal
@ -707,6 +719,7 @@ jg ; n/a ; Syriac_Waw
jg ; n/a ; Tah
jg ; n/a ; Taw
jg ; n/a ; Teh_Marbuta
jg ; n/a ; Teh_Marbuta_Goal ; Hamza_On_Heh_Goal
jg ; n/a ; Teth
jg ; n/a ; Waw
jg ; n/a ; Yeh
@ -908,8 +921,10 @@ sc ; Armn ; Armenian
sc ; Avst ; Avestan
sc ; Bali ; Balinese
sc ; Bamu ; Bamum
sc ; Batk ; Batak
sc ; Beng ; Bengali
sc ; Bopo ; Bopomofo
sc ; Brah ; Brahmi
sc ; Brai ; Braille
sc ; Bugi ; Buginese
sc ; Buhd ; Buhid
@ -953,6 +968,7 @@ sc ; Linb ; Linear_B
sc ; Lisu ; Lisu
sc ; Lyci ; Lycian
sc ; Lydi ; Lydian
sc ; Mand ; Mandaic
sc ; Mlym ; Malayalam
sc ; Mong ; Mongolian
sc ; Mtei ; Meetei_Mayek

View file

@ -0,0 +1,172 @@
# ScriptExtensions-6.0.0.txt
# Date: 2010-08-30, 01:48:36 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
# The Script Extensions contain data about characters that belong to multiple scripts.
# This data is provisional, and expected to change over time, as more information becomes available.
# The script values are space-delimited short values, such as Hang for Hangul.
# For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
# ================================================
# Property: Script_Extensions
# ================================================
# Script_Extensions=Arab Syrc
0640 ; Arab Syrc # Lm ARABIC TATWEEL
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
0670 ; Arab Syrc # Mn ARABIC LETTER SUPERSCRIPT ALEF
# Total code points: 13
# ================================================
# Script_Extensions=Arab Thaa
0660..0669 ; Arab Thaa # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
FDF2 ; Arab Thaa # Lo ARABIC LIGATURE ALLAH ISOLATED FORM
FDFD ; Arab Thaa # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
# Total code points: 12
# ================================================
# Script_Extensions=Armn Geor
0589 ; Armn Geor # Po ARMENIAN FULL STOP
# Total code points: 1
# ================================================
# Script_Extensions=Bopo Hani
302A..302D ; Bopo Hani # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
# Total code points: 4
# ================================================
# Script_Extensions=Hira Kana
3031..3035 ; Hira Kana # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
3099..309A ; Hira Kana # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
309B..309C ; Hira Kana # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
30A0 ; Hira Kana # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
30FC ; Hira Kana # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF70 ; Hira Kana # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
# Total code points: 14
# ================================================
# Script_Extensions=Mong Phag
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
1805 ; Mong Phag # Po MONGOLIAN FOUR DOTS
# Total code points: 3
# ================================================
# Script_Extensions=Arab Syrc Thaa
060C ; Arab Syrc Thaa # Po ARABIC COMMA
061B ; Arab Syrc Thaa # Po ARABIC SEMICOLON
061F ; Arab Syrc Thaa # Po ARABIC QUESTION MARK
# Total code points: 3
# ================================================
# Script_Extensions=Hani Hira Kana
3006 ; Hani Hira Kana # Lo IDEOGRAPHIC CLOSING MARK
303C ; Hani Hira Kana # Lo MASU MARK
303D ; Hani Hira Kana # Po PART ALTERNATION MARK
3190..3191 ; Hani Hira Kana # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
3192..3195 ; Hani Hira Kana # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
3196..319F ; Hani Hira Kana # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
# Total code points: 19
# ================================================
# Script_Extensions=Beng Deva Guru Orya
0964..0965 ; Beng Deva Guru Orya # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
# Total code points: 2
# ================================================
# Script_Extensions=Buhd Hano Tagb Tglg
1735..1736 ; Buhd Hano Tagb Tglg # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
# Total code points: 2
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
301C ; Bopo Hang Hani Hira Kana # Pd WAVE DASH
301D ; Bopo Hang Hani Hira Kana # Ps REVERSED DOUBLE PRIME QUOTATION MARK
301E..301F ; Bopo Hang Hani Hira Kana # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
3030 ; Bopo Hang Hani Hira Kana # Pd WAVY DASH
3037 ; Bopo Hang Hani Hira Kana # So IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
303E..303F ; Bopo Hang Hani Hira Kana # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
31C0..31E3 ; Bopo Hang Hani Hira Kana # So [36] CJK STROKE T..CJK STROKE Q
3220..3229 ; Bopo Hang Hani Hira Kana # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
322A..3243 ; Bopo Hang Hani Hira Kana # So [26] PARENTHESIZED IDEOGRAPH MOON..PARENTHESIZED IDEOGRAPH REACH
3280..3289 ; Bopo Hang Hani Hira Kana # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
328A..32B0 ; Bopo Hang Hani Hira Kana # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
32C0..32CB ; Bopo Hang Hani Hira Kana # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
3358..3370 ; Bopo Hang Hani Hira Kana # So [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
337B..337F ; Bopo Hang Hani Hira Kana # So [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
33E0..33FE ; Bopo Hang Hani Hira Kana # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 206
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
3008 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT ANGLE BRACKET
3009 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT ANGLE BRACKET
300A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT DOUBLE ANGLE BRACKET
300B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT DOUBLE ANGLE BRACKET
300C ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT CORNER BRACKET
300D ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT CORNER BRACKET
300E ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE CORNER BRACKET
300F ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE CORNER BRACKET
3010 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT BLACK LENTICULAR BRACKET
3011 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT BLACK LENTICULAR BRACKET
3014 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT TORTOISE SHELL BRACKET
3015 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT TORTOISE SHELL BRACKET
3016 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE LENTICULAR BRACKET
3017 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE LENTICULAR BRACKET
3018 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE TORTOISE SHELL BRACKET
3019 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE TORTOISE SHELL BRACKET
301A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE SQUARE BRACKET
301B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE SQUARE BRACKET
30FB ; Bopo Hang Hani Hira Kana Yiii # Po KATAKANA MIDDLE DOT
FF61 ; Bopo Hang Hani Hira Kana Yiii # Po HALFWIDTH IDEOGRAPHIC FULL STOP
FF62 ; Bopo Hang Hani Hira Kana Yiii # Ps HALFWIDTH LEFT CORNER BRACKET
FF63 ; Bopo Hang Hani Hira Kana Yiii # Pe HALFWIDTH RIGHT CORNER BRACKET
FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
# Total code points: 26
# EOF

View file

@ -1,8 +1,8 @@
# Scripts-5.2.0.txt
# Date: 2009-08-22, 04:58:43 GMT [MD]
# Scripts-6.0.0.txt
# Date: 2010-08-19, 00:48:47 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -73,7 +73,7 @@
02C2..02C5 ; Common
02C6..02D1 ; Common
02D2..02DF ; Common
02E5..02EB ; Common
02E5..02E9 ; Common
02EC ; Common
02ED ; Common
02EE ; Common
@ -83,7 +83,6 @@
0385 ; Common
0387 ; Common
0589 ; Common
0600..0603 ; Common
060C ; Common
061B ; Common
061F ; Common
@ -92,7 +91,6 @@
06DD ; Common
0964..0965 ; Common
0970 ; Common
0CF1..0CF2 ; Common
0E3F ; Common
0FD5..0FD8 ; Common
10FB ; Common
@ -148,7 +146,7 @@
208A..208C ; Common
208D ; Common
208E ; Common
20A0..20B8 ; Common
20A0..20B9 ; Common
2100..2101 ; Common
2102 ; Common
2103..2106 ; Common
@ -157,7 +155,8 @@
210A..2113 ; Common
2114 ; Common
2115 ; Common
2116..2118 ; Common
2116..2117 ; Common
2118 ; Common
2119..211D ; Common
211E..2123 ; Common
2124 ; Common
@ -213,7 +212,7 @@
239B..23B3 ; Common
23B4..23DB ; Common
23DC..23E1 ; Common
23E2..23E8 ; Common
23E2..23F3 ; Common
2400..2426 ; Common
2440..244A ; Common
2460..249B ; Common
@ -227,18 +226,8 @@
25F8..25FF ; Common
2600..266E ; Common
266F ; Common
2670..26CD ; Common
26CF..26E1 ; Common
26E3 ; Common
26E8..26FF ; Common
2701..2704 ; Common
2706..2709 ; Common
270C..2727 ; Common
2729..274B ; Common
274D ; Common
274F..2752 ; Common
2756..275E ; Common
2761..2767 ; Common
2670..26FF ; Common
2701..2767 ; Common
2768 ; Common
2769 ; Common
276A ; Common
@ -254,15 +243,13 @@
2774 ; Common
2775 ; Common
2776..2793 ; Common
2794 ; Common
2798..27AF ; Common
27B1..27BE ; Common
2794..27BF ; Common
27C0..27C4 ; Common
27C5 ; Common
27C6 ; Common
27C7..27CA ; Common
27CC ; Common
27D0..27E5 ; Common
27CE..27E5 ; Common
27E6 ; Common
27E7 ; Common
27E8 ; Common
@ -555,27 +542,51 @@ FFFC..FFFD ; Common
1D7CE..1D7FF ; Common
1F000..1F02B ; Common
1F030..1F093 ; Common
1F0A0..1F0AE ; Common
1F0B1..1F0BE ; Common
1F0C1..1F0CF ; Common
1F0D1..1F0DF ; Common
1F100..1F10A ; Common
1F110..1F12E ; Common
1F131 ; Common
1F13D ; Common
1F13F ; Common
1F142 ; Common
1F146 ; Common
1F14A..1F14E ; Common
1F157 ; Common
1F15F ; Common
1F179 ; Common
1F17B..1F17C ; Common
1F17F ; Common
1F18A..1F18D ; Common
1F190 ; Common
1F210..1F231 ; Common
1F130..1F169 ; Common
1F170..1F19A ; Common
1F1E6..1F1FF ; Common
1F201..1F202 ; Common
1F210..1F23A ; Common
1F240..1F248 ; Common
1F250..1F251 ; Common
1F300..1F320 ; Common
1F330..1F335 ; Common
1F337..1F37C ; Common
1F380..1F393 ; Common
1F3A0..1F3C4 ; Common
1F3C6..1F3CA ; Common
1F3E0..1F3F0 ; Common
1F400..1F43E ; Common
1F440 ; Common
1F442..1F4F7 ; Common
1F4F9..1F4FC ; Common
1F500..1F53D ; Common
1F550..1F567 ; Common
1F5FB..1F5FF ; Common
1F601..1F610 ; Common
1F612..1F614 ; Common
1F616 ; Common
1F618 ; Common
1F61A ; Common
1F61C..1F61E ; Common
1F620..1F625 ; Common
1F628..1F62B ; Common
1F62D ; Common
1F630..1F633 ; Common
1F635..1F640 ; Common
1F645..1F64F ; Common
1F680..1F6C5 ; Common
1F700..1F773 ; Common
E0001 ; Common
E0020..E007F ; Common
# Total code points: 5395
# Total code points: 6379
# ================================================
@ -603,7 +614,7 @@ E0020..E007F ; Common
1E00..1EFF ; Latin
2071 ; Latin
207F ; Latin
2090..2094 ; Latin
2090..209C ; Latin
212A..212B ; Latin
2132 ; Latin
214E ; Latin
@ -616,13 +627,16 @@ E0020..E007F ; Common
A722..A76F ; Latin
A770 ; Latin
A771..A787 ; Latin
A78B..A78C ; Latin
A78B..A78E ; Latin
A790..A791 ; Latin
A7A0..A7A9 ; Latin
A7FA ; Latin
A7FB..A7FF ; Latin
FB00..FB06 ; Latin
FF21..FF3A ; Latin
FF41..FF5A ; Latin
# Total code points: 1244
# Total code points: 1267
# ================================================
@ -687,12 +701,11 @@ FF41..FF5A ; Latin
0483..0484 ; Cyrillic
0487 ; Cyrillic
0488..0489 ; Cyrillic
048A..0525 ; Cyrillic
048A..0527 ; Cyrillic
1D2B ; Cyrillic
1D78 ; Cyrillic
2DE0..2DFF ; Cyrillic
A640..A65F ; Cyrillic
A662..A66D ; Cyrillic
A640..A66D ; Cyrillic
A66E ; Cyrillic
A66F ; Cyrillic
A670..A672 ; Cyrillic
@ -702,7 +715,7 @@ A67E ; Cyrillic
A67F ; Cyrillic
A680..A697 ; Cyrillic
# Total code points: 404
# Total code points: 408
# ================================================
@ -744,6 +757,7 @@ FB46..FB4F ; Hebrew
# ================================================
0600..0603 ; Arabic
0606..0608 ; Arabic
0609..060A ; Arabic
060B ; Arabic
@ -751,7 +765,7 @@ FB46..FB4F ; Hebrew
060E..060F ; Arabic
0610..061A ; Arabic
061E ; Arabic
0621..063F ; Arabic
0620..063F ; Arabic
0641..064A ; Arabic
0656..065E ; Arabic
066A..066D ; Arabic
@ -773,6 +787,7 @@ FB46..FB4F ; Hebrew
06FF ; Arabic
0750..077F ; Arabic
FB50..FBB1 ; Arabic
FBB2..FBC1 ; Arabic
FBD3..FD3D ; Arabic
FD50..FD8F ; Arabic
FD92..FDC7 ; Arabic
@ -782,7 +797,7 @@ FE70..FE74 ; Arabic
FE76..FEFC ; Arabic
10E60..10E7E ; Arabic
# Total code points: 1030
# Total code points: 1051
# ================================================
@ -809,27 +824,29 @@ FE76..FEFC ; Arabic
0900..0902 ; Devanagari
0903 ; Devanagari
0904..0939 ; Devanagari
093A ; Devanagari
093B ; Devanagari
093C ; Devanagari
093D ; Devanagari
093E..0940 ; Devanagari
0941..0948 ; Devanagari
0949..094C ; Devanagari
094D ; Devanagari
094E ; Devanagari
094E..094F ; Devanagari
0950 ; Devanagari
0953..0955 ; Devanagari
0953..0957 ; Devanagari
0958..0961 ; Devanagari
0962..0963 ; Devanagari
0966..096F ; Devanagari
0971 ; Devanagari
0972 ; Devanagari
0972..0977 ; Devanagari
0979..097F ; Devanagari
A8E0..A8F1 ; Devanagari
A8F2..A8F7 ; Devanagari
A8F8..A8FA ; Devanagari
A8FB ; Devanagari
# Total code points: 140
# Total code points: 150
# ================================================
@ -941,8 +958,9 @@ A8FB ; Devanagari
0B66..0B6F ; Oriya
0B70 ; Oriya
0B71 ; Oriya
0B72..0B77 ; Oriya
# Total code points: 84
# Total code points: 90
# ================================================
@ -1018,22 +1036,23 @@ A8FB ; Devanagari
0CE0..0CE1 ; Kannada
0CE2..0CE3 ; Kannada
0CE6..0CEF ; Kannada
0CF1..0CF2 ; Kannada
# Total code points: 84
# Total code points: 86
# ================================================
0D02..0D03 ; Malayalam
0D05..0D0C ; Malayalam
0D0E..0D10 ; Malayalam
0D12..0D28 ; Malayalam
0D2A..0D39 ; Malayalam
0D12..0D3A ; Malayalam
0D3D ; Malayalam
0D3E..0D40 ; Malayalam
0D41..0D44 ; Malayalam
0D46..0D48 ; Malayalam
0D4A..0D4C ; Malayalam
0D4D ; Malayalam
0D4E ; Malayalam
0D57 ; Malayalam
0D60..0D61 ; Malayalam
0D62..0D63 ; Malayalam
@ -1042,7 +1061,7 @@ A8FB ; Devanagari
0D79 ; Malayalam
0D7A..0D7F ; Malayalam
# Total code points: 95
# Total code points: 98
# ================================================
@ -1132,16 +1151,17 @@ A8FB ; Devanagari
0F80..0F84 ; Tibetan
0F85 ; Tibetan
0F86..0F87 ; Tibetan
0F88..0F8B ; Tibetan
0F90..0F97 ; Tibetan
0F88..0F8C ; Tibetan
0F8D..0F97 ; Tibetan
0F99..0FBC ; Tibetan
0FBE..0FC5 ; Tibetan
0FC6 ; Tibetan
0FC7..0FCC ; Tibetan
0FCE..0FCF ; Tibetan
0FD0..0FD4 ; Tibetan
0FD9..0FDA ; Tibetan
# Total code points: 201
# Total code points: 207
# ================================================
@ -1201,6 +1221,7 @@ AA7B ; Myanmar
# ================================================
1100..11FF ; Hangul
302E..302F ; Hangul
3131..318E ; Hangul
3200..321E ; Hangul
3260..327E ; Hangul
@ -1214,7 +1235,7 @@ FFCA..FFCF ; Hangul
FFD2..FFD7 ; Hangul
FFDA..FFDC ; Hangul
# Total code points: 11737
# Total code points: 11739
# ================================================
@ -1234,7 +1255,7 @@ FFDA..FFDC ; Hangul
12D8..1310 ; Ethiopic
1312..1315 ; Ethiopic
1318..135A ; Ethiopic
135F ; Ethiopic
135D..135F ; Ethiopic
1360 ; Ethiopic
1361..1368 ; Ethiopic
1369..137C ; Ethiopic
@ -1249,8 +1270,13 @@ FFDA..FFDC ; Hangul
2DC8..2DCE ; Ethiopic
2DD0..2DD6 ; Ethiopic
2DD8..2DDE ; Ethiopic
AB01..AB06 ; Ethiopic
AB09..AB0E ; Ethiopic
AB11..AB16 ; Ethiopic
AB20..AB26 ; Ethiopic
AB28..AB2E ; Ethiopic
# Total code points: 461
# Total code points: 495
# ================================================
@ -1329,9 +1355,10 @@ FFDA..FFDC ; Hangul
3041..3096 ; Hiragana
309D..309E ; Hiragana
309F ; Hiragana
1B001 ; Hiragana
1F200 ; Hiragana
# Total code points: 90
# Total code points: 91
# ================================================
@ -1343,15 +1370,17 @@ FFDA..FFDC ; Hangul
3300..3357 ; Katakana
FF66..FF6F ; Katakana
FF71..FF9D ; Katakana
1B000 ; Katakana
# Total code points: 299
# Total code points: 300
# ================================================
02EA..02EB ; Bopomofo
3105..312D ; Bopomofo
31A0..31B7 ; Bopomofo
31A0..31BA ; Bopomofo
# Total code points: 65
# Total code points: 70
# ================================================
@ -1370,9 +1399,10 @@ FA30..FA6D ; Han
FA70..FAD9 ; Han
20000..2A6D6 ; Han
2A700..2B734 ; Han
2B740..2B81D ; Han
2F800..2FA1D ; Han
# Total code points: 75738
# Total code points: 75960
# ================================================
@ -1410,6 +1440,7 @@ A490..A4C6 ; Yi
0300..036F ; Inherited
0485..0486 ; Inherited
064B..0655 ; Inherited
065F ; Inherited
0670 ; Inherited
0951..0952 ; Inherited
1CD0..1CD2 ; Inherited
@ -1417,14 +1448,14 @@ A490..A4C6 ; Yi
1CE2..1CE8 ; Inherited
1CED ; Inherited
1DC0..1DE6 ; Inherited
1DFD..1DFF ; Inherited
1DFC..1DFF ; Inherited
200C..200D ; Inherited
20D0..20DC ; Inherited
20DD..20E0 ; Inherited
20E1 ; Inherited
20E2..20E4 ; Inherited
20E5..20F0 ; Inherited
302A..302F ; Inherited
302A..302D ; Inherited
3099..309A ; Inherited
FE00..FE0F ; Inherited
FE20..FE26 ; Inherited
@ -1568,7 +1599,8 @@ E0100..E01EF ; Inherited
19B0..19C0 ; New_Tai_Lue
19C1..19C7 ; New_Tai_Lue
19C8..19C9 ; New_Tai_Lue
19D0..19DA ; New_Tai_Lue
19D0..19D9 ; New_Tai_Lue
19DA ; New_Tai_Lue
19DE..19DF ; New_Tai_Lue
# Total code points: 83
@ -1584,8 +1616,10 @@ E0100..E01EF ; Inherited
2D30..2D65 ; Tifinagh
2D6F ; Tifinagh
2D70 ; Tifinagh
2D7F ; Tifinagh
# Total code points: 55
# Total code points: 57
# ================================================
@ -1882,8 +1916,9 @@ A6A0..A6E5 ; Bamum
A6E6..A6EF ; Bamum
A6F0..A6F1 ; Bamum
A6F2..A6F7 ; Bamum
16800..16A38 ; Bamum
# Total code points: 88
# Total code points: 657
# ================================================
@ -1969,4 +2004,40 @@ ABF0..ABF9 ; Meetei_Mayek
# Total code points: 66
# ================================================
1BC0..1BE5 ; Batak
1BE6 ; Batak
1BE7 ; Batak
1BE8..1BE9 ; Batak
1BEA..1BEC ; Batak
1BED ; Batak
1BEE ; Batak
1BEF..1BF1 ; Batak
1BF2..1BF3 ; Batak
1BFC..1BFF ; Batak
# Total code points: 56
# ================================================
11000 ; Brahmi
11001 ; Brahmi
11002 ; Brahmi
11003..11037 ; Brahmi
11038..11046 ; Brahmi
11047..1104D ; Brahmi
11052..11065 ; Brahmi
11066..1106F ; Brahmi
# Total code points: 108
# ================================================
0840..0858 ; Mandaic
0859..085B ; Mandaic
085E ; Mandaic
# Total code points: 29
# EOF

View file

@ -1,8 +1,8 @@
# SentenceBreakProperty-5.2.0.txt
# Date: 2009-08-22, 04:58:44 GMT [MD]
# SentenceBreakProperty-6.0.0.txt
# Date: 2010-08-19, 00:48:47 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -38,10 +38,9 @@
05C4..05C5 ; Extend
05C7 ; Extend
0610..061A ; Extend
064B..065E ; Extend
064B..065F ; Extend
0670 ; Extend
06D6..06DC ; Extend
06DE ; Extend
06DF..06E4 ; Extend
06E7..06E8 ; Extend
06EA..06ED ; Extend
@ -53,15 +52,18 @@
081B..0823 ; Extend
0825..0827 ; Extend
0829..082D ; Extend
0859..085B ; Extend
0900..0902 ; Extend
0903 ; Extend
093A ; Extend
093B ; Extend
093C ; Extend
093E..0940 ; Extend
0941..0948 ; Extend
0949..094C ; Extend
094D ; Extend
094E ; Extend
0951..0955 ; Extend
094E..094F ; Extend
0951..0957 ; Extend
0962..0963 ; Extend
0981 ; Extend
0982..0983 ; Extend
@ -163,7 +165,7 @@
0F7F ; Extend
0F80..0F84 ; Extend
0F86..0F87 ; Extend
0F90..0F97 ; Extend
0F8D..0F97 ; Extend
0F99..0FBC ; Extend
0FC6 ; Extend
102B..102C ; Extend
@ -188,7 +190,7 @@
108F ; Extend
109A..109C ; Extend
109D ; Extend
135F ; Extend
135D..135F ; Extend
1712..1714 ; Extend
1732..1734 ; Extend
1752..1753 ; Extend
@ -244,6 +246,14 @@
1BA6..1BA7 ; Extend
1BA8..1BA9 ; Extend
1BAA ; Extend
1BE6 ; Extend
1BE7 ; Extend
1BE8..1BE9 ; Extend
1BEA..1BEC ; Extend
1BED ; Extend
1BEE ; Extend
1BEF..1BF1 ; Extend
1BF2..1BF3 ; Extend
1C24..1C2B ; Extend
1C2C..1C33 ; Extend
1C34..1C35 ; Extend
@ -255,7 +265,7 @@
1CED ; Extend
1CF2 ; Extend
1DC0..1DE6 ; Extend
1DFD..1DFF ; Extend
1DFC..1DFF ; Extend
200C..200D ; Extend
20D0..20DC ; Extend
20DD..20E0 ; Extend
@ -263,6 +273,7 @@
20E2..20E4 ; Extend
20E5..20F0 ; Extend
2CEF..2CF1 ; Extend
2D7F ; Extend
2DE0..2DFF ; Extend
302A..302F ; Extend
3099..309A ; Extend
@ -322,6 +333,10 @@ FF9E..FF9F ; Extend
10A0C..10A0F ; Extend
10A38..10A3A ; Extend
10A3F ; Extend
11000 ; Extend
11001 ; Extend
11002 ; Extend
11038..11046 ; Extend
11080..11081 ; Extend
11082 ; Extend
110B0..110B2 ; Extend
@ -337,7 +352,7 @@ FF9E..FF9F ; Extend
1D242..1D244 ; Extend
E0100..E01EF ; Extend
# Total code points: 1455
# Total code points: 1502
# ================================================
@ -655,6 +670,7 @@ E0020..E007F ; Format
0521 ; Lower
0523 ; Lower
0525 ; Lower
0527 ; Lower
0561..0587 ; Lower
1D00..1D2B ; Lower
1D2C..1D61 ; Lower
@ -898,6 +914,7 @@ A659 ; Lower
A65B ; Lower
A65D ; Lower
A65F ; Lower
A661 ; Lower
A663 ; Lower
A665 ; Lower
A667 ; Lower
@ -964,6 +981,14 @@ A783 ; Lower
A785 ; Lower
A787 ; Lower
A78C ; Lower
A78E ; Lower
A791 ; Lower
A7A1 ; Lower
A7A3 ; Lower
A7A5 ; Lower
A7A7 ; Lower
A7A9 ; Lower
A7FA ; Lower
FB00..FB06 ; Lower
FB13..FB17 ; Lower
FF41..FF5A ; Lower
@ -997,7 +1022,7 @@ FF41..FF5A ; Lower
1D7C4..1D7C9 ; Lower
1D7CB ; Lower
# Total code points: 1907
# Total code points: 1917
# ================================================
@ -1266,6 +1291,7 @@ FF41..FF5A ; Lower
0520 ; Upper
0522 ; Upper
0524 ; Upper
0526 ; Upper
0531..0556 ; Upper
10A0..10C5 ; Upper
1E00 ; Upper
@ -1503,6 +1529,7 @@ A658 ; Upper
A65A ; Upper
A65C ; Upper
A65E ; Upper
A660 ; Upper
A662 ; Upper
A664 ; Upper
A666 ; Upper
@ -1567,6 +1594,13 @@ A782 ; Upper
A784 ; Upper
A786 ; Upper
A78B ; Upper
A78D ; Upper
A790 ; Upper
A7A0 ; Upper
A7A2 ; Upper
A7A4 ; Upper
A7A6 ; Upper
A7A8 ; Upper
FF21..FF3A ; Upper
10400..10427 ; Upper
1D400..1D419 ; Upper
@ -1601,7 +1635,7 @@ FF21..FF3A ; Upper
1D790..1D7A8 ; Upper
1D7CA ; Upper
# Total code points: 1500
# Total code points: 1509
# ================================================
@ -1617,7 +1651,7 @@ FF21..FF3A ; Upper
05D0..05EA ; OLetter
05F0..05F2 ; OLetter
05F3 ; OLetter
0621..063F ; OLetter
0620..063F ; OLetter
0640 ; OLetter
0641..064A ; OLetter
066E..066F ; OLetter
@ -1638,12 +1672,13 @@ FF21..FF3A ; Upper
081A ; OLetter
0824 ; OLetter
0828 ; OLetter
0840..0858 ; OLetter
0904..0939 ; OLetter
093D ; OLetter
0950 ; OLetter
0958..0961 ; OLetter
0971 ; OLetter
0972 ; OLetter
0972..0977 ; OLetter
0979..097F ; OLetter
0985..098C ; OLetter
098F..0990 ; OLetter
@ -1712,11 +1747,12 @@ FF21..FF3A ; Upper
0CBD ; OLetter
0CDE ; OLetter
0CE0..0CE1 ; OLetter
0CF1..0CF2 ; OLetter
0D05..0D0C ; OLetter
0D0E..0D10 ; OLetter
0D12..0D28 ; OLetter
0D2A..0D39 ; OLetter
0D12..0D3A ; OLetter
0D3D ; OLetter
0D4E ; OLetter
0D60..0D61 ; OLetter
0D7A..0D7F ; OLetter
0D85..0D96 ; OLetter
@ -1748,7 +1784,7 @@ FF21..FF3A ; Upper
0F00 ; OLetter
0F40..0F47 ; OLetter
0F49..0F6C ; OLetter
0F88..0F8B ; OLetter
0F88..0F8C ; OLetter
1000..102A ; OLetter
103F ; OLetter
1050..1055 ; OLetter
@ -1810,6 +1846,7 @@ FF21..FF3A ; Upper
1B45..1B4B ; OLetter
1B83..1BA0 ; OLetter
1BAE..1BAF ; OLetter
1BC0..1BE5 ; OLetter
1C00..1C23 ; OLetter
1C4D..1C4F ; OLetter
1C5A..1C77 ; OLetter
@ -1818,6 +1855,7 @@ FF21..FF3A ; Upper
1CEE..1CF1 ; OLetter
2071 ; OLetter
207F ; OLetter
2095..209C ; OLetter
2135..2138 ; OLetter
2180..2182 ; OLetter
2185..2188 ; OLetter
@ -1849,7 +1887,7 @@ FF21..FF3A ; Upper
30FF ; OLetter
3105..312D ; OLetter
3131..318E ; OLetter
31A0..31B7 ; OLetter
31A0..31BA ; OLetter
31F0..31FF ; OLetter
3400..4DB5 ; OLetter
4E00..9FCB ; OLetter
@ -1896,6 +1934,11 @@ AAC0 ; OLetter
AAC2 ; OLetter
AADB..AADC ; OLetter
AADD ; OLetter
AB01..AB06 ; OLetter
AB09..AB0E ; OLetter
AB11..AB16 ; OLetter
AB20..AB26 ; OLetter
AB28..AB2E ; OLetter
ABC0..ABE2 ; OLetter
AC00..D7A3 ; OLetter
D7B0..D7C6 ; OLetter
@ -1962,15 +2005,19 @@ FFDA..FFDC ; OLetter
10B40..10B55 ; OLetter
10B60..10B72 ; OLetter
10C00..10C48 ; OLetter
11003..11037 ; OLetter
11083..110AF ; OLetter
12000..1236E ; OLetter
12400..12462 ; OLetter
13000..1342E ; OLetter
16800..16A38 ; OLetter
1B000..1B001 ; OLetter
20000..2A6D6 ; OLetter
2A700..2B734 ; OLetter
2B740..2B81D ; OLetter
2F800..2FA1D ; OLetter
# Total code points: 96405
# Total code points: 97369
# ================================================
@ -1996,7 +2043,7 @@ FFDA..FFDC ; OLetter
17E0..17E9 ; Numeric
1810..1819 ; Numeric
1946..194F ; Numeric
19D0..19DA ; Numeric
19D0..19D9 ; Numeric
1A80..1A89 ; Numeric
1A90..1A99 ; Numeric
1B50..1B59 ; Numeric
@ -2010,9 +2057,10 @@ A9D0..A9D9 ; Numeric
AA50..AA59 ; Numeric
ABF0..ABF9 ; Numeric
104A0..104A9 ; Numeric
11066..1106F ; Numeric
1D7CE..1D7FF ; Numeric
# Total code points: 403
# Total code points: 412
# ================================================
@ -2039,9 +2087,11 @@ FF0E ; ATerm
1362 ; STerm
1367..1368 ; STerm
166E ; STerm
1735..1736 ; STerm
1803 ; STerm
1809 ; STerm
1944..1945 ; STerm
1AA8..1AAB ; STerm
1B5A..1B5B ; STerm
1B5E..1B5F ; STerm
1C3B..1C3C ; STerm
@ -2064,9 +2114,11 @@ FE56..FE57 ; STerm
FF01 ; STerm
FF1F ; STerm
FF61 ; STerm
10A56..10A57 ; STerm
11047..11048 ; STerm
110BE..110C1 ; STerm
# Total code points: 63
# Total code points: 73
# ================================================

View file

@ -1,8 +1,8 @@
# SpecialCasing-5.2.0.txt
# Date: 2009-09-22, 23:25:59 GMT [MD]
# SpecialCasing-6.0.0.txt
# Date: 2010-05-18, 00:49:39 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#

File diff suppressed because it is too large Load diff

View file

@ -1,8 +1,8 @@
# WordBreakProperty-5.2.0.txt
# Date: 2009-07-12, 04:17:35 GMT [MD]
# WordBreakProperty-6.0.0.txt
# Date: 2010-08-19, 00:48:48 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -47,10 +47,9 @@
05C4..05C5 ; Extend
05C7 ; Extend
0610..061A ; Extend
064B..065E ; Extend
064B..065F ; Extend
0670 ; Extend
06D6..06DC ; Extend
06DE ; Extend
06DF..06E4 ; Extend
06E7..06E8 ; Extend
06EA..06ED ; Extend
@ -62,15 +61,18 @@
081B..0823 ; Extend
0825..0827 ; Extend
0829..082D ; Extend
0859..085B ; Extend
0900..0902 ; Extend
0903 ; Extend
093A ; Extend
093B ; Extend
093C ; Extend
093E..0940 ; Extend
0941..0948 ; Extend
0949..094C ; Extend
094D ; Extend
094E ; Extend
0951..0955 ; Extend
094E..094F ; Extend
0951..0957 ; Extend
0962..0963 ; Extend
0981 ; Extend
0982..0983 ; Extend
@ -172,7 +174,7 @@
0F7F ; Extend
0F80..0F84 ; Extend
0F86..0F87 ; Extend
0F90..0F97 ; Extend
0F8D..0F97 ; Extend
0F99..0FBC ; Extend
0FC6 ; Extend
102B..102C ; Extend
@ -197,7 +199,7 @@
108F ; Extend
109A..109C ; Extend
109D ; Extend
135F ; Extend
135D..135F ; Extend
1712..1714 ; Extend
1732..1734 ; Extend
1752..1753 ; Extend
@ -253,6 +255,14 @@
1BA6..1BA7 ; Extend
1BA8..1BA9 ; Extend
1BAA ; Extend
1BE6 ; Extend
1BE7 ; Extend
1BE8..1BE9 ; Extend
1BEA..1BEC ; Extend
1BED ; Extend
1BEE ; Extend
1BEF..1BF1 ; Extend
1BF2..1BF3 ; Extend
1C24..1C2B ; Extend
1C2C..1C33 ; Extend
1C34..1C35 ; Extend
@ -264,7 +274,7 @@
1CED ; Extend
1CF2 ; Extend
1DC0..1DE6 ; Extend
1DFD..1DFF ; Extend
1DFC..1DFF ; Extend
200C..200D ; Extend
20D0..20DC ; Extend
20DD..20E0 ; Extend
@ -272,6 +282,7 @@
20E2..20E4 ; Extend
20E5..20F0 ; Extend
2CEF..2CF1 ; Extend
2D7F ; Extend
2DE0..2DFF ; Extend
302A..302F ; Extend
3099..309A ; Extend
@ -331,6 +342,10 @@ FF9E..FF9F ; Extend
10A0C..10A0F ; Extend
10A38..10A3A ; Extend
10A3F ; Extend
11000 ; Extend
11001 ; Extend
11002 ; Extend
11038..11046 ; Extend
11080..11081 ; Extend
11082 ; Extend
110B0..110B2 ; Extend
@ -346,7 +361,7 @@ FF9E..FF9F ; Extend
1D242..1D244 ; Extend
E0100..E01EF ; Extend
# Total code points: 1455
# Total code points: 1502
# ================================================
@ -382,8 +397,9 @@ E0020..E007F ; Format
FF66..FF6F ; Katakana
FF70 ; Katakana
FF71..FF9D ; Katakana
1B000 ; Katakana
# Total code points: 309
# Total code points: 310
# ================================================
@ -417,14 +433,14 @@ FF71..FF9D ; Katakana
038E..03A1 ; ALetter
03A3..03F5 ; ALetter
03F7..0481 ; ALetter
048A..0525 ; ALetter
048A..0527 ; ALetter
0531..0556 ; ALetter
0559 ; ALetter
0561..0587 ; ALetter
05D0..05EA ; ALetter
05F0..05F2 ; ALetter
05F3 ; ALetter
0621..063F ; ALetter
0620..063F ; ALetter
0640 ; ALetter
0641..064A ; ALetter
066E..066F ; ALetter
@ -445,12 +461,13 @@ FF71..FF9D ; Katakana
081A ; ALetter
0824 ; ALetter
0828 ; ALetter
0840..0858 ; ALetter
0904..0939 ; ALetter
093D ; ALetter
0950 ; ALetter
0958..0961 ; ALetter
0971 ; ALetter
0972 ; ALetter
0972..0977 ; ALetter
0979..097F ; ALetter
0985..098C ; ALetter
098F..0990 ; ALetter
@ -519,11 +536,12 @@ FF71..FF9D ; Katakana
0CBD ; ALetter
0CDE ; ALetter
0CE0..0CE1 ; ALetter
0CF1..0CF2 ; ALetter
0D05..0D0C ; ALetter
0D0E..0D10 ; ALetter
0D12..0D28 ; ALetter
0D2A..0D39 ; ALetter
0D12..0D3A ; ALetter
0D3D ; ALetter
0D4E ; ALetter
0D60..0D61 ; ALetter
0D7A..0D7F ; ALetter
0D85..0D96 ; ALetter
@ -534,7 +552,7 @@ FF71..FF9D ; Katakana
0F00 ; ALetter
0F40..0F47 ; ALetter
0F49..0F6C ; ALetter
0F88..0F8B ; ALetter
0F88..0F8C ; ALetter
10A0..10C5 ; ALetter
10D0..10FA ; ALetter
10FC ; ALetter
@ -579,6 +597,7 @@ FF71..FF9D ; Katakana
1B45..1B4B ; ALetter
1B83..1BA0 ; ALetter
1BAE..1BAF ; ALetter
1BC0..1BE5 ; ALetter
1C00..1C23 ; ALetter
1C4D..1C4F ; ALetter
1C5A..1C77 ; ALetter
@ -612,7 +631,7 @@ FF71..FF9D ; Katakana
1FF6..1FFC ; ALetter
2071 ; ALetter
207F ; ALetter
2090..2094 ; ALetter
2090..209C ; ALetter
2102 ; ALetter
2107 ; ALetter
210A..2113 ; ALetter
@ -656,7 +675,7 @@ FF71..FF9D ; Katakana
303C ; ALetter
3105..312D ; ALetter
3131..318E ; ALetter
31A0..31B7 ; ALetter
31A0..31BA ; ALetter
A000..A014 ; ALetter
A015 ; ALetter
A016..A48C ; ALetter
@ -666,8 +685,7 @@ A500..A60B ; ALetter
A60C ; ALetter
A610..A61F ; ALetter
A62A..A62B ; ALetter
A640..A65F ; ALetter
A662..A66D ; ALetter
A640..A66D ; ALetter
A66E ; ALetter
A67F ; ALetter
A680..A697 ; ALetter
@ -678,7 +696,10 @@ A722..A76F ; ALetter
A770 ; ALetter
A771..A787 ; ALetter
A788 ; ALetter
A78B..A78C ; ALetter
A78B..A78E ; ALetter
A790..A791 ; ALetter
A7A0..A7A9 ; ALetter
A7FA ; ALetter
A7FB..A801 ; ALetter
A803..A805 ; ALetter
A807..A80A ; ALetter
@ -695,6 +716,11 @@ A9CF ; ALetter
AA00..AA28 ; ALetter
AA40..AA42 ; ALetter
AA44..AA4B ; ALetter
AB01..AB06 ; ALetter
AB09..AB0E ; ALetter
AB11..AB16 ; ALetter
AB20..AB26 ; ALetter
AB28..AB2E ; ALetter
ABC0..ABE2 ; ALetter
AC00..D7A3 ; ALetter
D7B0..D7C6 ; ALetter
@ -760,10 +786,12 @@ FFDA..FFDC ; ALetter
10B40..10B55 ; ALetter
10B60..10B72 ; ALetter
10C00..10C48 ; ALetter
11003..11037 ; ALetter
11083..110AF ; ALetter
12000..1236E ; ALetter
12400..12462 ; ALetter
13000..1342E ; ALetter
16800..16A38 ; ALetter
1D400..1D454 ; ALetter
1D456..1D49C ; ALetter
1D49E..1D49F ; ALetter
@ -795,7 +823,7 @@ FFDA..FFDC ; ALetter
1D7AA..1D7C2 ; ALetter
1D7C4..1D7CB ; ALetter
# Total code points: 23694
# Total code points: 24453
# ================================================
@ -866,7 +894,7 @@ FF0E ; MidNumLet
17E0..17E9 ; Numeric
1810..1819 ; Numeric
1946..194F ; Numeric
19D0..19DA ; Numeric
19D0..19D9 ; Numeric
1A80..1A89 ; Numeric
1A90..1A99 ; Numeric
1B50..1B59 ; Numeric
@ -880,9 +908,10 @@ A9D0..A9D9 ; Numeric
AA50..AA59 ; Numeric
ABF0..ABF9 ; Numeric
104A0..104A9 ; Numeric
11066..1106F ; Numeric
1D7CE..1D7FF ; Numeric
# Total code points: 402
# Total code points: 411
# ================================================

View file

@ -13,6 +13,220 @@
---------------------------------------------------------------------------- ***
Unicode 6.0 update
*** related ICU Trac tickets
7264 Unicode 6.0 Update
*** Unicode version numbers
- makedata.mak
- uchar.h
(configure.in & configure: have been modified to extract the version from uchar.h)
- com.ibm.icu.util.VersionInfo
*** data files & enums & parser code
* file preparation
~/svn.icu/tools/trunk/src/unicode/c/genprops/misc$ ./ucdcopy.py ~/uni60/20100720/ucd ~/uni60/processed
- This now prepares both unidata and testdata files in respective output subfolders.
* PropertyAliases.txt changes
- new Script_Extensions property defined in the new ScriptExtensions.txt file
but not listed in PropertyAliases.txt; reported to unicode.org;
-> added to tools/trunk/src/unicode/c/genpname/SyntheticPropertyAliases.txt
scx; Script_Extensions
-> uchar.h with new UProperty section
-> com.ibm.icu.lang.UProperty, parallel with uchar.h
* PropertyValueAliases.txt changes
- 12 new block names:
Alchemical_Symbols
Bamum_Supplement
Batak
Brahmi
CJK_Unified_Ideographs_Extension_D
Emoticons
Ethiopic_Extended_A
Kana_Supplement
Mandaic
Miscellaneous_Symbols_And_Pictographs
Playing_Cards
Transport_And_Map_Symbols
-> add to uchar.h
-> add to UCharacter.UnicodeBlock
Eclipse find UBLOCK_([^ ]+) = [0-9]+, (/.+)
replace public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2
- Joining_Group (jg) values:
Teh_Marbuta_Goal becomes the new canonical value for the old Hamza_On_Heh_Goal which becomes an alias
-> uchar.h & UCharacter.JoiningGroup
- 3 new scripts:
sc ; Batk ; Batak
sc ; Brah ; Brahmi
sc ; Mand ; Mandaic
-> remove these from SyntheticPropertyValueAliases.txt
-> add alias USCRIPT_MANDAIC to USCRIPT_MANDAEAN
-> fix expectedLong names in cucdapi.c/TestUScriptCodeAPI()
and in com.ibm.icu.dev.test.lang.TestUScript.java
- 13 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
(added 2009-11-11..2010-07-18)
Bass 259 Bassa Vah
Dupl 755 Duployan shorthand
Elba 226 Elbasan
Gran 343 Grantha
Kpel 436 Kpelle
Loma 437 Loma
Mend 438 Mende
Merc 101 Meroitic Cursive
Narb 106 Old North Arabian
Nbat 159 Nabataean
Palm 126 Palmyrene
Sind 318 Sindhi
Wara 262 Warang Citi
-> uscript.h
-> com.ibm.icu.lang.UScript
find USCRIPT_([^ ]+) *= ([0-9]+),(.+)
replace public static final int \1 = \2;\3
-> SyntheticPropertyValueAliases.txt
-> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI()
and in com.ibm.icu.dev.test.lang.TestUScript.java
- ISO 15924 name change
Mero 100 Meroitic Hieroglyphs (was Meroitic)
-> add new alias USCRIPT_MEROITIC_HIEROGLYPHS to USCRIPT_MEROITIC
- property value alias added for Cham, was already moved out of SyntheticPropertyValueAliases.txt
* UnicodeData.txt changes
- new CJK block:
2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
-> add to tools/trunk/src/unicode/c/gennames/gennames.c, with new ucdVersion
* build Unicode tools using CMake+make
* run genpname/preparse.pl (on Linux)
+ cd ~/svn.icu/tools/trunk/src/unicode/c/genpname
+ make sure that data.h is writable
+ perl preparse.pl ~/svn.icu/trunk/src > out.txt
+ preparse.pl shows no errors, out.txt Info and Warning lines look ok
* rebuild Unicode tools (at least genpname) using make
- You might first need to "make install" ICU so that the tools build can pick
up the new definitions from the installed header files.
* run genpname
- ~/svn.icu/tools/trunk/bld/unicode$ c/genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in
- rebuild ICU & tools
* update source/data/unidata/norm2/nfkc_cf.txt
- follow the instructions in nfkc_cf.txt for updating it from DerivedNormalizationProps.txt
* update source/data/unidata/norm2/uts46.txt
- download http://www.unicode.org/Public/idna/6.0.0/IdnaMappingTable.txt
to ~/svn.icu/tools/trunk/src/unicode/py
- adjust idna2nrm.py to handle new disallowed_STD3_valid and disallowed_STD3_mapped values
- ~/svn.icu/tools/trunk/src/unicode/py$ ./idna2nrm.py
- ~/svn.icu/tools/trunk/src/unicode/py$ cp uts46.txt ~/svn.icu/trunk/src/source/data/unidata/norm2
* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to
sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar)
- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters
- Unicode 6.0: U+2260, U+226E, U+226F
* generate core properties data files
- ~/svn.icu/tools/trunk/src/unicode$ ./makeprops.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld
- rebuild ICU & tools
- run makeuca.sh so that genuca picks up the new nfc.nrm:
~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld
- rebuild ICU & tools
* implement new Script_Extensions property (provisional)
- parser & generator: genprops & uprops.icu
- uscript.h, uprops.h, uchar.c, uniset_props.cpp and others, plus cintltst/cucdapi.c & intltest/usettest.cpp
- UScript.java, UCharacterProperty.java, UnicodeSet.java, TestUScript.java, UnicodeSetTest.java
* switch ubidi.icu, ucase.icu and uprops.icu from UTrie to UTrie2
- (one-time change)
- genbidi/gencase/genprops tools changes
- re-run makeprops.sh (see above)
- UCharacterProperty.java, UCharacterTypeIterator.java,
UBiDiProps.java, UCaseProps.java, and several others with minor changes;
UCharacterPropertyReader.java deleted and its code folded into UCharacterProperty.java
* update Java data files
- refresh just the UCD-related files, just to be safe
- see (ICU4C)/source/data/icu4j-readme.txt
- mkdir /tmp/icu4j
- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install
output:
...
Unicode .icu files built to ./out/build/icudt45l
mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt45b
echo ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt
LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt45l.dat ./out/icu4j/icudt45b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt45l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt45b
jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt45b
mkdir -p /tmp/icu4j/main/shared/data
cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data
- copy the big-endian Unicode data files to another location,
separate from the other data files
mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll
mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/brkitr
~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt45b
~/svn.icu/trunk/bld/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/cnvalias.icu
~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt45b
~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll
~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/brkitr
- refresh ICU4J
~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt45b
* refresh Java test .txt files
- copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode
* un-hardcode normalization skippable (NF*_Inert) test data
- removes one manual step from the Unicode upgrade, and removes dependency on one of Mark's tools
* copy updated break iterator test files
- now handled by early ucdcopy.py and
copying the uni60/processed/testdata files to ~/svn.icu/trunk/src/source/test/testdata
(old instructions:
copy from (Unicode 6.0)/ucd/auxiliary/*BreakTest-6....txt
to ~/svn.icu/trunk/src/source/test/testdata)
- they are not used in ICU4J
* UCA
- get output from Mark's tools; look in
http://www.unicode.org/~book/incoming/mark/uca6.0.0/
http://www.macchiato.com/unicode/utc/additional-uca-files
http://www.unicode.org/Public/UCA/6.0.0/
http://www.unicode.org/~mdavis/uca/
- update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt
- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt
- run makeuca.sh:
~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld
- rebuild ICU4C
- refresh ICU4J collation data:
(subset of instructions above for properties data refresh, except copies all coll/*)
~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install
mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll
~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll
~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt45b
- update (ICU)/source/test/testdata/CollationTest_*.txt
and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt
with output from Mark's Unicode tools
- run all tests with the *_SHORT.txt or the full files (the full ones have comments)
- note on intltest: if collate/UCAConformanceTest fails, then
utility/MultithreadTest/TestCollators will fail as well;
fix the conformance test before looking into the multi-thread test
* When refreshing all of ICU4J data from ICU4C
- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install
- cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data
or
- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install
---------------------------------------------------------------------------- ***
Unicode 5.2 update
*** related ICU Trac tickets

View file

@ -3,7 +3,7 @@
#
# file name: nfc.txt
#
# machine-generated on: 2009-11-30
# machine-generated on: 2010-07-23
#
# Canonical_Combining_Class (ccc) values
@ -88,6 +88,7 @@
0657..065B:230
065C:220
065D..065E:230
065F:220
0670:35
06D6..06DC:230
06DF..06E2:230
@ -124,6 +125,7 @@
081B..0823:230
0825..0827:230
0829..082D:230
0859..085B:220
093C:7
094D:9
0951:230
@ -166,7 +168,7 @@
1037:7
1039..103A:9
108D:220
135F:230
135D..135F:230
1714:9
1734:9
17D2:9
@ -186,6 +188,8 @@
1B6C:220
1B6D..1B73:230
1BAA:9
1BE6:7
1BF2..1BF3:9
1C37:7
1CD0..1CD2:230
1CD4:1
@ -205,6 +209,7 @@
1DCF:220
1DD0:202
1DD1..1DE6:230
1DFC:233
1DFD:220
1DFE:230
1DFF:220
@ -222,6 +227,7 @@
20EC..20EF:220
20F0:230
2CEF..2CF1:230
2D7F:9
2DE0..2DFF:230
302A:218
302B:228
@ -255,6 +261,7 @@ FE20..FE26:230
10A39:1
10A3A:220
10A3F:9
11046:9
110B9:9
110BA:7
1D165..1D166:216

View file

@ -3,7 +3,7 @@
#
# file name: nfkc.txt
#
# machine-generated on: 2009-11-30
# machine-generated on: 2010-07-23
#
# Canonical_Combining_Class (ccc) values
@ -88,6 +88,7 @@
0657..065B:230
065C:220
065D..065E:230
065F:220
0670:35
06D6..06DC:230
06DF..06E2:230
@ -124,6 +125,7 @@
081B..0823:230
0825..0827:230
0829..082D:230
0859..085B:220
093C:7
094D:9
0951:230
@ -166,7 +168,7 @@
1037:7
1039..103A:9
108D:220
135F:230
135D..135F:230
1714:9
1734:9
17D2:9
@ -186,6 +188,8 @@
1B6C:220
1B6D..1B73:230
1BAA:9
1BE6:7
1BF2..1BF3:9
1C37:7
1CD0..1CD2:230
1CD4:1
@ -205,6 +209,7 @@
1DCF:220
1DD0:202
1DD1..1DE6:230
1DFC:233
1DFD:220
1DFE:230
1DFF:220
@ -222,6 +227,7 @@
20EC..20EF:220
20F0:230
2CEF..2CF1:230
2D7F:9
2DE0..2DFF:230
302A:218
302B:228
@ -255,6 +261,7 @@ FE20..FE26:230
10A39:1
10A3A:220
10A3F:9
11046:9
110B9:9
110BA:7
1D165..1D166:216
@ -1400,6 +1407,14 @@ FE20..FE26:230
2092>006F
2093>0078
2094>0259
2095>0068
2096>006B
2097>006C
2098>006D
2099>006E
209A>0070
209B>0073
209C>0074
20A8>0052 0073
2100>0061 002F 0063
2101>0061 002F 0073
@ -5187,18 +5202,42 @@ FFEE>25CB
1F12C>0052
1F12D>0043 0044
1F12E>0057 005A
1F130>0041
1F131>0042
1F132>0043
1F133>0044
1F134>0045
1F135>0046
1F136>0047
1F137>0048
1F138>0049
1F139>004A
1F13A>004B
1F13B>004C
1F13C>004D
1F13D>004E
1F13E>004F
1F13F>0050
1F140>0051
1F141>0052
1F142>0053
1F143>0054
1F144>0055
1F145>0056
1F146>0057
1F147>0058
1F148>0059
1F149>005A
1F14A>0048 0056
1F14B>004D 0056
1F14C>0053 0044
1F14D>0053 0053
1F14E>0050 0050 0056
1F14F>0057 0043
1F190>0044 004A
1F200>307B 304B
1F201>30B3 30B3
1F202>30B5
1F210>624B
1F211>5B57
1F212>53CC
@ -5233,6 +5272,15 @@ FFEE>25CB
1F22F>6307
1F230>8D70
1F231>6253
1F232>7981
1F233>7A7A
1F234>5408
1F235>6E80
1F236>6709
1F237>6708
1F238>7533
1F239>5272
1F23A>55B6
1F240>3014 672C 3015
1F241>3014 4E09 3015
1F242>3014 4E8C 3015
@ -5242,6 +5290,8 @@ FFEE>25CB
1F246>3014 76D7 3015
1F247>3014 52DD 3015
1F248>3014 6557 3015
1F250>5F97
1F251>53EF
2F800>4E3D
2F801>4E38
2F802>4E41

View file

@ -1,9 +1,9 @@
# Extracted from:
# DerivedNormalizationProps-5.2.0.txt
# Date: 2009-08-26, 18:18:50 GMT [MD]
# DerivedNormalizationProps-6.0.0.txt
# Date: 2010-05-20, 15:14:12 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@ -23,7 +23,7 @@
# WARNING: Application to STRINGS must apply NFC after mapping each character, because characters may interact.
# For more information, see [http://www.unicode.org/reports/tr44/]
# Omitted code points are unchanged by this mapping.
# @missing: 0000..10FFFF><code point>
# @missing: 0000..10FFFF; NFKC_CF; <code point>
# All code points not explicitly listed for NFKC_Casefold
# have the value <codepoint>.
@ -511,6 +511,7 @@
0520>0521
0522>0523
0524>0525
0526>0527
0531>0561
0532>0562
0533>0563
@ -1077,6 +1078,14 @@
2092>006F
2093>0078
2094>0259
2095>0068
2096>006B
2097>006C
2098>006D
2099>006E
209A>0070
209B>0073
209C>0074
20A8>0072 0073
2100>0061 002F 0063
2101>0061 002F 0073
@ -2292,6 +2301,7 @@ A658>A659
A65A>A65B
A65C>A65D
A65E>A65F
A660>A661
A662>A663
A664>A665
A666>A667
@ -2358,6 +2368,13 @@ A782>A783
A784>A785
A786>A787
A78B>A78C
A78D>0265
A790>A791
A7A0>A7A1
A7A2>A7A3
A7A4>A7A5
A7A6>A7A7
A7A8>A7A9
F900>8C48
F901>66F4
F902>8ECA
@ -4778,18 +4795,42 @@ FFF0..FFF8>
1F12C>0072
1F12D>0063 0064
1F12E>0077 007A
1F130>0061
1F131>0062
1F132>0063
1F133>0064
1F134>0065
1F135>0066
1F136>0067
1F137>0068
1F138>0069
1F139>006A
1F13A>006B
1F13B>006C
1F13C>006D
1F13D>006E
1F13E>006F
1F13F>0070
1F140>0071
1F141>0072
1F142>0073
1F143>0074
1F144>0075
1F145>0076
1F146>0077
1F147>0078
1F148>0079
1F149>007A
1F14A>0068 0076
1F14B>006D 0076
1F14C>0073 0064
1F14D>0073 0073
1F14E>0070 0070 0076
1F14F>0077 0063
1F190>0064 006A
1F200>307B 304B
1F201>30B3 30B3
1F202>30B5
1F210>624B
1F211>5B57
1F212>53CC
@ -4824,6 +4865,15 @@ FFF0..FFF8>
1F22F>6307
1F230>8D70
1F231>6253
1F232>7981
1F233>7A7A
1F234>5408
1F235>6E80
1F236>6709
1F237>6708
1F238>7533
1F239>5272
1F23A>55B6
1F240>3014 672C 3015
1F241>3014 4E09 3015
1F242>3014 4E8C 3015
@ -4833,6 +4883,8 @@ FFF0..FFF8>
1F246>3014 76D7 3015
1F247>3014 52DD 3015
1F248>3014 6557 3015
1F250>5F97
1F251>53EF
2F800>4E3D
2F801>4E38
2F802>4E41
@ -5373,4 +5425,4 @@ E0080..E00FF>
E0100..E01EF>
E01F0..E0FFF>
# Total code points: 9740
# Total code points: 9792

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2008, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -88,12 +88,9 @@ UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(CaseMapTransliterator)
*/
CaseMapTransliterator::CaseMapTransliterator(const UnicodeString &id, UCaseMapFull *map) :
Transliterator(id, 0),
fCsp(NULL),
fCsp(ucase_getSingleton()),
fMap(map)
{
UErrorCode errorCode = U_ZERO_ERROR;
fCsp = ucase_getSingleton(&errorCode); // expect to get NULL if failure
// TODO test incremental mode with context-sensitive text (e.g. greek sigma)
// TODO need to call setMaximumContextLength()?!
}

View file

@ -3903,8 +3903,8 @@ GC_Done:
// Test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
const UCaseProps *csp = ucase_getSingleton(&status);
if (U_SUCCESS(status)) {
const UCaseProps *csp = ucase_getSingleton();
{
int32_t stringStartIdx, stringLen;
stringStartIdx = opValue;
@ -5580,8 +5580,8 @@ GC_Done:
// Test input against a literal string.
// Strings require two slots in the compiled pattern, one for the
// offset to the string text, and one for the length.
const UCaseProps *csp = ucase_getSingleton(&status);
if (U_SUCCESS(status)) {
const UCaseProps *csp = ucase_getSingleton();
{
int32_t stringStartIdx, stringLen;
stringStartIdx = opValue;

View file

@ -29,6 +29,7 @@
#include "capitst.h"
#include "ccolltst.h"
#include "putilimp.h"
#include "cmemory.h"
#include "cstring.h"
static void TestAttribute(void);
@ -279,7 +280,7 @@ void TestProperty()
{
UCollator *col, *ruled;
UChar *disName;
int32_t len = 0, i = 0;
int32_t len = 0;
UChar *source, *target;
int32_t tempLength;
UErrorCode status = U_ZERO_ERROR;
@ -293,10 +294,10 @@ void TestProperty()
* needs to be adjusted.
* Same in intltest/apicoll.cpp.
*/
UVersionInfo currVersionArray = {0x31, 0xC0, 0x05, 0x2A};
UVersionInfo currUCAVersionArray = {5, 2, 0, 0};
UVersionInfo currVersionArray = {0x31, 0xC0, 0x05, 0x2A}; /* from ICU 4.4/UCA 5.2 */
UVersionInfo versionArray = {0, 0, 0, 0};
UVersionInfo versionUCAArray = {0, 0, 0, 0};
UVersionInfo versionUCDArray = {0, 0, 0, 0};
log_verbose("The property tests begin : \n");
log_verbose("Test ucol_strcoll : \n");
@ -307,21 +308,23 @@ void TestProperty()
}
ucol_getVersion(col, versionArray);
for (i=0; i<4; ++i) {
if (versionArray[i] != currVersionArray[i]) {
log_err("Testing ucol_getVersion() - unexpected result: %hu.%hu.%hu.%hu\n",
versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
break;
}
/* Check for a version greater than some value rather than equality
* so that we need not update the expected version each time. */
if (uprv_memcmp(versionArray, currVersionArray, 4)<0) {
log_err("Testing ucol_getVersion() - unexpected result: %02x.%02x.%02x.%02x\n",
versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
} else {
log_verbose("ucol_getVersion() result: %02x.%02x.%02x.%02x\n",
versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
}
/* Assume that the UCD and UCA versions are the same,
* rather than hardcoding (and updating each time) a particular UCA version. */
u_getUnicodeVersion(versionUCDArray);
ucol_getUCAVersion(col, versionUCAArray);
for (i=0; i<4; ++i) {
if (versionUCAArray[i] != currUCAVersionArray[i]) {
log_err("Testing ucol_getUCAVersion() - unexpected result: %hu.%hu.%hu.%hu\n",
versionUCAArray[0], versionUCAArray[1], versionUCAArray[2], versionUCAArray[3]);
break;
}
if (0!=uprv_memcmp(versionUCAArray, versionUCDArray, 4)) {
log_err("Testing ucol_getUCAVersion() - unexpected result: %hu.%hu.%hu.%hu\n",
versionUCAArray[0], versionUCAArray[1], versionUCAArray[2], versionUCAArray[3]);
}
source=(UChar*)malloc(sizeof(UChar) * 12);

View file

@ -1,5 +1,5 @@
/********************************************************************
* Copyright (c) 1997-2009, International Business Machines
* Copyright (c) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************/
@ -293,7 +293,7 @@ void TestUScriptCodeAPI(){
}
}
}
{
/*
* These script codes were originally added to ICU pre-3.6, so that ICU would
@ -304,9 +304,9 @@ void TestUScriptCodeAPI(){
* Whenever this happens, the long script names here need to be updated.
*/
static const char* expectedLong[] = {
"Balinese", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyptian_Hieroglyphs",
"Balinese", "Batak", "Blis", "Brahmi", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyptian_Hieroglyphs",
"Geok", "Hans", "Hant", "Hmng", "Hung", "Inds", "Javanese", "Kayah_Li", "Latf", "Latg",
"Lepcha", "Lina", "Mand", "Maya", "Mero", "Nko", "Old_Turkic", "Perm", "Phags_Pa", "Phoenician",
"Lepcha", "Lina", "Mandaic", "Maya", "Mero", "Nko", "Old_Turkic", "Perm", "Phags_Pa", "Phoenician",
"Plrd", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vai", "Visp", "Cuneiform",
"Zxxx", "Unknown",
"Carian", "Jpan", "Tai_Tham", "Lycian", "Lydian", "Ol_Chiki", "Rejang", "Saurashtra", "Sgnw", "Sundanese",
@ -317,6 +317,9 @@ void TestUScriptCodeAPI(){
"Zmth", "Zsym",
/* new in ICU 4.4 */
"Bamum", "Lisu", "Nkgb", "Old_South_Arabian",
/* new in ICU 4.6 */
"Bass", "Dupl", "Elba", "Gran", "Kpel", "Loma", "Mend", "Merc",
"Narb", "Nbat", "Palm", "Sind", "Wara",
};
static const char* expectedShort[] = {
"Bali", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp",
@ -332,6 +335,9 @@ void TestUScriptCodeAPI(){
"Zmth", "Zsym",
/* new in ICU 4.4 */
"Bamu", "Lisu", "Nkgb", "Sarb",
/* new in ICU 4.6 */
"Bass", "Dupl", "Elba", "Gran", "Kpel", "Loma", "Mend", "Merc",
"Narb", "Nbat", "Palm", "Sind", "Wara",
};
int32_t j = 0;
if(LENGTHOF(expectedLong)!=(USCRIPT_CODE_LIMIT-USCRIPT_BALINESE)) {
@ -364,6 +370,123 @@ void TestUScriptCodeAPI(){
}
}
}
{
/* test characters which have Script_Extensions */
UErrorCode errorCode=U_ZERO_ERROR;
if(!(
USCRIPT_COMMON==uscript_getScript(0x0640, &errorCode) &&
USCRIPT_INHERITED==uscript_getScript(0x0650, &errorCode) &&
USCRIPT_ARABIC==uscript_getScript(0xfdf2, &errorCode)) ||
U_FAILURE(errorCode)
) {
log_err("uscript_getScript(character with Script_Extensions) failed\n");
}
}
}
void TestHasScript() {
    /*
     * Each check below reports an error as soon as one uscript_hasScript()
     * answer differs from the expectation for that code point.
     * The queries are made in the order Common/Inherited, Arabic, Syriac, Thaana.
     */

    /* U+063F: expected to have Arabic (its main Script value) and nothing else. */
    if( uscript_hasScript(0x063f, USCRIPT_COMMON) ||
        !uscript_hasScript(0x063f, USCRIPT_ARABIC) ||
        uscript_hasScript(0x063f, USCRIPT_SYRIAC) ||
        uscript_hasScript(0x063f, USCRIPT_THAANA)
    ) {
        log_err("uscript_hasScript(U+063F, ...) is wrong\n");
    }

    /* U+0640: expected Common (main Script value), Arabic and Syriac, but not Thaana. */
    if( !uscript_hasScript(0x0640, USCRIPT_COMMON) ||
        !uscript_hasScript(0x0640, USCRIPT_ARABIC) ||
        !uscript_hasScript(0x0640, USCRIPT_SYRIAC) ||
        uscript_hasScript(0x0640, USCRIPT_THAANA)
    ) {
        log_err("uscript_hasScript(U+0640, ...) is wrong\n");
    }

    /* U+0650: expected Inherited (main Script value), Arabic and Syriac, but not Thaana. */
    if( !uscript_hasScript(0x0650, USCRIPT_INHERITED) ||
        !uscript_hasScript(0x0650, USCRIPT_ARABIC) ||
        !uscript_hasScript(0x0650, USCRIPT_SYRIAC) ||
        uscript_hasScript(0x0650, USCRIPT_THAANA)
    ) {
        log_err("uscript_hasScript(U+0650, ...) is wrong\n");
    }

    /* U+0660: expected Common (main Script value), Arabic and Thaana, but not Syriac. */
    if( !uscript_hasScript(0x0660, USCRIPT_COMMON) ||
        !uscript_hasScript(0x0660, USCRIPT_ARABIC) ||
        uscript_hasScript(0x0660, USCRIPT_SYRIAC) ||
        !uscript_hasScript(0x0660, USCRIPT_THAANA)
    ) {
        log_err("uscript_hasScript(U+0660, ...) is wrong\n");
    }

    /* U+FDF2: expected Arabic (main Script value) and Thaana, but not Common or Syriac. */
    if( uscript_hasScript(0xfdf2, USCRIPT_COMMON) ||
        !uscript_hasScript(0xfdf2, USCRIPT_ARABIC) ||
        uscript_hasScript(0xfdf2, USCRIPT_SYRIAC) ||
        !uscript_hasScript(0xfdf2, USCRIPT_THAANA)
    ) {
        log_err("uscript_hasScript(U+FDF2, ...) is wrong\n");
    }
}
/*
 * Exercises uscript_getScriptExtensions():
 * - a pre-existing failure code must be preserved,
 * - a NULL destination or a negative capacity must set U_ILLEGAL_ARGUMENT_ERROR,
 * - a too-small capacity must set U_BUFFER_OVERFLOW_ERROR and still return the full length,
 * - normal calls must return the expected Script_Extensions values.
 *
 * errorCode is reset before every call so that each check is independent:
 * a failure left over from one call would otherwise make the following calls
 * report misleading errors based on stale length/scripts contents.
 */
void TestGetScriptExtensions() {
    UScriptCode scripts[20];
    int32_t length;
    UErrorCode errorCode;

    /* errors and overflows */

    /* An incoming failure code must be left untouched. */
    errorCode=U_PARSE_ERROR;
    length=uscript_getScriptExtensions(0x0640, scripts, LENGTHOF(scripts), &errorCode);
    if(errorCode!=U_PARSE_ERROR) {
        log_err("uscript_getScriptExtensions(U_PARSE_ERROR) did not preserve the UErrorCode - %s\n",
                u_errorName(errorCode));
    }

    /* NULL destination with a non-zero capacity. */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0x0640, NULL, LENGTHOF(scripts), &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
        log_err("uscript_getScriptExtensions(NULL) did not set U_ILLEGAL_ARGUMENT_ERROR - %s\n",
                u_errorName(errorCode));
    }

    /* Negative capacity. */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0x0640, scripts, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
        log_err("uscript_getScriptExtensions(capacity<0) did not set U_ILLEGAL_ARGUMENT_ERROR - %s\n",
                u_errorName(errorCode));
    }

    /* Preflighting: capacity 0 must report the required length (2 for U+0640). */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0x0640, scripts, 0, &errorCode);
    if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=2) {
        log_err("uscript_getScriptExtensions(capacity=0: pure preflighting)=%d != 2 - %s\n",
                (int)length, u_errorName(errorCode));
    }

    /* Preflighting with a capacity that is too small by one. */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0x0640, scripts, 1, &errorCode);
    if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=2) {
        log_err("uscript_getScriptExtensions(capacity=1: preflighting)=%d != 2 - %s\n",
                (int)length, u_errorName(errorCode));
    }

    /* normal usage */

    /* U+063F: expected to return length 0 without error. */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0x063f, scripts, 0, &errorCode);
    if(U_FAILURE(errorCode) || length!=0) {
        log_err("uscript_getScriptExtensions(U+063F, capacity=0)=%d != 0 - %s\n",
                (int)length, u_errorName(errorCode));
    }

    /* U+0640: expected { Arabic, Syriac }. */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0x0640, scripts, LENGTHOF(scripts), &errorCode);
    if(U_FAILURE(errorCode) || length!=2 || scripts[0]!=USCRIPT_ARABIC || scripts[1]!=USCRIPT_SYRIAC) {
        log_err("uscript_getScriptExtensions(U+0640)=%d failed - %s\n",
                (int)length, u_errorName(errorCode));
    }

    /* U+FDF2: expected { Arabic, Thaana }. */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0xfdf2, scripts, LENGTHOF(scripts), &errorCode);
    if(U_FAILURE(errorCode) || length!=2 || scripts[0]!=USCRIPT_ARABIC || scripts[1]!=USCRIPT_THAANA) {
        log_err("uscript_getScriptExtensions(U+FDF2)=%d failed - %s\n",
                (int)length, u_errorName(errorCode));
    }

    /* U+FF65: expected six scripts, the first Bopomofo and the last Yi. */
    errorCode=U_ZERO_ERROR;
    length=uscript_getScriptExtensions(0xff65, scripts, LENGTHOF(scripts), &errorCode);
    if(U_FAILURE(errorCode) || length!=6 || scripts[0]!=USCRIPT_BOPOMOFO || scripts[5]!=USCRIPT_YI) {
        log_err("uscript_getScriptExtensions(U+FF65)=%d failed - %s\n",
                (int)length, u_errorName(errorCode));
    }
}
void TestBinaryValues() {

View file

@ -1,8 +1,10 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2003-2008, International Business Machines Corporation and
* Copyright (c) 2003-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
void TestUScriptCodeAPI(void);
/* Checks uscript_hasScript() against expected script membership for selected code points. */
void TestHasScript(void);
/* Checks uscript_getScriptExtensions(): error handling, preflighting, and returned scripts. */
void TestGetScriptExtensions(void);
void TestBinaryValues(void);

View file

@ -182,6 +182,8 @@ void addUnicodeTest(TestNode** root)
addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
@ -2344,7 +2346,6 @@ TestAdditionalProperties() {
{ 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
{ 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
{ 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
{ 0x06C3, UCHAR_JOINING_GROUP, U_JG_HAMZA_ON_HEH_GOAL },
{ 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
{ 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
@ -2477,6 +2478,11 @@ TestAdditionalProperties() {
{ 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU },
{ 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
{ -1, 0x600, 0 }, /* version break for Unicode 6.0 */
/* value changed in Unicode 6.0 */
{ 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }
@ -2919,7 +2925,7 @@ TestConsistency() {
*
* Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
* but not from Hyphen.
* UTC 94 (2003mar) decided to leave it that way and to changed UCD.html.
* UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
* Therefore, do not show errors when testing the Hyphen property.
*/
log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"

View file

@ -104,9 +104,8 @@ CollationAPITest::TestProperty(/* char* par */)
* needs to be adjusted.
* Same in cintltst/capitst.c.
*/
UVersionInfo currVersionArray = {0x31, 0xC0, 0x05, 0x2A};
UVersionInfo currVersionArray = {0x31, 0xC0, 0x05, 0x2A}; // from ICU 4.4/UCA 5.2
UVersionInfo versionArray;
int i = 0;
logln("The property tests begin : ");
logln("Test ctors : ");
@ -124,12 +123,14 @@ CollationAPITest::TestProperty(/* char* par */)
delete kwEnum;
col->getVersion(versionArray);
for (i=0; i<4; ++i) {
if (versionArray[i] != currVersionArray[i]) {
errln("Testing Collator::getVersion() - unexpected result: %02x.%02x.%02x.%02x",
// Check for a version greater than some value rather than equality
// so that we need not update the expected version each time.
if (uprv_memcmp(versionArray, currVersionArray, 4)<0) {
errln("Testing Collator::getVersion() - unexpected result: %02x.%02x.%02x.%02x",
versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
} else {
logln("Collator::getVersion() result: %02x.%02x.%02x.%02x",
versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
break;
}
}
doAssert((col->compare("ab", "abc") == Collator::LESS), "ab < abc comparison failed");

View file

@ -1965,6 +1965,10 @@ void RBBITest::TestTailoredBreaks() {
UErrorCode status = U_ZERO_ERROR;
switch (tbItemPtr->type) {
case UBRK_CHARACTER:
// TODO(andy): Match Thai grapheme break behavior to Unicode 6.0 and remove this time bomb.
{ UVersionInfo icu453 = { 4, 5, 3, 0 };
if (!isICUVersionAtLeast(icu453)) continue;
}
tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
break;
@ -2201,6 +2205,10 @@ void RBBITest::TestUnicodeFiles() {
//-------------------------------------------------------------------------------------------
void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
// TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
UVersionInfo icu453 = { 4, 5, 3, 0 };
UBool isICUVersionAtLeast453 = isICUVersionAtLeast(icu453);
UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
UErrorCode status = U_ZERO_ERROR;
//
@ -2294,7 +2302,10 @@ void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *
// If the line from the file contained test data, run the test now.
//
if (testString.length() > 0) {
// TODO(andy): Remove this time bomb code.
if (!isLineBreak || isICUVersionAtLeast453 || !(4658 <= lineNumber && lineNumber <= 4758)) {
checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
}
}
// Clear out this test case.
@ -4589,8 +4600,8 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
breakPos = bi->preceding(i);
if (breakPos >= i ||
breakPos > lastBreakPos ||
breakPos < 0 && testText.getChar32Start(i)>0 ||
breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i) ) {
(breakPos < 0 && testText.getChar32Start(i)>0) ||
(breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
errln("%s break monkey test: "
"Out of range value returned by BreakIterator::preceding().\n"
"index=%d; prev returned %d; lastBreak=%d" ,

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2009, International Business Machines
* Copyright (C) 2000-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -962,8 +962,8 @@ void RTTest::logRoundTripFailure(const UnicodeString& from,
A bug has been filed to remind us to do this: #1979.
*/
static const char KATAKANA[] = "[[[:katakana:][\\u30A1-\\u30FA\\u30FC]]-[\\u30FF\\u31F0-\\u31FF]]";
static const char HIRAGANA[] = "[[[:hiragana:][\\u3040-\\u3094]]-[\\u3095-\\u3096\\u309F-\\u30A0\\U0001F200-\\U0001F2FF]]";
static const char KATAKANA[] = "[[[:katakana:][\\u30A1-\\u30FA\\u30FC]]-[\\u30FF\\u31F0-\\u31FF]-[:^age=5.2:]]";
static const char HIRAGANA[] = "[[[:hiragana:][\\u3040-\\u3094]]-[\\u3095-\\u3096\\u309F-\\u30A0\\U0001F200-\\U0001F2FF]-[:^age=5.2:]]";
static const char LENGTH[] = "[\\u30FC]";
static const char HALFWIDTH_KATAKANA[] = "[\\uFF65-\\uFF9D]";
static const char KATAKANA_ITERATION[] = "[\\u30FD\\u30FE]";

View file

@ -1280,441 +1280,64 @@ BasicNormalizerTest::FindFoldFCDExceptions() {
}
}
/*
* Hardcoded "NF* Skippable" sets, generated from
* Mark Davis' com.ibm.text.UCD.NFSkippable (see ICU4J CVS, module unicodetools).
* Run com.ibm.text.UCD.Main with the option NFSkippable.
*
* Must be updated for each Unicode version.
*/
static void
initExpectedSkippables(UnicodeSet skipSets[UNORM_MODE_COUNT]) {
UErrorCode errorCode=U_ZERO_ERROR;
initExpectedSkippables(UnicodeSet skipSets[UNORM_MODE_COUNT], UErrorCode &errorCode) {
skipSets[UNORM_NFD].applyPattern(
UNICODE_STRING_SIMPLE("[[:NFD_QC=Yes:]&[:ccc=0:]]"), errorCode);
skipSets[UNORM_NFC].applyPattern(
UNICODE_STRING_SIMPLE("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode);
skipSets[UNORM_NFKD].applyPattern(
UNICODE_STRING_SIMPLE("[[:NFKD_QC=Yes:]&[:ccc=0:]]"), errorCode);
skipSets[UNORM_NFKC].applyPattern(
UNICODE_STRING_SIMPLE("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]"), errorCode);
skipSets[UNORM_NFD].applyPattern(UnicodeString(
"[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
"\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD"
"\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137"
"\\u0139-\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165"
"\\u0168-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-\\u01DC"
"\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B"
"\\u021E\\u021F\\u0226-\\u0233\\u0300-\\u034E\\u0350-\\u036F"
"\\u0374\\u037E\\u0385-\\u038A\\u038C\\u038E-\\u0390\\u03AA-"
"\\u03B0\\u03CA-\\u03CE\\u03D3\\u03D4\\u0400\\u0401\\u0403\\u0407"
"\\u040C-\\u040E\\u0419\\u0439\\u0450\\u0451\\u0453\\u0457\\u045C"
"-\\u045E\\u0476\\u0477\\u0483-\\u0487\\u04C1\\u04C2\\u04D0-"
"\\u04D3\\u04D6\\u04D7\\u04DA-\\u04DF\\u04E2-\\u04E7\\u04EA-"
"\\u04F5\\u04F8\\u04F9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4"
"\\u05C5\\u05C7\\u0610-\\u061A\\u0622-\\u0626\\u064B-\\u065E"
"\\u0670\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4"
"\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
"\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"
"\\u082D\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958"
"-\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33"
"\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C"
"\\u0B48\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD"
"\\u0C48\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA"
"\\u0CCB\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE"
"\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
"\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
"\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
"\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
"\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u135F\\u1714\\u1734"
"\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75"
"-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12\\u1B34"
"\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73\\u1BAA"
"\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8\\u1CED"
"\\u1DC0-\\u1DE6\\u1DFD-\\u1E99\\u1E9B\\u1EA0-\\u1EF9\\u1F00-"
"\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-"
"\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4"
"\\u1FB6-\\u1FBC\\u1FBE\\u1FC1-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-"
"\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFD\\u2000"
"\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"
"\\u212B\\u219A\\u219B\\u21AE\\u21CD-\\u21CF\\u2204\\u2209\\u220C"
"\\u2224\\u2226\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"
"\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"
"\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"
"\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-\\u2DFF\\u302A-"
"\\u302F\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"
"\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"
"\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"
"\\u3099\\u309A\\u309E\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4\\u30B6"
"\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7\\u30C9"
"\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA\\u30DC"
"\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\uA66F\\uA67C\\uA67D\\uA6F0"
"\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D\\uA953"
"\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF"
"\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D\\uFA10\\uFA12"
"\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D"
"\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-\\uFB36"
"\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-"
"\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"
"38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"
"\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
"D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
"\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"
"F800-\\U0002FA1D]"
, ""), errorCode);
// Remove from the NFC and NFKC sets all those characters that change
// when a back-combining character is added.
// First, get all of the back-combining characters and their combining classes.
UnicodeSet combineBack("[:NFC_QC=Maybe:]", errorCode);
int32_t numCombineBack=combineBack.size();
int32_t *combineBackCharsAndCc=new int32_t[numCombineBack*2];
UnicodeSetIterator iter(combineBack);
for(int32_t i=0; i<numCombineBack; ++i) {
iter.next();
UChar32 c=iter.getCodepoint();
combineBackCharsAndCc[2*i]=c;
combineBackCharsAndCc[2*i+1]=u_getCombiningClass(c);
}
skipSets[UNORM_NFC].applyPattern(UnicodeString(
"[^<->A-PR-Za-pr-z\\u00A8\\u00C0-\\u00CF\\u00D1-\\u00D6\\u00D8-"
"\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD\\u00FF-"
"\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121\\u0124"
"\\u0125\\u0128-\\u012D\\u0130\\u0139\\u013A\\u013D\\u013E\\u0143"
"\\u0144\\u0147\\u0148\\u014C-\\u0151\\u0154\\u0155\\u0158-"
"\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168-\\u0171\\u0174-"
"\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7\\u01CD-\\u01DC\\u01DE"
"-\\u01E1\\u01E6-\\u01EB\\u01F4\\u01F5\\u01F8-\\u01FB\\u0200-"
"\\u021B\\u021E\\u021F\\u0226-\\u0233\\u0292\\u0300-\\u034E"
"\\u0350-\\u036F\\u0374\\u037E\\u0387\\u0391\\u0395\\u0397\\u0399"
"\\u039F\\u03A1\\u03A5\\u03A9\\u03AC\\u03AE\\u03B1\\u03B5\\u03B7"
"\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-\\u03CB\\u03CE\\u03D2\\u0406"
"\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423\\u0427\\u042B"
"\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E\\u0443\\u0447"
"\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487\\u04D8\\u04D9"
"\\u04E8\\u04E9\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5"
"\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627\\u0648\\u064A-"
"\\u065E\\u0670\\u06C1\\u06D2\\u06D5-\\u06DC\\u06DF-\\u06E4"
"\\u06E7\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-"
"\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-"
"\\u082D\\u0928\\u0930\\u0933\\u093C\\u094D\\u0951-\\u0954\\u0958"
"-\\u095F\\u09BC\\u09BE\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF"
"\\u0A33\\u0A36\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD"
"\\u0B3C\\u0B3E\\u0B47\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92"
"\\u0BBE\\u0BC6\\u0BC7\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56"
"\\u0CBC\\u0CBF\\u0CC2\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E"
"\\u0D46\\u0D47\\u0D4D\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF"
"\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB8\\u0EB9\\u0EC8-\\u0ECB"
"\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D\\u0F52\\u0F57"
"\\u0F5C\\u0F69\\u0F71-\\u0F76\\u0F78\\u0F7A-\\u0F7D\\u0F80-"
"\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
"\\u0FC6\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u1100-\\u1112"
"\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
"\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"
"\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"
"\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"
"\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
"\\u1CED\\u1DC0-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F\\u1E12-"
"\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53\\u1E58-"
"\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E99\\u1EA0-"
"\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19\\u1F20-"
"\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50\\u1F51"
"\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79\\u1F7B-"
"\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98\\u1F99"
"\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB\\u1FBC\\u1FBE"
"\\u1FBF\\u1FC3\\u1FC6\\u1FC9\\u1FCB\\u1FCC\\u1FD3\\u1FDB\\u1FE3"
"\\u1FEB\\u1FEE\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB-\\u1FFE\\u2000"
"\\u2001\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2126\\u212A"
"\\u212B\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"
"\\u220B\\u2223\\u2225\\u223C\\u2243\\u2245\\u2248\\u224D\\u2261"
"\\u2264\\u2265\\u2272\\u2273\\u2276\\u2277\\u227A-\\u227D\\u2282"
"\\u2283\\u2286\\u2287\\u2291\\u2292\\u22A2\\u22A8\\u22A9\\u22AB"
"\\u22B2-\\u22B5\\u2329\\u232A\\u2ADC\\u2CEF-\\u2CF1\\u2DE0-"
"\\u2DFF\\u302A-\\u302F\\u3046\\u304B\\u304D\\u304F\\u3051\\u3053"
"\\u3055\\u3057\\u3059\\u305B\\u305D\\u305F\\u3061\\u3064\\u3066"
"\\u3068\\u306F\\u3072\\u3075\\u3078\\u307B\\u3099\\u309A\\u309D"
"\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1\\u30B3\\u30B5\\u30B7\\u30B9"
"\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4\\u30C6\\u30C8\\u30CF\\u30D2"
"\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2\\u30FD\\uA66F\\uA67C\\uA67D"
"\\uA6F0\\uA6F1\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-\\uA92D"
"\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8\\uAABE"
"\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C\\uAC38\\uAC54\\uAC70\\uAC8C"
"\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18\\uAD34\\uAD50\\uAD6C\\uAD88"
"\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14\\uAE30\\uAE4C\\uAE68\\uAE84"
"\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10\\uAF2C\\uAF48\\uAF64\\uAF80"
"\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C\\uB028\\uB044\\uB060\\uB07C"
"\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108\\uB124\\uB140\\uB15C\\uB178"
"\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204\\uB220\\uB23C\\uB258\\uB274"
"\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300\\uB31C\\uB338\\uB354\\uB370"
"\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC\\uB418\\uB434\\uB450\\uB46C"
"\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8\\uB514\\uB530\\uB54C\\uB568"
"\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4\\uB610\\uB62C\\uB648\\uB664"
"\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0\\uB70C\\uB728\\uB744\\uB760"
"\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC\\uB808\\uB824\\uB840\\uB85C"
"\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8\\uB904\\uB920\\uB93C\\uB958"
"\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4\\uBA00\\uBA1C\\uBA38\\uBA54"
"\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0\\uBAFC\\uBB18\\uBB34\\uBB50"
"\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC\\uBBF8\\uBC14\\uBC30\\uBC4C"
"\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8\\uBCF4\\uBD10\\uBD2C\\uBD48"
"\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4\\uBDF0\\uBE0C\\uBE28\\uBE44"
"\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0\\uBEEC\\uBF08\\uBF24\\uBF40"
"\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC\\uBFE8\\uC004\\uC020\\uC03C"
"\\uC058\\uC074\\uC090\\uC0AC\\uC0C8\\uC0E4\\uC100\\uC11C\\uC138"
"\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4\\uC1E0\\uC1FC\\uC218\\uC234"
"\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0\\uC2DC\\uC2F8\\uC314\\uC330"
"\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC\\uC3D8\\uC3F4\\uC410\\uC42C"
"\\uC448\\uC464\\uC480\\uC49C\\uC4B8\\uC4D4\\uC4F0\\uC50C\\uC528"
"\\uC544\\uC560\\uC57C\\uC598\\uC5B4\\uC5D0\\uC5EC\\uC608\\uC624"
"\\uC640\\uC65C\\uC678\\uC694\\uC6B0\\uC6CC\\uC6E8\\uC704\\uC720"
"\\uC73C\\uC758\\uC774\\uC790\\uC7AC\\uC7C8\\uC7E4\\uC800\\uC81C"
"\\uC838\\uC854\\uC870\\uC88C\\uC8A8\\uC8C4\\uC8E0\\uC8FC\\uC918"
"\\uC934\\uC950\\uC96C\\uC988\\uC9A4\\uC9C0\\uC9DC\\uC9F8\\uCA14"
"\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0\\uCABC\\uCAD8\\uCAF4\\uCB10"
"\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C\\uCBB8\\uCBD4\\uCBF0\\uCC0C"
"\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98\\uCCB4\\uCCD0\\uCCEC\\uCD08"
"\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94\\uCDB0\\uCDCC\\uCDE8\\uCE04"
"\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90\\uCEAC\\uCEC8\\uCEE4\\uCF00"
"\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C\\uCFA8\\uCFC4\\uCFE0\\uCFFC"
"\\uD018\\uD034\\uD050\\uD06C\\uD088\\uD0A4\\uD0C0\\uD0DC\\uD0F8"
"\\uD114\\uD130\\uD14C\\uD168\\uD184\\uD1A0\\uD1BC\\uD1D8\\uD1F4"
"\\uD210\\uD22C\\uD248\\uD264\\uD280\\uD29C\\uD2B8\\uD2D4\\uD2F0"
"\\uD30C\\uD328\\uD344\\uD360\\uD37C\\uD398\\uD3B4\\uD3D0\\uD3EC"
"\\uD408\\uD424\\uD440\\uD45C\\uD478\\uD494\\uD4B0\\uD4CC\\uD4E8"
"\\uD504\\uD520\\uD53C\\uD558\\uD574\\uD590\\uD5AC\\uD5C8\\uD5E4"
"\\uD600\\uD61C\\uD638\\uD654\\uD670\\uD68C\\uD6A8\\uD6C4\\uD6E0"
"\\uD6FC\\uD718\\uD734\\uD750\\uD76C\\uD788\\uF900-\\uFA0D\\uFA10"
"\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A-"
"\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB1D-\\uFB1F\\uFB2A-"
"\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
"-\\uFB4E\\uFE20-\\uFE26\\U000101FD\\U00010A0D\\U00010A0F\\U00010"
"A38-\\U00010A3A\\U00010A3F\\U00011099\\U0001109B\\U000110A5"
"\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
"D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
"\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0002"
"F800-\\U0002FA1D]"
, ""), errorCode);
// We need not look at control codes, Han characters nor Hangul LVT syllables because they
// do not combine forward. LV syllables are already removed.
UnicodeSet notInteresting("[[:C:][:Unified_Ideograph:][:HST=LVT:]]", errorCode);
LocalPointer<UnicodeSet> unsure(&((UnicodeSet *)(skipSets[UNORM_NFC].clone()))->removeAll(notInteresting));
// System.out.format("unsure.size()=%d\n", unsure.size());
skipSets[UNORM_NFKD].applyPattern(UnicodeString(
"[^\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5\\u00B8-\\u00BA"
"\\u00BC-\\u00BE\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6"
"\\u00D9-\\u00DD\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6"
"\\u00F9-\\u00FD\\u00FF-\\u010F\\u0112-\\u0125\\u0128-\\u0130"
"\\u0132-\\u0137\\u0139-\\u0140\\u0143-\\u0149\\u014C-\\u0151"
"\\u0154-\\u0165\\u0168-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0"
"\\u01C4-\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B"
"\\u021E\\u021F\\u0226-\\u0233\\u02B0-\\u02B8\\u02D8-\\u02DD"
"\\u02E0-\\u02E4\\u0300-\\u034E\\u0350-\\u036F\\u0374\\u037A"
"\\u037E\\u0384-\\u038A\\u038C\\u038E-\\u0390\\u03AA-\\u03B0"
"\\u03CA-\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
"\\u03F9\\u0400\\u0401\\u0403\\u0407\\u040C-\\u040E\\u0419\\u0439"
"\\u0450\\u0451\\u0453\\u0457\\u045C-\\u045E\\u0476\\u0477\\u0483"
"-\\u0487\\u04C1\\u04C2\\u04D0-\\u04D3\\u04D6\\u04D7\\u04DA-"
"\\u04DF\\u04E2-\\u04E7\\u04EA-\\u04F5\\u04F8\\u04F9\\u0587"
"\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610"
"-\\u061A\\u0622-\\u0626\\u064B-\\u065E\\u0670\\u0675-\\u0678"
"\\u06C0\\u06C2\\u06D3\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7"
"\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07EB-\\u07F3"
"\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-\\u082D"
"\\u0929\\u0931\\u0934\\u093C\\u094D\\u0951-\\u0954\\u0958-"
"\\u095F\\u09BC\\u09CB-\\u09CD\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36"
"\\u0A3C\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B48"
"\\u0B4B-\\u0B4D\\u0B5C\\u0B5D\\u0B94\\u0BCA-\\u0BCD\\u0C48"
"\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CC0\\u0CC7\\u0CC8\\u0CCA\\u0CCB"
"\\u0CCD\\u0D4A-\\u0D4D\\u0DCA\\u0DDA\\u0DDC-\\u0DDE\\u0E33"
"\\u0E38-\\u0E3A\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-"
"\\u0ECB\\u0EDC\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39"
"\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80"
"-\\u0F84\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9"
"\\u0FC6\\u1026\\u1037\\u1039\\u103A\\u108D\\u10FC\\u135F\\u1714"
"\\u1734\\u17D2\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60"
"\\u1A75-\\u1A7C\\u1A7F\\u1B06\\u1B08\\u1B0A\\u1B0C\\u1B0E\\u1B12"
"\\u1B34\\u1B3B\\u1B3D\\u1B40\\u1B41\\u1B43\\u1B44\\u1B6B-\\u1B73"
"\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
"\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"
"\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E9B\\u1EA0-\\u1EF9"
"\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
"\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-"
"\\u1FB4\\u1FB6-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-"
"\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFE\\u2000-\\u200A\\u2011"
"\\u2017\\u2024-\\u2026\\u202F\\u2033\\u2034\\u2036\\u2037\\u203C"
"\\u203E\\u2047-\\u2049\\u2057\\u205F\\u2070\\u2071\\u2074-"
"\\u208E\\u2090-\\u2094\\u20A8\\u20D0-\\u20DC\\u20E1\\u20E5-"
"\\u20F0\\u2100-\\u2103\\u2105-\\u2107\\u2109-\\u2113\\u2115"
"\\u2116\\u2119-\\u211D\\u2120-\\u2122\\u2124\\u2126\\u2128"
"\\u212A-\\u212D\\u212F-\\u2131\\u2133-\\u2139\\u213B-\\u2140"
"\\u2145-\\u2149\\u2150-\\u217F\\u2189\\u219A\\u219B\\u21AE"
"\\u21CD-\\u21CF\\u2204\\u2209\\u220C\\u2224\\u2226\\u222C\\u222D"
"\\u222F\\u2230\\u2241\\u2244\\u2247\\u2249\\u2260\\u2262\\u226D-"
"\\u2271\\u2274\\u2275\\u2278\\u2279\\u2280\\u2281\\u2284\\u2285"
"\\u2288\\u2289\\u22AC-\\u22AF\\u22E0-\\u22E3\\u22EA-\\u22ED"
"\\u2329\\u232A\\u2460-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC"
"\\u2C7C\\u2C7D\\u2CEF-\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F"
"\\u2EF3\\u2F00-\\u2FD5\\u3000\\u302A-\\u302F\\u3036\\u3038-"
"\\u303A\\u304C\\u304E\\u3050\\u3052\\u3054\\u3056\\u3058\\u305A"
"\\u305C\\u305E\\u3060\\u3062\\u3065\\u3067\\u3069\\u3070\\u3071"
"\\u3073\\u3074\\u3076\\u3077\\u3079\\u307A\\u307C\\u307D\\u3094"
"\\u3099-\\u309C\\u309E\\u309F\\u30AC\\u30AE\\u30B0\\u30B2\\u30B4"
"\\u30B6\\u30B8\\u30BA\\u30BC\\u30BE\\u30C0\\u30C2\\u30C5\\u30C7"
"\\u30C9\\u30D0\\u30D1\\u30D3\\u30D4\\u30D6\\u30D7\\u30D9\\u30DA"
"\\u30DC\\u30DD\\u30F4\\u30F7-\\u30FA\\u30FE\\u30FF\\u3131-"
"\\u318E\\u3192-\\u319F\\u3200-\\u321E\\u3220-\\u3247\\u3250-"
"\\u327E\\u3280-\\u32FE\\u3300-\\u33FF\\uA66F\\uA67C\\uA67D"
"\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-\\uA8F1\\uA92B-"
"\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-\\uAAB4\\uAAB7\\uAAB8"
"\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00-\\uD7A3\\uF900-\\uFA0D"
"\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A"
"-\\uFA2D\\uFA30-\\uFA6D\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-"
"\\uFB17\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41"
"\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F"
"\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-\\uFE19\\uFE20-\\uFE26"
"\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B"
"\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC\\uFF01-\\uFFBE\\uFFC2-"
"\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC\\uFFE0-"
"\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010A0D\\U00010A0F\\U00010A"
"38-\\U00010A3A\\U00010A3F\\U0001109A\\U0001109C\\U000110AB"
"\\U000110B9\\U000110BA\\U0001D15E-\\U0001D169\\U0001D16D-\\U0001"
"D172\\U0001D17B-\\U0001D182\\U0001D185-\\U0001D18B\\U0001D1AA-"
"\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001D242-\\U0001D244\\U0001"
"D400-\\U0001D454\\U0001D456-\\U0001D49C\\U0001D49E\\U0001D49F"
"\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4A9-\\U0001D4AC\\U0001D"
"4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-\\U0001D4C3\\U0001D4C5-"
"\\U0001D505\\U0001D507-\\U0001D50A\\U0001D50D-\\U0001D514\\U0001"
"D516-\\U0001D51C\\U0001D51E-\\U0001D539\\U0001D53B-\\U0001D53E"
"\\U0001D540-\\U0001D544\\U0001D546\\U0001D54A-\\U0001D550\\U0001"
"D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB\\U0001D7CE-\\U0001D7FF"
"\\U0001F100-\\U0001F10A\\U0001F110-\\U0001F12E\\U0001F131\\U0001"
"F13D\\U0001F13F\\U0001F142\\U0001F146\\U0001F14A-\\U0001F14E"
"\\U0001F190\\U0001F200\\U0001F210-\\U0001F231\\U0001F240-\\U0001"
"F248\\U0002F800-\\U0002FA1D]"
, ""), errorCode);
skipSets[UNORM_NFKC].applyPattern(UnicodeString(
"[^<->A-PR-Za-pr-z\\u00A0\\u00A8\\u00AA\\u00AF\\u00B2-\\u00B5"
"\\u00B8-\\u00BA\\u00BC-\\u00BE\\u00C0-\\u00CF\\u00D1-\\u00D6"
"\\u00D8-\\u00DD\\u00E0-\\u00EF\\u00F1-\\u00F6\\u00F8-\\u00FD"
"\\u00FF-\\u0103\\u0106-\\u010F\\u0112-\\u0117\\u011A-\\u0121"
"\\u0124\\u0125\\u0128-\\u012D\\u0130\\u0132\\u0133\\u0139\\u013A"
"\\u013D-\\u0140\\u0143\\u0144\\u0147-\\u0149\\u014C-\\u0151"
"\\u0154\\u0155\\u0158-\\u015D\\u0160\\u0161\\u0164\\u0165\\u0168"
"-\\u0171\\u0174-\\u017F\\u01A0\\u01A1\\u01AF\\u01B0\\u01B7"
"\\u01C4-\\u01DC\\u01DE-\\u01E1\\u01E6-\\u01EB\\u01F1-\\u01F5"
"\\u01F8-\\u01FB\\u0200-\\u021B\\u021E\\u021F\\u0226-\\u0233"
"\\u0292\\u02B0-\\u02B8\\u02D8-\\u02DD\\u02E0-\\u02E4\\u0300-"
"\\u034E\\u0350-\\u036F\\u0374\\u037A\\u037E\\u0384\\u0385\\u0387"
"\\u0391\\u0395\\u0397\\u0399\\u039F\\u03A1\\u03A5\\u03A9\\u03AC"
"\\u03AE\\u03B1\\u03B5\\u03B7\\u03B9\\u03BF\\u03C1\\u03C5\\u03C9-"
"\\u03CB\\u03CE\\u03D0-\\u03D6\\u03F0-\\u03F2\\u03F4\\u03F5"
"\\u03F9\\u0406\\u0410\\u0413\\u0415-\\u0418\\u041A\\u041E\\u0423"
"\\u0427\\u042B\\u042D\\u0430\\u0433\\u0435-\\u0438\\u043A\\u043E"
"\\u0443\\u0447\\u044B\\u044D\\u0456\\u0474\\u0475\\u0483-\\u0487"
"\\u04D8\\u04D9\\u04E8\\u04E9\\u0587\\u0591-\\u05BD\\u05BF\\u05C1"
"\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u0622\\u0623\\u0627"
"\\u0648\\u064A-\\u065E\\u0670\\u0675-\\u0678\\u06C1\\u06D2"
"\\u06D5-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED"
"\\u0711\\u0730-\\u074A\\u07EB-\\u07F3\\u0816-\\u0819\\u081B-"
"\\u0823\\u0825-\\u0827\\u0829-\\u082D\\u0928\\u0930\\u0933"
"\\u093C\\u094D\\u0951-\\u0954\\u0958-\\u095F\\u09BC\\u09BE"
"\\u09C7\\u09CD\\u09D7\\u09DC\\u09DD\\u09DF\\u0A33\\u0A36\\u0A3C"
"\\u0A4D\\u0A59-\\u0A5B\\u0A5E\\u0ABC\\u0ACD\\u0B3C\\u0B3E\\u0B47"
"\\u0B4D\\u0B56\\u0B57\\u0B5C\\u0B5D\\u0B92\\u0BBE\\u0BC6\\u0BC7"
"\\u0BCD\\u0BD7\\u0C46\\u0C4D\\u0C55\\u0C56\\u0CBC\\u0CBF\\u0CC2"
"\\u0CC6\\u0CCA\\u0CCD\\u0CD5\\u0CD6\\u0D3E\\u0D46\\u0D47\\u0D4D"
"\\u0D57\\u0DCA\\u0DCF\\u0DD9\\u0DDC\\u0DDF\\u0E33\\u0E38-\\u0E3A"
"\\u0E48-\\u0E4B\\u0EB3\\u0EB8\\u0EB9\\u0EC8-\\u0ECB\\u0EDC"
"\\u0EDD\\u0F0C\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F43\\u0F4D"
"\\u0F52\\u0F57\\u0F5C\\u0F69\\u0F71-\\u0F7D\\u0F80-\\u0F84"
"\\u0F86\\u0F87\\u0F93\\u0F9D\\u0FA2\\u0FA7\\u0FAC\\u0FB9\\u0FC6"
"\\u1025\\u102E\\u1037\\u1039\\u103A\\u108D\\u10FC\\u1100-\\u1112"
"\\u1161-\\u1175\\u11A8-\\u11C2\\u135F\\u1714\\u1734\\u17D2"
"\\u17DD\\u18A9\\u1939-\\u193B\\u1A17\\u1A18\\u1A60\\u1A75-"
"\\u1A7C\\u1A7F\\u1B05\\u1B07\\u1B09\\u1B0B\\u1B0D\\u1B11\\u1B34"
"\\u1B35\\u1B3A\\u1B3C\\u1B3E\\u1B3F\\u1B42\\u1B44\\u1B6B-\\u1B73"
"\\u1BAA\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE2-\\u1CE8"
"\\u1CED\\u1D2C-\\u1D2E\\u1D30-\\u1D3A\\u1D3C-\\u1D4D\\u1D4F-"
"\\u1D6A\\u1D78\\u1D9B-\\u1DE6\\u1DFD-\\u1E03\\u1E0A-\\u1E0F"
"\\u1E12-\\u1E1B\\u1E20-\\u1E27\\u1E2A-\\u1E41\\u1E44-\\u1E53"
"\\u1E58-\\u1E7D\\u1E80-\\u1E87\\u1E8E-\\u1E91\\u1E96-\\u1E9B"
"\\u1EA0-\\u1EF3\\u1EF6-\\u1EF9\\u1F00-\\u1F11\\u1F18\\u1F19"
"\\u1F20-\\u1F31\\u1F38\\u1F39\\u1F40\\u1F41\\u1F48\\u1F49\\u1F50"
"\\u1F51\\u1F59\\u1F60-\\u1F71\\u1F73-\\u1F75\\u1F77\\u1F79"
"\\u1F7B-\\u1F7D\\u1F80\\u1F81\\u1F88\\u1F89\\u1F90\\u1F91\\u1F98"
"\\u1F99\\u1FA0\\u1FA1\\u1FA8\\u1FA9\\u1FB3\\u1FB6\\u1FBB-\\u1FC1"
"\\u1FC3\\u1FC6\\u1FC9\\u1FCB-\\u1FCF\\u1FD3\\u1FDB\\u1FDD-"
"\\u1FDF\\u1FE3\\u1FEB\\u1FED-\\u1FEF\\u1FF3\\u1FF6\\u1FF9\\u1FFB"
"-\\u1FFE\\u2000-\\u200A\\u2011\\u2017\\u2024-\\u2026\\u202F"
"\\u2033\\u2034\\u2036\\u2037\\u203C\\u203E\\u2047-\\u2049\\u2057"
"\\u205F\\u2070\\u2071\\u2074-\\u208E\\u2090-\\u2094\\u20A8"
"\\u20D0-\\u20DC\\u20E1\\u20E5-\\u20F0\\u2100-\\u2103\\u2105-"
"\\u2107\\u2109-\\u2113\\u2115\\u2116\\u2119-\\u211D\\u2120-"
"\\u2122\\u2124\\u2126\\u2128\\u212A-\\u212D\\u212F-\\u2131"
"\\u2133-\\u2139\\u213B-\\u2140\\u2145-\\u2149\\u2150-\\u217F"
"\\u2189\\u2190\\u2192\\u2194\\u21D0\\u21D2\\u21D4\\u2203\\u2208"
"\\u220B\\u2223\\u2225\\u222C\\u222D\\u222F\\u2230\\u223C\\u2243"
"\\u2245\\u2248\\u224D\\u2261\\u2264\\u2265\\u2272\\u2273\\u2276"
"\\u2277\\u227A-\\u227D\\u2282\\u2283\\u2286\\u2287\\u2291\\u2292"
"\\u22A2\\u22A8\\u22A9\\u22AB\\u22B2-\\u22B5\\u2329\\u232A\\u2460"
"-\\u24EA\\u2A0C\\u2A74-\\u2A76\\u2ADC\\u2C7C\\u2C7D\\u2CEF-"
"\\u2CF1\\u2D6F\\u2DE0-\\u2DFF\\u2E9F\\u2EF3\\u2F00-\\u2FD5"
"\\u3000\\u302A-\\u302F\\u3036\\u3038-\\u303A\\u3046\\u304B"
"\\u304D\\u304F\\u3051\\u3053\\u3055\\u3057\\u3059\\u305B\\u305D"
"\\u305F\\u3061\\u3064\\u3066\\u3068\\u306F\\u3072\\u3075\\u3078"
"\\u307B\\u3099-\\u309D\\u309F\\u30A6\\u30AB\\u30AD\\u30AF\\u30B1"
"\\u30B3\\u30B5\\u30B7\\u30B9\\u30BB\\u30BD\\u30BF\\u30C1\\u30C4"
"\\u30C6\\u30C8\\u30CF\\u30D2\\u30D5\\u30D8\\u30DB\\u30EF-\\u30F2"
"\\u30FD\\u30FF\\u3131-\\u318E\\u3192-\\u319F\\u3200-\\u321E"
"\\u3220-\\u3247\\u3250-\\u327E\\u3280-\\u32FE\\u3300-\\u33FF"
"\\uA66F\\uA67C\\uA67D\\uA6F0\\uA6F1\\uA770\\uA806\\uA8C4\\uA8E0-"
"\\uA8F1\\uA92B-\\uA92D\\uA953\\uA9B3\\uA9C0\\uAAB0\\uAAB2-"
"\\uAAB4\\uAAB7\\uAAB8\\uAABE\\uAABF\\uAAC1\\uABED\\uAC00\\uAC1C"
"\\uAC38\\uAC54\\uAC70\\uAC8C\\uACA8\\uACC4\\uACE0\\uACFC\\uAD18"
"\\uAD34\\uAD50\\uAD6C\\uAD88\\uADA4\\uADC0\\uADDC\\uADF8\\uAE14"
"\\uAE30\\uAE4C\\uAE68\\uAE84\\uAEA0\\uAEBC\\uAED8\\uAEF4\\uAF10"
"\\uAF2C\\uAF48\\uAF64\\uAF80\\uAF9C\\uAFB8\\uAFD4\\uAFF0\\uB00C"
"\\uB028\\uB044\\uB060\\uB07C\\uB098\\uB0B4\\uB0D0\\uB0EC\\uB108"
"\\uB124\\uB140\\uB15C\\uB178\\uB194\\uB1B0\\uB1CC\\uB1E8\\uB204"
"\\uB220\\uB23C\\uB258\\uB274\\uB290\\uB2AC\\uB2C8\\uB2E4\\uB300"
"\\uB31C\\uB338\\uB354\\uB370\\uB38C\\uB3A8\\uB3C4\\uB3E0\\uB3FC"
"\\uB418\\uB434\\uB450\\uB46C\\uB488\\uB4A4\\uB4C0\\uB4DC\\uB4F8"
"\\uB514\\uB530\\uB54C\\uB568\\uB584\\uB5A0\\uB5BC\\uB5D8\\uB5F4"
"\\uB610\\uB62C\\uB648\\uB664\\uB680\\uB69C\\uB6B8\\uB6D4\\uB6F0"
"\\uB70C\\uB728\\uB744\\uB760\\uB77C\\uB798\\uB7B4\\uB7D0\\uB7EC"
"\\uB808\\uB824\\uB840\\uB85C\\uB878\\uB894\\uB8B0\\uB8CC\\uB8E8"
"\\uB904\\uB920\\uB93C\\uB958\\uB974\\uB990\\uB9AC\\uB9C8\\uB9E4"
"\\uBA00\\uBA1C\\uBA38\\uBA54\\uBA70\\uBA8C\\uBAA8\\uBAC4\\uBAE0"
"\\uBAFC\\uBB18\\uBB34\\uBB50\\uBB6C\\uBB88\\uBBA4\\uBBC0\\uBBDC"
"\\uBBF8\\uBC14\\uBC30\\uBC4C\\uBC68\\uBC84\\uBCA0\\uBCBC\\uBCD8"
"\\uBCF4\\uBD10\\uBD2C\\uBD48\\uBD64\\uBD80\\uBD9C\\uBDB8\\uBDD4"
"\\uBDF0\\uBE0C\\uBE28\\uBE44\\uBE60\\uBE7C\\uBE98\\uBEB4\\uBED0"
"\\uBEEC\\uBF08\\uBF24\\uBF40\\uBF5C\\uBF78\\uBF94\\uBFB0\\uBFCC"
"\\uBFE8\\uC004\\uC020\\uC03C\\uC058\\uC074\\uC090\\uC0AC\\uC0C8"
"\\uC0E4\\uC100\\uC11C\\uC138\\uC154\\uC170\\uC18C\\uC1A8\\uC1C4"
"\\uC1E0\\uC1FC\\uC218\\uC234\\uC250\\uC26C\\uC288\\uC2A4\\uC2C0"
"\\uC2DC\\uC2F8\\uC314\\uC330\\uC34C\\uC368\\uC384\\uC3A0\\uC3BC"
"\\uC3D8\\uC3F4\\uC410\\uC42C\\uC448\\uC464\\uC480\\uC49C\\uC4B8"
"\\uC4D4\\uC4F0\\uC50C\\uC528\\uC544\\uC560\\uC57C\\uC598\\uC5B4"
"\\uC5D0\\uC5EC\\uC608\\uC624\\uC640\\uC65C\\uC678\\uC694\\uC6B0"
"\\uC6CC\\uC6E8\\uC704\\uC720\\uC73C\\uC758\\uC774\\uC790\\uC7AC"
"\\uC7C8\\uC7E4\\uC800\\uC81C\\uC838\\uC854\\uC870\\uC88C\\uC8A8"
"\\uC8C4\\uC8E0\\uC8FC\\uC918\\uC934\\uC950\\uC96C\\uC988\\uC9A4"
"\\uC9C0\\uC9DC\\uC9F8\\uCA14\\uCA30\\uCA4C\\uCA68\\uCA84\\uCAA0"
"\\uCABC\\uCAD8\\uCAF4\\uCB10\\uCB2C\\uCB48\\uCB64\\uCB80\\uCB9C"
"\\uCBB8\\uCBD4\\uCBF0\\uCC0C\\uCC28\\uCC44\\uCC60\\uCC7C\\uCC98"
"\\uCCB4\\uCCD0\\uCCEC\\uCD08\\uCD24\\uCD40\\uCD5C\\uCD78\\uCD94"
"\\uCDB0\\uCDCC\\uCDE8\\uCE04\\uCE20\\uCE3C\\uCE58\\uCE74\\uCE90"
"\\uCEAC\\uCEC8\\uCEE4\\uCF00\\uCF1C\\uCF38\\uCF54\\uCF70\\uCF8C"
"\\uCFA8\\uCFC4\\uCFE0\\uCFFC\\uD018\\uD034\\uD050\\uD06C\\uD088"
"\\uD0A4\\uD0C0\\uD0DC\\uD0F8\\uD114\\uD130\\uD14C\\uD168\\uD184"
"\\uD1A0\\uD1BC\\uD1D8\\uD1F4\\uD210\\uD22C\\uD248\\uD264\\uD280"
"\\uD29C\\uD2B8\\uD2D4\\uD2F0\\uD30C\\uD328\\uD344\\uD360\\uD37C"
"\\uD398\\uD3B4\\uD3D0\\uD3EC\\uD408\\uD424\\uD440\\uD45C\\uD478"
"\\uD494\\uD4B0\\uD4CC\\uD4E8\\uD504\\uD520\\uD53C\\uD558\\uD574"
"\\uD590\\uD5AC\\uD5C8\\uD5E4\\uD600\\uD61C\\uD638\\uD654\\uD670"
"\\uD68C\\uD6A8\\uD6C4\\uD6E0\\uD6FC\\uD718\\uD734\\uD750\\uD76C"
"\\uD788\\uF900-\\uFA0D\\uFA10\\uFA12\\uFA15-\\uFA1E\\uFA20"
"\\uFA22\\uFA25\\uFA26\\uFA2A-\\uFA2D\\uFA30-\\uFA6D\\uFA70-"
"\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB36\\uFB38-"
"\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFBB1\\uFBD3"
"-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFC\\uFE10-"
"\\uFE19\\uFE20-\\uFE26\\uFE30-\\uFE44\\uFE47-\\uFE52\\uFE54-"
"\\uFE66\\uFE68-\\uFE6B\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFEFC"
"\\uFF01-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7"
"\\uFFDA-\\uFFDC\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\U000101FD\\U00010"
"A0D\\U00010A0F\\U00010A38-\\U00010A3A\\U00010A3F\\U00011099"
"\\U0001109B\\U000110A5\\U000110B9\\U000110BA\\U0001D15E-\\U0001D"
"169\\U0001D16D-\\U0001D172\\U0001D17B-\\U0001D182\\U0001D185-"
"\\U0001D18B\\U0001D1AA-\\U0001D1AD\\U0001D1BB-\\U0001D1C0\\U0001"
"D242-\\U0001D244\\U0001D400-\\U0001D454\\U0001D456-\\U0001D49C"
"\\U0001D49E\\U0001D49F\\U0001D4A2\\U0001D4A5\\U0001D4A6\\U0001D4"
"A9-\\U0001D4AC\\U0001D4AE-\\U0001D4B9\\U0001D4BB\\U0001D4BD-"
"\\U0001D4C3\\U0001D4C5-\\U0001D505\\U0001D507-\\U0001D50A\\U0001"
"D50D-\\U0001D514\\U0001D516-\\U0001D51C\\U0001D51E-\\U0001D539"
"\\U0001D53B-\\U0001D53E\\U0001D540-\\U0001D544\\U0001D546\\U0001"
"D54A-\\U0001D550\\U0001D552-\\U0001D6A5\\U0001D6A8-\\U0001D7CB"
"\\U0001D7CE-\\U0001D7FF\\U0001F100-\\U0001F10A\\U0001F110-"
"\\U0001F12E\\U0001F131\\U0001F13D\\U0001F13F\\U0001F142\\U0001F1"
"46\\U0001F14A-\\U0001F14E\\U0001F190\\U0001F200\\U0001F210-"
"\\U0001F231\\U0001F240-\\U0001F248\\U0002F800-\\U0002FA1D]"
, ""), errorCode);
// For each character about which we are unsure, see if it changes when we add
// one of the back-combining characters.
const Normalizer2 *norm2=Normalizer2::getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode);
UnicodeString s;
iter.reset(*unsure);
while(iter.next()) {
UChar32 c=iter.getCodepoint();
s.setTo(c);
int32_t cLength=s.length();
int32_t tccc=u_getIntPropertyValue(c, UCHAR_TRAIL_CANONICAL_COMBINING_CLASS);
for(int32_t i=0; i<numCombineBack; ++i) {
// If c's decomposition ends with a character with non-zero combining class, then
// c can only change if it combines with a character with a non-zero combining class.
int32_t cc2=combineBackCharsAndCc[2*i+1];
if(tccc==0 || cc2!=0) {
UChar32 c2=combineBackCharsAndCc[2*i];
s.append(c2);
if(!norm2->isNormalized(s, errorCode)) {
// System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2);
skipSets[UNORM_NFC].remove(c);
skipSets[UNORM_NFKC].remove(c);
break;
}
s.truncate(cLength);
}
}
}
}
void
@ -1733,15 +1356,17 @@ BasicNormalizerTest::TestSkippable() {
}
/* get expected sets from hardcoded patterns */
initExpectedSkippables(expectSets);
initExpectedSkippables(expectSets, errorCode);
errorCode.assertSuccess();
for(int32_t i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
if(skipSets[i]!=expectSets[i]) {
errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n"
"may need to update hardcoded UnicodeSet patterns in\n"
"tstnorm.cpp/initExpectedSkippables(),\n"
"see ICU4J - unicodetools.com.ibm.text.UCD.NFSkippable\n",
i, i);
errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n", i, i);
// Note: This used to depend on hardcoded UnicodeSet patterns generated by
// Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
// running com.ibm.text.UCD.Main with the option NFSkippable.
// Since ICU 4.6/Unicode 6, we are generating the
// expectSets ourselves in initExpectedSkippables().
s=UNICODE_STRING_SIMPLE("skip-expect=");
(diff=skipSets[i]).removeAll(expectSets[i]).toPattern(pattern, TRUE);

View file

@ -1044,7 +1044,18 @@ void UnicodeSetTest::TestPropertySet() {
"[:Assigned:]",
"A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
"\\u0888\\uFDD3\\uFFFE\\U00050005"
"\\u0888\\uFDD3\\uFFFE\\U00050005",
// Script_Extensions, new in Unicode 6.0
"[:scx=Arab:]",
"\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
"\\u061D\\u065F\\uFDEF\\uFDFE",
// U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
// so scx-sc is missing U+FDF2.
"[[:Script_Extensions=Arabic:]-[:Arab:]]",
"\\u0640\\u064B\\u0650\\u0655\\uFDFD",
"\\uFDF2"
};
static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

View file

@ -223,6 +223,14 @@ void UTS46Test::TestNotSTD3() {
if(result!=input || info.getErrors()!=UIDNA_ERROR_BIDI) {
errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed");
}
// Characters that are canonically equivalent to sequences with non-LDH ASCII.
input=UNICODE_STRING_SIMPLE("a\\u2260b\\u226Ec\\u226Fd").unescape();
not3->nameToUnicode(input, result, info, errorCode);
if(result!=input || info.hasErrors()) {
prettify(result).extract(0, 0x7fffffff, buffer, LENGTHOF(buffer));
errln("notSTD3.nameToUnicode(equiv to non-LDH ASCII) unexpected errors %04lx string %s",
(long)info.getErrors(), buffer);
}
}
struct TestCase {
@ -283,6 +291,10 @@ static const TestCase testCases[]={
{ "\\u65E5\\u672C\\u8A9E\\u3002\\uFF2A\\uFF30", "B", // Japanese with fullwidth ".jp"
"\\u65E5\\u672C\\u8A9E.jp", 0 },
{ "\\u2615", "B", "\\u2615", 0 }, // Unicode 4.0 HOT BEVERAGE
// some characters are disallowed because they are canonically equivalent
// to sequences with non-LDH ASCII
{ "a\\u2260b\\u226Ec\\u226Fd", "B",
"a\\uFFFDb\\uFFFDc\\uFFFDd", UIDNA_ERROR_DISALLOWED },
// many deviation characters, test the special mapping code
{ "1.a\\u00DF\\u200C\\u200Db\\u200C\\u200Dc\\u00DF\\u00DF\\u00DF\\u00DFd"
"\\u03C2\\u03C3\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DF\\u00DFe"

View file

@ -1,9 +1,8 @@
# Note: Please make sure that this utf-8 file contains a BOM.
# GraphemeBreakTest-5.2.0.txt
# Date: 2009-05-28, 20:37:56 GMT [MD]
# GraphemeBreakTest-6.0.0.txt
# Date: 2010-05-18, 00:49:27 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -16,7 +15,7 @@
# × wherever there is not.
# <comment> the format can change, but currently it shows:
# - the sample character name
# - (x) the line_break property* for the sample character
# - (x) the Grapheme_Break property* for the sample character
# - [x] the rule that determines whether there is a break or not
#
# These samples may be extended or changed in the future.

View file

@ -1,116 +0,0 @@
# Default GraphemeCluster Break Test
# Generated: 2003-04-19, 01:55:08 GMT, MED
#
# Format:
# <string> (# <comment>)?
# <string> contains hex Unicode code points, with
# ÷ wherever there is a break opportunity, and
# × wherever there is not.
# <comment> the format can change, but currently it shows:
# - the sample character name
# - (x) the line_break property* for the sample character
# - [x] the rule that determines whether there is a break or not
#
# These samples may be extended or changed in the future.
#
÷ 000D ÷ 000D ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 000D × 000A ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) × [3: CR × LF] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 000D ÷ 0001 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 000D ÷ 0300 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 000D ÷ 1100 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 000D ÷ 1160 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 000D ÷ 11A8 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 000D ÷ AC00 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 000D ÷ AC01 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 000D ÷ 0020 ÷ # ÷ [1: sot ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [4: ( Control | CR | LF ) ÷] SPACE (Other) ÷ [2: ÷ eot]
÷ 000A ÷ 000D ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 000A ÷ 000A ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 000A ÷ 0001 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 000A ÷ 0300 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 000A ÷ 1100 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 000A ÷ 1160 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 000A ÷ 11A8 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 000A ÷ AC00 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 000A ÷ AC01 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 000A ÷ 0020 ÷ # ÷ [1: sot ÷] <LINE FEED (LF)> (LF) ÷ [4: ( Control | CR | LF ) ÷] SPACE (Other) ÷ [2: ÷ eot]
÷ 0001 ÷ 000D ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 0001 ÷ 000A ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 0001 ÷ 0001 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 0001 ÷ 0300 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 0001 ÷ 1100 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 0001 ÷ 1160 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 0001 ÷ 11A8 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 0001 ÷ AC00 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 0001 ÷ AC01 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 0001 ÷ 0020 ÷ # ÷ [1: sot ÷] <START OF HEADING> (Control) ÷ [4: ( Control | CR | LF ) ÷] SPACE (Other) ÷ [2: ÷ eot]
÷ 0300 ÷ 000D ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [5: ÷ ( Control | CR | LF )] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 0300 ÷ 000A ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [5: ÷ ( Control | CR | LF )] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 0300 ÷ 0001 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [5: ÷ ( Control | CR | LF )] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 0300 × 0300 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) × [9: × Extend] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 0300 ÷ 1100 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [10: Any ÷ Any] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 0300 ÷ 1160 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [10: Any ÷ Any] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 0300 ÷ 11A8 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [10: Any ÷ Any] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 0300 ÷ AC00 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 0300 ÷ AC01 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 0300 ÷ 0020 ÷ # ÷ [1: sot ÷] COMBINING GRAVE ACCENT (Extend) ÷ [10: Any ÷ Any] SPACE (Other) ÷ [2: ÷ eot]
÷ 1100 ÷ 000D ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [5: ÷ ( Control | CR | LF )] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 1100 ÷ 000A ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [5: ÷ ( Control | CR | LF )] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 1100 ÷ 0001 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [5: ÷ ( Control | CR | LF )] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 1100 × 0300 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) × [9: × Extend] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 1100 × 1100 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) × [6: L × ( L | V | LV | LVT )] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 1100 × 1160 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) × [6: L × ( L | V | LV | LVT )] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 1100 ÷ 11A8 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [10: Any ÷ Any] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 1100 × AC00 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) × [6: L × ( L | V | LV | LVT )] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 1100 × AC01 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) × [6: L × ( L | V | LV | LVT )] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 1100 ÷ 0020 ÷ # ÷ [1: sot ÷] HANGUL CHOSEONG KIYEOK (L) ÷ [10: Any ÷ Any] SPACE (Other) ÷ [2: ÷ eot]
÷ 1160 ÷ 000D ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) ÷ [5: ÷ ( Control | CR | LF )] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 1160 ÷ 000A ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) ÷ [5: ÷ ( Control | CR | LF )] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 1160 ÷ 0001 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) ÷ [5: ÷ ( Control | CR | LF )] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 1160 × 0300 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) × [9: × Extend] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 1160 ÷ 1100 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) ÷ [10: Any ÷ Any] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 1160 × 1160 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) × [7: ( LV | V ) × ( V | T )] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 1160 × 11A8 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) × [7: ( LV | V ) × ( V | T )] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 1160 ÷ AC00 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 1160 ÷ AC01 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 1160 ÷ 0020 ÷ # ÷ [1: sot ÷] HANGUL JUNGSEONG FILLER (V) ÷ [10: Any ÷ Any] SPACE (Other) ÷ [2: ÷ eot]
÷ 11A8 ÷ 000D ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [5: ÷ ( Control | CR | LF )] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 11A8 ÷ 000A ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [5: ÷ ( Control | CR | LF )] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 11A8 ÷ 0001 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [5: ÷ ( Control | CR | LF )] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 11A8 × 0300 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) × [9: × Extend] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 11A8 ÷ 1100 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [10: Any ÷ Any] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 11A8 ÷ 1160 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [10: Any ÷ Any] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 11A8 × 11A8 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) × [8: ( LVT | T ) × T] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 11A8 ÷ AC00 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 11A8 ÷ AC01 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 11A8 ÷ 0020 ÷ # ÷ [1: sot ÷] HANGUL JONGSEONG KIYEOK (T) ÷ [10: Any ÷ Any] SPACE (Other) ÷ [2: ÷ eot]
÷ AC00 ÷ 000D ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) ÷ [5: ÷ ( Control | CR | LF )] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ AC00 ÷ 000A ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) ÷ [5: ÷ ( Control | CR | LF )] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ AC00 ÷ 0001 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) ÷ [5: ÷ ( Control | CR | LF )] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ AC00 × 0300 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) × [9: × Extend] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ AC00 ÷ 1100 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) ÷ [10: Any ÷ Any] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ AC00 × 1160 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) × [7: ( LV | V ) × ( V | T )] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ AC00 × 11A8 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) × [7: ( LV | V ) × ( V | T )] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ AC00 ÷ AC00 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ AC00 ÷ AC01 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ AC00 ÷ 0020 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GA (LV) ÷ [10: Any ÷ Any] SPACE (Other) ÷ [2: ÷ eot]
÷ AC01 ÷ 000D ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [5: ÷ ( Control | CR | LF )] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ AC01 ÷ 000A ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [5: ÷ ( Control | CR | LF )] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ AC01 ÷ 0001 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [5: ÷ ( Control | CR | LF )] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ AC01 × 0300 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) × [9: × Extend] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ AC01 ÷ 1100 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [10: Any ÷ Any] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ AC01 ÷ 1160 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [10: Any ÷ Any] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ AC01 × 11A8 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) × [8: ( LVT | T ) × T] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ AC01 ÷ AC00 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ AC01 ÷ AC01 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ AC01 ÷ 0020 ÷ # ÷ [1: sot ÷] HANGUL SYLLABLE GAG (LVT) ÷ [10: Any ÷ Any] SPACE (Other) ÷ [2: ÷ eot]
÷ 0020 ÷ 000D ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [5: ÷ ( Control | CR | LF )] <CARRIAGE RETURN (CR)> (CR) ÷ [2: ÷ eot]
÷ 0020 ÷ 000A ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [5: ÷ ( Control | CR | LF )] <LINE FEED (LF)> (LF) ÷ [2: ÷ eot]
÷ 0020 ÷ 0001 ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [5: ÷ ( Control | CR | LF )] <START OF HEADING> (Control) ÷ [2: ÷ eot]
÷ 0020 × 0300 ÷ # ÷ [1: sot ÷] SPACE (Other) × [9: × Extend] COMBINING GRAVE ACCENT (Extend) ÷ [2: ÷ eot]
÷ 0020 ÷ 1100 ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [10: Any ÷ Any] HANGUL CHOSEONG KIYEOK (L) ÷ [2: ÷ eot]
÷ 0020 ÷ 1160 ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [10: Any ÷ Any] HANGUL JUNGSEONG FILLER (V) ÷ [2: ÷ eot]
÷ 0020 ÷ 11A8 ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [10: Any ÷ Any] HANGUL JONGSEONG KIYEOK (T) ÷ [2: ÷ eot]
÷ 0020 ÷ AC00 ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GA (LV) ÷ [2: ÷ eot]
÷ 0020 ÷ AC01 ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [10: Any ÷ Any] HANGUL SYLLABLE GAG (LVT) ÷ [2: ÷ eot]
÷ 0020 ÷ 0020 ÷ # ÷ [1: sot ÷] SPACE (Other) ÷ [10: Any ÷ Any] SPACE (Other) ÷ [2: ÷ eot]
# Lines: 100

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,8 @@
# Note: Please make sure that this utf-8 file contains a BOM.
# SentenceBreakTest-5.2.0.txt
# Date: 2009-05-28, 20:38:05 GMT [MD]
# SentenceBreakTest-6.0.0.txt
# Date: 2010-08-19, 01:19:53 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -16,7 +15,7 @@
# × wherever there is not.
# <comment> the format can change, but currently it shows:
# - the sample character name
# - (x) the line_break property* for the sample character
# - (x) the Sentence_Break property* for the sample character
# - [x] the rule that determines whether there is a break or not
#
# These samples may be extended or changed in the future.
@ -484,6 +483,8 @@
÷ 0065 × 0074 × 0063 × 002E × 0029 × 2019 × 00A0 ÷ 2018 × 0028 × 0054 × 0068 × 0065 ÷ # ÷ [0.2] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER C (Lower) × [12.0] FULL STOP (ATerm) × [9.0] RIGHT PARENTHESIS (Close) × [9.0] RIGHT SINGLE QUOTATION MARK (Close) × [9.0] NO-BREAK SPACE (Sp) ÷ [11.0] LEFT SINGLE QUOTATION MARK (Close) × [12.0] LEFT PARENTHESIS (Close) × [12.0] LATIN CAPITAL LETTER T (Upper) × [12.0] LATIN SMALL LETTER H (Lower) × [12.0] LATIN SMALL LETTER E (Lower) ÷ [0.3]
÷ 0065 × 0074 × 0063 × 002E × 0029 × 2019 × 00A0 × 0308 × 0074 × 0068 × 0065 ÷ # ÷ [0.2] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER C (Lower) × [12.0] FULL STOP (ATerm) × [8.0] RIGHT PARENTHESIS (Close) × [8.0] RIGHT SINGLE QUOTATION MARK (Close) × [8.0] NO-BREAK SPACE (Sp) × [5.0] COMBINING DIAERESIS (Extend_FE) × [8.0] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER H (Lower) × [12.0] LATIN SMALL LETTER E (Lower) ÷ [0.3]
÷ 0065 × 0074 × 0063 × 002E × 0029 × 2019 × 00A0 × 0308 ÷ 0054 × 0068 × 0065 ÷ # ÷ [0.2] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER C (Lower) × [12.0] FULL STOP (ATerm) × [9.0] RIGHT PARENTHESIS (Close) × [9.0] RIGHT SINGLE QUOTATION MARK (Close) × [9.0] NO-BREAK SPACE (Sp) × [5.0] COMBINING DIAERESIS (Extend_FE) ÷ [11.0] LATIN CAPITAL LETTER T (Upper) × [12.0] LATIN SMALL LETTER H (Lower) × [12.0] LATIN SMALL LETTER E (Lower) ÷ [0.3]
÷ 0065 × 0074 × 0063 × 002E × 0029 × 2019 × 0308 ÷ 0054 × 0068 × 0065 ÷ # ÷ [0.2] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER C (Lower) × [12.0] FULL STOP (ATerm) × [9.0] RIGHT PARENTHESIS (Close) × [9.0] RIGHT SINGLE QUOTATION MARK (Close) × [5.0] COMBINING DIAERESIS (Extend_FE) ÷ [11.0] LATIN CAPITAL LETTER T (Upper) × [12.0] LATIN SMALL LETTER H (Lower) × [12.0] LATIN SMALL LETTER E (Lower) ÷ [0.3]
÷ 0065 × 0074 × 0063 × 002E × 0029 × 000A ÷ 0308 × 0054 × 0068 × 0065 ÷ # ÷ [0.2] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER C (Lower) × [12.0] FULL STOP (ATerm) × [9.0] RIGHT PARENTHESIS (Close) × [9.0] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend_FE) × [12.0] LATIN CAPITAL LETTER T (Upper) × [12.0] LATIN SMALL LETTER H (Lower) × [12.0] LATIN SMALL LETTER E (Lower) ÷ [0.3]
÷ 0074 × 0068 × 0065 × 0020 × 0072 × 0065 × 0073 × 0070 × 002E × 0020 × 006C × 0065 × 0061 × 0064 × 0065 × 0072 × 0073 × 0020 × 0061 × 0072 × 0065 ÷ # ÷ [0.2] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER H (Lower) × [12.0] LATIN SMALL LETTER E (Lower) × [12.0] SPACE (Sp) × [12.0] LATIN SMALL LETTER R (Lower) × [12.0] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER S (Lower) × [12.0] LATIN SMALL LETTER P (Lower) × [12.0] FULL STOP (ATerm) × [8.0] SPACE (Sp) × [8.0] LATIN SMALL LETTER L (Lower) × [12.0] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER A (Lower) × [12.0] LATIN SMALL LETTER D (Lower) × [12.0] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER R (Lower) × [12.0] LATIN SMALL LETTER S (Lower) × [12.0] SPACE (Sp) × [12.0] LATIN SMALL LETTER A (Lower) × [12.0] LATIN SMALL LETTER R (Lower) × [12.0] LATIN SMALL LETTER E (Lower) ÷ [0.3]
÷ 5B57 × 002E ÷ 5B57 ÷ # ÷ [0.2] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [12.0] FULL STOP (ATerm) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) ÷ [0.3]
÷ 0065 × 0074 × 0063 × 002E ÷ 5B83 ÷ # ÷ [0.2] LATIN SMALL LETTER E (Lower) × [12.0] LATIN SMALL LETTER T (Lower) × [12.0] LATIN SMALL LETTER C (Lower) × [12.0] FULL STOP (ATerm) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B83 (OLetter) ÷ [0.3]
@ -502,9 +503,11 @@
÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 0029 × 2060 × 2019 × 2060 × 00A0 × 2060 ÷ 2018 × 2060 × 0028 × 2060 × 0054 × 2060 × 0068 × 2060 × 0065 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [9.0] RIGHT PARENTHESIS (Close) × [5.0] WORD JOINER (Format_FE) × [9.0] RIGHT SINGLE QUOTATION MARK (Close) × [5.0] WORD JOINER (Format_FE) × [9.0] NO-BREAK SPACE (Sp) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] LEFT SINGLE QUOTATION MARK (Close) × [5.0] WORD JOINER (Format_FE) × [12.0] LEFT PARENTHESIS (Close) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN CAPITAL LETTER T (Upper) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER H (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 0029 × 2060 × 2019 × 2060 × 00A0 × 2060 × 0308 × 0074 × 2060 × 0068 × 2060 × 0065 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [8.0] RIGHT PARENTHESIS (Close) × [5.0] WORD JOINER (Format_FE) × [8.0] RIGHT SINGLE QUOTATION MARK (Close) × [5.0] WORD JOINER (Format_FE) × [8.0] NO-BREAK SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [5.0] COMBINING DIAERESIS (Extend_FE) × [8.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER H (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 0029 × 2060 × 2019 × 2060 × 00A0 × 2060 × 0308 ÷ 0054 × 2060 × 0068 × 2060 × 0065 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [9.0] RIGHT PARENTHESIS (Close) × [5.0] WORD JOINER (Format_FE) × [9.0] RIGHT SINGLE QUOTATION MARK (Close) × [5.0] WORD JOINER (Format_FE) × [9.0] NO-BREAK SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [5.0] COMBINING DIAERESIS (Extend_FE) ÷ [11.0] LATIN CAPITAL LETTER T (Upper) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER H (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 0029 × 2060 × 2019 × 2060 × 0308 ÷ 0054 × 2060 × 0068 × 2060 × 0065 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [9.0] RIGHT PARENTHESIS (Close) × [5.0] WORD JOINER (Format_FE) × [9.0] RIGHT SINGLE QUOTATION MARK (Close) × [5.0] WORD JOINER (Format_FE) × [5.0] COMBINING DIAERESIS (Extend_FE) ÷ [11.0] LATIN CAPITAL LETTER T (Upper) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER H (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 0029 × 2060 × 000A ÷ 2060 × 0308 × 2060 × 0054 × 2060 × 0068 × 2060 × 0065 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [9.0] RIGHT PARENTHESIS (Close) × [5.0] WORD JOINER (Format_FE) × [9.0] <LINE FEED (LF)> (LF) ÷ [4.0] WORD JOINER (Format_FE) × [5.0] COMBINING DIAERESIS (Extend_FE) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN CAPITAL LETTER T (Upper) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER H (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 0074 × 2060 × 0068 × 2060 × 0065 × 2060 × 0020 × 2060 × 0072 × 2060 × 0065 × 2060 × 0073 × 2060 × 0070 × 2060 × 002E × 2060 × 0020 × 2060 × 006C × 2060 × 0065 × 2060 × 0061 × 2060 × 0064 × 2060 × 0065 × 2060 × 0072 × 2060 × 0073 × 2060 × 0020 × 2060 × 0061 × 2060 × 0072 × 2060 × 0065 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER H (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER R (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER S (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER P (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [8.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [8.0] LATIN SMALL LETTER L (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER A (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER D (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER R (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER S (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] SPACE (Sp) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER A (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER R (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 5B57 × 2060 × 002E × 2060 ÷ 5B57 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 ÷ 5B83 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B83 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 3002 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER E (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER T (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] LATIN SMALL LETTER C (Lower) × [5.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) × [8.1] IDEOGRAPHIC FULL STOP (STerm) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 × 5B57 × 2060 × 3002 × 2060 ÷ 5B83 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [12.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [12.0] IDEOGRAPHIC FULL STOP (STerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B83 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]
# Lines: 450
# Lines: 490

View file

@ -1,9 +1,8 @@
# Note: Please make sure that this utf-8 file contains a BOM.
# WordBreakTest-5.2.0.txt
# Date: 2009-05-28, 20:38:06 GMT [MD]
# WordBreakTest-6.0.0.txt
# Date: 2010-08-19, 01:19:54 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2009 Unicode, Inc.
# Copyright (c) 1991-2010 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -16,7 +15,7 @@
# × wherever there is not.
# <comment> the format can change, but currently it shows:
# - the sample character name
# - (x) the line_break property* for the sample character
# - (x) the Word_Break property* for the sample character
# - [x] the rule that determines whether there is a break or not
#
# These samples may be extended or changed in the future.
@ -999,4 +998,4 @@
÷ 2060 ÷ 0061 × 2060 × 0062 × 2060 × 00AD × 2060 × 0062 × 2060 × 0079 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) ÷ [999.0] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [5.0] LATIN SMALL LETTER B (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] SOFT HYPHEN (Format_FE) × [4.0] WORD JOINER (Format_FE) × [5.0] LATIN SMALL LETTER B (ALetter) × [4.0] WORD JOINER (Format_FE) × [5.0] LATIN SMALL LETTER Y (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 ÷ 0061 × 2060 ÷ 0024 × 2060 ÷ 002D × 2060 ÷ 0033 × 2060 × 0034 × 2060 × 002C × 2060 × 0035 × 2060 × 0036 × 2060 × 0037 × 2060 × 002E × 2060 × 0031 × 2060 × 0034 × 2060 ÷ 0025 × 2060 ÷ 0062 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) ÷ [999.0] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [999.0] DOLLAR SIGN (Other) × [4.0] WORD JOINER (Format_FE) ÷ [999.0] HYPHEN-MINUS (Other) × [4.0] WORD JOINER (Format_FE) ÷ [999.0] DIGIT THREE (Numeric) × [4.0] WORD JOINER (Format_FE) × [8.0] DIGIT FOUR (Numeric) × [4.0] WORD JOINER (Format_FE) × [12.0] COMMA (MidNum) × [4.0] WORD JOINER (Format_FE) × [11.0] DIGIT FIVE (Numeric) × [4.0] WORD JOINER (Format_FE) × [8.0] DIGIT SIX (Numeric) × [4.0] WORD JOINER (Format_FE) × [8.0] DIGIT SEVEN (Numeric) × [4.0] WORD JOINER (Format_FE) × [12.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [11.0] DIGIT ONE (Numeric) × [4.0] WORD JOINER (Format_FE) × [8.0] DIGIT FOUR (Numeric) × [4.0] WORD JOINER (Format_FE) ÷ [999.0] PERCENT SIGN (Other) × [4.0] WORD JOINER (Format_FE) ÷ [999.0] LATIN SMALL LETTER B (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]
÷ 2060 ÷ 0033 × 2060 × 0061 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) ÷ [999.0] DIGIT THREE (Numeric) × [4.0] WORD JOINER (Format_FE) × [10.0] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]
# Lines: 968
# Lines: 978

View file

@ -92,9 +92,10 @@ uprops_swap(const UDataSwapper *ds,
pInfo->dataFormat[1]==0x50 &&
pInfo->dataFormat[2]==0x72 &&
pInfo->dataFormat[3]==0x6f &&
(3<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=6) &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
(3<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=7) &&
(pInfo->formatVersion[0]>=7 ||
(pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT))
)) {
udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
@ -122,10 +123,18 @@ uprops_swap(const UDataSwapper *ds,
* comments are copied from the data format description in genprops/store.c
* indexes[] constants are in uprops.h
*/
int32_t dataTop;
if(length>=0) {
int32_t *outData32;
if((length-headerSize)<(4*dataIndexes[UPROPS_RESERVED_INDEX])) {
/*
* In formatVersion 7, UPROPS_DATA_TOP_INDEX has the post-header data size.
* In earlier formatVersions, it is 0 and a lower dataIndexes entry
* has the top of the last item.
*/
for(i=UPROPS_DATA_TOP_INDEX; i>0 && (dataTop=dataIndexes[i])==0; --i) {}
if((length-headerSize)<(4*dataTop)) {
udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
length-headerSize);
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
@ -136,7 +145,7 @@ uprops_swap(const UDataSwapper *ds,
/* copy everything for inaccessible data (padding) */
if(inData32!=outData32) {
uprv_memcpy(outData32, inData32, 4*dataIndexes[UPROPS_RESERVED_INDEX]);
uprv_memcpy(outData32, inData32, 4*dataTop);
}
/* swap the indexes[16] */
@ -146,7 +155,7 @@ uprops_swap(const UDataSwapper *ds,
* swap the main properties UTrie
* PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
*/
utrie_swap(ds,
utrie2_swapAnyVersion(ds,
inData32+UPROPS_INDEX_COUNT,
4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT),
outData32+UPROPS_INDEX_COUNT,
@ -177,7 +186,7 @@ uprops_swap(const UDataSwapper *ds,
* swap the additional UTrie
* i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
*/
utrie_swap(ds,
utrie2_swapAnyVersion(ds,
inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]),
outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
@ -189,13 +198,21 @@ uprops_swap(const UDataSwapper *ds,
*/
ds->swapArray32(ds,
inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
4*(dataIndexes[UPROPS_RESERVED_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]),
4*(dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]),
outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
pErrorCode);
// swap the Script_Extensions data
// SCX const uint16_t scriptExtensions[2*(i7-i6)];
ds->swapArray16(ds,
inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
4*(dataIndexes[UPROPS_RESERVED_INDEX_7]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
pErrorCode);
}
/* i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table */
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX];
/* i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data */
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_7];
}
/* Unicode case mapping data swapping --------------------------------------- */
@ -228,9 +245,10 @@ ucase_swap(const UDataSwapper *ds,
pInfo->dataFormat[1]==UCASE_FMT_1 &&
pInfo->dataFormat[2]==UCASE_FMT_2 &&
pInfo->dataFormat[3]==UCASE_FMT_3 &&
pInfo->formatVersion[0]==1 &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
((pInfo->formatVersion[0]==1 &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) ||
pInfo->formatVersion[0]==2)
)) {
udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
@ -285,7 +303,7 @@ ucase_swap(const UDataSwapper *ds,
/* swap the UTrie */
count=indexes[UCASE_IX_TRIE_SIZE];
utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
utrie2_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
offset+=count;
/* swap the uint16_t exceptions[] and unfold[] */
@ -329,9 +347,10 @@ ubidi_swap(const UDataSwapper *ds,
pInfo->dataFormat[1]==UBIDI_FMT_1 &&
pInfo->dataFormat[2]==UBIDI_FMT_2 &&
pInfo->dataFormat[3]==UBIDI_FMT_3 &&
pInfo->formatVersion[0]==1 &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
((pInfo->formatVersion[0]==1 &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) ||
pInfo->formatVersion[0]==2)
)) {
udata_printError(ds, "ubidi_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as bidi/shaping data\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
@ -386,7 +405,7 @@ ubidi_swap(const UDataSwapper *ds,
/* swap the UTrie */
count=indexes[UBIDI_IX_TRIE_SIZE];
utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
utrie2_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
offset+=count;
/* swap the uint32_t mirrors[] */