ICU-1970 add properties: joining group, joining type, line break

X-SVN-Rev: 9043
This commit is contained in:
Markus Scherer 2002-07-04 17:08:34 +00:00
parent 8c4e52bd51
commit ecfab592b4
3 changed files with 14335 additions and 14 deletions

View file

@ -0,0 +1,226 @@
# ArabicShaping-3.2.0.txt
#
# This file is a normative contributory data file in the
# Unicode Character Database.
#
# This file defines the shaping classes for Arabic and Syriac
# positional shaping, repeating in machine readable form the
# information printed in Tables 8-6, 8-7, 8-8, 8-10, 8-11, and
# 8-13 of The Unicode Standard, Version 3.0, plus additions
# for Unicode 3.1 and Unicode 3.2.
#
# See sections 8.2 and 8.3 of The Unicode Standard, Version 3.0
# for more information.
#
# Each line contains four fields, separated by a semicolon.
#
# The first field gives the code point, in 4-digit hexadecimal
# form, of an Arabic or Syriac character.
# The second field gives a short schematic name for that character,
# abbreviated from the normative Unicode character name.
# The third field defines the joining type: R right-joining,
# D dual-joining, C join-causing, U non-joining
# The fourth field defines the joining group.
#
#
# Note: Characters of joining type T and most characters of
# joining type U are not explicitly listed in this file.
#
# Characters of joining type T can be derived by the following formula:
# T = Mn + Cf - ZWNJ - ZWJ
# where Mn and Cf are the general category values. In other words,
# any non-spacing mark or any format control character, except
# U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO WIDTH
# JOINER (joining type C).
#
# For an explicit listing of characters of joining type T, see
# the derived property file DerivedJoiningType.txt.
#
# There are currently no characters of type L defined in Unicode.
#
# Joining type U includes all characters which are neither joining
# type T, nor explicitly marked in this file as types R, L, D, or C.
#
# #############################################################
# Unicode; Schematic Name; Joining Type; Joining Group
# Arabic characters
0621; HAMZA; U; <no shaping>
0622; MADDA ON ALEF; R; ALEF
0623; HAMZA ON ALEF; R; ALEF
0624; HAMZA ON WAW; R; WAW
0625; HAMZA UNDER ALEF; R; ALEF
0626; HAMZA ON YEH; D; YEH
0627; ALEF; R; ALEF
0628; BEH; D; BEH
0629; TEH MARBUTA; R; TEH MARBUTA
062A; TEH; D; BEH
062B; THEH; D; BEH
062C; JEEM; D; HAH
062D; HAH; D; HAH
062E; KHAH; D; HAH
062F; DAL; R; DAL
0630; THAL; R; DAL
0631; REH; R; REH
0632; ZAIN; R; REH
0633; SEEN; D; SEEN
0634; SHEEN; D; SEEN
0635; SAD; D; SAD
0636; DAD; D; SAD
0637; TAH; D; TAH
0638; ZAH; D; TAH
0639; AIN; D; AIN
063A; GHAIN; D; AIN
0640; TATWEEL; C; <no shaping>
0641; FEH; D; FEH
0642; QAF; D; QAF
0643; KAF; D; KAF
0644; LAM; D; LAM
0645; MEEM; D; MEEM
0646; NOON; D; NOON
0647; HEH; D; HEH
0648; WAW; R; WAW
0649; ALEF MAKSURA; D; YEH
064A; YEH; D; YEH
066E; DOTLESS BEH; D; BEH
066F; DOTLESS QAF; D; QAF
0671; HAMZAT WASL ON ALEF; R; ALEF
0672; WAVY HAMZA ON ALEF; R; ALEF
0673; WAVY HAMZA UNDER ALEF; R; ALEF
0674; HIGH HAMZA; U; <no shaping>
0675; HIGH HAMZA ALEF; R; ALEF
0676; HIGH HAMZA WAW; R; WAW
0677; HIGH HAMZA WAW WITH DAMMA; R; WAW
0678; HIGH HAMZA YEH; D; YEH
0679; TEH WITH SMALL TAH; D; BEH
067A; TEH WITH 2 DOTS VERTICAL ABOVE; D; BEH
067B; BEH WITH 2 DOTS VERTICAL BELOW; D; BEH
067C; TEH WITH RING; D; BEH
067D; TEH WITH 3 DOTS ABOVE DOWNWARD; D; BEH
067E; TEH WITH 3 DOTS BELOW; D; BEH
067F; TEH WITH 4 DOTS ABOVE; D; BEH
0680; BEH WITH 4 DOTS BELOW; D; BEH
0681; HAMZA ON HAH; D; HAH
0682; HAH WITH 2 DOTS VERTICAL ABOVE; D; HAH
0683; HAH WITH MIDDLE 2 DOTS; D; HAH
0684; HAH WITH MIDDLE 2 DOTS VERTICAL; D; HAH
0685; HAH WITH 3 DOTS ABOVE; D; HAH
0686; HAH WITH MIDDLE 3 DOTS DOWNWARD; D; HAH
0687; HAH WITH MIDDLE 4 DOTS; D; HAH
0688; DAL WITH SMALL TAH; R; DAL
0689; DAL WITH RING; R; DAL
068A; DAL WITH DOT BELOW; R; DAL
068B; DAL WITH DOT BELOW AND SMALL TAH; R; DAL
068C; DAL WITH 2 DOTS ABOVE; R; DAL
068D; DAL WITH 2 DOTS BELOW; R; DAL
068E; DAL WITH 3 DOTS ABOVE; R; DAL
068F; DAL WITH 3 DOTS ABOVE DOWNWARD; R; DAL
0690; DAL WITH 4 DOTS ABOVE; R; DAL
0691; REH WITH SMALL TAH; R; REH
0692; REH WITH SMALL V; R; REH
0693; REH WITH RING; R; REH
0694; REH WITH DOT BELOW; R; REH
0695; REH WITH SMALL V BELOW; R; REH
0696; REH WITH DOT BELOW AND DOT ABOVE; R; REH
0697; REH WITH 2 DOTS ABOVE; R; REH
0698; REH WITH 3 DOTS ABOVE; R; REH
0699; REH WITH 4 DOTS ABOVE; R; REH
069A; SEEN WITH DOT BELOW AND DOT ABOVE; D; SEEN
069B; SEEN WITH 3 DOTS BELOW; D; SEEN
069C; SEEN WITH 3 DOTS BELOW AND 3 DOTS ABOVE; D; SEEN
069D; SAD WITH 2 DOTS BELOW; D; SAD
069E; SAD WITH 3 DOTS ABOVE; D; SAD
069F; TAH WITH 3 DOTS ABOVE; D; TAH
06A0; AIN WITH 3 DOTS ABOVE; D; AIN
06A1; DOTLESS FEH; D; FEH
06A2; FEH WITH DOT MOVED BELOW; D; FEH
06A3; FEH WITH DOT BELOW; D; FEH
06A4; FEH WITH 3 DOTS ABOVE; D; FEH
06A5; FEH WITH 3 DOTS BELOW; D; FEH
06A6; FEH WITH 4 DOTS ABOVE; D; FEH
06A7; QAF WITH DOT ABOVE; D; QAF
06A8; QAF WITH 3 DOTS ABOVE; D; QAF
06A9; OPEN KAF; D; GAF
06AA; SWASH KAF; D; SWASH KAF
06AB; KAF WITH RING; D; GAF
06AC; KAF WITH DOT ABOVE; D; KAF
06AD; KAF WITH 3 DOTS ABOVE; D; KAF
06AE; KAF WITH 3 DOTS BELOW; D; KAF
06AF; GAF; D; GAF
06B0; GAF WITH RING; D; GAF
06B1; GAF WITH 2 DOTS ABOVE; D; GAF
06B2; GAF WITH 2 DOTS BELOW; D; GAF
06B3; GAF WITH 2 DOTS VERTICAL BELOW; D; GAF
06B4; GAF WITH 3 DOTS ABOVE; D; GAF
06B5; LAM WITH SMALL V; D; LAM
06B6; LAM WITH DOT ABOVE; D; LAM
06B7; LAM WITH 3 DOTS ABOVE; D; LAM
06B8; LAM WITH 3 DOTS BELOW; D; LAM
06B9; NOON WITH DOT BELOW; D; NOON
06BA; DOTLESS NOON; D; NOON
06BB; DOTLESS NOON WITH SMALL TAH; D; NOON
06BC; NOON WITH RING; D; NOON
06BD; NOON WITH 3 DOTS ABOVE; D; NOON
06BE; KNOTTED HEH; D; KNOTTED HEH
06BF; HAH WITH MIDDLE 3 DOTS DOWNWARD AND DOT ABOVE; D; HAH
06C0; HAMZA ON HEH; R; TEH MARBUTA
06C1; HEH GOAL; D; HEH GOAL
06C2; HAMZA ON HEH GOAL; R; HAMZA ON HEH GOAL
06C3; TEH MARBUTA GOAL; R; HAMZA ON HEH GOAL
06C4; WAW WITH RING; R; WAW
06C5; WAW WITH BAR; R; WAW
06C6; WAW WITH SMALL V; R; WAW
06C7; WAW WITH DAMMA; R; WAW
06C8; WAW WITH ALEF ABOVE; R; WAW
06C9; WAW WITH INVERTED SMALL V; R; WAW
06CA; WAW WITH 2 DOTS ABOVE; R; WAW
06CB; WAW WITH 3 DOTS ABOVE; R; WAW
06CC; DOTLESS YEH; D; YEH
06CD; YEH WITH TAIL; R; YEH WITH TAIL
06CE; YEH WITH SMALL V; D; YEH
06CF; WAW WITH DOT ABOVE; R; WAW
06D0; YEH WITH 2 DOTS VERTICAL BELOW; D; YEH
06D1; YEH WITH 3 DOTS BELOW; D; YEH
06D2; YEH BARREE; R; YEH BARREE
06D3; HAMZA ON YEH BARREE; R; YEH BARREE
06D5; AE; R; TEH MARBUTA
06FA; SEEN WITH DOT BELOW AND 3 DOTS ABOVE; D; SEEN
06FB; DAD WITH DOT BELOW; D; SAD
06FC; GHAIN WITH DOT BELOW; D; AIN
# Syriac characters
0710; ALAPH; R; ALAPH
0712; BETH; D; BETH
0713; GAMAL; D; GAMAL
0714; GAMAL GARSHUNI; D; GAMAL
0715; DALATH; R; DALATH RISH
0716; DOTLESS DALATH RISH; R; DALATH RISH
0717; HE; R; HE
0718; WAW; R; SYRIAC WAW
0719; ZAIN; R; ZAIN
071A; HETH; D; HETH
071B; TETH; D; TETH
071C; TETH GARSHUNI; D; TETH
071D; YUDH; D; YUDH
071E; YUDH HE; R; YUDH HE
071F; KAPH; D; KAPH
0720; LAMADH; D; LAMADH
0721; MIM; D; MIM
0722; NUN; D; NUN
0723; SEMKATH; D; SEMKATH
0724; FINAL SEMKATH; D; FINAL SEMKATH
0725; E; D; E
0726; PE; D; PE
0727; REVERSED PE; D; REVERSED PE
0728; SADHE; R; SADHE
0729; QAPH; D; QAPH
072A; RISH; R; DALATH RISH
072B; SHIN; D; SHIN
072C; TAW; R; TAW
# Other
200D; ZERO WIDTH JOINER; C; <no shaping>

File diff suppressed because it is too large Load diff

View file

@ -34,7 +34,7 @@
/* data --------------------------------------------------------------------- */
static UNewTrie *trie;
static uint32_t *pv;
uint32_t *pv;
static int32_t pvCount;
static uint32_t prevStart=0, prevLimit=0, prevValue=0;
@ -47,6 +47,11 @@ parseTwoFieldFile(char *filename, char *basename,
UParseLineFn *lineFn,
UErrorCode *pErrorCode);
static void
parseArabicShaping(char *filename, char *basename,
const char *suffix,
UErrorCode *pErrorCode);
static void
ageLineFn(void *context,
char *fields[][2], int32_t fieldCount,
@ -77,6 +82,11 @@ eaWidthLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
static void
lineBreakLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
U_CFUNC void
@ -87,7 +97,6 @@ initAdditionalProperties() {
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
char *basename;
UErrorCode errorCode;
basename=filename+uprv_strlen(filename);
@ -118,18 +127,22 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
parseTwoFieldFile(filename, basename, "DerivedCoreProperties", suffix, derivedPropListLineFn, pErrorCode);
parseTwoFieldFile(filename, basename, "LineBreak", suffix, lineBreakLineFn, pErrorCode);
parseArabicShaping(filename, basename, suffix, pErrorCode);
/*
* Preset East Asian Width defaults:
* N for all
* A for Private Use
* W for plane 2
*/
errorCode=U_ZERO_ERROR;
if( !upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)(U_EA_NEUTRAL<<UPROPS_EA_WIDTH_SHIFT), UPROPS_EA_WIDTH_MASK, pErrorCode) ||
!upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_WIDTH_SHIFT), UPROPS_EA_WIDTH_MASK, pErrorCode) ||
!upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_WIDTH_SHIFT), UPROPS_EA_WIDTH_MASK, pErrorCode) ||
!upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_WIDTH_SHIFT), UPROPS_EA_WIDTH_MASK, pErrorCode) ||
!upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_WIDTH_SHIFT), UPROPS_EA_WIDTH_MASK, pErrorCode)
*pErrorCode=U_ZERO_ERROR;
if( !upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)(U_EA_NEUTRAL<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
!upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
!upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
!upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
!upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)
) {
fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
exit(*pErrorCode);
@ -138,7 +151,7 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
/* parse EastAsianWidth.txt */
parseTwoFieldFile(filename, basename, "EastAsianWidth", suffix, eaWidthLineFn, pErrorCode);
/* set last range */
if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_WIDTH_SHIFT), UPROPS_EA_WIDTH_MASK, pErrorCode)) {
if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)) {
fprintf(stderr, "genprops error: unable to set East Asian Width: %s\n", u_errorName(*pErrorCode));
exit(*pErrorCode);
}
@ -164,13 +177,16 @@ parseTwoFieldFile(char *filename, char *basename,
UErrorCode *pErrorCode) {
char *fields[2][2];
writeUCDFilename(basename, ucdFile, suffix);
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
writeUCDFilename(basename, ucdFile, suffix);
u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
}
}
/* DerivedAge.txt ----------------------------------------------------------- */
@ -522,7 +538,7 @@ derivedPropListLineFn(void *context,
/* keep this list in sync with UEAWidthCode in uprops.h or uchar.h */
static const char *const
eaNames[U_EA_TOP]={
eaNames[U_EA_COUNT]={
"N", /* Non-East Asian Neutral, default for unassigned code points */
"A", /* Ambiguous, default for Private Use code points */
"H", /* Half-width */
@ -546,7 +562,7 @@ eaWidthLineFn(void *context,
++limit;
/* parse binary property name */
i=getTokenIndex(eaNames, U_EA_TOP, fields[1][0]);
i=getTokenIndex(eaNames, U_EA_COUNT, fields[1][0]);
if(i<0) {
fprintf(stderr, "genprops error: unknown width name \"%s\" in EastAsianWidth.txt\n", fields[1][0]);
*pErrorCode=U_PARSE_ERROR;
@ -557,7 +573,7 @@ eaWidthLineFn(void *context,
if(prevLimit==start && (uint32_t)i==prevValue) {
prevLimit=limit;
} else {
if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_WIDTH_SHIFT), UPROPS_EA_WIDTH_MASK, pErrorCode)) {
if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)) {
fprintf(stderr, "genprops error: unable to set East Asian Width: %s\n", u_errorName(*pErrorCode));
exit(*pErrorCode);
}
@ -567,6 +583,192 @@ eaWidthLineFn(void *context,
}
}
/* LineBreak.txt ------------------------------------------------------------ */
/*
 * Line break class abbreviations as they are spelled in field 1 of
 * LineBreak.txt, in the order of the parallel ULineBreak constants.
 * lineBreakLineFn() stores the index of the matching entry as the
 * line break code, so the order here must not change independently
 * of that enum.
 */
static const char *const
lbNames[U_LB_COUNT]={
    "XX", "AI", "AL",
    "B2", "BA", "BB", "BK",
    "CB", "CL", "CM", "CR",
    "EX",
    "GL",
    "HY",
    "ID", "IN", "IS",
    "LF",
    "NS", "NU",
    "OP",
    "PO", "PR",
    "QU",
    "SA", "SG", "SP", "SY",
    "ZW"
};
/*
 * Per-line callback for LineBreak.txt.
 *
 * fields[0] holds a code point or code point range, fields[1] holds the
 * line break class abbreviation (see lbNames[]).
 * The index of the abbreviation in lbNames[] is shifted by UPROPS_LB_SHIFT
 * and written, under UPROPS_LB_MASK, into column 2 of the properties
 * vectors (pv) for the whole range.
 *
 * context and fieldCount are not used.
 * Any error is reported on stderr and terminates the program via exit().
 */
static void
lineBreakLineFn(void *context,
                char *fields[][2], int32_t fieldCount,
                UErrorCode *pErrorCode) {
    uint32_t rangeStart, rangeLimit, lbBits;
    int32_t lbIndex;

    /* field 0: the code point range this line applies to */
    u_parseCodePointRange(fields[0][0], &rangeStart, &rangeLimit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in LineBreak.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* upvec_setValue() is given start and limit; presumably the parsed end is inclusive, hence +1 */
    rangeLimit+=1;

    /* field 1: the line break class, looked up by its abbreviation */
    lbIndex=getTokenIndex(lbNames, U_LB_COUNT, fields[1][0]);
    if(lbIndex<0) {
        fprintf(stderr, "genprops error: unknown line break name \"%s\" in LineBreak.txt\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* store the class index in its bit field */
    lbBits=(uint32_t)lbIndex<<UPROPS_LB_SHIFT;
    if(upvec_setValue(pv, rangeStart, rangeLimit, 2, lbBits, UPROPS_LB_MASK, pErrorCode)) {
        return;
    }

    fprintf(stderr, "genprops error: unable to set line break code: %s\n", u_errorName(*pErrorCode));
    exit(*pErrorCode);
}
/* ArabicShaping.txt -------------------------------------------------------- */
/*
 * Joining Type/Joining Group names in the order of the parallel
 * UJoiningType/UJoiningGroup constants.
 *
 * jtNames[] lists the one-letter joining type codes used in field 2 of
 * ArabicShaping.txt; the index of a code in this table is the value that
 * arabicShapingLineFn() stores.
 * According to the ArabicShaping.txt header: U non-joining, C join-causing
 * (ZWJ), D dual-joining, L left-joining (no such characters listed),
 * R right-joining, T transparent.
 */
static const char *const
jtNames[U_JT_COUNT]={
    "U", "C", "D", "L", "R", "T"
};
/*
 * Joining group names as spelled in field 3 of ArabicShaping.txt.
 * The index of a name in this table is the joining group value that
 * arabicShapingLineFn() stores, so the order must stay in sync with the
 * parallel UJoiningGroup constants.
 * Index 0 ("<no shaping>") is the entry for characters without a group.
 */
static const char *const
jgNames[U_JG_COUNT]={
    "<no shaping>",
    "AIN", "ALAPH", "ALEF",
    "BEH", "BETH",
    "DAL", "DALATH RISH",
    "E",
    "FEH", "FINAL SEMKATH",
    "GAF", "GAMAL",
    "HAH", "HAMZA ON HEH GOAL", "HE", "HEH", "HEH GOAL", "HETH",
    "KAF", "KAPH", "KNOTTED HEH",
    "LAM", "LAMADH",
    "MEEM", "MIM",
    "NOON", "NUN",
    "PE",
    "QAF", "QAPH",
    "REH", "REVERSED PE",
    "SAD", "SADHE", "SEEN", "SEMKATH", "SHIN", "SWASH KAF", "SYRIAC WAW",
    "TAH", "TAW", "TEH MARBUTA", "TETH",
    "WAW",
    "YEH", "YEH BARREE", "YEH WITH TAIL", "YUDH", "YUDH HE",
    "ZAIN"
};
/*
 * Per-line callback for ArabicShaping.txt.
 *
 * fields[0] holds the code point (or range), fields[2] the joining type
 * letter (see jtNames[]) and fields[3] the joining group name
 * (see jgNames[]); fields[1], the schematic name, is ignored.
 * Both table indexes are packed into their bit fields and written together
 * into column 2 of the properties vectors (pv).
 *
 * context and fieldCount are not used.
 * Any error is reported on stderr and terminates the program via exit().
 */
static void
arabicShapingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    uint32_t rangeStart, rangeLimit, packed;
    int32_t typeIndex, groupIndex;

    /* field 0: the code point(s) this line applies to */
    u_parseCodePointRange(fields[0][0], &rangeStart, &rangeLimit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in ArabicShaping.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* upvec_setValue() is given start and limit; presumably the parsed end is inclusive, hence +1 */
    rangeLimit+=1;

    /* field 2: joining type */
    typeIndex=getTokenIndex(jtNames, U_JT_COUNT, fields[2][0]);
    if(typeIndex<0) {
        fprintf(stderr, "genprops error: unknown joining type in \"%s\" in ArabicShaping.txt\n", fields[2][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* field 3: joining group */
    groupIndex=getTokenIndex(jgNames, U_JG_COUNT, fields[3][0]);
    if(groupIndex<0) {
        fprintf(stderr, "genprops error: unknown joining group in \"%s\" in ArabicShaping.txt\n", fields[3][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* set both bit fields with a single vector update */
    packed=((uint32_t)typeIndex<<UPROPS_JT_SHIFT)|((uint32_t)groupIndex<<UPROPS_JG_SHIFT);
    if(upvec_setValue(pv, rangeStart, rangeLimit, 2, packed, UPROPS_JT_MASK|UPROPS_JG_MASK, pErrorCode)) {
        return;
    }

    fprintf(stderr, "genprops error: unable to set joining type/group code: %s\n", u_errorName(*pErrorCode));
    exit(*pErrorCode);
}
/*
 * Parse ArabicShaping.txt and feed every data line to arabicShapingLineFn().
 *
 * filename   path buffer handed to u_parseDelimitedFile()
 * basename   passed to writeUCDFilename() together with "ArabicShaping" and
 *            suffix; presumably points into filename at the place where the
 *            file name is to be written - confirm against the caller
 * suffix     file name suffix passed through to writeUCDFilename()
 * pErrorCode in/out ICU error code; the function does nothing if it is NULL
 *            or already indicates a failure
 *
 * Unlike the line callback, a parse failure here is only reported on stderr;
 * the error code is left set for the caller.
 */
static void
parseArabicShaping(char *filename, char *basename,
                   const char *suffix,
                   UErrorCode *pErrorCode) {
    /* four semicolon-separated fields per line, each as a start/limit pair */
    char *fields[4][2];

    if(pErrorCode==NULL) {
        return;
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    writeUCDFilename(basename, "ArabicShaping", suffix);

    u_parseDelimitedFile(filename, ';', fields, 4, arabicShapingLineFn, NULL, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "error parsing ArabicShaping.txt: %s\n", u_errorName(*pErrorCode));
    }
}
/* data serialization ------------------------------------------------------- */
U_CFUNC int32_t
@ -593,6 +795,10 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
indexes[UPROPS_RESERVED_INDEX]=
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
indexes[UPROPS_MAX_VALUES_INDEX]=
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
((int32_t)USCRIPT_CODE_LIMIT-1);
}
if(p!=NULL && (pvCount*4)<=capacity) {