utf-8 change

X-SVN-Rev: 15005
This commit is contained in:
Mark Davis 2004-04-17 18:21:39 +00:00
parent 3055bdaa34
commit 7ca61b13cc
8 changed files with 315 additions and 284 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
* $Date: 2004/02/18 03:08:59 $
* $Revision: 1.11 $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.12 $
*
*******************************************************************************
*/
@ -782,11 +782,11 @@ abstract public class GenerateBreakTest implements UCD_Types {
public boolean isBreak(String source, int offset) {
setRule("1: sot ÷");
setRule("1: sot ÷");
if (offset < 0 || offset > source.length()) return false;
if (offset == 0) return true;
setRule("2: ÷ eot");
setRule("2: ÷ eot");
if (offset == source.length()) return true;
// UTF-16: never break in the middle of a code point
@ -801,29 +801,29 @@ abstract public class GenerateBreakTest implements UCD_Types {
byte before = getResolvedType(cpBefore);
byte after = getResolvedType(cpAfter);
setRule("3: CR × LF");
setRule("3: CR × LF");
if (before == CR && after == LF) return false;
setRule("4: ( Control | CR | LF ) ÷");
setRule("4: ( Control | CR | LF ) ÷");
if (before == CR || before == LF || before == Control) return true;
setRule("5: ÷ ( Control | CR | LF )");
setRule("5: ÷ ( Control | CR | LF )");
if (after == Control || after == LF || after == CR) return true;
setRule("6: L × ( L | V | LV | LVT )");
setRule("6: L × ( L | V | LV | LVT )");
if (before == L && (after == L || after == V || after == LV || after == LVT)) return false;
setRule("7: ( LV | V ) × ( V | T )");
setRule("7: ( LV | V ) × ( V | T )");
if ((before == LV || before == V) && (after == V || after == T)) return false;
setRule("8: ( LVT | T ) × T");
setRule("8: ( LVT | T ) × T");
if ((before == LVT || before == T) && (after == T)) return false;
setRule("9: × Extend");
setRule("9: × Extend");
if (after == Extend) return false;
// Otherwise break after all characters.
setRule("10: Any ÷ Any");
setRule("10: Any ÷ Any");
return true;
}
@ -914,12 +914,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
public boolean isBreak(String source, int offset) {
setRule("1: sot ÷");
setRule("1: sot ÷");
if (offset < 0 || offset > source.length()) return false;
if (offset == 0) return true;
setRule("2: ÷ eot");
setRule("2: ÷ eot");
if (offset == source.length()) return true;
// Treat a grapheme cluster as if it were a single character:
@ -943,43 +943,43 @@ abstract public class GenerateBreakTest implements UCD_Types {
//Don't break between most letters
setRule("5: ALetter × ALetter");
setRule("5: ALetter × ALetter");
if (before == ALetter && after == ALetter) return false;
// Dont break letters across certain punctuation
// Dont break letters across certain punctuation
setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
// Dont break within sequences of digits, or digits adjacent to letters.
// Dont break within sequences of digits, or digits adjacent to letters.
setRule("8: Numeric × Numeric");
setRule("8: Numeric × Numeric");
if (before == Numeric && after == Numeric) return false;
setRule("9: ALetter × Numeric");
setRule("9: ALetter × Numeric");
if (before == ALetter && after == Numeric) return false;
setRule("10: Numeric × ALetter");
setRule("10: Numeric × ALetter");
if (before == Numeric && after == ALetter) return false;
// Dont break within sequences like: '-3.2'
setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
// Dont break within sequences like: '-3.2'
setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false;
setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false;
// Don't break between Katakana
setRule("13: Katakana × Katakana");
setRule("13: Katakana × Katakana");
if (before == Katakana && after == Katakana) return false;
// Otherwise break always.
setRule("14: Any ÷ Any");
setRule("14: Any ÷ Any");
return true;
}
@ -1235,7 +1235,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
// LB 2a Never break at the start of text
setRule("2a: × sot");
setRule("2a: × sot");
if (offset <= 0) return false;
// LB 2b Always break at the end of text
@ -1269,26 +1269,26 @@ abstract public class GenerateBreakTest implements UCD_Types {
//byte after = getResolvedType(cpAfter);
setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !");
setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !");
// Always break after hard line breaks (but never between CR and LF).
// CR ^ LF
if (before == LB_CR && after == LB_LF) return false;
if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
//LB 3b Dont break before hard line breaks.
setRule("3b: × ( BK | CR | LF )");
//LB 3b Dont break before hard line breaks.
setRule("3b: × ( BK | CR | LF )");
if (after == LB_BK || after == LB_LF || after == LB_CR) return false;
// LB 4 Dont break before spaces or zero-width space.
setRule("4: × ( SP | ZW )");
// LB 4 Dont break before spaces or zero-width space.
setRule("4: × ( SP | ZW )");
if (after == LB_SP || after == LB_ZW) return false;
// LB 5 Break after zero-width space.
setRule("5: ZW ÷");
setRule("5: ZW ÷");
if (before == LB_ZW) return true;
// LB 6 Dont break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
// LB 6 Don't break graphemes (before combining marks, around virama or on sequences of conjoining Jamos).
setRule("6: DGC -> FC");
if (!grapheme.isBreak( source, offset)) return false;
@ -1324,9 +1324,9 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (setBase && backBase == -1) before = LB_AL;
// LB 8 Dont break before ] or ! or ; or /, even after spaces.
// × CL, × EX, × IS, × SY
setRule("8: × ( CL | EX | IS | SY )");
// LB 8 Dont break before ] or ! or ; or /, even after spaces.
// × CL, × EX, × IS, × SY
setRule("8: × ( CL | EX | IS | SY )");
if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false;
@ -1339,97 +1339,97 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
}
// LB 9 Dont break after [, even after spaces.
// OP SP* ×
setRule("9: OP SP* ×");
// LB 9 Dont break after [, even after spaces.
// OP SP* ×
setRule("9: OP SP* ×");
if (lastNonSpace == LB_OP) return false;
// LB 10 Dont break within [, , even with intervening spaces.
// QU SP* × OP
setRule("10: QU SP* × OP");
// LB 10 Don't break within '"[', even with intervening spaces.
// QU SP* × OP
setRule("10: QU SP* × OP");
if (lastNonSpace == LB_QU && after == LB_OP) return false;
// LB 11 Dont break within ]h, even with intervening spaces.
// CL SP* × NS
setRule("11: CL SP* × NS");
// LB 11 Don't break within ']h', even with intervening spaces.
// CL SP* × NS
setRule("11: CL SP* × NS");
if (lastNonSpace == LB_CL && after == LB_NS) return false;
// LB 11a Dont break within , even with intervening spaces.
// B2 × B2
setRule("11a: B2 × B2");
// LB 11a Don't break within '——' (two em dashes), even with intervening spaces.
// B2 × B2
setRule("11a: B2 × B2");
if (lastNonSpace == LB_B2 && after == LB_B2) return false;
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
setRule("11b: × WJ ; WJ ×");
setRule("11b: × WJ ; WJ ×");
if (after == LB_WJ || before == LB_WJ) return false;
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
// LB 12 Break after spaces
setRule("12: SP ÷");
setRule("12: SP ÷");
if (before == LB_SP) return true;
// LB 13 Dont break before or after NBSP or WORD JOINER
setRule("13: × GL ; GL ×");
// LB 13 Dont break before or after NBSP or WORD JOINER
setRule("13: × GL ; GL ×");
if (after == LB_GL || before == LB_GL) return false;
// LB 14 Dont break before or after
setRule("14: × QU ; QU ×");
// LB 14 Don't break before or after '"' (quotation marks)
setRule("14: × QU ; QU ×");
if (before == LB_QU || after == LB_QU) return false;
// LB 14a Break before and after CB
setRule("14a: ÷ CB ; CB ÷");
setRule("14a: ÷ CB ; CB ÷");
if (before == LB_CB || after == LB_CB) return true;
// LB 15 Dont break before hyphen-minus, other hyphens, fixed-width spaces,
// LB 15 Don't break before hyphen-minus, other hyphens, fixed-width spaces,
// small kana and other non-starters, or after acute accents:
setRule("15: × ( BA | HY | NS ) ; BB ×");
setRule("15: × ( BA | HY | NS ) ; BB ×");
if (after == LB_NS) return false;
if (after == LB_HY) return false;
if (after == LB_BA) return false;
if (before == LB_BB) return false;
//setRule("15a: HY × NU"); // NEW
//setRule("15a: HY × NU"); // NEW
//if (before == LB_HY && after == LB_NU) return false;
// LB 16 Dont break between two ellipses, or between letters or numbers and ellipsis:
// Examples: 9..., a..., H...
setRule("16: ( AL | ID | IN | NU ) × IN");
// LB 16 Dont break between two ellipses, or between letters or numbers and ellipsis:
// Examples: 9..., a..., H...
setRule("16: ( AL | ID | IN | NU ) × IN");
if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
if (before == LB_IN && after == LB_IN) return false;
// Don't break alphanumerics.
// LB 17 Dont break within a9, 3a, or H%
// LB 17 Dont break within a9, 3a, or H%
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
// This is approximated with the following rules. (Some cases already handled above,
// like 9,, [9.)
setRule("17: ID × PO ; AL × NU; NU × AL");
// like 9,, [9.)
setRule("17: ID × PO ; AL × NU; NU × AL");
if (before == LB_ID && after == LB_PO) return false;
if (before == LB_AL && after == LB_NU) return false;
if (before == LB_NU && after == LB_AL) return false;
// LB 18 Dont break between the following pairs of classes.
// CL × PO
// HY × NU
// IS × NU
// NU × NU
// NU × PO
// PR × AL
// PR × HY
// PR × ID
// PR × NU
// PR × OP
// SY × NU
// Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]%
// LB 18 Dont break between the following pairs of classes.
// CL × PO
// HY × NU
// IS × NU
// NU × NU
// NU × PO
// PR × AL
// PR × HY
// PR × ID
// PR × NU
// PR × OP
// SY × NU
// Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]%
setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP )");
setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP )");
if (before == LB_CL && after == LB_PO) return false;
if (before == LB_IS && after == LB_NU) return false;
if (before == LB_NU && after == LB_NU) return false;
@ -1446,30 +1446,30 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (before == LB_SY && after == LB_NU) return false;
// LB 15b Break after hyphen-minus, and before acute accents:
setRule("18b: HY ÷ ; ÷ BB");
setRule("18b: HY ÷ ; ÷ BB");
if (before == LB_HY) return true;
if (after == LB_BB) return true;
// LB 19 Dont break between alphabetics (at)
// AL × AL
// LB 19 Dont break between alphabetics (at)
// AL × AL
setRule("19: AL × AL");
setRule("19: AL × AL");
if (before == LB_AL && after == LB_AL) return false;
// LB 20 Break everywhere else
// ALL ÷
// ÷ ALL
// ALL ÷
// ÷ ALL
if (ucd.getCompositeVersion() > 0x040000) {
setRule("19b: IS × AL");
setRule("19b: IS × AL");
if (before == LB_IS && after == LB_AL) return false;
}
// LB 20 Break everywhere else
// ALL ÷
// ÷ ALL
// ALL ÷
// ÷ ALL
setRule("20: ALL ÷ ; ÷ ALL");
setRule("20: ALL ÷ ; ÷ ALL");
return true;
}
}
@ -1498,8 +1498,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
"U.S.A\u0300.",
"3.4",
"c.d",
"etc.)\u2019 \u2018(the",
"etc.)\u2019 \u2018(The",
"etc.)\u2019 \u2018(the",
"etc.)\u2019 \u2018(The",
"the resp. leaders are",
"\u5B57.\u5B57",
"etc.\u5B83",
@ -1631,15 +1631,15 @@ abstract public class GenerateBreakTest implements UCD_Types {
public boolean isBreak(String source, int offset) {
// Break at the start and end of text.
setRule("1: sot ÷");
setRule("1: sot ÷");
if (offset < 0 || offset > source.length()) return false;
if (offset == 0) return true;
setRule("2: ÷ eot");
setRule("2: ÷ eot");
if (offset == source.length()) return true;
setRule("3: Sep ÷");
setRule("3: Sep ÷");
byte beforeChar = getResolvedType(source.charAt(offset-1));
if (beforeChar == Sep) return true;
@ -1662,22 +1662,22 @@ abstract public class GenerateBreakTest implements UCD_Types {
// HACK COPY for rule collection!
if (collectingRules) {
setRule("6: ATerm × ( Numeric | Lower )");
setRule("7: Upper ATerm × Upper");
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
setRule("12: Any × Any");
setRule("6: ATerm × ( Numeric | Lower )");
setRule("7: Upper ATerm × Upper");
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
setRule("12: Any × Any");
collectingRules = false;
}
// Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
if (before == ATerm) {
setRule("6: ATerm × ( Numeric | Lower )");
setRule("6: ATerm × ( Numeric | Lower )");
if (after == Lower || after == Numeric) return false;
setRule("7: Upper ATerm × Upper");
setRule("7: Upper ATerm × Upper");
if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper);
if (before2 == Upper && after == Upper) return false;
}
@ -1736,17 +1736,17 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (lookAfter == -1) {
// Otherwise, do not break
// Any × Any (11)
setRule("12: Any × Any");
// Any × Any (11)
setRule("12: Any × Any");
return false;
}
// ATerm Close* Sp*×(¬( OLetter))* Lower(8)
// ATerm Close* Sp*×(¬( OLetter))* Lower(8)
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(9)
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(10)
// ( Term | ATerm ) Close* Sp*÷(11)
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(9)
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(10)
// ( Term | ATerm ) Close* Sp*÷(11)
// We DID find one. Loop to see if the right side is ok.
@ -1764,16 +1764,16 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (isFirst) {
isFirst = false;
if (lookAfter == ATerm && t == Upper) {
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
return false;
}
if (gotSpace) {
if (t == Sp || t == Sep) {
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
return false;
}
} else if (t == Close || t == Sp || t == Sep) {
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
return false;
}
if (lookAfter == Term) break;
@ -1782,12 +1782,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
if (t != OLetter && t != Upper && t != Lower) continue;
if (t == Lower) {
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
return false;
}
break;
}
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
return true;
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2004/02/07 01:01:15 $
* $Revision: 1.14 $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.15 $
*
*******************************************************************************
*/
@ -547,18 +547,18 @@ public final class GenerateHanTransliterator implements UCD_Types {
"e", "ei", "er", "en", "eng",
"i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong",
"u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng",
"ü", "üe", "üan", "ün"
"ü", "üe", "üan", "ün"
};
// Don't bother with the following rules; just add w,y to initials
// When i stands alone, a y will be added before it as yi.
// If i is the first letter of the syllable it will be changed to y.
// When u stands alone, a w will be added before it as wu.
// If u is the first letter of the syllable it will be changed to w. e.g. uang -> wang.
// When ü stands alone, a y will be added before it and ü will be changed to u as yu.
// If ü is the first letter of the syllable, then the spelling will be changed to yu. e.g. üan -> yuan.
//Note: The nasal final ueng never occurs after an initial but always form a syllable by itself.
// The o in iou is hidden, so it will be wrote as iu. But, dont forget to pronounce it.
// The e in uei is hidden, so it will be wrote as ui. But, dont forget to pronounce it.
// When i stands alone, a y will be added before it as yi.
// If i is the first letter of the syllable it will be changed to y.
// When u stands alone, a w will be added before it as wu.
// If u is the first letter of the syllable it will be changed to w. e.g. uang -> wang.
// When ü stands alone, a y will be added before it and ü will be changed to u as yu.
// If ü is the first letter of the syllable, then the spelling will be changed to yu. e.g. üan -> yuan.
// Note: The nasal final ueng never occurs after an initial but always forms a syllable by itself.
// The o in iou is hidden, so it will be written as iu. But don't forget to pronounce it.
// The e in uei is hidden, so it will be written as ui. But don't forget to pronounce it.
public static final String[] pinyin_bopomofo = {
@ -749,9 +749,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
"long", "\u310c\u3128\u3125",
"lou", "\u310c\u3121",
"lu", "\u310c\u3128",
"", "\u310c\u3129",
"", "\u310c\u3129",
"luan", "\u310c\u3128\u3122",
"lüe", "\u310c\u3129\u311d",
"lüe", "\u310c\u3129\u311d",
"lun", "\u310c\u3128\u3123",
"luo", "\u310c\u3128\u311b",
"m", "\u3107",
@ -796,9 +796,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
"nong", "\u310b\u3128\u3125",
"nou", "\u310b\u3121",
"nu", "\u310b\u3128",
"", "\u310b\u3129",
"", "\u310b\u3129",
"nuan", "\u310b\u3128\u3122",
"nüe", "\u310b\u3129\u311d",
"nüe", "\u310b\u3129\u311d",
"nuo", "\u310b\u3128\u311b",
"o", "\u311b",
"ou", "\u3121",
@ -1007,52 +1007,52 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
/*
U+347C · liù #lyuè
U+3500 · lüè #lvè
U+3527 · liù #lyù
U+3729 · ào #àu
U+380E · #jjí
U+3825 · l· #lv·
U+3A3C · lüè #luè
U+3B5A · li· #ly· *** ?
U+3CB6 · l· #lv·
U+3D56 · niù #nyù *** ?
U+3D88 · li·ng #li·ng
U+3EF2 · li· #ly·*** ?
U+3F94 · li· #ly·*** ?
U+4071 · ào #àu
U+40AE · liù #lyuè *** lüe?
U+430E · liù #lyuè *** lüe?
U+451E · liù #lyù *** ?
U+4588 · nüè #nuè
U+458B · nüè #nuè
U+45A1 · niù #nyù *** ?
U+4610 · niù #nyù *** ?
U+46BC · niù #nyù *** ?
U+46DA · liù #lyuè *** lüe?
U+4896 · liù #lyù *** ?
U+4923 · liù #lyuè *** lüe?
U+4968 · liù #lyù *** ?
U+4A0B · niù #nyuè *** nüe?
U+4AC4 · chuò #chuà
U+4D08 · ·o #·u
U+4D8A · niù #nyù *** ?
U+51CA · qíng #qýng
U+51D6 · zhu·n #zhu·n *** this is probably zh·n
U+5481 · gàn #gèm
U+5838 · féng #fúng
U+639F · · #lu· *** this pronunciation surprises me, but I don't know...
U+66D5 · yàn #yiàn
U+6B3B · chu· #chu· *** chua _is_ ok after all, my table missed an entry
U+6B56 · chu· #chu· *** chua
U+6C7C · ni· #ni·u
U+6E6D · qiú #qióu
U+6F71 · y· #yi·
U+7493 · xiù #xiòu
U+7607 · zh·ng #zh·ng *** I suspect zh·ng
U+7674 · luán #lüán
U+7867 · y·ng #i·ng
U+7878 · nüè #nuè
U+347C · liù #lyuè
U+3500 · lüè #lvè
U+3527 · liù #lyù
U+3729 · ào #àu
U+380E · #jjí
U+3825 · l· #lv·
U+3A3C · lüè #luè
U+3B5A · li· #ly· *** ?
U+3CB6 · l· #lv·
U+3D56 · niù #nyù *** ?
U+3D88 · li·ng #li·ng
U+3EF2 · li· #ly·*** ?
U+3F94 · li· #ly·*** ?
U+4071 · ào #àu
U+40AE · liù #lyuè *** lüe?
U+430E · liù #lyuè *** lüe?
U+451E · liù #lyù *** ?
U+4588 · nüè #nuè
U+458B · nüè #nuè
U+45A1 · niù #nyù *** ?
U+4610 · niù #nyù *** ?
U+46BC · niù #nyù *** ?
U+46DA · liù #lyuè *** lüe?
U+4896 · liù #lyù *** ?
U+4923 · liù #lyuè *** lüe?
U+4968 · liù #lyù *** ?
U+4A0B · niù #nyuè *** nüe?
U+4AC4 · chuò #chuà
U+4D08 · ·o #·u
U+4D8A · niù #nyù *** ?
U+51CA · qíng #qýng
U+51D6 · zhu·n #zhu·n *** this is probably zh·n
U+5481 · gàn #gèm
U+5838 · féng #fúng
U+639F · · #lu· *** this pronunciation surprises me, but I don't know...
U+66D5 · yàn #yiàn
U+6B3B · chu· #chu· *** chua _is_ ok after all, my table missed an entry
U+6B56 · chu· #chu· *** chua
U+6C7C · ni· #ni·u
U+6E6D · qiú #qióu
U+6F71 · y· #yi·
U+7493 · xiù #xiòu
U+7607 · zh·ng #zh·ng *** I suspect zh·ng
U+7674 · luán #lüán
U+7867 · y·ng #i·ng
U+7878 · nüè #nuè
*/
static Transliterator fixTypos = Transliterator.createFromRules("fix_typos",
@ -1061,12 +1061,12 @@ U+7878
+"$cons{iou}$nlet > iu;"
+"$cons{em}$nlet > an;"
+"$cons{uen}$nlet > ueng;"
+"$cons{ve}$nlet > üe;"
+"$cons{v}$nlet > ü;"
+"$cons{ve}$nlet > üe;"
+"$cons{v}$nlet > ü;"
+"$cons{yue}$nlet > iu;"
+"$cons{yng}$nlet > ing;"
+"$cons{yu}$nlet > iu;"
//+"$cons{ue} > üe;"
//+"$cons{ue} > üe;"
+"jj > j;"
//+"$nlet{ng}$nlet > eng;"
//+"$nlet{n}$nlet > en;"
@ -1076,13 +1076,13 @@ U+7878
// new fixes
+"zhueng}$nlet > zhong;"
+"zhuen}$nlet > zhuan;"
+"lue > lüe;"
+"lue > lüe;"
+"liong > liang;"
+"nue > nüe;"
+"nue > nüe;"
+"chua > chuo;"
+"yian > yan;"
+"yie > ye;"
+"lüan > luan;"
+"lüan > luan;"
+"iong > yong;"
, Transliterator.FORWARD);
@ -1113,7 +1113,7 @@ U+7878
try {
// chinese_frequency.txt
// 1 çš 1588561 1588561 3.5008%
// 1 的 1588561 1588561 3.5008%
// japanese_frequency.txt
// 1 ? 17176
@ -1421,7 +1421,7 @@ U+7878
@Unihan Data
Bad pinyin data: \u4E7F ? LE
\u7684 ? de, de, ,
\u7684 ? de, de, ,
*/
static void fixChineseOverrides() throws IOException {
@ -2024,7 +2024,7 @@ Bad pinyin data: \u4E7F ? LE
+ "# otherwise 'o'\n"
+ "# otherwise last vowel\n"
+ "::NFC;\n"
+ "$vowel = [aAeEiIoOuUüÜ];\n"
+ "$vowel = [aAeEiIoOuUüÜ];\n"
+ "$consonant = [[a-z A-Z] - [$vowel]];\n"
+ "$digit = [1-5];\n"
+ "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
@ -2054,10 +2054,10 @@ Bad pinyin data: \u4E7F ? LE
if (i > 0) {
char last = result.charAt(result.length()-1);
if (last == 'u') {
result.setCharAt(result.length()-1, 'ü');
result.setCharAt(result.length()-1, 'ü');
continue main;
} else if (last == 'U') {
result.setCharAt(result.length()-1, 'Ü');
result.setCharAt(result.length()-1, 'Ü');
continue main;
}
}
@ -2085,22 +2085,22 @@ Bad pinyin data: \u4E7F ? LE
for (int i = source.length()-2; i >= 0; --i) {
ch = source.charAt(i);
if (ch == ':') {
ch = 'Ü';
ch = 'Ü';
--i;
}
if ('0' <= ch && ch <= '9') break;
if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) {
if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) {
Utility.fixDot();
System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")");
break;
}
if (!gotIt) switch (ch) {
case 'A': ch = "\u0102À\u0100".charAt(num); gotIt = true; break;
case 'E': ch = "\u0114È\u0112".charAt(num); gotIt = true; break;
case 'I': ch = "\u012CÌ\u012A".charAt(num); gotIt = true; break;
case 'O': ch = "\u014EÒ\u014C".charAt(num); gotIt = true; break;
case 'U': ch = "\u016CÙ\u016A".charAt(num); gotIt = true; break;
case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
case 'A': ch = "\u0102À\u0100".charAt(num); gotIt = true; break;
case 'E': ch = "\u0114È\u0112".charAt(num); gotIt = true; break;
case 'I': ch = "\u012CÌ\u012A".charAt(num); gotIt = true; break;
case 'O': ch = "\u014EÒ\u014C".charAt(num); gotIt = true; break;
case 'U': ch = "\u016CÙ\u016A".charAt(num); gotIt = true; break;
case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
}
handlePinyinTemp.insert(0,ch);
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
* $Date: 2004/02/07 01:01:15 $
* $Revision: 1.4 $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -398,23 +398,23 @@ public class GenerateLineBreakTest implements UCD_Types {
if (before == LB_CR && after == LB_LF) return false;
if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
//LB 3b Dont break before hard line breaks.
//LB 3b Dont break before hard line breaks.
rule="3b";
if (after == LB_BK || after == LB_LF | after == LB_CR) return false;
// LB 4 Dont break before spaces or zero-width space.
// × SP
// × ZW
// LB 4 Dont break before spaces or zero-width space.
// × SP
// × ZW
rule="4";
if (after == LB_SP || after == LB_ZW) return false;
// LB 5 Break after zero-width space.
// ZW ÷
// ZW ÷
rule="5";
if (before == LB_ZW) return true;
// LB 6 Dont break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
// LB 6 Don't break graphemes (before combining marks, around virama or on sequences of conjoining Jamos).
rule="6";
if (after == LB_CM) return false;
@ -441,8 +441,8 @@ public class GenerateLineBreakTest implements UCD_Types {
rule="7";
if (setBase && before == LB_SP) before = LB_ID;
// LB 8 Dont break before ] or ! or ; or /, even after spaces.
// × CL, × EX, × IS, × SY
// LB 8 Dont break before ] or ! or ; or /, even after spaces.
// × CL, × EX, × IS, × SY
rule="8";
if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false;
@ -456,31 +456,31 @@ public class GenerateLineBreakTest implements UCD_Types {
}
}
// LB 9 Dont break after [, even after spaces.
// OP SP* ×
// LB 9 Dont break after [, even after spaces.
// OP SP* ×
rule="9";
if (lastNonSpace == LB_OP) return false;
// LB 10 Dont break within [, , even with intervening spaces.
// QU SP* × OP
// LB 10 Don't break within '"[', even with intervening spaces.
// QU SP* × OP
rule="10";
if (lastNonSpace == LB_QU && after == LB_OP) return false;
// LB 11 Dont break within ]h, even with intervening spaces.
// CL SP* × NS
// LB 11 Don't break within ']h', even with intervening spaces.
// CL SP* × NS
rule="11";
if (lastNonSpace == LB_CL && after == LB_NS) return false;
// LB 11a Dont break within , even with intervening spaces.
// B2 × B2
// LB 11a Don't break within '——' (two em dashes), even with intervening spaces.
// B2 × B2
rule="11a";
if (lastNonSpace == LB_B2 && after == LB_B2) return false;
if (recommended) {
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
rule="11b";
if (after == LB_GL || before == LB_GL) return false;
@ -490,36 +490,36 @@ public class GenerateLineBreakTest implements UCD_Types {
rule="12";
// LB 12 Break after spaces
// SP ÷
// SP ÷
if (before == LB_SP) return true;
if (!recommended) {
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
rule="13";
if (after == LB_GL || before == LB_GL) return false;
}
rule="14";
// LB 14 Dont break before or after
// × QU
// QU ×
// LB 14 Don't break before or after '"' (quotation marks)
// × QU
// QU ×
if (before == LB_QU || after == LB_QU) return false;
// LB 15 Dont break before hyphen-minus, other hyphens, fixed-width spaces,
// LB 15 Don't break before hyphen-minus, other hyphens, fixed-width spaces,
// small kana and other non-starters, or after acute accents:
// × BA
// × HY
// × NS
// BB ×
// × BA
// × HY
// × NS
// BB ×
if (recommended) {
// LB 14a Break before and after CB
// CB ÷
// ÷ CB
// CB ÷
// ÷ CB
if (before == LB_CB || after == LB_CB) return true;
}
@ -532,51 +532,51 @@ public class GenerateLineBreakTest implements UCD_Types {
if (!recommended) {
// LB 15b Break after hyphen-minus, and before acute accents:
// HY ÷
// ÷ BB
// HY ÷
// ÷ BB
rule="15b";
if (before == LB_HY) return true;
if (after == LB_BB) return true;
}
// LB 16 Dont break between two ellipses, or between letters or numbers and ellipsis:
// AL × IN
// ID × IN
// IN × IN
// NU × IN
// Examples: 9..., a..., H...
// LB 16 Dont break between two ellipses, or between letters or numbers and ellipsis:
// AL × IN
// ID × IN
// IN × IN
// NU × IN
// Examples: 9..., a..., H...
rule="16";
if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
if (before == LB_IN && after == LB_IN) return false;
// Don't break alphanumerics.
// LB 17 Dont break within a9, 3a, or H%
// ID × PO
// AL × NU
// NU × AL
// LB 17 Dont break within a9, 3a, or H%
// ID × PO
// AL × NU
// NU × AL
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
// This is approximated with the following rules. (Some cases already handled above,
// like 9,, [9.)
// like 9,, [9.)
rule="17";
if (before == LB_ID && after == LB_PO) return false;
if (before == LB_AL && after == LB_NU) return false;
if (before == LB_NU && after == LB_AL) return false;
// LB 18 Dont break between the following pairs of classes.
// CL × PO
// HY × NU
// IS × NU
// NU × NU
// NU × PO
// PR × AL
// PR × HY
// PR × ID
// PR × NU
// PR × OP
// SY × NU
// Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]%
// LB 18 Dont break between the following pairs of classes.
// CL × PO
// HY × NU
// IS × NU
// NU × NU
// NU × PO
// PR × AL
// PR × HY
// PR × ID
// PR × NU
// PR × OP
// SY × NU
// Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]%
rule="18";
if (before == LB_CL && after == LB_PO) return false;
@ -595,23 +595,23 @@ public class GenerateLineBreakTest implements UCD_Types {
if (recommended) {
// LB 15b Break after hyphen-minus, and before acute accents:
// HY ÷
// ÷ BB
// HY ÷
// ÷ BB
rule="18b";
if (before == LB_HY) return true;
if (after == LB_BB) return true;
}
// LB 19 Dont break between alphabetics (at)
// AL × AL
// LB 19 Dont break between alphabetics (at)
// AL × AL
rule="19";
if (before == LB_AL && after == LB_AL) return false;
// LB 20 Break everywhere else
// ALL ÷
// ÷ ALL
// ALL ÷
// ÷ ALL
rule="20";
return true;
@ -754,7 +754,7 @@ public class GenerateLineBreakTest implements UCD_Types {
// Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
rule = "12";
//Link Extend* × LetterBase (12)
//Link Extend* × LetterBase (12)
if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) {
int backOffset = findLastNon(source, offset, Extend, recommended);
if (backOffset >= 0) {

View file

@ -1169,6 +1169,7 @@ public class MakeUnicodeFiles {
String line = in.readLine();
if (line == null) break;
if (line.startsWith("\uFEFF")) line = line.substring(1);
out.println(line);
line = line.trim();
int pos = line.indexOf('#');
if (pos >= 0) line = line.substring(0,pos).trim();
@ -1232,9 +1233,9 @@ public class MakeUnicodeFiles {
break;
default: throw new IllegalArgumentException("Internal Error");
}
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH) + ":\t" + line);
if (ok) continue;
out.println();
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
out.println("**** START Error Info ****");
bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
out.println("**** END Error Info ****");

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2004/02/06 18:30:20 $
* $Revision: 1.15 $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -22,7 +22,7 @@ import com.ibm.text.utility.*;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages

View file

@ -10,7 +10,7 @@ import com.ibm.text.utility.*;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages

View file

@ -41,6 +41,8 @@
#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
$GC:Zs ? $Name:«.*SPACE.*»
[$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
# Examples of parsing errors
# $LBA:Neutral = $GC:Zp # example of non-existent property
@ -54,7 +56,35 @@ $Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
$LB:OP = $GC:Ps
$General_Category:Decimal_Number = $Numeric_Type:Decimal
$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
# Comparisons across versions
$ID_Start ⊇ $×ID_Start
$ID_Continue ⊇ $×ID_Continue
#$age:4.0.1 = $age4.0.0
# Derivations
$Math = [$GC:Sm $Other_Math]
$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
$Lowercase = [$GC:Ll $Other_Lowercase]
$Uppercase = [$GC:Lu $Other_Uppercase]
$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc]
$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]
# "Minimal" Other_: NOT hard requirements; just if we want to be minimal
$Other_Math = [$Math - $GC:Sm]
$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
$Other_Lowercase = [$Lowercase - $GC:Ll]
$Other_Uppercase = [$Uppercase - $GC:Lu]
$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]]
$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
# Testing
$script:greek = $×script:greek

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF32.java,v $
* $Date: 2001/08/31 00:19:16 $
* $Revision: 1.2 $
* $Date: 2004/04/17 18:21:38 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -53,28 +53,28 @@ package com.ibm.text.utility;
<pre>
// iteration forwards: Original
for (int i = 0; i < s.length(); ++i) {
    char ch = s.charAt(i);
    doSomethingWith(ch);
    char ch = s.charAt(i);
    doSomethingWith(ch);
}
// iteration forwards: Changes for UTF-32
int ch;
for (int i = 0; i < s.length(); i+=UTF32.count16(ch)) {
    ch = UTF32.char32At(s,i);
    doSomethingWith(ch);
    ch = UTF32.char32At(s,i);
    doSomethingWith(ch);
}
// iteration backwards: Original
for (int i = s.length()-1; i >= 0; --i) {
    char ch = s.charAt(i);
    doSomethingWith(ch);
    char ch = s.charAt(i);
    doSomethingWith(ch);
}
// iteration backwards: Changes for UTF-32
int ch;
for (int i = s.length()-1; i >= 0; i-=UTF32.count16(ch)) {
    ch = UTF32.char32At(s,i);
    doSomethingWith(ch);
    ch = UTF32.char32At(s,i);
    doSomethingWith(ch);
}
* </pre>