diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java index cf1c99d5bc6..9f7237e516b 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $ -* $Date: 2004/02/18 03:08:59 $ -* $Revision: 1.11 $ +* $Date: 2004/04/17 18:21:39 $ +* $Revision: 1.12 $ * ******************************************************************************* */ @@ -782,11 +782,11 @@ abstract public class GenerateBreakTest implements UCD_Types { public boolean isBreak(String source, int offset) { - setRule("1: sot "); + setRule("1: sot ÷"); if (offset < 0 || offset > source.length()) return false; if (offset == 0) return true; - setRule("2: eot"); + setRule("2: ÷ eot"); if (offset == source.length()) return true; // UTF-16: never break in the middle of a code point @@ -801,29 +801,29 @@ abstract public class GenerateBreakTest implements UCD_Types { byte before = getResolvedType(cpBefore); byte after = getResolvedType(cpAfter); - setRule("3: CR LF"); + setRule("3: CR × LF"); if (before == CR && after == LF) return false; - setRule("4: ( Control | CR | LF ) "); + setRule("4: ( Control | CR | LF ) ÷"); if (before == CR || before == LF || before == Control) return true; - setRule("5: ( Control | CR | LF )"); + setRule("5: ÷ ( Control | CR | LF )"); if (after == Control || after == LF || after == CR) return true; - setRule("6: L ( L | V | LV | LVT )"); + setRule("6: L × ( L | V | LV | LVT )"); if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; - setRule("7: ( LV | V ) ( V | T )"); + setRule("7: ( LV | V ) × ( V | T )"); if ((before == LV || before == V) && (after == V || after == T)) return false; - setRule("8: ( LVT | T ) T"); + setRule("8: ( LVT | T ) × T"); if ((before == LVT || before == T) && (after == T)) return false; - setRule("9: Extend"); + setRule("9: × Extend"); if (after == Extend) return false; // Otherwise break after all characters. - setRule("10: Any Any"); + setRule("10: Any ÷ Any"); return true; } @@ -914,12 +914,12 @@ abstract public class GenerateBreakTest implements UCD_Types { public boolean isBreak(String source, int offset) { - setRule("1: sot "); + setRule("1: sot ÷"); if (offset < 0 || offset > source.length()) return false; if (offset == 0) return true; - setRule("2: eot"); + setRule("2: ÷ eot"); if (offset == source.length()) return true; // Treat a grapheme cluster as if it were a single character: @@ -943,43 +943,43 @@ abstract public class GenerateBreakTest implements UCD_Types { //Don't break between most letters - setRule("5: ALetter ALetter"); + setRule("5: ALetter × ALetter"); if (before == ALetter && after == ALetter) return false; - // Dont break letters across certain punctuation + // Don’t break letters across certain punctuation - setRule("6: ALetter (MidLetter | MidNumLet) ALetter"); + setRule("6: ALetter × (MidLetter | MidNumLet) ALetter"); if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false; - setRule("7: ALetter (MidLetter | MidNumLet) ALetter"); + setRule("7: ALetter (MidLetter | MidNumLet) × ALetter"); if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false; - // Dont break within sequences of digits, or digits adjacent to letters. + // Don’t break within sequences of digits, or digits adjacent to letters. - setRule("8: Numeric Numeric"); + setRule("8: Numeric × Numeric"); if (before == Numeric && after == Numeric) return false; - setRule("9: ALetter Numeric"); + setRule("9: ALetter × Numeric"); if (before == ALetter && after == Numeric) return false; - setRule("10: Numeric ALetter"); + setRule("10: Numeric × ALetter"); if (before == Numeric && after == ALetter) return false; - // Dont break within sequences like: '-3.2' - setRule("11: Numeric (MidNum | MidNumLet) Numeric"); + // Don’t break within sequences like: '-3.2' + setRule("11: Numeric (MidNum | MidNumLet) × Numeric"); if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false; - setRule("12: Numeric (MidNum | MidNumLet) Numeric"); + setRule("12: Numeric × (MidNum | MidNumLet) Numeric"); if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false; // Don't break between Katakana - setRule("13: Katakana Katakana"); + setRule("13: Katakana × Katakana"); if (before == Katakana && after == Katakana) return false; // Otherwise break always. - setRule("14: Any Any"); + setRule("14: Any ÷ Any"); return true; } @@ -1235,7 +1235,7 @@ abstract public class GenerateBreakTest implements UCD_Types { // LB 2a Never break at the start of text - setRule("2a: sot"); + setRule("2a: × sot"); if (offset <= 0) return false; // LB 2b Always break at the end of text @@ -1269,26 +1269,26 @@ abstract public class GenerateBreakTest implements UCD_Types { //byte after = getResolvedType(cpAfter); - setRule("3a: CR LF ; ( BK | CR | LF | NL ) !"); + setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !"); // Always break after hard line breaks (but never between CR and LF). // CR ^ LF if (before == LB_CR && after == LB_LF) return false; if (before == LB_BK || before == LB_LF || before == LB_CR) return true; - //LB 3b Dont break before hard line breaks. - setRule("3b: ( BK | CR | LF )"); + //LB 3b Don’t break before hard line breaks. + setRule("3b: × ( BK | CR | LF )"); if (after == LB_BK || after == LB_LF || after == LB_CR) return false; - // LB 4 Dont break before spaces or zero-width space. - setRule("4: ( SP | ZW )"); + // LB 4 Don’t break before spaces or zero-width space. + setRule("4: × ( SP | ZW )"); if (after == LB_SP || after == LB_ZW) return false; // LB 5 Break after zero-width space. - setRule("5: ZW "); + setRule("5: ZW ÷"); if (before == LB_ZW) return true; - // LB 6 Dont break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. + // LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. setRule("6: DGC -> FC"); if (!grapheme.isBreak( source, offset)) return false; @@ -1324,9 +1324,9 @@ abstract public class GenerateBreakTest implements UCD_Types { if (setBase && backBase == -1) before = LB_AL; - // LB 8 Dont break before ] or ! or ; or /, even after spaces. - // CL, EX, IS, SY - setRule("8: ( CL | EX | IS | SY )"); + // LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. + // × CL, × EX, × IS, × SY + setRule("8: × ( CL | EX | IS | SY )"); if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false; @@ -1339,97 +1339,97 @@ abstract public class GenerateBreakTest implements UCD_Types { } } - // LB 9 Dont break after [, even after spaces. - // OP SP* - setRule("9: OP SP* "); + // LB 9 Don’t break after ‘[’, even after spaces. + // OP SP* × + setRule("9: OP SP* ×"); if (lastNonSpace == LB_OP) return false; - // LB 10 Dont break within [, , even with intervening spaces. - // QU SP* OP - setRule("10: QU SP* OP"); + // LB 10 Don’t break within ‘”[’, , even with intervening spaces. + // QU SP* × OP + setRule("10: QU SP* × OP"); if (lastNonSpace == LB_QU && after == LB_OP) return false; - // LB 11 Dont break within ]h, even with intervening spaces. - // CL SP* NS - setRule("11: CL SP* NS"); + // LB 11 Don’t break within ‘]h’, even with intervening spaces. + // CL SP* × NS + setRule("11: CL SP* × NS"); if (lastNonSpace == LB_CL && after == LB_NS) return false; - // LB 11a Dont break within , even with intervening spaces. - // B2 B2 - setRule("11a: B2 B2"); + // LB 11a Don’t break within ‘——’, even with intervening spaces. + // B2 × B2 + setRule("11a: B2 × B2"); if (lastNonSpace == LB_B2 && after == LB_B2) return false; - // LB 13 Dont break before or after NBSP or WORD JOINER - // GL - // GL + // LB 13 Don’t break before or after NBSP or WORD JOINER + // × GL + // GL × - setRule("11b: WJ ; WJ "); + setRule("11b: × WJ ; WJ ×"); if (after == LB_WJ || before == LB_WJ) return false; // [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.] // LB 12 Break after spaces - setRule("12: SP "); + setRule("12: SP ÷"); if (before == LB_SP) return true; - // LB 13 Dont break before or after NBSP or WORD JOINER - setRule("13: GL ; GL "); + // LB 13 Don’t break before or after NBSP or WORD JOINER + setRule("13: × GL ; GL ×"); if (after == LB_GL || before == LB_GL) return false; - // LB 14 Dont break before or after - setRule("14: QU ; QU "); + // LB 14 Don’t break before or after ‘”’ + setRule("14: × QU ; QU ×"); if (before == LB_QU || after == LB_QU) return false; // LB 14a Break before and after CB - setRule("14a: CB ; CB "); + setRule("14a: ÷ CB ; CB ÷"); if (before == LB_CB || after == LB_CB) return true; - // LB 15 Dont break before hyphen-minus, other hyphens, fixed-width spaces, + // LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces, // small kana and other non- starters, or after acute accents: - setRule("15: ( BA | HY | NS ) ; BB "); + setRule("15: × ( BA | HY | NS ) ; BB ×"); if (after == LB_NS) return false; if (after == LB_HY) return false; if (after == LB_BA) return false; if (before == LB_BB) return false; - //setRule("15a: HY NU"); // NEW + //setRule("15a: HY × NU"); // NEW //if (before == LB_HY && after == LB_NU) return false; - // LB 16 Dont break between two ellipses, or between letters or numbers and ellipsis: - // Examples: 9..., a..., H... - setRule("16: ( AL | ID | IN | NU ) IN"); + // LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis: + // Examples: ’9...’, ‘a...’, ‘H...’ + setRule("16: ( AL | ID | IN | NU ) × IN"); if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false; if (before == LB_IN && after == LB_IN) return false; // Don't break alphanumerics. - // LB 17 Dont break within a9, 3a, or H% + // LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’ // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ? - // Examples: $(12.35) 2,1234 (12) 12.54 + // Examples: $(12.35) 2,1234 (12)¢ 12.54¢ // This is approximated with the following rules. (Some cases already handled above, - // like 9,, [9.) - setRule("17: ID PO ; AL NU; NU AL"); + // like ‘9,’, ‘[9’.) + setRule("17: ID × PO ; AL × NU; NU × AL"); if (before == LB_ID && after == LB_PO) return false; if (before == LB_AL && after == LB_NU) return false; if (before == LB_NU && after == LB_AL) return false; - // LB 18 Dont break between the following pairs of classes. - // CL PO - // HY NU - // IS NU - // NU NU - // NU PO - // PR AL - // PR HY - // PR ID - // PR NU - // PR OP - // SY NU - // Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]% + // LB 18 Don’t break between the following pairs of classes. + // CL × PO + // HY × NU + // IS × NU + // NU × NU + // NU × PO + // PR × AL + // PR × HY + // PR × ID + // PR × NU + // PR × OP + // SY × NU + // Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’ - setRule("18: CL PO ; NU PO ; ( IS | NU | HY | PR | SY ) NU ; PR ( AL | HY | ID | OP )"); + setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP )"); if (before == LB_CL && after == LB_PO) return false; if (before == LB_IS && after == LB_NU) return false; if (before == LB_NU && after == LB_NU) return false; @@ -1446,30 +1446,30 @@ abstract public class GenerateBreakTest implements UCD_Types { if (before == LB_SY && after == LB_NU) return false; // LB 15b Break after hyphen-minus, and before acute accents: - setRule("18b: HY ; BB"); + setRule("18b: HY ÷ ; ÷ BB"); if (before == LB_HY) return true; if (after == LB_BB) return true; - // LB 19 Dont break between alphabetics (at) - // AL AL + // LB 19 Don’t break between alphabetics (“at”) + // AL × AL - setRule("19: AL AL"); + setRule("19: AL × AL"); if (before == LB_AL && after == LB_AL) return false; // LB 20 Break everywhere else - // ALL - // ALL + // ALL ÷ + // ÷ ALL if (ucd.getCompositeVersion() > 0x040000) { - setRule("19b: IS AL"); + setRule("19b: IS × AL"); if (before == LB_IS && after == LB_AL) return false; } // LB 20 Break everywhere else - // ALL - // ALL + // ALL ÷ + // ÷ ALL - setRule("20: ALL ; ALL"); + setRule("20: ALL ÷ ; ÷ ALL"); return true; } } @@ -1498,8 +1498,8 @@ abstract public class GenerateBreakTest implements UCD_Types { "U.S.A\u0300.", "3.4", "c.d", - "etc.)\u2019\u2018(the", - "etc.)\u2019\u2018(The", + "etc.)\u2019 \u2018(the", + "etc.)\u2019 \u2018(The", "the resp. leaders are", "\u5B57.\u5B57", "etc.\u5B83", @@ -1631,15 +1631,15 @@ abstract public class GenerateBreakTest implements UCD_Types { public boolean isBreak(String source, int offset) { // Break at the start and end of text. - setRule("1: sot "); + setRule("1: sot ÷"); if (offset < 0 || offset > source.length()) return false; if (offset == 0) return true; - setRule("2: eot"); + setRule("2: ÷ eot"); if (offset == source.length()) return true; - setRule("3: Sep "); + setRule("3: Sep ÷"); byte beforeChar = getResolvedType(source.charAt(offset-1)); if (beforeChar == Sep) return true; @@ -1662,22 +1662,22 @@ abstract public class GenerateBreakTest implements UCD_Types { // HACK COPY for rule collection! if (collectingRules) { - setRule("6: ATerm ( Numeric | Lower )"); - setRule("7: Upper ATerm Upper"); - setRule("8: ATerm Close* Sp* ( (OLetter | Upper | Lower) )* Lower"); - setRule("9: ( Term | ATerm ) Close* ( Close | Sp | Sep )"); - setRule("10: ( Term | ATerm ) Close* Sp ( Sp | Sep )"); - setRule("11: ( Term | ATerm ) Close* Sp* "); - setRule("12: Any Any"); + setRule("6: ATerm × ( Numeric | Lower )"); + setRule("7: Upper ATerm × Upper"); + setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); + setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); + setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); + setRule("11: ( Term | ATerm ) Close* Sp* ÷"); + setRule("12: Any × Any"); collectingRules = false; } // Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence. if (before == ATerm) { - setRule("6: ATerm ( Numeric | Lower )"); + setRule("6: ATerm × ( Numeric | Lower )"); if (after == Lower || after == Numeric) return false; - setRule("7: Upper ATerm Upper"); + setRule("7: Upper ATerm × Upper"); if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper); if (before2 == Upper && after == Upper) return false; } @@ -1736,17 +1736,17 @@ abstract public class GenerateBreakTest implements UCD_Types { if (lookAfter == -1) { // Otherwise, do not break - // Any Any (11) - setRule("12: Any Any"); + // Any × Any (11) + setRule("12: Any × Any"); return false; } - // ATerm Close* Sp*(( OLetter))* Lower(8) + // ATerm Close* Sp*×(¬( OLetter))* Lower(8) // Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator. - // ( Term | ATerm ) Close*( Close | Sp | Sep )(9) - // ( Term | ATerm ) Close* Sp( Sp | Sep )(10) - // ( Term | ATerm ) Close* Sp*(11) + // ( Term | ATerm ) Close*×( Close | Sp | Sep )(9) + // ( Term | ATerm ) Close* Sp×( Sp | Sep )(10) + // ( Term | ATerm ) Close* Sp*÷(11) // We DID find one. Loop to see if the right side is ok. @@ -1764,16 +1764,16 @@ abstract public class GenerateBreakTest implements UCD_Types { if (isFirst) { isFirst = false; if (lookAfter == ATerm && t == Upper) { - setRule("8: ATerm Close* Sp* ( (OLetter | Upper | Lower) )* Lower"); + setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); return false; } if (gotSpace) { if (t == Sp || t == Sep) { - setRule("10: ( Term | ATerm ) Close* Sp ( Sp | Sep )"); + setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); return false; } } else if (t == Close || t == Sp || t == Sep) { - setRule("9: ( Term | ATerm ) Close* ( Close | Sp | Sep )"); + setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); return false; } if (lookAfter == Term) break; @@ -1782,12 +1782,12 @@ abstract public class GenerateBreakTest implements UCD_Types { // at this point, we have an ATerm. All other conditions are ok, but we need to verify 6 if (t != OLetter && t != Upper && t != Lower) continue; if (t == Lower) { - setRule("8: ATerm Close* Sp* ( (OLetter | Upper | Lower) )* Lower"); + setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); return false; } break; } - setRule("11: ( Term | ATerm ) Close* Sp* "); + setRule("11: ( Term | ATerm ) Close* Sp* ÷"); return true; } } diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java index 3c4fc7a9a65..6816bd49df8 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $ -* $Date: 2004/02/07 01:01:15 $ -* $Revision: 1.14 $ +* $Date: 2004/04/17 18:21:39 $ +* $Revision: 1.15 $ * ******************************************************************************* */ @@ -547,18 +547,18 @@ public final class GenerateHanTransliterator implements UCD_Types { "e", "ei", "er", "en", "eng", "i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong", "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng", - "", "e", "an", "n" + "ü", "üe", "üan", "ün" }; // Don't bother with the following rules; just add w,y to initials - // When i stands alone, a y will be added before it as yi. - // If i is the first letter of the syllable it will be changed to y. - // When u stands alone, a w will be added before it as wu. - // If u is the first letter of the syllable it will be changed to w. e.g. uang -> wang. - // When stands alone, a y will be added before it and will be changed to u as yu. - // If is the first letter of the syllable, then the spelling will be changed to yu. e.g. an -> yuan. - //Note: The nasal final ueng never occurs after an initial but always form a syllable by itself. - // The o in iou is hidden, so it will be wrote as iu. But, dont forget to pronounce it. - // The e in uei is hidden, so it will be wrote as ui. But, dont forget to pronounce it. + // When “i” stands alone, a “y” will be added before it as “yi”. + // If “i” is the first letter of the syllable it will be changed to “y”. + // When “u” stands alone, a “w” will be added before it as “wu”. + // If “u” is the first letter of the syllable it will be changed to “w”. e.g. “uang -> wang”. + // When “ü” stands alone, a “y” will be added before it and “ü” will be changed to “u” as “yu”. + // If “ü” is the first letter of the syllable, then the spelling will be changed to “yu”. e.g. “üan -> yuan”. + //Note: The nasal final “ueng” never occurs after an initial but always form a syllable by itself. + // The “o” in “iou” is hidden, so it will be wrote as “iu”. But, don’t forget to pronounce it. + // The “e” in “uei” is hidden, so it will be wrote as “ui”. But, don’t forget to pronounce it. public static final String[] pinyin_bopomofo = { @@ -749,9 +749,9 @@ public final class GenerateHanTransliterator implements UCD_Types { "long", "\u310c\u3128\u3125", "lou", "\u310c\u3121", "lu", "\u310c\u3128", - "l", "\u310c\u3129", + "lü", "\u310c\u3129", "luan", "\u310c\u3128\u3122", - "le", "\u310c\u3129\u311d", + "lüe", "\u310c\u3129\u311d", "lun", "\u310c\u3128\u3123", "luo", "\u310c\u3128\u311b", "m", "\u3107", @@ -796,9 +796,9 @@ public final class GenerateHanTransliterator implements UCD_Types { "nong", "\u310b\u3128\u3125", "nou", "\u310b\u3121", "nu", "\u310b\u3128", - "n", "\u310b\u3129", + "nü", "\u310b\u3129", "nuan", "\u310b\u3128\u3122", - "ne", "\u310b\u3129\u311d", + "nüe", "\u310b\u3129\u311d", "nuo", "\u310b\u3128\u311b", "o", "\u311b", "ou", "\u3121", @@ -1007,52 +1007,52 @@ public final class GenerateHanTransliterator implements UCD_Types { } /* - U+347C li #lyu -U+3500 l #lv -U+3527 li #ly -U+3729 o #u -U+380E j #jj -U+3825 l #lv -U+3A3C l #lu -U+3B5A li #ly *** l? -U+3CB6 l #lv -U+3D56 ni #ny *** n? -U+3D88 ling #ling -U+3EF2 li #ly*** l? -U+3F94 li #ly*** l? -U+4071 o #u -U+40AE li #lyu *** le? -U+430E li #lyu *** le? -U+451E li #ly *** l? -U+4588 n #nu -U+458B n #nu -U+45A1 ni #ny *** n? -U+4610 ni #ny *** n? -U+46BC ni #ny *** n? -U+46DA li #lyu *** le? -U+4896 li #ly *** l? -U+4923 li #lyu *** le? -U+4968 li #ly *** l? -U+4A0B ni #nyu *** ne? -U+4AC4 chu #chu -U+4D08 o #u -U+4D8A ni #ny *** n? -U+51CA qng #qng -U+51D6 zhun #zhun *** this is probably zhn -U+5481 gn #gm -U+5838 fng #fng -U+639F l #lu *** this pronunciation surprises me, but I don't know... -U+66D5 yn #yin -U+6B3B chu #chu *** chua _is_ ok after all, my table missed an entry -U+6B56 chu #chu *** chua -U+6C7C ni #niu -U+6E6D qi #qiu -U+6F71 y #yi -U+7493 xi #xiu -U+7607 zhng #zhng *** I suspect zhng -U+7674 lun #ln -U+7867 yng #ing -U+7878 n #nu + U+347C · liù #lyuè +U+3500 · lüè #lvè +U+3527 · liù #lyù +U+3729 · ào #àu +U+380E · jí #jjí +U+3825 · l· #lv· +U+3A3C · lüè #luè +U+3B5A · li· #ly· *** lü? +U+3CB6 · l· #lv· +U+3D56 · niù #nyù *** nü? +U+3D88 · li·ng #li·ng +U+3EF2 · li· #ly·*** lü? +U+3F94 · li· #ly·*** lü? +U+4071 · ào #àu +U+40AE · liù #lyuè *** lüe? +U+430E · liù #lyuè *** lüe? +U+451E · liù #lyù *** lü? +U+4588 · nüè #nuè +U+458B · nüè #nuè +U+45A1 · niù #nyù *** nü? +U+4610 · niù #nyù *** nü? +U+46BC · niù #nyù *** nü? +U+46DA · liù #lyuè *** lüe? +U+4896 · liù #lyù *** lü? +U+4923 · liù #lyuè *** lüe? +U+4968 · liù #lyù *** lü? +U+4A0B · niù #nyuè *** nüe? +U+4AC4 · chuò #chuà +U+4D08 · ·o #·u +U+4D8A · niù #nyù *** nü? +U+51CA · qíng #qýng +U+51D6 · zhu·n #zhu·n *** this is probably zh·n +U+5481 · gàn #gèm +U+5838 · féng #fúng +U+639F · lü· #lu· *** this pronunciation surprises me, but I don't know... +U+66D5 · yàn #yiàn +U+6B3B · chu· #chu· *** chua _is_ ok after all, my table missed an entry +U+6B56 · chu· #chu· *** chua +U+6C7C · ni· #ni·u +U+6E6D · qiú #qióu +U+6F71 · y· #yi· +U+7493 · xiù #xiòu +U+7607 · zh·ng #zh·ng *** I suspect zh·ng +U+7674 · luán #lüán +U+7867 · y·ng #i·ng +U+7878 · nüè #nuè */ static Transliterator fixTypos = Transliterator.createFromRules("fix_typos", @@ -1061,12 +1061,12 @@ U+7878 +"$cons{iou}$nlet > iu;" +"$cons{em}$nlet > an;" +"$cons{uen}$nlet > ueng;" - +"$cons{ve}$nlet > e;" - +"$cons{v}$nlet > ;" + +"$cons{ve}$nlet > üe;" + +"$cons{v}$nlet > ü;" +"$cons{yue}$nlet > iu;" +"$cons{yng}$nlet > ing;" +"$cons{yu}$nlet > iu;" - //+"$cons{ue} > e;" + //+"$cons{ue} > üe;" +"jj > j;" //+"$nlet{ng}$nlet > eng;" //+"$nlet{n}$nlet > en;" @@ -1076,13 +1076,13 @@ U+7878 // new fixes +"zhueng}$nlet > zhong;" +"zhuen}$nlet > zhuan;" - +"lue > le;" + +"lue > lüe;" +"liong > liang;" - +"nue > ne;" + +"nue > nüe;" +"chua > chuo;" +"yian > yan;" +"yie > ye;" - +"lan > luan;" + +"lüan > luan;" +"iong > yong;" , Transliterator.FORWARD); @@ -1113,7 +1113,7 @@ U+7878 try { // chinese_frequency.txt - // 1 的 1588561 1588561 3.5008% + // 1 çš„ 1588561 1588561 3.5008% // japanese_frequency.txt // 1 ? 17176 @@ -1421,7 +1421,7 @@ U+7878 @Unihan Data Bad pinyin data: \u4E7F ? LE -\u7684 ? de, de, d, d +\u7684 ? de, de, dí, dì */ static void fixChineseOverrides() throws IOException { @@ -2024,7 +2024,7 @@ Bad pinyin data: \u4E7F ? LE + "# otherwise 'o'\n" + "# otherwise last vowel\n" + "::NFC;\n" - + "$vowel = [aAeEiIoOuU];\n" + + "$vowel = [aAeEiIoOuUüÜ];\n" + "$consonant = [[a-z A-Z] - [$vowel]];\n" + "$digit = [1-5];\n" + "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n" @@ -2054,10 +2054,10 @@ Bad pinyin data: \u4E7F ? LE if (i > 0) { char last = result.charAt(result.length()-1); if (last == 'u') { - result.setCharAt(result.length()-1, ''); + result.setCharAt(result.length()-1, 'ü'); continue main; } else if (last == 'U') { - result.setCharAt(result.length()-1, ''); + result.setCharAt(result.length()-1, 'Ü'); continue main; } } @@ -2085,22 +2085,22 @@ Bad pinyin data: \u4E7F ? LE for (int i = source.length()-2; i >= 0; --i) { ch = source.charAt(i); if (ch == ':') { - ch = ''; + ch = 'Ü'; --i; } if ('0' <= ch && ch <= '9') break; - if (ch != '' && (ch < 'A' || ch > 'Z')) { + if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) { Utility.fixDot(); System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")"); break; } if (!gotIt) switch (ch) { - case 'A': ch = "A\u0102\u0100".charAt(num); gotIt = true; break; - case 'E': ch = "E\u0114\u0112".charAt(num); gotIt = true; break; - case 'I': ch = "I\u012C\u012A".charAt(num); gotIt = true; break; - case 'O': ch = "O\u014E\u014C".charAt(num); gotIt = true; break; - case 'U': ch = "U\u016C\u016A".charAt(num); gotIt = true; break; - case '': ch = "\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break; + case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break; + case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break; + case 'I': ch = "IÍ\u012CÌ\u012A".charAt(num); gotIt = true; break; + case 'O': ch = "OÓ\u014EÒ\u014C".charAt(num); gotIt = true; break; + case 'U': ch = "UÚ\u016CÙ\u016A".charAt(num); gotIt = true; break; + case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break; } handlePinyinTemp.insert(0,ch); } diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java index fbb451d2465..f77a3760400 100644 --- a/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $ -* $Date: 2004/02/07 01:01:15 $ -* $Revision: 1.4 $ +* $Date: 2004/04/17 18:21:39 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -398,23 +398,23 @@ public class GenerateLineBreakTest implements UCD_Types { if (before == LB_CR && after == LB_LF) return false; if (before == LB_BK || before == LB_LF || before == LB_CR) return true; - //LB 3b Dont break before hard line breaks. + //LB 3b Don’t break before hard line breaks. rule="3b"; if (after == LB_BK || after == LB_LF | after == LB_CR) return false; - // LB 4 Dont break before spaces or zero-width space. - // SP - // ZW + // LB 4 Don’t break before spaces or zero-width space. + // × SP + // × ZW rule="4"; if (after == LB_SP || after == LB_ZW) return false; // LB 5 Break after zero-width space. - // ZW + // ZW ÷ rule="5"; if (before == LB_ZW) return true; - // LB 6 Dont break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. + // LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. rule="6"; if (after == LB_CM) return false; @@ -441,8 +441,8 @@ public class GenerateLineBreakTest implements UCD_Types { rule="7"; if (setBase && before == LB_SP) before = LB_ID; - // LB 8 Dont break before ] or ! or ; or /, even after spaces. - // CL, EX, IS, SY + // LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. + // × CL, × EX, × IS, × SY rule="8"; if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false; @@ -456,31 +456,31 @@ public class GenerateLineBreakTest implements UCD_Types { } } - // LB 9 Dont break after [, even after spaces. - // OP SP* + // LB 9 Don’t break after ‘[’, even after spaces. + // OP SP* × rule="9"; if (lastNonSpace == LB_OP) return false; - // LB 10 Dont break within [, , even with intervening spaces. - // QU SP* OP + // LB 10 Don’t break within ‘”[’, , even with intervening spaces. + // QU SP* × OP rule="10"; if (lastNonSpace == LB_QU && after == LB_OP) return false; - // LB 11 Dont break within ]h, even with intervening spaces. - // CL SP* NS + // LB 11 Don’t break within ‘]h’, even with intervening spaces. + // CL SP* × NS rule="11"; if (lastNonSpace == LB_CL && after == LB_NS) return false; - // LB 11a Dont break within , even with intervening spaces. - // B2 B2 + // LB 11a Don’t break within ‘——’, even with intervening spaces. + // B2 × B2 rule="11a"; if (lastNonSpace == LB_B2 && after == LB_B2) return false; if (recommended) { - // LB 13 Dont break before or after NBSP or WORD JOINER - // GL - // GL + // LB 13 Don’t break before or after NBSP or WORD JOINER + // × GL + // GL × rule="11b"; if (after == LB_GL || before == LB_GL) return false; @@ -490,36 +490,36 @@ public class GenerateLineBreakTest implements UCD_Types { rule="12"; // LB 12 Break after spaces - // SP + // SP ÷ if (before == LB_SP) return true; if (!recommended) { - // LB 13 Dont break before or after NBSP or WORD JOINER - // GL - // GL + // LB 13 Don’t break before or after NBSP or WORD JOINER + // × GL + // GL × rule="13"; if (after == LB_GL || before == LB_GL) return false; } rule="14"; - // LB 14 Dont break before or after - // QU - // QU + // LB 14 Don’t break before or after ‘”’ + // × QU + // QU × if (before == LB_QU || after == LB_QU) return false; - // LB 15 Dont break before hyphen-minus, other hyphens, fixed-width spaces, + // LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces, // small kana and other non- starters, or after acute accents: - // BA - // HY - // NS - // BB + // × BA + // × HY + // × NS + // BB × if (recommended) { // LB 14a Break before and after CB - // CB - // CB + // CB ÷ + // ÷ CB if (before == LB_CB || after == LB_CB) return true; } @@ -532,51 +532,51 @@ public class GenerateLineBreakTest implements UCD_Types { if (!recommended) { // LB 15b Break after hyphen-minus, and before acute accents: - // HY - // BB + // HY ÷ + // ÷ BB rule="15b"; if (before == LB_HY) return true; if (after == LB_BB) return true; } - // LB 16 Dont break between two ellipses, or between letters or numbers and ellipsis: - // AL IN - // ID IN - // IN IN - // NU IN - // Examples: 9..., a..., H... + // LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis: + // AL × IN + // ID × IN + // IN × IN + // NU × IN + // Examples: ’9...’, ‘a...’, ‘H...’ rule="16"; if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false; if (before == LB_IN && after == LB_IN) return false; // Don't break alphanumerics. - // LB 17 Dont break within a9, 3a, or H% - // ID PO - // AL NU - // NU AL + // LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’ + // ID × PO + // AL × NU + // NU × AL // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ? - // Examples: $(12.35) 2,1234 (12) 12.54 + // Examples: $(12.35) 2,1234 (12)¢ 12.54¢ // This is approximated with the following rules. (Some cases already handled above, - // like 9,, [9.) + // like ‘9,’, ‘[9’.) rule="17"; if (before == LB_ID && after == LB_PO) return false; if (before == LB_AL && after == LB_NU) return false; if (before == LB_NU && after == LB_AL) return false; - // LB 18 Dont break between the following pairs of classes. - // CL PO - // HY NU - // IS NU - // NU NU - // NU PO - // PR AL - // PR HY - // PR ID - // PR NU - // PR OP - // SY NU - // Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]% + // LB 18 Don’t break between the following pairs of classes. + // CL × PO + // HY × NU + // IS × NU + // NU × NU + // NU × PO + // PR × AL + // PR × HY + // PR × ID + // PR × NU + // PR × OP + // SY × NU + // Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’ rule="18"; if (before == LB_CL && after == LB_PO) return false; @@ -595,23 +595,23 @@ public class GenerateLineBreakTest implements UCD_Types { if (recommended) { // LB 15b Break after hyphen-minus, and before acute accents: - // HY - // BB + // HY ÷ + // ÷ BB rule="18b"; if (before == LB_HY) return true; if (after == LB_BB) return true; } - // LB 19 Dont break between alphabetics (at) - // AL AL + // LB 19 Don’t break between alphabetics (“at”) + // AL × AL rule="19"; if (before == LB_AL && after == LB_AL) return false; // LB 20 Break everywhere else - // ALL - // ALL + // ALL ÷ + // ÷ ALL rule="20"; return true; @@ -754,7 +754,7 @@ public class GenerateLineBreakTest implements UCD_Types { // Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together. rule = "12"; - //Link Extend* LetterBase (12) + //Link Extend* × LetterBase (12) if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) { int backOffset = findLastNon(source, offset, Extend, recommended); if (backOffset >= 0) { diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java index f175b5e531e..9e272ecfdcb 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java @@ -1169,6 +1169,7 @@ public class MakeUnicodeFiles { String line = in.readLine(); if (line == null) break; if (line.startsWith("\uFEFF")) line = line.substring(1); + out.println(line); line = line.trim(); int pos = line.indexOf('#'); if (pos >= 0) line = line.substring(0,pos).trim(); @@ -1232,9 +1233,9 @@ public class MakeUnicodeFiles { break; default: throw new IllegalArgumentException("Internal Error"); } - out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH) + ":\t" + line); if (ok) continue; out.println(); + out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH)); out.println("**** START Error Info ****"); bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet); out.println("**** END Error Info ****"); diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java index beedcc1dd69..163bf2de87a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java +++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ -* $Date: 2004/02/06 18:30:20 $ -* $Revision: 1.15 $ +* $Date: 2004/04/17 18:21:39 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -22,7 +22,7 @@ import com.ibm.text.utility.*; /** * Implements Unicode Normalization Forms C, D, KC, KD.
* See UTR#15 for details.
- * Copyright 1998-1999 Unicode, Inc. All Rights Reserved.
+ * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.
* The Unicode Consortium makes no expressed or implied warranty of any * kind, and assumes no liability for errors or omissions. * No liability is assumed for incidental and consequential damages diff --git a/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java b/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java index acd52ebe4bf..9c780e844e4 100644 --- a/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java +++ b/tools/unicodetools/com/ibm/text/UCD/NormalizerSample.java @@ -10,7 +10,7 @@ import com.ibm.text.utility.*; /** * Implements Unicode Normalization Forms C, D, KC, KD.
* See UTR#15 for details.
- * Copyright 1998-1999 Unicode, Inc. All Rights Reserved.
+ * Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.
* The Unicode Consortium makes no expressed or implied warranty of any * kind, and assumes no liability for errors or omissions. * No liability is assumed for incidental and consequential damages diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt index bc101f29ef0..6d587175267 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt @@ -41,6 +41,8 @@ #$East_Asian_Width:Neutral ? $GC:Uppercase_Letter $GC:Zs ? $Name:«.*SPACE.*» +[$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] + # Examples of parsing errors # $LBA:Neutral = $GC:Zp # example of non-existant property @@ -54,7 +56,35 @@ $Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse] $LB:OP = $GC:Ps $General_Category:Decimal_Number = $Numeric_Type:Decimal $Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl] + +# Comparisons across versions + $ID_Start ⊇ $×ID_Start $ID_Continue ⊇ $×ID_Continue +#$age:4.0.1 = $age4.0.0 +# Derivations + +$Math = [$GC:Sm $Other_Math] +$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic] +$Lowercase = [$GC:Ll $Other_Lowercase] +$Uppercase = [$GC:Lu $Other_Uppercase] +$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start] +$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc] +$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]] +$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend] +$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend] + +# "Minimal" Other_: NOT hard requirements; just if we want to be minimal + +$Other_Math = [$Math - $GC:Sm] +$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]] +$Other_Lowercase = [$Lowercase - $GC:Ll] +$Other_Uppercase = [$Uppercase - $GC:Lu] +$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]] +$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]] +$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]] + +# Testing +$script:greek = $×script:greek diff --git a/tools/unicodetools/com/ibm/text/utility/UTF32.java b/tools/unicodetools/com/ibm/text/utility/UTF32.java index 5e34251930c..138abbcfa15 100644 --- a/tools/unicodetools/com/ibm/text/utility/UTF32.java +++ b/tools/unicodetools/com/ibm/text/utility/UTF32.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF32.java,v $ -* $Date: 2001/08/31 00:19:16 $ -* $Revision: 1.2 $ +* $Date: 2004/04/17 18:21:38 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -53,28 +53,28 @@ package com.ibm.text.utility;
 // iteration forwards: Original
 for (int i = 0; i < s.length(); ++i) {
-char ch = s.charAt(i);
-doSomethingWith(ch);
+    char ch = s.charAt(i);
+    doSomethingWith(ch);
 }
 
 // iteration forwards: Changes for UTF-32
 int ch;
 for (int i = 0; i < s.length(); i+=UTF32.count16(ch)) {
-ch = UTF32.char32At(s,i);
-doSomethingWith(ch);
+    ch = UTF32.char32At(s,i);
+    doSomethingWith(ch);
 }
 
 // iteration backwards: Original
 for (int i = s.length()-1; i >= 0; --i) {
-char ch = s.charAt(i);
-doSomethingWith(ch);
+    char ch = s.charAt(i);
+    doSomethingWith(ch);
 }
 
 // iteration backwards: Changes for UTF-32
 int ch;
 for (int i = s.length()-1; i > 0; i-=UTF32.count16(ch)) {
-ch = UTF32.char32At(s,i);
-doSomethingWith(ch);
+    ch = UTF32.char32At(s,i);
+    doSomethingWith(ch);
 }
 
 *