mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
utf-8 change
X-SVN-Rev: 15005
This commit is contained in:
parent
3055bdaa34
commit
7ca61b13cc
8 changed files with 315 additions and 284 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
|
||||
* $Date: 2004/02/18 03:08:59 $
|
||||
* $Revision: 1.11 $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.12 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -782,11 +782,11 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
public boolean isBreak(String source, int offset) {
|
||||
|
||||
setRule("1: sot ÷");
|
||||
setRule("1: sot ÷");
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
if (offset == 0) return true;
|
||||
|
||||
setRule("2: ÷ eot");
|
||||
setRule("2: ÷ eot");
|
||||
if (offset == source.length()) return true;
|
||||
|
||||
// UTF-16: never break in the middle of a code point
|
||||
|
@ -801,29 +801,29 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
byte before = getResolvedType(cpBefore);
|
||||
byte after = getResolvedType(cpAfter);
|
||||
|
||||
setRule("3: CR × LF");
|
||||
setRule("3: CR × LF");
|
||||
if (before == CR && after == LF) return false;
|
||||
|
||||
setRule("4: ( Control | CR | LF ) ÷");
|
||||
setRule("4: ( Control | CR | LF ) ÷");
|
||||
if (before == CR || before == LF || before == Control) return true;
|
||||
|
||||
setRule("5: ÷ ( Control | CR | LF )");
|
||||
setRule("5: ÷ ( Control | CR | LF )");
|
||||
if (after == Control || after == LF || after == CR) return true;
|
||||
|
||||
setRule("6: L × ( L | V | LV | LVT )");
|
||||
setRule("6: L × ( L | V | LV | LVT )");
|
||||
if (before == L && (after == L || after == V || after == LV || after == LVT)) return false;
|
||||
|
||||
setRule("7: ( LV | V ) × ( V | T )");
|
||||
setRule("7: ( LV | V ) × ( V | T )");
|
||||
if ((before == LV || before == V) && (after == V || after == T)) return false;
|
||||
|
||||
setRule("8: ( LVT | T ) × T");
|
||||
setRule("8: ( LVT | T ) × T");
|
||||
if ((before == LVT || before == T) && (after == T)) return false;
|
||||
|
||||
setRule("9: × Extend");
|
||||
setRule("9: × Extend");
|
||||
if (after == Extend) return false;
|
||||
|
||||
// Otherwise break after all characters.
|
||||
setRule("10: Any ÷ Any");
|
||||
setRule("10: Any ÷ Any");
|
||||
return true;
|
||||
|
||||
}
|
||||
|
@ -914,12 +914,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
public boolean isBreak(String source, int offset) {
|
||||
|
||||
setRule("1: sot ÷");
|
||||
setRule("1: sot ÷");
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
|
||||
if (offset == 0) return true;
|
||||
|
||||
setRule("2: ÷ eot");
|
||||
setRule("2: ÷ eot");
|
||||
if (offset == source.length()) return true;
|
||||
|
||||
// Treat a grapheme cluster as if it were a single character:
|
||||
|
@ -943,43 +943,43 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
//Don't break between most letters
|
||||
|
||||
setRule("5: ALetter × ALetter");
|
||||
setRule("5: ALetter × ALetter");
|
||||
if (before == ALetter && after == ALetter) return false;
|
||||
|
||||
// Don’t break letters across certain punctuation
|
||||
// Don’t break letters across certain punctuation
|
||||
|
||||
setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
|
||||
setRule("6: ALetter × (MidLetter | MidNumLet) ALetter");
|
||||
if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
|
||||
|
||||
setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
|
||||
setRule("7: ALetter (MidLetter | MidNumLet) × ALetter");
|
||||
if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
|
||||
|
||||
// Don’t break within sequences of digits, or digits adjacent to letters.
|
||||
// Don’t break within sequences of digits, or digits adjacent to letters.
|
||||
|
||||
setRule("8: Numeric × Numeric");
|
||||
setRule("8: Numeric × Numeric");
|
||||
if (before == Numeric && after == Numeric) return false;
|
||||
|
||||
setRule("9: ALetter × Numeric");
|
||||
setRule("9: ALetter × Numeric");
|
||||
if (before == ALetter && after == Numeric) return false;
|
||||
|
||||
setRule("10: Numeric × ALetter");
|
||||
setRule("10: Numeric × ALetter");
|
||||
if (before == Numeric && after == ALetter) return false;
|
||||
|
||||
|
||||
// Don’t break within sequences like: '-3.2'
|
||||
setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
|
||||
// Don’t break within sequences like: '-3.2'
|
||||
setRule("11: Numeric (MidNum | MidNumLet) × Numeric");
|
||||
if (before2 == Numeric && (before == MidNum || before == MidNumLet) && after == Numeric) return false;
|
||||
|
||||
setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
|
||||
setRule("12: Numeric × (MidNum | MidNumLet) Numeric");
|
||||
if (before == Numeric && (after == MidNum || after == MidNumLet) && after2 == Numeric) return false;
|
||||
|
||||
// Don't break between Katakana
|
||||
|
||||
setRule("13: Katakana × Katakana");
|
||||
setRule("13: Katakana × Katakana");
|
||||
if (before == Katakana && after == Katakana) return false;
|
||||
|
||||
// Otherwise break always.
|
||||
setRule("14: Any ÷ Any");
|
||||
setRule("14: Any ÷ Any");
|
||||
return true;
|
||||
|
||||
}
|
||||
|
@ -1235,7 +1235,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
// LB 2a Never break at the start of text
|
||||
|
||||
setRule("2a: × sot");
|
||||
setRule("2a: × sot");
|
||||
if (offset <= 0) return false;
|
||||
|
||||
// LB 2b Always break at the end of text
|
||||
|
@ -1269,26 +1269,26 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
//byte after = getResolvedType(cpAfter);
|
||||
|
||||
|
||||
setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !");
|
||||
setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !");
|
||||
|
||||
// Always break after hard line breaks (but never between CR and LF).
|
||||
// CR ^ LF
|
||||
if (before == LB_CR && after == LB_LF) return false;
|
||||
if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
|
||||
|
||||
//LB 3b Don’t break before hard line breaks.
|
||||
setRule("3b: × ( BK | CR | LF )");
|
||||
//LB 3b Don’t break before hard line breaks.
|
||||
setRule("3b: × ( BK | CR | LF )");
|
||||
if (after == LB_BK || after == LB_LF || after == LB_CR) return false;
|
||||
|
||||
// LB 4 Don’t break before spaces or zero-width space.
|
||||
setRule("4: × ( SP | ZW )");
|
||||
// LB 4 Don’t break before spaces or zero-width space.
|
||||
setRule("4: × ( SP | ZW )");
|
||||
if (after == LB_SP || after == LB_ZW) return false;
|
||||
|
||||
// LB 5 Break after zero-width space.
|
||||
setRule("5: ZW ÷");
|
||||
setRule("5: ZW ÷");
|
||||
if (before == LB_ZW) return true;
|
||||
|
||||
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
|
||||
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos).
|
||||
setRule("6: DGC -> FC");
|
||||
if (!grapheme.isBreak( source, offset)) return false;
|
||||
|
||||
|
@ -1324,9 +1324,9 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
if (setBase && backBase == -1) before = LB_AL;
|
||||
|
||||
|
||||
// LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
|
||||
// × CL, × EX, × IS, × SY
|
||||
setRule("8: × ( CL | EX | IS | SY )");
|
||||
// LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
|
||||
// × CL, × EX, × IS, × SY
|
||||
setRule("8: × ( CL | EX | IS | SY )");
|
||||
if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false;
|
||||
|
||||
|
||||
|
@ -1339,97 +1339,97 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
}
|
||||
}
|
||||
|
||||
// LB 9 Don’t break after ‘[’, even after spaces.
|
||||
// OP SP* ×
|
||||
setRule("9: OP SP* ×");
|
||||
// LB 9 Don’t break after ‘[’, even after spaces.
|
||||
// OP SP* ×
|
||||
setRule("9: OP SP* ×");
|
||||
if (lastNonSpace == LB_OP) return false;
|
||||
|
||||
// LB 10 Don’t break within ‘”[’, , even with intervening spaces.
|
||||
// QU SP* × OP
|
||||
setRule("10: QU SP* × OP");
|
||||
// LB 10 Don’t break within ‘”[’, even with intervening spaces.
|
||||
// QU SP* × OP
|
||||
setRule("10: QU SP* × OP");
|
||||
if (lastNonSpace == LB_QU && after == LB_OP) return false;
|
||||
|
||||
// LB 11 Don’t break within ‘]h’, even with intervening spaces.
|
||||
// CL SP* × NS
|
||||
setRule("11: CL SP* × NS");
|
||||
// LB 11 Don’t break within ‘]h’, even with intervening spaces.
|
||||
// CL SP* × NS
|
||||
setRule("11: CL SP* × NS");
|
||||
if (lastNonSpace == LB_CL && after == LB_NS) return false;
|
||||
|
||||
// LB 11a Don’t break within ‘——’, even with intervening spaces.
|
||||
// B2 × B2
|
||||
setRule("11a: B2 × B2");
|
||||
// LB 11a Don’t break within ‘——’, even with intervening spaces.
|
||||
// B2 × B2
|
||||
setRule("11a: B2 × B2");
|
||||
if (lastNonSpace == LB_B2 && after == LB_B2) return false;
|
||||
|
||||
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
// LB 11b Don’t break before or after WORD JOINER
|
||||
// × WJ
|
||||
// WJ ×
|
||||
|
||||
setRule("11b: × WJ ; WJ ×");
|
||||
setRule("11b: × WJ ; WJ ×");
|
||||
if (after == LB_WJ || before == LB_WJ) return false;
|
||||
|
||||
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
|
||||
|
||||
// LB 12 Break after spaces
|
||||
setRule("12: SP ÷");
|
||||
setRule("12: SP ÷");
|
||||
if (before == LB_SP) return true;
|
||||
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
setRule("13: × GL ; GL ×");
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
setRule("13: × GL ; GL ×");
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
setRule("14: × QU ; QU ×");
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
setRule("14: × QU ; QU ×");
|
||||
if (before == LB_QU || after == LB_QU) return false;
|
||||
|
||||
// LB 14a Break before and after CB
|
||||
setRule("14a: ÷ CB ; CB ÷");
|
||||
setRule("14a: ÷ CB ; CB ÷");
|
||||
if (before == LB_CB || after == LB_CB) return true;
|
||||
|
||||
// LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces,
|
||||
// LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces,
|
||||
// small kana and other non- starters, or after acute accents:
|
||||
|
||||
setRule("15: × ( BA | HY | NS ) ; BB ×");
|
||||
setRule("15: × ( BA | HY | NS ) ; BB ×");
|
||||
if (after == LB_NS) return false;
|
||||
if (after == LB_HY) return false;
|
||||
if (after == LB_BA) return false;
|
||||
if (before == LB_BB) return false;
|
||||
|
||||
|
||||
//setRule("15a: HY × NU"); // NEW
|
||||
//setRule("15a: HY × NU"); // NEW
|
||||
//if (before == LB_HY && after == LB_NU) return false;
|
||||
|
||||
// LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis:
|
||||
// Examples: ’9...’, ‘a...’, ‘H...’
|
||||
setRule("16: ( AL | ID | IN | NU ) × IN");
|
||||
// LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis:
|
||||
// Examples: ’9...’, ‘a...’, ‘H...’
|
||||
setRule("16: ( AL | ID | IN | NU ) × IN");
|
||||
if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
|
||||
if (before == LB_IN && after == LB_IN) return false;
|
||||
|
||||
// Don't break alphanumerics.
|
||||
// LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’
|
||||
// LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’
|
||||
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
|
||||
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
|
||||
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
|
||||
// This is approximated with the following rules. (Some cases already handled above,
|
||||
// like ‘9,’, ‘[9’.)
|
||||
setRule("17: ID × PO ; AL × NU; NU × AL");
|
||||
// like ‘9,’, ‘[9’.)
|
||||
setRule("17: ID × PO ; AL × NU; NU × AL");
|
||||
if (before == LB_ID && after == LB_PO) return false;
|
||||
if (before == LB_AL && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_AL) return false;
|
||||
|
||||
// LB 18 Don’t break between the following pairs of classes.
|
||||
// CL × PO
|
||||
// HY × NU
|
||||
// IS × NU
|
||||
// NU × NU
|
||||
// NU × PO
|
||||
// PR × AL
|
||||
// PR × HY
|
||||
// PR × ID
|
||||
// PR × NU
|
||||
// PR × OP
|
||||
// SY × NU
|
||||
// Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’
|
||||
// LB 18 Don’t break between the following pairs of classes.
|
||||
// CL × PO
|
||||
// HY × NU
|
||||
// IS × NU
|
||||
// NU × NU
|
||||
// NU × PO
|
||||
// PR × AL
|
||||
// PR × HY
|
||||
// PR × ID
|
||||
// PR × NU
|
||||
// PR × OP
|
||||
// SY × NU
|
||||
// Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’
|
||||
|
||||
setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP )");
|
||||
setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP )");
|
||||
if (before == LB_CL && after == LB_PO) return false;
|
||||
if (before == LB_IS && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_NU) return false;
|
||||
|
@ -1446,30 +1446,30 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
if (before == LB_SY && after == LB_NU) return false;
|
||||
|
||||
// LB 15b Break after hyphen-minus, and before acute accents:
|
||||
setRule("18b: HY ÷ ; ÷ BB");
|
||||
setRule("18b: HY ÷ ; ÷ BB");
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// AL × AL
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// AL × AL
|
||||
|
||||
setRule("19: AL × AL");
|
||||
setRule("19: AL × AL");
|
||||
if (before == LB_AL && after == LB_AL) return false;
|
||||
|
||||
// LB 20 Break everywhere else
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
|
||||
if (ucd.getCompositeVersion() > 0x040000) {
|
||||
setRule("19b: IS × AL");
|
||||
setRule("19b: IS × AL");
|
||||
if (before == LB_IS && after == LB_AL) return false;
|
||||
}
|
||||
|
||||
// LB 20 Break everywhere else
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
|
||||
setRule("20: ALL ÷ ; ÷ ALL");
|
||||
setRule("20: ALL ÷ ; ÷ ALL");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -1498,8 +1498,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
"U.S.A\u0300.",
|
||||
"3.4",
|
||||
"c.d",
|
||||
"etc.)\u2019 \u2018(the",
|
||||
"etc.)\u2019 \u2018(The",
|
||||
"etc.)\u2019 \u2018(the",
|
||||
"etc.)\u2019 \u2018(The",
|
||||
"the resp. leaders are",
|
||||
"\u5B57.\u5B57",
|
||||
"etc.\u5B83",
|
||||
|
@ -1631,15 +1631,15 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
public boolean isBreak(String source, int offset) {
|
||||
|
||||
// Break at the start and end of text.
|
||||
setRule("1: sot ÷");
|
||||
setRule("1: sot ÷");
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
|
||||
if (offset == 0) return true;
|
||||
|
||||
setRule("2: ÷ eot");
|
||||
setRule("2: ÷ eot");
|
||||
if (offset == source.length()) return true;
|
||||
|
||||
setRule("3: Sep ÷");
|
||||
setRule("3: Sep ÷");
|
||||
byte beforeChar = getResolvedType(source.charAt(offset-1));
|
||||
if (beforeChar == Sep) return true;
|
||||
|
||||
|
@ -1662,22 +1662,22 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
// HACK COPY for rule collection!
|
||||
if (collectingRules) {
|
||||
setRule("6: ATerm × ( Numeric | Lower )");
|
||||
setRule("7: Upper ATerm × Upper");
|
||||
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
|
||||
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
|
||||
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
|
||||
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
|
||||
setRule("12: Any × Any");
|
||||
setRule("6: ATerm × ( Numeric | Lower )");
|
||||
setRule("7: Upper ATerm × Upper");
|
||||
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
|
||||
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
|
||||
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
|
||||
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
|
||||
setRule("12: Any × Any");
|
||||
collectingRules = false;
|
||||
}
|
||||
|
||||
// Do not break after ambiguous terminators like period if the terminator is immediately followed by a number or lowercase letter, if it is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
|
||||
|
||||
if (before == ATerm) {
|
||||
setRule("6: ATerm × ( Numeric | Lower )");
|
||||
setRule("6: ATerm × ( Numeric | Lower )");
|
||||
if (after == Lower || after == Numeric) return false;
|
||||
setRule("7: Upper ATerm × Upper");
|
||||
setRule("7: Upper ATerm × Upper");
|
||||
if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper);
|
||||
if (before2 == Upper && after == Upper) return false;
|
||||
}
|
||||
|
@ -1736,17 +1736,17 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
if (lookAfter == -1) {
|
||||
// Otherwise, do not break
|
||||
// Any × Any (11)
|
||||
setRule("12: Any × Any");
|
||||
// Any × Any (12)
|
||||
setRule("12: Any × Any");
|
||||
return false;
|
||||
}
|
||||
|
||||
// ATerm Close* Sp*×(¬( OLetter))* Lower(8)
|
||||
// ATerm Close* Sp*×(¬( OLetter))* Lower(8)
|
||||
|
||||
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
|
||||
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(9)
|
||||
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(10)
|
||||
// ( Term | ATerm ) Close* Sp*÷(11)
|
||||
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(9)
|
||||
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(10)
|
||||
// ( Term | ATerm ) Close* Sp*÷(11)
|
||||
|
||||
|
||||
// We DID find one. Loop to see if the right side is ok.
|
||||
|
@ -1764,16 +1764,16 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
if (isFirst) {
|
||||
isFirst = false;
|
||||
if (lookAfter == ATerm && t == Upper) {
|
||||
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
|
||||
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
|
||||
return false;
|
||||
}
|
||||
if (gotSpace) {
|
||||
if (t == Sp || t == Sep) {
|
||||
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
|
||||
setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )");
|
||||
return false;
|
||||
}
|
||||
} else if (t == Close || t == Sp || t == Sep) {
|
||||
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
|
||||
setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )");
|
||||
return false;
|
||||
}
|
||||
if (lookAfter == Term) break;
|
||||
|
@ -1782,12 +1782,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
|
||||
if (t != OLetter && t != Upper && t != Lower) continue;
|
||||
if (t == Lower) {
|
||||
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
|
||||
setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower");
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
|
||||
setRule("11: ( Term | ATerm ) Close* Sp* ÷");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
||||
* $Date: 2004/02/07 01:01:15 $
|
||||
* $Revision: 1.14 $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.15 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -547,18 +547,18 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
"e", "ei", "er", "en", "eng",
|
||||
"i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong",
|
||||
"u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng",
|
||||
"ü", "üe", "üan", "ün"
|
||||
"ü", "üe", "üan", "ün"
|
||||
};
|
||||
// Don't bother with the following rules; just add w,y to initials
|
||||
// When “i” stands alone, a “y” will be added before it as “yi”.
|
||||
// If “i” is the first letter of the syllable it will be changed to “y”.
|
||||
// When “u” stands alone, a “w” will be added before it as “wu”.
|
||||
// If “u” is the first letter of the syllable it will be changed to “w”. e.g. “uang -> wang”.
|
||||
// When “ü” stands alone, a “y” will be added before it and “ü” will be changed to “u” as “yu”.
|
||||
// If “ü” is the first letter of the syllable, then the spelling will be changed to “yu”. e.g. “üan -> yuan”.
|
||||
//Note: The nasal final “ueng” never occurs after an initial but always form a syllable by itself.
|
||||
// The “o” in “iou” is hidden, so it will be wrote as “iu”. But, don’t forget to pronounce it.
|
||||
// The “e” in “uei” is hidden, so it will be wrote as “ui”. But, don’t forget to pronounce it.
|
||||
// When “i” stands alone, a “y” will be added before it as “yi”.
|
||||
// If “i” is the first letter of the syllable it will be changed to “y”.
|
||||
// When “u” stands alone, a “w” will be added before it as “wu”.
|
||||
// If “u” is the first letter of the syllable it will be changed to “w”. e.g. “uang -> wang”.
|
||||
// When “ü” stands alone, a “y” will be added before it and “ü” will be changed to “u” as “yu”.
|
||||
// If “ü” is the first letter of the syllable, then the spelling will be changed to “yu”. e.g. “üan -> yuan”.
|
||||
//Note: The nasal final “ueng” never occurs after an initial but always forms a syllable by itself.
|
||||
// The “o” in “iou” is hidden, so it will be written as “iu”. But, don’t forget to pronounce it.
|
||||
// The “e” in “uei” is hidden, so it will be written as “ui”. But, don’t forget to pronounce it.
|
||||
|
||||
|
||||
public static final String[] pinyin_bopomofo = {
|
||||
|
@ -749,9 +749,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
"long", "\u310c\u3128\u3125",
|
||||
"lou", "\u310c\u3121",
|
||||
"lu", "\u310c\u3128",
|
||||
"lü", "\u310c\u3129",
|
||||
"lü", "\u310c\u3129",
|
||||
"luan", "\u310c\u3128\u3122",
|
||||
"lüe", "\u310c\u3129\u311d",
|
||||
"lüe", "\u310c\u3129\u311d",
|
||||
"lun", "\u310c\u3128\u3123",
|
||||
"luo", "\u310c\u3128\u311b",
|
||||
"m", "\u3107",
|
||||
|
@ -796,9 +796,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
"nong", "\u310b\u3128\u3125",
|
||||
"nou", "\u310b\u3121",
|
||||
"nu", "\u310b\u3128",
|
||||
"nü", "\u310b\u3129",
|
||||
"nü", "\u310b\u3129",
|
||||
"nuan", "\u310b\u3128\u3122",
|
||||
"nüe", "\u310b\u3129\u311d",
|
||||
"nüe", "\u310b\u3129\u311d",
|
||||
"nuo", "\u310b\u3128\u311b",
|
||||
"o", "\u311b",
|
||||
"ou", "\u3121",
|
||||
|
@ -1007,52 +1007,52 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
}
|
||||
|
||||
/*
|
||||
U+347C · liù #lyuè
|
||||
U+3500 · lüè #lvè
|
||||
U+3527 · liù #lyù
|
||||
U+3729 · ào #àu
|
||||
U+380E · jí #jjí
|
||||
U+3825 · l· #lv·
|
||||
U+3A3C · lüè #luè
|
||||
U+3B5A · li· #ly· *** lü?
|
||||
U+3CB6 · l· #lv·
|
||||
U+3D56 · niù #nyù *** nü?
|
||||
U+3D88 · li·ng #li·ng
|
||||
U+3EF2 · li· #ly·*** lü?
|
||||
U+3F94 · li· #ly·*** lü?
|
||||
U+4071 · ào #àu
|
||||
U+40AE · liù #lyuè *** lüe?
|
||||
U+430E · liù #lyuè *** lüe?
|
||||
U+451E · liù #lyù *** lü?
|
||||
U+4588 · nüè #nuè
|
||||
U+458B · nüè #nuè
|
||||
U+45A1 · niù #nyù *** nü?
|
||||
U+4610 · niù #nyù *** nü?
|
||||
U+46BC · niù #nyù *** nü?
|
||||
U+46DA · liù #lyuè *** lüe?
|
||||
U+4896 · liù #lyù *** lü?
|
||||
U+4923 · liù #lyuè *** lüe?
|
||||
U+4968 · liù #lyù *** lü?
|
||||
U+4A0B · niù #nyuè *** nüe?
|
||||
U+4AC4 · chuò #chuà
|
||||
U+4D08 · ·o #·u
|
||||
U+4D8A · niù #nyù *** nü?
|
||||
U+51CA · qíng #qýng
|
||||
U+51D6 · zhu·n #zhu·n *** this is probably zh·n
|
||||
U+5481 · gàn #gèm
|
||||
U+5838 · féng #fúng
|
||||
U+639F · lü· #lu· *** this pronunciation surprises me, but I don't know...
|
||||
U+66D5 · yàn #yiàn
|
||||
U+6B3B · chu· #chu· *** chua _is_ ok after all, my table missed an entry
|
||||
U+6B56 · chu· #chu· *** chua
|
||||
U+6C7C · ni· #ni·u
|
||||
U+6E6D · qiú #qióu
|
||||
U+6F71 · y· #yi·
|
||||
U+7493 · xiù #xiòu
|
||||
U+7607 · zh·ng #zh·ng *** I suspect zh·ng
|
||||
U+7674 · luán #lüán
|
||||
U+7867 · y·ng #i·ng
|
||||
U+7878 · nüè #nuè
|
||||
U+347C · liù #lyuè
|
||||
U+3500 · lüè #lvè
|
||||
U+3527 · liù #lyù
|
||||
U+3729 · ào #àu
|
||||
U+380E · jí #jjí
|
||||
U+3825 · l· #lv·
|
||||
U+3A3C · lüè #luè
|
||||
U+3B5A · li· #ly· *** lü?
|
||||
U+3CB6 · l· #lv·
|
||||
U+3D56 · niù #nyù *** nü?
|
||||
U+3D88 · li·ng #li·ng
|
||||
U+3EF2 · li· #ly·*** lü?
|
||||
U+3F94 · li· #ly·*** lü?
|
||||
U+4071 · ào #àu
|
||||
U+40AE · liù #lyuè *** lüe?
|
||||
U+430E · liù #lyuè *** lüe?
|
||||
U+451E · liù #lyù *** lü?
|
||||
U+4588 · nüè #nuè
|
||||
U+458B · nüè #nuè
|
||||
U+45A1 · niù #nyù *** nü?
|
||||
U+4610 · niù #nyù *** nü?
|
||||
U+46BC · niù #nyù *** nü?
|
||||
U+46DA · liù #lyuè *** lüe?
|
||||
U+4896 · liù #lyù *** lü?
|
||||
U+4923 · liù #lyuè *** lüe?
|
||||
U+4968 · liù #lyù *** lü?
|
||||
U+4A0B · niù #nyuè *** nüe?
|
||||
U+4AC4 · chuò #chuà
|
||||
U+4D08 · ·o #·u
|
||||
U+4D8A · niù #nyù *** nü?
|
||||
U+51CA · qíng #qýng
|
||||
U+51D6 · zhu·n #zhu·n *** this is probably zh·n
|
||||
U+5481 · gàn #gèm
|
||||
U+5838 · féng #fúng
|
||||
U+639F · lü· #lu· *** this pronunciation surprises me, but I don't know...
|
||||
U+66D5 · yàn #yiàn
|
||||
U+6B3B · chu· #chu· *** chua _is_ ok after all, my table missed an entry
|
||||
U+6B56 · chu· #chu· *** chua
|
||||
U+6C7C · ni· #ni·u
|
||||
U+6E6D · qiú #qióu
|
||||
U+6F71 · y· #yi·
|
||||
U+7493 · xiù #xiòu
|
||||
U+7607 · zh·ng #zh·ng *** I suspect zh·ng
|
||||
U+7674 · luán #lüán
|
||||
U+7867 · y·ng #i·ng
|
||||
U+7878 · nüè #nuè
|
||||
*/
|
||||
|
||||
static Transliterator fixTypos = Transliterator.createFromRules("fix_typos",
|
||||
|
@ -1061,12 +1061,12 @@ U+7878
|
|||
+"$cons{iou}$nlet > iu;"
|
||||
+"$cons{em}$nlet > an;"
|
||||
+"$cons{uen}$nlet > ueng;"
|
||||
+"$cons{ve}$nlet > üe;"
|
||||
+"$cons{v}$nlet > ü;"
|
||||
+"$cons{ve}$nlet > üe;"
|
||||
+"$cons{v}$nlet > ü;"
|
||||
+"$cons{yue}$nlet > iu;"
|
||||
+"$cons{yng}$nlet > ing;"
|
||||
+"$cons{yu}$nlet > iu;"
|
||||
//+"$cons{ue} > üe;"
|
||||
//+"$cons{ue} > üe;"
|
||||
+"jj > j;"
|
||||
//+"$nlet{ng}$nlet > eng;"
|
||||
//+"$nlet{n}$nlet > en;"
|
||||
|
@ -1076,13 +1076,13 @@ U+7878
|
|||
// new fixes
|
||||
+"zhueng}$nlet > zhong;"
|
||||
+"zhuen}$nlet > zhuan;"
|
||||
+"lue > lüe;"
|
||||
+"lue > lüe;"
|
||||
+"liong > liang;"
|
||||
+"nue > nüe;"
|
||||
+"nue > nüe;"
|
||||
+"chua > chuo;"
|
||||
+"yian > yan;"
|
||||
+"yie > ye;"
|
||||
+"lüan > luan;"
|
||||
+"lüan > luan;"
|
||||
+"iong > yong;"
|
||||
, Transliterator.FORWARD);
|
||||
|
||||
|
@ -1113,7 +1113,7 @@ U+7878
|
|||
try {
|
||||
|
||||
// chinese_frequency.txt
|
||||
// 1 çš„ 1588561 1588561 3.5008%
|
||||
// 1 的 1588561 1588561 3.5008%
|
||||
// japanese_frequency.txt
|
||||
// 1 ? 17176
|
||||
|
||||
|
@ -1421,7 +1421,7 @@ U+7878
|
|||
@Unihan Data
|
||||
|
||||
Bad pinyin data: \u4E7F ? LE
|
||||
\u7684 ? de, de, dí, dì
|
||||
\u7684 ? de, de, dí, dì
|
||||
*/
|
||||
|
||||
static void fixChineseOverrides() throws IOException {
|
||||
|
@ -2024,7 +2024,7 @@ Bad pinyin data: \u4E7F ? LE
|
|||
+ "# otherwise 'o'\n"
|
||||
+ "# otherwise last vowel\n"
|
||||
+ "::NFC;\n"
|
||||
+ "$vowel = [aAeEiIoOuUüÜ];\n"
|
||||
+ "$vowel = [aAeEiIoOuUüÜ];\n"
|
||||
+ "$consonant = [[a-z A-Z] - [$vowel]];\n"
|
||||
+ "$digit = [1-5];\n"
|
||||
+ "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
|
||||
|
@ -2054,10 +2054,10 @@ Bad pinyin data: \u4E7F ? LE
|
|||
if (i > 0) {
|
||||
char last = result.charAt(result.length()-1);
|
||||
if (last == 'u') {
|
||||
result.setCharAt(result.length()-1, 'ü');
|
||||
result.setCharAt(result.length()-1, 'ü');
|
||||
continue main;
|
||||
} else if (last == 'U') {
|
||||
result.setCharAt(result.length()-1, 'Ü');
|
||||
result.setCharAt(result.length()-1, 'Ü');
|
||||
continue main;
|
||||
}
|
||||
}
|
||||
|
@ -2085,22 +2085,22 @@ Bad pinyin data: \u4E7F ? LE
|
|||
for (int i = source.length()-2; i >= 0; --i) {
|
||||
ch = source.charAt(i);
|
||||
if (ch == ':') {
|
||||
ch = 'Ü';
|
||||
ch = 'Ü';
|
||||
--i;
|
||||
}
|
||||
if ('0' <= ch && ch <= '9') break;
|
||||
if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) {
|
||||
if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) {
|
||||
Utility.fixDot();
|
||||
System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")");
|
||||
break;
|
||||
}
|
||||
if (!gotIt) switch (ch) {
|
||||
case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break;
|
||||
case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break;
|
||||
case 'I': ch = "IÍ\u012CÌ\u012A".charAt(num); gotIt = true; break;
|
||||
case 'O': ch = "OÓ\u014EÒ\u014C".charAt(num); gotIt = true; break;
|
||||
case 'U': ch = "UÚ\u016CÙ\u016A".charAt(num); gotIt = true; break;
|
||||
case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
|
||||
case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break;
|
||||
case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break;
|
||||
case 'I': ch = "IÍ\u012CÌ\u012A".charAt(num); gotIt = true; break;
|
||||
case 'O': ch = "OÓ\u014EÒ\u014C".charAt(num); gotIt = true; break;
|
||||
case 'U': ch = "UÚ\u016CÙ\u016A".charAt(num); gotIt = true; break;
|
||||
case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
|
||||
}
|
||||
handlePinyinTemp.insert(0,ch);
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
|
||||
* $Date: 2004/02/07 01:01:15 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -398,23 +398,23 @@ public class GenerateLineBreakTest implements UCD_Types {
|
|||
if (before == LB_CR && after == LB_LF) return false;
|
||||
if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
|
||||
|
||||
//LB 3b Don’t break before hard line breaks.
|
||||
//LB 3b Don’t break before hard line breaks.
|
||||
rule="3b";
|
||||
if (after == LB_BK || after == LB_LF | after == LB_CR) return false;
|
||||
|
||||
// LB 4 Don’t break before spaces or zero-width space.
|
||||
// × SP
|
||||
// × ZW
|
||||
// LB 4 Don’t break before spaces or zero-width space.
|
||||
// × SP
|
||||
// × ZW
|
||||
|
||||
rule="4";
|
||||
if (after == LB_SP || after == LB_ZW) return false;
|
||||
|
||||
// LB 5 Break after zero-width space.
|
||||
// ZW ÷
|
||||
// ZW ÷
|
||||
rule="5";
|
||||
if (before == LB_ZW) return true;
|
||||
|
||||
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
|
||||
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
|
||||
rule="6";
|
||||
if (after == LB_CM) return false;
|
||||
|
||||
|
@ -441,8 +441,8 @@ public class GenerateLineBreakTest implements UCD_Types {
|
|||
rule="7";
|
||||
if (setBase && before == LB_SP) before = LB_ID;
|
||||
|
||||
// LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
|
||||
// × CL, × EX, × IS, × SY
|
||||
// LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
|
||||
// × CL, × EX, × IS, × SY
|
||||
rule="8";
|
||||
if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false;
|
||||
|
||||
|
@ -456,31 +456,31 @@ public class GenerateLineBreakTest implements UCD_Types {
|
|||
}
|
||||
}
|
||||
|
||||
// LB 9 Don’t break after ‘[’, even after spaces.
|
||||
// OP SP* ×
|
||||
// LB 9 Don’t break after ‘[’, even after spaces.
|
||||
// OP SP* ×
|
||||
rule="9";
|
||||
if (lastNonSpace == LB_OP) return false;
|
||||
|
||||
// LB 10 Don’t break within ‘”[’, , even with intervening spaces.
|
||||
// QU SP* × OP
|
||||
// LB 10 Don’t break within ‘”[’, , even with intervening spaces.
|
||||
// QU SP* × OP
|
||||
rule="10";
|
||||
if (lastNonSpace == LB_QU && after == LB_OP) return false;
|
||||
|
||||
// LB 11 Don’t break within ‘]h’, even with intervening spaces.
|
||||
// CL SP* × NS
|
||||
// LB 11 Don’t break within ‘]h’, even with intervening spaces.
|
||||
// CL SP* × NS
|
||||
rule="11";
|
||||
if (lastNonSpace == LB_CL && after == LB_NS) return false;
|
||||
|
||||
// LB 11a Don’t break within ‘——’, even with intervening spaces.
|
||||
// B2 × B2
|
||||
// LB 11a Don’t break within ‘——’, even with intervening spaces.
|
||||
// B2 × B2
|
||||
rule="11a";
|
||||
if (lastNonSpace == LB_B2 && after == LB_B2) return false;
|
||||
|
||||
|
||||
if (recommended) {
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
|
||||
rule="11b";
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
|
@ -490,36 +490,36 @@ public class GenerateLineBreakTest implements UCD_Types {
|
|||
|
||||
rule="12";
|
||||
// LB 12 Break after spaces
|
||||
// SP ÷
|
||||
// SP ÷
|
||||
|
||||
if (before == LB_SP) return true;
|
||||
|
||||
if (!recommended) {
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
|
||||
rule="13";
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
}
|
||||
|
||||
rule="14";
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
// × QU
|
||||
// QU ×
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
// × QU
|
||||
// QU ×
|
||||
if (before == LB_QU || after == LB_QU) return false;
|
||||
|
||||
// LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces,
|
||||
// LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces,
|
||||
// small kana and other non- starters, or after acute accents:
|
||||
// × BA
|
||||
// × HY
|
||||
// × NS
|
||||
// BB ×
|
||||
// × BA
|
||||
// × HY
|
||||
// × NS
|
||||
// BB ×
|
||||
|
||||
if (recommended) {
|
||||
// LB 14a Break before and after CB
|
||||
// CB ÷
|
||||
// ÷ CB
|
||||
// CB ÷
|
||||
// ÷ CB
|
||||
if (before == LB_CB || after == LB_CB) return true;
|
||||
|
||||
}
|
||||
|
@ -532,51 +532,51 @@ public class GenerateLineBreakTest implements UCD_Types {
|
|||
|
||||
if (!recommended) {
|
||||
// LB 15b Break after hyphen-minus, and before acute accents:
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
|
||||
rule="15b";
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
}
|
||||
|
||||
// LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis:
|
||||
// AL × IN
|
||||
// ID × IN
|
||||
// IN × IN
|
||||
// NU × IN
|
||||
// Examples: ’9...’, ‘a...’, ‘H...’
|
||||
// LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis:
|
||||
// AL × IN
|
||||
// ID × IN
|
||||
// IN × IN
|
||||
// NU × IN
|
||||
// Examples: ’9...’, ‘a...’, ‘H...’
|
||||
rule="16";
|
||||
if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
|
||||
if (before == LB_IN && after == LB_IN) return false;
|
||||
|
||||
// Don't break alphanumerics.
|
||||
// LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’
|
||||
// ID × PO
|
||||
// AL × NU
|
||||
// NU × AL
|
||||
// LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’
|
||||
// ID × PO
|
||||
// AL × NU
|
||||
// NU × AL
|
||||
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
|
||||
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
|
||||
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
|
||||
// This is approximated with the following rules. (Some cases already handled above,
|
||||
// like ‘9,’, ‘[9’.)
|
||||
// like ‘9,’, ‘[9’.)
|
||||
rule="17";
|
||||
if (before == LB_ID && after == LB_PO) return false;
|
||||
if (before == LB_AL && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_AL) return false;
|
||||
|
||||
// LB 18 Don’t break between the following pairs of classes.
|
||||
// CL × PO
|
||||
// HY × NU
|
||||
// IS × NU
|
||||
// NU × NU
|
||||
// NU × PO
|
||||
// PR × AL
|
||||
// PR × HY
|
||||
// PR × ID
|
||||
// PR × NU
|
||||
// PR × OP
|
||||
// SY × NU
|
||||
// Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’
|
||||
// LB 18 Don’t break between the following pairs of classes.
|
||||
// CL × PO
|
||||
// HY × NU
|
||||
// IS × NU
|
||||
// NU × NU
|
||||
// NU × PO
|
||||
// PR × AL
|
||||
// PR × HY
|
||||
// PR × ID
|
||||
// PR × NU
|
||||
// PR × OP
|
||||
// SY × NU
|
||||
// Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’
|
||||
|
||||
rule="18";
|
||||
if (before == LB_CL && after == LB_PO) return false;
|
||||
|
@ -595,23 +595,23 @@ public class GenerateLineBreakTest implements UCD_Types {
|
|||
|
||||
if (recommended) {
|
||||
// LB 15b Break after hyphen-minus, and before acute accents:
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
|
||||
rule="18b";
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
}
|
||||
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// AL × AL
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// AL × AL
|
||||
|
||||
rule="19";
|
||||
if (before == LB_AL && after == LB_AL) return false;
|
||||
|
||||
// LB 20 Break everywhere else
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
|
||||
rule="20";
|
||||
return true;
|
||||
|
@ -754,7 +754,7 @@ public class GenerateLineBreakTest implements UCD_Types {
|
|||
// Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
|
||||
|
||||
rule = "12";
|
||||
//Link Extend* × LetterBase (12)
|
||||
//Link Extend* × LetterBase (12)
|
||||
if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) {
|
||||
int backOffset = findLastNon(source, offset, Extend, recommended);
|
||||
if (backOffset >= 0) {
|
||||
|
|
|
@ -1169,6 +1169,7 @@ public class MakeUnicodeFiles {
|
|||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("\uFEFF")) line = line.substring(1);
|
||||
out.println(line);
|
||||
line = line.trim();
|
||||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) line = line.substring(0,pos).trim();
|
||||
|
@ -1232,9 +1233,9 @@ public class MakeUnicodeFiles {
|
|||
break;
|
||||
default: throw new IllegalArgumentException("Internal Error");
|
||||
}
|
||||
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH) + ":\t" + line);
|
||||
if (ok) continue;
|
||||
out.println();
|
||||
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
|
||||
out.println("**** START Error Info ****");
|
||||
bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
|
||||
out.println("**** END Error Info ****");
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2004/02/06 18:30:20 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2004/04/17 18:21:39 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -22,7 +22,7 @@ import com.ibm.text.utility.*;
|
|||
/**
|
||||
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
|
||||
* See UTR#15 for details.<br>
|
||||
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
|
||||
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
|
||||
* The Unicode Consortium makes no expressed or implied warranty of any
|
||||
* kind, and assumes no liability for errors or omissions.
|
||||
* No liability is assumed for incidental and consequential damages
|
||||
|
|
|
@ -10,7 +10,7 @@ import com.ibm.text.utility.*;
|
|||
/**
|
||||
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
|
||||
* See UTR#15 for details.<br>
|
||||
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
|
||||
* Copyright © 1998-1999 Unicode, Inc. All Rights Reserved.<br>
|
||||
* The Unicode Consortium makes no expressed or implied warranty of any
|
||||
* kind, and assumes no liability for errors or omissions.
|
||||
* No liability is assumed for incidental and consequential damages
|
||||
|
|
|
@ -41,6 +41,8 @@
|
|||
#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
|
||||
$GC:Zs ? $Name:«.*SPACE.*»
|
||||
|
||||
[$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
|
||||
|
||||
# Examples of parsing errors
|
||||
|
||||
# $LBA:Neutral = $GC:Zp # example of non-existent property
|
||||
|
@ -54,7 +56,35 @@ $Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
|
|||
$LB:OP = $GC:Ps
|
||||
$General_Category:Decimal_Number = $Numeric_Type:Decimal
|
||||
$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
|
||||
|
||||
# Comparisons across versions
|
||||
|
||||
$ID_Start ⊇ $×ID_Start
|
||||
$ID_Continue ⊇ $×ID_Continue
|
||||
|
||||
#$age:4.0.1 = $age4.0.0
|
||||
|
||||
# Derivations
|
||||
|
||||
$Math = [$GC:Sm $Other_Math]
|
||||
$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
|
||||
$Lowercase = [$GC:Ll $Other_Lowercase]
|
||||
$Uppercase = [$GC:Lu $Other_Uppercase]
|
||||
$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
|
||||
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc]
|
||||
$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
|
||||
$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
|
||||
$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]
|
||||
|
||||
# "Minimal" Other_: NOT hard requirements; just if we want to be minimal
|
||||
|
||||
$Other_Math = [$Math - $GC:Sm]
|
||||
$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
|
||||
$Other_Lowercase = [$Lowercase - $GC:Ll]
|
||||
$Other_Uppercase = [$Uppercase - $GC:Lu]
|
||||
$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
|
||||
$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]]
|
||||
$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
|
||||
|
||||
# Testing
|
||||
$script:greek = $×script:greek
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF32.java,v $
|
||||
* $Date: 2001/08/31 00:19:16 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2004/04/17 18:21:38 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -53,28 +53,28 @@ package com.ibm.text.utility;
|
|||
<pre>
|
||||
// iteration forwards: Original
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
char ch = s.charAt(i);
|
||||
doSomethingWith(ch);
|
||||
char ch = s.charAt(i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
// iteration forwards: Changes for UTF-32
|
||||
int ch;
|
||||
for (int i = 0; i < s.length(); i+=UTF32.count16(ch)) {
|
||||
ch = UTF32.char32At(s,i);
|
||||
doSomethingWith(ch);
|
||||
ch = UTF32.char32At(s,i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
// iteration backwards: Original
|
||||
for (int i = s.length()-1; i >= 0; --i) {
|
||||
char ch = s.charAt(i);
|
||||
doSomethingWith(ch);
|
||||
char ch = s.charAt(i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
// iteration backwards: Changes for UTF-32
|
||||
int ch;
|
||||
for (int i = s.length()-1; i > 0; i-=UTF32.count16(ch)) {
|
||||
ch = UTF32.char32At(s,i);
|
||||
doSomethingWith(ch);
|
||||
ch = UTF32.char32At(s,i);
|
||||
doSomethingWith(ch);
|
||||
}
|
||||
|
||||
* </pre>
|
||||
|
|
Loading…
Add table
Reference in a new issue