mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-0 latest changes for Unicode generation
X-SVN-Rev: 16954
This commit is contained in:
parent
a1487bcd57
commit
b997c53273
4 changed files with 91 additions and 7 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2004/03/11 19:03:17 $
|
||||
* $Revision: 1.35 $
|
||||
* $Date: 2004/12/15 02:39:25 $
|
||||
* $Revision: 1.36 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -793,7 +793,7 @@ public class GenerateData implements UCD_Types {
|
|||
log.println("@Part1 # Character by character test");
|
||||
log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.");
|
||||
log.println("#");
|
||||
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!Default.ucd().isAssigned(ch)) continue;
|
||||
|
@ -846,6 +846,42 @@ public class GenerateData implements UCD_Types {
|
|||
writeLine("a" + sample + UTF32.valueOf32(ch) + "b", log, false);
|
||||
writeLine("a" + UTF32.valueOf32(ch) + sample + "b", log, false);
|
||||
}
|
||||
|
||||
System.out.println("Writing Part 3");
|
||||
log.println("#");
|
||||
log.println("@Part3 # PRI #29 Test");
|
||||
log.println("#");
|
||||
|
||||
Set prilist = new TreeSet();
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!Default.ucd().isAssigned(ch)) continue;
|
||||
if (Default.ucd().isPUA(ch)) continue;
|
||||
if (0xAC00 <= ch && ch <= 0xD7FF) { // skip most
|
||||
if (((ch - 0xAC00) % 91) != 0) continue;
|
||||
}
|
||||
// also gather data for pri29 test
|
||||
if (ch == 0x09CB) {
|
||||
System.out.println("debug");
|
||||
}
|
||||
if (Default.ucd().getDecompositionType(ch) != CANONICAL) continue;
|
||||
//if (!Default.nfc().isNormalized(ch)) continue;
|
||||
String s = Default.ucd().getDecompositionMapping(ch);
|
||||
if (UTF16.hasMoreCodePointsThan(s, 2)) continue;
|
||||
if (!UTF16.hasMoreCodePointsThan(s, 1)) continue;
|
||||
int c1 = UTF16.charAt(s, 0);
|
||||
int c2 = UTF16.charAt(s, UTF16.getCharCount(c1));
|
||||
if (Default.ucd().getCombiningClass(c1) != 0) continue;
|
||||
if (Default.ucd().getCombiningClass(c2) != 0) continue;
|
||||
prilist.add(UTF16.valueOf(c1) + '\u0334' + UTF16.valueOf(c2));
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
for (Iterator it = prilist.iterator(); it.hasNext();) {
|
||||
writeLine((String)it.next(),log, false);
|
||||
}
|
||||
|
||||
Utility.fixDot();
|
||||
log.println("#");
|
||||
log.println("# END OF FILE");
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
Generate:
|
||||
DeltaVersion: 8
|
||||
Generate: Derived.*
|
||||
DeltaVersion: 9
|
||||
CopyrightYear: 2005
|
||||
|
||||
File: uax29/GraphemeBreakProperty
|
||||
|
|
|
@ -32,8 +32,55 @@
|
|||
# <ISO_3166_code> := 2-letter ISO country code,
|
||||
# <ISO_639_code> := 2-letter ISO language code
|
||||
#
|
||||
# A context is one of the following, as defined in the Unicode Standard:
|
||||
# Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot, Not_Before_Dot, After_I
|
||||
# A context for a character C is one of the following. This overrides Table
|
||||
# 3-13. Context Specification for Casing on p. 89 of The Unicode Standard,
|
||||
# Version 4.0.
|
||||
#
|
||||
# Definitions
|
||||
# - The property "cased" is defined in D47 on that same page (p. 89)
|
||||
# - A character C is defined to be "case-ignorable" if it meets either of the
|
||||
# following criteria:
|
||||
# A. The general category of C is Nonspacing Mark (Mn), or Enclosing Mark
|
||||
#     (Me), or Format Control (Cf), or Modifier Letter (Lm), or
|
||||
#     Modifier Symbol (Sk)
|
||||
# B. C is a MidLetter as defined in UAX #29
|
||||
# - A "case-ignorable sequence" is a sequence of zero or more case-ignorable
|
||||
# characters.
|
||||
#
|
||||
# A description of each context is followed by the equivalent regular
|
||||
# expression(s) describing the context before C and/or the context after C.
|
||||
# The regular expression uses the syntax of UTS #18, with one addition:
|
||||
# "!" means that the expression does not match. All regular expressions
|
||||
# below are case-sensitive.
|
||||
#
|
||||
# Context: Final_Sigma
|
||||
# Description: C is preceded by a sequence consisting of a cased letter and
|
||||
# a case-ignorable sequence, and C is not followed by a sequence consisting
|
||||
# of a case-ignorable sequence
|
||||
# and then a cased letter.
|
||||
# Before C: \p{cased} (\p{case-ignorable})*
|
||||
# After C: !( (\p{case-ignorable})* \p{cased} )
|
||||
#
|
||||
# Context: After_Soft_Dotted
|
||||
# Description: The last preceding character with combining class of zero before C was
|
||||
# Soft_Dotted, and there is no intervening character of combining class 230 (ABOVE).
|
||||
# Before C: [\p{Soft_Dotted}] ([^\p{cc=230} \p{cc=0}])*
|
||||
#
|
||||
# Context: More_Above
|
||||
# Description: C is followed by one or more characters of combining class
|
||||
# 230 (ABOVE) in the combining character sequence.
|
||||
# After C: [^\p{cc=0}]* [\p{cc=230}]
|
||||
#
|
||||
# Context: Before_Dot
|
||||
# Description: C is followed by combining dot above (U+0307). Any sequence
|
||||
# of characters with a combining class that is neither 0 nor 230 may intervene
|
||||
# between the current character and the combining dot above.
|
||||
# After C: ([^\p{cc=230} \p{cc=0}])* [\u0307]
|
||||
#
|
||||
# Context: After_I
|
||||
# Description: The last preceding base character was an uppercase I, and
|
||||
# there is no intervening character of combining class 230 (ABOVE).
|
||||
# Before C: [I] ([^\p{cc=230} \p{cc=0}])*
|
||||
#
|
||||
# Parsers of this file must be prepared to deal with future additions to this format:
|
||||
# * Additional contexts
|
||||
|
|
|
@ -65,6 +65,7 @@ $Alphabetic ⊃ [$Uppercase $Lowercase]
|
|||
|
||||
$ID_Start ⊇ $×ID_Start
|
||||
$ID_Continue ⊇ $×ID_Continue
|
||||
[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion]
|
||||
|
||||
#$age:4.0.1 = $age4.0.0
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue