mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
added new parameters to rules, fixed a bit of the xml
X-SVN-Rev: 8930
This commit is contained in:
parent
ce8d7a8716
commit
cbe4468265
4 changed files with 109 additions and 41 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.14 $
|
||||
* $Date: 2002/06/24 15:25:10 $
|
||||
* $Revision: 1.15 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -199,6 +199,7 @@ final public class UCA implements Comparator, UCA_Types {
|
|||
hangulBuffer.setLength(0); // clear hangul buffer
|
||||
|
||||
char weight4 = '\u0000'; // DEFAULT FOR NON_IGNORABLE
|
||||
boolean lastWasVariable = false;
|
||||
|
||||
// process CEs, building weight strings
|
||||
while (true) {
|
||||
|
@ -219,8 +220,13 @@ final public class UCA implements Comparator, UCA_Types {
|
|||
weight4 = 0;
|
||||
} else if (isVariable(ce)) { // variables
|
||||
weight4 = getPrimary(ce);
|
||||
lastWasVariable = true;
|
||||
ce = 0;
|
||||
} else if (lastWasVariable && getPrimary(ce) == 0) { // zap trailing ignorables
|
||||
ce = 0;
|
||||
weight4 = 0;
|
||||
} else { // above variables
|
||||
lastWasVariable = false;
|
||||
weight4 = '\uFFFF';
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2002/06/22 21:02:16 $
|
||||
* $Revision: 1.22 $
|
||||
* $Date: 2002/06/24 15:25:10 $
|
||||
* $Revision: 1.23 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -415,7 +415,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
log.print(
|
||||
";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key));
|
||||
} else {
|
||||
log.print(Utility.hex(source) + "\t" + Utility.hex(clipped));
|
||||
log.print(Utility.hex(source) + ";\t" + Utility.hex(clipped));
|
||||
}
|
||||
log.println();
|
||||
}
|
||||
|
@ -430,6 +430,11 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
}
|
||||
|
||||
static final char LOW_ACCENT = '\u0334';
|
||||
static final String SUPPLEMENTARY_ACCENT = UTF16.valueOf(0x1D165);
|
||||
static final String COMPLETELY_IGNOREABLE = "\u0001";
|
||||
static final String COMPLETELY_IGNOREABLE_ACCENT = "\u0591";
|
||||
static final String[] CONTRACTION_TEST = {SUPPLEMENTARY_ACCENT, COMPLETELY_IGNOREABLE, COMPLETELY_IGNOREABLE_ACCENT};
|
||||
|
||||
static int addCounter = 0;
|
||||
|
||||
static void addStringX(String s, byte option) {
|
||||
|
@ -460,6 +465,17 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
}
|
||||
}
|
||||
}
|
||||
if (UTF16.countCodePoint(s) > 1) {
|
||||
for (int i = 1; i < s.length(); ++i) {
|
||||
if (UTF16.isLeadSurrogate(s.charAt(i-1))) continue; // skip if in middle of supplementary
|
||||
|
||||
for (int j = 0; j < CONTRACTION_TEST.length; ++j) {
|
||||
String extra = s.substring(0,i) + CONTRACTION_TEST[j] + s.substring(i);
|
||||
addStringY(extra + 'a', option);
|
||||
System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static CanonicalIterator canIt = null;
|
||||
|
@ -1399,13 +1415,14 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
};
|
||||
|
||||
if (option == IN_XML) {
|
||||
log.println("<uca>");
|
||||
log.println("<collation>");
|
||||
log.println("<!--");
|
||||
for (int i = 0; i < commentText.length; ++i) {
|
||||
log.println(commentText[i]);
|
||||
}
|
||||
log.println("-->");
|
||||
log.println("<version UCA='" + collator.getDataVersion() + "' UCD='" + collator.getUCDVersion() + "'/>");
|
||||
log.println("<base uca='" + collator.getDataVersion() + "/" + collator.getUCDVersion() + "'/>");
|
||||
log.println("<rules>");
|
||||
} else {
|
||||
log.write('\uFEFF'); // BOM
|
||||
for (int i = 0; i < commentText.length; ++i) {
|
||||
|
@ -1538,38 +1555,60 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
String reset = "";
|
||||
String resetComment = "";
|
||||
int xmlReset = 0;
|
||||
|
||||
boolean insertVariableTop = false;
|
||||
boolean resetToParameter = false;
|
||||
|
||||
int ceLayout = getCELayout(ce);
|
||||
if (ceLayout == IMPLICIT) {
|
||||
if (relation == PRIMARY_DIFF) {
|
||||
int primary = UCA.getPrimary(ce);
|
||||
int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1]));
|
||||
|
||||
int[] ces2 = new int[50];
|
||||
int len2 = collator.getCEs(UTF16.valueOf(resetCp), true, ces2);
|
||||
relation = getStrengthDifference(ces, len, ces2, len2);
|
||||
|
||||
reset = quoteOperand(UTF16.valueOf(resetCp));
|
||||
resetComment = ucd.getCodeAndName(resetCp);
|
||||
// lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
|
||||
xmlReset = 2;
|
||||
}
|
||||
// lastCJKPrimary = primary;
|
||||
} else if (ceLayout != getCELayout(lastCE) || firstTime) {
|
||||
resetToParameter = true;
|
||||
switch (ceLayout) {
|
||||
case T_IGNORE: reset = "last tertiary ignorable"; break;
|
||||
case S_IGNORE: reset = "last secondary ignorable"; break;
|
||||
case P_IGNORE: reset = "last primary ignorable"; break;
|
||||
case VARIABLE: reset = "last non-ignorable"; break;
|
||||
case NON_IGNORE: /*reset = "top"; */ insertVariableTop = true; break;
|
||||
case TRAILING: reset = "last trailing"; break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
if (firstTime
|
||||
|| collator.getPrimary(lastCE) == 0 && collator.getPrimary(ce) != 0
|
||||
|| collator.getSecondary(lastCE) == 0 && collator.getSecondary(ce) != 0
|
||||
|| collator.getTertiary(lastCE) == 0 && collator.getTertiary(ce) != 0) {
|
||||
if (collator.getPrimary(ce) != 0) {
|
||||
reset = "[top]";
|
||||
|
||||
} else if (collator.getSecondary(ce) != 0) {
|
||||
reset = "[last secondary ignorable]";
|
||||
} else if (collator.getTertiary(ce) != 0) {
|
||||
reset = "[last tertiary ignorable]";
|
||||
} else {
|
||||
reset = quoteOperand(chr);
|
||||
|
||||
//reset = quoteOperand(chr);
|
||||
}
|
||||
} else if (variableTop != 0 && (ce & 0xFFFF0000L) > variableTop) {
|
||||
reset = "[variable\\u0020top]";
|
||||
} else if (variableTop != 0 && ce > variableTop) {
|
||||
reset = "[variable top]";
|
||||
xmlReset = 1;
|
||||
variableTop = 0;
|
||||
} else {
|
||||
int primary = collator.getPrimary(ce);
|
||||
if (UCA.isImplicitLeadPrimary(primary)) {
|
||||
if (relation == PRIMARY_DIFF) {
|
||||
int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1]));
|
||||
|
||||
int[] ces2 = new int[50];
|
||||
int len2 = collator.getCEs(UTF16.valueOf(resetCp), true, ces2);
|
||||
relation = getStrengthDifference(ces, len, ces2, len2);
|
||||
|
||||
reset = quoteOperand(UTF16.valueOf(resetCp));
|
||||
resetComment = ucd.getCodeAndName(resetCp);
|
||||
// lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
|
||||
xmlReset = 2;
|
||||
}
|
||||
// lastCJKPrimary = primary;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
if (primary >= 0x3400) {
|
||||
|
@ -1607,26 +1646,30 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
// print results
|
||||
|
||||
if (option == IN_XML) {
|
||||
if (xmlReset == 1) log.print("<variableTop/>");
|
||||
if (insertVariableTop) log.println(XML_RELATION_NAMES[0] + "<variableTop/>");
|
||||
|
||||
/*log.print(" <!--" + ucd.getCodeAndName(chr));
|
||||
if (len > 1) log.print(" / " + Utility.hex(expansion));
|
||||
log.println("-->");
|
||||
*/
|
||||
|
||||
if (xmlReset == 2) {
|
||||
log.print("<reset/>" + Utility.quoteXML(reset));
|
||||
if (reset.length() != 0) {
|
||||
log.println("<reset/>"
|
||||
+ (resetToParameter ? "<position at=\"" + reset + "\"/>" : Utility.quoteXML(reset))
|
||||
+ (resetComment.length() != 0 ? "<!-- " + resetComment + "-->": ""));
|
||||
}
|
||||
if (!firstTime) {
|
||||
log.print(" <" + XML_RELATION_NAMES[relation] + "/>");
|
||||
log.print(Utility.quoteXML(chr));
|
||||
log.println(Utility.quoteXML(chr));
|
||||
//log.print("</" + XML_RELATION_NAMES[relation] + ">");
|
||||
}
|
||||
if (expansion.length() > 0) {
|
||||
log.print("<x/>" + Utility.quoteXML(expansion));
|
||||
log.println("<x/>" + Utility.quoteXML(expansion));
|
||||
}
|
||||
} else {
|
||||
if (reset.length() != 0) log.println("& " + reset
|
||||
if (insertVariableTop) log.println(RELATION_NAMES[0] + " [variable top]");
|
||||
if (reset.length() != 0) log.println("& "
|
||||
+ (resetToParameter ? "[" : "") + reset + (resetToParameter ? "]" : "")
|
||||
+ (resetComment.length() != 0 ? "\t\t# " + resetComment : ""));
|
||||
if (!firstTime) log.print(RELATION_NAMES[relation] + " " + quoteOperand(chr));
|
||||
if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
|
||||
|
@ -1641,12 +1684,31 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
firstTime = false;
|
||||
}
|
||||
// log.println("& [top]"); // RESET
|
||||
if (option == IN_XML) log.println("</uca>");
|
||||
if (option == IN_XML) log.println("</rules></collation>");
|
||||
log2.close();
|
||||
log.close();
|
||||
Utility.fixDot();
|
||||
}
|
||||
|
||||
// Layout categories for a collation element, as returned by getCELayout(int).
// The rule writer switches on these to pick the reset position
// (e.g. T_IGNORE -> "last tertiary ignorable", TRAILING -> "last trailing").
// NOTE(review): NONE is not returned by getCELayout in the visible code;
// presumably a "no layout yet" sentinel -- confirm against callers.
static final int NONE = 0, T_IGNORE = 1, S_IGNORE = 2, P_IGNORE = 3, VARIABLE = 4, NON_IGNORE = 5, IMPLICIT = 6, TRAILING = 7;
|
||||
|
||||
static int getCELayout(int ce) {
|
||||
int primary = collator.getPrimary(ce);
|
||||
int secondary = collator.getSecondary(ce);
|
||||
int tertiary = collator.getSecondary(ce);
|
||||
if (primary == 0) {
|
||||
if (secondary == 0) {
|
||||
if (tertiary == 0) return T_IGNORE;
|
||||
return S_IGNORE;
|
||||
}
|
||||
return P_IGNORE;
|
||||
}
|
||||
if (collator.isVariable(ce)) return VARIABLE;
|
||||
if (primary < UNSUPPORTED_BASE) return NON_IGNORE;
|
||||
if (primary < UNSUPPORTED_LIMIT) return IMPLICIT;
|
||||
return TRAILING;
|
||||
}
|
||||
|
||||
static long getPrimary(int[] ces, int len) {
|
||||
if (len <= 0) return 0;
|
||||
if (UCA.isImplicitLeadCE(ces[0])) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2002/06/22 21:02:16 $
|
||||
* $Revision: 1.11 $
|
||||
* $Date: 2002/06/24 15:25:10 $
|
||||
* $Revision: 1.12 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -212,7 +212,7 @@ public final class Normalizer implements UCD_Types {
|
|||
* @return value from 0 to 255
|
||||
*/
|
||||
|
||||
public short getCanonicalClass(char ch) {
|
||||
public short getCanonicalClass(int ch) {
|
||||
return data.getCanonicalClass(ch);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2002/06/22 01:21:11 $
|
||||
* $Revision: 1.19 $
|
||||
* $Date: 2002/06/24 15:25:10 $
|
||||
* $Revision: 1.20 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -430,12 +430,12 @@ public final class Utility { // COMMON UTILITIES
|
|||
// fix noncharacters, since XML can't handle
|
||||
case 0xFFFE: case 0xFFFF:
|
||||
|
||||
return "#" + hex(c,1);
|
||||
return "<codepoint hex=\"" + hex(c,1) + "\"/>";
|
||||
}
|
||||
|
||||
// fix surrogates, since XML can't handle
|
||||
if (UTF32.isSurrogate(c)) {
|
||||
return "#" + hex(c,1);
|
||||
return "<codepoint hex=\"" + hex(c,1) + "\"/>";
|
||||
}
|
||||
|
||||
if (c <= 0x7E || UTF8) {
|
||||
|
|
Loading…
Add table
Reference in a new issue