ICU-2790 recognize alternate transliterator syntax characters

X-SVN-Rev: 11623
This commit is contained in:
Alan Liu 2003-04-23 00:20:16 +00:00
parent 42475ef053
commit d2cea9a9eb
7 changed files with 94 additions and 16 deletions
icu4c/source
icu4j/src/com/ibm/icu

View file

@ -274,9 +274,9 @@ t_FWidth_HWidth {
"¥<>'¥';" // from FULLWIDTH YEN SIGN
"₩<>₩;" // from FULLWIDTH WON SIGN
"│<>;" // to HALFWIDTH FORMS LIGHT VERTICAL
"←<>←;" // to HALFWIDTH LEFTWARDS ARROW
"'←'<>'←';" // to HALFWIDTH LEFTWARDS ARROW
"↑<>↑;" // to HALFWIDTH UPWARDS ARROW
"→<>→;" // to HALFWIDTH RIGHTWARDS ARROW
"'→'<>'→';" // to HALFWIDTH RIGHTWARDS ARROW
"↓<>↓;" // to HALFWIDTH DOWNWARDS ARROW
"■<>■;" // to HALFWIDTH BLACK SQUARE
"○<>○;" // to HALFWIDTH WHITE CIRCLE

View file

@ -69,6 +69,14 @@ static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
// A function is denoted &Source-Target/Variant(text)
#define FUNCTION ((UChar)38) /*&*/
// Aliases for some of the syntax characters. These are provided so
// transliteration rules can be expressed in XML without clashing with
// the XML syntax characters '<', '>', and '&'. The rule parser maps the
// three operator aliases onto their ASCII counterparts, and ALT_FUNCTION
// is handled by the same case as FUNCTION.
#define ALT_REVERSE_RULE_OP ((UChar)0x2190) // U+2190 LEFTWARDS ARROW, alias for '<'
#define ALT_FORWARD_RULE_OP ((UChar)0x2192) // U+2192 RIGHTWARDS ARROW, alias for '>'
#define ALT_FWDREV_RULE_OP ((UChar)0x2194) // U+2194 LEFT RIGHT ARROW, alias for "<>"
#define ALT_FUNCTION ((UChar)0x2206) // U+2206 INCREMENT (~Greek Capital Delta), alias for '&'
// Special characters disallowed at the top level
static const UChar ILLEGAL_TOP[] = {41,0}; // ")"
@ -82,12 +90,17 @@ static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END = '$';
static const UChar gOPERATORS[] = {
0x3D, 0x3E, 0x3C, 0 // "=><"
// Zero-terminated list of operator characters: "=><" followed by the
// alternate arrow forms U+2192, U+2190 and U+2194.
static const UChar gOPERATORS[] = { // "=><" plus the ALT_* arrow aliases
VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
0
};
static const UChar HALF_ENDERS[] = {
0x3D, 0x3E, 0x3C, 59, 0 // "=><;"
// Zero-terminated list containing every operator character ("=><" and
// the alternate arrow forms U+2192, U+2190, U+2194) plus the
// end-of-rule character ';'.
static const UChar HALF_ENDERS[] = { // "=><;" plus the ALT_* arrow aliases
VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
END_OF_RULE,
0
};
// These are also used in Transliterator::toRules()
@ -511,6 +524,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
}
break;
case FUNCTION:
case ALT_FUNCTION:
{
int32_t iref = pos;
TransliteratorIDParser::SingleID* single =
@ -1212,6 +1226,19 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
op = FWDREV_RULE_OP;
}
// Translate alternate op characters.
switch (op) {
case ALT_FORWARD_RULE_OP:
op = FORWARD_RULE_OP;
break;
case ALT_REVERSE_RULE_OP:
op = REVERSE_RULE_OP;
break;
case ALT_FWDREV_RULE_OP:
op = FWDREV_RULE_OP;
break;
}
pos = right->parse(rule, pos, limit);
if (U_FAILURE(status)) {
return start;

View file

@ -183,6 +183,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(74,TestRuleWhitespace);
TESTCASE(75,TestAllCodepoints);
TESTCASE(76,TestBoilerplate);
TESTCASE(77,TestAlternateSyntax);
default: name = ""; break;
}
}
@ -3913,6 +3914,19 @@ void TransliteratorTest::TestBoilerplate() {
TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
}
/**
 * Checks that the alternate syntax characters are accepted in rules in
 * place of the ASCII syntax characters they stand in for.
 */
void TransliteratorTest::TestAlternateSyntax() {
// Alternate character == ASCII syntax it replaces:
// U+2206 == &
// U+2190 == <
// U+2192 == >
// U+2194 == <>
// One rule per arrow alias. The expected output shows 'a' rewritten by
// the forward rule, 'b' left untouched by the reverse-only rule, and
// 'c' rewritten by the bidirectional rule.
expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
"abc",
"xbz");
// U+2206 used as the function marker: each non-ASCII character is
// passed to Name, so the alias characters in the input are expected to
// come out as \N{...} names while the ASCII text is left alone.
expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
"<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}");
}
//======================================================================
// Support methods
//======================================================================

View file

@ -342,6 +342,8 @@ private:
void TestBoilerplate(void);
void TestAlternateSyntax(void);
//======================================================================
// Support methods
//======================================================================

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2003/02/19 00:18:46 $
* $Revision: 1.121 $
* $Date: 2003/04/23 00:20:16 $
* $Revision: 1.122 $
*
*****************************************************************************************
*/
@ -2825,6 +2825,19 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
 * Checks that the alternate syntax characters are accepted in rules in
 * place of the ASCII syntax characters they stand in for.
 */
public void TestAlternateSyntax() {
// Alternate character == ASCII syntax it replaces:
// U+2206 == &
// U+2190 == <
// U+2192 == >
// U+2194 == <>
// One rule per arrow alias. The expected output shows 'a' rewritten by
// the forward rule, 'b' left untouched by the reverse-only rule, and
// 'c' rewritten by the bidirectional rule.
expect("a \u2192 x; b \u2190 y; c \u2194 z",
"abc",
"xbz");
// U+2206 used as the function marker: each non-ASCII character is
// passed to Name, so the alias characters in the input are expected to
// come out as \N{...} names while the ASCII text is left alone.
expect("([:^ASCII:]) \u2192 \u2206Name($1);",
"<=\u2190; >=\u2192; <>=\u2194; &=\u2206",
"<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}");
}
//======================================================================
// These tests are not mirrored (yet) in icu4c at
// source/test/intltest/transtst.cpp

View file

@ -3,8 +3,8 @@
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Fullwidth_Halfwidth.txt,v $
# $Date: 2002/03/02 00:27:27 $
# $Revision: 1.3 $
# $Date: 2003/04/23 00:20:15 $
# $Revision: 1.4 $
#--------------------------------------------------------------------
# Fullwidth-Halfwidth
@ -264,9 +264,9 @@
¥<>'¥'; # from FULLWIDTH YEN SIGN
₩<>₩; # from FULLWIDTH WON SIGN
│<>; # to HALFWIDTH FORMS LIGHT VERTICAL
←<>←; # to HALFWIDTH LEFTWARDS ARROW
'←'<>'←'; # to HALFWIDTH LEFTWARDS ARROW
↑<>↑; # to HALFWIDTH UPWARDS ARROW
→<>→; # to HALFWIDTH RIGHTWARDS ARROW
'→'<>'→'; # to HALFWIDTH RIGHTWARDS ARROW
↓<>↓; # to HALFWIDTH DOWNWARDS ARROW
■<>■; # to HALFWIDTH BLACK SQUARE
○<>○; # to HALFWIDTH WHITE CIRCLE

View file

@ -4,8 +4,8 @@
* Corporation and others. All Rights Reserved.
**********************************************************************
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
* $Date: 2002/07/26 21:12:36 $
* $Revision: 1.23 $
* $Date: 2003/04/23 00:20:15 $
* $Revision: 1.24 $
**********************************************************************
*/
package com.ibm.icu.text;
@ -133,8 +133,8 @@ class TransliteratorParser {
private static final char REVERSE_RULE_OP = '<';
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
private static final String OPERATORS = "=><";
private static final String HALF_ENDERS = "=><;";
// Operator characters: '=', '>', '<' and the alternate arrow forms
// U+2190, U+2192, U+2194.
private static final String OPERATORS = "=><\u2190\u2192\u2194";
// The same operator characters followed by ';'.
private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
// Other special characters
private static final char QUOTE = '\'';
@ -167,6 +167,14 @@ class TransliteratorParser {
// A function is denoted &Source-Target/Variant(text)
private static final char FUNCTION = '&';
// Aliases for some of the syntax characters. These are provided so
// transliteration rules can be expressed in XML without clashing with
// the XML syntax characters '<', '>', and '&'. The rule parser maps the
// three operator aliases onto their ASCII counterparts, and ALT_FUNCTION
// is handled by the same case as FUNCTION.
private static final char ALT_REVERSE_RULE_OP = '\u2190'; // U+2190 LEFTWARDS ARROW, alias for '<'
private static final char ALT_FORWARD_RULE_OP = '\u2192'; // U+2192 RIGHTWARDS ARROW, alias for '>'
private static final char ALT_FWDREV_RULE_OP = '\u2194'; // U+2194 LEFT RIGHT ARROW, alias for "<>"
private static final char ALT_FUNCTION = '\u2206'; // U+2206 INCREMENT (~Greek Capital Delta), alias for '&'
// Special characters disallowed at the top level
private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
@ -569,6 +577,7 @@ class TransliteratorParser {
}
break;
case FUNCTION:
case ALT_FUNCTION:
{
iref[0] = pos;
TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
@ -1112,6 +1121,19 @@ class TransliteratorParser {
operator = FWDREV_RULE_OP;
}
// Translate alternate op characters.
switch (operator) {
case ALT_FORWARD_RULE_OP:
operator = FORWARD_RULE_OP;
break;
case ALT_REVERSE_RULE_OP:
operator = REVERSE_RULE_OP;
break;
case ALT_FWDREV_RULE_OP:
operator = FWDREV_RULE_OP;
break;
}
pos = right.parse(rule, pos, limit, this);
if (pos < limit) {