mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-2790 recognize alternate transliterator syntax characters
X-SVN-Rev: 11623
This commit is contained in:
parent
42475ef053
commit
d2cea9a9eb
7 changed files with 94 additions and 16 deletions
icu4c/source
icu4j/src/com/ibm/icu
dev/test/translit
impl/data
text
|
@ -274,9 +274,9 @@ t_FWidth_HWidth {
|
|||
"￥<>'¥';" // from FULLWIDTH YEN SIGN
|
||||
"￦<>₩;" // from FULLWIDTH WON SIGN
|
||||
"│<>￨;" // to HALFWIDTH FORMS LIGHT VERTICAL
|
||||
"←<>￩;" // to HALFWIDTH LEFTWARDS ARROW
|
||||
"'←'<>'￩';" // to HALFWIDTH LEFTWARDS ARROW
|
||||
"↑<>￪;" // to HALFWIDTH UPWARDS ARROW
|
||||
"→<>￫;" // to HALFWIDTH RIGHTWARDS ARROW
|
||||
"'→'<>'￫';" // to HALFWIDTH RIGHTWARDS ARROW
|
||||
"↓<>￬;" // to HALFWIDTH DOWNWARDS ARROW
|
||||
"■<>￭;" // to HALFWIDTH BLACK SQUARE
|
||||
"○<>￮;" // to HALFWIDTH WHITE CIRCLE
|
||||
|
|
|
@ -69,6 +69,14 @@ static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
|
|||
// A function is denoted &Source-Target/Variant(text)
|
||||
#define FUNCTION ((UChar)38) /*&*/
|
||||
|
||||
// Aliases for some of the syntax characters. These are provided so
|
||||
// transliteration rules can be expressed in XML without clashing with
|
||||
// XML syntax characters '<', '>', and '&'.
|
||||
// The parser maps each ALT_*_RULE_OP onto its ASCII counterpart before a
// rule is processed, and treats ALT_FUNCTION exactly like FUNCTION.
#define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow; alias for REVERSE_RULE_OP
|
||||
#define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow; alias for FORWARD_RULE_OP
|
||||
#define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow; alias for FWDREV_RULE_OP
|
||||
#define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta); alias for FUNCTION
|
||||
|
||||
// Special characters disallowed at the top level
|
||||
static const UChar ILLEGAL_TOP[] = {41,0}; // ")"
|
||||
|
||||
|
@ -82,12 +90,17 @@ static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(
|
|||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
// private static final char ANCHOR_END = '$';
|
||||
|
||||
static const UChar gOPERATORS[] = {
|
||||
0x3D, 0x3E, 0x3C, 0 // "=><"
|
||||
// Zero-terminated list of the operator characters: the three ASCII
// operators (formerly written out as the literal "=><") followed by the
// arrow aliases U+2192, U+2190 and U+2194.
static const UChar gOPERATORS[] = { // "=><" plus the ALT_* arrow aliases
|
||||
VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
|
||||
ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
|
||||
0
|
||||
};
|
||||
|
||||
static const UChar HALF_ENDERS[] = {
|
||||
0x3D, 0x3E, 0x3C, 59, 0 // "=><;"
|
||||
// Zero-terminated list holding the same operator characters as gOPERATORS
// (ASCII forms and arrow aliases) followed by END_OF_RULE; the ASCII part
// was formerly written out as the literal "=><;".
// NOTE(review): the name suggests these are the characters that end one
// half of a rule -- confirm against the code that scans this array.
static const UChar HALF_ENDERS[] = { // "=><;" plus the ALT_* arrow aliases
|
||||
VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
|
||||
ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
|
||||
END_OF_RULE,
|
||||
0
|
||||
};
|
||||
|
||||
// These are also used in Transliterator::toRules()
|
||||
|
@ -511,6 +524,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
|
|||
}
|
||||
break;
|
||||
case FUNCTION:
|
||||
case ALT_FUNCTION:
|
||||
{
|
||||
int32_t iref = pos;
|
||||
TransliteratorIDParser::SingleID* single =
|
||||
|
@ -1212,6 +1226,19 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
|
|||
op = FWDREV_RULE_OP;
|
||||
}
|
||||
|
||||
// Translate alternate op characters.
|
||||
switch (op) {
|
||||
case ALT_FORWARD_RULE_OP:
|
||||
op = FORWARD_RULE_OP;
|
||||
break;
|
||||
case ALT_REVERSE_RULE_OP:
|
||||
op = REVERSE_RULE_OP;
|
||||
break;
|
||||
case ALT_FWDREV_RULE_OP:
|
||||
op = FWDREV_RULE_OP;
|
||||
break;
|
||||
}
|
||||
|
||||
pos = right->parse(rule, pos, limit);
|
||||
if (U_FAILURE(status)) {
|
||||
return start;
|
||||
|
|
|
@ -183,6 +183,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE(74,TestRuleWhitespace);
|
||||
TESTCASE(75,TestAllCodepoints);
|
||||
TESTCASE(76,TestBoilerplate);
|
||||
TESTCASE(77,TestAlternateSyntax);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -3913,6 +3914,19 @@ void TransliteratorTest::TestBoilerplate() {
|
|||
TEST_TRANSLIT_RULE("a>b;", RuleBasedTransliterator);
|
||||
}
|
||||
|
||||
// Checks that rules written with the alternate (non-ASCII) syntax
// characters are accepted in place of '&', '<', '>' and '<>'.
// NOTE(review): expect() is assumed to take (rules, input, expected
// output) in that order -- confirm against its declaration.
void TransliteratorTest::TestAlternateSyntax() {
|
||||
// U+2206 == &   (function marker)
|
||||
// U+2190 == <   (reverse rule)
|
||||
// U+2192 == >   (forward rule)
|
||||
// U+2194 == <>  (two-way rule)
|
||||
// Arrow operators only: for the input "abc" the expected result "xbz"
// has 'a' and 'c' rewritten, while 'b', whose rule uses the reverse
// arrow, is left untouched.
expect(CharsToUnicodeString("a \\u2192 x; b \\u2190 y; c \\u2194 z"),
|
||||
"abc",
|
||||
"xbz");
|
||||
// U+2206 used as the function marker in front of Name($1): every
// non-ASCII character of the input is expected to come out as its
// \N{...} character name, while the ASCII text stays as it is.
expect(CharsToUnicodeString("([:^ASCII:]) \\u2192 \\u2206Name($1);"),
|
||||
CharsToUnicodeString("<=\\u2190; >=\\u2192; <>=\\u2194; &=\\u2206"),
|
||||
"<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -342,6 +342,8 @@ private:
|
|||
|
||||
void TestBoilerplate(void);
|
||||
|
||||
void TestAlternateSyntax(void);
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||
* $Date: 2003/02/19 00:18:46 $
|
||||
* $Revision: 1.121 $
|
||||
* $Date: 2003/04/23 00:20:16 $
|
||||
* $Revision: 1.122 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -2825,6 +2825,19 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Checks that rules written with the alternate (non-ASCII) syntax
 * characters are accepted in place of '&amp;', '&lt;', '&gt;' and '&lt;&gt;'.
 * NOTE(review): expect() is assumed to take (rules, input, expected
 * output) in that order; confirm against its declaration.
 */
public void TestAlternateSyntax() {
|
||||
// U+2206 == &   (function marker)
|
||||
// U+2190 == <   (reverse rule)
|
||||
// U+2192 == >   (forward rule)
|
||||
// U+2194 == <>  (two-way rule)
|
||||
// Arrow operators only: for the input "abc" the expected result "xbz"
// has 'a' and 'c' rewritten, while 'b', whose rule uses the reverse
// arrow, is left untouched.
expect("a \u2192 x; b \u2190 y; c \u2194 z",
|
||||
"abc",
|
||||
"xbz");
|
||||
// U+2206 used as the function marker in front of Name($1): every
// non-ASCII character of the input is expected to come out as its
// character-name escape, while the ASCII text stays as it is.
expect("([:^ASCII:]) \u2192 \u2206Name($1);",
|
||||
"<=\u2190; >=\u2192; <>=\u2194; &=\u2206",
|
||||
"<=\\N{LEFTWARDS ARROW}; >=\\N{RIGHTWARDS ARROW}; <>=\\N{LEFT RIGHT ARROW}; &=\\N{INCREMENT}");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// These tests are not mirrored (yet) in icu4c at
|
||||
// source/test/intltest/transtst.cpp
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Fullwidth_Halfwidth.txt,v $
|
||||
# $Date: 2002/03/02 00:27:27 $
|
||||
# $Revision: 1.3 $
|
||||
# $Date: 2003/04/23 00:20:15 $
|
||||
# $Revision: 1.4 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Fullwidth-Halfwidth
|
||||
|
@ -264,9 +264,9 @@
|
|||
￥<>'¥'; # from FULLWIDTH YEN SIGN
|
||||
￦<>₩; # from FULLWIDTH WON SIGN
|
||||
│<>￨; # to HALFWIDTH FORMS LIGHT VERTICAL
|
||||
←<>￩; # to HALFWIDTH LEFTWARDS ARROW
|
||||
'←'<>'￩'; # to HALFWIDTH LEFTWARDS ARROW
|
||||
↑<>￪; # to HALFWIDTH UPWARDS ARROW
|
||||
→<>￫; # to HALFWIDTH RIGHTWARDS ARROW
|
||||
'→'<>'￫'; # to HALFWIDTH RIGHTWARDS ARROW
|
||||
↓<>￬; # to HALFWIDTH DOWNWARDS ARROW
|
||||
■<>￭; # to HALFWIDTH BLACK SQUARE
|
||||
○<>￮; # to HALFWIDTH WHITE CIRCLE
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
|
||||
* $Date: 2002/07/26 21:12:36 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2003/04/23 00:20:15 $
|
||||
* $Revision: 1.24 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
@ -133,8 +133,8 @@ class TransliteratorParser {
|
|||
private static final char REVERSE_RULE_OP = '<';
|
||||
private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
|
||||
|
||||
private static final String OPERATORS = "=><";
|
||||
private static final String HALF_ENDERS = "=><;";
|
||||
// Operator characters: '=', '>', '<' plus the arrow aliases U+2190,
// U+2192 and U+2194.
private static final String OPERATORS = "=><\u2190\u2192\u2194";
|
||||
// The same operator characters followed by ';'.
// NOTE(review): the name suggests these are the characters that end one
// half of a rule; confirm against the code that scans this string.
private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
|
||||
|
||||
// Other special characters
|
||||
private static final char QUOTE = '\'';
|
||||
|
@ -167,6 +167,14 @@ class TransliteratorParser {
|
|||
// A function is denoted &Source-Target/Variant(text)
|
||||
private static final char FUNCTION = '&';
|
||||
|
||||
// Aliases for some of the syntax characters. These are provided so
|
||||
// transliteration rules can be expressed in XML without clashing with
|
||||
// XML syntax characters '<', '>', and '&'.
|
||||
// The parser maps each ALT_*_RULE_OP onto its ASCII counterpart before a
// rule is processed, and treats ALT_FUNCTION exactly like FUNCTION.
private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow; alias for REVERSE_RULE_OP
|
||||
private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow; alias for FORWARD_RULE_OP
|
||||
private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow; alias for FWDREV_RULE_OP
|
||||
private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta); alias for FUNCTION
|
||||
|
||||
// Special characters disallowed at the top level
|
||||
private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
|
||||
|
||||
|
@ -569,6 +577,7 @@ class TransliteratorParser {
|
|||
}
|
||||
break;
|
||||
case FUNCTION:
|
||||
case ALT_FUNCTION:
|
||||
{
|
||||
iref[0] = pos;
|
||||
TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
|
||||
|
@ -1112,6 +1121,19 @@ class TransliteratorParser {
|
|||
operator = FWDREV_RULE_OP;
|
||||
}
|
||||
|
||||
// Translate alternate op characters.
|
||||
switch (operator) {
|
||||
case ALT_FORWARD_RULE_OP:
|
||||
operator = FORWARD_RULE_OP;
|
||||
break;
|
||||
case ALT_REVERSE_RULE_OP:
|
||||
operator = REVERSE_RULE_OP;
|
||||
break;
|
||||
case ALT_FWDREV_RULE_OP:
|
||||
operator = FWDREV_RULE_OP;
|
||||
break;
|
||||
}
|
||||
|
||||
pos = right.parse(rule, pos, limit, this);
|
||||
|
||||
if (pos < limit) {
|
||||
|
|
Loading…
Add table
Reference in a new issue