mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-09 15:27:38 +00:00
ICU-352 rbt support for segments, cursor offset, and new syntax
X-SVN-Rev: 1422
This commit is contained in:
parent
6a59bb39c8
commit
563d9e5006
25 changed files with 6885 additions and 5751 deletions
|
@ -1,142 +1,147 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (c) 2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 01/13/2000 aliu Creation.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_Fullwidth_Halfwidth.java
|
||||
// Date: Fri May 19 15:50:22 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Fullwidth-Halfwidth
|
||||
|
||||
fullhalf {
|
||||
Rule {
|
||||
Rule {
|
||||
|
||||
// Mechanically generated from Unicode Character Database
|
||||
|
||||
|
||||
// multicharacter
|
||||
|
||||
"\u30AC<>\uFF76\uFF9E;" // to KATAKANA LETTER GA
|
||||
"\u30AE<>\uFF77\uFF9E;" // to KATAKANA LETTER GI
|
||||
"\u30B0<>\uFF78\uFF9E;" // to KATAKANA LETTER GU
|
||||
"\u30B2<>\uFF79\uFF9E;" // to KATAKANA LETTER GE
|
||||
"\u30B4<>\uFF7A\uFF9E;" // to KATAKANA LETTER GO
|
||||
"\u30B6<>\uFF7B\uFF9E;" // to KATAKANA LETTER ZA
|
||||
"\u30B8<>\uFF7C\uFF9E;" // to KATAKANA LETTER ZI
|
||||
"\u30BA<>\uFF7D\uFF9E;" // to KATAKANA LETTER ZU
|
||||
"\u30BC<>\uFF7E\uFF9E;" // to KATAKANA LETTER ZE
|
||||
"\u30BE<>\uFF7F\uFF9E;" // to KATAKANA LETTER ZO
|
||||
"\u30C0<>\uFF80\uFF9E;" // to KATAKANA LETTER DA
|
||||
"\u30C2<>\uFF81\uFF9E;" // to KATAKANA LETTER DI
|
||||
"\u30C5<>\uFF82\uFF9E;" // to KATAKANA LETTER DU
|
||||
"\u30C7<>\uFF83\uFF9E;" // to KATAKANA LETTER DE
|
||||
"\u30C9<>\uFF84\uFF9E;" // to KATAKANA LETTER DO
|
||||
"\u30D0<>\uFF8A\uFF9E;" // to KATAKANA LETTER BA
|
||||
"\u30D1<>\uFF8A\uFF9F;" // to KATAKANA LETTER PA
|
||||
"\u30D3<>\uFF8B\uFF9E;" // to KATAKANA LETTER BI
|
||||
"\u30D4<>\uFF8B\uFF9F;" // to KATAKANA LETTER PI
|
||||
"\u30D6<>\uFF8C\uFF9E;" // to KATAKANA LETTER BU
|
||||
"\u30D7<>\uFF8C\uFF9F;" // to KATAKANA LETTER PU
|
||||
"\u30D9<>\uFF8D\uFF9E;" // to KATAKANA LETTER BE
|
||||
"\u30DA<>\uFF8D\uFF9F;" // to KATAKANA LETTER PE
|
||||
"\u30DC<>\uFF8E\uFF9E;" // to KATAKANA LETTER BO
|
||||
"\u30DD<>\uFF8E\uFF9F;" // to KATAKANA LETTER PO
|
||||
"\u30F4<>\uFF73\uFF9E;" // to KATAKANA LETTER VU
|
||||
"\u30F7<>\uFF9C\uFF9E;" // to KATAKANA LETTER VA
|
||||
"\u30FA<>\uFF66\uFF9E;" // to KATAKANA LETTER VO
|
||||
|
||||
"\u30AC<>\uFF76\uFF9E;" // to KATAKANA LETTER GA
|
||||
"\u30AE<>\uFF77\uFF9E;" // to KATAKANA LETTER GI
|
||||
"\u30B0<>\uFF78\uFF9E;" // to KATAKANA LETTER GU
|
||||
"\u30B2<>\uFF79\uFF9E;" // to KATAKANA LETTER GE
|
||||
"\u30B4<>\uFF7A\uFF9E;" // to KATAKANA LETTER GO
|
||||
"\u30B6<>\uFF7B\uFF9E;" // to KATAKANA LETTER ZA
|
||||
"\u30B8<>\uFF7C\uFF9E;" // to KATAKANA LETTER ZI
|
||||
"\u30BA<>\uFF7D\uFF9E;" // to KATAKANA LETTER ZU
|
||||
"\u30BC<>\uFF7E\uFF9E;" // to KATAKANA LETTER ZE
|
||||
"\u30BE<>\uFF7F\uFF9E;" // to KATAKANA LETTER ZO
|
||||
"\u30C0<>\uFF80\uFF9E;" // to KATAKANA LETTER DA
|
||||
"\u30C2<>\uFF81\uFF9E;" // to KATAKANA LETTER DI
|
||||
"\u30C5<>\uFF82\uFF9E;" // to KATAKANA LETTER DU
|
||||
"\u30C7<>\uFF83\uFF9E;" // to KATAKANA LETTER DE
|
||||
"\u30C9<>\uFF84\uFF9E;" // to KATAKANA LETTER DO
|
||||
"\u30D0<>\uFF8A\uFF9E;" // to KATAKANA LETTER BA
|
||||
"\u30D1<>\uFF8A\uFF9F;" // to KATAKANA LETTER PA
|
||||
"\u30D3<>\uFF8B\uFF9E;" // to KATAKANA LETTER BI
|
||||
"\u30D4<>\uFF8B\uFF9F;" // to KATAKANA LETTER PI
|
||||
"\u30D6<>\uFF8C\uFF9E;" // to KATAKANA LETTER BU
|
||||
"\u30D7<>\uFF8C\uFF9F;" // to KATAKANA LETTER PU
|
||||
"\u30D9<>\uFF8D\uFF9E;" // to KATAKANA LETTER BE
|
||||
"\u30DA<>\uFF8D\uFF9F;" // to KATAKANA LETTER PE
|
||||
"\u30DC<>\uFF8E\uFF9E;" // to KATAKANA LETTER BO
|
||||
"\u30DD<>\uFF8E\uFF9F;" // to KATAKANA LETTER PO
|
||||
"\u30F4<>\uFF73\uFF9E;" // to KATAKANA LETTER VU
|
||||
"\u30F7<>\uFF9C\uFF9E;" // to KATAKANA LETTER VA
|
||||
"\u30FA<>\uFF66\uFF9E;" // to KATAKANA LETTER VO
|
||||
|
||||
// single character
|
||||
|
||||
"\uFF01<>'!';" // from FULLWIDTH EXCLAMATION MARK
|
||||
"\uFF02<>'\"';" // from FULLWIDTH QUOTATION MARK
|
||||
"\uFF03<>'#';" // from FULLWIDTH NUMBER SIGN
|
||||
"\uFF04<>'$';" // from FULLWIDTH DOLLAR SIGN
|
||||
"\uFF05<>'%';" // from FULLWIDTH PERCENT SIGN
|
||||
"\uFF06<>'&';" // from FULLWIDTH AMPERSAND
|
||||
|
||||
"\uFF01<>'!';" // from FULLWIDTH EXCLAMATION MARK
|
||||
"\uFF02<>'\"';" // from FULLWIDTH QUOTATION MARK
|
||||
"\uFF03<>'#';" // from FULLWIDTH NUMBER SIGN
|
||||
"\uFF04<>'$';" // from FULLWIDTH DOLLAR SIGN
|
||||
"\uFF05<>'%';" // from FULLWIDTH PERCENT SIGN
|
||||
"\uFF06<>'&';" // from FULLWIDTH AMPERSAND
|
||||
"\uFF07<>'';" // from FULLWIDTH APOSTROPHE
|
||||
"\uFF08<>'(';" // from FULLWIDTH LEFT PARENTHESIS
|
||||
"\uFF09<>')';" // from FULLWIDTH RIGHT PARENTHESIS
|
||||
"\uFF0A<>'*';" // from FULLWIDTH ASTERISK
|
||||
"\uFF0B<>'+';" // from FULLWIDTH PLUS SIGN
|
||||
"\uFF0C<>',';" // from FULLWIDTH COMMA
|
||||
"\uFF0D<>'-';" // from FULLWIDTH HYPHEN-MINUS
|
||||
"\uFF0E<>'.';" // from FULLWIDTH FULL STOP
|
||||
"\uFF0F<>'/';" // from FULLWIDTH SOLIDUS
|
||||
"\uFF10<>'0';" // from FULLWIDTH DIGIT ZERO
|
||||
"\uFF11<>'1';" // from FULLWIDTH DIGIT ONE
|
||||
"\uFF12<>'2';" // from FULLWIDTH DIGIT TWO
|
||||
"\uFF13<>'3';" // from FULLWIDTH DIGIT THREE
|
||||
"\uFF14<>'4';" // from FULLWIDTH DIGIT FOUR
|
||||
"\uFF15<>'5';" // from FULLWIDTH DIGIT FIVE
|
||||
"\uFF16<>'6';" // from FULLWIDTH DIGIT SIX
|
||||
"\uFF17<>'7';" // from FULLWIDTH DIGIT SEVEN
|
||||
"\uFF18<>'8';" // from FULLWIDTH DIGIT EIGHT
|
||||
"\uFF19<>'9';" // from FULLWIDTH DIGIT NINE
|
||||
"\uFF1A<>':';" // from FULLWIDTH COLON
|
||||
"\uFF1B<>';';" // from FULLWIDTH SEMICOLON
|
||||
"\uFF1C<>'<';" // from FULLWIDTH LESS-THAN SIGN
|
||||
"\uFF1D<>'=';" // from FULLWIDTH EQUALS SIGN
|
||||
"\uFF1E<>'>';" // from FULLWIDTH GREATER-THAN SIGN
|
||||
"\uFF1F<>'?';" // from FULLWIDTH QUESTION MARK
|
||||
"\uFF20<>'@';" // from FULLWIDTH COMMERCIAL AT
|
||||
"\uFF21<>A;" // from FULLWIDTH LATIN CAPITAL LETTER A
|
||||
"\uFF22<>B;" // from FULLWIDTH LATIN CAPITAL LETTER B
|
||||
"\uFF23<>C;" // from FULLWIDTH LATIN CAPITAL LETTER C
|
||||
"\uFF24<>D;" // from FULLWIDTH LATIN CAPITAL LETTER D
|
||||
"\uFF25<>E;" // from FULLWIDTH LATIN CAPITAL LETTER E
|
||||
"\uFF26<>F;" // from FULLWIDTH LATIN CAPITAL LETTER F
|
||||
"\uFF27<>G;" // from FULLWIDTH LATIN CAPITAL LETTER G
|
||||
"\uFF28<>H;" // from FULLWIDTH LATIN CAPITAL LETTER H
|
||||
"\uFF29<>I;" // from FULLWIDTH LATIN CAPITAL LETTER I
|
||||
"\uFF2A<>J;" // from FULLWIDTH LATIN CAPITAL LETTER J
|
||||
"\uFF2B<>K;" // from FULLWIDTH LATIN CAPITAL LETTER K
|
||||
"\uFF2C<>L;" // from FULLWIDTH LATIN CAPITAL LETTER L
|
||||
"\uFF2D<>M;" // from FULLWIDTH LATIN CAPITAL LETTER M
|
||||
"\uFF2E<>N;" // from FULLWIDTH LATIN CAPITAL LETTER N
|
||||
"\uFF2F<>O;" // from FULLWIDTH LATIN CAPITAL LETTER O
|
||||
"\uFF30<>P;" // from FULLWIDTH LATIN CAPITAL LETTER P
|
||||
"\uFF31<>Q;" // from FULLWIDTH LATIN CAPITAL LETTER Q
|
||||
"\uFF32<>R;" // from FULLWIDTH LATIN CAPITAL LETTER R
|
||||
"\uFF33<>S;" // from FULLWIDTH LATIN CAPITAL LETTER S
|
||||
"\uFF34<>T;" // from FULLWIDTH LATIN CAPITAL LETTER T
|
||||
"\uFF35<>U;" // from FULLWIDTH LATIN CAPITAL LETTER U
|
||||
"\uFF36<>V;" // from FULLWIDTH LATIN CAPITAL LETTER V
|
||||
"\uFF37<>W;" // from FULLWIDTH LATIN CAPITAL LETTER W
|
||||
"\uFF38<>X;" // from FULLWIDTH LATIN CAPITAL LETTER X
|
||||
"\uFF39<>Y;" // from FULLWIDTH LATIN CAPITAL LETTER Y
|
||||
"\uFF3A<>Z;" // from FULLWIDTH LATIN CAPITAL LETTER Z
|
||||
"\uFF3B<>'[';" // from FULLWIDTH LEFT SQUARE BRACKET
|
||||
"\uFF3C<>'\\';" // from FULLWIDTH REVERSE SOLIDUS {double escape - aliu}
|
||||
"\uFF3D<>']';" // from FULLWIDTH RIGHT SQUARE BRACKET
|
||||
"\uFF3E<>'^';" // from FULLWIDTH CIRCUMFLEX ACCENT
|
||||
"\uFF3F<>'_';" // from FULLWIDTH LOW LINE
|
||||
"\uFF40<>'`';" // from FULLWIDTH GRAVE ACCENT
|
||||
"\uFF41<>a;" // from FULLWIDTH LATIN SMALL LETTER A
|
||||
"\uFF42<>b;" // from FULLWIDTH LATIN SMALL LETTER B
|
||||
"\uFF43<>c;" // from FULLWIDTH LATIN SMALL LETTER C
|
||||
"\uFF44<>d;" // from FULLWIDTH LATIN SMALL LETTER D
|
||||
"\uFF45<>e;" // from FULLWIDTH LATIN SMALL LETTER E
|
||||
"\uFF46<>f;" // from FULLWIDTH LATIN SMALL LETTER F
|
||||
"\uFF47<>g;" // from FULLWIDTH LATIN SMALL LETTER G
|
||||
"\uFF48<>h;" // from FULLWIDTH LATIN SMALL LETTER H
|
||||
"\uFF49<>i;" // from FULLWIDTH LATIN SMALL LETTER I
|
||||
"\uFF4A<>j;" // from FULLWIDTH LATIN SMALL LETTER J
|
||||
"\uFF4B<>k;" // from FULLWIDTH LATIN SMALL LETTER K
|
||||
"\uFF4C<>l;" // from FULLWIDTH LATIN SMALL LETTER L
|
||||
"\uFF4D<>m;" // from FULLWIDTH LATIN SMALL LETTER M
|
||||
"\uFF4E<>n;" // from FULLWIDTH LATIN SMALL LETTER N
|
||||
"\uFF4F<>o;" // from FULLWIDTH LATIN SMALL LETTER O
|
||||
"\uFF50<>p;" // from FULLWIDTH LATIN SMALL LETTER P
|
||||
"\uFF51<>q;" // from FULLWIDTH LATIN SMALL LETTER Q
|
||||
"\uFF52<>r;" // from FULLWIDTH LATIN SMALL LETTER R
|
||||
"\uFF53<>s;" // from FULLWIDTH LATIN SMALL LETTER S
|
||||
"\uFF54<>t;" // from FULLWIDTH LATIN SMALL LETTER T
|
||||
"\uFF55<>u;" // from FULLWIDTH LATIN SMALL LETTER U
|
||||
"\uFF56<>v;" // from FULLWIDTH LATIN SMALL LETTER V
|
||||
"\uFF57<>w;" // from FULLWIDTH LATIN SMALL LETTER W
|
||||
"\uFF58<>x;" // from FULLWIDTH LATIN SMALL LETTER X
|
||||
"\uFF59<>y;" // from FULLWIDTH LATIN SMALL LETTER Y
|
||||
"\uFF5A<>z;" // from FULLWIDTH LATIN SMALL LETTER Z
|
||||
"\uFF5B<>'{';" // from FULLWIDTH LEFT CURLY BRACKET
|
||||
"\uFF5C<>'|';" // from FULLWIDTH VERTICAL LINE
|
||||
"\uFF5D<>'}';" // from FULLWIDTH RIGHT CURLY BRACKET
|
||||
"\uFF5E<>'~';" // from FULLWIDTH TILDE
|
||||
"\uFF08<>'(';" // from FULLWIDTH LEFT PARENTHESIS
|
||||
"\uFF09<>')';" // from FULLWIDTH RIGHT PARENTHESIS
|
||||
"\uFF0A<>'*';" // from FULLWIDTH ASTERISK
|
||||
"\uFF0B<>'+';" // from FULLWIDTH PLUS SIGN
|
||||
"\uFF0C<>',';" // from FULLWIDTH COMMA
|
||||
"\uFF0D<>'-';" // from FULLWIDTH HYPHEN-MINUS
|
||||
"\uFF0E<>'.';" // from FULLWIDTH FULL STOP
|
||||
"\uFF0F<>'/';" // from FULLWIDTH SOLIDUS
|
||||
"\uFF10<>'0';" // from FULLWIDTH DIGIT ZERO
|
||||
"\uFF11<>'1';" // from FULLWIDTH DIGIT ONE
|
||||
"\uFF12<>'2';" // from FULLWIDTH DIGIT TWO
|
||||
"\uFF13<>'3';" // from FULLWIDTH DIGIT THREE
|
||||
"\uFF14<>'4';" // from FULLWIDTH DIGIT FOUR
|
||||
"\uFF15<>'5';" // from FULLWIDTH DIGIT FIVE
|
||||
"\uFF16<>'6';" // from FULLWIDTH DIGIT SIX
|
||||
"\uFF17<>'7';" // from FULLWIDTH DIGIT SEVEN
|
||||
"\uFF18<>'8';" // from FULLWIDTH DIGIT EIGHT
|
||||
"\uFF19<>'9';" // from FULLWIDTH DIGIT NINE
|
||||
"\uFF1A<>':';" // from FULLWIDTH COLON
|
||||
"\uFF1B<>';';" // from FULLWIDTH SEMICOLON
|
||||
"\uFF1C<>'<';" // from FULLWIDTH LESS-THAN SIGN
|
||||
"\uFF1D<>'=';" // from FULLWIDTH EQUALS SIGN
|
||||
"\uFF1E<>'>';" // from FULLWIDTH GREATER-THAN SIGN
|
||||
"\uFF1F<>'?';" // from FULLWIDTH QUESTION MARK
|
||||
"\uFF20<>'@';" // from FULLWIDTH COMMERCIAL AT
|
||||
"\uFF21<>A;" // from FULLWIDTH LATIN CAPITAL LETTER A
|
||||
"\uFF22<>B;" // from FULLWIDTH LATIN CAPITAL LETTER B
|
||||
"\uFF23<>C;" // from FULLWIDTH LATIN CAPITAL LETTER C
|
||||
"\uFF24<>D;" // from FULLWIDTH LATIN CAPITAL LETTER D
|
||||
"\uFF25<>E;" // from FULLWIDTH LATIN CAPITAL LETTER E
|
||||
"\uFF26<>F;" // from FULLWIDTH LATIN CAPITAL LETTER F
|
||||
"\uFF27<>G;" // from FULLWIDTH LATIN CAPITAL LETTER G
|
||||
"\uFF28<>H;" // from FULLWIDTH LATIN CAPITAL LETTER H
|
||||
"\uFF29<>I;" // from FULLWIDTH LATIN CAPITAL LETTER I
|
||||
"\uFF2A<>J;" // from FULLWIDTH LATIN CAPITAL LETTER J
|
||||
"\uFF2B<>K;" // from FULLWIDTH LATIN CAPITAL LETTER K
|
||||
"\uFF2C<>L;" // from FULLWIDTH LATIN CAPITAL LETTER L
|
||||
"\uFF2D<>M;" // from FULLWIDTH LATIN CAPITAL LETTER M
|
||||
"\uFF2E<>N;" // from FULLWIDTH LATIN CAPITAL LETTER N
|
||||
"\uFF2F<>O;" // from FULLWIDTH LATIN CAPITAL LETTER O
|
||||
"\uFF30<>P;" // from FULLWIDTH LATIN CAPITAL LETTER P
|
||||
"\uFF31<>Q;" // from FULLWIDTH LATIN CAPITAL LETTER Q
|
||||
"\uFF32<>R;" // from FULLWIDTH LATIN CAPITAL LETTER R
|
||||
"\uFF33<>S;" // from FULLWIDTH LATIN CAPITAL LETTER S
|
||||
"\uFF34<>T;" // from FULLWIDTH LATIN CAPITAL LETTER T
|
||||
"\uFF35<>U;" // from FULLWIDTH LATIN CAPITAL LETTER U
|
||||
"\uFF36<>V;" // from FULLWIDTH LATIN CAPITAL LETTER V
|
||||
"\uFF37<>W;" // from FULLWIDTH LATIN CAPITAL LETTER W
|
||||
"\uFF38<>X;" // from FULLWIDTH LATIN CAPITAL LETTER X
|
||||
"\uFF39<>Y;" // from FULLWIDTH LATIN CAPITAL LETTER Y
|
||||
"\uFF3A<>Z;" // from FULLWIDTH LATIN CAPITAL LETTER Z
|
||||
"\uFF3B<>'[';" // from FULLWIDTH LEFT SQUARE BRACKET
|
||||
"\uFF3C<>'\\';" // from FULLWIDTH REVERSE SOLIDUS {double escape - aliu}
|
||||
"\uFF3D<>']';" // from FULLWIDTH RIGHT SQUARE BRACKET
|
||||
"\uFF3E<>'^';" // from FULLWIDTH CIRCUMFLEX ACCENT
|
||||
"\uFF3F<>'_';" // from FULLWIDTH LOW LINE
|
||||
"\uFF40<>'`';" // from FULLWIDTH GRAVE ACCENT
|
||||
"\uFF41<>a;" // from FULLWIDTH LATIN SMALL LETTER A
|
||||
"\uFF42<>b;" // from FULLWIDTH LATIN SMALL LETTER B
|
||||
"\uFF43<>c;" // from FULLWIDTH LATIN SMALL LETTER C
|
||||
"\uFF44<>d;" // from FULLWIDTH LATIN SMALL LETTER D
|
||||
"\uFF45<>e;" // from FULLWIDTH LATIN SMALL LETTER E
|
||||
"\uFF46<>f;" // from FULLWIDTH LATIN SMALL LETTER F
|
||||
"\uFF47<>g;" // from FULLWIDTH LATIN SMALL LETTER G
|
||||
"\uFF48<>h;" // from FULLWIDTH LATIN SMALL LETTER H
|
||||
"\uFF49<>i;" // from FULLWIDTH LATIN SMALL LETTER I
|
||||
"\uFF4A<>j;" // from FULLWIDTH LATIN SMALL LETTER J
|
||||
"\uFF4B<>k;" // from FULLWIDTH LATIN SMALL LETTER K
|
||||
"\uFF4C<>l;" // from FULLWIDTH LATIN SMALL LETTER L
|
||||
"\uFF4D<>m;" // from FULLWIDTH LATIN SMALL LETTER M
|
||||
"\uFF4E<>n;" // from FULLWIDTH LATIN SMALL LETTER N
|
||||
"\uFF4F<>o;" // from FULLWIDTH LATIN SMALL LETTER O
|
||||
"\uFF50<>p;" // from FULLWIDTH LATIN SMALL LETTER P
|
||||
"\uFF51<>q;" // from FULLWIDTH LATIN SMALL LETTER Q
|
||||
"\uFF52<>r;" // from FULLWIDTH LATIN SMALL LETTER R
|
||||
"\uFF53<>s;" // from FULLWIDTH LATIN SMALL LETTER S
|
||||
"\uFF54<>t;" // from FULLWIDTH LATIN SMALL LETTER T
|
||||
"\uFF55<>u;" // from FULLWIDTH LATIN SMALL LETTER U
|
||||
"\uFF56<>v;" // from FULLWIDTH LATIN SMALL LETTER V
|
||||
"\uFF57<>w;" // from FULLWIDTH LATIN SMALL LETTER W
|
||||
"\uFF58<>x;" // from FULLWIDTH LATIN SMALL LETTER X
|
||||
"\uFF59<>y;" // from FULLWIDTH LATIN SMALL LETTER Y
|
||||
"\uFF5A<>z;" // from FULLWIDTH LATIN SMALL LETTER Z
|
||||
"\uFF5B<>'{';" // from FULLWIDTH LEFT CURLY BRACKET
|
||||
"\uFF5C<>'|';" // from FULLWIDTH VERTICAL LINE
|
||||
"\uFF5D<>'}';" // from FULLWIDTH RIGHT CURLY BRACKET
|
||||
"\uFF5E<>'~';" // from FULLWIDTH TILDE
|
||||
"\u3002<>\uFF61;" // to HALFWIDTH IDEOGRAPHIC FULL STOP
|
||||
"\u300C<>\uFF62;" // to HALFWIDTH LEFT CORNER BRACKET
|
||||
"\u300D<>\uFF63;" // to HALFWIDTH RIGHT CORNER BRACKET
|
||||
|
@ -252,12 +257,12 @@ fullhalf {
|
|||
"\u1173<>\uFFDA;" // to HALFWIDTH HANGUL LETTER EU
|
||||
"\u1174<>\uFFDB;" // to HALFWIDTH HANGUL LETTER YI
|
||||
"\u1175<>\uFFDC;" // to HALFWIDTH HANGUL LETTER I
|
||||
"\uFFE0<>'\u00a2';" // from FULLWIDTH CENT SIGN
|
||||
"\uFFE1<>'\u00a3';" // from FULLWIDTH POUND SIGN
|
||||
"\uFFE2<>'\u00ac';" // from FULLWIDTH NOT SIGN
|
||||
"\uFFE3<>' '\u0304;" // from FULLWIDTH MACRON
|
||||
"\uFFE4<>'\u00a6';" // from FULLWIDTH BROKEN BAR
|
||||
"\uFFE5<>'\u00a5';" // from FULLWIDTH YEN SIGN
|
||||
"\uFFE0<>'\u00a2';" // from FULLWIDTH CENT SIGN
|
||||
"\uFFE1<>'\u00a3';" // from FULLWIDTH POUND SIGN
|
||||
"\uFFE2<>'\u00ac';" // from FULLWIDTH NOT SIGN
|
||||
"\uFFE3<>' '\u0304;" // from FULLWIDTH MACRON
|
||||
"\uFFE4<>'\u00a6';" // from FULLWIDTH BROKEN BAR
|
||||
"\uFFE5<>'\u00a5';" // from FULLWIDTH YEN SIGN
|
||||
"\uFFE6<>\u20A9;" // from FULLWIDTH WON SIGN
|
||||
"\u2502<>\uFFE8;" // to HALFWIDTH FORMS LIGHT VERTICAL
|
||||
"\u2190<>\uFFE9;" // to HALFWIDTH LEFTWARDS ARROW
|
||||
|
@ -266,5 +271,6 @@ fullhalf {
|
|||
"\u2193<>\uFFEC;" // to HALFWIDTH DOWNWARDS ARROW
|
||||
"\u25A0<>\uFFED;" // to HALFWIDTH BLACK SQUARE
|
||||
"\u25CB<>\uFFEE;" // to HALFWIDTH WHITE CIRCLE
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,128 +1,130 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_KeyboardEscape_Latin1.java
|
||||
// Date: Fri May 19 15:50:22 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// KeyboardEscape-Latin1
|
||||
|
||||
kbdescl1 {
|
||||
Rule {
|
||||
"esc='';"
|
||||
"grave=`;"
|
||||
"acute='';"
|
||||
"hat=^;"
|
||||
"tilde=~;"
|
||||
"umlaut=:;"
|
||||
"ring=.;"
|
||||
"cedilla=,;"
|
||||
"slash=/;"
|
||||
"super=^;"
|
||||
|
||||
// Make keyboard entry of {esc} possible
|
||||
// and of backslash
|
||||
"'\\'{esc}>{esc};"
|
||||
"'\\\\'>'\\';"
|
||||
|
||||
// Long keys
|
||||
"cur{esc}>\u00A4;"
|
||||
"sec{esc}>\u00A7;"
|
||||
"not{esc}>\u00AC;"
|
||||
"mul{esc}>\u00D7;"
|
||||
"div{esc}>\u00F7;"
|
||||
|
||||
"\\ {esc}>\u00A0;" // non-breaking space
|
||||
"!{esc}>\u00A1;" // inverted exclamation
|
||||
"c/{esc}>\u00A2;" // cent sign
|
||||
"lb{esc}>\u00A3;" // pound sign
|
||||
"'|'{esc}>\u00A6;" // broken vertical bar
|
||||
":{esc}>\u00A8;" // umlaut
|
||||
"{super}a{esc}>\u00AA;" // feminine ordinal
|
||||
"'<<'{esc}>\u00AB;"
|
||||
"r{esc}>\u00AE;"
|
||||
"--{esc}>\u00AF;"
|
||||
"-{esc}>\u00AD;"
|
||||
"+-{esc}>\u00B1;"
|
||||
"{super}2{esc}>\u00B2;"
|
||||
"{super}3{esc}>\u00B3;"
|
||||
"{acute}{esc}>\u00B4;"
|
||||
"m{esc}>\u00B5;"
|
||||
"para{esc}>\u00B6;"
|
||||
"dot{esc}>\u00B7;"
|
||||
"{cedilla}{esc}>\u00B8;"
|
||||
"{super}1{esc}>\u00B9;"
|
||||
"{super}o{esc}>\u00BA;" // masculine ordinal
|
||||
"'>>'{esc}>\u00BB;"
|
||||
"1/4{esc}>\u00BC;"
|
||||
"1/2{esc}>\u00BD;"
|
||||
"3/4{esc}>\u00BE;"
|
||||
"?{esc}>\u00BF;"
|
||||
"A{grave}{esc}>\u00C0;"
|
||||
"A{acute}{esc}>\u00C1;"
|
||||
"A{hat}{esc}>\u00C2;"
|
||||
"A{tilde}{esc}>\u00C3;"
|
||||
"A{umlaut}{esc}>\u00C4;"
|
||||
"A{ring}{esc}>\u00C5;"
|
||||
"AE{esc}>\u00C6;"
|
||||
"C{cedilla}{esc}>\u00C7;"
|
||||
"E{grave}{esc}>\u00C8;"
|
||||
"E{acute}{esc}>\u00C9;"
|
||||
"E{hat}{esc}>\u00CA;"
|
||||
"E{umlaut}{esc}>\u00CB;"
|
||||
"I{grave}{esc}>\u00CC;"
|
||||
"I{acute}{esc}>\u00CD;"
|
||||
"I{hat}{esc}>\u00CE;"
|
||||
"I{umlaut}{esc}>\u00CF;"
|
||||
"D-{esc}>\u00D0;"
|
||||
"N{tilde}{esc}>\u00D1;"
|
||||
"O{grave}{esc}>\u00D2;"
|
||||
"O{acute}{esc}>\u00D3;"
|
||||
"O{hat}{esc}>\u00D4;"
|
||||
"O{tilde}{esc}>\u00D5;"
|
||||
"O{umlaut}{esc}>\u00D6;"
|
||||
"O{slash}{esc}>\u00D8;"
|
||||
"U{grave}{esc}>\u00D9;"
|
||||
"U{acute}{esc}>\u00DA;"
|
||||
"U{hat}{esc}>\u00DB;"
|
||||
"U{umlaut}{esc}>\u00DC;"
|
||||
"Y{acute}{esc}>\u00DD;"
|
||||
"TH{esc}>\u00DE;"
|
||||
"ss{esc}>\u00DF;"
|
||||
"a{grave}{esc}>\u00E0;"
|
||||
"a{acute}{esc}>\u00E1;"
|
||||
"a{hat}{esc}>\u00E2;"
|
||||
"a{tilde}{esc}>\u00E3;"
|
||||
"a{umlaut}{esc}>\u00E4;"
|
||||
"a{ring}{esc}>\u00E5;"
|
||||
"ae{esc}>\u00E6;"
|
||||
"c{cedilla}{esc}>\u00E7;"
|
||||
"c{esc}>\u00A9;" // copyright - after c{cedilla}
|
||||
"e{grave}{esc}>\u00E8;"
|
||||
"e{acute}{esc}>\u00E9;"
|
||||
"e{hat}{esc}>\u00EA;"
|
||||
"e{umlaut}{esc}>\u00EB;"
|
||||
"i{grave}{esc}>\u00EC;"
|
||||
"i{acute}{esc}>\u00ED;"
|
||||
"i{hat}{esc}>\u00EE;"
|
||||
"i{umlaut}{esc}>\u00EF;"
|
||||
"d-{esc}>\u00F0;"
|
||||
"n{tilde}{esc}>\u00F1;"
|
||||
"o{grave}{esc}>\u00F2;"
|
||||
"o{acute}{esc}>\u00F3;"
|
||||
"o{hat}{esc}>\u00F4;"
|
||||
"o{tilde}{esc}>\u00F5;"
|
||||
"o{umlaut}{esc}>\u00F6;"
|
||||
"o{slash}{esc}>\u00F8;"
|
||||
"o{esc}>\u00B0;"
|
||||
"u{grave}{esc}>\u00F9;"
|
||||
"u{acute}{esc}>\u00FA;"
|
||||
"u{hat}{esc}>\u00FB;"
|
||||
"u{umlaut}{esc}>\u00FC;"
|
||||
"y{acute}{esc}>\u00FD;"
|
||||
"y{esc}>\u00A5;" // yen sign
|
||||
"th{esc}>\u00FE;"
|
||||
//masked: "ss{esc}>\u00FF;"
|
||||
}
|
||||
Rule {
|
||||
"$esc='';"
|
||||
"$grave='`';"
|
||||
"$acute='';"
|
||||
"$hat='^';"
|
||||
"$tilde='~';"
|
||||
"$umlaut=':';"
|
||||
"$ring='.';"
|
||||
"$cedilla=',';"
|
||||
"$slash='/';"
|
||||
"$super='^';"
|
||||
|
||||
// Make keyboard entry of {esc} possible
|
||||
// and of backslash
|
||||
"'\\'$esc>$esc;"
|
||||
"'\\\\'>'\\';"
|
||||
|
||||
// Long keys
|
||||
"cur$esc>\u00A4;"
|
||||
"sec$esc>\u00A7;"
|
||||
"not$esc>\u00AC;"
|
||||
"mul$esc>\u00D7;"
|
||||
"div$esc>\u00F7;"
|
||||
|
||||
"\\ $esc>\u00A0;" // non-breaking space
|
||||
"'!'$esc>\u00A1;" // inverted exclamation
|
||||
"c'/'$esc>\u00A2;" // cent sign
|
||||
"lb$esc>\u00A3;" // pound sign
|
||||
"'|'$esc>\u00A6;" // broken vertical bar
|
||||
"':'$esc>\u00A8;" // umlaut
|
||||
"$super a$esc>\u00AA;" // feminine ordinal
|
||||
"'<<'$esc>\u00AB;"
|
||||
"r$esc>\u00AE;"
|
||||
"'--'$esc>\u00AF;"
|
||||
"'-'$esc>\u00AD;"
|
||||
"'+-'$esc>\u00B1;"
|
||||
"$super 2$esc>\u00B2;"
|
||||
"$super 3$esc>\u00B3;"
|
||||
"$acute$esc>\u00B4;"
|
||||
"m$esc>\u00B5;"
|
||||
"para$esc>\u00B6;"
|
||||
"dot$esc>\u00B7;"
|
||||
"$cedilla$esc>\u00B8;"
|
||||
"$super 1$esc>\u00B9;"
|
||||
"$super o$esc>\u00BA;" // masculine ordinal
|
||||
"'>>'$esc>\u00BB;"
|
||||
"'1/4'$esc>\u00BC;"
|
||||
"'1/2'$esc>\u00BD;"
|
||||
"'3/4'$esc>\u00BE;"
|
||||
"'?'$esc>\u00BF;"
|
||||
"A$grave$esc>\u00C0;"
|
||||
"A$acute$esc>\u00C1;"
|
||||
"A$hat$esc>\u00C2;"
|
||||
"A$tilde$esc>\u00C3;"
|
||||
"A$umlaut$esc>\u00C4;"
|
||||
"A$ring$esc>\u00C5;"
|
||||
"AE$esc>\u00C6;"
|
||||
"C$cedilla$esc>\u00C7;"
|
||||
"E$grave$esc>\u00C8;"
|
||||
"E$acute$esc>\u00C9;"
|
||||
"E$hat$esc>\u00CA;"
|
||||
"E$umlaut$esc>\u00CB;"
|
||||
"I$grave$esc>\u00CC;"
|
||||
"I$acute$esc>\u00CD;"
|
||||
"I$hat$esc>\u00CE;"
|
||||
"I$umlaut$esc>\u00CF;"
|
||||
"'D-'$esc>\u00D0;"
|
||||
"N$tilde$esc>\u00D1;"
|
||||
"O$grave$esc>\u00D2;"
|
||||
"O$acute$esc>\u00D3;"
|
||||
"O$hat$esc>\u00D4;"
|
||||
"O$tilde$esc>\u00D5;"
|
||||
"O$umlaut$esc>\u00D6;"
|
||||
"O$slash$esc>\u00D8;"
|
||||
"U$grave$esc>\u00D9;"
|
||||
"U$acute$esc>\u00DA;"
|
||||
"U$hat$esc>\u00DB;"
|
||||
"U$umlaut$esc>\u00DC;"
|
||||
"Y$acute$esc>\u00DD;"
|
||||
"TH$esc>\u00DE;"
|
||||
"ss$esc>\u00DF;"
|
||||
"a$grave$esc>\u00E0;"
|
||||
"a$acute$esc>\u00E1;"
|
||||
"a$hat$esc>\u00E2;"
|
||||
"a$tilde$esc>\u00E3;"
|
||||
"a$umlaut$esc>\u00E4;"
|
||||
"a$ring$esc>\u00E5;"
|
||||
"ae$esc>\u00E6;"
|
||||
"c$cedilla$esc>\u00E7;"
|
||||
"c$esc>\u00A9;" // copyright - after c{cedilla}
|
||||
"e$grave$esc>\u00E8;"
|
||||
"e$acute$esc>\u00E9;"
|
||||
"e$hat$esc>\u00EA;"
|
||||
"e$umlaut$esc>\u00EB;"
|
||||
"i$grave$esc>\u00EC;"
|
||||
"i$acute$esc>\u00ED;"
|
||||
"i$hat$esc>\u00EE;"
|
||||
"i$umlaut$esc>\u00EF;"
|
||||
"'d-'$esc>\u00F0;"
|
||||
"n$tilde$esc>\u00F1;"
|
||||
"o$grave$esc>\u00F2;"
|
||||
"o$acute$esc>\u00F3;"
|
||||
"o$hat$esc>\u00F4;"
|
||||
"o$tilde$esc>\u00F5;"
|
||||
"o$umlaut$esc>\u00F6;"
|
||||
"o$slash$esc>\u00F8;"
|
||||
"o$esc>\u00B0;"
|
||||
"u$grave$esc>\u00F9;"
|
||||
"u$acute$esc>\u00FA;"
|
||||
"u$hat$esc>\u00FB;"
|
||||
"u$umlaut$esc>\u00FC;"
|
||||
"y$acute$esc>\u00FD;"
|
||||
"y$esc>\u00A5;" // yen sign
|
||||
"th$esc>\u00FE;"
|
||||
//masked: + "ss$esc>\u00FF;"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,240 +1,257 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_Latin_Arabic.java
|
||||
// Date: Fri May 19 15:50:22 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Arabic
|
||||
|
||||
larabic {
|
||||
Rule {
|
||||
// To Do: finish adding shadda, add sokoon
|
||||
|
||||
"alefmadda=\u0622;"
|
||||
"alefuhamza=\u0623;"
|
||||
"wauuhamza=\u0624;"
|
||||
"alefhamza=\u0625;"
|
||||
"yehuhamza=\u0626;"
|
||||
"alef=\u0627;"
|
||||
"beh=\u0628;"
|
||||
"tehmarbuta=\u0629;"
|
||||
"teh=\u062A;"
|
||||
"theh=\u062B;"
|
||||
"geem=\u062C;"
|
||||
"hah=\u062D;"
|
||||
"kha=\u062E;"
|
||||
"dal=\u062F;"
|
||||
"dhal=\u0630;"
|
||||
"reh=\u0631;"
|
||||
"zain=\u0632;"
|
||||
"seen=\u0633;"
|
||||
"sheen=\u0634;"
|
||||
"sad=\u0635;"
|
||||
"dad=\u0636;"
|
||||
"tah=\u0637;"
|
||||
"zah=\u0638;"
|
||||
"ein=\u0639;"
|
||||
"ghein=\u063A;"
|
||||
"feh=\u0641;"
|
||||
"qaaf=\u0642;"
|
||||
"kaf=\u0643;"
|
||||
"lam=\u0644;"
|
||||
"meem=\u0645;"
|
||||
"noon=\u0646;"
|
||||
"heh=\u0647;"
|
||||
"wau=\u0648;"
|
||||
"yehmaqsura=\u0649;"
|
||||
"yeh=\u064A;"
|
||||
"peh=\u06A4;"
|
||||
|
||||
"hamza=\u0621;"
|
||||
"fathatein=\u064B;"
|
||||
"dammatein=\u064C;"
|
||||
"kasratein=\u064D;"
|
||||
"fatha=\u064E;"
|
||||
"damma=\u064F;"
|
||||
"kasra=\u0650;"
|
||||
"shadda=\u0651;"
|
||||
"sokoon=\u0652;"
|
||||
|
||||
// convert English to Arabic
|
||||
"Arabic>"
|
||||
"\u062a\u062a\u0645\u062a\u0639' '"
|
||||
"\u0627\u0644\u0644\u063a\u0629' '"
|
||||
"\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629' '"
|
||||
"\u0628\u0628\u0646\u0638\u0645' '"
|
||||
"\u0643\u062a\u0627\u0628\u0628\u064a\u0629' '"
|
||||
"\u062c\u0645\u064a\u0644\u0629;"
|
||||
|
||||
"ai>{alefmadda};"
|
||||
"ae>{alefuhamza};"
|
||||
"ao>{alefhamza};"
|
||||
"aa>{alef};"
|
||||
"an>{fathatein};"
|
||||
"a>{fatha};"
|
||||
"b>{beh};"
|
||||
"c>{kaf};"
|
||||
"{dhal})dh>{shadda};"
|
||||
"dh>{dhal};"
|
||||
"{dad})dd>{shadda};"
|
||||
"dd>{dad};"
|
||||
"{dal})d>{shadda};"
|
||||
"d>{dal};"
|
||||
"e>{ein};"
|
||||
"f>{feh};"
|
||||
"gh>{ghein};"
|
||||
"g>{geem};"
|
||||
"hh>{hah};"
|
||||
"h>{heh};"
|
||||
"ii>{kasratein};"
|
||||
"i>{kasra};"
|
||||
"j>{geem};"
|
||||
"kh>{kha};"
|
||||
"k>{kaf};"
|
||||
"l>{lam};"
|
||||
"m>{meem};"
|
||||
"n>{noon};"
|
||||
"o>{hamza};"
|
||||
"p>{peh};"
|
||||
"q>{qaaf};"
|
||||
"r>{reh};"
|
||||
"sh>{sheen};"
|
||||
"ss>{sad};"
|
||||
"s>{seen};"
|
||||
"th>{theh};"
|
||||
"tm>{tehmarbuta};"
|
||||
"tt>{tah};"
|
||||
"t>{teh};"
|
||||
"uu>{dammatein};"
|
||||
"u>{damma};"
|
||||
"v>{beh};"
|
||||
"we>{wauuhamza};"
|
||||
"w>{wau};"
|
||||
"x>{kaf}{shadda}{seen};"
|
||||
"ye>{yehuhamza};"
|
||||
"ym>{yehmaqsura};"
|
||||
"y>{yeh};"
|
||||
"zz>{zah};"
|
||||
"z>{zain};"
|
||||
|
||||
"0>\u0660;"+ // Arabic digit 0
|
||||
"1>\u0661;"+ // Arabic digit 1
|
||||
"2>\u0662;"+ // Arabic digit 2
|
||||
"3>\u0663;"+ // Arabic digit 3
|
||||
"4>\u0664;"+ // Arabic digit 4
|
||||
"5>\u0665;"+ // Arabic digit 5
|
||||
"6>\u0666;"+ // Arabic digit 6
|
||||
"7>\u0667;"+ // Arabic digit 7
|
||||
"8>\u0668;"+ // Arabic digit 8
|
||||
"9>\u0669;"+ // Arabic digit 9
|
||||
"%>\u066A;"+ // Arabic %
|
||||
".>\u066B;"+ // Arabic decimal separator
|
||||
",>\u066C;"+ // Arabic thousands separator
|
||||
"*>\u066D;"+ // Arabic five-pointed star
|
||||
|
||||
"`0>0;"+ // Escaped forms of the above
|
||||
"`1>1;"
|
||||
"`2>2;"
|
||||
"`3>3;"
|
||||
"`4>4;"
|
||||
"`5>5;"
|
||||
"`6>6;"
|
||||
"`7>7;"
|
||||
"`8>8;"
|
||||
"`9>9;"
|
||||
"`%>%;"
|
||||
"`.>.;"
|
||||
"`,>,;"
|
||||
"`*>*;"
|
||||
"``>`;"
|
||||
|
||||
"''>;"
|
||||
|
||||
// now Arabic to English
|
||||
|
||||
"''ai<a){alefmadda};"
|
||||
"ai<{alefmadda};"
|
||||
"''ae<a){alefuhamza};"
|
||||
"ae<{alefuhamza};"
|
||||
"''ao<a){alefhamza};"
|
||||
"ao<{alefhamza};"
|
||||
"''aa<a){alef};"
|
||||
"aa<{alef};"
|
||||
"''an<a){fathatein};"
|
||||
"an<{fathatein};"
|
||||
"''a<a){fatha};"
|
||||
"a<{fatha};"
|
||||
"b<{beh};"
|
||||
"''dh<d){dhal};"
|
||||
"dh<{dhal};"
|
||||
"''dd<d){dad};"
|
||||
"dd<{dad};"
|
||||
"''d<d){dal};"
|
||||
"d<{dal};"
|
||||
"''e<a){ein};"
|
||||
"''e<w){ein};"
|
||||
"''e<y){ein};"
|
||||
"e<{ein};"
|
||||
"f<{feh};"
|
||||
"gh<{ghein};"
|
||||
"''hh<d){hah};"
|
||||
"''hh<t){hah};"
|
||||
"''hh<k){hah};"
|
||||
"''hh<s){hah};"
|
||||
"hh<{hah};"
|
||||
"''h<d){heh};"
|
||||
"''h<t){heh};"
|
||||
"''h<k){heh};"
|
||||
"''h<s){heh};"
|
||||
"h<{heh};"
|
||||
"''ii<i){kasratein};"
|
||||
"ii<{kasratein};"
|
||||
"''i<i){kasra};"
|
||||
"i<{kasra};"
|
||||
"j<{geem};"
|
||||
"kh<{kha};"
|
||||
"x<{kaf}{shadda}{seen};"
|
||||
"k<{kaf};"
|
||||
"l<{lam};"
|
||||
"''m<y){meem};"
|
||||
"''m<t){meem};"
|
||||
"m<{meem};"
|
||||
"n<{noon};"
|
||||
"''o<a){hamza};"
|
||||
"o<{hamza};"
|
||||
"p<{peh};"
|
||||
"q<{qaaf};"
|
||||
"r<{reh};"
|
||||
"sh<{sheen};"
|
||||
"''ss<s){sad};"
|
||||
"ss<{sad};"
|
||||
"''s<s){seen};"
|
||||
"s<{seen};"
|
||||
"th<{theh};"
|
||||
"tm<{tehmarbuta};"
|
||||
"''tt<t){tah};"
|
||||
"tt<{tah};"
|
||||
"''t<t){teh};"
|
||||
"t<{teh};"
|
||||
"''uu<u){dammatein};"
|
||||
"uu<{dammatein};"
|
||||
"''u<u){damma};"
|
||||
"u<{damma};"
|
||||
"we<{wauuhamza};"
|
||||
"w<{wau};"
|
||||
"ye<{yehuhamza};"
|
||||
"ym<{yehmaqsura};"
|
||||
"''y<y){yeh};"
|
||||
"y<{yeh};"
|
||||
"''zz<z){zah};"
|
||||
"zz<{zah};"
|
||||
"''z<z){zain};"
|
||||
"z<{zain};"
|
||||
|
||||
"dh<dh){shadda};"
|
||||
"dd<dd){shadda};"
|
||||
"''d<d){shadda};"
|
||||
}
|
||||
Rule {
|
||||
// To Do: finish adding shadda, add sokoon
|
||||
|
||||
"$alefmadda=\u0622;"
|
||||
"$alefuhamza=\u0623;"
|
||||
"$wauuhamza=\u0624;"
|
||||
"$alefhamza=\u0625;"
|
||||
"$yehuhamza=\u0626;"
|
||||
"$alef=\u0627;"
|
||||
"$beh=\u0628;"
|
||||
"$tehmarbuta=\u0629;"
|
||||
"$teh=\u062A;"
|
||||
"$theh=\u062B;"
|
||||
"$geem=\u062C;"
|
||||
"$hah=\u062D;"
|
||||
"$kha=\u062E;"
|
||||
"$dal=\u062F;"
|
||||
"$dhal=\u0630;"
|
||||
"$reh=\u0631;"
|
||||
"$zain=\u0632;"
|
||||
"$seen=\u0633;"
|
||||
"$sheen=\u0634;"
|
||||
"$sad=\u0635;"
|
||||
"$dad=\u0636;"
|
||||
"$tah=\u0637;"
|
||||
"$zah=\u0638;"
|
||||
"$ein=\u0639;"
|
||||
"$ghein=\u063A;"
|
||||
"$feh=\u0641;"
|
||||
"$qaaf=\u0642;"
|
||||
"$kaf=\u0643;"
|
||||
"$lam=\u0644;"
|
||||
"$meem=\u0645;"
|
||||
"$noon=\u0646;"
|
||||
"$heh=\u0647;"
|
||||
"$wau=\u0648;"
|
||||
"$yehmaqsura=\u0649;"
|
||||
"$yeh=\u064A;"
|
||||
"$peh=\u06A4;"
|
||||
|
||||
"$hamza=\u0621;"
|
||||
"$fathatein=\u064B;"
|
||||
"$dammatein=\u064C;"
|
||||
"$kasratein=\u064D;"
|
||||
"$fatha=\u064E;"
|
||||
"$damma=\u064F;"
|
||||
"$kasra=\u0650;"
|
||||
"$shadda=\u0651;"
|
||||
"$sokoon=\u0652;"
|
||||
|
||||
// handle doubles - liu
|
||||
"t'' < {$teh} [$teh$theh$tehmarbuta$tah];"
|
||||
"h'' < {$heh} [$heh$hah];"
|
||||
"s'' < {$seen} $sheen;"
|
||||
// handle a few pathological special cases to make round
|
||||
// trip work. - liu
|
||||
"d'~'d <> $dal $dal;"
|
||||
"dh'~'dh <> $dhal $dhal;"
|
||||
"dd'~'dd <> $dad $dad;"
|
||||
|
||||
// convert English to Arabic
|
||||
"Arabic>"
|
||||
"\u062a\u062a\u0645\u062a\u0639' '"
|
||||
"\u0627\u0644\u0644\u063a\u0629' '"
|
||||
"\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629' '"
|
||||
"\u0628\u0628\u0646\u0638\u0645' '"
|
||||
"\u0643\u062a\u0627\u0628\u0628\u064a\u0629' '"
|
||||
"\u062c\u0645\u064a\u0644\u0629;"
|
||||
|
||||
"ai>$alefmadda;"
|
||||
"ae>$alefuhamza;"
|
||||
"ao>$alefhamza;"
|
||||
"aa>$alef;"
|
||||
"an>$fathatein;"
|
||||
"a>$fatha;"
|
||||
"b>$beh;"
|
||||
"c>$kaf;"
|
||||
// To be symmetrical with shadda rules below, we want to
|
||||
// map $dad $shadda to dd'dd, etc. - liu
|
||||
"$dhal{dh>$shadda;"
|
||||
"dh>$dhal;"
|
||||
"$dad{dd>$shadda;"
|
||||
"dd>$dad;"
|
||||
"$dal{d>$shadda;"
|
||||
"d>$dal;"
|
||||
"e>$ein;"
|
||||
"f>$feh;"
|
||||
"gh>$ghein;"
|
||||
"g>$geem;"
|
||||
"hh>$hah;"
|
||||
"h>$heh;"
|
||||
"ii>$kasratein;"
|
||||
"i>$kasra;"
|
||||
"j>$geem;"
|
||||
"kh>$kha;"
|
||||
"k>$kaf;"
|
||||
"l>$lam;"
|
||||
"m>$meem;"
|
||||
"n>$noon;"
|
||||
"o>$hamza;"
|
||||
"p>$peh;"
|
||||
"q>$qaaf;"
|
||||
"r>$reh;"
|
||||
"sh>$sheen;"
|
||||
"ss>$sad;"
|
||||
"s>$seen;"
|
||||
"th>$theh;"
|
||||
"tm>$tehmarbuta;"
|
||||
"tt>$tah;"
|
||||
"t>$teh;"
|
||||
"uu>$dammatein;"
|
||||
"u>$damma;"
|
||||
"v>$beh;"
|
||||
"we>$wauuhamza;"
|
||||
"w>$wau;"
|
||||
"x>$kaf$shadda$seen;"
|
||||
"ye>$yehuhamza;"
|
||||
"ym>$yehmaqsura;"
|
||||
"y>$yeh;"
|
||||
"zz>$zah;"
|
||||
"z>$zain;"
|
||||
|
||||
"0>\u0660;" // Arabic digit 0
|
||||
"1>\u0661;" // Arabic digit 1
|
||||
"2>\u0662;" // Arabic digit 2
|
||||
"3>\u0663;" // Arabic digit 3
|
||||
"4>\u0664;" // Arabic digit 4
|
||||
"5>\u0665;" // Arabic digit 5
|
||||
"6>\u0666;" // Arabic digit 6
|
||||
"7>\u0667;" // Arabic digit 7
|
||||
"8>\u0668;" // Arabic digit 8
|
||||
"9>\u0669;" // Arabic digit 9
|
||||
"'%'>\u066A;" // Arabic %
|
||||
"'.'>\u066B;" // Arabic decimal separator
|
||||
"','>\u066C;" // Arabic thousands separator
|
||||
"'*'>\u066D;" // Arabic five-pointed star
|
||||
|
||||
"'`0'>0;" // Escaped forms of the above
|
||||
"'`1'>1;"
|
||||
"'`2'>2;"
|
||||
"'`3'>3;"
|
||||
"'`4'>4;"
|
||||
"'`5'>5;"
|
||||
"'`6'>6;"
|
||||
"'`7'>7;"
|
||||
"'`8'>8;"
|
||||
"'`9'>9;"
|
||||
"'`%'>'%';"
|
||||
"'`.'>'.';"
|
||||
"'`,'>',';"
|
||||
"'`*'>'*';"
|
||||
"'``'>'`';"
|
||||
|
||||
"''>;"
|
||||
|
||||
// now Arabic to English
|
||||
|
||||
"''ai<a{$alefmadda;"
|
||||
"ai<$alefmadda;"
|
||||
"''ae<a{$alefuhamza;"
|
||||
"ae<$alefuhamza;"
|
||||
"''ao<a{$alefhamza;"
|
||||
"ao<$alefhamza;"
|
||||
"''aa<a{$alef;"
|
||||
"aa<$alef;"
|
||||
"''an<a{$fathatein;"
|
||||
"an<$fathatein;"
|
||||
"''a<a{$fatha;"
|
||||
"a<$fatha;"
|
||||
"b<$beh;"
|
||||
"''dh<d{$dhal;"
|
||||
"dh<$dhal;"
|
||||
"''dd<d{$dad;"
|
||||
"dd<$dad;"
|
||||
"''d<d{$dal;"
|
||||
"d<$dal;"
|
||||
"''e<a{$ein;"
|
||||
"''e<w{$ein;"
|
||||
"''e<y{$ein;"
|
||||
"e<$ein;"
|
||||
"f<$feh;"
|
||||
"gh<$ghein;"
|
||||
"''hh<d{$hah;"
|
||||
"''hh<t{$hah;"
|
||||
"''hh<k{$hah;"
|
||||
"''hh<s{$hah;"
|
||||
"hh<$hah;"
|
||||
"''h<d{$heh;"
|
||||
"''h<t{$heh;"
|
||||
"''h<k{$heh;"
|
||||
"''h<s{$heh;"
|
||||
"h<$heh;"
|
||||
"''ii<i{$kasratein;"
|
||||
"ii<$kasratein;"
|
||||
"''i<i{$kasra;"
|
||||
"i<$kasra;"
|
||||
"j<$geem;"
|
||||
"kh<$kha;"
|
||||
"x<$kaf$shadda$seen;"
|
||||
"k<$kaf;"
|
||||
"l<$lam;"
|
||||
"''m<y{$meem;"
|
||||
"''m<t{$meem;"
|
||||
"m<$meem;"
|
||||
"n<$noon;"
|
||||
"''o<a{$hamza;"
|
||||
"o<$hamza;"
|
||||
"p<$peh;"
|
||||
"q<$qaaf;"
|
||||
"r<$reh;"
|
||||
"sh<$sheen;"
|
||||
"''ss<s{$sad;"
|
||||
"ss<$sad;"
|
||||
"''s<s{$seen;"
|
||||
"s<$seen;"
|
||||
"th<$theh;"
|
||||
"tm<$tehmarbuta;"
|
||||
"''tt<t{$tah;"
|
||||
"tt<$tah;"
|
||||
"''t<t{$teh;"
|
||||
"t<$teh;"
|
||||
"''uu<u{$dammatein;"
|
||||
"uu<$dammatein;"
|
||||
"''u<u{$damma;"
|
||||
"u<$damma;"
|
||||
"we<$wauuhamza;"
|
||||
"w<$wau;"
|
||||
"ye<$yehuhamza;"
|
||||
"ym<$yehmaqsura;"
|
||||
"''y<y{$yeh;"
|
||||
"y<$yeh;"
|
||||
"''zz<z{$zah;"
|
||||
"zz<$zah;"
|
||||
"''z<z{$zain;"
|
||||
"z<$zain;"
|
||||
|
||||
// The following three rules map x $shadda to x x, where
|
||||
// x is dh or dd. If x is d, d'd is output. Net effect
|
||||
// is to map something like $dad $shadda to dd'dd. - liu
|
||||
"dh<dh{$shadda;"
|
||||
"dd<dd{$shadda;"
|
||||
"''d<d{$shadda;"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,307 +1,312 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
// 12/10/99 aliu Fix case handling.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_Latin_Cyrillic.java
|
||||
// Date: Fri May 19 15:50:22 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Cyrillic
|
||||
|
||||
lcyril {
|
||||
Rule {
|
||||
//* /* This class is designed to be a general Latin-Cyrillic
|
||||
//* transliteration. The standard Russian transliterations
|
||||
//* are generally used for the letters from Russian,
|
||||
//* with additional Cyrillic characters given consistent
|
||||
//* mappings.
|
||||
//* */
|
||||
|
||||
"S-hacek=\u0160;"
|
||||
"s-hacek=\u0161;"
|
||||
|
||||
"YO=\u0401;"
|
||||
"J=\u0408;"
|
||||
"A=\u0410;"
|
||||
"B=\u0411;"
|
||||
"V=\u0412;"
|
||||
"G=\u0413;"
|
||||
"D=\u0414;"
|
||||
"YE=\u0415;"
|
||||
"ZH=\u0416;"
|
||||
"Z=\u0417;"
|
||||
"YI=\u0418;"
|
||||
"Y=\u0419;"
|
||||
"K=\u041A;"
|
||||
"L=\u041B;"
|
||||
"M=\u041C;"
|
||||
"N=\u041D;"
|
||||
"O=\u041E;"
|
||||
"P=\u041F;"
|
||||
"R=\u0420;"
|
||||
"S=\u0421;"
|
||||
"T=\u0422;"
|
||||
"U=\u0423;"
|
||||
"F=\u0424;"
|
||||
"KH=\u0425;"
|
||||
"TS=\u0426;"
|
||||
"CH=\u0427;"
|
||||
"SH=\u0428;"
|
||||
"SHCH=\u0429;"
|
||||
"HARD=\u042A;"
|
||||
"I=\u042B;"
|
||||
"SOFT=\u042C;"
|
||||
"E=\u042D;"
|
||||
"YU=\u042E;"
|
||||
"YA=\u042F;"
|
||||
|
||||
// Lowercase
|
||||
|
||||
"a=\u0430;"
|
||||
"b=\u0431;"
|
||||
"v=\u0432;"
|
||||
"g=\u0433;"
|
||||
"d=\u0434;"
|
||||
"ye=\u0435;"
|
||||
"zh=\u0436;"
|
||||
"z=\u0437;"
|
||||
"yi=\u0438;"
|
||||
"y=\u0439;"
|
||||
"k=\u043a;"
|
||||
"l=\u043b;"
|
||||
"m=\u043c;"
|
||||
"n=\u043d;"
|
||||
"o=\u043e;"
|
||||
"p=\u043f;"
|
||||
"r=\u0440;"
|
||||
"s=\u0441;"
|
||||
"t=\u0442;"
|
||||
"u=\u0443;"
|
||||
"f=\u0444;"
|
||||
"kh=\u0445;"
|
||||
"ts=\u0446;"
|
||||
"ch=\u0447;"
|
||||
"sh=\u0448;"
|
||||
"shch=\u0449;"
|
||||
"hard=\u044a;"
|
||||
"i=\u044b;"
|
||||
"soft=\u044c;"
|
||||
"e=\u044d;"
|
||||
"yu=\u044e;"
|
||||
"ya=\u044f;"
|
||||
|
||||
"yo=\u0451;"
|
||||
"j=\u0458;"
|
||||
|
||||
// variables
|
||||
// some are duplicated so lowercasing works
|
||||
|
||||
"csoft=[eiyEIY];"
|
||||
"CSOFT=[eiyEIY];"
|
||||
|
||||
"BECOMES_H=[{HARD}{hard}];"
|
||||
"becomes_h=[{HARD}{hard}];"
|
||||
|
||||
"BECOMES_S=[{S}{s}];"
|
||||
"becomes_s=[{S}{s}];"
|
||||
|
||||
"BECOMES_C=[{CH}{ch}];"
|
||||
"becomes_c=[{CH}{ch}];"
|
||||
|
||||
"BECOMES_VOWEL=[{A}{E}{I}{O}{U}{a}{e}{i}{o}{u}];"
|
||||
"becomes_vowel=[{A}{E}{I}{O}{U}{a}{e}{i}{o}{u}];"
|
||||
|
||||
"letter=[[:Lu:][:Ll:]];"
|
||||
"lower=[[:Ll:]];"
|
||||
|
||||
//* /*
|
||||
//* Modified to combine display transliterator and typing transliterator.
|
||||
//* The display mapping uses accents for the "soft" vowels.
|
||||
//* It does not, although it could, use characters like \u0161 instead of digraphs
|
||||
//* like sh.
|
||||
//* */
|
||||
|
||||
// #############################################
|
||||
// Special titlecase forms, not duplicated
|
||||
// #############################################
|
||||
|
||||
"Ch>{CH};" "Ch<{CH}({lower};"
|
||||
"Kh>{KH};" "Kh<{KH}({lower};"
|
||||
"Shch>{SHCH};" "Shch<{SHCH}({lower};"
|
||||
"Sh>{SH};" "Sh<{SH}({lower};"
|
||||
"Ts>{TS};" "Ts<{TS}({lower};"
|
||||
"Zh>{ZH};" "Zh<{ZH}({lower};"
|
||||
"Yi>{YI};" //+ "Yi<{YI}({lower};"
|
||||
"Ye>{YE};" //+ "Ye<{YE}({lower};"
|
||||
"Yo>{YO};" //+ "Yo<{YO}({lower};"
|
||||
"Yu>{YU};" //+ "Yu<{YU}({lower};"
|
||||
"Ya>{YA};" //+ "Ya<{YA}({lower};"
|
||||
|
||||
// #############################################
|
||||
// Rules to Duplicate
|
||||
// To get the lowercase versions, copy these and lowercase
|
||||
// #############################################
|
||||
|
||||
// variant spellings in English
|
||||
|
||||
"SHTCH>{SHCH};"
|
||||
"TCH>{CH};"
|
||||
"TH>{Z};"
|
||||
"Q>{K};"
|
||||
"WH>{V};"
|
||||
"W>{V};"
|
||||
"X>{K}{S};" //+ "X<{K}{S};"
|
||||
|
||||
// Separate letters that would otherwise join
|
||||
|
||||
"SH''<{SH}({BECOMES_C};"
|
||||
"T''<{T}({BECOMES_S};"
|
||||
|
||||
"K''<{K}({BECOMES_H};"
|
||||
"S''<{S}({BECOMES_H};"
|
||||
"T''<{T}({BECOMES_H};"
|
||||
"Z''<{Z}({BECOMES_H};"
|
||||
|
||||
"Y''<{Y}({BECOMES_VOWEL};"
|
||||
|
||||
// Main letters
|
||||
|
||||
"A<>{A};"
|
||||
"B<>{B};"
|
||||
"CH<>{CH};"
|
||||
"D<>{D};"
|
||||
"E<>{E};"
|
||||
"F<>{F};"
|
||||
"G<>{G};"
|
||||
"\u00cc<>{YI};"
|
||||
"I<>{I};"
|
||||
"KH<>{KH};"
|
||||
"K<>{K};"
|
||||
"L<>{L};"
|
||||
"M<>{M};"
|
||||
"N<>{N};"
|
||||
"O<>{O};"
|
||||
"P<>{P};"
|
||||
"R<>{R};"
|
||||
"SHCH<>{SHCH};"
|
||||
"SH>{SH};" //+ "SH<{SH};"
|
||||
"{S-hacek}<>{SH};"
|
||||
"S<>{S};"
|
||||
"TS<>{TS};"
|
||||
"T<>{T};"
|
||||
"U<>{U};"
|
||||
"V<>{V};"
|
||||
//\u00cc\u00c0\u00c8\u00d2\u00d9
|
||||
"YE>{YE};" //+ "YE<{YE};"
|
||||
"\u00c8<>{YE};"
|
||||
"YO>{YO};" //+ "YO<{YO};"
|
||||
"\u00d2<>{YO};"
|
||||
"YU>{YU};" //+ "YU<{YU};"
|
||||
"\u00d9<>{YU};"
|
||||
"YA>{YA};" //+ "YA<{YA};"
|
||||
"\u00c0<>{YA};"
|
||||
"Y<>{Y};"
|
||||
"ZH<>{ZH};"
|
||||
"Z<>{Z};"
|
||||
|
||||
"H<>{HARD};"
|
||||
"\u0178<>{SOFT};"
|
||||
|
||||
// Non-russian
|
||||
|
||||
"J<>{J};"
|
||||
|
||||
// variant spellings in English
|
||||
|
||||
"C({csoft}>{S};"
|
||||
"C>{K};"
|
||||
|
||||
// #############################################
|
||||
// Duplicated Rules
|
||||
// Copy and lowercase the above rules
|
||||
// #############################################
|
||||
|
||||
// variant spellings in english
|
||||
|
||||
"shtch>{shch};"
|
||||
"tch>{ch};"
|
||||
"th>{z};"
|
||||
"q>{k};"
|
||||
"wh>{v};"
|
||||
"w>{v};"
|
||||
"x>{k}{s};" //+ "x<{k}{s};"
|
||||
|
||||
// separate letters that would otherwise join
|
||||
|
||||
"sh''<{sh}({becomes_c};"
|
||||
"t''<{t}({becomes_s};"
|
||||
|
||||
"k''<{k}({becomes_h};"
|
||||
"s''<{s}({becomes_h};"
|
||||
"t''<{t}({becomes_h};"
|
||||
"z''<{z}({becomes_h};"
|
||||
|
||||
"y''<{y}({becomes_vowel};"
|
||||
|
||||
// main letters
|
||||
|
||||
"a<>{a};"
|
||||
"b<>{b};"
|
||||
"ch<>{ch};"
|
||||
"d<>{d};"
|
||||
"e<>{e};"
|
||||
"f<>{f};"
|
||||
"g<>{g};"
|
||||
"\u00ec<>{yi};"
|
||||
"i<>{i};"
|
||||
"kh<>{kh};"
|
||||
"k<>{k};"
|
||||
"l<>{l};"
|
||||
"m<>{m};"
|
||||
"n<>{n};"
|
||||
"o<>{o};"
|
||||
"p<>{p};"
|
||||
"r<>{r};"
|
||||
"shch<>{shch};"
|
||||
"sh>{sh};" //+ "sh<{sh};"
|
||||
"{s-hacek}<>{sh};"
|
||||
"s<>{s};"
|
||||
"ts<>{ts};"
|
||||
"t<>{t};"
|
||||
"u<>{u};"
|
||||
"v<>{v};"
|
||||
//\u00ec\u00e0\u00e8\u00f2\u00f9
|
||||
"ye>{ye};" //+ "ye<{ye};"
|
||||
"\u00e8<>{ye};"
|
||||
"yo>{yo};" //+ "yo<{yo};"
|
||||
"\u00f2<>{yo};"
|
||||
"yu>{yu};" //+ "yu<{yu};"
|
||||
"\u00f9<>{yu};"
|
||||
"ya>{ya};" //+ "ya<{ya};"
|
||||
"\u00e0<>{ya};"
|
||||
"y<>{y};"
|
||||
"zh<>{zh};"
|
||||
"z<>{z};"
|
||||
|
||||
"h<>{hard};"
|
||||
"\u00ff<>{soft};"
|
||||
|
||||
// non-russian
|
||||
|
||||
"j<>{j};"
|
||||
|
||||
// variant spellings in english
|
||||
|
||||
"c({csoft}>{s};"
|
||||
"c>{k};"
|
||||
|
||||
|
||||
|
||||
// #############################################
|
||||
// End of Duplicated Rules
|
||||
// #############################################
|
||||
|
||||
//generally the last rule
|
||||
"''>;"
|
||||
//the end
|
||||
}
|
||||
Rule {
|
||||
|
||||
// This class is designed to be a general Latin-Cyrillic
|
||||
// transliteration. The standard Russian transliterations
|
||||
// are generally used for the letters from Russian,
|
||||
// with additional Cyrillic characters given consistent
|
||||
// mappings.
|
||||
|
||||
"$S_hacek=\u0160;"
|
||||
"$s_hacek=\u0161;"
|
||||
|
||||
"$YO=\u0401;"
|
||||
"$J=\u0408;"
|
||||
"$A=\u0410;"
|
||||
"$B=\u0411;"
|
||||
"$V=\u0412;"
|
||||
"$G=\u0413;"
|
||||
"$D=\u0414;"
|
||||
"$YE=\u0415;"
|
||||
"$ZH=\u0416;"
|
||||
"$Z=\u0417;"
|
||||
"$YI=\u0418;"
|
||||
"$Y=\u0419;"
|
||||
"$K=\u041A;"
|
||||
"$L=\u041B;"
|
||||
"$M=\u041C;"
|
||||
"$N=\u041D;"
|
||||
"$O=\u041E;"
|
||||
"$P=\u041F;"
|
||||
"$R=\u0420;"
|
||||
"$S=\u0421;"
|
||||
"$T=\u0422;"
|
||||
"$U=\u0423;"
|
||||
"$F=\u0424;"
|
||||
"$KH=\u0425;"
|
||||
"$TS=\u0426;"
|
||||
"$CH=\u0427;"
|
||||
"$SH=\u0428;"
|
||||
"$SHCH=\u0429;"
|
||||
"$HARD=\u042A;"
|
||||
"$I=\u042B;"
|
||||
"$SOFT=\u042C;"
|
||||
"$E=\u042D;"
|
||||
"$YU=\u042E;"
|
||||
"$YA=\u042F;"
|
||||
|
||||
// Lowercase
|
||||
|
||||
"$a=\u0430;"
|
||||
"$b=\u0431;"
|
||||
"$v=\u0432;"
|
||||
"$g=\u0433;"
|
||||
"$d=\u0434;"
|
||||
"$ye=\u0435;"
|
||||
"$zh=\u0436;"
|
||||
"$z=\u0437;"
|
||||
"$yi=\u0438;"
|
||||
"$y=\u0439;"
|
||||
"$k=\u043a;"
|
||||
"$l=\u043b;"
|
||||
"$m=\u043c;"
|
||||
"$n=\u043d;"
|
||||
"$o=\u043e;"
|
||||
"$p=\u043f;"
|
||||
"$r=\u0440;"
|
||||
"$s=\u0441;"
|
||||
"$t=\u0442;"
|
||||
"$u=\u0443;"
|
||||
"$f=\u0444;"
|
||||
"$kh=\u0445;"
|
||||
"$ts=\u0446;"
|
||||
"$ch=\u0447;"
|
||||
"$sh=\u0448;"
|
||||
"$shch=\u0449;"
|
||||
"$hard=\u044a;"
|
||||
"$i=\u044b;"
|
||||
"$soft=\u044c;"
|
||||
"$e=\u044d;"
|
||||
"$yu=\u044e;"
|
||||
"$ya=\u044f;"
|
||||
|
||||
"$yo=\u0451;"
|
||||
"$j=\u0458;"
|
||||
|
||||
// variables
|
||||
// some are duplicated so lowercasing works
|
||||
|
||||
"$csoft=[eiyEIY];"
|
||||
"$CSOFT=[eiyEIY];"
|
||||
|
||||
"$BECOMES_H=[$HARD$hard];"
|
||||
"$becomes_h=[$HARD$hard];"
|
||||
|
||||
"$BECOMES_S=[$S$s];"
|
||||
"$becomes_s=[$S$s];"
|
||||
|
||||
"$BECOMES_C=[$CH$ch];"
|
||||
"$becomes_c=[$CH$ch];"
|
||||
|
||||
"$BECOMES_VOWEL=[$A$E$I$O$U$a$e$i$o$u];"
|
||||
"$becomes_vowel=[$A$E$I$O$U$a$e$i$o$u];"
|
||||
|
||||
"$letter=[[:Lu:][:Ll:]];"
|
||||
"$lower=[[:Ll:]];"
|
||||
|
||||
// Modified to combine display transliterator and typing transliterator.
|
||||
// The display mapping uses accents for the "soft" vowels.
|
||||
// It does not, although it could, use characters like \u0161 instead of digraphs
|
||||
// like sh.
|
||||
|
||||
// #############################################
|
||||
// Special titlecase forms, not duplicated
|
||||
// #############################################
|
||||
|
||||
"Sh''ch<>$SH$ch;" // LIU Distinguish $SH$ch from $SHCH
|
||||
|
||||
"Ch>$CH;" "Ch<$CH}$lower;"
|
||||
"Kh>$KH;" "Kh<$KH}$lower;"
|
||||
"Shch>$SHCH;" "Shch<$SHCH}$lower;"
|
||||
"Sh>$SH;" "Sh<$SH}$lower;"
|
||||
"Ts>$TS;" "Ts<$TS}$lower;"
|
||||
"Zh>$ZH;" "Zh<$ZH}$lower;"
|
||||
"Yi>$YI;" //+ "Yi<$YI}$lower;"
|
||||
"Ye>$YE;" //+ "Ye<$YE}$lower;"
|
||||
"Yo>$YO;" //+ "Yo<$YO}$lower;"
|
||||
"Yu>$YU;" //+ "Yu<$YU}$lower;"
|
||||
"Ya>$YA;" //+ "Ya<$YA}$lower;"
|
||||
|
||||
// #############################################
|
||||
// Rules to Duplicate
|
||||
// To get the lowercase versions, copy these and lowercase
|
||||
// #############################################
|
||||
|
||||
// variant spellings in English
|
||||
|
||||
"SHTCH>$SHCH;"
|
||||
"TCH>$CH;"
|
||||
"TH>$Z;"
|
||||
"Q>$K;"
|
||||
"WH>$V;"
|
||||
"W>$V;"
|
||||
"X>$K$S;" //+ "X<$K$S;"
|
||||
|
||||
// Separate letters that would otherwise join
|
||||
|
||||
"SH''<$SH}$BECOMES_C;"
|
||||
"T''<$T}$BECOMES_S;"
|
||||
"T''<$T}[$CH$SHCH$shch];" // LIU add special cases
|
||||
|
||||
"K''<$K}$BECOMES_H;"
|
||||
"S''<$S}$BECOMES_H;"
|
||||
"T''<$T}$BECOMES_H;"
|
||||
"Z''<$Z}$BECOMES_H;"
|
||||
|
||||
"Y''<$Y}$BECOMES_VOWEL;"
|
||||
|
||||
// Main letters
|
||||
|
||||
"A<>$A;"
|
||||
"B<>$B;"
|
||||
"CH<>$CH;"
|
||||
"D<>$D;"
|
||||
"E<>$E;"
|
||||
"F<>$F;"
|
||||
"G<>$G;"
|
||||
"\u00cc<>$YI;"
|
||||
"I<>$I;"
|
||||
"KH<>$KH;"
|
||||
"K<>$K;"
|
||||
"L<>$L;"
|
||||
"M<>$M;"
|
||||
"N<>$N;"
|
||||
"O<>$O;"
|
||||
"P<>$P;"
|
||||
"R<>$R;"
|
||||
"SHCH<>$SHCH;"
|
||||
"SH>$SH;" //+ "SH<$SH;"
|
||||
"$S_hacek<>$SH;"
|
||||
"S<>$S;"
|
||||
"TS<>$TS;"
|
||||
"T<>$T;"
|
||||
"U<>$U;"
|
||||
"V<>$V;"
|
||||
//\u00cc\u00c0\u00c8\u00d2\u00d9
|
||||
"YE>$YE;" //+ "YE<$YE;"
|
||||
"\u00c8<>$YE;"
|
||||
"YO>$YO;" //+ "YO<$YO;"
|
||||
"\u00d2<>$YO;"
|
||||
"YU>$YU;" //+ "YU<$YU;"
|
||||
"\u00d9<>$YU;"
|
||||
"YA>$YA;" //+ "YA<$YA;"
|
||||
"\u00c0<>$YA;"
|
||||
"Y<>$Y;"
|
||||
"ZH<>$ZH;"
|
||||
"Z<>$Z;"
|
||||
|
||||
"H<>$HARD;"
|
||||
"\u0178<>$SOFT;"
|
||||
|
||||
// Non-russian
|
||||
|
||||
"J<>$J;"
|
||||
|
||||
// variant spellings in English
|
||||
|
||||
"C}$csoft>$S;"
|
||||
"C>$K;"
|
||||
|
||||
// #############################################
|
||||
// Duplicated Rules
|
||||
// Copy and lowercase the above rules
|
||||
// #############################################
|
||||
|
||||
// variant spellings in english
|
||||
|
||||
"shtch>$shch;"
|
||||
"tch>$ch;"
|
||||
"th>$z;"
|
||||
"q>$k;"
|
||||
"wh>$v;"
|
||||
"w>$v;"
|
||||
"x>$k$s;" //+ "x<$k$s;"
|
||||
|
||||
// separate letters that would otherwise join
|
||||
|
||||
"sh''<$sh}$becomes_c;"
|
||||
"t''<$t}$becomes_s;"
|
||||
"t''<$t}[$ch$shch];" // LIU add special cases
|
||||
|
||||
"k''<$k}$becomes_h;"
|
||||
"s''<$s}$becomes_h;"
|
||||
"t''<$t}$becomes_h;"
|
||||
"z''<$z}$becomes_h;"
|
||||
|
||||
"y''<$y}$becomes_vowel;"
|
||||
|
||||
// main letters
|
||||
|
||||
"a<>$a;"
|
||||
"b<>$b;"
|
||||
"ch<>$ch;"
|
||||
"d<>$d;"
|
||||
"e<>$e;"
|
||||
"f<>$f;"
|
||||
"g<>$g;"
|
||||
"\u00ec<>$yi;"
|
||||
"i<>$i;"
|
||||
"kh<>$kh;"
|
||||
"k<>$k;"
|
||||
"l<>$l;"
|
||||
"m<>$m;"
|
||||
"n<>$n;"
|
||||
"o<>$o;"
|
||||
"p<>$p;"
|
||||
"r<>$r;"
|
||||
"shch<>$shch;"
|
||||
"sh>$sh;" //+ "sh<$sh;"
|
||||
"$s_hacek<>$sh;"
|
||||
"s<>$s;"
|
||||
"ts<>$ts;"
|
||||
"t<>$t;"
|
||||
"u<>$u;"
|
||||
"v<>$v;"
|
||||
//\u00ec\u00e0\u00e8\u00f2\u00f9
|
||||
"ye>$ye;" //+ "ye<$ye;"
|
||||
"\u00e8<>$ye;"
|
||||
"yo>$yo;" //+ "yo<$yo;"
|
||||
"\u00f2<>$yo;"
|
||||
"yu>$yu;" //+ "yu<$yu;"
|
||||
"\u00f9<>$yu;"
|
||||
"ya>$ya;" //+ "ya<$ya;"
|
||||
"\u00e0<>$ya;"
|
||||
"y<>$y;"
|
||||
"zh<>$zh;"
|
||||
"z<>$z;"
|
||||
|
||||
"h<>$hard;"
|
||||
"\u00ff<>$soft;"
|
||||
|
||||
// non-russian
|
||||
|
||||
"j<>$j;"
|
||||
|
||||
// variant spellings in english
|
||||
|
||||
"c}$csoft>$s;"
|
||||
"c>$k;"
|
||||
|
||||
|
||||
|
||||
// #############################################
|
||||
// End of Duplicated Rules
|
||||
// #############################################
|
||||
|
||||
//generally the last rule
|
||||
"''>;"
|
||||
//the end
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,411 +1,411 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_Latin_Devanagari.java
|
||||
// Date: Fri May 19 15:50:22 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Devanagari
|
||||
|
||||
ldevan {
|
||||
Rule {
|
||||
//#####################################################################
|
||||
// Keyboard Transliteration Table
|
||||
//#####################################################################
|
||||
// Conversions should be:
|
||||
// 1. complete
|
||||
// * convert every sequence of Latin letters (a to z plus apostrophe)
|
||||
// to a sequence of Native letters
|
||||
// * convert every sequence of Native letters to Latin letters
|
||||
// 2. reversible
|
||||
// * any string of Native converted to Latin and back should be the same
|
||||
// * this is not true for English converted to Native & back, e.g.:
|
||||
// k -> {kaf} -> k
|
||||
// c -> {kaf} -> k
|
||||
//#####################################################################
|
||||
// Sequences of Latin letters may convert to a single Native letter.
|
||||
// When this is the case, an apostrophe can be used to indicate separate
|
||||
// letters.
|
||||
// E.g. sh -> {shin}
|
||||
// s'h -> {sin}{heh}
|
||||
// ss -> {sad}
|
||||
// s's -> {sin}{shadda}
|
||||
//#####################################################################
|
||||
// To Do:
|
||||
// finish adding shadda, add sokoon, fix uppercase
|
||||
// make two transliteration tables: one with vowels, one without
|
||||
//#####################################################################
|
||||
// Modifications
|
||||
// Devanagari Transliterator: broken up with consonants/vowels
|
||||
//#####################################################################
|
||||
// Unicode character name definitions
|
||||
//#####################################################################
|
||||
|
||||
//consonants
|
||||
"candrabindu=\u0901;"
|
||||
"bindu=\u0902;"
|
||||
"visarga=\u0903;"
|
||||
|
||||
// w<vowel> represents the stand-alone form
|
||||
"wa=\u0905;"
|
||||
"waa=\u0906;"
|
||||
"wi=\u0907;"
|
||||
"wii=\u0908;"
|
||||
"wu=\u0909;"
|
||||
"wuu=\u090A;"
|
||||
"wr=\u090B;"
|
||||
"wl=\u090C;"
|
||||
"we=\u090F;"
|
||||
"wai=\u0910;"
|
||||
"wo=\u0913;"
|
||||
"wau=\u0914;"
|
||||
|
||||
"ka=\u0915;"
|
||||
"kha=\u0916;"
|
||||
"ga=\u0917;"
|
||||
"gha=\u0918;"
|
||||
"nga=\u0919;"
|
||||
|
||||
"ca=\u091A;"
|
||||
"cha=\u091B;"
|
||||
"ja=\u091C;"
|
||||
"jha=\u091D;"
|
||||
"nya=\u091E;"
|
||||
|
||||
"tta=\u091F;"
|
||||
"ttha=\u0920;"
|
||||
"dda=\u0921;"
|
||||
"ddha=\u0922;"
|
||||
"nna=\u0923;"
|
||||
|
||||
"ta=\u0924;"
|
||||
"tha=\u0925;"
|
||||
"da=\u0926;"
|
||||
"dha=\u0927;"
|
||||
"na=\u0928;"
|
||||
|
||||
"pa=\u092A;"
|
||||
"pha=\u092B;"
|
||||
"ba=\u092C;"
|
||||
"bha=\u092D;"
|
||||
"ma=\u092E;"
|
||||
|
||||
"ya=\u092F;"
|
||||
"ra=\u0930;"
|
||||
"rra=\u0931;"
|
||||
"la=\u0933;"
|
||||
"va=\u0935;"
|
||||
|
||||
"sha=\u0936;"
|
||||
"ssa=\u0937;"
|
||||
"sa=\u0938;"
|
||||
"ha=\u0939;"
|
||||
|
||||
// <vowel> represents the dependent form
|
||||
"aa=\u093E;"
|
||||
"i=\u093F;"
|
||||
"ii=\u0940;"
|
||||
"u=\u0941;"
|
||||
"uu=\u0942;"
|
||||
"rh=\u0943;"
|
||||
"lh=\u0944;"
|
||||
"e=\u0947;"
|
||||
"ai=\u0948;"
|
||||
"o=\u094B;"
|
||||
"au=\u094C;"
|
||||
|
||||
"virama=\u094D;"
|
||||
|
||||
"wrr=\u0960;"
|
||||
"rrh=\u0962;"
|
||||
|
||||
"danda=\u0964;"
|
||||
"doubleDanda=\u0965;"
|
||||
"depVowelAbove=[\u093E-\u0940\u0945-\u094C];"
|
||||
"depVowelBelow=[\u0941-\u0944];"
|
||||
// Ech: Double escape U+0000, so UnicodeString doesn't consider it
|
||||
// to be the end of the string. This is only necessary for U+0000
|
||||
// right now. [liu]
|
||||
"endThing=[{danda}{doubleDanda}\\u0000-\u08FF\u0980-\uFFFF];"
|
||||
|
||||
"&=[{virama}{aa}{ai}{au}{ii}{i}{uu}{u}{rrh}{rh}{lh}{e}{o}];"
|
||||
"%=[bcdfghjklmnpqrstvwxyz];"
|
||||
|
||||
//#####################################################################
|
||||
// convert from Latin letters to Native letters
|
||||
//#####################################################################
|
||||
//Hindi>\u092d\u093e\u0930\u0924--\u0020\u0926\u0947\u0936\u0020\u092c\u0928\u094d\u0927\u0941\u002e
|
||||
|
||||
// special forms with no good conversion
|
||||
|
||||
"mm>{bindu};"
|
||||
"x>{visarga};"
|
||||
|
||||
// convert to independent forms at start of word or syllable:
|
||||
// e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
|
||||
// Moved up [LIU]
|
||||
|
||||
"aa>{waa};"
|
||||
"ai>{wai};"
|
||||
"au>{wau};"
|
||||
"ii>{wii};"
|
||||
"i>{wi};"
|
||||
"uu>{wuu};"
|
||||
"u>{wu};"
|
||||
"rrh>{wrr};"
|
||||
"rh>{wr};"
|
||||
"lh>{wl};"
|
||||
"e>{we};"
|
||||
"o>{wo};"
|
||||
"a>{wa};"
|
||||
|
||||
// normal consonants
|
||||
|
||||
"kh>{kha}|{virama};"
|
||||
"k>{ka}|{virama};"
|
||||
"q>{ka}|{virama};"
|
||||
"gh>{gha}|{virama};"
|
||||
"g>{ga}|{virama};"
|
||||
"ng>{nga}|{virama};"
|
||||
"ch>{cha}|{virama};"
|
||||
"c>{ca}|{virama};"
|
||||
"jh>{jha}|{virama};"
|
||||
"j>{ja}|{virama};"
|
||||
"ny>{nya}|{virama};"
|
||||
"tth>{ttha}|{virama};"
|
||||
"tt>{tta}|{virama};"
|
||||
"ddh>{ddha}|{virama};"
|
||||
"dd>{dda}|{virama};"
|
||||
"nn>{nna}|{virama};"
|
||||
"th>{tha}|{virama};"
|
||||
"t>{ta}|{virama};"
|
||||
"dh>{dha}|{virama};"
|
||||
"d>{da}|{virama};"
|
||||
"n>{na}|{virama};"
|
||||
"ph>{pha}|{virama};"
|
||||
"p>{pa}|{virama};"
|
||||
"bh>{bha}|{virama};"
|
||||
"b>{ba}|{virama};"
|
||||
"m>{ma}|{virama};"
|
||||
"y>{ya}|{virama};"
|
||||
"r>{ra}|{virama};"
|
||||
"l>{la}|{virama};"
|
||||
"v>{va}|{virama};"
|
||||
"f>{va}|{virama};"
|
||||
"w>{va}|{virama};"
|
||||
"sh>{sha}|{virama};"
|
||||
"ss>{ssa}|{virama};"
|
||||
"s>{sa}|{virama};"
|
||||
"z>{sa}|{virama};"
|
||||
"h>{ha}|{virama};"
|
||||
|
||||
".>{danda};"
|
||||
"{danda}.>{doubleDanda};"
|
||||
"{depVowelAbove})~>{bindu};"
|
||||
"{depVowelBelow})~>{candrabindu};"
|
||||
|
||||
// convert to dependent forms after consonant with no vowel:
|
||||
// e.g. kai -> {ka}{virama}ai -> {ka}{ai}
|
||||
|
||||
"{virama}aa>{aa};"
|
||||
"{virama}ai>{ai};"
|
||||
"{virama}au>{au};"
|
||||
"{virama}ii>{ii};"
|
||||
"{virama}i>{i};"
|
||||
"{virama}uu>{uu};"
|
||||
"{virama}u>{u};"
|
||||
"{virama}rrh>{rrh};"
|
||||
"{virama}rh>{rh};"
|
||||
"{virama}lh>{lh};"
|
||||
"{virama}e>{e};"
|
||||
"{virama}o>{o};"
|
||||
"{virama}a>;"
|
||||
|
||||
// otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
|
||||
|
||||
"{virama}''aa>{waa};"
|
||||
"{virama}''ai>{wai};"
|
||||
"{virama}''au>{wau};"
|
||||
"{virama}''ii>{wii};"
|
||||
"{virama}''i>{wi};"
|
||||
"{virama}''uu>{wuu};"
|
||||
"{virama}''u>{wu};"
|
||||
"{virama}''rrh>{wrr};"
|
||||
"{virama}''rh>{wr};"
|
||||
"{virama}''lh>{wl};"
|
||||
"{virama}''e>{we};"
|
||||
"{virama}''o>{wo};"
|
||||
"{virama}''a>{wa};"
|
||||
|
||||
"{virama}({endThing}>;"
|
||||
|
||||
// convert any left-over apostrophes used for separation
|
||||
|
||||
"''>;"
|
||||
|
||||
//#####################################################################
|
||||
// convert from Native letters to Latin letters
|
||||
//#####################################################################
|
||||
|
||||
// special forms with no good conversion
|
||||
|
||||
"mm<{bindu};"
|
||||
"x<{visarga};"
|
||||
|
||||
// normal consonants
|
||||
|
||||
"kh<{kha}(&;"
|
||||
"kha<{kha};"
|
||||
"k''<{ka}{virama}({ha};"
|
||||
"k<{ka}(&;"
|
||||
"ka<{ka};"
|
||||
"gh<{gha}(&;"
|
||||
"gha<{gha};"
|
||||
"g''<{ga}{virama}({ha};"
|
||||
"g<{ga}(&;"
|
||||
"ga<{ga};"
|
||||
"ng<{nga}(&;"
|
||||
"nga<{nga};"
|
||||
"ch<{cha}(&;"
|
||||
"cha<{cha};"
|
||||
"c''<{ca}{virama}({ha};"
|
||||
"c<{ca}(&;"
|
||||
"ca<{ca};"
|
||||
"jh<{jha}(&;"
|
||||
"jha<{jha};"
|
||||
"j''<{ja}{virama}({ha};"
|
||||
"j<{ja}(&;"
|
||||
"ja<{ja};"
|
||||
"ny<{nya}(&;"
|
||||
"nya<{nya};"
|
||||
"tth<{ttha}(&;"
|
||||
"ttha<{ttha};"
|
||||
"tt''<{tta}{virama}({ha};"
|
||||
"tt<{tta}(&;"
|
||||
"tta<{tta};"
|
||||
"ddh<{ddha}(&;"
|
||||
"ddha<{ddha};"
|
||||
"dd''<{dda}(&{ha};"
|
||||
"dd<{dda}(&;"
|
||||
"dda<{dda};"
|
||||
"dh<{dha}(&;"
|
||||
"dha<{dha};"
|
||||
"d''<{da}{virama}({ha};"
|
||||
"d''<{da}{virama}({ddha};"
|
||||
"d''<{da}{virama}({dda};"
|
||||
"d''<{da}{virama}({dha};"
|
||||
"d''<{da}{virama}({da};"
|
||||
"d<{da}(&;"
|
||||
"da<{da};"
|
||||
"th<{tha}(&;"
|
||||
"tha<{tha};"
|
||||
"t''<{ta}{virama}({ha};"
|
||||
"t''<{ta}{virama}({ttha};"
|
||||
"t''<{ta}{virama}({tta};"
|
||||
"t''<{ta}{virama}({tha};"
|
||||
"t''<{ta}{virama}({ta};"
|
||||
"t<{ta}(&;"
|
||||
"ta<{ta};"
|
||||
"n''<{na}{virama}({ga};"
|
||||
"n''<{na}{virama}({ya};"
|
||||
"n<{na}(&;"
|
||||
"na<{na};"
|
||||
"ph<{pha}(&;"
|
||||
"pha<{pha};"
|
||||
"p''<{pa}{virama}({ha};"
|
||||
"p<{pa}(&;"
|
||||
"pa<{pa};"
|
||||
"bh<{bha}(&;"
|
||||
"bha<{bha};"
|
||||
"b''<{ba}{virama}({ha};"
|
||||
"b<{ba}(&;"
|
||||
"ba<{ba};"
|
||||
"m''<{ma}{virama}({ma};"
|
||||
"m''<{ma}{virama}({bindu};"
|
||||
"m<{ma}(&;"
|
||||
"ma<{ma};"
|
||||
"y<{ya}(&;"
|
||||
"ya<{ya};"
|
||||
"r''<{ra}{virama}({ha};"
|
||||
"r<{ra}(&;"
|
||||
"ra<{ra};"
|
||||
"l''<{la}{virama}({ha};"
|
||||
"l<{la}(&;"
|
||||
"la<{la};"
|
||||
"v<{va}(&;"
|
||||
"va<{va};"
|
||||
"sh<{sha}(&;"
|
||||
"sha<{sha};"
|
||||
"ss<{ssa}(&;"
|
||||
"ssa<{ssa};"
|
||||
"s''<{sa}{virama}({ha};"
|
||||
"s''<{sa}{virama}({sha};"
|
||||
"s''<{sa}{virama}({ssa};"
|
||||
"s''<{sa}{virama}({sa};"
|
||||
"s<{sa}(&;"
|
||||
"sa<{sa};"
|
||||
"h<{ha}(&;"
|
||||
"ha<{ha};"
|
||||
|
||||
// dependent vowels (should never occur except following consonants)
|
||||
|
||||
"aa<{aa};"
|
||||
"ai<{ai};"
|
||||
"au<{au};"
|
||||
"ii<{ii};"
|
||||
"i<{i};"
|
||||
"uu<{uu};"
|
||||
"u<{u};"
|
||||
"rrh<{rrh};"
|
||||
"rh<{rh};"
|
||||
"lh<{lh};"
|
||||
"e<{e};"
|
||||
"o<{o};"
|
||||
|
||||
// independent vowels (when following consonants)
|
||||
|
||||
"''aa<a){waa};"
|
||||
"''aa<%){waa};"
|
||||
"''ai<a){wai};"
|
||||
"''ai<%){wai};"
|
||||
"''au<a){wau};"
|
||||
"''au<%){wau};"
|
||||
"''ii<a){wii};"
|
||||
"''ii<%){wii};"
|
||||
"''i<a){wi};"
|
||||
"''i<%){wi};"
|
||||
"''uu<a){wuu};"
|
||||
"''uu<%){wuu};"
|
||||
"''u<a){wu};"
|
||||
"''u<%){wu};"
|
||||
"''rrh<%){wrr};"
|
||||
"''rh<%){wr};"
|
||||
"''lh<%){wl};"
|
||||
"''e<%){we};"
|
||||
"''o<%){wo};"
|
||||
"''a<a){wa};"
|
||||
"''a<%){wa};"
|
||||
|
||||
|
||||
// independent vowels (otherwise)
|
||||
|
||||
"aa<{waa};"
|
||||
"ai<{wai};"
|
||||
"au<{wau};"
|
||||
"ii<{wii};"
|
||||
"i<{wi};"
|
||||
"uu<{wuu};"
|
||||
"u<{wu};"
|
||||
"rrh<{wrr};"
|
||||
"rh<{wr};"
|
||||
"lh<{wl};"
|
||||
"e<{we};"
|
||||
"o<{wo};"
|
||||
"a<{wa};"
|
||||
|
||||
// blow away any remaining viramas
|
||||
|
||||
"<{virama};"
|
||||
}
|
||||
Rule {
|
||||
//#####################################################################
|
||||
// Keyboard Transliteration Table
|
||||
//#####################################################################
|
||||
// Conversions should be:
|
||||
// 1. complete
|
||||
// * convert every sequence of Latin letters (a to z plus apostrophe)
|
||||
// to a sequence of Native letters
|
||||
// * convert every sequence of Native letters to Latin letters
|
||||
// 2. reversible
|
||||
// * any string of Native converted to Latin and back should be the same
|
||||
// * this is not true for English converted to Native & back, e.g.:
|
||||
// k -> {kaf} -> k
|
||||
// c -> {kaf} -> k
|
||||
//#####################################################################
|
||||
// Sequences of Latin letters may convert to a single Native letter.
|
||||
// When this is the case, an apostrophe can be used to indicate separate
|
||||
// letters.
|
||||
// E.g. sh -> {shin}
|
||||
// s'h -> {sin}{heh}
|
||||
// ss -> {sad}
|
||||
// s's -> {sin}{shadda}
|
||||
//#####################################################################
|
||||
// To Do:
|
||||
// finish adding shadda, add sokoon, fix uppercase
|
||||
// make two transliteration tables: one with vowels, one without
|
||||
//#####################################################################
|
||||
// Modifications
|
||||
// Devanagari Transliterator: broken up with consonants/vowels
|
||||
//#####################################################################
|
||||
// Unicode character name definitions
|
||||
//#####################################################################
|
||||
|
||||
//consonants
|
||||
"$candrabindu=\u0901;"
|
||||
"$bindu=\u0902;"
|
||||
"$visarga=\u0903;"
|
||||
|
||||
// w<vowel> represents the stand-alone form
|
||||
"$wa=\u0905;"
|
||||
"$waa=\u0906;"
|
||||
"$wi=\u0907;"
|
||||
"$wii=\u0908;"
|
||||
"$wu=\u0909;"
|
||||
"$wuu=\u090A;"
|
||||
"$wr=\u090B;"
|
||||
"$wl=\u090C;"
|
||||
"$we=\u090F;"
|
||||
"$wai=\u0910;"
|
||||
"$wo=\u0913;"
|
||||
"$wau=\u0914;"
|
||||
|
||||
"$ka=\u0915;"
|
||||
"$kha=\u0916;"
|
||||
"$ga=\u0917;"
|
||||
"$gha=\u0918;"
|
||||
"$nga=\u0919;"
|
||||
|
||||
"$ca=\u091A;"
|
||||
"$cha=\u091B;"
|
||||
"$ja=\u091C;"
|
||||
"$jha=\u091D;"
|
||||
"$nya=\u091E;"
|
||||
|
||||
"$tta=\u091F;"
|
||||
"$ttha=\u0920;"
|
||||
"$dda=\u0921;"
|
||||
"$ddha=\u0922;"
|
||||
"$nna=\u0923;"
|
||||
|
||||
"$ta=\u0924;"
|
||||
"$tha=\u0925;"
|
||||
"$da=\u0926;"
|
||||
"$dha=\u0927;"
|
||||
"$na=\u0928;"
|
||||
|
||||
"$pa=\u092A;"
|
||||
"$pha=\u092B;"
|
||||
"$ba=\u092C;"
|
||||
"$bha=\u092D;"
|
||||
"$ma=\u092E;"
|
||||
|
||||
"$ya=\u092F;"
|
||||
"$ra=\u0930;"
|
||||
"$rra=\u0931;"
|
||||
"$la=\u0933;"
|
||||
"$va=\u0935;"
|
||||
|
||||
"$sha=\u0936;"
|
||||
"$ssa=\u0937;"
|
||||
"$sa=\u0938;"
|
||||
"$ha=\u0939;"
|
||||
|
||||
// <vowel> represents the dependent form
|
||||
"$aa=\u093E;"
|
||||
"$i=\u093F;"
|
||||
"$ii=\u0940;"
|
||||
"$u=\u0941;"
|
||||
"$uu=\u0942;"
|
||||
"$rh=\u0943;"
|
||||
"$lh=\u0944;"
|
||||
"$e=\u0947;"
|
||||
"$ai=\u0948;"
|
||||
"$o=\u094B;"
|
||||
"$au=\u094C;"
|
||||
|
||||
"$virama=\u094D;"
|
||||
|
||||
"$wrr=\u0960;"
|
||||
"$rrh=\u0962;"
|
||||
|
||||
"$danda=\u0964;"
|
||||
"$doubleDanda=\u0965;"
|
||||
"$depVowelAbove=[\u093E-\u0940\u0945-\u094C];"
|
||||
"$depVowelBelow=[\u0941-\u0944];"
|
||||
"$endThing=[$danda$doubleDanda \\u0000-\u08FF\u0980-\uFFFF];"
|
||||
|
||||
// $x was originally called '&'; $z was '%'
|
||||
"$x=[$virama$aa$ai$au$ii$i$uu$u$rrh$rh$lh$e$o];"
|
||||
"$z=[bcdfghjklmnpqrstvwxyz];"
|
||||
|
||||
//#####################################################################
|
||||
// convert from Latin letters to Native letters
|
||||
//#####################################################################
|
||||
//Hindi>\u092d\u093e\u0930\u0924--\u0020\u0926\u0947\u0936\u0020\u092c\u0928\u094d\u0927\u0941\u002e
|
||||
|
||||
// special forms with no good conversion
|
||||
|
||||
"mm>$bindu;"
|
||||
"x>$visarga;"
|
||||
|
||||
// convert to independent forms at start of word or syllable:
|
||||
// e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
|
||||
// Moved up [LIU]
|
||||
|
||||
"aa>$waa;"
|
||||
"ai>$wai;"
|
||||
"au>$wau;"
|
||||
"ii>$wii;"
|
||||
"i>$wi;"
|
||||
"uu>$wuu;"
|
||||
"u>$wu;"
|
||||
"rrh>$wrr;"
|
||||
"rh>$wr;"
|
||||
"lh>$wl;"
|
||||
"e>$we;"
|
||||
"o>$wo;"
|
||||
"a>$wa;"
|
||||
|
||||
// normal consonants
|
||||
|
||||
"kh>$kha|$virama;"
|
||||
"k>$ka|$virama;"
|
||||
"q>$ka|$virama;"
|
||||
"gh>$gha|$virama;"
|
||||
"g>$ga|$virama;"
|
||||
"ng>$nga|$virama;"
|
||||
"ch>$cha|$virama;"
|
||||
"c>$ca|$virama;"
|
||||
"jh>$jha|$virama;"
|
||||
"j>$ja|$virama;"
|
||||
"ny>$nya|$virama;"
|
||||
"tth>$ttha|$virama;"
|
||||
"tt>$tta|$virama;"
|
||||
"ddh>$ddha|$virama;"
|
||||
"dd>$dda|$virama;"
|
||||
"nn>$nna|$virama;"
|
||||
"th>$tha|$virama;"
|
||||
"t>$ta|$virama;"
|
||||
"dh>$dha|$virama;"
|
||||
"d>$da|$virama;"
|
||||
"n>$na|$virama;"
|
||||
"ph>$pha|$virama;"
|
||||
"p>$pa|$virama;"
|
||||
"bh>$bha|$virama;"
|
||||
"b>$ba|$virama;"
|
||||
"m>$ma|$virama;"
|
||||
"y>$ya|$virama;"
|
||||
"r>$ra|$virama;"
|
||||
"l>$la|$virama;"
|
||||
"v>$va|$virama;"
|
||||
"f>$va|$virama;"
|
||||
"w>$va|$virama;"
|
||||
"sh>$sha|$virama;"
|
||||
"ss>$ssa|$virama;"
|
||||
"s>$sa|$virama;"
|
||||
"z>$sa|$virama;"
|
||||
"h>$ha|$virama;"
|
||||
|
||||
"'.'>$danda;"
|
||||
"$danda'.'>$doubleDanda;"
|
||||
"$depVowelAbove{'~'>$bindu;"
|
||||
"$depVowelBelow{'~'>$candrabindu;"
|
||||
|
||||
// convert to dependent forms after consonant with no vowel:
|
||||
// e.g. kai -> {ka}{virama}ai -> {ka}{ai}
|
||||
|
||||
"$virama aa>$aa;"
|
||||
"$virama ai>$ai;"
|
||||
"$virama au>$au;"
|
||||
"$virama ii>$ii;"
|
||||
"$virama i>$i;"
|
||||
"$virama uu>$uu;"
|
||||
"$virama u>$u;"
|
||||
"$virama rrh>$rrh;"
|
||||
"$virama rh>$rh;"
|
||||
"$virama lh>$lh;"
|
||||
"$virama e>$e;"
|
||||
"$virama o>$o;"
|
||||
"$virama a>;"
|
||||
|
||||
// otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
|
||||
|
||||
"$virama''aa>$waa;"
|
||||
"$virama''ai>$wai;"
|
||||
"$virama''au>$wau;"
|
||||
"$virama''ii>$wii;"
|
||||
"$virama''i>$wi;"
|
||||
"$virama''uu>$wuu;"
|
||||
"$virama''u>$wu;"
|
||||
"$virama''rrh>$wrr;"
|
||||
"$virama''rh>$wr;"
|
||||
"$virama''lh>$wl;"
|
||||
"$virama''e>$we;"
|
||||
"$virama''o>$wo;"
|
||||
"$virama''a>$wa;"
|
||||
|
||||
"$virama}$endThing>;"
|
||||
|
||||
// convert any left-over apostrophes used for separation
|
||||
|
||||
"''>;"
|
||||
|
||||
//#####################################################################
|
||||
// convert from Native letters to Latin letters
|
||||
//#####################################################################
|
||||
|
||||
// special forms with no good conversion
|
||||
|
||||
"mm<$bindu;"
|
||||
"x<$visarga;"
|
||||
|
||||
// normal consonants
|
||||
|
||||
"kh<$kha}$x;"
|
||||
"kha<$kha;"
|
||||
"k''<$ka$virama}$ha;"
|
||||
"k<$ka}$x;"
|
||||
"ka<$ka;"
|
||||
"gh<$gha}$x;"
|
||||
"gha<$gha;"
|
||||
"g''<$ga$virama}$ha;"
|
||||
"g<$ga}$x;"
|
||||
"ga<$ga;"
|
||||
"ng<$nga}$x;"
|
||||
"nga<$nga;"
|
||||
"ch<$cha}$x;"
|
||||
"cha<$cha;"
|
||||
"c''<$ca$virama}$ha;"
|
||||
"c<$ca}$x;"
|
||||
"ca<$ca;"
|
||||
"jh<$jha}$x;"
|
||||
"jha<$jha;"
|
||||
"j''<$ja$virama}$ha;"
|
||||
"j<$ja}$x;"
|
||||
"ja<$ja;"
|
||||
"ny<$nya}$x;"
|
||||
"nya<$nya;"
|
||||
"tth<$ttha}$x;"
|
||||
"ttha<$ttha;"
|
||||
"tt''<$tta$virama}$ha;"
|
||||
"tt<$tta}$x;"
|
||||
"tta<$tta;"
|
||||
"ddh<$ddha}$x;"
|
||||
"ddha<$ddha;"
|
||||
"dd''<$dda}$x$ha;"
|
||||
"dd<$dda}$x;"
|
||||
"dda<$dda;"
|
||||
"dh<$dha}$x;"
|
||||
"dha<$dha;"
|
||||
"d''<$da$virama}$ha;"
|
||||
"d''<$da$virama}$ddha;"
|
||||
"d''<$da$virama}$dda;"
|
||||
"d''<$da$virama}$dha;"
|
||||
"d''<$da$virama}$da;"
|
||||
"d<$da}$x;"
|
||||
"da<$da;"
|
||||
"th<$tha}$x;"
|
||||
"tha<$tha;"
|
||||
"t''<$ta$virama}$ha;"
|
||||
"t''<$ta$virama}$ttha;"
|
||||
"t''<$ta$virama}$tta;"
|
||||
"t''<$ta$virama}$tha;"
|
||||
"t''<$ta$virama}$ta;"
|
||||
"t<$ta}$x;"
|
||||
"ta<$ta;"
|
||||
"n''<$na$virama}$ga;"
|
||||
"n''<$na$virama}$ya;"
|
||||
"n<$na}$x;"
|
||||
"na<$na;"
|
||||
"ph<$pha}$x;"
|
||||
"pha<$pha;"
|
||||
"p''<$pa$virama}$ha;"
|
||||
"p<$pa}$x;"
|
||||
"pa<$pa;"
|
||||
"bh<$bha}$x;"
|
||||
"bha<$bha;"
|
||||
"b''<$ba$virama}$ha;"
|
||||
"b<$ba}$x;"
|
||||
"ba<$ba;"
|
||||
"m''<$ma$virama}$ma;"
|
||||
"m''<$ma$virama}$bindu;"
|
||||
"m<$ma}$x;"
|
||||
"ma<$ma;"
|
||||
"y<$ya}$x;"
|
||||
"ya<$ya;"
|
||||
"r''<$ra$virama}$ha;"
|
||||
"r<$ra}$x;"
|
||||
"ra<$ra;"
|
||||
"l''<$la$virama}$ha;"
|
||||
"l<$la}$x;"
|
||||
"la<$la;"
|
||||
"v<$va}$x;"
|
||||
"va<$va;"
|
||||
"sh<$sha}$x;"
|
||||
"sha<$sha;"
|
||||
"ss<$ssa}$x;"
|
||||
"ssa<$ssa;"
|
||||
"s''<$sa$virama}$ha;"
|
||||
"s''<$sa$virama}$sha;"
|
||||
"s''<$sa$virama}$ssa;"
|
||||
"s''<$sa$virama}$sa;"
|
||||
"s<$sa}$x;"
|
||||
"sa<$sa;"
|
||||
"h<$ha}$x;"
|
||||
"ha<$ha;"
|
||||
|
||||
// dependent vowels (should never occur except following consonants)
|
||||
|
||||
"aa<$aa;"
|
||||
"ai<$ai;"
|
||||
"au<$au;"
|
||||
"ii<$ii;"
|
||||
"i<$i;"
|
||||
"uu<$uu;"
|
||||
"u<$u;"
|
||||
"rrh<$rrh;"
|
||||
"rh<$rh;"
|
||||
"lh<$lh;"
|
||||
"e<$e;"
|
||||
"o<$o;"
|
||||
|
||||
// independent vowels (when following consonants)
|
||||
|
||||
"''aa<a{$waa;"
|
||||
"''aa<$z{$waa;"
|
||||
"''ai<a{$wai;"
|
||||
"''ai<$z{$wai;"
|
||||
"''au<a{$wau;"
|
||||
"''au<$z{$wau;"
|
||||
"''ii<a{$wii;"
|
||||
"''ii<$z{$wii;"
|
||||
"''i<a{$wi;"
|
||||
"''i<$z{$wi;"
|
||||
"''uu<a{$wuu;"
|
||||
"''uu<$z{$wuu;"
|
||||
"''u<a{$wu;"
|
||||
"''u<$z{$wu;"
|
||||
"''rrh<$z{$wrr;"
|
||||
"''rh<$z{$wr;"
|
||||
"''lh<$z{$wl;"
|
||||
"''e<$z{$we;"
|
||||
"''o<$z{$wo;"
|
||||
"''a<a{$wa;"
|
||||
"''a<$z{$wa;"
|
||||
|
||||
|
||||
// independent vowels (otherwise)
|
||||
|
||||
"aa<$waa;"
|
||||
"ai<$wai;"
|
||||
"au<$wau;"
|
||||
"ii<$wii;"
|
||||
"i<$wi;"
|
||||
"uu<$wuu;"
|
||||
"u<$wu;"
|
||||
"rrh<$wrr;"
|
||||
"rh<$wr;"
|
||||
"lh<$wl;"
|
||||
"e<$we;"
|
||||
"o<$wo;"
|
||||
"a<$wa;"
|
||||
|
||||
// blow away any remaining viramas
|
||||
|
||||
"<$virama;"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,376 +1,380 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_Latin_Greek.java
|
||||
// Date: Fri May 19 15:50:22 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Greek
|
||||
|
||||
lgreek {
|
||||
Rule {
|
||||
// ==============================================
|
||||
// Modern Greek Transliteration Rules
|
||||
//
|
||||
// This transliterates modern Greek characters, but using rules
|
||||
// that are traditional for Ancient Greek, and
|
||||
// thus more resemble Greek words that have become part
|
||||
// of English. It differs from the official Greek
|
||||
// transliteration, which is more phonetic (since
|
||||
// most modern Greek vowels, for example, have
|
||||
// degenerated simply to sound like "ee").
|
||||
//
|
||||
// There are only a few tricky parts.
|
||||
// 1. eta and omega don't map directly to Latin vowels,
|
||||
// so we use a macron on e and o, and some
|
||||
// other combinations if they are accented.
|
||||
// 2. The accented, diaeresis i and y are substituted too.
|
||||
// 3. Some letters use digraphs, like "ph". While typical,
|
||||
// they need some special handling.
|
||||
// 4. A gamma before a gamma or a few other letters is
|
||||
// transliterated as an "n", as in "Anglo"
|
||||
// 5. An ypsilon after a vowel is a "u", as in
|
||||
// "Mouseio". Otherwise it is a "y" as in "Physikon"
|
||||
// 6. The construction of the rules is made simpler by making sure
|
||||
// that most rules for lowercase letters exactly correspond to the
|
||||
// rules for uppercase letters, *except* for the case of the letters
|
||||
// in the rule itself. That way, after modifying the uppercase rules,
|
||||
// you can just copy, paste, and "set to lowercase" to get
|
||||
// the rules for lowercase letters!
|
||||
// ==============================================
|
||||
|
||||
// ==============================================
|
||||
// Variables, used to make the rules more comprehensible
|
||||
// and for conditionals.
|
||||
// ==============================================
|
||||
|
||||
// Latin Letters
|
||||
|
||||
"E-MACRON=\u0112;"
|
||||
"e-macron=\u0113;"
|
||||
"O-MACRON=\u014C;"
|
||||
"o-macron=\u014D;"
|
||||
"Y-UMLAUT=\u0178;"
|
||||
"y-umlaut=\u00FF;"
|
||||
|
||||
//! // with real accents.
|
||||
//! + "E-MACRON-ACUTE=\u0112\u0301;"
|
||||
//! + "e-macron-acute=\u0113\u0301;"
|
||||
//! + "O-MACRON-ACUTE=\u014C\u0301;"
|
||||
//! + "o-macron-acute=\u014D\u0301;"
|
||||
//! + "y-umlaut-acute=\u00FF\u0301;"
|
||||
//! + "\u00ef-acute=\u00ef\u0301;"
|
||||
//! + "\u00fc-acute=\u00fc\u0301;"
|
||||
//! //
|
||||
|
||||
// single letter equivalents
|
||||
|
||||
"E-MACRON-ACUTE=\u00CA;"
|
||||
"e-macron-acute=\u00EA;"
|
||||
"O-MACRON-ACUTE=\u00D4;"
|
||||
"o-macron-acute=\u00F4;"
|
||||
"y-umlaut-acute=\u0177;"
|
||||
"\u00ef-acute=\u00EE;"
|
||||
"\u00fc-acute=\u00FB;"
|
||||
|
||||
// Greek Letters
|
||||
|
||||
"ALPHA=\u0391;"
|
||||
"BETA=\u0392;"
|
||||
"GAMMA=\u0393;"
|
||||
"DELTA=\u0394;"
|
||||
"EPSILON=\u0395;"
|
||||
"ZETA=\u0396;"
|
||||
"ETA=\u0397;"
|
||||
"THETA=\u0398;"
|
||||
"IOTA=\u0399;"
|
||||
"KAPPA=\u039A;"
|
||||
"LAMBDA=\u039B;"
|
||||
"MU=\u039C;"
|
||||
"NU=\u039D;"
|
||||
"XI=\u039E;"
|
||||
"OMICRON=\u039F;"
|
||||
"PI=\u03A0;"
|
||||
"RHO=\u03A1;"
|
||||
"SIGMA=\u03A3;"
|
||||
"TAU=\u03A4;"
|
||||
"YPSILON=\u03A5;"
|
||||
"PHI=\u03A6;"
|
||||
"CHI=\u03A7;"
|
||||
"PSI=\u03A8;"
|
||||
"OMEGA=\u03A9;"
|
||||
|
||||
"ALPHA+=\u0386;"
|
||||
"EPSILON+=\u0388;"
|
||||
"ETA+=\u0389;"
|
||||
"IOTA+=\u038A;"
|
||||
"OMICRON+=\u038C;"
|
||||
"YPSILON+=\u038E;"
|
||||
"OMEGA+=\u038F;"
|
||||
"IOTA_DIAERESIS=\u03AA;"
|
||||
"YPSILON_DIAERESIS=\u03AB;"
|
||||
|
||||
"alpha=\u03B1;"
|
||||
"beta=\u03B2;"
|
||||
"gamma=\u03B3;"
|
||||
"delta=\u03B4;"
|
||||
"epsilon=\u03B5;"
|
||||
"zeta=\u03B6;"
|
||||
"eta=\u03B7;"
|
||||
"theta=\u03B8;"
|
||||
"iota=\u03B9;"
|
||||
"kappa=\u03BA;"
|
||||
"lambda=\u03BB;"
|
||||
"mu=\u03BC;"
|
||||
"nu=\u03BD;"
|
||||
"xi=\u03BE;"
|
||||
"omicron=\u03BF;"
|
||||
"pi=\u03C0;"
|
||||
"rho=\u03C1;"
|
||||
"sigma=\u03C3;"
|
||||
"tau=\u03C4;"
|
||||
"ypsilon=\u03C5;"
|
||||
"phi=\u03C6;"
|
||||
"chi=\u03C7;"
|
||||
"psi=\u03C8;"
|
||||
"omega=\u03C9;"
|
||||
|
||||
//forms
|
||||
|
||||
"alpha+=\u03AC;"
|
||||
"epsilon+=\u03AD;"
|
||||
"eta+=\u03AE;"
|
||||
"iota+=\u03AF;"
|
||||
"omicron+=\u03CC;"
|
||||
"ypsilon+=\u03CD;"
|
||||
"omega+=\u03CE;"
|
||||
"iota_diaeresis=\u03CA;"
|
||||
"ypsilon_diaeresis=\u03CB;"
|
||||
"iota_diaeresis+=\u0390;"
|
||||
"ypsilon_diaeresis+=\u03B0;"
|
||||
"sigma+=\u03C2;"
|
||||
|
||||
// Variables for conditional mappings
|
||||
|
||||
// Use lowercase for all variable names, to allow cut/paste below.
|
||||
|
||||
"letter=[~[:Lu:][:Ll:]];"
|
||||
"lower=[[:Ll:]];"
|
||||
"softener=[eiyEIY];"
|
||||
"vowel=[aeiouAEIOU"
|
||||
"{ALPHA}{EPSILON}{ETA}{IOTA}{OMICRON}{YPSILON}{OMEGA}"
|
||||
"{ALPHA+}{EPSILON+}{ETA+}{IOTA+}{OMICRON+}{YPSILON+}{OMEGA+}"
|
||||
"{IOTA_DIAERESIS}{YPSILON_DIAERESIS}"
|
||||
"{alpha}{epsilon}{eta}{iota}{omicron}{ypsilon}{omega}"
|
||||
"{alpha+}{epsilon+}{eta+}{iota+}{omicron+}{ypsilon+}{omega+}"
|
||||
"{iota_diaeresis}{ypsilon_diaeresis}"
|
||||
"{iota_diaeresis+}{ypsilon_diaeresis+}"
|
||||
"];"
|
||||
"n-gamma=[GKXCgkxc];"
|
||||
"gamma-n=[{GAMMA}{KAPPA}{CHI}{XI}{gamma}{kappa}{chi}{xi}];"
|
||||
"pp=[Pp];"
|
||||
|
||||
// ==============================================
|
||||
// Rules
|
||||
// ==============================================
|
||||
// The following are special titlecases, and should
|
||||
// not be copied when duplicating the lowercase
|
||||
// ==============================================
|
||||
|
||||
"Th <> {THETA}({lower};"
|
||||
"Ph <> {PHI}({lower};"
|
||||
"Ch <> {CHI}({lower};"
|
||||
//masked: + "Ps<{PHI}({lower};"
|
||||
|
||||
// Because there are no uppercase forms for final sigma,
|
||||
// we had to move all the sigma rules up here.
|
||||
|
||||
// Remember to insert ' to preserve round trip, for double letters
|
||||
// don't need to do this for the digraphs with h,
|
||||
// since it is not created when mapping back from greek
|
||||
|
||||
// use special form for s
|
||||
|
||||
"''S <> ({pp}) {SIGMA} ;" // handle PS
|
||||
"S <> {SIGMA};"
|
||||
|
||||
// The following are a bit tricky. 's' takes two forms in greek
|
||||
// final or non final.
|
||||
// We use ~s to represent the abnormal form: final before letter
|
||||
// or non-final before non-letter.
|
||||
// We use 's to separate p and s (otherwise ps is one letter)
|
||||
// so, we break out the following forms:
|
||||
|
||||
"''s < ({pp}) {sigma} ({letter});"
|
||||
"s < {sigma} ({letter});"
|
||||
"~s < {sigma} ;"
|
||||
|
||||
"~s < {sigma+} ({letter});"
|
||||
"''s < ({pp}) {sigma+} ;"
|
||||
"s < {sigma+} ;"
|
||||
|
||||
"~s ({letter}) > {sigma+};"
|
||||
"~s > {sigma};"
|
||||
"''s ({letter}) > {sigma};"
|
||||
"''s > {sigma+};"
|
||||
"s ({letter}) > {sigma};"
|
||||
"s > {sigma+};"
|
||||
|
||||
// because there are no uppercase forms, had to move these up too.
|
||||
|
||||
"i\"`>{iota_diaeresis+};"
|
||||
"y\"`>{ypsilon_diaeresis+};"
|
||||
|
||||
"{\u00ef-acute} <> {iota_diaeresis+};"
|
||||
"{\u00fc-acute} <> {vowel}){ypsilon_diaeresis+};"
|
||||
"{y-umlaut-acute} <> {ypsilon_diaeresis+};"
|
||||
|
||||
// ==============================================
|
||||
// Uppercase Forms.
|
||||
// To make lowercase forms, just copy and lowercase below
|
||||
// ==============================================
|
||||
|
||||
// Typing variants, in case the keyboard doesn't have accents
|
||||
|
||||
"A`>{ALPHA+};"
|
||||
"E`>{EPSILON+};"
|
||||
"EE`>{ETA+};"
|
||||
"EE>{ETA};"
|
||||
"I`>{IOTA+};"
|
||||
"O`>{OMICRON+};"
|
||||
"OO`>{OMEGA+};"
|
||||
"OO>{OMEGA};"
|
||||
"I\">{IOTA_DIAERESIS};"
|
||||
"Y\">{YPSILON_DIAERESIS};"
|
||||
|
||||
// Basic Letters
|
||||
|
||||
"A<>{ALPHA};"
|
||||
"\u00c1<>{ALPHA+};"
|
||||
"B<>{BETA};"
|
||||
"N ({n-gamma}) <> {GAMMA} ({gamma-n});"
|
||||
"G<>{GAMMA};"
|
||||
"D<>{DELTA};"
|
||||
"''E <> ([Ee]){EPSILON};" // handle EE
|
||||
"E<>{EPSILON};"
|
||||
"\u00c9<>{EPSILON+};"
|
||||
"Z<>{ZETA};"
|
||||
"{E-MACRON-ACUTE}<>{ETA+};"
|
||||
"{E-MACRON}<>{ETA};"
|
||||
"TH<>{THETA};"
|
||||
"I<>{IOTA};"
|
||||
"\u00cd<>{IOTA+};"
|
||||
"\u00cf<>{IOTA_DIAERESIS};"
|
||||
"K<>{KAPPA};"
|
||||
"L<>{LAMBDA};"
|
||||
"M<>{MU};"
|
||||
"N'' <> {NU} ({gamma-n});"
|
||||
"N<>{NU};"
|
||||
"X<>{XI};"
|
||||
"''O <> ([Oo]) {OMICRON};" // handle OO
|
||||
"O<>{OMICRON};"
|
||||
"\u00d3<>{OMICRON+};"
|
||||
"PH<>{PHI};" // needs ordering before P
|
||||
"PS<>{PSI};" // needs ordering before P
|
||||
"P<>{PI};"
|
||||
"R<>{RHO};"
|
||||
"T<>{TAU};"
|
||||
"U <> ({vowel}) {YPSILON};"
|
||||
"\u00da <> ({vowel}) {YPSILON+};"
|
||||
"\u00dc <> ({vowel}) {YPSILON_DIAERESIS};"
|
||||
"Y<>{YPSILON};"
|
||||
"\u00dd<>{YPSILON+};"
|
||||
"{Y-UMLAUT}<>{YPSILON_DIAERESIS};"
|
||||
"CH<>{CHI};"
|
||||
"{O-MACRON-ACUTE}<>{OMEGA+};"
|
||||
"{O-MACRON}<>{OMEGA};"
|
||||
|
||||
// Extra English Letters. Mapped for completeness
|
||||
|
||||
"C({softener})>|S;"
|
||||
"C>|K;"
|
||||
"F>|PH;"
|
||||
"H>|CH;"
|
||||
"J>|I;"
|
||||
"Q>|K;"
|
||||
"V>|U;"
|
||||
"W>|U;"
|
||||
|
||||
// ==============================================
|
||||
// Lowercase Forms. Just copy above and lowercase
|
||||
// ==============================================
|
||||
|
||||
// typing variants, in case the keyboard doesn't have accents
|
||||
|
||||
"a`>{alpha+};"
|
||||
"e`>{epsilon+};"
|
||||
"ee`>{eta+};"
|
||||
"ee>{eta};"
|
||||
"i`>{iota+};"
|
||||
"o`>{omicron+};"
|
||||
"oo`>{omega+};"
|
||||
"oo>{omega};"
|
||||
"i\">{iota_diaeresis};"
|
||||
"y\">{ypsilon_diaeresis};"
|
||||
|
||||
// basic letters
|
||||
|
||||
"a<>{alpha};"
|
||||
"\u00e1<>{alpha+};"
|
||||
"b<>{beta};"
|
||||
"n ({n-gamma}) <> {gamma} ({gamma-n});"
|
||||
"g<>{gamma};"
|
||||
"d<>{delta};"
|
||||
"''e <> ([Ee]){epsilon};" // handle EE
|
||||
"e<>{epsilon};"
|
||||
"\u00e9<>{epsilon+};"
|
||||
"z<>{zeta};"
|
||||
"{e-macron-acute}<>{eta+};"
|
||||
"{e-macron}<>{eta};"
|
||||
"th<>{theta};"
|
||||
"i<>{iota};"
|
||||
"\u00ed<>{iota+};"
|
||||
"\u00ef<>{iota_diaeresis};"
|
||||
"k<>{kappa};"
|
||||
"l<>{lambda};"
|
||||
"m<>{mu};"
|
||||
"n'' <> {nu} ({gamma-n});"
|
||||
"n<>{nu};"
|
||||
"x<>{xi};"
|
||||
"''o <> ([Oo]) {omicron};" // handle OO
|
||||
"o<>{omicron};"
|
||||
"\u00f3<>{omicron+};"
|
||||
"ph<>{phi};" // needs ordering before p
|
||||
"ps<>{psi};" // needs ordering before p
|
||||
"p<>{pi};"
|
||||
"r<>{rho};"
|
||||
"t<>{tau};"
|
||||
"u <> ({vowel}){ypsilon};"
|
||||
"\u00fa <> ({vowel}){ypsilon+};"
|
||||
"\u00fc <> ({vowel}){ypsilon_diaeresis};"
|
||||
"y<>{ypsilon};"
|
||||
"\u00fd<>{ypsilon+};"
|
||||
"{y-umlaut}<>{ypsilon_diaeresis};"
|
||||
"ch<>{chi};"
|
||||
"{o-macron-acute}<>{omega+};"
|
||||
"{o-macron}<>{omega};"
|
||||
|
||||
// extra english letters. mapped for completeness
|
||||
|
||||
"c({softener})>|s;"
|
||||
"c>|k;"
|
||||
"f>|ph;"
|
||||
"h>|ch;"
|
||||
"j>|i;"
|
||||
"q>|k;"
|
||||
"v>|u;"
|
||||
"w>|u;"
|
||||
|
||||
// ====================================
|
||||
// Normal final rule: remove '
|
||||
// ====================================
|
||||
|
||||
//+ "''>;"
|
||||
}
|
||||
Rule {
|
||||
// ==============================================
|
||||
// Modern Greek Transliteration Rules
|
||||
//
|
||||
// This transliterates modern Greek characters, but using rules
|
||||
// that are traditional for Ancient Greek, and
|
||||
// thus more resemble Greek words that have become part
|
||||
// of English. It differs from the official Greek
|
||||
// transliteration, which is more phonetic (since
|
||||
// most modern Greek vowels, for example, have
|
||||
// degenerated simply to sound like "ee").
|
||||
//
|
||||
// There are only a few tricky parts.
|
||||
// 1. eta and omega don't map directly to Latin vowels,
|
||||
// so we use a macron on e and o, and some
|
||||
// other combinations if they are accented.
|
||||
// 2. The accented, diaeresis i and y are substituted too.
|
||||
// 3. Some letters use digraphs, like "ph". While typical,
|
||||
// they need some special handling.
|
||||
// 4. A gamma before a gamma or a few other letters is
|
||||
// transliterated as an "n", as in "Anglo"
|
||||
// 5. An ypsilon after a vowel is a "u", as in
|
||||
// "Mouseio". Otherwise it is a "y" as in "Physikon"
|
||||
// 6. The construction of the rules is made simpler by making sure
|
||||
// that most rules for lowercase letters exactly correspond to the
|
||||
// rules for uppercase letters, *except* for the case of the letters
|
||||
// in the rule itself. That way, after modifying the uppercase rules,
|
||||
// you can just copy, paste, and "set to lowercase" to get
|
||||
// the rules for lowercase letters!
|
||||
// ==============================================
|
||||
|
||||
// ==============================================
|
||||
// Variables, used to make the rules more comprehensible
|
||||
// and for conditionals.
|
||||
// ==============================================
|
||||
|
||||
"$quote='\"';"
|
||||
|
||||
// Latin Letters
|
||||
|
||||
"$E_MACRON=\u0112;"
|
||||
"$e_macron=\u0113;"
|
||||
"$O_MACRON=\u014C;"
|
||||
"$o_macron=\u014D;"
|
||||
"$Y_UMLAUT=\u0178;"
|
||||
"$y_umlaut=\u00FF;"
|
||||
|
||||
//! // with real accents.
|
||||
//! + "$E_MACRON_ACUTE=\u0112\u0301;"
|
||||
//! + "$e_macron_acute=\u0113\u0301;"
|
||||
//! + "$O_MACRON_ACUTE=\u014C\u0301;"
|
||||
//! + "$o_macron_acute=\u014D\u0301;"
|
||||
//! + "$y_umlaut_acute=\u00FF\u0301;"
|
||||
//! + "$u00ef_acute=\u00ef\u0301;"
|
||||
//! + "$u00fc_acute=\u00fc\u0301;"
|
||||
//! //
|
||||
|
||||
// single letter equivalents
|
||||
|
||||
"$E_MACRON_ACUTE=\u00CA;"
|
||||
"$e_macron_acute=\u00EA;"
|
||||
"$O_MACRON_ACUTE=\u00D4;"
|
||||
"$o_macron_acute=\u00F4;"
|
||||
"$y_umlaut_acute=\u0177;"
|
||||
"$u00ef_acute=\u00EE;"
|
||||
"$u00fc_acute=\u00FB;"
|
||||
|
||||
// Greek Letters
|
||||
|
||||
"$ALPHA=\u0391;"
|
||||
"$BETA=\u0392;"
|
||||
"$GAMMA=\u0393;"
|
||||
"$DELTA=\u0394;"
|
||||
"$EPSILON=\u0395;"
|
||||
"$ZETA=\u0396;"
|
||||
"$ETA=\u0397;"
|
||||
"$THETA=\u0398;"
|
||||
"$IOTA=\u0399;"
|
||||
"$KAPPA=\u039A;"
|
||||
"$LAMBDA=\u039B;"
|
||||
"$MU=\u039C;"
|
||||
"$NU=\u039D;"
|
||||
"$XI=\u039E;"
|
||||
"$OMICRON=\u039F;"
|
||||
"$PI=\u03A0;"
|
||||
"$RHO=\u03A1;"
|
||||
"$SIGMA=\u03A3;"
|
||||
"$TAU=\u03A4;"
|
||||
"$YPSILON=\u03A5;"
|
||||
"$PHI=\u03A6;"
|
||||
"$CHI=\u03A7;"
|
||||
"$PSI=\u03A8;"
|
||||
"$OMEGA=\u03A9;"
|
||||
|
||||
"$ALPHA2=\u0386;"
|
||||
"$EPSILON2=\u0388;"
|
||||
"$ETA2=\u0389;"
|
||||
"$IOTA2=\u038A;"
|
||||
"$OMICRON2=\u038C;"
|
||||
"$YPSILON2=\u038E;"
|
||||
"$OMEGA2=\u038F;"
|
||||
"$IOTA_DIAERESIS=\u03AA;"
|
||||
"$YPSILON_DIAERESIS=\u03AB;"
|
||||
|
||||
"$alpha=\u03B1;"
|
||||
"$beta=\u03B2;"
|
||||
"$gamma=\u03B3;"
|
||||
"$delta=\u03B4;"
|
||||
"$epsilon=\u03B5;"
|
||||
"$zeta=\u03B6;"
|
||||
"$eta=\u03B7;"
|
||||
"$theta=\u03B8;"
|
||||
"$iota=\u03B9;"
|
||||
"$kappa=\u03BA;"
|
||||
"$lambda=\u03BB;"
|
||||
"$mu=\u03BC;"
|
||||
"$nu=\u03BD;"
|
||||
"$xi=\u03BE;"
|
||||
"$omicron=\u03BF;"
|
||||
"$pi=\u03C0;"
|
||||
"$rho=\u03C1;"
|
||||
"$sigma=\u03C3;"
|
||||
"$tau=\u03C4;"
|
||||
"$ypsilon=\u03C5;"
|
||||
"$phi=\u03C6;"
|
||||
"$chi=\u03C7;"
|
||||
"$psi=\u03C8;"
|
||||
"$omega=\u03C9;"
|
||||
|
||||
//forms
|
||||
|
||||
"$alpha2=\u03AC;"
|
||||
"$epsilon2=\u03AD;"
|
||||
"$eta2=\u03AE;"
|
||||
"$iota2=\u03AF;"
|
||||
"$omicron2=\u03CC;"
|
||||
"$ypsilon2=\u03CD;"
|
||||
"$omega2=\u03CE;"
|
||||
"$iota_diaeresis=\u03CA;"
|
||||
"$ypsilon_diaeresis=\u03CB;"
|
||||
"$iota_diaeresis2=\u0390;"
|
||||
"$ypsilon_diaeresis2=\u03B0;"
|
||||
"$sigma2=\u03C2;"
|
||||
|
||||
// Variables for conditional mappings
|
||||
|
||||
// Use lowercase for all variable names, to allow cut/paste below.
|
||||
|
||||
"$letter=[~[:Lu:][:Ll:]];"
|
||||
"$lower=[[:Ll:]];"
|
||||
"$softener=[eiyEIY];"
|
||||
"$vowel=[aeiouAEIOU"
|
||||
"$ALPHA$EPSILON$ETA$IOTA$OMICRON$YPSILON$OMEGA"
|
||||
"$ALPHA2$EPSILON2$ETA2$IOTA2$OMICRON2$YPSILON2$OMEGA2"
|
||||
"$IOTA_DIAERESIS$YPSILON_DIAERESIS"
|
||||
"$alpha$epsilon$eta$iota$omicron$ypsilon$omega"
|
||||
"$alpha2$epsilon2$eta2$iota2$omicron2$ypsilon2$omega2"
|
||||
"$iota_diaeresis$ypsilon_diaeresis"
|
||||
"$iota_diaeresis2$ypsilon_diaeresis2"
|
||||
"];"
|
||||
"$n_gamma=[GKXCgkxc];"
|
||||
"$gamma_n=[$GAMMA$KAPPA$CHI$XI$gamma$kappa$chi$xi];"
|
||||
"$pp=[Pp];"
|
||||
|
||||
// ==============================================
|
||||
// Rules
|
||||
// ==============================================
|
||||
// The following are special titlecases, and should
|
||||
// not be copied when duplicating the lowercase
|
||||
// ==============================================
|
||||
|
||||
"Th <> $THETA}$lower;"
|
||||
"Ph <> $PHI}$lower;"
|
||||
"Ch <> $CHI}$lower;"
|
||||
//masked: + "Ps<$PHI}$lower;"
|
||||
|
||||
// Because there are no uppercase forms for final sigma,
|
||||
// we had to move all the sigma rules up here.
|
||||
|
||||
// Remember to insert ' to preserve round trip, for double letters
|
||||
// don't need to do this for the digraphs with h,
|
||||
// since it is not created when mapping back from greek
|
||||
|
||||
// use special form for s
|
||||
|
||||
"''S <> $pp{$SIGMA;" // handle PS
|
||||
"S <> $SIGMA;"
|
||||
|
||||
// The following are a bit tricky. 's' takes two forms in greek
|
||||
// final or non final.
|
||||
// We use ~s to represent the abnormal form: final before letter
|
||||
// or non-final before non-letter.
|
||||
// We use 's to separate p and s (otherwise ps is one letter)
|
||||
// so, we break out the following forms:
|
||||
|
||||
"''s < $pp{$sigma}$letter;"
|
||||
"s < $sigma}$letter;"
|
||||
"'~'s < $sigma;"
|
||||
|
||||
"'~'s < $sigma2}$letter;"
|
||||
"''s < $pp{$sigma2;"
|
||||
"s < $sigma2;"
|
||||
|
||||
"'~'s }$letter>$sigma2;"
|
||||
"'~'s > $sigma;"
|
||||
"''s }$letter>$sigma;"
|
||||
"''s > $sigma2;"
|
||||
"s }$letter>$sigma;"
|
||||
"s > $sigma2;"
|
||||
|
||||
// because there are no uppercase forms, had to move these up too.
|
||||
|
||||
"i$quote'`'>$iota_diaeresis2;"
|
||||
"y$quote'`'>$ypsilon_diaeresis2;"
|
||||
|
||||
"$u00ef_acute<>$iota_diaeresis2;"
|
||||
"$u00fc_acute<>$vowel{$ypsilon_diaeresis2;"
|
||||
"$y_umlaut_acute<>$ypsilon_diaeresis2;"
|
||||
|
||||
// ==============================================
|
||||
// Uppercase Forms.
|
||||
// To make lowercase forms, just copy and lowercase below
|
||||
// ==============================================
|
||||
|
||||
// Typing variants, in case the keyboard doesn't have accents
|
||||
|
||||
"'A`'>$ALPHA2;"
|
||||
"'E`'>$EPSILON2;"
|
||||
"'EE`'>$ETA2;"
|
||||
"EE>$ETA;"
|
||||
"'I`'>$IOTA2;"
|
||||
"'O`'>$OMICRON2;"
|
||||
"'OO`'>$OMEGA2;"
|
||||
"OO>$OMEGA;"
|
||||
"I$quote>$IOTA_DIAERESIS;"
|
||||
"Y$quote>$YPSILON_DIAERESIS;"
|
||||
|
||||
// Basic Letters
|
||||
|
||||
"A<>$ALPHA;"
|
||||
"\u00c1<>$ALPHA2;"
|
||||
"B<>$BETA;"
|
||||
"N }$n_gamma<>$GAMMA}$gamma_n;"
|
||||
"G<>$GAMMA;"
|
||||
"D<>$DELTA;"
|
||||
"''E <> [Ee]{$EPSILON;" // handle EE
|
||||
"E<>$EPSILON;"
|
||||
"\u00c9<>$EPSILON2;"
|
||||
"Z<>$ZETA;"
|
||||
"$E_MACRON_ACUTE<>$ETA2;"
|
||||
"$E_MACRON<>$ETA;"
|
||||
"TH<>$THETA;"
|
||||
"I<>$IOTA;"
|
||||
"\u00cd<>$IOTA2;"
|
||||
"\u00cf<>$IOTA_DIAERESIS;"
|
||||
"K<>$KAPPA;"
|
||||
"L<>$LAMBDA;"
|
||||
"M<>$MU;"
|
||||
"N'' <> $NU}$gamma_n;"
|
||||
"N<>$NU;"
|
||||
"X<>$XI;"
|
||||
"''O <> [Oo]{ $OMICRON;" // handle OO
|
||||
"O<>$OMICRON;"
|
||||
"\u00d3<>$OMICRON2;"
|
||||
"PH<>$PHI;" // needs ordering before P
|
||||
"PS<>$PSI;" // needs ordering before P
|
||||
"P<>$PI;"
|
||||
"R<>$RHO;"
|
||||
"T<>$TAU;"
|
||||
"U <> $vowel{$YPSILON;"
|
||||
"\u00da <> $vowel{$YPSILON2;"
|
||||
"\u00dc <> $vowel{$YPSILON_DIAERESIS;"
|
||||
"Y<>$YPSILON;"
|
||||
"\u00dd<>$YPSILON2;"
|
||||
"$Y_UMLAUT<>$YPSILON_DIAERESIS;"
|
||||
"CH<>$CHI;"
|
||||
"$O_MACRON_ACUTE<>$OMEGA2;"
|
||||
"$O_MACRON<>$OMEGA;"
|
||||
|
||||
// Extra English Letters. Mapped for completeness
|
||||
|
||||
"C}$softener>|S;"
|
||||
"C>|K;"
|
||||
"F>|PH;"
|
||||
"H>|CH;"
|
||||
"J>|I;"
|
||||
"Q>|K;"
|
||||
"V>|U;"
|
||||
"W>|U;"
|
||||
|
||||
// ==============================================
|
||||
// Lowercase Forms. Just copy above and lowercase
|
||||
// ==============================================
|
||||
|
||||
// typing variants, in case the keyboard doesn't have accents
|
||||
|
||||
"'a`'>$alpha2;"
|
||||
"'e`'>$epsilon2;"
|
||||
"'ee`'>$eta2;"
|
||||
"ee>$eta;"
|
||||
"'i`'>$iota2;"
|
||||
"'o`'>$omicron2;"
|
||||
"'oo`'>$omega2;"
|
||||
"oo>$omega;"
|
||||
"i$quote>$iota_diaeresis;"
|
||||
"y$quote>$ypsilon_diaeresis;"
|
||||
|
||||
// basic letters
|
||||
|
||||
"a<>$alpha;"
|
||||
"\u00e1<>$alpha2;"
|
||||
"b<>$beta;"
|
||||
"n }$n_gamma<>$gamma}$gamma_n;"
|
||||
"g<>$gamma;"
|
||||
"d<>$delta;"
|
||||
"''e <> [Ee]{$epsilon;" // handle EE
|
||||
"e<>$epsilon;"
|
||||
"\u00e9<>$epsilon2;"
|
||||
"z<>$zeta;"
|
||||
"$e_macron_acute<>$eta2;"
|
||||
"$e_macron<>$eta;"
|
||||
"th<>$theta;"
|
||||
"i<>$iota;"
|
||||
"\u00ed<>$iota2;"
|
||||
"\u00ef<>$iota_diaeresis;"
|
||||
"k<>$kappa;"
|
||||
"l<>$lambda;"
|
||||
"m<>$mu;"
|
||||
"n'' <> $nu}$gamma_n;"
|
||||
"n<>$nu;"
|
||||
"x<>$xi;"
|
||||
"''o <> [Oo]{ $omicron;" // handle OO
|
||||
"o<>$omicron;"
|
||||
"\u00f3<>$omicron2;"
|
||||
"ph<>$phi;" // needs ordering before p
|
||||
"ps<>$psi;" // needs ordering before p
|
||||
"p<>$pi;"
|
||||
"r<>$rho;"
|
||||
"t<>$tau;"
|
||||
"u <> $vowel{$ypsilon;"
|
||||
"\u00fa <> $vowel{$ypsilon2;"
|
||||
"\u00fc <> $vowel{$ypsilon_diaeresis;"
|
||||
"y<>$ypsilon;"
|
||||
"\u00fd<>$ypsilon2;"
|
||||
"$y_umlaut<>$ypsilon_diaeresis;"
|
||||
"ch<>$chi;"
|
||||
"$o_macron_acute<>$omega2;"
|
||||
"$o_macron<>$omega;"
|
||||
|
||||
// extra english letters. mapped for completeness
|
||||
|
||||
"c}$softener>|s;"
|
||||
"c>|k;"
|
||||
"f>|ph;"
|
||||
"h>|ch;"
|
||||
"j>|i;"
|
||||
"q>|k;"
|
||||
"v>|u;"
|
||||
"w>|u;"
|
||||
|
||||
// ====================================
|
||||
// Normal final rule: remove '
|
||||
// ====================================
|
||||
|
||||
//+ "''>;"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,278 +1,309 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_Latin_Hebrew.java
|
||||
// Date: Fri May 19 15:50:22 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Hebrew
|
||||
|
||||
lhebrew {
|
||||
Rule {
|
||||
//variable names, derived from the Unicode names.
|
||||
|
||||
"POINT_SHEVA=\u05B0;"
|
||||
"POINT_HATAF_SEGOL=\u05B1;"
|
||||
"POINT_HATAF_PATAH=\u05B2;"
|
||||
"POINT_HATAF_QAMATS=\u05B3;"
|
||||
"POINT_HIRIQ=\u05B4;"
|
||||
"POINT_TSERE=\u05B5;"
|
||||
"POINT_SEGOL=\u05B6;"
|
||||
"POINT_PATAH=\u05B7;"
|
||||
"POINT_QAMATS=\u05B8;"
|
||||
"POINT_HOLAM=\u05B9;"
|
||||
"POINT_QUBUTS=\u05BB;"
|
||||
"POINT_DAGESH_OR_MAPIQ=\u05BC;"
|
||||
"POINT_METEG=\u05BD;"
|
||||
"PUNCTUATION_MAQAF=\u05BE;"
|
||||
"POINT_RAFE=\u05BF;"
|
||||
"PUNCTUATION_PASEQ=\u05C0;"
|
||||
"POINT_SHIN_DOT=\u05C1;"
|
||||
"POINT_SIN_DOT=\u05C2;"
|
||||
"PUNCTUATION_SOF_PASUQ=\u05C3;"
|
||||
"ALEF=\u05D0;"
|
||||
"BET=\u05D1;"
|
||||
"GIMEL=\u05D2;"
|
||||
"DALET=\u05D3;"
|
||||
"HE=\u05D4;"
|
||||
"VAV=\u05D5;"
|
||||
"ZAYIN=\u05D6;"
|
||||
"HET=\u05D7;"
|
||||
"TET=\u05D8;"
|
||||
"YOD=\u05D9;"
|
||||
"FINAL_KAF=\u05DA;"
|
||||
"KAF=\u05DB;"
|
||||
"LAMED=\u05DC;"
|
||||
"FINAL_MEM=\u05DD;"
|
||||
"MEM=\u05DE;"
|
||||
"FINAL_NUN=\u05DF;"
|
||||
"NUN=\u05E0;"
|
||||
"SAMEKH=\u05E1;"
|
||||
"AYIN=\u05E2;"
|
||||
"FINAL_PE=\u05E3;"
|
||||
"PE=\u05E4;"
|
||||
"FINAL_TSADI=\u05E5;"
|
||||
"TSADI=\u05E6;"
|
||||
"QOF=\u05E7;"
|
||||
"RESH=\u05E8;"
|
||||
"SHIN=\u05E9;"
|
||||
"TAV=\u05EA;"
|
||||
"YIDDISH_DOUBLE_VAV=\u05F0;"
|
||||
"YIDDISH_VAV_YOD=\u05F1;"
|
||||
"YIDDISH_DOUBLE_YOD=\u05F2;"
|
||||
"PUNCTUATION_GERESH=\u05F3;"
|
||||
"PUNCTUATION_GERSHAYIM=\u05F4;"
|
||||
|
||||
//wildcards
|
||||
//The values can be anything we don't use in this file: start at E000.
|
||||
|
||||
"letter=[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ];"
|
||||
|
||||
"softvowel=[eiyEIY];"
|
||||
|
||||
"vowellike=[{ALEF}{AYIN}{YOD}{VAV}];"
|
||||
|
||||
//?>{POINT_SHEVA}
|
||||
//?>{POINT_HATAF_SEGOL}
|
||||
//?>{POINT_HATAF_PATAH}
|
||||
//?>{POINT_HATAF_QAMATS}
|
||||
//?>{POINT_HIRIQ}
|
||||
//?>{POINT_TSERE}
|
||||
//?>{POINT_SEGOL}
|
||||
//?>{POINT_PATAH}
|
||||
//?>{POINT_QAMATS}
|
||||
//?>{POINT_HOLAM}
|
||||
//?>{POINT_QUBUTS}
|
||||
//?>{POINT_DAGESH_OR_MAPIQ}
|
||||
//?>{POINT_METEG}
|
||||
//?>{PUNCTUATION_MAQAF}
|
||||
//?>{POINT_RAFE}
|
||||
//?>{PUNCTUATION_PASEQ}
|
||||
//?>{POINT_SHIN_DOT}
|
||||
//?>{POINT_SIN_DOT}
|
||||
//?>{PUNCTUATION_SOF_PASUQ}
|
||||
|
||||
"a>{ALEF};"
|
||||
"A>{ALEF};"
|
||||
|
||||
"b>{BET};"
|
||||
"B>{BET};"
|
||||
|
||||
"c({softvowel}>{SAMEKH};"
|
||||
"C({softvowel}>{SAMEKH};"
|
||||
"c({letter}>{KAF};"
|
||||
"C({letter}>{KAF};"
|
||||
"c>{FINAL_KAF};"
|
||||
"C>{FINAL_KAF};"
|
||||
|
||||
"d>{DALET};"
|
||||
"D>{DALET};"
|
||||
|
||||
"e>{AYIN};"
|
||||
"E>{AYIN};"
|
||||
|
||||
"f({letter}>{PE};"
|
||||
"f>{FINAL_PE};"
|
||||
"F({letter}>{PE};"
|
||||
"F>{FINAL_PE};"
|
||||
|
||||
"g>{GIMEL};"
|
||||
"G>{GIMEL};"
|
||||
|
||||
"h>{HE};"
|
||||
"H>{HE};"
|
||||
|
||||
"i>{YOD};"
|
||||
"I>{YOD};"
|
||||
|
||||
"j>{DALET}{SHIN};"
|
||||
"J>{DALET}{SHIN};"
|
||||
|
||||
"kH>{HET};"
|
||||
"kh>{HET};"
|
||||
"KH>{HET};"
|
||||
"Kh>{HET};"
|
||||
"k({letter}>{KAF};"
|
||||
"K({letter}>{KAF};"
|
||||
"k>{FINAL_KAF};"
|
||||
"K>{FINAL_KAF};"
|
||||
|
||||
"l>{LAMED};"
|
||||
"L>{LAMED};"
|
||||
|
||||
"m({letter}>{MEM};"
|
||||
"m>{FINAL_MEM};"
|
||||
"M({letter}>{MEM};"
|
||||
"M>{FINAL_MEM};"
|
||||
|
||||
"n({letter}>{NUN};"
|
||||
"n>{FINAL_NUN};"
|
||||
"N({letter}>{NUN};"
|
||||
"N>{FINAL_NUN};"
|
||||
|
||||
"o>{VAV};"
|
||||
"O>{VAV};"
|
||||
|
||||
"p({letter}>{PE};"
|
||||
"p>{FINAL_PE};"
|
||||
"P({letter}>{PE};"
|
||||
"P>{FINAL_PE};"
|
||||
|
||||
"q>{QOF};"
|
||||
"Q>{QOF};"
|
||||
|
||||
"r>{RESH};"
|
||||
"R>{RESH};"
|
||||
|
||||
"sH>{SHIN};"
|
||||
"sh>{SHIN};"
|
||||
"SH>{SHIN};"
|
||||
"Sh>{SHIN};"
|
||||
"s>{SAMEKH};"
|
||||
"S>{SAMEKH};"
|
||||
|
||||
"th>{TAV};"
|
||||
"tH>{TAV};"
|
||||
"TH>{TAV};"
|
||||
"Th>{TAV};"
|
||||
"tS({letter}>{TSADI};"
|
||||
"ts({letter}>{TSADI};"
|
||||
"Ts({letter}>{TSADI};"
|
||||
"TS({letter}>{TSADI};"
|
||||
"tS>{FINAL_TSADI};"
|
||||
"ts>{FINAL_TSADI};"
|
||||
"Ts>{FINAL_TSADI};"
|
||||
"TS>{FINAL_TSADI};"
|
||||
"t>{TET};"
|
||||
"T>{TET};"
|
||||
|
||||
"u>{VAV};"
|
||||
"U>{VAV};"
|
||||
|
||||
"v>{VAV};"
|
||||
"V>{VAV};"
|
||||
|
||||
"w>{VAV};"
|
||||
"W>{VAV};"
|
||||
|
||||
"x>{KAF}{SAMEKH};"
|
||||
"X>{KAF}{SAMEKH};"
|
||||
|
||||
"y>{YOD};"
|
||||
"Y>{YOD};"
|
||||
|
||||
"z>{ZAYIN};"
|
||||
"Z>{ZAYIN};"
|
||||
|
||||
//#?>{YIDDISH_DOUBLE_VAV}
|
||||
//?>{YIDDISH_VAV_YOD}
|
||||
//?>{YIDDISH_DOUBLE_YOD}
|
||||
//?>{PUNCTUATION_GERESH}
|
||||
//?>{PUNCTUATION_GERSHAYIM}
|
||||
|
||||
"''>;"
|
||||
|
||||
//{POINT_SHEVA}>@
|
||||
//{POINT_HATAF_SEGOL}>@
|
||||
//{POINT_HATAF_PATAH}>@
|
||||
//{POINT_HATAF_QAMATS}>@
|
||||
//{POINT_HIRIQ}>@
|
||||
//{POINT_TSERE}>@
|
||||
//{POINT_SEGOL}>@
|
||||
//{POINT_PATAH}>@
|
||||
//{POINT_QAMATS}>@
|
||||
//{POINT_HOLAM}>@
|
||||
//{POINT_QUBUTS}>@
|
||||
//{POINT_DAGESH_OR_MAPIQ}>@
|
||||
//{POINT_METEG}>@
|
||||
//{PUNCTUATION_MAQAF}>@
|
||||
//{POINT_RAFE}>@
|
||||
//{PUNCTUATION_PASEQ}>@
|
||||
//{POINT_SHIN_DOT}>@
|
||||
//{POINT_SIN_DOT}>@
|
||||
//{PUNCTUATION_SOF_PASUQ}>@
|
||||
|
||||
"a<{ALEF};"
|
||||
"e<{AYIN};"
|
||||
"b<{BET};"
|
||||
"d<{DALET};"
|
||||
"k<{FINAL_KAF};"
|
||||
"m<{FINAL_MEM};"
|
||||
"n<{FINAL_NUN};"
|
||||
"p<{FINAL_PE};"
|
||||
"ts<{FINAL_TSADI};"
|
||||
"g<{GIMEL};"
|
||||
"kh<{HET};"
|
||||
"h<{HE};"
|
||||
"k''<{KAF}({HE};"
|
||||
"k<{KAF};"
|
||||
"l<{LAMED};"
|
||||
"m<{MEM};"
|
||||
"n<{NUN};"
|
||||
"p<{PE};"
|
||||
"q<{QOF};"
|
||||
"r<{RESH};"
|
||||
"s''<{SAMEKH}({HE};"
|
||||
"s<{SAMEKH};"
|
||||
"sh<{SHIN};"
|
||||
"th<{TAV};"
|
||||
"t''<{TET}({HE};"
|
||||
"t''<{TET}({SAMEKH};"
|
||||
"t''<{TET}({SHIN};"
|
||||
"t<{TET};"
|
||||
"ts<{TSADI};"
|
||||
"v<{VAV}({vowellike};"
|
||||
"u<{VAV};"
|
||||
"y<{YOD};"
|
||||
"z<{ZAYIN};"
|
||||
|
||||
//{YIDDISH_DOUBLE_VAV}>@
|
||||
//{YIDDISH_VAV_YOD}>@
|
||||
//{YIDDISH_DOUBLE_YOD}>@
|
||||
//{PUNCTUATION_GERESH}>@
|
||||
//{PUNCTUATION_GERSHAYIM}>@
|
||||
|
||||
"<'';"
|
||||
}
|
||||
Rule {
|
||||
//variable names, derived from the Unicode names.
|
||||
|
||||
"$POINT_SHEVA=\u05B0;"
|
||||
"$POINT_HATAF_SEGOL=\u05B1;"
|
||||
"$POINT_HATAF_PATAH=\u05B2;"
|
||||
"$POINT_HATAF_QAMATS=\u05B3;"
|
||||
"$POINT_HIRIQ=\u05B4;"
|
||||
"$POINT_TSERE=\u05B5;"
|
||||
"$POINT_SEGOL=\u05B6;"
|
||||
"$POINT_PATAH=\u05B7;"
|
||||
"$POINT_QAMATS=\u05B8;"
|
||||
"$POINT_HOLAM=\u05B9;"
|
||||
"$POINT_QUBUTS=\u05BB;"
|
||||
"$POINT_DAGESH_OR_MAPIQ=\u05BC;"
|
||||
"$POINT_METEG=\u05BD;"
|
||||
"$PUNCTUATION_MAQAF=\u05BE;"
|
||||
"$POINT_RAFE=\u05BF;"
|
||||
"$PUNCTUATION_PASEQ=\u05C0;"
|
||||
"$POINT_SHIN_DOT=\u05C1;"
|
||||
"$POINT_SIN_DOT=\u05C2;"
|
||||
"$PUNCTUATION_SOF_PASUQ=\u05C3;"
|
||||
"$ALEF=\u05D0;"
|
||||
"$BET=\u05D1;"
|
||||
"$GIMEL=\u05D2;"
|
||||
"$DALET=\u05D3;"
|
||||
"$HE=\u05D4;"
|
||||
"$VAV=\u05D5;"
|
||||
"$ZAYIN=\u05D6;"
|
||||
"$HET=\u05D7;"
|
||||
"$TET=\u05D8;"
|
||||
"$YOD=\u05D9;"
|
||||
"$FINAL_KAF=\u05DA;"
|
||||
"$KAF=\u05DB;"
|
||||
"$LAMED=\u05DC;"
|
||||
"$FINAL_MEM=\u05DD;"
|
||||
"$MEM=\u05DE;"
|
||||
"$FINAL_NUN=\u05DF;"
|
||||
"$NUN=\u05E0;"
|
||||
"$SAMEKH=\u05E1;"
|
||||
"$AYIN=\u05E2;"
|
||||
"$FINAL_PE=\u05E3;"
|
||||
"$PE=\u05E4;"
|
||||
"$FINAL_TSADI=\u05E5;"
|
||||
"$TSADI=\u05E6;"
|
||||
"$QOF=\u05E7;"
|
||||
"$RESH=\u05E8;"
|
||||
"$SHIN=\u05E9;"
|
||||
"$TAV=\u05EA;"
|
||||
"$YIDDISH_DOUBLE_VAV=\u05F0;"
|
||||
"$YIDDISH_VAV_YOD=\u05F1;"
|
||||
"$YIDDISH_DOUBLE_YOD=\u05F2;"
|
||||
"$PUNCTUATION_GERESH=\u05F3;"
|
||||
"$PUNCTUATION_GERSHAYIM=\u05F4;"
|
||||
|
||||
//wildcards
|
||||
//The values can be anything we don't use in this file: start at E000.
|
||||
|
||||
"$letter=[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ];"
|
||||
|
||||
"$softvowel=[eiyEIY];"
|
||||
|
||||
"$vowellike=[$ALEF$AYIN$YOD$VAV];"
|
||||
|
||||
"$hebrew=[\u0590-\u05FF];" // the whole block -liu
|
||||
|
||||
//?>{POINT_SHEVA}
|
||||
//?>{POINT_HATAF_SEGOL}
|
||||
//?>{POINT_HATAF_PATAH}
|
||||
//?>{POINT_HATAF_QAMATS}
|
||||
//?>{POINT_HIRIQ}
|
||||
//?>{POINT_TSERE}
|
||||
//?>{POINT_SEGOL}
|
||||
//?>{POINT_PATAH}
|
||||
//?>{POINT_QAMATS}
|
||||
//?>{POINT_HOLAM}
|
||||
//?>{POINT_QUBUTS}
|
||||
//?>{POINT_DAGESH_OR_MAPIQ}
|
||||
//?>{POINT_METEG}
|
||||
//?>{PUNCTUATION_MAQAF}
|
||||
//?>{POINT_RAFE}
|
||||
//?>{PUNCTUATION_PASEQ}
|
||||
//?>{POINT_SHIN_DOT}
|
||||
//?>{POINT_SIN_DOT}
|
||||
//?>{PUNCTUATION_SOF_PASUQ}
|
||||
|
||||
// why is this a special case? -liu
|
||||
"k''h <> $KAF $HE ;"
|
||||
|
||||
// mark non-final forms in final position as x~ -liu
|
||||
"k < $KAF } $hebrew ;"
|
||||
"m < $MEM } $hebrew ;"
|
||||
"n < $NUN } $hebrew ;"
|
||||
"p < $PE } $hebrew ;"
|
||||
"ts < $TSADI } $hebrew ;"
|
||||
"k'~' <> $KAF ;"
|
||||
"m'~' <> $MEM ;"
|
||||
"n'~' <> $NUN ;"
|
||||
"p'~' <> $PE ;"
|
||||
"ts'~'<> $TSADI ;"
|
||||
|
||||
// mark final forms in non-final position as x^ -liu
|
||||
"k'^' <> $FINAL_KAF } $hebrew ;"
|
||||
"m'^' <> $FINAL_MEM } $hebrew ;"
|
||||
"n'^' <> $FINAL_NUN } $hebrew ;"
|
||||
"p'^' <> $FINAL_PE } $hebrew ;"
|
||||
"ts'^'<> $FINAL_TSADI } $hebrew ;"
|
||||
"k < $FINAL_KAF;"
|
||||
"m < $FINAL_MEM;"
|
||||
"n < $FINAL_NUN;"
|
||||
"p < $FINAL_PE;"
|
||||
"ts < $FINAL_TSADI;"
|
||||
|
||||
"a>$ALEF;"
|
||||
"A>$ALEF;"
|
||||
|
||||
"b>$BET;"
|
||||
"B>$BET;"
|
||||
|
||||
"c}$softvowel>$SAMEKH;"
|
||||
"C}$softvowel>$SAMEKH;"
|
||||
"c}$letter>$KAF;"
|
||||
"C}$letter>$KAF;"
|
||||
"c>$FINAL_KAF;"
|
||||
"C>$FINAL_KAF;"
|
||||
|
||||
"d>$DALET;"
|
||||
"D>$DALET;"
|
||||
|
||||
"e>$AYIN;"
|
||||
"E>$AYIN;"
|
||||
|
||||
"f}$letter>$PE;"
|
||||
"f>$FINAL_PE;"
|
||||
"F}$letter>$PE;"
|
||||
"F>$FINAL_PE;"
|
||||
|
||||
"g>$GIMEL;"
|
||||
"G>$GIMEL;"
|
||||
|
||||
"h>$HE;"
|
||||
"H>$HE;"
|
||||
|
||||
"i>$YOD;"
|
||||
"I>$YOD;"
|
||||
|
||||
"j>$DALET$SHIN;"
|
||||
"J>$DALET$SHIN;"
|
||||
|
||||
"kH>$HET;"
|
||||
"kh>$HET;"
|
||||
"KH>$HET;"
|
||||
"Kh>$HET;"
|
||||
"k}$letter>$KAF;"
|
||||
"K}$letter>$KAF;"
|
||||
"k>$FINAL_KAF;"
|
||||
"K>$FINAL_KAF;"
|
||||
|
||||
"l>$LAMED;"
|
||||
"L>$LAMED;"
|
||||
|
||||
"m}$letter>$MEM;"
|
||||
"m>$FINAL_MEM;"
|
||||
"M}$letter>$MEM;"
|
||||
"M>$FINAL_MEM;"
|
||||
|
||||
"n}$letter>$NUN;"
|
||||
"n>$FINAL_NUN;"
|
||||
"N}$letter>$NUN;"
|
||||
"N>$FINAL_NUN;"
|
||||
|
||||
"o>$VAV;"
|
||||
"O>$VAV;"
|
||||
|
||||
"p}$letter>$PE;"
|
||||
"p>$FINAL_PE;"
|
||||
"P}$letter>$PE;"
|
||||
"P>$FINAL_PE;"
|
||||
|
||||
"q>$QOF;"
|
||||
"Q>$QOF;"
|
||||
|
||||
"r>$RESH;"
|
||||
"R>$RESH;"
|
||||
|
||||
"sH>$SHIN;"
|
||||
"sh>$SHIN;"
|
||||
"SH>$SHIN;"
|
||||
"Sh>$SHIN;"
|
||||
"s>$SAMEKH;"
|
||||
"S>$SAMEKH;"
|
||||
|
||||
"th>$TAV;"
|
||||
"tH>$TAV;"
|
||||
"TH>$TAV;"
|
||||
"Th>$TAV;"
|
||||
"tS}$letter>$TSADI;"
|
||||
"ts}$letter>$TSADI;"
|
||||
"Ts}$letter>$TSADI;"
|
||||
"TS}$letter>$TSADI;"
|
||||
"tS>$FINAL_TSADI;"
|
||||
"ts>$FINAL_TSADI;"
|
||||
"Ts>$FINAL_TSADI;"
|
||||
"TS>$FINAL_TSADI;"
|
||||
"t>$TET;"
|
||||
"T>$TET;"
|
||||
|
||||
"u>$VAV;"
|
||||
"U>$VAV;"
|
||||
|
||||
"v>$VAV;"
|
||||
"V>$VAV;"
|
||||
|
||||
"w>$VAV;"
|
||||
"W>$VAV;"
|
||||
|
||||
"x>$KAF$SAMEKH;"
|
||||
"X>$KAF$SAMEKH;"
|
||||
|
||||
"y>$YOD;"
|
||||
"Y>$YOD;"
|
||||
|
||||
"z>$ZAYIN;"
|
||||
"Z>$ZAYIN;"
|
||||
|
||||
//#?>{YIDDISH_DOUBLE_VAV}
|
||||
//?>{YIDDISH_VAV_YOD}
|
||||
//?>{YIDDISH_DOUBLE_YOD}
|
||||
//?>{PUNCTUATION_GERESH}
|
||||
//?>{PUNCTUATION_GERSHAYIM}
|
||||
|
||||
"''>;"
|
||||
|
||||
//{POINT_SHEVA}>@
|
||||
//{POINT_HATAF_SEGOL}>@
|
||||
//{POINT_HATAF_PATAH}>@
|
||||
//{POINT_HATAF_QAMATS}>@
|
||||
//{POINT_HIRIQ}>@
|
||||
//{POINT_TSERE}>@
|
||||
//{POINT_SEGOL}>@
|
||||
//{POINT_PATAH}>@
|
||||
//{POINT_QAMATS}>@
|
||||
//{POINT_HOLAM}>@
|
||||
//{POINT_QUBUTS}>@
|
||||
//{POINT_DAGESH_OR_MAPIQ}>@
|
||||
//{POINT_METEG}>@
|
||||
//{PUNCTUATION_MAQAF}>@
|
||||
//{POINT_RAFE}>@
|
||||
//{PUNCTUATION_PASEQ}>@
|
||||
//{POINT_SHIN_DOT}>@
|
||||
//{POINT_SIN_DOT}>@
|
||||
//{PUNCTUATION_SOF_PASUQ}>@
|
||||
|
||||
"a<$ALEF;"
|
||||
"e<$AYIN;"
|
||||
"b<$BET;"
|
||||
"d<$DALET;"
|
||||
//+ "k<$FINAL_KAF;"
|
||||
//+ "m<$FINAL_MEM;"
|
||||
//+ "n<$FINAL_NUN;"
|
||||
//+ "p<$FINAL_PE;"
|
||||
//+ "ts<$FINAL_TSADI;"
|
||||
"g<$GIMEL;"
|
||||
"kh<$HET;"
|
||||
"h<$HE;"
|
||||
//+ "k''<$KAF}$HE;"
|
||||
//+ "k<$KAF;"
|
||||
"l<$LAMED;"
|
||||
//+ "m<$MEM;"
|
||||
//+ "n<$NUN;"
|
||||
//+ "p<$PE;"
|
||||
"q<$QOF;"
|
||||
"r<$RESH;"
|
||||
"s''<$SAMEKH}$HE;"
|
||||
"s<$SAMEKH;"
|
||||
"sh<$SHIN;"
|
||||
"th<$TAV;"
|
||||
"t''<$TET}$HE;"
|
||||
"t''<$TET}$SAMEKH;"
|
||||
"t''<$TET}$SHIN;"
|
||||
"t<$TET;"
|
||||
//+ "ts<$TSADI;"
|
||||
"v<$VAV}$vowellike;"
|
||||
"u<$VAV;"
|
||||
"y<$YOD;"
|
||||
"z<$ZAYIN;"
|
||||
|
||||
//{YIDDISH_DOUBLE_VAV}>@
|
||||
//{YIDDISH_VAV_YOD}>@
|
||||
//{YIDDISH_DOUBLE_YOD}>@
|
||||
//{PUNCTUATION_GERESH}>@
|
||||
//{PUNCTUATION_GERSHAYIM}>@
|
||||
|
||||
"<'';"
|
||||
}
|
||||
}
|
||||
|
|
1057
icu4c/data/ljamo.txt
1057
icu4c/data/ljamo.txt
File diff suppressed because it is too large
Load diff
2628
icu4c/data/lkana.txt
2628
icu4c/data/lkana.txt
File diff suppressed because it is too large
Load diff
|
@ -1,83 +1,82 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
// Copyright (c) 1999-2000, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: src\com\ibm\tools\translit\dumpICUrules.bat
|
||||
// Source: src\com\ibm\text\resources/TransliterationRule_StraightQuotes_CurlyQuotes.java
|
||||
// Date: Fri May 19 15:50:23 2000
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// StraightQuotes-CurlyQuotes
|
||||
|
||||
quotes {
|
||||
Rule {
|
||||
// Rewritten using character codes [LIU]
|
||||
"white=[[:Zs:][:Zl:][:Zp:]];"
|
||||
"black=[^{white}];"
|
||||
"open=[:Ps:];"
|
||||
"dquote=\";"
|
||||
|
||||
"lAng=\u3008;"
|
||||
"ldAng=\u300A;"
|
||||
"lBrk='[';"
|
||||
"lBrc='{';"
|
||||
|
||||
"lquote=\u2018;"
|
||||
"rquote=\u2019;"
|
||||
"ldquote=\u201C;"
|
||||
"rdquote=\u201D;"
|
||||
|
||||
"ldguill=\u00AB;"
|
||||
"rdguill=\u00BB;"
|
||||
"lguill=\u2039;"
|
||||
"rguill=\u203A;"
|
||||
|
||||
"mdash=\u2014;"
|
||||
|
||||
//#######################################
|
||||
// Conversions from input
|
||||
//#######################################
|
||||
|
||||
// join single quotes
|
||||
"{lquote}''>{ldquote};"
|
||||
"{lquote}{lquote}>{ldquote};"
|
||||
"{rquote}''>{rdquote};"
|
||||
"{rquote}{rquote}>{rdquote};"
|
||||
|
||||
//smart single quotes
|
||||
"{white})''>{lquote};"
|
||||
"{open})''>{lquote};"
|
||||
"{black})''>{rquote};"
|
||||
"''>{lquote};"
|
||||
|
||||
//smart doubles
|
||||
"{white}){dquote}>{ldquote};"
|
||||
"{open}){dquote}>{ldquote};"
|
||||
"{black}){dquote}>{rdquote};"
|
||||
"{dquote}>{ldquote};"
|
||||
|
||||
// join single guillemets
|
||||
"{rguill}{rguill}>{rdguill};"
|
||||
"'>>'>{rdguill};"
|
||||
"{lguill}{lguill}>{ldguill};"
|
||||
"'<<'>{ldguill};"
|
||||
|
||||
// prevent double spaces
|
||||
"\\ )\\ >;"
|
||||
|
||||
// join hyphens into dash
|
||||
"-->{mdash};"
|
||||
|
||||
//#######################################
|
||||
// Conversions back to input
|
||||
//#######################################
|
||||
|
||||
//smart quotes
|
||||
"''<{lquote};"
|
||||
"''<{rquote};"
|
||||
"{dquote}<{ldquote};"
|
||||
"{dquote}<{rdquote};"
|
||||
|
||||
//hyphens
|
||||
"--<{mdash};"
|
||||
}
|
||||
Rule {
|
||||
// Rewritten using character codes [LIU]
|
||||
"$white=[[:Zs:][:Zl:][:Zp:]];"
|
||||
"$black=[^$white];"
|
||||
"$open=[:Ps:];"
|
||||
"$dquote='\"';"
|
||||
|
||||
"$lAng=\u3008;"
|
||||
"$ldAng=\u300A;"
|
||||
"$lBrk='[';"
|
||||
"$lBrc='{';"
|
||||
|
||||
"$lquote=\u2018;"
|
||||
"$rquote=\u2019;"
|
||||
"$ldquote=\u201C;"
|
||||
"$rdquote=\u201D;"
|
||||
|
||||
"$ldguill=\u00AB;"
|
||||
"$rdguill=\u00BB;"
|
||||
"$lguill=\u2039;"
|
||||
"$rguill=\u203A;"
|
||||
|
||||
"$mdash=\u2014;"
|
||||
|
||||
//#######################################
|
||||
// Conversions from input
|
||||
//#######################################
|
||||
|
||||
// join single quotes
|
||||
"$lquote''>$ldquote;"
|
||||
"$lquote$lquote>$ldquote;"
|
||||
"$rquote''>$rdquote;"
|
||||
"$rquote$rquote>$rdquote;"
|
||||
|
||||
//smart single quotes
|
||||
"$white{''>$lquote;"
|
||||
"$open{''>$lquote;"
|
||||
"$black{''>$rquote;"
|
||||
"''>$lquote;"
|
||||
|
||||
//smart doubles
|
||||
"$white{$dquote>$ldquote;"
|
||||
"$open{$dquote>$ldquote;"
|
||||
"$black{$dquote>$rdquote;"
|
||||
"$dquote>$ldquote;"
|
||||
|
||||
// join single guillemets
|
||||
"$rguill$rguill>$rdguill;"
|
||||
"'>>'>$rdguill;"
|
||||
"$lguill$lguill>$ldguill;"
|
||||
"'<<'>$ldguill;"
|
||||
|
||||
// prevent double spaces
|
||||
"\\ {\\ >;"
|
||||
|
||||
// join hyphens into dash ### BIDIRECTIONAL ###
|
||||
"'--'<>$mdash;"
|
||||
|
||||
//#######################################
|
||||
// Conversions back to input
|
||||
//#######################################
|
||||
|
||||
//smart quotes
|
||||
"''<$lquote;"
|
||||
"''<$rquote;"
|
||||
"$dquote<$ldquote;"
|
||||
"$dquote<$rdquote;"
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -135,9 +135,8 @@ RuleBasedTransliterator::handleTransliterate(Replaceable& text, Position& index,
|
|||
++cursor;
|
||||
}
|
||||
} else {
|
||||
text.handleReplaceBetween(cursor, cursor + r->getKeyLength(),
|
||||
r->getOutput());
|
||||
limit += r->getOutput().length() - r->getKeyLength();
|
||||
// Delegate replacement to TransliterationRule object
|
||||
limit += r->replace(text, cursor, *data);
|
||||
cursor += r->getCursorPos();
|
||||
++loopCount;
|
||||
}
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include "rbt_data.h"
|
||||
#include "hash.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
|
||||
variableNames(0), setVariables(0) {
|
||||
|
@ -17,37 +18,21 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
|
|||
return;
|
||||
}
|
||||
variableNames = new Hashtable(status);
|
||||
if (U_SUCCESS(status)) {
|
||||
variableNames->setValueDeleter(uhash_deleteUnicodeString);
|
||||
}
|
||||
setVariables = 0;
|
||||
setVariablesLength = 0;
|
||||
}
|
||||
|
||||
TransliterationRuleData::~TransliterationRuleData() {
|
||||
delete variableNames;
|
||||
delete[] setVariables;
|
||||
}
|
||||
|
||||
void
|
||||
TransliterationRuleData::defineVariable(const UnicodeString& name,
|
||||
UChar value,
|
||||
UErrorCode& status) {
|
||||
int32_t v = value | 0x10000; // Set bit 16
|
||||
variableNames->put(name, (void*) v, status);
|
||||
}
|
||||
|
||||
UChar
|
||||
TransliterationRuleData::lookupVariable(const UnicodeString& name,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
if (setVariables != 0) {
|
||||
for (int32_t i=0; i<setVariablesLength; ++i) {
|
||||
delete setVariables[i];
|
||||
}
|
||||
delete[] setVariables;
|
||||
}
|
||||
void* value = variableNames->get(name);
|
||||
/* Even U+0000 can be stored in the table because we set
|
||||
* bit 16 in defineVariable().
|
||||
*/
|
||||
if (value == 0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
return (UChar) (int32_t) (unsigned long) value;
|
||||
}
|
||||
|
||||
const UnicodeSet*
|
||||
|
@ -56,7 +41,8 @@ TransliterationRuleData::lookupSet(UChar standIn) const {
|
|||
return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
|
||||
}
|
||||
|
||||
UBool
|
||||
TransliterationRuleData::isVariableDefined(const UnicodeString& name) const {
|
||||
return 0 != variableNames->get(name);
|
||||
int32_t
TransliterationRuleData::lookupSegmentReference(UChar c) const {
    // Distance of c from the first segment stand-in character.
    const int32_t offset = c - segmentBase;
    // Only nine consecutive characters starting at segmentBase are
    // segment stand-ins; anything else is not a segment reference.
    if (offset < 0 || offset >= 9) {
        return -1;
    }
    return offset; // zero-based index, 0..8
}
|
||||
|
|
|
@ -35,42 +35,39 @@ class TransliterationRuleData {
|
|||
|
||||
public:
|
||||
|
||||
// PUBLIC DATA MEMBERS
|
||||
|
||||
/**
|
||||
* Rule table. May be empty.
|
||||
*
|
||||
* PUBLIC DATA MEMBER for internal use by RBT
|
||||
*/
|
||||
TransliterationRuleSet ruleSet;
|
||||
|
||||
/**
|
||||
* Map variable name (UnicodeString) to variable (Character).
|
||||
* A variable name may correspond to a single literal
|
||||
* character, in which case the character is stored in this
|
||||
* hash. It may also correspond to a UnicodeSet, in which
|
||||
* case a character is again stored in this hash, but the
|
||||
* character is a stand-in: it is a key for a secondary lookup
|
||||
* in data.setVariables. The stand-in also represents the
|
||||
* UnicodeSet in the stored rules.
|
||||
*
|
||||
* PUBLIC DATA MEMBER for internal use by RBT
|
||||
* Map variable name (String) to variable (UnicodeString). A variable name
|
||||
* corresponds to zero or more characters, stored in a UnicodeString in
|
||||
* this hash. One or more of these chars may also correspond to a
|
||||
* UnicodeSet, in which case the character in the UnicodeString in this hash is
|
||||
* a stand-in: it is an index for a secondary lookup in
|
||||
* data.setVariables. The stand-in also represents the UnicodeSet in
|
||||
* the stored rules.
|
||||
*/
|
||||
Hashtable* variableNames;
|
||||
|
||||
/**
|
||||
* Map category variable (Character) to set (UnicodeSet).
|
||||
* Map category variable (UChar) to set (UnicodeSet).
|
||||
* Variables that correspond to a set of characters are mapped
|
||||
* from variable name to a stand-in character in data.variableNames.
|
||||
* The stand-in then serves as a key in this hash to lookup the
|
||||
* actual UnicodeSet object. In addition, the stand-in is
|
||||
* stored in the rule text to represent the set of characters.
|
||||
* setVariables[i] represents character (setVariablesBase + i).
|
||||
*
|
||||
* PUBLIC DATA MEMBER for internal use by RBT
|
||||
*/
|
||||
UnicodeSet** setVariables;
|
||||
|
||||
/**
|
||||
* The character represented by setVariables[0].
|
||||
* The character that represents setVariables[0]. Characters
|
||||
* setVariablesBase through setVariablesBase +
|
||||
* setVariablesLength - 1 represent UnicodeSet objects.
|
||||
*/
|
||||
UChar setVariablesBase;
|
||||
|
||||
|
@ -79,20 +76,34 @@ public:
|
|||
*/
|
||||
int32_t setVariablesLength;
|
||||
|
||||
/**
|
||||
* The character that represents segment 1. Characters segmentBase
|
||||
* through segmentBase + 8 represent segments 1 through 9.
|
||||
*/
|
||||
UChar segmentBase;
|
||||
|
||||
public:
|
||||
|
||||
TransliterationRuleData(UErrorCode& status);
|
||||
|
||||
~TransliterationRuleData();
|
||||
|
||||
void defineVariable(const UnicodeString& name,
|
||||
UChar value,
|
||||
UErrorCode& status);
|
||||
|
||||
UChar lookupVariable(const UnicodeString& name,
|
||||
UErrorCode& status) const;
|
||||
|
||||
const UnicodeSet* lookupSet(UChar standIn) const;
|
||||
|
||||
UBool isVariableDefined(const UnicodeString& name) const;
|
||||
/**
|
||||
* Return the zero-based index of the segment represented by the given
|
||||
* character, or -1 if none. Repeat: This is a zero-based return value,
|
||||
* 0..8, even though these are notated "$1".."$9".
|
||||
*/
|
||||
int32_t lookupSegmentReference(UChar c) const;
|
||||
|
||||
/**
|
||||
* Return the character used to stand for the given segment reference.
|
||||
* The reference must be in the range 1..9.
|
||||
*/
|
||||
UChar getSegmentStandin(int32_t ref) const {
|
||||
return segmentBase + ref - 1;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -17,27 +17,31 @@
|
|||
#include "unicode/parsepos.h"
|
||||
#include "symtable.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "hash.h"
|
||||
|
||||
// Operators
|
||||
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/;
|
||||
const UChar TransliterationRuleParser::FORWARD_RULE_OP = 0x003E/*>*/;
|
||||
const UChar TransliterationRuleParser::REVERSE_RULE_OP = 0x003C/*<*/;
|
||||
const UChar TransliterationRuleParser::FWDREV_RULE_OP = 0x007E/*~*/; // internal rep of <> op
|
||||
const UnicodeString TransliterationRuleParser::OPERATORS = UNICODE_STRING("=><", 3);
|
||||
#define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/
|
||||
#define FORWARD_RULE_OP ((UChar)0x003E) /*>*/
|
||||
#define REVERSE_RULE_OP ((UChar)0x003C) /*<*/
|
||||
#define FWDREV_RULE_OP ((UChar)0x007E) /*~*/ // internal rep of <> op
|
||||
#define OPERATORS UNICODE_STRING("=><", 3)
|
||||
|
||||
// Other special characters
|
||||
const UChar TransliterationRuleParser::QUOTE = 0x0027/*'*/;
|
||||
const UChar TransliterationRuleParser::ESCAPE = 0x005C/*\*/;
|
||||
const UChar TransliterationRuleParser::END_OF_RULE = 0x003B/*;*/;
|
||||
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = 0x0023/*#*/;
|
||||
#define QUOTE ((UChar)0x0027) /*'*/
|
||||
#define ESCAPE ((UChar)0x005C) /*\*/
|
||||
#define END_OF_RULE ((UChar)0x003B) /*;*/
|
||||
#define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/
|
||||
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = 0x007B/*{*/;
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = 0x007D/*}*/;
|
||||
const UChar TransliterationRuleParser::CONTEXT_OPEN = 0x0028/*(*/;
|
||||
const UChar TransliterationRuleParser::CONTEXT_CLOSE = 0x0029/*)*/;
|
||||
const UChar TransliterationRuleParser::SET_OPEN = 0x005B/*[*/;
|
||||
const UChar TransliterationRuleParser::SET_CLOSE = 0x005D/*]*/;
|
||||
const UChar TransliterationRuleParser::CURSOR_POS = 0x007C/*|*/;
|
||||
#define SEGMENT_OPEN ((UChar)0x0028) /*(*/
|
||||
#define SEGMENT_CLOSE ((UChar)0x0029) /*)*/
|
||||
#define CONTEXT_ANTE ((UChar)0x007B) /*{*/
|
||||
#define CONTEXT_POST ((UChar)0x007D) /*}*/
|
||||
#define SET_OPEN ((UChar)0x005B) /*[*/
|
||||
#define SET_CLOSE ((UChar)0x005D) /*]*/
|
||||
#define CURSOR_POS ((UChar)0x007C) /*|*/
|
||||
#define CURSOR_OFFSET ((UChar)0x0040) /*@*/
|
||||
|
||||
const UnicodeString TransliterationRuleParser::gOPERATORS = OPERATORS;
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// BEGIN ParseData
|
||||
|
@ -58,14 +62,12 @@ public:
|
|||
ParseData(const TransliterationRuleData* data = 0,
|
||||
const UVector* setVariablesVector = 0);
|
||||
|
||||
/**
|
||||
* Lookup the object associated with this string and return it.
|
||||
* Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
|
||||
* exist. Return a non-NULL set if the name is mapped to a set;
|
||||
* otherwise return a NULL set.
|
||||
*/
|
||||
virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
||||
UErrorCode& status) const;
|
||||
virtual const UnicodeString* lookup(const UnicodeString& s) const;
|
||||
|
||||
virtual const UnicodeSet* lookupSet(UChar ch) const;
|
||||
|
||||
virtual UnicodeString parseReference(const UnicodeString& text,
|
||||
ParsePosition& pos, int32_t limit) const;
|
||||
};
|
||||
|
||||
ParseData::ParseData(const TransliterationRuleData* d,
|
||||
|
@ -73,21 +75,389 @@ ParseData::ParseData(const TransliterationRuleData* d,
|
|||
data(d), setVariablesVector(sets) {}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API. Lookup a variable, returning
|
||||
* either a Character, a UnicodeSet, or null.
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
||||
UErrorCode& status) const {
|
||||
c = data->lookupVariable(name, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
int32_t i = c - data->setVariablesBase;
|
||||
const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
    // The variable-name table returns an untyped pointer; it is
    // reinterpreted here as the UnicodeString value stored for name.
    // NOTE(review): presumably NULL when name is not defined -- confirm
    // against Hashtable::get().
    void* entry = data->variableNames->get(name);
    return (const UnicodeString*) entry;
}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
const UnicodeSet* ParseData::lookupSet(UChar ch) const {
|
||||
// Note that we cannot use data.lookupSet() because the
|
||||
// set array has not been constructed yet.
|
||||
const UnicodeSet* set = NULL;
|
||||
int32_t i = ch - data->setVariablesBase;
|
||||
if (i >= 0 && i < setVariablesVector->size()) {
|
||||
int32_t i = ch - data->setVariablesBase;
|
||||
set = (i < setVariablesVector->size()) ?
|
||||
(UnicodeSet*) setVariablesVector->elementAt(i) : 0;
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API. Parse out a symbol reference
|
||||
* name.
|
||||
*/
|
||||
UnicodeString ParseData::parseReference(const UnicodeString& text,
|
||||
ParsePosition& pos, int32_t limit) const {
|
||||
int32_t start = pos.getIndex();
|
||||
int32_t i = start;
|
||||
UnicodeString result;
|
||||
while (i < limit) {
|
||||
UChar c = text.charAt(i);
|
||||
if ((i==start && !Unicode::isUnicodeIdentifierStart(c)) ||
|
||||
!Unicode::isUnicodeIdentifierPart(c)) {
|
||||
break;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if (i == start) { // No valid name chars
|
||||
return result; // Indicate failure with empty string
|
||||
//if (start > 0) {
|
||||
// --start;
|
||||
//}
|
||||
//limit = ruleEnd(text, start, limit);
|
||||
//throw new IllegalArgumentException("Illegal variable reference " +
|
||||
// text.substring(start, limit));
|
||||
}
|
||||
pos.setIndex(i);
|
||||
text.extractBetween(start, i, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// END ParseData
|
||||
// BEGIN RuleHalf
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A class representing one side of a rule. This class knows how to
|
||||
* parse half of a rule. It is tightly coupled to the method
|
||||
* RuleBasedTransliterator.Parser.parseRule().
|
||||
*/
|
||||
class RuleHalf {
|
||||
|
||||
public:
|
||||
|
||||
UnicodeString text;
|
||||
|
||||
int32_t cursor; // position of cursor in text
|
||||
int32_t ante; // position of ante context marker '{' in text
|
||||
int32_t post; // position of post context marker '}' in text
|
||||
|
||||
// Record the position of the segment substrings and references. A
|
||||
// given side should have segments or segment references, but not
|
||||
// both.
|
||||
UVector* segments; // ref substring start,limits
|
||||
int32_t maxRef; // index of largest ref (1..9)
|
||||
|
||||
// Record the offset to the cursor either to the left or to the
|
||||
// right of the key. This is indicated by characters on the output
|
||||
// side that allow the cursor to be positioned arbitrarily within
|
||||
// the matching text. For example, abc{def} > | @@@ xyz; changes
|
||||
// def to xyz and moves the cursor to before abc. Offset characters
|
||||
// must be at the start or end, and they cannot move the cursor past
|
||||
// the ante- or postcontext text. Placeholders are only valid in
|
||||
// output text.
|
||||
int32_t cursorOffset; // only nonzero on output side
|
||||
|
||||
TransliterationRuleParser& parser;
|
||||
|
||||
static const UnicodeString gOperators;
|
||||
|
||||
//--------------------------------------------------
|
||||
// Methods
|
||||
|
||||
RuleHalf(TransliterationRuleParser& parser);
|
||||
~RuleHalf();
|
||||
|
||||
/**
|
||||
* Parse one side of a rule, stopping at either the limit,
|
||||
* the END_OF_RULE character, or an operator. Return
|
||||
* the pos of the terminating character (or limit).
|
||||
*/
|
||||
int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit,
|
||||
TransliterationRuleParser& parser);
|
||||
|
||||
/**
|
||||
* Remove context.
|
||||
*/
|
||||
void removeContext();
|
||||
|
||||
/**
|
||||
* Create and return an int[] array of segments.
|
||||
*/
|
||||
int32_t* createSegments() const;
|
||||
|
||||
int syntaxError(int32_t code,
|
||||
const UnicodeString& rule,
|
||||
int32_t start) {
|
||||
return parser.syntaxError(code, rule, start);
|
||||
}
|
||||
};
|
||||
|
||||
// Set of operator characters.  RuleHalf::parse() stops (without consuming
// the character) when it reaches any character contained in this string.
const UnicodeString RuleHalf::gOperators = OPERATORS;
|
||||
|
||||
/**
 * Construct an empty rule half bound to the given parser.  All marker
 * positions start out as -1 ("not seen"), there is no segment vector,
 * and the cursor offset is zero.
 */
RuleHalf::RuleHalf(TransliterationRuleParser& p)
    : cursor(-1),
      ante(-1),
      post(-1),
      segments(NULL),
      maxRef(-1),
      cursorOffset(0),
      parser(p) {
}
|
||||
|
||||
/**
 * Destructor.  Releases the segment vector, if parse() allocated one.
 */
RuleHalf::~RuleHalf() {
    delete segments; // may be NULL
}
|
||||
|
||||
/**
|
||||
* Parse one side of a rule, stopping at either the limit,
|
||||
* the END_OF_RULE character, or an operator. Return
|
||||
* the pos of the terminating character (or limit).
|
||||
*/
|
||||
int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit,
|
||||
TransliterationRuleParser& parser) {
|
||||
int32_t start = pos;
|
||||
UnicodeString& buf = text;
|
||||
ParsePosition pp;
|
||||
int32_t cursorOffsetPos = 0; // Position of first CURSOR_OFFSET on _right_
|
||||
UnicodeString scratch;
|
||||
bool_t done = FALSE;
|
||||
|
||||
while (pos < limit && !done) {
|
||||
UChar c = rule.charAt(pos++);
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
// Ignore whitespace. Note that this is not Unicode
|
||||
// spaces, but Java spaces -- a subset, representing
|
||||
// whitespace likely to be seen in code.
|
||||
continue;
|
||||
}
|
||||
// Handle escapes
|
||||
if (c == ESCAPE) {
|
||||
if (pos == limit) {
|
||||
return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rule, start);
|
||||
}
|
||||
|
||||
// UNLIKE THE JAVA version, we parse \uXXXX escapes. We
|
||||
// do not do this in Java because the compiler has already
|
||||
// done it when the ResourceBundle file was compiled.
|
||||
// Parse \uXXXX escapes
|
||||
c = rule.charAt(pos++);
|
||||
if (c == 0x0075/*u*/) {
|
||||
if ((pos+4) > limit) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rule, start);
|
||||
}
|
||||
c = (UChar)0x0000;
|
||||
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
|
||||
int32_t digit = Unicode::digit(rule.charAt(pos), 16);
|
||||
if (digit<0) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rule, start);
|
||||
}
|
||||
c = (UChar) ((c << 4) | digit);
|
||||
}
|
||||
}
|
||||
|
||||
buf.append(c);
|
||||
continue;
|
||||
}
|
||||
// Handle quoted matter
|
||||
if (c == QUOTE) {
|
||||
int32_t iq = rule.indexOf(QUOTE, pos);
|
||||
if (iq == pos) {
|
||||
buf.append(c); // Parse [''] outside quotes as [']
|
||||
++pos;
|
||||
} else {
|
||||
/* This loop picks up a segment of quoted text of the
|
||||
* form 'aaaa' each time through. If this segment
|
||||
* hasn't really ended ('aaaa''bbbb') then it keeps
|
||||
* looping, each time adding on a new segment. When it
|
||||
* reaches the final quote it breaks.
|
||||
*/
|
||||
for (;;) {
|
||||
if (iq < 0) {
|
||||
return syntaxError(RuleBasedTransliterator::UNTERMINATED_QUOTE, rule, start);
|
||||
}
|
||||
scratch.truncate(0);
|
||||
rule.extractBetween(pos, iq, scratch);
|
||||
buf.append(scratch);
|
||||
pos = iq+1;
|
||||
if (pos < limit && rule.charAt(pos) == QUOTE) {
|
||||
// Parse [''] inside quotes as [']
|
||||
iq = rule.indexOf(QUOTE, pos+1);
|
||||
// Continue looping
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (gOperators.indexOf(c) >= 0) {
|
||||
--pos; // Backup to point to operator
|
||||
break;
|
||||
}
|
||||
switch (c) {
|
||||
case SEGMENT_OPEN:
|
||||
case SEGMENT_CLOSE:
|
||||
// Handle segment definitions "(" and ")"
|
||||
// Parse "(", ")"
|
||||
if (segments == NULL) {
|
||||
segments = new UVector();
|
||||
}
|
||||
if ((c == SEGMENT_OPEN) !=
|
||||
(segments->size() % 2 == 0)) {
|
||||
return syntaxError(RuleBasedTransliterator::MISMATCHED_SEGMENT_DELIMITERS,
|
||||
rule, start);
|
||||
}
|
||||
segments->addElement((void*) buf.length());
|
||||
break;
|
||||
case END_OF_RULE:
|
||||
--pos; // Backup to point to END_OF_RULE
|
||||
done = TRUE;
|
||||
break;
|
||||
case SymbolTable::SYMBOL_REF:
|
||||
// Handle variable references and segment references "$1" .. "$9"
|
||||
{
|
||||
// A variable reference must be followed immediately
|
||||
// by a Unicode identifier start and zero or more
|
||||
// Unicode identifier part characters, or by a digit
|
||||
// 1..9 if it is a segment reference.
|
||||
if (pos == limit) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_SYMBOL_REFERENCE, rule, start);
|
||||
}
|
||||
// Parse "$1" "$2" .. "$9"
|
||||
c = rule.charAt(pos);
|
||||
int32_t r = Unicode::digit(c, 10);
|
||||
if (r >= 1 && r <= 9) {
|
||||
if (r > maxRef) {
|
||||
maxRef = r;
|
||||
}
|
||||
buf.append(parser.data->getSegmentStandin(r));
|
||||
++pos;
|
||||
} else {
|
||||
pp.setIndex(pos);
|
||||
UnicodeString name = parser.parseData->
|
||||
parseReference(rule, pp, limit);
|
||||
if (name.length() == 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_REFERENCE,
|
||||
rule, start);
|
||||
}
|
||||
pos = pp.getIndex();
|
||||
// If this is a variable definition statement,
|
||||
// then the LHS variable will be undefined. In
|
||||
// that case appendVariableDef() will append the
|
||||
// special placeholder char variableLimit-1.
|
||||
|
||||
//buf.append(parser.getVariableDef(name));
|
||||
parser.appendVariableDef(name, buf);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case CONTEXT_ANTE:
|
||||
if (ante >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MULTIPLE_ANTE_CONTEXTS, rule, start);
|
||||
}
|
||||
ante = buf.length();
|
||||
break;
|
||||
case CONTEXT_POST:
|
||||
if (post >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MULTIPLE_POST_CONTEXTS, rule, start);
|
||||
}
|
||||
post = buf.length();
|
||||
break;
|
||||
case SET_OPEN:
|
||||
pp.setIndex(pos-1); // Backup to opening '['
|
||||
buf.append(parser.parseSet(rule, pp));
|
||||
if (U_FAILURE(parser.status)) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_SET, rule, start);
|
||||
}
|
||||
pos = pp.getIndex();
|
||||
break;
|
||||
case CURSOR_POS:
|
||||
if (cursor >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MULTIPLE_CURSORS, rule, start);
|
||||
}
|
||||
cursor = buf.length();
|
||||
break;
|
||||
case CURSOR_OFFSET:
|
||||
if (cursorOffset < 0) {
|
||||
if (buf.length() > 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MISPLACED_CURSOR_OFFSET, rule, start);
|
||||
}
|
||||
--cursorOffset;
|
||||
} else if (cursorOffset > 0) {
|
||||
if (buf.length() != cursorOffsetPos || cursor >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MISPLACED_CURSOR_OFFSET, rule, start);
|
||||
}
|
||||
++cursorOffset;
|
||||
} else {
|
||||
if (cursor == 0 && buf.length() == 0) {
|
||||
cursorOffset = -1;
|
||||
} else if (cursor < 0) {
|
||||
cursorOffsetPos = buf.length();
|
||||
cursorOffset = 1;
|
||||
} else {
|
||||
return syntaxError(RuleBasedTransliterator::MISPLACED_CURSOR_OFFSET, rule, start);
|
||||
}
|
||||
}
|
||||
break;
|
||||
// case SET_CLOSE:
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
// in the printable ASCII range. These characters are
|
||||
// reserved for possible future use.
|
||||
if (c >= 0x0021 && c <= 0x007E &&
|
||||
!((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
|
||||
(c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
|
||||
(c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) {
|
||||
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rule, start);
|
||||
}
|
||||
buf.append(c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (cursorOffset > 0 && cursor != cursorOffsetPos) {
|
||||
return syntaxError(RuleBasedTransliterator::MISPLACED_CURSOR_OFFSET, rule, start);
|
||||
}
|
||||
// text = buf.toString();
|
||||
return pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove context.
|
||||
*/
|
||||
void RuleHalf::removeContext() {
|
||||
//text = text.substring(ante < 0 ? 0 : ante,
|
||||
// post < 0 ? text.length() : post);
|
||||
if (post >= 0) {
|
||||
text.remove(post);
|
||||
}
|
||||
if (ante >= 0) {
|
||||
text.removeBetween(0, ante);
|
||||
}
|
||||
ante = post = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and return an int32_t[] array of segments.
|
||||
*/
|
||||
int32_t* RuleHalf::createSegments() const {
|
||||
if (segments == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
int32_t* result = new int32_t[segments->size()];
|
||||
for (int32_t i=0; i<segments->size(); ++i) {
|
||||
result[i] = (int32_t) segments->elementAt(i);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// END RuleHalf
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
TransliterationRuleData*
|
||||
|
@ -206,251 +576,142 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
|
|||
// Locate the left side, operator, and right side
|
||||
int32_t start = pos;
|
||||
UChar op = 0;
|
||||
const UnicodeString& rule = rules; // TEMPORARY: FIX LATER
|
||||
|
||||
UnicodeString buf;
|
||||
int32_t cursor = -1; // position of cursor in buf
|
||||
int32_t ante = -1; // position of ante context marker ')' in buf
|
||||
int32_t post = -1; // position of post context marker '(' in buf
|
||||
int32_t postClose = -1; // position of post context close ')' in buf
|
||||
// Use pointers to automatics to make swapping possible.
|
||||
RuleHalf _left(*this), _right(*this);
|
||||
RuleHalf* left = &_left;
|
||||
RuleHalf* right = &_right;
|
||||
|
||||
// Assigned to buf and its adjuncts after the LHS has been
|
||||
// parsed. Thereafter, buf etc. refer to the RHS.
|
||||
UnicodeString left;
|
||||
int32_t leftCursor = -1, leftAnte = -1, leftPost = -1, leftPostClose = -1;
|
||||
|
||||
UnicodeString scratch;
|
||||
|
||||
while (pos < limit) {
|
||||
UChar c = rules.charAt(pos++);
|
||||
if (Unicode::isWhitespace(c)) {
|
||||
// Ignore whitespace. Note that this is not Unicode
|
||||
// spaces, but Java spaces -- a subset, representing
|
||||
// whitespace likely to be seen in code.
|
||||
continue;
|
||||
}
|
||||
// Handle escapes
|
||||
if (c == ESCAPE) {
|
||||
if (pos == limit) {
|
||||
return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rules, start);
|
||||
}
|
||||
// Parse \uXXXX escapes
|
||||
c = rules.charAt(pos++);
|
||||
if (c == 0x0075/*u*/) {
|
||||
if ((pos+4) > limit) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
|
||||
}
|
||||
c = (UChar)0x0000;
|
||||
for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
|
||||
int32_t digit = Unicode::digit(rules.charAt(pos), 16);
|
||||
if (digit<0) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
|
||||
}
|
||||
c = (UChar) ((c << 4) | digit);
|
||||
}
|
||||
}
|
||||
|
||||
buf.append(c);
|
||||
continue;
|
||||
}
|
||||
// Handle quoted matter
|
||||
if (c == QUOTE) {
|
||||
int32_t iq = rules.indexOf(QUOTE, pos);
|
||||
if (iq == pos) {
|
||||
buf.append(c); // Parse [''] outside quotes as [']
|
||||
++pos;
|
||||
} else {
|
||||
/* This loop picks up a segment of quoted text of the
|
||||
* form 'aaaa' each time through. If this segment
|
||||
* hasn't really ended ('aaaa''bbbb') then it keeps
|
||||
* looping, each time adding on a new segment. When it
|
||||
* reaches the final quote it breaks.
|
||||
*/
|
||||
for (;;) {
|
||||
if (iq < 0) {
|
||||
return syntaxError(RuleBasedTransliterator::UNTERMINATED_QUOTE, rules, start);
|
||||
}
|
||||
scratch.truncate(0);
|
||||
rules.extractBetween(pos, iq, scratch);
|
||||
buf.append(scratch);
|
||||
pos = iq+1;
|
||||
if (pos < limit && rules.charAt(pos) == QUOTE) {
|
||||
// Parse [''] inside quotes as [']
|
||||
iq = rules.indexOf(QUOTE, pos+1);
|
||||
// Continue looping
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (OPERATORS.indexOf(c) >= 0) {
|
||||
if (op != 0) {
|
||||
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
|
||||
}
|
||||
// Found an operator char. Check for forward-reverse operator.
|
||||
if (c == REVERSE_RULE_OP &&
|
||||
(pos < limit && rules.charAt(pos) == FORWARD_RULE_OP)) {
|
||||
++pos;
|
||||
op = FWDREV_RULE_OP;
|
||||
} else {
|
||||
op = c;
|
||||
}
|
||||
left = buf; // lhs
|
||||
leftCursor = cursor;
|
||||
leftAnte = ante;
|
||||
leftPost = post;
|
||||
leftPostClose = postClose;
|
||||
|
||||
buf.truncate(0);
|
||||
cursor = ante = post = postClose = -1;
|
||||
continue;
|
||||
}
|
||||
if (c == END_OF_RULE) {
|
||||
break;
|
||||
}
|
||||
switch (c) {
|
||||
case VARIABLE_REF_OPEN:
|
||||
{
|
||||
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
|
||||
if (pos == j || j < 0) { // empty or unterminated
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_REFERENCE, rules, start);
|
||||
}
|
||||
scratch.truncate(0);
|
||||
rules.extractBetween(pos, j, scratch);
|
||||
pos = j+1;
|
||||
UChar v = data->lookupVariable(scratch, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return syntaxError(RuleBasedTransliterator::UNDEFINED_VARIABLE, rules, start);
|
||||
}
|
||||
buf.append(v);
|
||||
}
|
||||
break;
|
||||
case CONTEXT_OPEN:
|
||||
if (post >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MULTIPLE_POST_CONTEXTS, rules, start);
|
||||
}
|
||||
// Ignore CONTEXT_OPEN if buffer length is zero -- that means
|
||||
// this is the optional opening delimiter for the ante context.
|
||||
if (buf.length() > 0) {
|
||||
post = buf.length();
|
||||
}
|
||||
break;
|
||||
case CONTEXT_CLOSE:
|
||||
if (postClose >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::UNEXPECTED_CLOSE_CONTEXT, rules, start);
|
||||
}
|
||||
if (post >= 0) {
|
||||
// This is probably the optional closing delimiter
|
||||
// for the post context; save the pos and check later.
|
||||
postClose = buf.length();
|
||||
} else if (ante >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MULTIPLE_ANTE_CONTEXTS, rules, start);
|
||||
} else {
|
||||
ante = buf.length();
|
||||
}
|
||||
break;
|
||||
case SET_OPEN: {
|
||||
ParsePosition pp(pos-1); // Backup to opening '['
|
||||
buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
|
||||
if (U_FAILURE(status)) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_SET, rules, start);
|
||||
}
|
||||
pos = pp.getIndex(); }
|
||||
break;
|
||||
case VARIABLE_REF_CLOSE:
|
||||
case SET_CLOSE:
|
||||
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
|
||||
case CURSOR_POS:
|
||||
if (cursor >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MULTIPLE_CURSORS, rules, start);
|
||||
}
|
||||
cursor = buf.length();
|
||||
break;
|
||||
default:
|
||||
buf.append(c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (op == 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MISSING_OPERATOR, rules, start);
|
||||
undefinedVariableName.remove();
|
||||
pos = left->parse(rule, pos, limit, *this);
|
||||
if (U_FAILURE(status)) {
|
||||
return start;
|
||||
}
|
||||
|
||||
// Check context close parameters
|
||||
if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
|
||||
(postClose >= 0 && postClose != buf.length())) {
|
||||
return syntaxError(RuleBasedTransliterator::TEXT_AFTER_CLOSE_CONTEXT, rules, start);
|
||||
if (pos == limit ||
|
||||
gOPERATORS.indexOf(op = rule.charAt(pos++)) < 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MISSING_OPERATOR, rule, start);
|
||||
}
|
||||
|
||||
// Context is only allowed on the input side; that is, the left side
|
||||
// for forward rules. Cursors are only allowed on the output side;
|
||||
// that is, the right side for forward rules. Bidirectional rules
|
||||
// ignore elements that do not apply.
|
||||
// Found an operator char. Check for forward-reverse operator.
|
||||
if (op == REVERSE_RULE_OP &&
|
||||
(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
|
||||
++pos;
|
||||
op = FWDREV_RULE_OP;
|
||||
}
|
||||
|
||||
switch (op) {
|
||||
case VARIABLE_DEF_OP:
|
||||
pos = right->parse(rule, pos, limit, *this);
|
||||
if (U_FAILURE(status)) {
|
||||
return start;
|
||||
}
|
||||
|
||||
if (pos < limit) {
|
||||
if (rule.charAt(pos) == END_OF_RULE) {
|
||||
++pos;
|
||||
} else {
|
||||
// RuleHalf parser must have terminated at an operator
|
||||
return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rule, start);
|
||||
}
|
||||
}
|
||||
|
||||
if (op == VARIABLE_DEF_OP) {
|
||||
// LHS is the name. RHS is a single character, either a literal
|
||||
// or a set (already parsed). If RHS is longer than one
|
||||
// character, it is either a multi-character string, or multiple
|
||||
// sets, or a mixture of chars and sets -- syntax error.
|
||||
if (buf.length() != 1) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_RHS, rules, start);
|
||||
}
|
||||
if (data->isVariableDefined(left)) {
|
||||
return syntaxError(RuleBasedTransliterator::DUPLICATE_VARIABLE_DEFINITION, rules, start);
|
||||
}
|
||||
data->defineVariable(left, buf.charAt(0), status);
|
||||
break;
|
||||
|
||||
case FORWARD_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::FORWARD) {
|
||||
if (ante >= 0 || post >= 0 || leftCursor >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
left, leftAnte, leftPost,
|
||||
buf, cursor, status), status);
|
||||
} // otherwise ignore the rule; it's not the direction we want
|
||||
break;
|
||||
|
||||
case REVERSE_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::REVERSE) {
|
||||
if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
buf, ante, post,
|
||||
left, leftCursor, status), status);
|
||||
} // otherwise ignore the rule; it's not the direction we want
|
||||
break;
|
||||
|
||||
case FWDREV_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::FORWARD) {
|
||||
// The output side is the right; trim off any context
|
||||
if (post >= 0) {
|
||||
buf.remove(post);
|
||||
}
|
||||
if (ante >= 0) {
|
||||
buf.removeBetween(0, ante);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
left, leftAnte, leftPost,
|
||||
buf, cursor, status), status);
|
||||
} else {
|
||||
// The output side is the left; trim off any context
|
||||
if (leftPost >= 0) {
|
||||
left.remove(leftPost);
|
||||
}
|
||||
if (leftAnte >= 0) {
|
||||
left.removeBetween(0, leftAnte);
|
||||
}
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
buf, ante, post,
|
||||
left, leftCursor, status), status);
|
||||
// We expect to see a single undefined variable (the one being
|
||||
// defined).
|
||||
if (undefinedVariableName.length() == 0) {
|
||||
// "Missing '$' or duplicate definition"
|
||||
return syntaxError(RuleBasedTransliterator::BAD_VARIABLE_DEFINITION, rule, start);
|
||||
}
|
||||
break;
|
||||
if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) {
|
||||
// "Malformed LHS"
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_DEFINITION, rule, start);
|
||||
}
|
||||
// We allow anything on the right, including an empty string.
|
||||
UnicodeString* value = new UnicodeString(right->text);
|
||||
data->variableNames->put(undefinedVariableName, value, status);
|
||||
|
||||
++variableLimit;
|
||||
return pos;
|
||||
}
|
||||
|
||||
// If this is not a variable definition rule, we shouldn't have
|
||||
// any undefined variable names.
|
||||
if (undefinedVariableName.length() != 0) {
|
||||
syntaxError(// "Undefined variable $" + undefinedVariableName,
|
||||
RuleBasedTransliterator::UNDEFINED_VARIABLE,
|
||||
rule, start);
|
||||
}
|
||||
|
||||
// If the direction we want doesn't match the rule
|
||||
// direction, do nothing.
|
||||
if (op != FWDREV_RULE_OP &&
|
||||
((direction == Transliterator::FORWARD) != (op == FORWARD_RULE_OP))) {
|
||||
return pos;
|
||||
}
|
||||
|
||||
// Transform the rule into a forward rule by swapping the
|
||||
// sides if necessary.
|
||||
if (direction == Transliterator::REVERSE) {
|
||||
left = &_right;
|
||||
right = &_left;
|
||||
}
|
||||
|
||||
// Remove non-applicable elements in forward-reverse
|
||||
// rules. Bidirectional rules ignore elements that do not
|
||||
// apply.
|
||||
if (op == FWDREV_RULE_OP) {
|
||||
right->removeContext();
|
||||
delete right->segments;
|
||||
right->segments = NULL;
|
||||
left->cursor = left->maxRef = -1;
|
||||
left->cursorOffset = 0;
|
||||
}
|
||||
|
||||
// Normalize context
|
||||
if (left->ante < 0) {
|
||||
left->ante = 0;
|
||||
}
|
||||
if (left->post < 0) {
|
||||
left->post = left->text.length();
|
||||
}
|
||||
|
||||
// Context is only allowed on the input side. Cursors are only
|
||||
// allowed on the output side. Segment delimiters can only appear
|
||||
// on the left, and references on the right. Cursor offset
|
||||
// cannot appear without an explicit cursor. Cursor offset
|
||||
// cannot place the cursor outside the limits of the context.
|
||||
if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 ||
|
||||
right->segments != NULL || left->maxRef >= 0 ||
|
||||
(right->cursorOffset != 0 && right->cursor < 0) ||
|
||||
(right->cursorOffset > (left->text.length() - left->post)) ||
|
||||
(-right->cursorOffset > left->ante)) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rule, start);
|
||||
}
|
||||
|
||||
// Check integrity of segments and segment references. Each
|
||||
// segment's start must have a corresponding limit, and the
|
||||
// references must not refer to segments that do not exist.
|
||||
if (left->segments != NULL) {
|
||||
int n = left->segments->size();
|
||||
if (n % 2 != 0) {
|
||||
return syntaxError(RuleBasedTransliterator::MISSING_SEGMENT_CLOSE, rule, start);
|
||||
}
|
||||
n /= 2;
|
||||
if (right->maxRef > n) {
|
||||
return syntaxError(RuleBasedTransliterator::UNDEFINED_SEGMENT_REFERENCE, rule, start);
|
||||
}
|
||||
}
|
||||
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
left->text, left->ante, left->post,
|
||||
right->text, right->cursor, right->cursorOffset,
|
||||
left->createSegments(), status), status);
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
@ -474,6 +735,9 @@ int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
|
|||
if (end < 0) {
|
||||
end = rule.length();
|
||||
}
|
||||
if (end > (start + 80)) { // In case end wasn't found
|
||||
end = start + 80;
|
||||
}
|
||||
rule.extractBetween(start, end, parseError->context); // Current rule
|
||||
}
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -481,20 +745,52 @@ int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
|
|||
}
|
||||
|
||||
/**
|
||||
* Allocate a private-use substitution character for the given set,
|
||||
* register it in the setVariables hash, and return the substitution
|
||||
* character.
|
||||
* Parse a UnicodeSet out, store it, and return the stand-in character
|
||||
* used to represent it.
|
||||
*/
|
||||
UChar TransliterationRuleParser::registerSet(UnicodeSet* adoptedSet) {
|
||||
/**
 * Parse a UnicodeSet pattern out of the rule starting at pos, store the
 * resulting set in setVariablesVector, and return the stand-in character
 * (the next unused value of variableNext) that represents it.
 *
 * If no stand-in characters remain, status is set to
 * U_ILLEGAL_ARGUMENT_ERROR, the set is discarded, and 0 is returned.
 */
UChar TransliterationRuleParser::parseSet(const UnicodeString& rule,
                                          ParsePosition& pos) {
    UnicodeSet* set = new UnicodeSet(rule, pos, *parseData, status);
    if (variableNext >= variableLimit) {
        // throw new RuntimeException("Private use variables exhausted");
        // The set was never stored anywhere, so it must be freed here.
        delete set;
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    setVariablesVector.addElement(set);
    return variableNext++;
}
|
||||
|
||||
/**
|
||||
* Append the value of the given variable name to the given
|
||||
* UnicodeString.
|
||||
*/
|
||||
void TransliterationRuleParser::appendVariableDef(const UnicodeString& name,
|
||||
UnicodeString& buf) {
|
||||
const UnicodeString* s = (const UnicodeString*) data->variableNames->get(name);
|
||||
if (s == NULL) {
|
||||
// We allow one undefined variable so that variable definition
|
||||
// statements work. For the first undefined variable we return
|
||||
// the special placeholder variableLimit-1, and save the variable
|
||||
// name.
|
||||
if (undefinedVariableName.length() == 0) {
|
||||
undefinedVariableName = name;
|
||||
if (variableNext >= variableLimit) {
|
||||
// throw new RuntimeException("Private use variables exhausted");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
buf.append((UChar) --variableLimit);
|
||||
} else {
|
||||
//throw new IllegalArgumentException("Undefined variable $"
|
||||
// + name);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
buf.append(*s);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines what part of the private use region of Unicode we can use for
|
||||
* variable stand-ins. The correct way to do this is as follows: Parse each
|
||||
|
@ -511,7 +807,9 @@ void TransliterationRuleParser::determineVariableRange(void) {
|
|||
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
|
||||
|
||||
if (r != 0) {
|
||||
data->setVariablesBase = variableNext = r->start;
|
||||
// Allocate 9 characters for segment references 1 through 9
|
||||
data->segmentBase = r->start;
|
||||
data->setVariablesBase = variableNext = (UChar) (data->segmentBase + 9);
|
||||
variableLimit = (UChar) (r->start + r->length);
|
||||
delete r;
|
||||
}
|
||||
|
|
|
@ -15,6 +15,8 @@
|
|||
class TransliterationRuleData;
|
||||
class UnicodeSet;
|
||||
class ParseData;
|
||||
class RuleHalf;
|
||||
class ParsePosition;
|
||||
|
||||
class TransliterationRuleParser {
|
||||
|
||||
|
@ -67,25 +69,16 @@ class TransliterationRuleParser {
|
|||
*/
|
||||
UChar variableLimit;
|
||||
|
||||
// Operators
|
||||
static const UChar VARIABLE_DEF_OP;
|
||||
static const UChar FORWARD_RULE_OP;
|
||||
static const UChar REVERSE_RULE_OP;
|
||||
static const UChar FWDREV_RULE_OP; // internal rep of <> op
|
||||
static const UnicodeString OPERATORS;
|
||||
/**
|
||||
* When we encounter an undefined variable, we do not immediately signal
|
||||
* an error, in case we are defining this variable, e.g., "$a = [a-z];".
|
||||
* Instead, we save the name of the undefined variable, and substitute
|
||||
* in the placeholder char variableLimit - 1, and decrement
|
||||
* variableLimit.
|
||||
*/
|
||||
UnicodeString undefinedVariableName;
|
||||
|
||||
// Other special characters
|
||||
static const UChar QUOTE;
|
||||
static const UChar ESCAPE;
|
||||
static const UChar END_OF_RULE;
|
||||
static const UChar RULE_COMMENT_CHAR;
|
||||
static const UChar VARIABLE_REF_OPEN;
|
||||
static const UChar VARIABLE_REF_CLOSE;
|
||||
static const UChar CONTEXT_OPEN;
|
||||
static const UChar CONTEXT_CLOSE;
|
||||
static const UChar SET_OPEN;
|
||||
static const UChar SET_CLOSE;
|
||||
static const UChar CURSOR_POS;
|
||||
static const UnicodeString gOPERATORS;
|
||||
|
||||
public:
|
||||
|
||||
|
@ -151,8 +144,22 @@ private:
|
|||
* register it in the setVariables hash, and return the substitution
|
||||
* character.
|
||||
*/
|
||||
UChar registerSet(UnicodeSet* adoptedSet);
|
||||
//UChar registerSet(UnicodeSet* adoptedSet);
|
||||
|
||||
/**
|
||||
* Parse a UnicodeSet out, store it, and return the stand-in character
|
||||
* used to represent it.
|
||||
*/
|
||||
UChar parseSet(const UnicodeString& rule,
|
||||
ParsePosition& pos);
|
||||
|
||||
/**
|
||||
* Append the value of the given variable name to the given
|
||||
* UnicodeString.
|
||||
*/
|
||||
void appendVariableDef(const UnicodeString& name,
|
||||
UnicodeString& buf);
|
||||
|
||||
/**
|
||||
* Determines what part of the private use region of Unicode we can use for
|
||||
* variable stand-ins. The correct way to do this is as follows: Parse each
|
||||
|
@ -178,6 +185,8 @@ private:
|
|||
static int32_t quotedIndexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
UChar c);
|
||||
|
||||
friend class RuleHalf;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -13,6 +13,38 @@
|
|||
#include "unicode/unifilt.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given input, output text, and other
|
||||
* attributes. A cursor position may be specified for the output text.
|
||||
* @param input input string, including key and optional ante and
|
||||
* post context
|
||||
* @param anteContextPos offset into input to end of ante context, or -1 if
|
||||
* none. Must be <= input.length() if not -1.
|
||||
* @param postContextPos offset into input to start of post context, or -1
|
||||
* if none. Must be <= input.length() if not -1, and must be >=
|
||||
* anteContextPos.
|
||||
* @param output output string
|
||||
* @param cursorPos offset into output at which cursor is located, or -1 if
|
||||
* none. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset,
|
||||
* limit for a segment of the input string. Characters in the output string
|
||||
* refer to these segments if they are in a special range determined by the
|
||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||
* no segments.
|
||||
*/
|
||||
TransliterationRule::TransliterationRule(const UnicodeString& input,
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& output,
|
||||
int32_t cursorPos, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UErrorCode& status) {
|
||||
init(input, anteContextPos, postContextPos,
|
||||
output, cursorPos, cursorOffset, adoptedSegs, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given input, output text, and other
|
||||
* attributes. A cursor position may be specified for the output text.
|
||||
|
@ -35,6 +67,16 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
|||
const UnicodeString& output,
|
||||
int32_t cursorPos,
|
||||
UErrorCode& status) {
|
||||
init(input, anteContextPos, postContextPos,
|
||||
output, cursorPos, 0, NULL, status);
|
||||
}
|
||||
|
||||
void TransliterationRule::init(const UnicodeString& input,
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& output,
|
||||
int32_t cursorPos, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -61,35 +103,24 @@ TransliterationRule::TransliterationRule(const UnicodeString& input,
|
|||
keyLength = postContextPos - anteContextLength;
|
||||
}
|
||||
if (cursorPos < 0) {
|
||||
this->cursorPos = output.length();
|
||||
cursorPos = output.length();
|
||||
} else {
|
||||
if (cursorPos > output.length()) {
|
||||
// throw new IllegalArgumentException("Invalid cursor position");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
this->cursorPos = cursorPos;
|
||||
}
|
||||
this->cursorPos = cursorPos + cursorOffset;
|
||||
pattern = input;
|
||||
this->output = output;
|
||||
// We don't validate the segments array. The caller must
|
||||
// guarantee that the segments are well-formed.
|
||||
this->segments = adoptedSegs;
|
||||
}
|
||||
|
||||
TransliterationRule::~TransliterationRule() {}
|
||||
|
||||
/**
|
||||
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
|
||||
* @return the length of the match key.
|
||||
*/
|
||||
int32_t TransliterationRule::getKeyLength(void) const {
|
||||
return keyLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the output string.
|
||||
* @return the output string.
|
||||
*/
|
||||
const UnicodeString& TransliterationRule::getOutput(void) const {
|
||||
return output;
|
||||
TransliterationRule::~TransliterationRule() {
|
||||
delete[] segments;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -115,7 +146,7 @@ int32_t TransliterationRule::getAnteContextLength(void) const {
|
|||
* unless the first character of the key is a set. If it's a
|
||||
* set, or otherwise can match multiple keys, the index value is -1.
|
||||
*/
|
||||
int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) {
|
||||
int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) const {
|
||||
if (anteContextLength == pattern.length()) {
|
||||
// A pattern with just ante context {such as foo)>bar} can
|
||||
// match any key.
|
||||
|
@ -125,6 +156,71 @@ int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data)
|
|||
return data.lookupSet(c) == NULL ? (c & 0xFF) : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do a replacement of the input pattern with the output text in
|
||||
* the given string, at the given offset. This method assumes
|
||||
* that a match has already been found in the given text at the
|
||||
* given position.
|
||||
* @param text the text containing the substring to be replaced
|
||||
* @param offset the offset into the text at which the pattern
|
||||
* matches. This is the offset to the point after the ante
|
||||
* context, if any, and before the match string and any post
|
||||
* context.
|
||||
* @param data the RuleBasedTransliterator.Data object specifying
|
||||
* context for this transliterator.
|
||||
* @return the change in the length of the text
|
||||
*/
|
||||
int32_t TransliterationRule::replace(Replaceable& text, int32_t offset,
|
||||
const TransliterationRuleData& data) const {
|
||||
if (segments == NULL) {
|
||||
text.handleReplaceBetween(offset, offset + keyLength, output);
|
||||
return output.length() - keyLength;
|
||||
} else {
|
||||
/* When there are segments to be copied, use the Replaceable.copy()
|
||||
* API in order to retain out-of-band data. Copy everything to the
|
||||
* point after the key, then delete the key. That is, copy things
|
||||
* into offset + keyLength, then replace offset .. offset +
|
||||
* keyLength with the empty string.
|
||||
*
|
||||
* Minimize the number of calls to Replaceable.replace() and
|
||||
* Replaceable.copy().
|
||||
*/
|
||||
int32_t textStart = offset - anteContextLength;
|
||||
int32_t dest = offset + keyLength; // copy new text to here
|
||||
UnicodeString buf;
|
||||
for (int32_t i=0; i<output.length(); ++i) {
|
||||
UChar c = output.charAt(i);
|
||||
int32_t b = data.lookupSegmentReference(c);
|
||||
if (b < 0) {
|
||||
// Accumulate straight (non-segment) text.
|
||||
buf.append(c);
|
||||
} else {
|
||||
// Insert any accumulated straight text.
|
||||
if (buf.length() > 0) {
|
||||
text.handleReplaceBetween(dest, dest, buf);
|
||||
dest += buf.length();
|
||||
buf.remove();
|
||||
}
|
||||
// Copy segment with out-of-band data
|
||||
b *= 2;
|
||||
text.copy(textStart + segments[b],
|
||||
textStart + segments[b+1], dest);
|
||||
dest += segments[b+1] - segments[b];
|
||||
}
|
||||
|
||||
}
|
||||
// Insert any accumulated straight text.
|
||||
if (buf.length() > 0) {
|
||||
text.handleReplaceBetween(dest, dest, buf);
|
||||
dest += buf.length();
|
||||
}
|
||||
// Delete the key
|
||||
buf.remove();
|
||||
text.handleReplaceBetween(offset, offset + keyLength, buf);
|
||||
return dest - (offset + keyLength) - keyLength;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
* index value. The index value is an 8-bit integer, 0..255,
|
||||
|
@ -136,7 +232,7 @@ int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data)
|
|||
* then it will match any key.
|
||||
*/
|
||||
UBool TransliterationRule::matchesIndexValue(uint8_t v,
|
||||
const TransliterationRuleData& data) {
|
||||
const TransliterationRuleData& data) const {
|
||||
if (anteContextLength == pattern.length()) {
|
||||
// A pattern with just ante context {such as foo)>bar} can
|
||||
// match any key.
|
||||
|
|
|
@ -86,6 +86,21 @@ private:
|
|||
*/
|
||||
UnicodeString output;
|
||||
|
||||
/**
|
||||
* Array of segments. These are segments of the input string that may be
|
||||
* referenced and appear in the output string. Each segment is stored as an
|
||||
* offset, limit pair. Segments are referenced by a 1-based index;
|
||||
* reference i thus includes characters at offset segments[2*i-2] to
|
||||
* segments[2*i-1]-1 in the pattern string.
|
||||
*
|
||||
* In the output string, a segment reference is indicated by a character in
|
||||
* a special range, as defined by RuleBasedTransliterator.Data.
|
||||
*
|
||||
* Most rules have no segments, in which case segments is null, and the
|
||||
* output string need not be checked for segment reference characters.
|
||||
*/
|
||||
int32_t* segments;
|
||||
|
||||
/**
|
||||
* The length of the string that must match before the key. If
|
||||
* zero, then there is no matching requirement before the key.
|
||||
|
@ -109,6 +124,35 @@ private:
|
|||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given input, output text, and other
|
||||
* attributes. A cursor position may be specified for the output text.
|
||||
* @param input input string, including key and optional ante and
|
||||
* post context
|
||||
* @param anteContextPos offset into input to end of ante context, or -1 if
|
||||
* none. Must be <= input.length() if not -1.
|
||||
* @param postContextPos offset into input to start of post context, or -1
|
||||
* if none. Must be <= input.length() if not -1, and must be >=
|
||||
* anteContextPos.
|
||||
* @param output output string
|
||||
* @param cursorPos offset into output at which cursor is located, or -1 if
|
||||
* none. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then <code>status</code> is set to U_ILLEGAL_ARGUMENT_ERROR.
|
||||
* @param adoptedSegs array of 2n integers. Each of n pairs consists of offset,
|
||||
* limit for a segment of the input string. Characters in the output string
|
||||
* refer to these segments if they are in a special range determined by the
|
||||
* associated RuleBasedTransliterator.Data object. May be null if there are
|
||||
* no segments.
|
||||
*/
|
||||
TransliterationRule(const UnicodeString& input,
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& output,
|
||||
int32_t cursorPos, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given input, output text, and other
|
||||
* attributes. A cursor position may be specified for the output text.
|
||||
|
@ -137,18 +181,6 @@ public:
|
|||
*/
|
||||
virtual ~TransliterationRule();
|
||||
|
||||
/**
|
||||
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
|
||||
* @return the length of the match key.
|
||||
*/
|
||||
virtual int32_t getKeyLength(void) const;
|
||||
|
||||
/**
|
||||
* Return the output string.
|
||||
* @return the output string.
|
||||
*/
|
||||
virtual const UnicodeString& getOutput(void) const;
|
||||
|
||||
/**
|
||||
* Return the position of the cursor within the output string.
|
||||
* @return a value from 0 to <code>getOutput().length()</code>, inclusive.
|
||||
|
@ -168,7 +200,24 @@ public:
|
|||
* unless the first character of the key is a set. If it's a
|
||||
* set, or otherwise can match multiple keys, the index value is -1.
|
||||
*/
|
||||
int16_t getIndexValue(const TransliterationRuleData& data);
|
||||
int16_t getIndexValue(const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Do a replacement of the input pattern with the output text in
|
||||
* the given string, at the given offset. This method assumes
|
||||
* that a match has already been found in the given text at the
|
||||
* given position.
|
||||
* @param text the text containing the substring to be replaced
|
||||
* @param offset the offset into the text at which the pattern
|
||||
* matches. This is the offset to the point after the ante
|
||||
* context, if any, and before the match string and any post
|
||||
* context.
|
||||
* @param data the RuleBasedTransliterator.Data object specifying
|
||||
* context for this transliterator.
|
||||
* @return the change in the length of the text
|
||||
*/
|
||||
int32_t replace(Replaceable& text, int32_t offset,
|
||||
const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
|
@ -181,7 +230,7 @@ public:
|
|||
* then it will match any key.
|
||||
*/
|
||||
UBool matchesIndexValue(uint8_t v,
|
||||
const TransliterationRuleData& data);
|
||||
const TransliterationRuleData& data) const;
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
|
@ -289,6 +338,15 @@ public:
|
|||
virtual UBool charMatches(UChar keyChar, UChar textChar,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
private:
|
||||
|
||||
void init(const UnicodeString& input,
|
||||
int32_t anteContextPos, int32_t postContextPos,
|
||||
const UnicodeString& output,
|
||||
int32_t cursorPos, int32_t cursorOffset,
|
||||
int32_t* adoptedSegs,
|
||||
UErrorCode& status);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -10,20 +10,56 @@
|
|||
#ifndef SYMTABLE_H
|
||||
#define SYMTABLE_H
|
||||
|
||||
class ParsePosition;
|
||||
class UnicodeSet;
|
||||
class UnicodeString;
|
||||
|
||||
/**
|
||||
* An abstract class that maps strings to objects.
|
||||
* An interface that maps strings to objects. This interface defines
|
||||
* both lookup protocol and parsing. This allows different components
|
||||
* to share a symbol table and to handle name parsing uniformly. It
|
||||
* is expected that client parse code look for the SYMBOL_REF
|
||||
* character and, when seen, attempt to parse the characters after it
|
||||
* using parseReference().
|
||||
*
|
||||
* <p>Currently, RuleBasedTransliterator and UnicodeSet use this
|
||||
* interface to share variable definitions.
|
||||
*/
|
||||
class SymbolTable {
|
||||
public:
|
||||
|
||||
/**
|
||||
* Lookup the object associated with this string and return it.
|
||||
* Return U_ILLEGAL_ARGUMENT_ERROR status if the name does not
|
||||
* exist. Return a non-NULL set if the name is mapped to a set;
|
||||
* otherwise return a NULL set.
|
||||
* The character preceding a symbol reference name.
|
||||
*/
|
||||
virtual void lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,
|
||||
UErrorCode& status) const = 0;
|
||||
enum { SYMBOL_REF = 0x0024 /*$*/ };
|
||||
|
||||
/**
|
||||
* Lookup the characters associated with this string and return it.
|
||||
* Return <tt>NULL</tt> if no such name exists. The resultant
|
||||
* string may have length zero.
|
||||
*/
|
||||
virtual const UnicodeString* lookup(const UnicodeString& s) const = 0;
|
||||
|
||||
/**
|
||||
* Lookup the UnicodeSet associated with the given character, and
|
||||
* return it.  Return <tt>NULL</tt> if not found.
|
||||
*/
|
||||
virtual const UnicodeSet* lookupSet(UChar ch) const = 0;
|
||||
|
||||
/**
|
||||
* Parse a symbol reference name from the given string, starting
|
||||
* at the given position. If no valid symbol reference name is
|
||||
* found, throw an exception.
|
||||
* @param text the text to parse for the name
|
||||
* @param pos on entry, the index of the first character to parse.
|
||||
* This is the character following the SYMBOL_REF character. On
|
||||
* exit, the index after the last parsed character.
|
||||
* @param limit the index after the last character to be parsed.
|
||||
* @return the parsed name.
|
||||
* @exception IllegalArgumentException if no valid name is found.
|
||||
*/
|
||||
virtual UnicodeString parseReference(const UnicodeString& text,
|
||||
ParsePosition& pos, int32_t limit) const = 0;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -17,194 +17,237 @@
|
|||
class TransliterationRuleData;
|
||||
|
||||
/**
|
||||
* A transliterator that reads a set of rules in order to determine how to perform
|
||||
* translations. Rules are stored in resource bundles indexed by name. Rules are separated by
|
||||
* semicolons (';'). To include a literal semicolon, prefix it with a backslash ('\;').
|
||||
* Whitespace, as defined by <code>Character.isWhitespace()</code>, is ignored. If the first
|
||||
* non-blank character on a line is '#', the entire line is ignored as a comment. </p>
|
||||
* <code>RuleBasedTransliterator</code> is a transliterator
|
||||
* that reads a set of rules in order to determine how to perform
|
||||
* translations. Rule sets are stored in resource bundles indexed by
|
||||
* name. Rules within a rule set are separated by semicolons (';').
|
||||
* To include a literal semicolon, prefix it with a backslash ('\').
|
||||
* Whitespace, as defined by <code>Character.isWhitespace()</code>,
|
||||
* is ignored. If the first non-blank character on a line is '#',
|
||||
* the entire line is ignored as a comment. </p>
|
||||
*
|
||||
* <p>Each set of rules consists of two groups, one forward, and one reverse. This is a
|
||||
* convention that is not enforced; rules for one direction may be omitted, with the result
|
||||
* that translations in that direction will not modify the source text. </p>
|
||||
* <p>Each set of rules consists of two groups, one forward, and one
|
||||
* reverse. This is a convention that is not enforced; rules for one
|
||||
* direction may be omitted, with the result that translations in
|
||||
* that direction will not modify the source text. In addition,
|
||||
* bidirectional forward-reverse rules may be specified for
|
||||
* symmetrical transformations.</p>
|
||||
*
|
||||
* <p><b>Rule syntax</b> </p>
|
||||
*
|
||||
* <p>Rule statements take one of the following forms:
|
||||
* <p>Rule statements take one of the following forms: </p>
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>alefmadda=\u0622</code></dt>
|
||||
* <dd><strong>Variable definition.</strong> The name on the left is assigned the character or
|
||||
* expression on the right. Names may not contain any special characters (see list below).
|
||||
* Duplicate names (including duplicates of simple variables or category names) cause an
|
||||
* exception to be thrown. If the right hand side consists of one character, then the
|
||||
* variable stands for that character. In this example, after this statement, instances of
|
||||
* the left hand name surrounded by braces, "<code>{alefmadda}</code>", will be
|
||||
* replaced by the Unicode character U+0622. If the right hand side is longer than one
|
||||
* character, then it is interpreted as a character category expression; see below for
|
||||
* details.</dd>
|
||||
* <dt> </dt>
|
||||
* <dt><code>softvowel=[eiyEIY]</code></dt>
|
||||
* <dd><strong>Category definition.</strong> The name on the left is assigned to stand for a
|
||||
* set of characters. The same rules for names of simple variables apply. After this
|
||||
* statement, the left hand variable will be interpreted as indicating a set of characters in
|
||||
* appropriate contexts. The pattern syntax defining sets of characters is defined by {@link
|
||||
* UnicodeSet}. Examples of valid patterns are:<table>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>[abc]</code></td>
|
||||
* <td>The set containing the characters 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>[^abc]</code></td>
|
||||
* <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>[A-Z]</code></td>
|
||||
* <td>The set of all characters from 'A' to 'Z' in Unicode order.</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>[:Lu:]</code></td>
|
||||
* <td>The set of Unicode uppercase letters. See <a href="http://www.unicode.org">www.unicode.org</a>
|
||||
* for a complete list of categories and their two-letter codes.</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
|
||||
* <td>The set of all characters <em>except</em> 'a' through 'z' and uppercase or lowercase
|
||||
* letters.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
* <p>See {@link UnicodeSet} for more documentation and examples. </p>
|
||||
* </dd>
|
||||
* <dt><code>ai>{alefmadda}</code></dt>
|
||||
* <dd><strong>Forward translation rule.</strong> This rule states that the string on the left
|
||||
* will be changed to the string on the right when performing forward transliteration.</dd>
|
||||
* <dt> </dt>
|
||||
* <dt><code>ai<{alefmadda}</code></dt>
|
||||
* <dd><strong>Reverse translation rule.</strong> This rule states that the string on the right
|
||||
* will be changed to the string on the left when performing reverse transliteration.</dd>
|
||||
* <dt><code>$alefmadda=\u0622;</code></dt>
|
||||
* <dd><strong>Variable definition.</strong> The name on the
|
||||
* left is assigned the text on the right. In this example,
|
||||
* after this statement, instances of the left hand name,
|
||||
* "<code>$alefmadda</code>", will be replaced by
|
||||
* the Unicode character U+0622. Variable names must begin
|
||||
* with a letter and consist only of letters, digits, and
|
||||
* underscores. Case is significant. Duplicate names cause
|
||||
* an exception to be thrown, that is, variables cannot be
|
||||
* redefined. The right hand side may contain well-formed
|
||||
* text of any length, including no text at all ("<code>$empty=;</code>").
|
||||
* The right hand side may contain embedded <code>UnicodeSet</code>
|
||||
* patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd>
|
||||
* <dd> </dd>
|
||||
* <dt><code>ai>$alefmadda;</code></dt>
|
||||
* <dd><strong>Forward translation rule.</strong> This rule
|
||||
* states that the string on the left will be changed to the
|
||||
* string on the right when performing forward
|
||||
* transliteration.</dd>
|
||||
* <dt> </dt>
|
||||
* <dt><code>ai<$alefmadda;</code></dt>
|
||||
* <dd><strong>Reverse translation rule.</strong> This rule
|
||||
* states that the string on the right will be changed to
|
||||
* the string on the left when performing reverse
|
||||
* transliteration.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>ai<>{alefmadda}</code></dt>
|
||||
* <dd><strong>Bidirectional translation rule.</strong> This rule states that the string on the
|
||||
* right will be changed to the string on the left when performing forward transliteration,
|
||||
* and vice versa when performing reverse transliteration.</dd>
|
||||
* <dt><code>ai<>$alefmadda;</code></dt>
|
||||
* <dd><strong>Bidirectional translation rule.</strong> This
|
||||
* rule states that the string on the right will be changed
|
||||
* to the string on the left when performing forward
|
||||
* transliteration, and vice versa when performing reverse
|
||||
* transliteration.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* <p>Forward and reverse translation rules consist of a <em>match pattern</em> and an <em>output
|
||||
* string</em>. The match pattern consists of literal characters, optionally preceded by
|
||||
* context, and optionally followed by context. Context characters, like literal pattern
|
||||
* characters, must be matched in the text being transliterated. However, unlike literal
|
||||
* pattern characters, they are not replaced by the output text. For example, the pattern
|
||||
* "<code>(abc)def</code>" indicates the characters "<code>def</code>"
|
||||
* must be preceded by "<code>abc</code>" for a successful match. If there is a
|
||||
* successful match, "<code>def</code>" will be replaced, but not "<code>abc</code>".
|
||||
* The initial '<code>(</code>' is optional, so "<code>abc)def</code>" is
|
||||
* equivalent to "<code>(abc)def</code>". Another example is "<code>123(456)</code>"
|
||||
* (or "<code>123(456</code>") in which the literal pattern "<code>123</code>"
|
||||
* must be followed by "<code>456</code>". </p>
|
||||
* <p>Translation rules consist of a <em>match pattern</em> and an <em>output
|
||||
* string</em>. The match pattern consists of literal characters,
|
||||
* optionally preceded by context, and optionally followed by
|
||||
* context. Context characters, like literal pattern characters,
|
||||
* must be matched in the text being transliterated. However, unlike
|
||||
* literal pattern characters, they are not replaced by the output
|
||||
* text. For example, the pattern "<code>abc{def}</code>"
|
||||
* indicates the characters "<code>def</code>" must be
|
||||
* preceded by "<code>abc</code>" for a successful match.
|
||||
* If there is a successful match, "<code>def</code>" will
|
||||
* be replaced, but not "<code>abc</code>". The final '<code>}</code>'
|
||||
* is optional, so "<code>abc{def</code>" is equivalent to
|
||||
* "<code>abc{def}</code>". Another example is "<code>{123}456</code>"
|
||||
* (or "<code>123}456</code>") in which the literal
|
||||
* pattern "<code>123</code>" must be followed by "<code>456</code>".
|
||||
* </p>
|
||||
*
|
||||
* <p>The output string of a forward or reverse rule consists of characters to replace the
|
||||
* literal pattern characters. If the output string contains the character '<code>|</code>',
|
||||
* this is taken to indicate the location of the <em>cursor</em> after replacement. The
|
||||
* cursor is the point in the text at which the next replacement, if any, will be applied. </p>
|
||||
*
|
||||
* <p>In addition to being defined in variables, <code>UnicodeSet</code> patterns may be
|
||||
* embedded directly into rule strings. Thus, the following two rules are equivalent:</p>
|
||||
* <p>The output string of a forward or reverse rule consists of
|
||||
* characters to replace the literal pattern characters. If the
|
||||
* output string contains the character '<code>|</code>', this is
|
||||
* taken to indicate the location of the <em>cursor</em> after
|
||||
* replacement. The cursor is the point in the text at which the
|
||||
* next replacement, if any, will be applied. The cursor is usually
|
||||
* placed within the replacement text; however, it can actually be
|
||||
* placed into the precending or following context by using the
|
||||
* special character '<code>@</code>'. Examples:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>vowel=[aeiou]; {vowel}>*; # One way to do this<br>
|
||||
* [aeiou]>*;
|
||||
* #
|
||||
* Another way</code></p>
|
||||
* <p><code>a {foo} z > | @ bar; # foo -> bar, move cursor
|
||||
* before a<br>
|
||||
* {foo} xyz > bar @@|; # foo -> bar, cursor between
|
||||
* y and z</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p><b>UnicodeSet</b></p>
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may appear anywhere that
|
||||
* makes sense. They may appear in variable definitions.
|
||||
* Contrariwise, <code>UnicodeSet</code> patterns may themselves
|
||||
* contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>",
|
||||
* or "<code>$range=a-z;$ll=[$range]</code>".</p>
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may also be embedded directly
|
||||
* into rule strings. Thus, the following two rules are equivalent:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br>
|
||||
* [aeiou]>'*';
|
||||
* #
|
||||
* Another way</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>See {@link UnicodeSet} for more documentation and examples.</p>
|
||||
*
|
||||
* <p><b>Segments</b></p>
|
||||
*
|
||||
* <p>Segments of the input string can be matched and copied to the
|
||||
* output string. This makes certain sets of rules simpler and more
|
||||
* general, and makes reordering possible. For example:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>([a-z]) > $1 $1;
|
||||
* #
|
||||
* double lowercase letters<br>
|
||||
* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>The segment of the input string to be copied is delimited by
|
||||
* "<code>(</code>" and "<code>)</code>". Up to
|
||||
* nine segments may be defined. Segments may not overlap. In the
|
||||
* output string, "<code>$1</code>" through "<code>$9</code>"
|
||||
* represent the input string segments, in left-to-right order of
|
||||
* definition.</p>
|
||||
*
|
||||
* <p><b>Example</b> </p>
|
||||
*
|
||||
* <p>The following example rules illustrate many of the features of the rule language. </p>
|
||||
* <p>The following example rules illustrate many of the features of
|
||||
* the rule language. </p>
|
||||
*
|
||||
* <table cellpadding="4">
|
||||
* <tr valign="top">
|
||||
* <td>Rule 1.</td>
|
||||
* <td nowrap><code>(abc)def>x|y</code></td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td>Rule 2.</td>
|
||||
* <td nowrap><code>xyz>r</code></td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td>Rule 3.</td>
|
||||
* <td nowrap><code>yz>q</code></td>
|
||||
* </tr>
|
||||
* <table border="0" cellpadding="4">
|
||||
* <tr>
|
||||
* <td valign="top">Rule 1.</td>
|
||||
* <td valign="top" nowrap><code>abc{def}>x|y</code></td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top">Rule 2.</td>
|
||||
* <td valign="top" nowrap><code>xyz>r</code></td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top">Rule 3.</td>
|
||||
* <td valign="top" nowrap><code>yz>q</code></td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* <p>Applying these rules to the string "<code>adefabcdefz</code>" yields the
|
||||
* following results: </p>
|
||||
* <p>Applying these rules to the string "<code>adefabcdefz</code>"
|
||||
* yields the following results: </p>
|
||||
*
|
||||
* <table cellpadding="4">
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>|adefabcdefz</code></td>
|
||||
* <td>Initial state, no rules match. Advance cursor.</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>a|defabcdefz</code></td>
|
||||
* <td>Still no match. Rule 1 does not match because the preceding context is not present.</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>ad|efabcdefz</code></td>
|
||||
* <td>Still no match. Keep advancing until there is a match...</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>ade|fabcdefz</code></td>
|
||||
* <td>...</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>adef|abcdefz</code></td>
|
||||
* <td>...</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>adefa|bcdefz</code></td>
|
||||
* <td>...</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>adefab|cdefz</code></td>
|
||||
* <td>...</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>adefabc|defz</code></td>
|
||||
* <td>Rule 1 matches; replace "<code>def</code>" with "<code>xy</code>"
|
||||
* and back up the cursor to before the '<code>y</code>'.</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>adefabcx|yz</code></td>
|
||||
* <td>Although "<code>xyz</code>" is present, rule 2 does not match because the
|
||||
* cursor is before the '<code>y</code>', not before the '<code>x</code>'. Rule 3 does match.
|
||||
* Replace "<code>yz</code>" with "<code>q</code>".</td>
|
||||
* </tr>
|
||||
* <tr valign="top">
|
||||
* <td nowrap><code>adefabcxq|</code></td>
|
||||
* <td>The cursor is at the end; transliteration is complete.</td>
|
||||
* </tr>
|
||||
* <table border="0" cellpadding="4">
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>|adefabcdefz</code></td>
|
||||
* <td valign="top">Initial state, no rules match. Advance
|
||||
* cursor.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>a|defabcdefz</code></td>
|
||||
* <td valign="top">Still no match. Rule 1 does not match
|
||||
* because the preceding context is not present.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>ad|efabcdefz</code></td>
|
||||
* <td valign="top">Still no match. Keep advancing until
|
||||
* there is a match...</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>ade|fabcdefz</code></td>
|
||||
* <td valign="top">...</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>adef|abcdefz</code></td>
|
||||
* <td valign="top">...</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>adefa|bcdefz</code></td>
|
||||
* <td valign="top">...</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>adefab|cdefz</code></td>
|
||||
* <td valign="top">...</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>adefabc|defz</code></td>
|
||||
* <td valign="top">Rule 1 matches; replace "<code>def</code>"
|
||||
* with "<code>xy</code>" and back up the cursor
|
||||
* to before the '<code>y</code>'.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>adefabcx|yz</code></td>
|
||||
* <td valign="top">Although "<code>xyz</code>" is
|
||||
* present, rule 2 does not match because the cursor is
|
||||
* before the '<code>y</code>', not before the '<code>x</code>'.
|
||||
* Rule 3 does match. Replace "<code>yz</code>"
|
||||
* with "<code>q</code>".</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>adefabcxq|</code></td>
|
||||
* <td valign="top">The cursor is at the end;
|
||||
* transliteration is complete.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* <p>The order of rules is significant. If multiple rules may match at some point, the first
|
||||
* matching rule is applied. </p>
|
||||
* <p>The order of rules is significant. If multiple rules may match
|
||||
* at some point, the first matching rule is applied. </p>
|
||||
*
|
||||
* <p>Forward and reverse rules may have an empty output string. Otherwise, an empty left or
|
||||
* right hand side of any statement is a syntax error. </p>
|
||||
* <p>Forward and reverse rules may have an empty output string.
|
||||
* Otherwise, an empty left or right hand side of any statement is a
|
||||
* syntax error. </p>
|
||||
*
|
||||
* <p>Single quotes are used to quote the special characters <code>=><{}[]()|</code>.
|
||||
* To specify a single quote itself, inside or outside of quotes, use two single quotes in a
|
||||
* row. For example, the rule "<code>'>'>o''clock</code>" changes the string
|
||||
* "<code>></code>" to the string "<code>o'clock</code>". </p>
|
||||
* <p>Single quotes are used to quote any character other than a
|
||||
* digit or letter. To specify a single quote itself, inside or
|
||||
* outside of quotes, use two single quotes in a row. For example,
|
||||
* the rule "<code>'>'>o''clock</code>" changes the
|
||||
* string "<code>></code>" to the string "<code>o'clock</code>".
|
||||
* </p>
|
||||
*
|
||||
* <p><b>Notes</b> </p>
|
||||
*
|
||||
* <p>While a RuleBasedTransliterator is being built, it checks that the rules are added in
|
||||
* proper order. For example, if the rule "a>x" is followed by the rule
|
||||
* "ab>y", then the second rule will throw an exception. The reason is that the
|
||||
* second rule can never be triggered, since the first rule always matches anything it
|
||||
* matches. In other words, the first rule <em>masks</em> the second rule. </p>
|
||||
*
|
||||
* <p>While a RuleBasedTransliterator is being built, it checks that
|
||||
* the rules are added in proper order. For example, if the rule
|
||||
* "a>x" is followed by the rule "ab>y",
|
||||
* then the second rule will throw an exception. The reason is that
|
||||
* the second rule can never be triggered, since the first rule
|
||||
* always matches anything it matches. In other words, the first
|
||||
* rule <em>masks</em> the second rule. </p>
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @draft
|
||||
*/
|
||||
|
@ -312,20 +355,23 @@ public:
|
|||
*/
|
||||
enum {
|
||||
PARSE_ERROR_BASE = 0x10000,
|
||||
DUPLICATE_VARIABLE_DEFINITION,
|
||||
MALFORMED_RHS,
|
||||
BAD_VARIABLE_DEFINITION,
|
||||
MALFORMED_RULE,
|
||||
MALFORMED_SET,
|
||||
MALFORMED_SYMBOL_REFERENCE,
|
||||
MALFORMED_UNICODE_ESCAPE,
|
||||
MALFORMED_VARIABLE_DEFINITION,
|
||||
MALFORMED_VARIABLE_REFERENCE,
|
||||
MISMATCHED_SEGMENT_DELIMITERS,
|
||||
MISPLACED_CURSOR_OFFSET,
|
||||
MISSING_OPERATOR,
|
||||
MISSING_SEGMENT_CLOSE,
|
||||
MULTIPLE_ANTE_CONTEXTS,
|
||||
MULTIPLE_CURSORS,
|
||||
MULTIPLE_POST_CONTEXTS,
|
||||
TEXT_AFTER_CLOSE_CONTEXT,
|
||||
TRAILING_BACKSLASH,
|
||||
UNDEFINED_SEGMENT_REFERENCE,
|
||||
UNDEFINED_VARIABLE,
|
||||
UNEXPECTED_CLOSE_CONTEXT,
|
||||
UNQUOTED_SPECIAL,
|
||||
UNTERMINATED_QUOTE
|
||||
};
|
||||
|
|
|
@ -276,18 +276,6 @@ class U_I18N_API UnicodeSet : public UnicodeFilter {
|
|||
* ":]". Example: "[:Lu:]".
|
||||
*/
|
||||
static const UnicodeString CATEGORY_CLOSE;
|
||||
|
||||
/**
|
||||
* Delimiter char beginning a variable reference:
|
||||
* "{". Example: "{var}".
|
||||
*/
|
||||
static const UChar VARIABLE_REF_OPEN;
|
||||
|
||||
/**
|
||||
* Delimiter char ending a variable reference:
|
||||
* "}". Example: "{var}".
|
||||
*/
|
||||
static const UChar VARIABLE_REF_CLOSE;
|
||||
|
||||
// More special characters...
|
||||
static const UChar SET_OPEN;
|
||||
|
|
|
@ -30,18 +30,6 @@ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
|
|||
*/
|
||||
const UnicodeString UnicodeSet::CATEGORY_CLOSE = UNICODE_STRING(":]", 2);
|
||||
|
||||
/**
|
||||
* Delimiter char beginning a variable reference:
|
||||
* "{". Example: "{var}".
|
||||
*/
|
||||
const UChar UnicodeSet::VARIABLE_REF_OPEN = 0x007B /*{*/;
|
||||
|
||||
/**
|
||||
* Delimiter char ending a variable reference:
|
||||
* "}". Example: "{var}".
|
||||
*/
|
||||
const UChar UnicodeSet::VARIABLE_REF_CLOSE = 0x007D /*}*/;
|
||||
|
||||
// Define UChar constants using hex for EBCDIC compatibility
|
||||
const UChar UnicodeSet::SET_OPEN = 0x005B; /*[*/
|
||||
const UChar UnicodeSet::SET_CLOSE = 0x005D; /*]*/
|
||||
|
@ -497,9 +485,15 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
|||
int32_t i = pos.getIndex();
|
||||
int32_t limit = pattern.length();
|
||||
UnicodeString nestedAux;
|
||||
UnicodeString* nestedPairs;
|
||||
const UnicodeString* nestedPairs;
|
||||
UnicodeString scratch;
|
||||
for (; i<limit; ++i) {
|
||||
/* In the case of an embedded SymbolTable variable, we look it up and
|
||||
* then take characters from the resultant char[] array. These chars
|
||||
* are subjected to an extra level of lookup in the SymbolTable in case
|
||||
* they are stand-ins for a nested UnicodeSet. */
|
||||
const UnicodeString* varValueBuffer = NULL;
|
||||
int32_t ivarValueBuffer = 0;
|
||||
for (; i<limit; i+=((varValueBuffer==NULL)?1:0)) {
|
||||
/* If the next element is a single character, c will be set to it,
|
||||
* and nestedPairs will be null. In this case isLiteral indicates
|
||||
* whether the character should assume special meaning if it has
|
||||
|
@ -508,9 +502,24 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
|||
* nestedPairs will be set to the pairs list for the nested set, and
|
||||
* c's value should be ignored.
|
||||
*/
|
||||
UChar c = pattern.charAt(i);
|
||||
nestedPairs = NULL;
|
||||
UBool isLiteral = FALSE;
|
||||
UChar c;
|
||||
if (varValueBuffer != NULL) {
|
||||
if (ivarValueBuffer < varValueBuffer->length()) {
|
||||
c = varValueBuffer->charAt(ivarValueBuffer++);
|
||||
const UnicodeSet* s = symbols->lookupSet(c);
|
||||
if (s != NULL) {
|
||||
//nestedSet = s;
|
||||
nestedPairs = &s->pairs;
|
||||
}
|
||||
} else {
|
||||
varValueBuffer = NULL;
|
||||
c = pattern.charAt(i);
|
||||
}
|
||||
} else {
|
||||
c = pattern.charAt(i);
|
||||
}
|
||||
|
||||
// Ignore whitespace. This is not Unicode whitespace, but Java
|
||||
// whitespace, a subset of Unicode whitespace.
|
||||
|
@ -556,103 +565,104 @@ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/,
|
|||
// will be 2 if we want a closing ']', or 3 if we should parse a
|
||||
// category and close with ":]".
|
||||
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == BACKSLASH) {
|
||||
++i;
|
||||
if (i < pattern.length()) {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = TRUE;
|
||||
if (c == 0x0075 /*u*/) {
|
||||
if ((i+4) >= pattern.length()) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
c = (UChar)0x0000;
|
||||
for (int32_t j=(++i)+4; i<j; ++i) { // [sic]
|
||||
int32_t digit = Unicode::digit(pattern.charAt(i), 16);
|
||||
if (digit<0) {
|
||||
// Only process escapes, variable references, and nested sets
|
||||
// if we are _not_ retrieving characters from the variable
|
||||
// buffer. Characters in the variable buffer have already
|
||||
// been through escape and variable reference processing.
|
||||
if (varValueBuffer == NULL) {
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == BACKSLASH) {
|
||||
++i;
|
||||
if (i < pattern.length()) {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = TRUE;
|
||||
if (c == 0x0075 /*u*/) {
|
||||
if ((i+4) >= pattern.length()) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
c = (UChar) ((c << 4) | digit);
|
||||
c = (UChar)0x0000;
|
||||
for (int32_t j=(++i)+4; i<j; ++i) { // [sic]
|
||||
int32_t digit = Unicode::digit(pattern.charAt(i), 16);
|
||||
if (digit<0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
c = (UChar) ((c << 4) | digit);
|
||||
}
|
||||
--i; // Move i back to last parsed character
|
||||
}
|
||||
--i; // Move i back to last parsed character
|
||||
}
|
||||
} else {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
}
|
||||
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
* variable refers to a UnicodeSet, nestedPairs is assigned here.
|
||||
* Variable names are only parsed if varNameToChar is not null.
|
||||
* Set variables are only looked up if varCharToSet is not null.
|
||||
*/
|
||||
else if (symbols != NULL && !isLiteral && c == VARIABLE_REF_OPEN) {
|
||||
++i;
|
||||
int32_t j = pattern.indexOf(VARIABLE_REF_CLOSE, i);
|
||||
UnicodeSet* set = NULL;
|
||||
if (i == j || j < 0) { // empty or unterminated
|
||||
// throw new IllegalArgumentException("Illegal variable reference");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
} else {
|
||||
scratch.truncate(0);
|
||||
pattern.extractBetween(i, j, scratch);
|
||||
symbols->lookup(scratch, c, set, status);
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
// Either the reference was ill-formed (empty name, or no
|
||||
// closing '}'), or the specified name is not defined.
|
||||
return pairsBuf;
|
||||
}
|
||||
isLiteral = TRUE;
|
||||
|
||||
if (set != NULL) {
|
||||
nestedPairs = &set->pairs;
|
||||
}
|
||||
i = j; // Make i point to '}'
|
||||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedPairs accordingly.
|
||||
*/
|
||||
else if (!isLiteral && c == SET_OPEN) {
|
||||
// Handle "[:...:]", representing a character category
|
||||
UChar d = charAfter(pattern, i);
|
||||
if (d == COLON) {
|
||||
i += 2;
|
||||
int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
|
||||
if (j < 0) {
|
||||
// throw new IllegalArgumentException("Missing \":]\"");
|
||||
} else {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
scratch.truncate(0);
|
||||
pattern.extractBetween(i, j, scratch);
|
||||
nestedPairs = &getCategoryPairs(nestedAux, scratch, status);
|
||||
if (U_FAILURE(status)) {
|
||||
}
|
||||
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
* variable refers to a UnicodeSet, its stand in character is
|
||||
* returned in the UChar[] buffer.
|
||||
* Variable names are only parsed if varNameToChar is not null.
|
||||
* Set variables are only looked up if varCharToSet is not null.
|
||||
*/
|
||||
else if (symbols != NULL && !isLiteral && c == SymbolTable::SYMBOL_REF) {
|
||||
pos.setIndex(++i);
|
||||
UnicodeString name = symbols->parseReference(pattern, pos, limit);
|
||||
if (name.length() == 0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
i = j+1; // Make i point to ']' in ":]"
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
pairsBuf.append(*nestedPairs);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i);
|
||||
nestedPairs = &parse(nestedAux, pattern, pos, symbols, status);
|
||||
if (U_FAILURE(status)) {
|
||||
varValueBuffer = symbols->lookup(name);
|
||||
if (varValueBuffer == NULL) {
|
||||
//throw new IllegalArgumentException("Undefined variable: "
|
||||
// + name);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
ivarValueBuffer = 0;
|
||||
i = pos.getIndex(); // Make i point PAST last char of var name
|
||||
continue; // Back to the top to get varValueBuffer[0]
|
||||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedPairs accordingly.
|
||||
*/
|
||||
else if (!isLiteral && c == SET_OPEN) {
|
||||
// Handle "[:...:]", representing a character category
|
||||
UChar d = charAfter(pattern, i);
|
||||
if (d == COLON) {
|
||||
i += 2;
|
||||
int32_t j = pattern.indexOf(CATEGORY_CLOSE, i);
|
||||
if (j < 0) {
|
||||
// throw new IllegalArgumentException("Missing \":]\"");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return pairsBuf;
|
||||
}
|
||||
scratch.truncate(0);
|
||||
pattern.extractBetween(i, j, scratch);
|
||||
nestedPairs = &getCategoryPairs(nestedAux, scratch, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
}
|
||||
i = j+1; // Make i point to ']' in ":]"
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
pairsBuf.append(*nestedPairs);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i);
|
||||
nestedPairs = &parse(nestedAux, pattern, pos, symbols, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return pairsBuf;
|
||||
}
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -117,9 +117,10 @@ void TransliteratorAPITest::TestgetInverse() {
|
|||
Transliterator* invt1 = Transliterator::createInstance("Latin-Kana");
|
||||
Transliterator* t2 = Transliterator::createInstance("Latin-Devanagari");
|
||||
Transliterator* invt2 = Transliterator::createInstance("Devanagari-Latin");
|
||||
if(t1 == 0 || invt1 == 0 || t2 == 0 || invt2 == 0)
|
||||
if(t1 == 0 || invt1 == 0 || t2 == 0 || invt2 == 0) {
|
||||
errln("FAIL: in instantiation");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
Transliterator* inverse1=t1->createInverse();
|
||||
Transliterator* inverse2=t2->createInverse();
|
||||
|
@ -235,7 +236,7 @@ void TransliteratorAPITest::TestTransliterate1(){
|
|||
"Unicode-Hex", "hello", "\\u0068\\u0065\\u006C\\u006C\\u006F" ,
|
||||
"Hex-Unicode", "\\u0068\\u0065\\u006C\\u006C\\u006F", "hello" ,
|
||||
"Latin-Devanagari", "bhaarata", CharsToUnicodeString("\\u092D\\u093E\\u0930\\u0924") ,
|
||||
"Devanagari-Latin", CharsToUnicodeString("\\u092D\\u093E\\u0930\\u0924"), "bhaaarata" ,
|
||||
"Devanagari-Latin", CharsToUnicodeString("\\u092D\\u093E\\u0930\\u0924"), "bhaarata" ,
|
||||
// "Contracted-Expanded", CharsToUnicodeString("\\u00C0\\u00C1\\u0042"), CharsToUnicodeString("\\u0041\\u0300\\u0041\\u0301\\u0042") ,
|
||||
// "Expanded-Contracted", CharsToUnicodeString("\\u0041\\u0300\\u0041\\u0301\\u0042"), CharsToUnicodeString("\\u00C0\\u00C1\\u0042") ,
|
||||
"Latin-Arabic", "aap", CharsToUnicodeString("\\u0627\\u06A4") ,
|
||||
|
@ -472,13 +473,17 @@ void TransliteratorAPITest::TestKeyboardTransliterator2(){
|
|||
t=Transliterator::createInstance("Unicode-Hex");
|
||||
if(t == 0)
|
||||
errln("FAIL : construction");
|
||||
keyboardAux(t, Data, rs, 0, 20);
|
||||
else {
|
||||
keyboardAux(t, Data, rs, 0, 20);
|
||||
delete t;
|
||||
}
|
||||
|
||||
rs="Hindi --";
|
||||
t=Transliterator::createInstance("Latin-Devanagari");
|
||||
if(t == 0)
|
||||
errln("FAIL : construction");
|
||||
keyboardAux(t, Data, rs, 20, 40);
|
||||
else
|
||||
keyboardAux(t, Data, rs, 20, 40);
|
||||
|
||||
|
||||
// rs="Add here:";
|
||||
|
|
|
@ -55,6 +55,9 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
|||
CASE(12,TestJ277);
|
||||
CASE(13,TestJ243);
|
||||
CASE(14,TestJ329);
|
||||
CASE(15,TestSegments);
|
||||
CASE(16,TestCursorOffset);
|
||||
CASE(17,TestArbitraryVariableValues);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -140,17 +143,15 @@ void TransliteratorTest::TestSimpleRules(void) {
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
RuleBasedTransliterator t(
|
||||
"<ID>",
|
||||
UnicodeString("dummy=").append((UChar)0xE100) +
|
||||
UnicodeString(
|
||||
";"
|
||||
" vowel = [aeiouAEIOU];"
|
||||
" lu = [:Lu:];"
|
||||
|
||||
" {vowel} ({lu}) > ! ;"
|
||||
" {vowel} > & ;"
|
||||
" !) {lu} > ^ ;"
|
||||
" {lu} > * ;"
|
||||
" a > ERROR", ""),
|
||||
UnicodeString("$dummy=").append((UChar)0xE100) +
|
||||
UnicodeString(";"
|
||||
"$vowel=[aeiouAEIOU];"
|
||||
"$lu=[:Lu:];"
|
||||
"$vowel } $lu > '!';"
|
||||
"$vowel > '&';"
|
||||
"'!' { $lu > '^';"
|
||||
"$lu > '*';"
|
||||
"a > ERROR", ""),
|
||||
status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: RBT constructor failed");
|
||||
|
@ -163,16 +164,16 @@ void TransliteratorTest::TestSimpleRules(void) {
|
|||
* Test inline set syntax and set variable syntax.
|
||||
*/
|
||||
void TransliteratorTest::TestInlineSet(void) {
|
||||
expect("[:Ll:] (x) > y; [:Ll:] > z;", "aAbxq", "zAyzz");
|
||||
expect("{ [:Ll:] } x > y; [:Ll:] > z;", "aAbxq", "zAyzz");
|
||||
expect("a[0-9]b > qrs", "1a7b9", "1qrs9");
|
||||
|
||||
expect(UnicodeString(
|
||||
"digit = [0-9];"
|
||||
"alpha = [a-zA-Z];"
|
||||
"alphanumeric = [{digit}{alpha}];" // ***
|
||||
"special = [^{alphanumeric}];" // ***
|
||||
"{alphanumeric} > -;"
|
||||
"{special} > *;", ""),
|
||||
"$digit = [0-9];"
|
||||
"$alpha = [a-zA-Z];"
|
||||
"$alphanumeric = [$digit $alpha];" // ***
|
||||
"$special = [^$alphanumeric];" // ***
|
||||
"$alphanumeric > '-';"
|
||||
"$special > '*';", ""),
|
||||
|
||||
"thx-1138", "---*----");
|
||||
}
|
||||
|
@ -498,6 +499,10 @@ void TransliteratorTest::TestPatternQuoting(void) {
|
|||
void TransliteratorTest::TestJ277(void) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator *gl = Transliterator::createInstance("Greek-Latin");
|
||||
if (gl == NULL) {
|
||||
errln("FAIL: createInstance(Greek-Latin) returned NULL");
|
||||
return;
|
||||
}
|
||||
|
||||
UChar sigma = 0x3C3;
|
||||
UChar upsilon = 0x3C5;
|
||||
|
@ -520,17 +525,17 @@ void TransliteratorTest::TestJ277(void) {
|
|||
|
||||
// Again, using a smaller rule set
|
||||
UnicodeString rules(
|
||||
"alpha = \\u03B1;"
|
||||
"nu = \\u03BD;"
|
||||
"sigma = \\u03C3;"
|
||||
"ypsilon = \\u03C5;"
|
||||
"vowel = [aeiouAEIOU{alpha}{ypsilon}];"
|
||||
"s <> {sigma};"
|
||||
"a <> {alpha};"
|
||||
"u <> ({vowel}) {ypsilon};"
|
||||
"y <> {ypsilon};"
|
||||
"n <> {nu};"
|
||||
);
|
||||
"$alpha = \\u03B1;"
|
||||
"$nu = \\u03BD;"
|
||||
"$sigma = \\u03C3;"
|
||||
"$ypsilon = \\u03C5;"
|
||||
"$vowel = [aeiouAEIOU$alpha$ypsilon];"
|
||||
"s <> $sigma;"
|
||||
"a <> $alpha;"
|
||||
"u <> $vowel { $ypsilon;"
|
||||
"y <> $ypsilon;"
|
||||
"n <> $nu;",
|
||||
"");
|
||||
RuleBasedTransliterator mini("mini", rules, Transliterator::REVERSE, status);
|
||||
if (U_FAILURE(status)) { errln("FAIL: Transliterator constructor failed"); return; }
|
||||
expect(mini, syn, "syn");
|
||||
|
@ -626,6 +631,100 @@ void TransliteratorTest::TestJ329(void) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test segments and segment references.
* Each rule string in DATA is used to construct a RuleBasedTransliterator,
* which is then handed to expect() together with the matching input and
* expected output. A failed construction is reported through errln().
* In the sample rule the parenthesized parts of the pattern are referred
* to as $1 and $2 on the output side; the expected output shows "c.1"
* becoming "1-c" and "z.4" becoming "4-z".
|
||||
*/
|
||||
void TransliteratorTest::TestSegments(void) {
|
||||
// Array of 3n items
|
||||
// Each item is <rules>, <input>, <expected output>
|
||||
UnicodeString DATA[] = {
|
||||
"([a-z]) '.' ([0-9]) > $2 '-' $1",
|
||||
"abc.123.xyz.456",
|
||||
"ab1-c23.xy4-z56",
|
||||
};
|
||||
// Number of strings in DATA (not the number of test cases); the loop
// below steps through them three at a time.
int32_t DATA_length = sizeof(DATA)/sizeof(*DATA);
|
||||
|
||||
for (int32_t i=0; i<DATA_length; i+=3) {
|
||||
logln("Pattern: " + prettify(DATA[i]));
|
||||
// A fresh status per case, so one failing rule set does not
// affect the cases that follow it.
UErrorCode status = U_ZERO_ERROR;
|
||||
RuleBasedTransliterator t("<ID>", DATA[i], status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: RBT constructor");
|
||||
} else {
|
||||
// DATA[i+1] is the input, DATA[i+2] the expected output.
expect(t, DATA[i+1], DATA[i+2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test cursor positioning outside of the key
* Each rule string in DATA is used to construct a RuleBasedTransliterator,
* which is then handed to expect() together with the matching input and
* expected output. A failed construction is reported through errln().
* NOTE(review): judging from the expected output, "| @" appears to leave
* the cursor one character before the replacement (so "eALPHA" is matched
* afterwards), and "@@ |" appears to leave it two characters past the
* replacement (so the following "post" is not rewritten to "xyz").
* Confirm against the rule syntax documentation.
|
||||
*/
|
||||
void TransliteratorTest::TestCursorOffset(void) {
|
||||
// Array of 3n items
|
||||
// Each item is <rules>, <input>, <expected output>
|
||||
UnicodeString DATA[] = {
|
||||
// The next four literals are adjacent and therefore form a single
// rule string made of four ';'-separated rules.
"pre {alpha} post > | @ ALPHA ;"
|
||||
"eALPHA > beta ;"
|
||||
"pre {beta} post > BETA @@ | ;"
|
||||
"post > xyz",
|
||||
|
||||
"prealphapost prebetapost",
|
||||
|
||||
"prbetaxyz preBETApost",
|
||||
};
|
||||
// Number of strings in DATA (not the number of test cases); the loop
// below steps through them three at a time.
int32_t DATA_length = sizeof(DATA)/sizeof(*DATA);
|
||||
|
||||
for (int32_t i=0; i<DATA_length; i+=3) {
|
||||
logln("Pattern: " + prettify(DATA[i]));
|
||||
// A fresh status per case, so one failing rule set does not
// affect the cases that follow it.
UErrorCode status = U_ZERO_ERROR;
|
||||
RuleBasedTransliterator t("<ID>", DATA[i], status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: RBT constructor");
|
||||
} else {
|
||||
// DATA[i+1] is the input, DATA[i+2] the expected output.
expect(t, DATA[i+1], DATA[i+2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test zero length and > 1 char length variable values. Test
|
||||
* use of variable refs in UnicodeSets.
* Each rule string in DATA is used to construct a RuleBasedTransliterator,
* which is then handed to expect() together with the matching input and
* expected output. A failed construction is reported through errln().
* The sample rule set defines a two-character value ($abe), a value that
* mixes literals with a set ($pat), a quoted value used inside set
* brackets ($ll within $llZ and $llY), and an empty value ($emp).
|
||||
*/
|
||||
void TransliteratorTest::TestArbitraryVariableValues(void) {
|
||||
// Array of 3n items
|
||||
// Each item is <rules>, <input>, <expected output>
|
||||
UnicodeString DATA[] = {
|
||||
// Variable definitions. The adjacent literals from here down to the
// empty "" literal form a single rule string.
"$abe = ab;"
|
||||
"$pat = x[yY]z;"
|
||||
"$ll = 'a-z';"
|
||||
"$llZ = [$ll];"
|
||||
"$llY = [$ll$pat];"
|
||||
"$emp = ;"
|
||||
|
||||
// Rules that use the variables defined above.
"$abe > ABE;"
|
||||
"$pat > END;"
|
||||
"$llZ > 1;"
|
||||
"$llY > 2;"
|
||||
"7$emp 8 > 9;"
|
||||
"",
|
||||
|
||||
"ab xYzxyz stY78",
|
||||
"ABE ENDEND 1129",
|
||||
};
|
||||
// Number of strings in DATA (not the number of test cases); the loop
// below steps through them three at a time.
int32_t DATA_length = sizeof(DATA)/sizeof(*DATA);
|
||||
|
||||
for (int32_t i=0; i<DATA_length; i+=3) {
|
||||
logln("Pattern: " + prettify(DATA[i]));
|
||||
// A fresh status per case, so one failing rule set does not
// affect the cases that follow it.
UErrorCode status = U_ZERO_ERROR;
|
||||
RuleBasedTransliterator t("<ID>", DATA[i], status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: RBT constructor");
|
||||
} else {
|
||||
// DATA[i+1] is the input, DATA[i+2] the expected output.
expect(t, DATA[i+1], DATA[i+2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -96,6 +96,22 @@ class TransliteratorTest : public IntlTest {
|
|||
*/
|
||||
void TestJ329(void);
|
||||
|
||||
/**
|
||||
* Test segments and segment references.
|
||||
*/
|
||||
void TestSegments();
|
||||
|
||||
/**
|
||||
* Test cursor positioning outside of the key
|
||||
*/
|
||||
void TestCursorOffset();
|
||||
|
||||
/**
|
||||
* Test zero length and > 1 char length variable values. Test
|
||||
* use of variable refs in UnicodeSets.
|
||||
*/
|
||||
void TestArbitraryVariableValues();
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
Loading…
Add table
Reference in a new issue