mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-1231 Major revision -- see bug report
X-SVN-Rev: 5901
This commit is contained in:
parent
56e8f94332
commit
249d435b2a
10 changed files with 2024 additions and 0 deletions
285
icu4j/src/com/ibm/icu/impl/data/Transliterator_Any_Accents.txt
Executable file
285
icu4j/src/com/ibm/icu/impl/data/Transliterator_Any_Accents.txt
Executable file
|
@ -0,0 +1,285 @@
|
|||
:: nfd;
|
||||
|
||||
# to do: make reversible
|
||||
|
||||
# define special conversion character
|
||||
|
||||
$x = \| ;
|
||||
|
||||
# Provide keyboard equivalents for common diacritics used in transliteration
|
||||
|
||||
\` $x <> \u0300 ; # COMBINING GRAVE ACCENT
|
||||
\' $x <> \u0301 ; # COMBINING ACUTE ACCENT
|
||||
\^ $x <> \u0302 ; # COMBINING CIRCUMFLEX ACCENT
|
||||
\~ $x <> \u0303 ; # COMBINING TILDE
|
||||
\- $x <> \u0304 ; # COMBINING MACRON
|
||||
\" $x <> \u0308 ; # COMBINING DIAERESIS
|
||||
\* $x <> \u030A ; # COMBINING RING ABOVE
|
||||
\, $x <> \u0327 ; # COMBINING CEDILLA
|
||||
'/' $x <> \u0338 ; # COMBINING LONG SOLIDUS OVERLAY
|
||||
\. $x <> \u0323 ; # COMBINING DOT BELOW
|
||||
|
||||
# Combine common characters
|
||||
|
||||
AE $x <> \u00C6 ; # LATIN CAPITAL LETTER AE
|
||||
ae $x <> \u00E6 ; # LATIN SMALL LETTER AE
|
||||
D $x <> \u00D0 ; # LATIN CAPITAL LETTER ETH
|
||||
d $x <> \u00F0 ; # LATIN SMALL LETTER ETH
|
||||
O'/' $x <> \u00D8 ; # LATIN CAPITAL LETTER O WITH STROKE
|
||||
o'/' $x <> \u00F8 ; # LATIN SMALL LETTER O WITH STROKE
|
||||
TH $x <> \u00DE ; # LATIN CAPITAL LETTER THORN
|
||||
th $x <> \u00FE ; # LATIN SMALL LETTER THORN
|
||||
OE $x <> \u0152 ; # LATIN CAPITAL LIGATURE OE
|
||||
oe $x <> \u0153 ; # LATIN SMALL LIGATURE OE
|
||||
|
||||
ss $x <> \u00DF ; # LATIN SMALL LETTER SHARP S
|
||||
|
||||
NG $x <> \u014A ; # LATIN CAPITAL LETTER ENG
|
||||
ng $x <> \u014B ; # LATIN SMALL LETTER ENG
|
||||
|
||||
T $x <> \u0398 ; # GREEK CAPITAL LETTER THETA
|
||||
t $x <> \u03B8 ; # GREEK SMALL LETTER THETA
|
||||
SH $x <> \u01A9 ; # LATIN CAPITAL LETTER ESH
|
||||
sh $x <> \u0283 ; # LATIN SMALL LETTER ESH
|
||||
ZH $x <> \u01B7 ; # LATIN CAPITAL LETTER EZH
|
||||
zh $x <> \u0292 ; # LATIN SMALL LETTER EZH
|
||||
|
||||
U $x <> \u01B1 ; # LATIN CAPITAL LETTER UPSILON
|
||||
u $x <> \u028A ; # LATIN SMALL LETTER UPSILON
|
||||
A $x <> \u018F ; # LATIN CAPITAL LETTER SCHWA
|
||||
a $x <> \u0259 ; # LATIN SMALL LETTER SCHWA
|
||||
O $x <> \u0186 ; # LATIN CAPITAL LETTER OPEN O
|
||||
o $x <> \u0254 ; # LATIN SMALL LETTER OPEN O
|
||||
E $x <> \u0190 ; # LATIN CAPITAL LETTER OPEN E
|
||||
e $x <> \u025B ; # LATIN SMALL LETTER OPEN E
|
||||
|
||||
# three that don't have uppercases
|
||||
|
||||
'?' $x <> \u0294 ; # LATIN LETTER GLOTTAL STOP
|
||||
i $x <> \u026A ; # LATIN LETTER SMALL CAPITAL I
|
||||
v $x <> \u028C ; # LATIN SMALL LETTER TURNED V
|
||||
|
||||
$x > ; # delete any left-overs
|
||||
|
||||
# Additional Characters that may be added in the future
|
||||
|
||||
# xxx $x <> \u0306 ; # COMBINING BREVE
|
||||
# xxx $x <> \u0307 ; # COMBINING DOT ABOVE
|
||||
# xxx $x <> \u0309 ; # COMBINING HOOK ABOVE
|
||||
# xxx $x <> \u030B ; # COMBINING DOUBLE ACUTE ACCENT
|
||||
# xxx $x <> \u030C ; # COMBINING CARON
|
||||
# xxx $x <> \u030F ; # COMBINING DOUBLE GRAVE ACCENT
|
||||
# xxx $x <> \u0311 ; # COMBINING INVERTED BREVE
|
||||
# xxx $x <> \u0313 ; # COMBINING COMMA ABOVE
|
||||
# xxx $x <> \u0314 ; # COMBINING REVERSED COMMA ABOVE
|
||||
# xxx $x <> \u031B ; # COMBINING HORN
|
||||
# xxx $x <> \u0324 ; # COMBINING DIAERESIS BELOW
|
||||
# xxx $x <> \u0325 ; # COMBINING RING BELOW
|
||||
# xxx $x <> \u0326 ; # COMBINING COMMA BELOW
|
||||
# xxx $x <> \u0328 ; # COMBINING OGONEK
|
||||
# xxx $x <> \u032D ; # COMBINING CIRCUMFLEX ACCENT BELOW
|
||||
# xxx $x <> \u032E ; # COMBINING BREVE BELOW
|
||||
# xxx $x <> \u0330 ; # COMBINING TILDE BELOW
|
||||
# xxx $x <> \u0331 ; # COMBINING MACRON BELOW
|
||||
|
||||
# yyy $x <> \u00AA ; # FEMININE ORDINAL INDICATOR
|
||||
# yyy $x <> \u00BA ; # MASCULINE ORDINAL INDICATOR
|
||||
# yyy $x <> \u0110 ; # LATIN CAPITAL LETTER D WITH STROKE
|
||||
# yyy $x <> \u0111 ; # LATIN SMALL LETTER D WITH STROKE
|
||||
# yyy $x <> \u0126 ; # LATIN CAPITAL LETTER H WITH STROKE
|
||||
# yyy $x <> \u0127 ; # LATIN SMALL LETTER H WITH STROKE
|
||||
# yyy $x <> \u0131 ; # LATIN SMALL LETTER DOTLESS I
|
||||
# yyy $x <> \u0138 ; # LATIN SMALL LETTER KRA
|
||||
# yyy $x <> \u013F ; # LATIN CAPITAL LETTER L WITH MIDDLE DOT
|
||||
# yyy $x <> \u0140 ; # LATIN SMALL LETTER L WITH MIDDLE DOT
|
||||
# yyy $x <> \u0141 ; # LATIN CAPITAL LETTER L WITH STROKE
|
||||
# yyy $x <> \u0142 ; # LATIN SMALL LETTER L WITH STROKE
|
||||
# yyy $x <> \u0149 ; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
|
||||
# yyy $x <> \u0166 ; # LATIN CAPITAL LETTER T WITH STROKE
|
||||
# yyy $x <> \u0167 ; # LATIN SMALL LETTER T WITH STROKE
|
||||
# yyy $x <> \u017F ; # LATIN SMALL LETTER LONG S
|
||||
# yyy $x <> \u0180 ; # LATIN SMALL LETTER B WITH STROKE
|
||||
# yyy $x <> \u0181 ; # LATIN CAPITAL LETTER B WITH HOOK
|
||||
# yyy $x <> \u0182 ; # LATIN CAPITAL LETTER B WITH TOPBAR
|
||||
# yyy $x <> \u0183 ; # LATIN SMALL LETTER B WITH TOPBAR
|
||||
# yyy $x <> \u0184 ; # LATIN CAPITAL LETTER TONE SIX
|
||||
# yyy $x <> \u0185 ; # LATIN SMALL LETTER TONE SIX
|
||||
# yyy $x <> \u0187 ; # LATIN CAPITAL LETTER C WITH HOOK
|
||||
# yyy $x <> \u0188 ; # LATIN SMALL LETTER C WITH HOOK
|
||||
# yyy $x <> \u0189 ; # LATIN CAPITAL LETTER AFRICAN D
|
||||
# yyy $x <> \u018A ; # LATIN CAPITAL LETTER D WITH HOOK
|
||||
# yyy $x <> \u018B ; # LATIN CAPITAL LETTER D WITH TOPBAR
|
||||
# yyy $x <> \u018C ; # LATIN SMALL LETTER D WITH TOPBAR
|
||||
# yyy $x <> \u018D ; # LATIN SMALL LETTER TURNED DELTA
|
||||
# yyy $x <> \u018E ; # LATIN CAPITAL LETTER REVERSED E
|
||||
# yyy $x <> \u0191 ; # LATIN CAPITAL LETTER F WITH HOOK
|
||||
# yyy $x <> \u0192 ; # LATIN SMALL LETTER F WITH HOOK
|
||||
# yyy $x <> \u0193 ; # LATIN CAPITAL LETTER G WITH HOOK
|
||||
# yyy $x <> \u0194 ; # LATIN CAPITAL LETTER GAMMA
|
||||
# yyy $x <> \u0195 ; # LATIN SMALL LETTER HV
|
||||
# yyy $x <> \u0196 ; # LATIN CAPITAL LETTER IOTA
|
||||
# yyy $x <> \u0197 ; # LATIN CAPITAL LETTER I WITH STROKE
|
||||
# yyy $x <> \u0198 ; # LATIN CAPITAL LETTER K WITH HOOK
|
||||
# yyy $x <> \u0199 ; # LATIN SMALL LETTER K WITH HOOK
|
||||
# yyy $x <> \u019A ; # LATIN SMALL LETTER L WITH BAR
|
||||
# yyy $x <> \u019B ; # LATIN SMALL LETTER LAMBDA WITH STROKE
|
||||
# yyy $x <> \u019C ; # LATIN CAPITAL LETTER TURNED M
|
||||
# yyy $x <> \u019D ; # LATIN CAPITAL LETTER N WITH LEFT HOOK
|
||||
# yyy $x <> \u019E ; # LATIN SMALL LETTER N WITH LONG RIGHT LEG
|
||||
# yyy $x <> \u019F ; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
|
||||
# yyy $x <> \u01A2 ; # LATIN CAPITAL LETTER OI
|
||||
# yyy $x <> \u01A3 ; # LATIN SMALL LETTER OI
|
||||
# yyy $x <> \u01A4 ; # LATIN CAPITAL LETTER P WITH HOOK
|
||||
# yyy $x <> \u01A5 ; # LATIN SMALL LETTER P WITH HOOK
|
||||
# yyy $x <> \u01A6 ; # LATIN LETTER YR
|
||||
# yyy $x <> \u01A7 ; # LATIN CAPITAL LETTER TONE TWO
|
||||
# yyy $x <> \u01A8 ; # LATIN SMALL LETTER TONE TWO
|
||||
# yyy $x <> \u01AA ; # LATIN LETTER REVERSED ESH LOOP
|
||||
# yyy $x <> \u01AB ; # LATIN SMALL LETTER T WITH PALATAL HOOK
|
||||
# yyy $x <> \u01AC ; # LATIN CAPITAL LETTER T WITH HOOK
|
||||
# yyy $x <> \u01AD ; # LATIN SMALL LETTER T WITH HOOK
|
||||
# yyy $x <> \u01AE ; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u01B2 ; # LATIN CAPITAL LETTER V WITH HOOK
|
||||
# yyy $x <> \u01B3 ; # LATIN CAPITAL LETTER Y WITH HOOK
|
||||
# yyy $x <> \u01B4 ; # LATIN SMALL LETTER Y WITH HOOK
|
||||
# yyy $x <> \u01B5 ; # LATIN CAPITAL LETTER Z WITH STROKE
|
||||
# yyy $x <> \u01B6 ; # LATIN SMALL LETTER Z WITH STROKE
|
||||
# yyy $x <> \u01B8 ; # LATIN CAPITAL LETTER EZH REVERSED
|
||||
# yyy $x <> \u01B9 ; # LATIN SMALL LETTER EZH REVERSED
|
||||
# yyy $x <> \u01BA ; # LATIN SMALL LETTER EZH WITH TAIL
|
||||
# yyy $x <> \u01BB ; # LATIN LETTER TWO WITH STROKE
|
||||
# yyy $x <> \u01BC ; # LATIN CAPITAL LETTER TONE FIVE
|
||||
# yyy $x <> \u01BD ; # LATIN SMALL LETTER TONE FIVE
|
||||
# yyy $x <> \u01BE ; # LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE
|
||||
# yyy $x <> \u01BF ; # LATIN LETTER WYNN
|
||||
# yyy $x <> \u01C0 ; # LATIN LETTER DENTAL CLICK
|
||||
# yyy $x <> \u01C1 ; # LATIN LETTER LATERAL CLICK
|
||||
# yyy $x <> \u01C2 ; # LATIN LETTER ALVEOLAR CLICK
|
||||
# yyy $x <> \u01C3 ; # LATIN LETTER RETROFLEX CLICK
|
||||
# yyy $x <> \u01C4 ; # LATIN CAPITAL LETTER DZ WITH CARON
|
||||
# yyy $x <> \u01C5 ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
|
||||
# yyy $x <> \u01C6 ; # LATIN SMALL LETTER DZ WITH CARON
|
||||
# yyy $x <> \u01C7 ; # LATIN CAPITAL LETTER LJ
|
||||
# yyy $x <> \u01C8 ; # LATIN CAPITAL LETTER L WITH SMALL LETTER J
|
||||
# yyy $x <> \u01C9 ; # LATIN SMALL LETTER LJ
|
||||
# yyy $x <> \u01CA ; # LATIN CAPITAL LETTER NJ
|
||||
# yyy $x <> \u01CB ; # LATIN CAPITAL LETTER N WITH SMALL LETTER J
|
||||
# yyy $x <> \u01CC ; # LATIN SMALL LETTER NJ
|
||||
# yyy $x <> \u01DD ; # LATIN SMALL LETTER TURNED E
|
||||
# yyy $x <> \u01E4 ; # LATIN CAPITAL LETTER G WITH STROKE
|
||||
# yyy $x <> \u01E5 ; # LATIN SMALL LETTER G WITH STROKE
|
||||
# yyy $x <> \u01F1 ; # LATIN CAPITAL LETTER DZ
|
||||
# yyy $x <> \u01F2 ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
|
||||
# yyy $x <> \u01F3 ; # LATIN SMALL LETTER DZ
|
||||
# yyy $x <> \u01F6 ; # LATIN CAPITAL LETTER HWAIR
|
||||
# yyy $x <> \u01F7 ; # LATIN CAPITAL LETTER WYNN
|
||||
# yyy $x <> \u021C ; # LATIN CAPITAL LETTER YOGH
|
||||
# yyy $x <> \u021D ; # LATIN SMALL LETTER YOGH
|
||||
# yyy $x <> \u0222 ; # LATIN CAPITAL LETTER OU
|
||||
# yyy $x <> \u0223 ; # LATIN SMALL LETTER OU
|
||||
# yyy $x <> \u0224 ; # LATIN CAPITAL LETTER Z WITH HOOK
|
||||
# yyy $x <> \u0225 ; # LATIN SMALL LETTER Z WITH HOOK
|
||||
# yyy $x <> \u0250 ; # LATIN SMALL LETTER TURNED A
|
||||
# yyy $x <> \u0251 ; # LATIN SMALL LETTER ALPHA
|
||||
# yyy $x <> \u0252 ; # LATIN SMALL LETTER TURNED ALPHA
|
||||
# yyy $x <> \u0253 ; # LATIN SMALL LETTER B WITH HOOK
|
||||
# yyy $x <> \u0255 ; # LATIN SMALL LETTER C WITH CURL
|
||||
# yyy $x <> \u0256 ; # LATIN SMALL LETTER D WITH TAIL
|
||||
# yyy $x <> \u0257 ; # LATIN SMALL LETTER D WITH HOOK
|
||||
# yyy $x <> \u0258 ; # LATIN SMALL LETTER REVERSED E
|
||||
# yyy $x <> \u025A ; # LATIN SMALL LETTER SCHWA WITH HOOK
|
||||
# yyy $x <> \u025C ; # LATIN SMALL LETTER REVERSED OPEN E
|
||||
# yyy $x <> \u025D ; # LATIN SMALL LETTER REVERSED OPEN E WITH HOOK
|
||||
# yyy $x <> \u025E ; # LATIN SMALL LETTER CLOSED REVERSED OPEN E
|
||||
# yyy $x <> \u025F ; # LATIN SMALL LETTER DOTLESS J WITH STROKE
|
||||
# yyy $x <> \u0260 ; # LATIN SMALL LETTER G WITH HOOK
|
||||
# yyy $x <> \u0261 ; # LATIN SMALL LETTER SCRIPT G
|
||||
# yyy $x <> \u0262 ; # LATIN LETTER SMALL CAPITAL G
|
||||
# yyy $x <> \u0263 ; # LATIN SMALL LETTER GAMMA
|
||||
# yyy $x <> \u0264 ; # LATIN SMALL LETTER RAMS HORN
|
||||
# yyy $x <> \u0265 ; # LATIN SMALL LETTER TURNED H
|
||||
# yyy $x <> \u0266 ; # LATIN SMALL LETTER H WITH HOOK
|
||||
# yyy $x <> \u0267 ; # LATIN SMALL LETTER HENG WITH HOOK
|
||||
# yyy $x <> \u0268 ; # LATIN SMALL LETTER I WITH STROKE
|
||||
# yyy $x <> \u0269 ; # LATIN SMALL LETTER IOTA
|
||||
# yyy $x <> \u026B ; # LATIN SMALL LETTER L WITH MIDDLE TILDE
|
||||
# yyy $x <> \u026C ; # LATIN SMALL LETTER L WITH BELT
|
||||
# yyy $x <> \u026D ; # LATIN SMALL LETTER L WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u026E ; # LATIN SMALL LETTER LEZH
|
||||
# yyy $x <> \u026F ; # LATIN SMALL LETTER TURNED M
|
||||
# yyy $x <> \u0270 ; # LATIN SMALL LETTER TURNED M WITH LONG LEG
|
||||
# yyy $x <> \u0271 ; # LATIN SMALL LETTER M WITH HOOK
|
||||
# yyy $x <> \u0272 ; # LATIN SMALL LETTER N WITH LEFT HOOK
|
||||
# yyy $x <> \u0273 ; # LATIN SMALL LETTER N WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u0274 ; # LATIN LETTER SMALL CAPITAL N
|
||||
# yyy $x <> \u0275 ; # LATIN SMALL LETTER BARRED O
|
||||
# yyy $x <> \u0276 ; # LATIN LETTER SMALL CAPITAL OE
|
||||
# yyy $x <> \u0277 ; # LATIN SMALL LETTER CLOSED OMEGA
|
||||
# yyy $x <> \u0278 ; # LATIN SMALL LETTER PHI
|
||||
# yyy $x <> \u0279 ; # LATIN SMALL LETTER TURNED R
|
||||
# yyy $x <> \u027A ; # LATIN SMALL LETTER TURNED R WITH LONG LEG
|
||||
# yyy $x <> \u027B ; # LATIN SMALL LETTER TURNED R WITH HOOK
|
||||
# yyy $x <> \u027C ; # LATIN SMALL LETTER R WITH LONG LEG
|
||||
# yyy $x <> \u027D ; # LATIN SMALL LETTER R WITH TAIL
|
||||
# yyy $x <> \u027E ; # LATIN SMALL LETTER R WITH FISHHOOK
|
||||
# yyy $x <> \u027F ; # LATIN SMALL LETTER REVERSED R WITH FISHHOOK
|
||||
# yyy $x <> \u0280 ; # LATIN LETTER SMALL CAPITAL R
|
||||
# yyy $x <> \u0281 ; # LATIN LETTER SMALL CAPITAL INVERTED R
|
||||
# yyy $x <> \u0282 ; # LATIN SMALL LETTER S WITH HOOK
|
||||
# yyy $x <> \u0284 ; # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK
|
||||
# yyy $x <> \u0285 ; # LATIN SMALL LETTER SQUAT REVERSED ESH
|
||||
# yyy $x <> \u0286 ; # LATIN SMALL LETTER ESH WITH CURL
|
||||
# yyy $x <> \u0287 ; # LATIN SMALL LETTER TURNED T
|
||||
# yyy $x <> \u0288 ; # LATIN SMALL LETTER T WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u0289 ; # LATIN SMALL LETTER U BAR
|
||||
# yyy $x <> \u028B ; # LATIN SMALL LETTER V WITH HOOK
|
||||
# yyy $x <> \u028D ; # LATIN SMALL LETTER TURNED W
|
||||
# yyy $x <> \u028E ; # LATIN SMALL LETTER TURNED Y
|
||||
# yyy $x <> \u028F ; # LATIN LETTER SMALL CAPITAL Y
|
||||
# yyy $x <> \u0290 ; # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u0291 ; # LATIN SMALL LETTER Z WITH CURL
|
||||
# yyy $x <> \u0293 ; # LATIN SMALL LETTER EZH WITH CURL
|
||||
# yyy $x <> \u0294 ; # LATIN LETTER GLOTTAL STOP
|
||||
# yyy $x <> \u0295 ; # LATIN LETTER PHARYNGEAL VOICED FRICATIVE
|
||||
# yyy $x <> \u0296 ; # LATIN LETTER INVERTED GLOTTAL STOP
|
||||
# yyy $x <> \u0297 ; # LATIN LETTER STRETCHED C
|
||||
# yyy $x <> \u0298 ; # LATIN LETTER BILABIAL CLICK
|
||||
# yyy $x <> \u0299 ; # LATIN LETTER SMALL CAPITAL B
|
||||
# yyy $x <> \u029A ; # LATIN SMALL LETTER CLOSED OPEN E
|
||||
# yyy $x <> \u029B ; # LATIN LETTER SMALL CAPITAL G WITH HOOK
|
||||
# yyy $x <> \u029C ; # LATIN LETTER SMALL CAPITAL H
|
||||
# yyy $x <> \u029D ; # LATIN SMALL LETTER J WITH CROSSED-TAIL
|
||||
# yyy $x <> \u029E ; # LATIN SMALL LETTER TURNED K
|
||||
# yyy $x <> \u029F ; # LATIN LETTER SMALL CAPITAL L
|
||||
# yyy $x <> \u02A0 ; # LATIN SMALL LETTER Q WITH HOOK
|
||||
# yyy $x <> \u02A1 ; # LATIN LETTER GLOTTAL STOP WITH STROKE
|
||||
# yyy $x <> \u02A2 ; # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE
|
||||
# yyy $x <> \u02A3 ; # LATIN SMALL LETTER DZ DIGRAPH
|
||||
# yyy $x <> \u02A4 ; # LATIN SMALL LETTER DEZH DIGRAPH
|
||||
# yyy $x <> \u02A5 ; # LATIN SMALL LETTER DZ DIGRAPH WITH CURL
|
||||
# yyy $x <> \u02A6 ; # LATIN SMALL LETTER TS DIGRAPH
|
||||
# yyy $x <> \u02A7 ; # LATIN SMALL LETTER TESH DIGRAPH
|
||||
# yyy $x <> \u02A8 ; # LATIN SMALL LETTER TC DIGRAPH WITH CURL
|
||||
# yyy $x <> \u02A9 ; # LATIN SMALL LETTER FENG DIGRAPH
|
||||
# yyy $x <> \u02AA ; # LATIN SMALL LETTER LS DIGRAPH
|
||||
# yyy $x <> \u02AB ; # LATIN SMALL LETTER LZ DIGRAPH
|
||||
# yyy $x <> \u02AC ; # LATIN LETTER BILABIAL PERCUSSIVE
|
||||
# yyy $x <> \u02AD ; # LATIN LETTER BIDENTAL PERCUSSIVE
|
||||
# yyy $x <> \u02B0 ; # MODIFIER LETTER SMALL H
|
||||
# yyy $x <> \u02B1 ; # MODIFIER LETTER SMALL H WITH HOOK
|
||||
# yyy $x <> \u02B2 ; # MODIFIER LETTER SMALL J
|
||||
# yyy $x <> \u02B3 ; # MODIFIER LETTER SMALL R
|
||||
# yyy $x <> \u02B4 ; # MODIFIER LETTER SMALL TURNED R
|
||||
# yyy $x <> \u02B5 ; # MODIFIER LETTER SMALL TURNED R WITH HOOK
|
||||
# yyy $x <> \u02B6 ; # MODIFIER LETTER SMALL CAPITAL INVERTED R
|
||||
# yyy $x <> \u02B7 ; # MODIFIER LETTER SMALL W
|
||||
# yyy $x <> \u02B8 ; # MODIFIER LETTER SMALL Y
|
||||
# yyy $x <> \u02E0 ; # MODIFIER LETTER SMALL GAMMA
|
||||
# yyy $x <> \u02E1 ; # MODIFIER LETTER SMALL L
|
||||
# yyy $x <> \u02E2 ; # MODIFIER LETTER SMALL S
|
||||
# yyy $x <> \u02E3 ; # MODIFIER LETTER SMALL X
|
||||
# yyy $x <> \u02E4 ; # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
|
||||
# yyy $x <> \u1E9A ; # LATIN SMALL LETTER A WITH RIGHT HALF RING
|
||||
# yyy $x <> \u207F ; # SUPERSCRIPT LATIN SMALL LETTER N
|
||||
|
||||
:: NFC;
|
31
icu4j/src/com/ibm/icu/impl/data/Transliterator_Any_Publishing.txt
Executable file
31
icu4j/src/com/ibm/icu/impl/data/Transliterator_Any_Publishing.txt
Executable file
|
@ -0,0 +1,31 @@
|
|||
# Test case
|
||||
# "The" "(quick)" ('brown') `fox' ` jumped -- "over?"
|
||||
|
||||
# Variables
|
||||
|
||||
$single = \' ;
|
||||
$space = ' ' ;
|
||||
$double = \" ;
|
||||
$back = \` ;
|
||||
$tab = '\u0009' ; # U+0009 CHARACTER TABULATION (U+0008 is BACKSPACE, not tab)
|
||||
$makeRight = [[:Z:][:Ps:][:Pi:]] ;
|
||||
|
||||
# fix UNIX quotes
|
||||
|
||||
$back $back > “ ;
|
||||
$back > ‘ ;
|
||||
|
||||
# fix typewriter quotes, by context
|
||||
|
||||
$makeRight {$double} <> “ ;
|
||||
^ {$double} > “ ;
|
||||
$double <> ” ;
|
||||
|
||||
$makeRight {$single} <> ‘ ;
|
||||
^ {$single} > ‘ ;
|
||||
$single <> ’;
|
||||
|
||||
# fix multiple spaces and hyphens
|
||||
|
||||
$space {$space} > ;
|
||||
'--' <> — ;
|
242
icu4j/src/com/ibm/icu/impl/data/Transliterator_Greek_Latin.txt
Executable file
242
icu4j/src/com/ibm/icu/impl/data/Transliterator_Greek_Latin.txt
Executable file
|
@ -0,0 +1,242 @@
|
|||
# Rules are predicated on running NFD first, and NFC afterwards
|
||||
::NFD ;
|
||||
|
||||
# TEST CASES
|
||||
|
||||
# Ὀλίγοι ἔμφρονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος
|
||||
# ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ
|
||||
# ᾳ ῃ ῳ ὃ ὄ
|
||||
# ὠς ὡς ὢς ὣς
|
||||
# Ὠς Ὡς Ὢς Ὣς
|
||||
# ὨΣ ὩΣ ὪΣ ὫΣ
|
||||
# Ạ, ạ, Ẹ, ẹ, Ọ, ọ
|
||||
|
||||
# Useful variables
|
||||
|
||||
$lower = [:Ll:] ;
|
||||
$upper = [:Lu:] ;
|
||||
$accent = [:M:] ;
|
||||
|
||||
$macron = \u0304 ;
|
||||
$ddot = \u0308 ;
|
||||
|
||||
$lcgvowel = [αεηιουω] ;
|
||||
$ucgvowel = [ΑΕΗΙΟΥΩ] ;
|
||||
$gvowel = [$lcgvowel $ucgvowel] ;
|
||||
$lcgvowelC = [$lcgvowel $accent] ;
|
||||
|
||||
$vowel = [ AEIOUaeiou $gvowel] ;
|
||||
|
||||
$beforeLower = $accent * $lower ;
|
||||
|
||||
$gammaLike = [ΓΚΞΧγκξχ] ;
|
||||
$smooth = ̓ ;
|
||||
$rough = ̔ ;
|
||||
$iotasub = ͅ ;
|
||||
|
||||
# Fix punctuation
|
||||
|
||||
\; <> \? ;
|
||||
· <> \: ;
|
||||
|
||||
# CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve
|
||||
|
||||
\u0342 <> \u0302 ;
|
||||
|
||||
# IOTA: convert iota subscript to iota
|
||||
# first make previous alpha long!
|
||||
|
||||
Α } $accent * $iotasub > A $macron ;
|
||||
α } $accent * $iotasub > a $macron ;
|
||||
|
||||
# now convert to uppercase if after uppercase, otherwise to lowercase
|
||||
|
||||
$upper $accent * { $iotasub > I ;
|
||||
$iotasub > i ;
|
||||
|
||||
| $1 $iotasub < ([:L:] $macron [:M:]*) i ;
|
||||
|
||||
# BREATHING
|
||||
|
||||
$smooth > ; #delete smooth breathing
|
||||
|
||||
# Convert rough breathing to h, and move before letters.
|
||||
|
||||
# Make A ` x => H a x
|
||||
|
||||
Α $rough } $beforeLower > H | α ;
|
||||
Ε $rough } $beforeLower > H | ε;
|
||||
Η $rough } $beforeLower > H | η ;
|
||||
Ι ($ddot?) $rough } $beforeLower > H | ι $1;
|
||||
Ο $rough } $beforeLower > H | ο ;
|
||||
Υ $rough } $beforeLower > H | υ ;
|
||||
Ω ($ddot?) $rough } $beforeLower > H | ω $1;
|
||||
|
||||
# Make A x ` => H a x
|
||||
|
||||
Α ($lower) $rough > H | α $1 ;
|
||||
Ε ($lower) $rough > H | ε $1 ;
|
||||
Η ($lower) $rough > H | η $1 ;
|
||||
Ι ($lower $ddot?) $rough > H | ι $1 ;
|
||||
Ο ($lower) $rough > H | ο $1 ;
|
||||
Υ ($lower) $rough > H | υ $1 ;
|
||||
Ω ($lower $ddot?) $rough > H | ω $1 ;
|
||||
|
||||
#Otherwise, make x ` into h x and X ` into H X
|
||||
|
||||
($lcgvowel + $ddot? ) $rough > h | $1 ;
|
||||
($gvowel + $ddot? ) $rough > H | $1 ;
|
||||
|
||||
# Go backwards with H
|
||||
|
||||
| $1 $rough < h ([aeiouyAEIOUY] $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < h ([aeiouyAEIOUY] $macron? $ddot?) ;
|
||||
|
||||
| $1 $rough < H ([AEIOUY] $macron? $ddot?[aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H ([AEIOUY] $macron? $ddot?) ;
|
||||
|
||||
# titlecase, have to fix individually
|
||||
| $1 $rough < H (a $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (e $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (i $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (o $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (u $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (y $macron? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (a $macron? $ddot? ) ;
|
||||
| $1 $rough < H (e $macron? $ddot? ) ;
|
||||
| $1 $rough < H (i $macron? $ddot? ) ;
|
||||
| $1 $rough < H (o $macron? $ddot? ) ;
|
||||
| $1 $rough < H (u $macron? $ddot? ) ;
|
||||
| $1 $rough < H (y $macron? $ddot? ) ;
|
||||
|
||||
# Now do smooth
|
||||
|
||||
ηυ $1 $smooth < [^[:L:][:M:]] { e $macron u ( $ddot? ) };
|
||||
η $smooth < [^[:L:][:M:]] { e $macron };
|
||||
ω $smooth < [^[:L:][:M:]] { o $macron };
|
||||
|
||||
αυ $1 $smooth < [^[:L:][:M:]] { au ( $ddot? ) };
|
||||
αι $1 $smooth < [^[:L:][:M:]] { ai ( $ddot? ) };
|
||||
α $smooth < [^[:L:][:M:]] { a };
|
||||
ευ $1 $smooth < [^[:L:][:M:]] { eu ( $ddot? ) };
|
||||
ει $1 $smooth < [^[:L:][:M:]] { ei ( $ddot? ) };
|
||||
ε $smooth < [^[:L:][:M:]] { e };
|
||||
ι $smooth < [^[:L:][:M:]] { i };
|
||||
ου $1 $smooth < [^[:L:][:M:]] { ou ( $ddot? ) };
|
||||
οι $1 $smooth < [^[:L:][:M:]] { oi ( $ddot? ) };
|
||||
ο $smooth < [^[:L:][:M:]] { o };
|
||||
υι $1 $smooth < [^[:L:][:M:]] { ui ( $ddot? ) }; # digraph first, so the single-vowel rule below cannot mask it
|
||||
υ $smooth < [^[:L:][:M:]] { u };
|
||||
|
||||
|
||||
# seems to cause infinite loop
|
||||
# | $1 $smooth < [:^L:] { ([aeiouyAEIOUY] $macron? [aeiouyAEIOUY] $macron?) } [^[$smooth]] ;
|
||||
# | $1 $smooth < [:^L:] { ([aeiouyAEIOUY] $macron?) } [[^aeiouyAEIOUY] [$smooth]] ;
|
||||
|
||||
# TODO: preserve smooth/rough breathing if not
|
||||
# on initial vowel sequence
|
||||
|
||||
# need to have these up here so the rules don't mask
|
||||
|
||||
η <> e $macron ;
|
||||
Η <> E $macron ;
|
||||
|
||||
φ <> ph ;
|
||||
Ψ } $beforeLower <> Ps ;
|
||||
Ψ <> PS ;
|
||||
|
||||
Φ } $beforeLower <> Ph ;
|
||||
Φ <> PH ;
|
||||
ψ <> ps ;
|
||||
|
||||
ω <> o $macron ;
|
||||
Ω <> O $macron;
|
||||
|
||||
# NORMAL
|
||||
|
||||
α <> a ;
|
||||
Α <> A ;
|
||||
|
||||
β <> b ;
|
||||
Β <> B ;
|
||||
|
||||
γ } $gammaLike <> n } [GKXCgkxc] ; # nasal gamma: reverse context mirrors $gammaLike (g, k, x, ch) in both cases
|
||||
γ <> g ;
|
||||
Γ } $gammaLike <> N } [GKXCgkxc] ; # same as above for capital gamma
|
||||
Γ <> G ;
|
||||
|
||||
δ <> d ;
|
||||
Δ <> D ;
|
||||
|
||||
ε <> e ;
|
||||
Ε <> E ;
|
||||
|
||||
ζ <> z ;
|
||||
Ζ <> Z ;
|
||||
|
||||
θ <> th ;
|
||||
Θ } $beforeLower <> Th ;
|
||||
Θ <> TH ;
|
||||
|
||||
ι <> i ;
|
||||
Ι <> I ;
|
||||
|
||||
κ <> k ;
|
||||
Κ <> K ;
|
||||
|
||||
λ <> l ;
|
||||
Λ <> L ;
|
||||
|
||||
μ <> m ;
|
||||
Μ <> M ;
|
||||
|
||||
ν } $gammaLike <> n\' ; # apostrophe keeps real nu distinct from nasal gamma; bidirectional so n' round-trips to nu
|
||||
ν <> n ;
|
||||
Ν } $gammaLike <> N\' ;
|
||||
Ν <> N ;
|
||||
|
||||
ξ <> x ;
|
||||
Ξ <> X ;
|
||||
|
||||
ο <> o ;
|
||||
Ο <> O ;
|
||||
|
||||
π <> p ;
|
||||
Π <> P ;
|
||||
|
||||
ρ $rough <> rh;
|
||||
Ρ $rough } $beforeLower <> Rh ;
|
||||
Ρ $rough <> RH ;
|
||||
ρ <> r ;
|
||||
Ρ <> R ;
|
||||
|
||||
[Pp] {ς > \'s ;
|
||||
[Pp] {σ > \'s ;
|
||||
σ < [:^L:] [:M:]* { s } [:^L:] ;
|
||||
ς <> s } [:^L:] ;
|
||||
σ <> s ;
|
||||
[Pp] { Σ <> \'S ;
|
||||
Σ <> S ;
|
||||
|
||||
τ <> t ;
|
||||
Τ <> T ;
|
||||
|
||||
$vowel {υ } <> u ;
|
||||
υ <> y ;
|
||||
$vowel { Υ <> U ;
|
||||
Υ <> Y ;
|
||||
|
||||
χ <> ch ;
|
||||
Χ } $beforeLower <> Ch ;
|
||||
Χ <> CH ;
|
||||
|
||||
# completeness for ASCII
|
||||
|
||||
| k < c ;
|
||||
| ph < f ;
|
||||
| i < j ;
|
||||
| k < q ;
|
||||
| u < v ;
|
||||
| u < w ;
|
||||
|
||||
::NFC ;
|
8
icu4j/src/com/ibm/icu/impl/data/Transliterator_Hiragana_Latin.txt
Executable file
8
icu4j/src/com/ibm/icu/impl/data/Transliterator_Hiragana_Latin.txt
Executable file
|
@ -0,0 +1,8 @@
|
|||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# Date: Tue Jan 23 12:18:46 2001
|
||||
#--------------------------------------------------------------------
|
||||
:: Hiragana-Katakana;
|
||||
:: Katakana-Latin;
|
446
icu4j/src/com/ibm/icu/impl/data/Transliterator_Latin_Katakana.txt
Executable file
446
icu4j/src/com/ibm/icu/impl/data/Transliterator_Latin_Katakana.txt
Executable file
|
@ -0,0 +1,446 @@
|
|||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# Date: Tue Jan 23 12:18:46 2001
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Latin-Katakana
|
||||
|
||||
::NFD ;
|
||||
:: [a-zA-Z] Lower ;
|
||||
|
||||
# Uses modified Hepburn. Small changes to make
|
||||
# unambiguous.
|
||||
|
||||
# | Kunrei-shiki: Hepburn/MHepburn
|
||||
# | ------------------------------
|
||||
# | si: shi
|
||||
# | si ~ya: sha
|
||||
# | si ~yu: shu
|
||||
# | si ~yo: sho
|
||||
# | zi: ji
|
||||
# | zi ~ya: ja
|
||||
# | zi ~yu: ju
|
||||
# | zi ~yo: jo
|
||||
# | ti: chi
|
||||
# | ti ~ya: cha
|
||||
# | ti ~yu: chu
|
||||
# | ti ~yo: cho
|
||||
# | tu: tsu
|
||||
# | di: ji/dji
|
||||
# | du: zu/dzu
|
||||
# | hu: fu
|
||||
|
||||
# | For foreign words:
|
||||
# | -----------------
|
||||
# | se ~i si
|
||||
# | si ~e she
|
||||
# |
|
||||
# | ze ~i zi
|
||||
# | zi ~e je
|
||||
# |
|
||||
# | te ~i ti
|
||||
# | ti ~e che
|
||||
# | te ~u tu
|
||||
# |
|
||||
# | de ~i di
|
||||
# | de ~u du
|
||||
# | de ~i di
|
||||
# |
|
||||
# | he ~u: hu
|
||||
# | hu ~a fa
|
||||
# | hu ~i fi
|
||||
# | hu ~e fe
|
||||
# | hu ~o fo
|
||||
|
||||
# Most small forms are generated, but if necessary
|
||||
# explicit small forms are given with ~a, ~ya, etc.
|
||||
|
||||
#------------------------------------------------------
|
||||
# Variables
|
||||
|
||||
$vowel = [aeiou] ;
|
||||
$macron = \u0304 ;
|
||||
|
||||
# Variables used for doubled-consonants with tsu
|
||||
|
||||
$kana = [\u3041-\u3094] ;
|
||||
|
||||
$voice = [\u3099\u309B];
|
||||
$semivoice = [\u309A\u309C];
|
||||
|
||||
$k_start = [カキクケコかきくけこ] ;
|
||||
|
||||
$s_start = [サシスセソさしすせそ] ;
|
||||
|
||||
$j_start = [シし] $voice ;
|
||||
|
||||
$t_start = [タチツテトたちつてと] ;
|
||||
|
||||
$n_start = [ナニヌネノンなにぬねの] ;
|
||||
|
||||
$h_start = [ハヒヘホはひへほ] ;
|
||||
$f_start = [フふ] ;
|
||||
|
||||
$m_start = [マミムメモまみむめも] ;
|
||||
|
||||
$y_start = [ヤユヨやゆよ] ;
|
||||
|
||||
$r_start = [ラリルレロらりるれろ] ;
|
||||
|
||||
$w_start = [ワヰヱヲわゐゑを] ;
|
||||
|
||||
# if ン is followed by $n_quoter, then it needs an
|
||||
# apostrophe after its romaji form to disambiguate it.
|
||||
# e.g., ン ア != ナ, so represent as "n'a", not "na".
|
||||
|
||||
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
|
||||
|
||||
$small_y = [ャィュェョ] ;
|
||||
|
||||
$iteration = \u309D ;
|
||||
|
||||
#------------------------------------------------------
|
||||
# katakana rules
|
||||
|
||||
# Punctuation
|
||||
|
||||
'.' <> 。;
|
||||
',' <> 、;
|
||||
# ' ' } [a-z] > ; # delete spaces before latin
|
||||
# ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before hiragana
|
||||
|
||||
# Iteration Mark
|
||||
# Copy previous letter & marks
|
||||
|
||||
# TODO
|
||||
# | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
|
||||
|
||||
# Specials for katakana -- not shared with hiragana
|
||||
|
||||
va <> ヷ ;
|
||||
vi <> ヸ ;
|
||||
ve <> ヹ ;
|
||||
vo <> ヺ ;
|
||||
'~ka' <> ヵ ;
|
||||
'~ke' <> ヶ ;
|
||||
|
||||
# ~~~ begin shared rules ~~~
|
||||
|
||||
#special
|
||||
|
||||
ya < '~'ャ;
|
||||
yi < '~'ィ ;
|
||||
yu < '~'ュ;
|
||||
ye < '~'ェ;
|
||||
yo < '~'ョ;
|
||||
|
||||
#normal
|
||||
|
||||
a <> ア ;
|
||||
|
||||
b | '~' < ヒ ゙} $small_y ;
|
||||
by } $vowel > ビ | '~y' ;
|
||||
|
||||
ba <> バ ;
|
||||
bi <> ビ ;
|
||||
bu <> ブ ;
|
||||
be <> ベ ;
|
||||
bo <> ボ ;
|
||||
|
||||
c } i > | s ;
|
||||
c } e > | s ;
|
||||
|
||||
da <> ダ ;
|
||||
di <> ディ ;
|
||||
du <> デゥ ;
|
||||
de <> デ ;
|
||||
do <> ド ;
|
||||
dzu <> ヅ ;
|
||||
dja < ヂャ ;
|
||||
dji'~i' < ヂィ ; # liu
|
||||
dju < ヂュ ;
|
||||
dje < ヂェ ;
|
||||
djo < ヂョ ;
|
||||
dji <> ヂ ;
|
||||
dj } $vowel > ヂ | '~y' ;
|
||||
|
||||
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
|
||||
|
||||
cha < チャ ;
|
||||
chi'~i' < チィ ; # liu
|
||||
chu < チュ ;
|
||||
che < チェ ;
|
||||
cho < チョ ;
|
||||
chi <> チ ;
|
||||
ch } $vowel > チ | '~y' ;
|
||||
|
||||
e <> エ ;
|
||||
|
||||
g | '~' < ギ} $small_y ;
|
||||
gy } $vowel > ギ | '~y' ;
|
||||
|
||||
ga <> ガ ;
|
||||
gi <> ギ ;
|
||||
gu <> グ ;
|
||||
ge <> ゲ ;
|
||||
go <> ゴ ;
|
||||
|
||||
i <> イ ;
|
||||
|
||||
# j } $vowel > ジ | '~y' ;
|
||||
|
||||
ja < ジャ ;
|
||||
ji'~i' < ジィ ; # liu
|
||||
ju < ジュ ;
|
||||
je < ジェ ;
|
||||
jo < ジョ ;
|
||||
ji <> ジ ;
|
||||
|
||||
k | '~' < キ} $small_y ;
|
||||
ky } $vowel > キ | '~y' ;
|
||||
|
||||
ka <> カ ;
|
||||
ki <> キ ;
|
||||
ku <> ク ;
|
||||
ke <> ケ ;
|
||||
ko <> コ ;
|
||||
|
||||
m | '~' < ミ} $small_y ;
|
||||
my } $vowel > ミ | '~y' ;
|
||||
|
||||
ma <> マ ;
|
||||
mi <> ミ ;
|
||||
mu <> ム ;
|
||||
me <> メ ;
|
||||
mo <> モ ;
|
||||
|
||||
m } [pbfv] > ン ;
|
||||
|
||||
n | '~' < ニ } $small_y ;
|
||||
ny } $vowel > ニ | '~y' ;
|
||||
|
||||
na <> ナ ;
|
||||
ni <> ニ ;
|
||||
nu <> ヌ ;
|
||||
ne <> ネ ;
|
||||
no <> ノ ;
|
||||
|
||||
o <> オ ;
|
||||
|
||||
p | '~' < ピ } $small_y ;
|
||||
py } $vowel > ピ | '~y' ;
|
||||
|
||||
pa <> パ ;
|
||||
pi <> ピ ;
|
||||
pu <> プ ;
|
||||
pe <> ペ ;
|
||||
po <> ポ ;
|
||||
|
||||
h | '~' < ヒ } $small_y ;
|
||||
hy } $vowel > ヒ | '~y' ;
|
||||
|
||||
ha <> ハ ;
|
||||
hi <> ヒ ;
|
||||
hu <> ヘゥ ;
|
||||
he <> ヘ ;
|
||||
ho <> ホ ;
|
||||
|
||||
# f | '~' < フ } $small_y ;
|
||||
# f } $vowel > フ | '~' ;
|
||||
|
||||
fa <> ファ ;
|
||||
fi <> フィ ;
|
||||
fe <> フェ ;
|
||||
fo <> フォ ;
|
||||
fu <> フ ;
|
||||
|
||||
r | '~' < リ } $small_y ;
|
||||
ry } $vowel > リ | '~y' ;
|
||||
|
||||
ra <> ラ ;
|
||||
ri <> リ ;
|
||||
ru <> ル ;
|
||||
re <> レ ;
|
||||
ro <> ロ ;
|
||||
|
||||
za <> ザ ;
|
||||
zi <> ゼィ ;
|
||||
zu <> ズ ;
|
||||
ze <> ゼ ;
|
||||
zo <> ゾ ;
|
||||
|
||||
sa <> サ ;
|
||||
si <> セィ ;
|
||||
su <> ス ;
|
||||
se <> セ ;
|
||||
so <> ソ ;
|
||||
|
||||
sha < シャ ;
|
||||
shi'~i' < シィ ; # liu
|
||||
shu < シュ ;
|
||||
she < シェ ;
|
||||
sho < ショ ;
|
||||
shi <> シ ;
|
||||
sh } $vowel > シ | '~y' ;
|
||||
|
||||
ta <> タ ;
|
||||
ti <> ティ ;
|
||||
tu <> テゥ ;
|
||||
te <> テ ;
|
||||
to <> ト ;
|
||||
|
||||
tsu <> ツ ;
|
||||
|
||||
# v } $vowel > ヴ | '~' ;
|
||||
|
||||
#'v~a' < ヴァ ; # liu
|
||||
#'v~i' < ヴィ ; # liu
|
||||
#'v~e' < ヴェ ; # liu
|
||||
#'v~o' < ヴォ ; # liu
|
||||
vu <> ヴ ;
|
||||
|
||||
u <> ウ ;
|
||||
|
||||
# w } $vowel > ウ | '~' ;
|
||||
|
||||
wa <> ワ ;
|
||||
wi <> ヰ ;
|
||||
wu > ウ ;
|
||||
we <> ヱ ;
|
||||
wo <> ヲ ;
|
||||
|
||||
ya <> ヤ ;
|
||||
yi > イ ;
|
||||
yu <> ユ ;
|
||||
ye > エ ;
|
||||
yo <> ヨ ;
|
||||
|
||||
# double consonants
|
||||
|
||||
#specials
|
||||
s } sh > ッ ;
|
||||
t } ch > ッ ;
|
||||
|
||||
#voiced
|
||||
|
||||
b } b <> ッ } [$h_start$f_start] $voice;
|
||||
d } d <> ッ } $t_start $voice;
|
||||
g } g <> ッ } $k_start $voice;
|
||||
p } p <> ッ } [$h_start$f_start] $semivoice;
|
||||
# v } v <> ッ } [ワヰウヱヲう] $voice ;
|
||||
z } z <> ッ } $s_start $voice;
|
||||
|
||||
# normal
|
||||
|
||||
j } j <> ッ } $j_start ;
|
||||
k } k <> ッ } $k_start ;
|
||||
m } m <> ッ } $m_start ;
|
||||
n } n <> ッ } $n_start ;
|
||||
h } h <> ッ } $h_start ;
|
||||
f } f <> ッ } $f_start ;
|
||||
r } r <> ッ } $r_start ;
|
||||
t } t <> ッ } $t_start ;
|
||||
s } s <> ッ } $s_start ;
|
||||
|
||||
# completeness
|
||||
w < ッ } $w_start;
|
||||
y < ッ } $y_start;
|
||||
|
||||
x } x > ッ ;
|
||||
c } k > ッ ;
|
||||
c } c > ッ ;
|
||||
c } q > ッ ;
|
||||
l } l > ッ ;
|
||||
q } q > ッ ;
|
||||
# y } y > ッ ;
|
||||
# w } w > ッ ;
|
||||
|
||||
# prolonged vowel mark. this indicates a doubling of
|
||||
# the preceding vowel sound
|
||||
|
||||
#a < a { ー ; # liu
|
||||
#e < e { ー ; # liu
|
||||
#i < i { ー ; # liu
|
||||
#o < o { ー ; # liu
|
||||
#u < u { ー ; # liu
|
||||
|
||||
$macron <> ー ;
|
||||
|
||||
# small forms
|
||||
|
||||
'~a' <> ァ ;
|
||||
'~i' <> ィ ;
|
||||
'~u' <> ゥ ;
|
||||
'~e' <> ェ ;
|
||||
'~o' <> ォ ;
|
||||
'~tsu' <> ッ ;
|
||||
'~wa' <> ヮ ;
|
||||
'~ya' <> ャ ;
|
||||
'~yi' > ィ ;
|
||||
'~yu' <> ュ ;
|
||||
'~ye' > ェ ;
|
||||
'~yo' <> ョ ;
|
||||
|
||||
# one-way latin->kana rules. these do not occur in
|
||||
# well-formed romaji representing actual japanese text.
|
||||
# their purpose is to make all romaji map to kana of
|
||||
# some sort.
|
||||
|
||||
# the following are not really necessary, but produce
|
||||
# slightly more natural results.
|
||||
|
||||
cy > セィ ;
|
||||
dy > ディ ;
|
||||
hy > ヒ ;
|
||||
sy > セィ ;
|
||||
ty > ティ ;
|
||||
zy > ゼィ ;
|
||||
|
||||
# isolated consonants listed here so as not to mask
|
||||
# longer rules above.
|
||||
|
||||
ch > チ;
|
||||
sh > シ ;
|
||||
dz > ヅ ;
|
||||
dj > ヂ;
|
||||
|
||||
b > ブ ;
|
||||
d > デ ;
|
||||
g > グ ;
|
||||
h > ヘ ;
|
||||
k > ク ;
|
||||
m > ム ;
|
||||
n'' < ン } $n_quoter ;
|
||||
n <> ン ;
|
||||
p > プ ;
|
||||
r > ル ;
|
||||
s > ス ;
|
||||
t > テ ;
|
||||
y > イ ;
|
||||
z > ズ ;
|
||||
v > ヴ ;
|
||||
|
||||
f > フ;
|
||||
j > ジ;
|
||||
w > ウ;
|
||||
|
||||
# simple substitutions using backup
|
||||
|
||||
c > | k ;
|
||||
l > | r ;
|
||||
q > | k ;
|
||||
x > | ks ;
|
||||
|
||||
# ~~~ END shared rules ~~~
|
||||
|
||||
#------------------------------------------------------
|
||||
# Final cleanup
|
||||
|
||||
'~' > ; # delete stray tildes between letters
|
||||
# '' > ; # delete stray quotes between letters
|
||||
|
||||
:: NFC ;
|
||||
|
||||
# eof
|
285
icu4j/src/com/ibm/text/resources/Transliterator_Any_Accents.txt
Executable file
285
icu4j/src/com/ibm/text/resources/Transliterator_Any_Accents.txt
Executable file
|
@ -0,0 +1,285 @@
|
|||
:: nfd;
|
||||
|
||||
# to do: make reversible
|
||||
|
||||
# define special conversion character
|
||||
|
||||
$x = \| ;
|
||||
|
||||
# Provide keyboard equivalents for common diacritics used in transliteration
|
||||
|
||||
\` $x <> \u0300 ; # COMBINING GRAVE ACCENT
|
||||
\' $x <> \u0301 ; # COMBINING ACUTE ACCENT
|
||||
\^ $x <> \u0302 ; # COMBINING CIRCUMFLEX ACCENT
|
||||
\~ $x <> \u0303 ; # COMBINING TILDE
|
||||
\- $x <> \u0304 ; # COMBINING MACRON
|
||||
\" $x <> \u0308 ; # COMBINING DIAERESIS
|
||||
\* $x <> \u030A ; # COMBINING RING ABOVE
|
||||
\, $x <> \u0327 ; # COMBINING CEDILLA
|
||||
'/' $x <> \u0338 ; # COMBINING LONG SOLIDUS OVERLAY
|
||||
\. $x <> \u0323 ; # COMBINING DOT BELOW
|
||||
|
||||
# Combine common characters
|
||||
|
||||
AE $x <> \u00C6 ; # LATIN CAPITAL LETTER AE
|
||||
ae $x <> \u00E6 ; # LATIN SMALL LETTER AE
|
||||
D $x <> \u00D0 ; # LATIN CAPITAL LETTER ETH
|
||||
d $x <> \u00F0 ; # LATIN SMALL LETTER ETH
|
||||
O'/' $x <> \u00D8 ; # LATIN CAPITAL LETTER O WITH STROKE
|
||||
o'/' $x <> \u00F8 ; # LATIN SMALL LETTER O WITH STROKE
|
||||
TH $x <> \u00DE ; # LATIN CAPITAL LETTER THORN
|
||||
th $x <> \u00FE ; # LATIN SMALL LETTER THORN
|
||||
OE $x <> \u0152 ; # LATIN CAPITAL LIGATURE OE
|
||||
oe $x <> \u0153 ; # LATIN SMALL LIGATURE OE
|
||||
|
||||
ss $x <> \u00DF ; # LATIN SMALL LETTER SHARP S
|
||||
|
||||
NG $x <> \u014A ; # LATIN CAPITAL LETTER ENG
|
||||
ng $x <> \u014B ; # LATIN SMALL LETTER ENG
|
||||
|
||||
T $x <> \u0398 ; # GREEK CAPITAL LETTER THETA
|
||||
t $x <> \u03B8 ; # GREEK SMALL LETTER THETA
|
||||
SH $x <> \u01A9 ; # LATIN CAPITAL LETTER ESH
|
||||
sh $x <> \u0283 ; # LATIN SMALL LETTER ESH
|
||||
ZH $x <> \u01B7 ; # LATIN CAPITAL LETTER EZH
|
||||
zh $x <> \u0292 ; # LATIN SMALL LETTER EZH
|
||||
|
||||
U $x <> \u01B1 ; # LATIN CAPITAL LETTER UPSILON
|
||||
u $x <> \u028A ; # LATIN SMALL LETTER UPSILON
|
||||
A $x <> \u018F ; # LATIN CAPITAL LETTER SCHWA
|
||||
a $x <> \u0259 ; # LATIN SMALL LETTER SCHWA
|
||||
O $x <> \u0186 ; # LATIN CAPITAL LETTER OPEN O
|
||||
o $x <> \u0254 ; # LATIN SMALL LETTER OPEN O
|
||||
E $x <> \u0190 ; # LATIN CAPITAL LETTER OPEN E
|
||||
e $x <> \u025B ; # LATIN SMALL LETTER OPEN E
|
||||
|
||||
# three that don't have uppercases
|
||||
|
||||
'?' $x <> \u0294 ; # LATIN LETTER GLOTTAL STOP
|
||||
i $x <> \u026A ; # LATIN LETTER SMALL CAPITAL I
|
||||
v $x <> \u028C ; # LATIN SMALL LETTER TURNED V
|
||||
|
||||
$x > ; # delete any left-overs
|
||||
|
||||
# Additional Characters that may be added in the future
|
||||
|
||||
# xxx $x <> \u0306 ; # COMBINING BREVE
|
||||
# xxx $x <> \u0307 ; # COMBINING DOT ABOVE
|
||||
# xxx $x <> \u0309 ; # COMBINING HOOK ABOVE
|
||||
# xxx $x <> \u030B ; # COMBINING DOUBLE ACUTE ACCENT
|
||||
# xxx $x <> \u030C ; # COMBINING CARON
|
||||
# xxx $x <> \u030F ; # COMBINING DOUBLE GRAVE ACCENT
|
||||
# xxx $x <> \u0311 ; # COMBINING INVERTED BREVE
|
||||
# xxx $x <> \u0313 ; # COMBINING COMMA ABOVE
|
||||
# xxx $x <> \u0314 ; # COMBINING REVERSED COMMA ABOVE
|
||||
# xxx $x <> \u031B ; # COMBINING HORN
|
||||
# xxx $x <> \u0324 ; # COMBINING DIAERESIS BELOW
|
||||
# xxx $x <> \u0325 ; # COMBINING RING BELOW
|
||||
# xxx $x <> \u0326 ; # COMBINING COMMA BELOW
|
||||
# xxx $x <> \u0328 ; # COMBINING OGONEK
|
||||
# xxx $x <> \u032D ; # COMBINING CIRCUMFLEX ACCENT BELOW
|
||||
# xxx $x <> \u032E ; # COMBINING BREVE BELOW
|
||||
# xxx $x <> \u0330 ; # COMBINING TILDE BELOW
|
||||
# xxx $x <> \u0331 ; # COMBINING MACRON BELOW
|
||||
|
||||
# yyy $x <> \u00AA ; # FEMININE ORDINAL INDICATOR
|
||||
# yyy $x <> \u00BA ; # MASCULINE ORDINAL INDICATOR
|
||||
# yyy $x <> \u0110 ; # LATIN CAPITAL LETTER D WITH STROKE
|
||||
# yyy $x <> \u0111 ; # LATIN SMALL LETTER D WITH STROKE
|
||||
# yyy $x <> \u0126 ; # LATIN CAPITAL LETTER H WITH STROKE
|
||||
# yyy $x <> \u0127 ; # LATIN SMALL LETTER H WITH STROKE
|
||||
# yyy $x <> \u0131 ; # LATIN SMALL LETTER DOTLESS I
|
||||
# yyy $x <> \u0138 ; # LATIN SMALL LETTER KRA
|
||||
# yyy $x <> \u013F ; # LATIN CAPITAL LETTER L WITH MIDDLE DOT
|
||||
# yyy $x <> \u0140 ; # LATIN SMALL LETTER L WITH MIDDLE DOT
|
||||
# yyy $x <> \u0141 ; # LATIN CAPITAL LETTER L WITH STROKE
|
||||
# yyy $x <> \u0142 ; # LATIN SMALL LETTER L WITH STROKE
|
||||
# yyy $x <> \u0149 ; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
|
||||
# yyy $x <> \u0166 ; # LATIN CAPITAL LETTER T WITH STROKE
|
||||
# yyy $x <> \u0167 ; # LATIN SMALL LETTER T WITH STROKE
|
||||
# yyy $x <> \u017F ; # LATIN SMALL LETTER LONG S
|
||||
# yyy $x <> \u0180 ; # LATIN SMALL LETTER B WITH STROKE
|
||||
# yyy $x <> \u0181 ; # LATIN CAPITAL LETTER B WITH HOOK
|
||||
# yyy $x <> \u0182 ; # LATIN CAPITAL LETTER B WITH TOPBAR
|
||||
# yyy $x <> \u0183 ; # LATIN SMALL LETTER B WITH TOPBAR
|
||||
# yyy $x <> \u0184 ; # LATIN CAPITAL LETTER TONE SIX
|
||||
# yyy $x <> \u0185 ; # LATIN SMALL LETTER TONE SIX
|
||||
# yyy $x <> \u0187 ; # LATIN CAPITAL LETTER C WITH HOOK
|
||||
# yyy $x <> \u0188 ; # LATIN SMALL LETTER C WITH HOOK
|
||||
# yyy $x <> \u0189 ; # LATIN CAPITAL LETTER AFRICAN D
|
||||
# yyy $x <> \u018A ; # LATIN CAPITAL LETTER D WITH HOOK
|
||||
# yyy $x <> \u018B ; # LATIN CAPITAL LETTER D WITH TOPBAR
|
||||
# yyy $x <> \u018C ; # LATIN SMALL LETTER D WITH TOPBAR
|
||||
# yyy $x <> \u018D ; # LATIN SMALL LETTER TURNED DELTA
|
||||
# yyy $x <> \u018E ; # LATIN CAPITAL LETTER REVERSED E
|
||||
# yyy $x <> \u0191 ; # LATIN CAPITAL LETTER F WITH HOOK
|
||||
# yyy $x <> \u0192 ; # LATIN SMALL LETTER F WITH HOOK
|
||||
# yyy $x <> \u0193 ; # LATIN CAPITAL LETTER G WITH HOOK
|
||||
# yyy $x <> \u0194 ; # LATIN CAPITAL LETTER GAMMA
|
||||
# yyy $x <> \u0195 ; # LATIN SMALL LETTER HV
|
||||
# yyy $x <> \u0196 ; # LATIN CAPITAL LETTER IOTA
|
||||
# yyy $x <> \u0197 ; # LATIN CAPITAL LETTER I WITH STROKE
|
||||
# yyy $x <> \u0198 ; # LATIN CAPITAL LETTER K WITH HOOK
|
||||
# yyy $x <> \u0199 ; # LATIN SMALL LETTER K WITH HOOK
|
||||
# yyy $x <> \u019A ; # LATIN SMALL LETTER L WITH BAR
|
||||
# yyy $x <> \u019B ; # LATIN SMALL LETTER LAMBDA WITH STROKE
|
||||
# yyy $x <> \u019C ; # LATIN CAPITAL LETTER TURNED M
|
||||
# yyy $x <> \u019D ; # LATIN CAPITAL LETTER N WITH LEFT HOOK
|
||||
# yyy $x <> \u019E ; # LATIN SMALL LETTER N WITH LONG RIGHT LEG
|
||||
# yyy $x <> \u019F ; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
|
||||
# yyy $x <> \u01A2 ; # LATIN CAPITAL LETTER OI
|
||||
# yyy $x <> \u01A3 ; # LATIN SMALL LETTER OI
|
||||
# yyy $x <> \u01A4 ; # LATIN CAPITAL LETTER P WITH HOOK
|
||||
# yyy $x <> \u01A5 ; # LATIN SMALL LETTER P WITH HOOK
|
||||
# yyy $x <> \u01A6 ; # LATIN LETTER YR
|
||||
# yyy $x <> \u01A7 ; # LATIN CAPITAL LETTER TONE TWO
|
||||
# yyy $x <> \u01A8 ; # LATIN SMALL LETTER TONE TWO
|
||||
# yyy $x <> \u01AA ; # LATIN LETTER REVERSED ESH LOOP
|
||||
# yyy $x <> \u01AB ; # LATIN SMALL LETTER T WITH PALATAL HOOK
|
||||
# yyy $x <> \u01AC ; # LATIN CAPITAL LETTER T WITH HOOK
|
||||
# yyy $x <> \u01AD ; # LATIN SMALL LETTER T WITH HOOK
|
||||
# yyy $x <> \u01AE ; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u01B2 ; # LATIN CAPITAL LETTER V WITH HOOK
|
||||
# yyy $x <> \u01B3 ; # LATIN CAPITAL LETTER Y WITH HOOK
|
||||
# yyy $x <> \u01B4 ; # LATIN SMALL LETTER Y WITH HOOK
|
||||
# yyy $x <> \u01B5 ; # LATIN CAPITAL LETTER Z WITH STROKE
|
||||
# yyy $x <> \u01B6 ; # LATIN SMALL LETTER Z WITH STROKE
|
||||
# yyy $x <> \u01B8 ; # LATIN CAPITAL LETTER EZH REVERSED
|
||||
# yyy $x <> \u01B9 ; # LATIN SMALL LETTER EZH REVERSED
|
||||
# yyy $x <> \u01BA ; # LATIN SMALL LETTER EZH WITH TAIL
|
||||
# yyy $x <> \u01BB ; # LATIN LETTER TWO WITH STROKE
|
||||
# yyy $x <> \u01BC ; # LATIN CAPITAL LETTER TONE FIVE
|
||||
# yyy $x <> \u01BD ; # LATIN SMALL LETTER TONE FIVE
|
||||
# yyy $x <> \u01BE ; # LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE
|
||||
# yyy $x <> \u01BF ; # LATIN LETTER WYNN
|
||||
# yyy $x <> \u01C0 ; # LATIN LETTER DENTAL CLICK
|
||||
# yyy $x <> \u01C1 ; # LATIN LETTER LATERAL CLICK
|
||||
# yyy $x <> \u01C2 ; # LATIN LETTER ALVEOLAR CLICK
|
||||
# yyy $x <> \u01C3 ; # LATIN LETTER RETROFLEX CLICK
|
||||
# yyy $x <> \u01C4 ; # LATIN CAPITAL LETTER DZ WITH CARON
|
||||
# yyy $x <> \u01C5 ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
|
||||
# yyy $x <> \u01C6 ; # LATIN SMALL LETTER DZ WITH CARON
|
||||
# yyy $x <> \u01C7 ; # LATIN CAPITAL LETTER LJ
|
||||
# yyy $x <> \u01C8 ; # LATIN CAPITAL LETTER L WITH SMALL LETTER J
|
||||
# yyy $x <> \u01C9 ; # LATIN SMALL LETTER LJ
|
||||
# yyy $x <> \u01CA ; # LATIN CAPITAL LETTER NJ
|
||||
# yyy $x <> \u01CB ; # LATIN CAPITAL LETTER N WITH SMALL LETTER J
|
||||
# yyy $x <> \u01CC ; # LATIN SMALL LETTER NJ
|
||||
# yyy $x <> \u01DD ; # LATIN SMALL LETTER TURNED E
|
||||
# yyy $x <> \u01E4 ; # LATIN CAPITAL LETTER G WITH STROKE
|
||||
# yyy $x <> \u01E5 ; # LATIN SMALL LETTER G WITH STROKE
|
||||
# yyy $x <> \u01F1 ; # LATIN CAPITAL LETTER DZ
|
||||
# yyy $x <> \u01F2 ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
|
||||
# yyy $x <> \u01F3 ; # LATIN SMALL LETTER DZ
|
||||
# yyy $x <> \u01F6 ; # LATIN CAPITAL LETTER HWAIR
|
||||
# yyy $x <> \u01F7 ; # LATIN CAPITAL LETTER WYNN
|
||||
# yyy $x <> \u021C ; # LATIN CAPITAL LETTER YOGH
|
||||
# yyy $x <> \u021D ; # LATIN SMALL LETTER YOGH
|
||||
# yyy $x <> \u0222 ; # LATIN CAPITAL LETTER OU
|
||||
# yyy $x <> \u0223 ; # LATIN SMALL LETTER OU
|
||||
# yyy $x <> \u0224 ; # LATIN CAPITAL LETTER Z WITH HOOK
|
||||
# yyy $x <> \u0225 ; # LATIN SMALL LETTER Z WITH HOOK
|
||||
# yyy $x <> \u0250 ; # LATIN SMALL LETTER TURNED A
|
||||
# yyy $x <> \u0251 ; # LATIN SMALL LETTER ALPHA
|
||||
# yyy $x <> \u0252 ; # LATIN SMALL LETTER TURNED ALPHA
|
||||
# yyy $x <> \u0253 ; # LATIN SMALL LETTER B WITH HOOK
|
||||
# yyy $x <> \u0255 ; # LATIN SMALL LETTER C WITH CURL
|
||||
# yyy $x <> \u0256 ; # LATIN SMALL LETTER D WITH TAIL
|
||||
# yyy $x <> \u0257 ; # LATIN SMALL LETTER D WITH HOOK
|
||||
# yyy $x <> \u0258 ; # LATIN SMALL LETTER REVERSED E
|
||||
# yyy $x <> \u025A ; # LATIN SMALL LETTER SCHWA WITH HOOK
|
||||
# yyy $x <> \u025C ; # LATIN SMALL LETTER REVERSED OPEN E
|
||||
# yyy $x <> \u025D ; # LATIN SMALL LETTER REVERSED OPEN E WITH HOOK
|
||||
# yyy $x <> \u025E ; # LATIN SMALL LETTER CLOSED REVERSED OPEN E
|
||||
# yyy $x <> \u025F ; # LATIN SMALL LETTER DOTLESS J WITH STROKE
|
||||
# yyy $x <> \u0260 ; # LATIN SMALL LETTER G WITH HOOK
|
||||
# yyy $x <> \u0261 ; # LATIN SMALL LETTER SCRIPT G
|
||||
# yyy $x <> \u0262 ; # LATIN LETTER SMALL CAPITAL G
|
||||
# yyy $x <> \u0263 ; # LATIN SMALL LETTER GAMMA
|
||||
# yyy $x <> \u0264 ; # LATIN SMALL LETTER RAMS HORN
|
||||
# yyy $x <> \u0265 ; # LATIN SMALL LETTER TURNED H
|
||||
# yyy $x <> \u0266 ; # LATIN SMALL LETTER H WITH HOOK
|
||||
# yyy $x <> \u0267 ; # LATIN SMALL LETTER HENG WITH HOOK
|
||||
# yyy $x <> \u0268 ; # LATIN SMALL LETTER I WITH STROKE
|
||||
# yyy $x <> \u0269 ; # LATIN SMALL LETTER IOTA
|
||||
# yyy $x <> \u026B ; # LATIN SMALL LETTER L WITH MIDDLE TILDE
|
||||
# yyy $x <> \u026C ; # LATIN SMALL LETTER L WITH BELT
|
||||
# yyy $x <> \u026D ; # LATIN SMALL LETTER L WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u026E ; # LATIN SMALL LETTER LEZH
|
||||
# yyy $x <> \u026F ; # LATIN SMALL LETTER TURNED M
|
||||
# yyy $x <> \u0270 ; # LATIN SMALL LETTER TURNED M WITH LONG LEG
|
||||
# yyy $x <> \u0271 ; # LATIN SMALL LETTER M WITH HOOK
|
||||
# yyy $x <> \u0272 ; # LATIN SMALL LETTER N WITH LEFT HOOK
|
||||
# yyy $x <> \u0273 ; # LATIN SMALL LETTER N WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u0274 ; # LATIN LETTER SMALL CAPITAL N
|
||||
# yyy $x <> \u0275 ; # LATIN SMALL LETTER BARRED O
|
||||
# yyy $x <> \u0276 ; # LATIN LETTER SMALL CAPITAL OE
|
||||
# yyy $x <> \u0277 ; # LATIN SMALL LETTER CLOSED OMEGA
|
||||
# yyy $x <> \u0278 ; # LATIN SMALL LETTER PHI
|
||||
# yyy $x <> \u0279 ; # LATIN SMALL LETTER TURNED R
|
||||
# yyy $x <> \u027A ; # LATIN SMALL LETTER TURNED R WITH LONG LEG
|
||||
# yyy $x <> \u027B ; # LATIN SMALL LETTER TURNED R WITH HOOK
|
||||
# yyy $x <> \u027C ; # LATIN SMALL LETTER R WITH LONG LEG
|
||||
# yyy $x <> \u027D ; # LATIN SMALL LETTER R WITH TAIL
|
||||
# yyy $x <> \u027E ; # LATIN SMALL LETTER R WITH FISHHOOK
|
||||
# yyy $x <> \u027F ; # LATIN SMALL LETTER REVERSED R WITH FISHHOOK
|
||||
# yyy $x <> \u0280 ; # LATIN LETTER SMALL CAPITAL R
|
||||
# yyy $x <> \u0281 ; # LATIN LETTER SMALL CAPITAL INVERTED R
|
||||
# yyy $x <> \u0282 ; # LATIN SMALL LETTER S WITH HOOK
|
||||
# yyy $x <> \u0284 ; # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK
|
||||
# yyy $x <> \u0285 ; # LATIN SMALL LETTER SQUAT REVERSED ESH
|
||||
# yyy $x <> \u0286 ; # LATIN SMALL LETTER ESH WITH CURL
|
||||
# yyy $x <> \u0287 ; # LATIN SMALL LETTER TURNED T
|
||||
# yyy $x <> \u0288 ; # LATIN SMALL LETTER T WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u0289 ; # LATIN SMALL LETTER U BAR
|
||||
# yyy $x <> \u028B ; # LATIN SMALL LETTER V WITH HOOK
|
||||
# yyy $x <> \u028D ; # LATIN SMALL LETTER TURNED W
|
||||
# yyy $x <> \u028E ; # LATIN SMALL LETTER TURNED Y
|
||||
# yyy $x <> \u028F ; # LATIN LETTER SMALL CAPITAL Y
|
||||
# yyy $x <> \u0290 ; # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
|
||||
# yyy $x <> \u0291 ; # LATIN SMALL LETTER Z WITH CURL
|
||||
# yyy $x <> \u0293 ; # LATIN SMALL LETTER EZH WITH CURL
|
||||
# yyy $x <> \u0294 ; # LATIN LETTER GLOTTAL STOP
|
||||
# yyy $x <> \u0295 ; # LATIN LETTER PHARYNGEAL VOICED FRICATIVE
|
||||
# yyy $x <> \u0296 ; # LATIN LETTER INVERTED GLOTTAL STOP
|
||||
# yyy $x <> \u0297 ; # LATIN LETTER STRETCHED C
|
||||
# yyy $x <> \u0298 ; # LATIN LETTER BILABIAL CLICK
|
||||
# yyy $x <> \u0299 ; # LATIN LETTER SMALL CAPITAL B
|
||||
# yyy $x <> \u029A ; # LATIN SMALL LETTER CLOSED OPEN E
|
||||
# yyy $x <> \u029B ; # LATIN LETTER SMALL CAPITAL G WITH HOOK
|
||||
# yyy $x <> \u029C ; # LATIN LETTER SMALL CAPITAL H
|
||||
# yyy $x <> \u029D ; # LATIN SMALL LETTER J WITH CROSSED-TAIL
|
||||
# yyy $x <> \u029E ; # LATIN SMALL LETTER TURNED K
|
||||
# yyy $x <> \u029F ; # LATIN LETTER SMALL CAPITAL L
|
||||
# yyy $x <> \u02A0 ; # LATIN SMALL LETTER Q WITH HOOK
|
||||
# yyy $x <> \u02A1 ; # LATIN LETTER GLOTTAL STOP WITH STROKE
|
||||
# yyy $x <> \u02A2 ; # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE
|
||||
# yyy $x <> \u02A3 ; # LATIN SMALL LETTER DZ DIGRAPH
|
||||
# yyy $x <> \u02A4 ; # LATIN SMALL LETTER DEZH DIGRAPH
|
||||
# yyy $x <> \u02A5 ; # LATIN SMALL LETTER DZ DIGRAPH WITH CURL
|
||||
# yyy $x <> \u02A6 ; # LATIN SMALL LETTER TS DIGRAPH
|
||||
# yyy $x <> \u02A7 ; # LATIN SMALL LETTER TESH DIGRAPH
|
||||
# yyy $x <> \u02A8 ; # LATIN SMALL LETTER TC DIGRAPH WITH CURL
|
||||
# yyy $x <> \u02A9 ; # LATIN SMALL LETTER FENG DIGRAPH
|
||||
# yyy $x <> \u02AA ; # LATIN SMALL LETTER LS DIGRAPH
|
||||
# yyy $x <> \u02AB ; # LATIN SMALL LETTER LZ DIGRAPH
|
||||
# yyy $x <> \u02AC ; # LATIN LETTER BILABIAL PERCUSSIVE
|
||||
# yyy $x <> \u02AD ; # LATIN LETTER BIDENTAL PERCUSSIVE
|
||||
# yyy $x <> \u02B0 ; # MODIFIER LETTER SMALL H
|
||||
# yyy $x <> \u02B1 ; # MODIFIER LETTER SMALL H WITH HOOK
|
||||
# yyy $x <> \u02B2 ; # MODIFIER LETTER SMALL J
|
||||
# yyy $x <> \u02B3 ; # MODIFIER LETTER SMALL R
|
||||
# yyy $x <> \u02B4 ; # MODIFIER LETTER SMALL TURNED R
|
||||
# yyy $x <> \u02B5 ; # MODIFIER LETTER SMALL TURNED R WITH HOOK
|
||||
# yyy $x <> \u02B6 ; # MODIFIER LETTER SMALL CAPITAL INVERTED R
|
||||
# yyy $x <> \u02B7 ; # MODIFIER LETTER SMALL W
|
||||
# yyy $x <> \u02B8 ; # MODIFIER LETTER SMALL Y
|
||||
# yyy $x <> \u02E0 ; # MODIFIER LETTER SMALL GAMMA
|
||||
# yyy $x <> \u02E1 ; # MODIFIER LETTER SMALL L
|
||||
# yyy $x <> \u02E2 ; # MODIFIER LETTER SMALL S
|
||||
# yyy $x <> \u02E3 ; # MODIFIER LETTER SMALL X
|
||||
# yyy $x <> \u02E4 ; # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
|
||||
# yyy $x <> \u1E9A ; # LATIN SMALL LETTER A WITH RIGHT HALF RING
|
||||
# yyy $x <> \u207F ; # SUPERSCRIPT LATIN SMALL LETTER N
|
||||
|
||||
:: NFC;
|
31
icu4j/src/com/ibm/text/resources/Transliterator_Any_Publishing.txt
Executable file
31
icu4j/src/com/ibm/text/resources/Transliterator_Any_Publishing.txt
Executable file
|
@ -0,0 +1,31 @@
|
|||
# Test case
|
||||
# "The" "(quick)" ('brown') `fox' ` jumped -- "over?"
|
||||
|
||||
# Variables
|
||||
|
||||
$single = \' ;
|
||||
$space = ' ' ;
|
||||
$double = \" ;
|
||||
$back = \` ;
|
||||
$tab = '\u0009' ; # U+0009 CHARACTER TABULATION
|
||||
$makeRight = [[:Z:][:Ps:][:Pi:]] ;
|
||||
|
||||
# fix UNIX quotes
|
||||
|
||||
$back $back > “ ;
|
||||
$back > ‘ ;
|
||||
|
||||
# fix typewriter quotes, by context
|
||||
|
||||
$makeRight {$double} <> “ ;
|
||||
^ {$double} > “ ;
|
||||
$double <> ” ;
|
||||
|
||||
$makeRight {$single} <> ‘ ;
|
||||
^ {$single} > ‘ ;
|
||||
$single <> ’;
|
||||
|
||||
# fix multiple spaces and hyphens
|
||||
|
||||
$space {$space} > ;
|
||||
'--' <> — ;
|
242
icu4j/src/com/ibm/text/resources/Transliterator_Greek_Latin.txt
Executable file
242
icu4j/src/com/ibm/text/resources/Transliterator_Greek_Latin.txt
Executable file
|
@ -0,0 +1,242 @@
|
|||
# Rules are predicated on running NFD first, and NFC afterwards
|
||||
::NFD ;
|
||||
|
||||
# TEST CASES
|
||||
|
||||
# Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος
|
||||
# ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ
|
||||
# ᾳ ῃ ῳ ὃ ὄ
|
||||
# ὠς ὡς ὢς ὣς
|
||||
# Ὠς Ὡς Ὢς Ὣς
|
||||
# ὨΣ ὩΣ ὪΣ ὫΣ
|
||||
# Ạ, ạ, Ẹ, ẹ, Ọ, ọ
|
||||
|
||||
# Useful variables
|
||||
|
||||
$lower = [:Ll:] ;
|
||||
$upper = [:Lu:] ;
|
||||
$accent = [:M:] ;
|
||||
|
||||
$macron = \u0304 ;
|
||||
$ddot = \u0308 ;
|
||||
|
||||
$lcgvowel = [αεηιουω] ;
|
||||
$ucgvowel = [ΑΕΗΙΟΥΩ] ;
|
||||
$gvowel = [$lcgvowel $ucgvowel] ;
|
||||
$lcgvowelC = [$lcgvowel $accent] ;
|
||||
|
||||
$vowel = [ AEIOUaeiou $gvowel] ;
|
||||
|
||||
$beforeLower = $accent * $lower ;
|
||||
|
||||
$gammaLike = [ΓΚΞΧγκξχ] ;
|
||||
$smooth = ̓ ;
|
||||
$rough = ̔ ;
|
||||
$iotasub = ͅ ;
|
||||
|
||||
# Fix punctuation
|
||||
|
||||
\; <> \? ;
|
||||
· <> \: ;
|
||||
|
||||
# CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve
|
||||
|
||||
\u0342 <> \u0302 ;
|
||||
|
||||
# IOTA: convert iota subscript to iota
|
||||
# first make previous alpha long!
|
||||
|
||||
Α } $accent * $iotasub > A $macron ;
|
||||
α } $accent * $iotasub > a $macron ;
|
||||
|
||||
# now convert to uppercase if after uppercase, otherwise to lowercase
|
||||
|
||||
$upper $accent * { $iotasub > I ;
|
||||
$iotasub > i ;
|
||||
|
||||
| $1 $iotasub < ([:L:] $macron [:M:]*) i ;
|
||||
|
||||
# BREATHING
|
||||
|
||||
$smooth > ; #delete smooth breathing
|
||||
|
||||
# Convert rough breathing to h, and move before letters.
|
||||
|
||||
# Make A ` x => H a x
|
||||
|
||||
Α $rough } $beforeLower > H | α ;
|
||||
Ε $rough } $beforeLower > H | ε;
|
||||
Η $rough } $beforeLower > H | η ;
|
||||
Ι ($ddot?) $rough } $beforeLower > H | ι $1;
|
||||
Ο $rough } $beforeLower > H | ο ;
|
||||
Υ $rough } $beforeLower > H | υ ;
|
||||
Ω ($ddot?) $rough } $beforeLower > H | ω $1;
|
||||
|
||||
# Make A x ` => H a x
|
||||
|
||||
Α ($lower) $rough > H | α $1 ;
|
||||
Ε ($lower) $rough > H | ε $1 ;
|
||||
Η ($lower) $rough > H | η $1 ;
|
||||
Ι ($lower $ddot?) $rough > H | ι $1 ;
|
||||
Ο ($lower) $rough > H | ο $1 ;
|
||||
Υ ($lower) $rough > H | υ $1 ;
|
||||
Ω ($lower $ddot?) $rough > H | ω $1 ;
|
||||
|
||||
#Otherwise, make x ` into h x and X ` into H X
|
||||
|
||||
($lcgvowel + $ddot? ) $rough > h | $1 ;
|
||||
($gvowel + $ddot? ) $rough > H | $1 ;
|
||||
|
||||
# Go backwards with H
|
||||
|
||||
| $1 $rough < h ([aeiouyAEIOUY] $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < h ([aeiouyAEIOUY] $macron? $ddot?) ;
|
||||
|
||||
| $1 $rough < H ([AEIOUY] $macron? $ddot?[aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H ([AEIOUY] $macron? $ddot?) ;
|
||||
|
||||
# titlecase, have to fix individually
|
||||
| $1 $rough < H (a $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (e $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (i $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (o $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (u $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (y $macron? $ddot? [aeiouyAEIOUY] $macron?) ;
|
||||
| $1 $rough < H (a $macron? $ddot? ) ;
|
||||
| $1 $rough < H (e $macron? $ddot? ) ;
|
||||
| $1 $rough < H (i $macron? $ddot? ) ;
|
||||
| $1 $rough < H (o $macron? $ddot? ) ;
|
||||
| $1 $rough < H (u $macron? $ddot? ) ;
|
||||
| $1 $rough < H (y $macron? $ddot? ) ;
|
||||
|
||||
# Now do smooth
|
||||
|
||||
ηυ $1 $smooth < [^[:L:][:M:]] { e $macron u ( $ddot? ) };
|
||||
η $smooth < [^[:L:][:M:]] { e $macron };
|
||||
ω $smooth < [^[:L:][:M:]] { o $macron };
|
||||
|
||||
αυ $1 $smooth < [^[:L:][:M:]] { au ( $ddot? ) };
|
||||
αι $1 $smooth < [^[:L:][:M:]] { ai ( $ddot? ) };
|
||||
α $smooth < [^[:L:][:M:]] { a };
|
||||
ευ $1 $smooth < [^[:L:][:M:]] { eu ( $ddot? ) };
|
||||
ει $1 $smooth < [^[:L:][:M:]] { ei ( $ddot? ) };
|
||||
ε $smooth < [^[:L:][:M:]] { e };
|
||||
ι $smooth < [^[:L:][:M:]] { i };
|
||||
ου $1 $smooth < [^[:L:][:M:]] { ou ( $ddot? ) };
|
||||
οι $1 $smooth < [^[:L:][:M:]] { oi ( $ddot? ) };
|
||||
ο $smooth < [^[:L:][:M:]] { o };
|
||||
υι $1 $smooth < [^[:L:][:M:]] { ui ( $ddot? ) }; # diphthong first, so the single-vowel rule below cannot mask it
|
||||
υ $smooth < [^[:L:][:M:]] { u };
|
||||
|
||||
|
||||
# seems to cause infinite loop
|
||||
# | $1 $smooth < [:^L:] { ([aeiouyAEIOUY] $macron? [aeiouyAEIOUY] $macron?) } [^[$smooth]] ;
|
||||
# | $1 $smooth < [:^L:] { ([aeiouyAEIOUY] $macron?) } [[^aeiouyAEIOUY] [$smooth]] ;
|
||||
|
||||
# TODO: preserve smooth/rough breathing if not
|
||||
# on initial vowel sequence
|
||||
|
||||
# need to have these up here so the rules don't mask
|
||||
|
||||
η <> e $macron ;
|
||||
Η <> E $macron ;
|
||||
|
||||
φ <> ph ;
|
||||
Ψ } $beforeLower <> Ps ;
|
||||
Ψ <> PS ;
|
||||
|
||||
Φ } $beforeLower <> Ph ;
|
||||
Φ <> PH ;
|
||||
ψ <> ps ;
|
||||
|
||||
ω <> o $macron ;
|
||||
Ω <> O $macron;
|
||||
|
||||
# NORMAL
|
||||
|
||||
α <> a ;
|
||||
Α <> A ;
|
||||
|
||||
β <> b ;
|
||||
Β <> B ;
|
||||
|
||||
γ } $gammaLike <> n } [gkc] ;
|
||||
γ <> g ;
|
||||
Γ } $gammaLike <> N } [gkc] ;
|
||||
Γ <> G ;
|
||||
|
||||
δ <> d ;
|
||||
Δ <> D ;
|
||||
|
||||
ε <> e ;
|
||||
Ε <> E ;
|
||||
|
||||
ζ <> z ;
|
||||
Ζ <> Z ;
|
||||
|
||||
θ <> th ;
|
||||
Θ } $beforeLower <> Th ;
|
||||
Θ <> TH ;
|
||||
|
||||
ι <> i ;
|
||||
Ι <> I ;
|
||||
|
||||
κ <> k ;
|
||||
Κ <> K ;
|
||||
|
||||
λ <> l ;
|
||||
Λ <> L ;
|
||||
|
||||
μ <> m ;
|
||||
Μ <> M ;
|
||||
|
||||
ν } $gammaLike <> n\' ; # two-way like the uppercase rule, so the apostrophe form round-trips
|
||||
ν <> n ;
|
||||
Ν } $gammaLike <> N\' ;
|
||||
Ν <> N ;
|
||||
|
||||
ξ <> x ;
|
||||
Ξ <> X ;
|
||||
|
||||
ο <> o ;
|
||||
Ο <> O ;
|
||||
|
||||
π <> p ;
|
||||
Π <> P ;
|
||||
|
||||
ρ $rough <> rh;
|
||||
Ρ $rough } $beforeLower <> Rh ;
|
||||
Ρ $rough <> RH ;
|
||||
ρ <> r ;
|
||||
Ρ <> R ;
|
||||
|
||||
[Pp] {ς > \'s ;
|
||||
[Pp] {σ > \'s ;
|
||||
σ < [:^L:] [:M:]* { s } [:^L:] ;
|
||||
ς <> s } [:^L:] ;
|
||||
σ <> s ;
|
||||
[Pp] { Σ <> \'S ;
|
||||
Σ <> S ;
|
||||
|
||||
τ <> t ;
|
||||
Τ <> T ;
|
||||
|
||||
$vowel {υ } <> u ;
|
||||
υ <> y ;
|
||||
$vowel { Υ <> U ;
|
||||
Υ <> Y ;
|
||||
|
||||
χ <> ch ;
|
||||
Χ } $beforeLower <> Ch ;
|
||||
Χ <> CH ;
|
||||
|
||||
# completeness for ASCII
|
||||
|
||||
| k < c ;
|
||||
| ph < f ;
|
||||
| i < j ;
|
||||
| k < q ;
|
||||
| u < v ;
|
||||
| u < w ;
|
||||
|
||||
::NFC ;
|
8
icu4j/src/com/ibm/text/resources/Transliterator_Hiragana_Latin.txt
Executable file
8
icu4j/src/com/ibm/text/resources/Transliterator_Hiragana_Latin.txt
Executable file
|
@ -0,0 +1,8 @@
|
|||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# Date: Tue Jan 23 12:18:46 2001
|
||||
#--------------------------------------------------------------------
|
||||
:: Hiragana-Katakana;
|
||||
:: Katakana-Latin;
|
446
icu4j/src/com/ibm/text/resources/Transliterator_Latin_Katakana.txt
Executable file
446
icu4j/src/com/ibm/text/resources/Transliterator_Latin_Katakana.txt
Executable file
|
@ -0,0 +1,446 @@
|
|||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# Date: Tue Jan 23 12:18:46 2001
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Latin-Katakana
|
||||
|
||||
::NFD ;
|
||||
:: [a-zA-Z] Lower ;
|
||||
|
||||
# Uses modified Hepburn. Small changes to make
|
||||
# unambiguous.
|
||||
|
||||
# | Kunrei-shiki: Hepburn/MHepburn
|
||||
# | ------------------------------
|
||||
# | si: shi
|
||||
# | si ~ya: sha
|
||||
# | si ~yu: shu
|
||||
# | si ~yo: sho
|
||||
# | zi: ji
|
||||
# | zi ~ya: ja
|
||||
# | zi ~yu: ju
|
||||
# | zi ~yo: jo
|
||||
# | ti: chi
|
||||
# | ti ~ya: cha
|
||||
# | ti ~yu: chu
|
||||
# | ti ~yo: cho
|
||||
# | tu: tsu
|
||||
# | di: ji/dji
|
||||
# | du: zu/dzu
|
||||
# | hu: fu
|
||||
|
||||
# | For foreign words:
|
||||
# | -----------------
|
||||
# | se ~i si
|
||||
# | si ~e she
|
||||
# |
|
||||
# | ze ~i zi
|
||||
# | zi ~e je
|
||||
# |
|
||||
# | te ~i ti
|
||||
# | ti ~e che
|
||||
# | te ~u tu
|
||||
# |
|
||||
# | de ~i di
|
||||
# | de ~u du
|
||||
# | de ~i di
|
||||
# |
|
||||
# | he ~u: hu
|
||||
# | hu ~a fa
|
||||
# | hu ~i fi
|
||||
# | hu ~e fe
|
||||
# | hu ~o fo
|
||||
|
||||
# Most small forms are generated, but if necessary
|
||||
# explicit small forms are given with ~a, ~ya, etc.
|
||||
|
||||
#------------------------------------------------------
|
||||
# Variables
|
||||
|
||||
$vowel = [aeiou] ;
|
||||
$macron = \u0304 ;
|
||||
|
||||
# Variables used for doubled-consonants with tsu
|
||||
|
||||
$kana = [\u3041-\u3094] ;
|
||||
|
||||
$voice = [\u3099\u309B];
|
||||
$semivoice = [\u309A\u309C];
|
||||
|
||||
$k_start = [カキクケコかきくけこ] ;
|
||||
|
||||
$s_start = [サシスセソさしすせそ] ;
|
||||
|
||||
$j_start = [シし] $voice ;
|
||||
|
||||
$t_start = [タチツテトたちつてと] ;
|
||||
|
||||
$n_start = [ナニヌネノンなにぬねの] ;
|
||||
|
||||
$h_start = [ハヒヘホはひへほ] ;
|
||||
$f_start = [フふ] ;
|
||||
|
||||
$m_start = [マミムメモまみむめも] ;
|
||||
|
||||
$y_start = [ヤユヨやゆよ] ;
|
||||
|
||||
$r_start = [ラリルレロらりるれろ] ;
|
||||
|
||||
$w_start = [ワヰヱヲわゐゑを] ;
|
||||
|
||||
# if ン is followed by $n_quoter, then it needs an
|
||||
# apostrophe after its romaji form to disambiguate it.
|
||||
# e.g., ン ア != ナ, so represent as "n'a", not "na".
|
||||
|
||||
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
|
||||
|
||||
$small_y = [ャィュェョ] ;
|
||||
|
||||
$iteration = \u309D ;
|
||||
|
||||
#------------------------------------------------------
|
||||
# katakana rules
|
||||
|
||||
# Punctuation
|
||||
|
||||
'.' <> 。;
|
||||
',' <> 、;
|
||||
# ' ' } [a-z] > ; # delete spaces before latin
|
||||
# ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana
|
||||
|
||||
# Iteration Mark
|
||||
# Copy previous letter & marks
|
||||
|
||||
# TODO
|
||||
# | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
|
||||
|
||||
# Specials for katakana -- not shared with hiragana
|
||||
|
||||
va <> ヷ ;
|
||||
vi <> ヸ ;
|
||||
ve <> ヹ ;
|
||||
vo <> ヺ ;
|
||||
'~ka' <> ヵ ;
|
||||
'~ke' <> ヶ ;
|
||||
|
||||
# ~~~ begin shared rules ~~~
|
||||
|
||||
#special
|
||||
|
||||
ya < '~'ャ;
|
||||
yi < '~'ィ ;
|
||||
yu < '~'ュ;
|
||||
ye < '~'ェ;
|
||||
yo < '~'ョ;
|
||||
|
||||
#normal
|
||||
|
||||
a <> ア ;
|
||||
|
||||
b | '~' < ヒ ゙} $small_y ;
|
||||
by } $vowel > ビ | '~y' ;
|
||||
|
||||
ba <> バ ;
|
||||
bi <> ビ ;
|
||||
bu <> ブ ;
|
||||
be <> ベ ;
|
||||
bo <> ボ ;
|
||||
|
||||
c } i > | s ;
|
||||
c } e > | s ;
|
||||
|
||||
da <> ダ ;
|
||||
di <> ディ ;
|
||||
du <> デゥ ;
|
||||
de <> デ ;
|
||||
do <> ド ;
|
||||
dzu <> ヅ ;
|
||||
dja < ヂャ ;
|
||||
dji'~i' < ヂィ ; # liu
|
||||
dju < ヂュ ;
|
||||
dje < ヂェ ;
|
||||
djo < ヂョ ;
|
||||
dji <> ヂ ;
|
||||
dj } $vowel > ヂ | '~y' ;
|
||||
|
||||
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
|
||||
|
||||
cha < チャ ;
|
||||
chi'~i' < チィ ; # liu
|
||||
chu < チュ ;
|
||||
che < チェ ;
|
||||
cho < チョ ;
|
||||
chi <> チ ;
|
||||
ch } $vowel > チ | '~y' ;
|
||||
|
||||
e <> エ ;
|
||||
|
||||
g | '~' < ギ} $small_y ;
|
||||
gy } $vowel > ギ | '~y' ;
|
||||
|
||||
ga <> ガ ;
|
||||
gi <> ギ ;
|
||||
gu <> グ ;
|
||||
ge <> ゲ ;
|
||||
go <> ゴ ;
|
||||
|
||||
i <> イ ;
|
||||
|
||||
# j } $vowel > ジ | '~y' ;
|
||||
|
||||
ja < ジャ ;
|
||||
ji'~i' < ジィ ; # liu
|
||||
ju < ジュ ;
|
||||
je < ジェ ;
|
||||
jo < ジョ ;
|
||||
ji <> ジ ;
|
||||
|
||||
k | '~' < キ} $small_y ;
|
||||
ky } $vowel > キ | '~y' ;
|
||||
|
||||
ka <> カ ;
|
||||
ki <> キ ;
|
||||
ku <> ク ;
|
||||
ke <> ケ ;
|
||||
ko <> コ ;
|
||||
|
||||
m | '~' < ミ} $small_y ;
|
||||
my } $vowel > ミ | '~y' ;
|
||||
|
||||
ma <> マ ;
|
||||
mi <> ミ ;
|
||||
mu <> ム ;
|
||||
me <> メ ;
|
||||
mo <> モ ;
|
||||
|
||||
m } [pbfv] > ン ;
|
||||
|
||||
n | '~' < ニ } $small_y ;
|
||||
ny } $vowel > ニ | '~y' ;
|
||||
|
||||
na <> ナ ;
|
||||
ni <> ニ ;
|
||||
nu <> ヌ ;
|
||||
ne <> ネ ;
|
||||
no <> ノ ;
|
||||
|
||||
o <> オ ;
|
||||
|
||||
p | '~' < ピ } $small_y ;
|
||||
py } $vowel > ピ | '~y' ;
|
||||
|
||||
pa <> パ ;
|
||||
pi <> ピ ;
|
||||
pu <> プ ;
|
||||
pe <> ペ ;
|
||||
po <> ポ ;
|
||||
|
||||
h | '~' < ヒ } $small_y ;
|
||||
hy } $vowel > ヒ | '~y' ;
|
||||
|
||||
ha <> ハ ;
|
||||
hi <> ヒ ;
|
||||
hu <> ヘゥ ;
|
||||
he <> ヘ ;
|
||||
ho <> ホ ;
|
||||
|
||||
# f | '~' < フ } $small_y ;
|
||||
# f } $vowel > フ | '~' ;
|
||||
|
||||
fa <> ファ ;
|
||||
fi <> フィ ;
|
||||
fe <> フェ ;
|
||||
fo <> フォ ;
|
||||
fu <> フ ;
|
||||
|
||||
r | '~' < リ } $small_y ;
|
||||
ry } $vowel > リ | '~y' ;
|
||||
|
||||
ra <> ラ ;
|
||||
ri <> リ ;
|
||||
ru <> ル ;
|
||||
re <> レ ;
|
||||
ro <> ロ ;
|
||||
|
||||
za <> ザ ;
|
||||
zi <> ゼィ ;
|
||||
zu <> ズ ;
|
||||
ze <> ゼ ;
|
||||
zo <> ゾ ;
|
||||
|
||||
sa <> サ ;
|
||||
si <> セィ ;
|
||||
su <> ス ;
|
||||
se <> セ ;
|
||||
so <> ソ ;
|
||||
|
||||
sha < シャ ;
|
||||
shi'~i' < シィ ; # liu
|
||||
shu < シュ ;
|
||||
she < シェ ;
|
||||
sho < ショ ;
|
||||
shi <> シ ;
|
||||
sh } $vowel > シ | '~y' ;
|
||||
|
||||
ta <> タ ;
|
||||
ti <> ティ ;
|
||||
tu <> テゥ ;
|
||||
te <> テ ;
|
||||
to <> ト ;
|
||||
|
||||
tsu <> ツ ;
|
||||
|
||||
# v } $vowel > ヴ | '~' ;
|
||||
|
||||
#'v~a' < ヴァ ; # liu
|
||||
#'v~i' < ヴィ ; # liu
|
||||
#'v~e' < ヴェ ; # liu
|
||||
#'v~o' < ヴォ ; # liu
|
||||
vu <> ヴ ;
|
||||
|
||||
u <> ウ ;
|
||||
|
||||
# w } $vowel > ウ | '~' ;
|
||||
|
||||
wa <> ワ ;
|
||||
wi <> ヰ ;
|
||||
wu > ウ ;
|
||||
we <> ヱ ;
|
||||
wo <> ヲ ;
|
||||
|
||||
ya <> ヤ ;
|
||||
yi > イ ;
|
||||
yu <> ユ ;
|
||||
ye > エ ;
|
||||
yo <> ヨ ;
|
||||
|
||||
# Double consonants: the first letter of a doubled consonant is written as
# the small kana ッ. In each rule the text after '}' is context only: it must
# be present for the rule to match but is not consumed or replaced.
|
||||
|
||||
# Specials (Latin -> kana only): "s" before "sh" and "t" before "ch".
#specials
|
||||
s } sh > ッ ;
|
||||
t } ch > ッ ;
|
||||
|
||||
# Voiced consonants (both directions).
#   Latin -> kana: the consonant, when followed by the same consonant, becomes ッ.
#   kana -> Latin: ッ becomes the consonant when followed by a kana from the
#                  listed $..._start set and then $voice / $semivoice.
# The $..._start, $voice and $semivoice variables are defined outside this
# excerpt (presumably base-kana sets and the voicing marks; confirm there).
#voiced
|
||||
|
||||
b } b <> ッ } [$h_start$f_start] $voice;
|
||||
d } d <> ッ } $t_start $voice;
|
||||
g } g <> ッ } $k_start $voice;
|
||||
p } p <> ッ } [$h_start$f_start] $semivoice;
|
||||
# v } v <> ッ } [ワヰウヱヲう] $voice ;
|
||||
z } z <> ッ } $s_start $voice;
|
||||
|
||||
# Doubled consonants, normal case (both directions).
#   Latin -> kana: a consonant followed by the same consonant becomes ッ
#                  (the second consonant is context and is left in place).
#   kana -> Latin: ッ followed by a kana from the matching $..._start set
#                  (defined outside this excerpt) becomes that consonant.
# normal
|
||||
|
||||
j } j <> ッ } $j_start ;
|
||||
k } k <> ッ } $k_start ;
|
||||
m } m <> ッ } $m_start ;
|
||||
n } n <> ッ } $n_start ;
|
||||
h } h <> ッ } $h_start ;
|
||||
f } f <> ッ } $f_start ;
|
||||
r } r <> ッ } $r_start ;
|
||||
t } t <> ッ } $t_start ;
|
||||
s } s <> ッ } $s_start ;
|
||||
|
||||
# Doubled consonants, completeness rules.
# kana -> Latin only: ッ before a kana in $w_start / $y_start (defined outside
# this excerpt) becomes "w" / "y".
# completeness
|
||||
w < ッ } $w_start;
|
||||
y < ッ } $y_start;
|
||||
|
||||
# Latin -> kana only: the first letter of these pairs becomes ッ; the second
# letter is context and is left in place for later rules.
x } x > ッ ;
|
||||
c } k > ッ ;
|
||||
c } c > ッ ;
|
||||
c } q > ッ ;
|
||||
l } l > ッ ;
|
||||
q } q > ッ ;
|
||||
# The Latin -> kana counterparts for doubled y and w are disabled.
# y } y > ッ ;
|
||||
# w } w > ッ ;
|
||||
|
||||
# Prolonged vowel mark (ー). This indicates a doubling of
|
||||
# the preceding vowel sound.
|
||||
|
||||
# The per-vowel kana -> Latin rules below are disabled (commented out).
#a < a { ー ; # liu
|
||||
#e < e { ー ; # liu
|
||||
#i < i { ー ; # liu
|
||||
#o < o { ー ; # liu
|
||||
#u < u { ー ; # liu
|
||||
|
||||
# Active rule: ー maps to and from the value of $macron, which is defined
# outside this excerpt.
$macron <> ー ;
|
||||
|
||||
# Small forms: small kana are spelled in Latin as '~' followed by the
# spelling of the full-size kana. Rules elsewhere in this file that emit
# '~y' after the cursor (e.g. `my } $vowel > ミ | '~y'`) rely on the '~y?'
# entries here to produce the small kana.
|
||||
|
||||
'~a' <> ァ ;
|
||||
'~i' <> ィ ;
|
||||
'~u' <> ゥ ;
|
||||
'~e' <> ェ ;
|
||||
'~o' <> ォ ;
|
||||
'~tsu' <> ッ ;
|
||||
'~wa' <> ヮ ;
|
||||
'~ya' <> ャ ;
|
||||
# '~yi' is Latin -> kana only; ィ maps back to '~i' (rule above).
'~yi' > ィ ;
|
||||
'~yu' <> ュ ;
|
||||
# '~ye' is Latin -> kana only; ェ maps back to '~e' (rule above).
'~ye' > ェ ;
|
||||
'~yo' <> ョ ;
|
||||
|
||||
# One-way Latin -> kana rules. These do not occur in
|
||||
# well-formed romaji representing actual Japanese text.
|
||||
# Their purpose is to make all romaji map to kana of
|
||||
# some sort.
|
||||
|
||||
# The following are not really necessary, but produce
|
||||
# slightly more natural results.
|
||||
|
||||
# Consonant + "y" with no vowel following: the palatalizing rules such as
# `hy } $vowel > ...` need a following vowel, so these catch what remains.
cy > セィ ;
|
||||
dy > ディ ;
|
||||
hy > ヒ ;
|
||||
sy > セィ ;
|
||||
ty > ティ ;
|
||||
zy > ゼィ ;
|
||||
|
||||
# Isolated consonants, listed here so as not to mask
|
||||
# longer rules above.
|
||||
|
||||
# Two-letter consonant clusters with no vowel (Latin -> kana only).
ch > チ;
|
||||
sh > シ ;
|
||||
dz > ヅ ;
|
||||
dj > ヂ;
|
||||
|
||||
# Single consonants with no vowel (Latin -> kana only, except "n"): each
# maps to a full syllable kana, e.g. k > ク is the same kana as `ku <> ク`.
b > ブ ;
|
||||
d > デ ;
|
||||
g > グ ;
|
||||
h > ヘ ;
|
||||
k > ク ;
|
||||
m > ム ;
# kana -> Latin only: ン before $n_quoter (defined outside this excerpt)
# becomes "n" plus an apostrophe ('' is an escaped single quote).
# Presumably this keeps ン distinct from a following n-row syllable
# or vowel when read back; confirm against the $n_quoter definition.
n'' < ン } $n_quoter ;
|
||||
# Both directions: bare "n" and ン.
n <> ン ;
|
||||
p > プ ;
|
||||
r > ル ;
|
||||
s > ス ;
|
||||
t > テ ;
|
||||
y > イ ;
|
||||
z > ズ ;
|
||||
v > ヴ ;
|
||||
|
||||
f > フ;
|
||||
j > ジ;
|
||||
w > ウ;
|
||||
|
||||
# Simple substitutions using backup (Latin -> kana only).
# Letters with no kana rule of their own are rewritten to other Latin
# letters. The '|' before the replacement puts the cursor in front of it,
# so the replacement text is then processed again by the rules above.
|
||||
|
||||
c > | k ;
|
||||
l > | r ;
|
||||
q > | k ;
|
||||
x > | ks ;
|
||||
|
||||
# ~~~ END shared rules ~~~
|
||||
|
||||
#------------------------------------------------------
|
||||
# Final cleanup
|
||||
|
||||
# '~' is the marker used in this file for small kana and emitted by rules
# such as `my } $vowel > ミ | '~y'`; any '~' still present in the text at
# this point is removed (Latin -> kana direction only).
'~' > ; # delete stray tildes between letters
|
||||
# The matching rule for stray quotes is disabled (commented out).
# '' > ; # delete stray quotes between letters
|
||||
|
||||
# Apply the NFC transform as the last step.
:: NFC ;
|
||||
|
||||
# eof
|
Loading…
Add table
Reference in a new issue