diff --git a/.gitattributes b/.gitattributes index 42e7b1a693f..03890f752c4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -87,12 +87,9 @@ icu4j/src/com/ibm/icu/dev/data/unicode/Draft-TestSuite.txt -text icu4j/src/com/ibm/icu/impl/data/thai_dict -text icu4j/src/com/ibm/icu/impl/data/unames.dat -text icu4j/src/com/ibm/icu/impl/data/uprops.dat -text -icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt -text icu4j/src/com/ibm/text/resources/Transliterator_Han_Pinyin.utf8.txt -text -icu4j/src/com/ibm/text/resources/Transliterator_Hiragana_Katakana.utf8.txt -text icu4j/src/com/ibm/text/resources/Transliterator_Kanji_English.utf8.txt -text icu4j/src/com/ibm/text/resources/Transliterator_Kanji_OnRomaji.utf8.txt -text -icu4j/src/com/ibm/text/resources/Transliterator_Latin_Jamo.utf8.txt -text icu4j/src/com/ibm/text/resources/thai_dict -text icu4j/src/com/ibm/text/resources/unames.dat -text icu4j/src/com/ibm/text/resources/uprops.dat -text diff --git a/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt b/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt deleted file mode 100755 index ba6f8af35ff..00000000000 --- a/icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.utf8.txt +++ /dev/null @@ -1,270 +0,0 @@ -#-------------------------------------------------------------------- -# Copyright (c) 1999-2001, International Business Machines -# Corporation and others. All Rights Reserved. -#-------------------------------------------------------------------- -# Date: Tue Jan 23 12:41:57 2001 -#-------------------------------------------------------------------- - -# Fullwidth-Halfwidth - -# Mechanically generated from Unicode Character Database - -# multicharacter - -ガ<>ガ; # to KATAKANA LETTER GA -ギ<>ギ; # to KATAKANA LETTER GI -グ<>グ; # to KATAKANA LETTER GU -ゲ<>ゲ; # to KATAKANA LETTER GE -ゴ<>ゴ; # to KATAKANA LETTER GO -ザ<>ザ; # to KATAKANA LETTER ZA -ジ<>ジ; # to KATAKANA LETTER ZI -ズ<>ズ; # to KATAKANA LETTER ZU -ゼ<>ゼ; # to KATAKANA LETTER ZE -ゾ<>ゾ; # to KATAKANA LETTER ZO -ダ<>ダ; # to KATAKANA LETTER DA -ヂ<>ヂ; # to KATAKANA LETTER DI -ヅ<>ヅ; # to KATAKANA LETTER DU -デ<>デ; # to KATAKANA LETTER DE -ド<>ド; # to KATAKANA LETTER DO -バ<>バ; # to KATAKANA LETTER BA -パ<>パ; # to KATAKANA LETTER PA -ビ<>ビ; # to KATAKANA LETTER BI -ピ<>ピ; # to KATAKANA LETTER PI -ブ<>ブ; # to KATAKANA LETTER BU -プ<>プ; # to KATAKANA LETTER PU -ベ<>ベ; # to KATAKANA LETTER BE -ペ<>ペ; # to KATAKANA LETTER PE -ボ<>ボ; # to KATAKANA LETTER BO -ポ<>ポ; # to KATAKANA LETTER PO -ヴ<>ヴ; # to KATAKANA LETTER VU -ヷ<>ヷ; # to KATAKANA LETTER VA -ヺ<>ヺ; # to KATAKANA LETTER VO - -# single character - -!<>'!'; # from FULLWIDTH EXCLAMATION MARK -"<>'\"'; # from FULLWIDTH QUOTATION MARK -#<>'#'; # from FULLWIDTH NUMBER SIGN -$<>'$'; # from FULLWIDTH DOLLAR SIGN -%<>'%'; # from FULLWIDTH PERCENT SIGN -&<>'&'; # from FULLWIDTH AMPERSAND -'<>''; # from FULLWIDTH APOSTROPHE -(<>'('; # from FULLWIDTH LEFT PARENTHESIS -)<>')'; # from FULLWIDTH RIGHT PARENTHESIS -*<>'*'; # from FULLWIDTH ASTERISK -+<>'+'; # from FULLWIDTH PLUS SIGN -,<>','; # from FULLWIDTH COMMA --<>'-'; # from FULLWIDTH HYPHEN-MINUS -.<>'.'; # from FULLWIDTH FULL STOP -/<>'/'; # from FULLWIDTH SOLIDUS -0<>'0'; # from FULLWIDTH DIGIT ZERO -1<>'1'; # from FULLWIDTH DIGIT ONE -2<>'2'; # from FULLWIDTH DIGIT TWO -3<>'3'; # from FULLWIDTH DIGIT THREE -4<>'4'; # from FULLWIDTH DIGIT FOUR -5<>'5'; # from FULLWIDTH DIGIT FIVE -6<>'6'; # from FULLWIDTH DIGIT SIX -7<>'7'; # from FULLWIDTH DIGIT SEVEN -8<>'8'; # from FULLWIDTH DIGIT EIGHT -9<>'9'; # from FULLWIDTH DIGIT NINE -:<>':'; # from FULLWIDTH COLON -;<>';'; # from FULLWIDTH SEMICOLON -<<>'<'; # from FULLWIDTH LESS-THAN SIGN -=<>'='; # from FULLWIDTH EQUALS SIGN -><>'>'; # from FULLWIDTH GREATER-THAN SIGN -?<>'?'; # from FULLWIDTH QUESTION MARK -@<>'@'; # from FULLWIDTH COMMERCIAL AT -A<>A; # from FULLWIDTH LATIN CAPITAL LETTER A -B<>B; # from FULLWIDTH LATIN CAPITAL LETTER B -C<>C; # from FULLWIDTH LATIN CAPITAL LETTER C -D<>D; # from FULLWIDTH LATIN CAPITAL LETTER D -E<>E; # from FULLWIDTH LATIN CAPITAL LETTER E -F<>F; # from FULLWIDTH LATIN CAPITAL LETTER F -G<>G; # from FULLWIDTH LATIN CAPITAL LETTER G -H<>H; # from FULLWIDTH LATIN CAPITAL LETTER H -I<>I; # from FULLWIDTH LATIN CAPITAL LETTER I -J<>J; # from FULLWIDTH LATIN CAPITAL LETTER J -K<>K; # from FULLWIDTH LATIN CAPITAL LETTER K -L<>L; # from FULLWIDTH LATIN CAPITAL LETTER L -M<>M; # from FULLWIDTH LATIN CAPITAL LETTER M -N<>N; # from FULLWIDTH LATIN CAPITAL LETTER N -O<>O; # from FULLWIDTH LATIN CAPITAL LETTER O -P<>P; # from FULLWIDTH LATIN CAPITAL LETTER P -Q<>Q; # from FULLWIDTH LATIN CAPITAL LETTER Q -R<>R; # from FULLWIDTH LATIN CAPITAL LETTER R -S<>S; # from FULLWIDTH LATIN CAPITAL LETTER S -T<>T; # from FULLWIDTH LATIN CAPITAL LETTER T -U<>U; # from FULLWIDTH LATIN CAPITAL LETTER U -V<>V; # from FULLWIDTH LATIN CAPITAL LETTER V -W<>W; # from FULLWIDTH LATIN CAPITAL LETTER W -X<>X; # from FULLWIDTH LATIN CAPITAL LETTER X -Y<>Y; # from FULLWIDTH LATIN CAPITAL LETTER Y -Z<>Z; # from FULLWIDTH LATIN CAPITAL LETTER Z -[<>'['; # from FULLWIDTH LEFT SQUARE BRACKET -\<>'\\'; # from FULLWIDTH REVERSE SOLIDUS {double escape - aliu} -]<>']'; # from FULLWIDTH RIGHT SQUARE BRACKET -^<>'^'; # from FULLWIDTH CIRCUMFLEX ACCENT -_<>'_'; # from FULLWIDTH LOW LINE -`<>'`'; # from FULLWIDTH GRAVE ACCENT -a<>a; # from FULLWIDTH LATIN SMALL LETTER A -b<>b; # from FULLWIDTH LATIN SMALL LETTER B -c<>c; # from FULLWIDTH LATIN SMALL LETTER C -d<>d; # from FULLWIDTH LATIN SMALL LETTER D -e<>e; # from FULLWIDTH LATIN SMALL LETTER E -f<>f; # from FULLWIDTH LATIN SMALL LETTER F -g<>g; # from FULLWIDTH LATIN SMALL LETTER G -h<>h; # from FULLWIDTH LATIN SMALL LETTER H -i<>i; # from FULLWIDTH LATIN SMALL LETTER I -j<>j; # from FULLWIDTH LATIN SMALL LETTER J -k<>k; # from FULLWIDTH LATIN SMALL LETTER K -l<>l; # from FULLWIDTH LATIN SMALL LETTER L -m<>m; # from FULLWIDTH LATIN SMALL LETTER M -n<>n; # from FULLWIDTH LATIN SMALL LETTER N -o<>o; # from FULLWIDTH LATIN SMALL LETTER O -p<>p; # from FULLWIDTH LATIN SMALL LETTER P -q<>q; # from FULLWIDTH LATIN SMALL LETTER Q -r<>r; # from FULLWIDTH LATIN SMALL LETTER R -s<>s; # from FULLWIDTH LATIN SMALL LETTER S -t<>t; # from FULLWIDTH LATIN SMALL LETTER T -u<>u; # from FULLWIDTH LATIN SMALL LETTER U -v<>v; # from FULLWIDTH LATIN SMALL LETTER V -w<>w; # from FULLWIDTH LATIN SMALL LETTER W -x<>x; # from FULLWIDTH LATIN SMALL LETTER X -y<>y; # from FULLWIDTH LATIN SMALL LETTER Y -z<>z; # from FULLWIDTH LATIN SMALL LETTER Z -{<>'{'; # from FULLWIDTH LEFT CURLY BRACKET -|<>'|'; # from FULLWIDTH VERTICAL LINE -}<>'}'; # from FULLWIDTH RIGHT CURLY BRACKET -~<>'~'; # from FULLWIDTH TILDE -。<>。; # to HALFWIDTH IDEOGRAPHIC FULL STOP -「<>「; # to HALFWIDTH LEFT CORNER BRACKET -」<>」; # to HALFWIDTH RIGHT CORNER BRACKET -、<>、; # to HALFWIDTH IDEOGRAPHIC COMMA -・<>・; # to HALFWIDTH KATAKANA MIDDLE DOT -ヲ<>ヲ; # to HALFWIDTH KATAKANA LETTER WO -ァ<>ァ; # to HALFWIDTH KATAKANA LETTER SMALL A -ィ<>ィ; # to HALFWIDTH KATAKANA LETTER SMALL I -ゥ<>ゥ; # to HALFWIDTH KATAKANA LETTER SMALL U -ェ<>ェ; # to HALFWIDTH KATAKANA LETTER SMALL E -ォ<>ォ; # to HALFWIDTH KATAKANA LETTER SMALL O -ャ<>ャ; # to HALFWIDTH KATAKANA LETTER SMALL YA -ュ<>ュ; # to HALFWIDTH KATAKANA LETTER SMALL YU -ョ<>ョ; # to HALFWIDTH KATAKANA LETTER SMALL YO -ッ<>ッ; # to HALFWIDTH KATAKANA LETTER SMALL TU -ー<>ー; # to HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK -ア<>ア; # to HALFWIDTH KATAKANA LETTER A -イ<>イ; # to HALFWIDTH KATAKANA LETTER I -ウ<>ウ; # to HALFWIDTH KATAKANA LETTER U -エ<>エ; # to HALFWIDTH KATAKANA LETTER E -オ<>オ; # to HALFWIDTH KATAKANA LETTER O -カ<>カ; # to HALFWIDTH KATAKANA LETTER KA -キ<>キ; # to HALFWIDTH KATAKANA LETTER KI -ク<>ク; # to HALFWIDTH KATAKANA LETTER KU -ケ<>ケ; # to HALFWIDTH KATAKANA LETTER KE -コ<>コ; # to HALFWIDTH KATAKANA LETTER KO -サ<>サ; # to HALFWIDTH KATAKANA LETTER SA -シ<>シ; # to HALFWIDTH KATAKANA LETTER SI -ス<>ス; # to HALFWIDTH KATAKANA LETTER SU -セ<>セ; # to HALFWIDTH KATAKANA LETTER SE -ソ<>ソ; # to HALFWIDTH KATAKANA LETTER SO -タ<>タ; # to HALFWIDTH KATAKANA LETTER TA -チ<>チ; # to HALFWIDTH KATAKANA LETTER TI -ツ<>ツ; # to HALFWIDTH KATAKANA LETTER TU -テ<>テ; # to HALFWIDTH KATAKANA LETTER TE -ト<>ト; # to HALFWIDTH KATAKANA LETTER TO -ナ<>ナ; # to HALFWIDTH KATAKANA LETTER NA -ニ<>ニ; # to HALFWIDTH KATAKANA LETTER NI -ヌ<>ヌ; # to HALFWIDTH KATAKANA LETTER NU -ネ<>ネ; # to HALFWIDTH KATAKANA LETTER NE -ノ<>ノ; # to HALFWIDTH KATAKANA LETTER NO -ハ<>ハ; # to HALFWIDTH KATAKANA LETTER HA -ヒ<>ヒ; # to HALFWIDTH KATAKANA LETTER HI -フ<>フ; # to HALFWIDTH KATAKANA LETTER HU -ヘ<>ヘ; # to HALFWIDTH KATAKANA LETTER HE -ホ<>ホ; # to HALFWIDTH KATAKANA LETTER HO -マ<>マ; # to HALFWIDTH KATAKANA LETTER MA -ミ<>ミ; # to HALFWIDTH KATAKANA LETTER MI -ム<>ム; # to HALFWIDTH KATAKANA LETTER MU -メ<>メ; # to HALFWIDTH KATAKANA LETTER ME -モ<>モ; # to HALFWIDTH KATAKANA LETTER MO -ヤ<>ヤ; # to HALFWIDTH KATAKANA LETTER YA -ユ<>ユ; # to HALFWIDTH KATAKANA LETTER YU -ヨ<>ヨ; # to HALFWIDTH KATAKANA LETTER YO -ラ<>ラ; # to HALFWIDTH KATAKANA LETTER RA -リ<>リ; # to HALFWIDTH KATAKANA LETTER RI -ル<>ル; # to HALFWIDTH KATAKANA LETTER RU -レ<>レ; # to HALFWIDTH KATAKANA LETTER RE -ロ<>ロ; # to HALFWIDTH KATAKANA LETTER RO -ワ<>ワ; # to HALFWIDTH KATAKANA LETTER WA -ン<>ン; # to HALFWIDTH KATAKANA LETTER N -゙<>゙; # to HALFWIDTH KATAKANA VOICED SOUND MARK -゚<>゚; # to HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK -ᅠ<>ᅠ; # to HALFWIDTH HANGUL FILLER -ᄀ<>ᄀ; # to HALFWIDTH HANGUL LETTER KIYEOK -ᄁ<>ᄁ; # to HALFWIDTH HANGUL LETTER SSANGKIYEOK -ᆪ<>ᆪ; # to HALFWIDTH HANGUL LETTER KIYEOK-SIOS -ᄂ<>ᄂ; # to HALFWIDTH HANGUL LETTER NIEUN -ᆬ<>ᆬ; # to HALFWIDTH HANGUL LETTER NIEUN-CIEUC -ᆭ<>ᆭ; # to HALFWIDTH HANGUL LETTER NIEUN-HIEUH -ᄃ<>ᄃ; # to HALFWIDTH HANGUL LETTER TIKEUT -ᄄ<>ᄄ; # to HALFWIDTH HANGUL LETTER SSANGTIKEUT -ᄅ<>ᄅ; # to HALFWIDTH HANGUL LETTER RIEUL -ᆰ<>ᆰ; # to HALFWIDTH HANGUL LETTER RIEUL-KIYEOK -ᆱ<>ᆱ; # to HALFWIDTH HANGUL LETTER RIEUL-MIEUM -ᆲ<>ᆲ; # to HALFWIDTH HANGUL LETTER RIEUL-PIEUP -ᆳ<>ᆳ; # to HALFWIDTH HANGUL LETTER RIEUL-SIOS -ᆴ<>ᆴ; # to HALFWIDTH HANGUL LETTER RIEUL-THIEUTH -ᆵ<>ᆵ; # to HALFWIDTH HANGUL LETTER RIEUL-PHIEUPH -ᄚ<>ᄚ; # to HALFWIDTH HANGUL LETTER RIEUL-HIEUH -ᄆ<>ᄆ; # to HALFWIDTH HANGUL LETTER MIEUM -ᄇ<>ᄇ; # to HALFWIDTH HANGUL LETTER PIEUP -ᄈ<>ᄈ; # to HALFWIDTH HANGUL LETTER SSANGPIEUP -ᄡ<>ᄡ; # to HALFWIDTH HANGUL LETTER PIEUP-SIOS -ᄉ<>ᄉ; # to HALFWIDTH HANGUL LETTER SIOS -ᄊ<>ᄊ; # to HALFWIDTH HANGUL LETTER SSANGSIOS -ᄋ<>ᄋ; # to HALFWIDTH HANGUL LETTER IEUNG -ᄌ<>ᄌ; # to HALFWIDTH HANGUL LETTER CIEUC -ᄍ<>ᄍ; # to HALFWIDTH HANGUL LETTER SSANGCIEUC -ᄎ<>ᄎ; # to HALFWIDTH HANGUL LETTER CHIEUCH -ᄏ<>ᄏ; # to HALFWIDTH HANGUL LETTER KHIEUKH -ᄐ<>ᄐ; # to HALFWIDTH HANGUL LETTER THIEUTH -ᄑ<>ᄑ; # to HALFWIDTH HANGUL LETTER PHIEUPH -ᄒ<>ᄒ; # to HALFWIDTH HANGUL LETTER HIEUH -ᅡ<>ᅡ; # to HALFWIDTH HANGUL LETTER A -ᅢ<>ᅢ; # to HALFWIDTH HANGUL LETTER AE -ᅣ<>ᅣ; # to HALFWIDTH HANGUL LETTER YA -ᅤ<>ᅤ; # to HALFWIDTH HANGUL LETTER YAE -ᅥ<>ᅥ; # to HALFWIDTH HANGUL LETTER EO -ᅦ<>ᅦ; # to HALFWIDTH HANGUL LETTER E -ᅧ<>ᅧ; # to HALFWIDTH HANGUL LETTER YEO -ᅨ<>ᅨ; # to HALFWIDTH HANGUL LETTER YE -ᅩ<>ᅩ; # to HALFWIDTH HANGUL LETTER O -ᅪ<>ᅪ; # to HALFWIDTH HANGUL LETTER WA -ᅫ<>ᅫ; # to HALFWIDTH HANGUL LETTER WAE -ᅬ<>ᅬ; # to HALFWIDTH HANGUL LETTER OE -ᅭ<>ᅭ; # to HALFWIDTH HANGUL LETTER YO -ᅮ<>ᅮ; # to HALFWIDTH HANGUL LETTER U -ᅯ<>ᅯ; # to HALFWIDTH HANGUL LETTER WEO -ᅰ<>ᅰ; # to HALFWIDTH HANGUL LETTER WE -ᅱ<>ᅱ; # to HALFWIDTH HANGUL LETTER WI -ᅲ<>ᅲ; # to HALFWIDTH HANGUL LETTER YU -ᅳ<>ᅳ; # to HALFWIDTH HANGUL LETTER EU -ᅴ<>ᅴ; # to HALFWIDTH HANGUL LETTER YI -ᅵ<>ᅵ; # to HALFWIDTH HANGUL LETTER I -¢<>'¢'; # from FULLWIDTH CENT SIGN -£<>'£'; # from FULLWIDTH POUND SIGN -¬<>'¬'; # from FULLWIDTH NOT SIGN - ̄<>' '̄; # from FULLWIDTH MACRON -' '<>' '; # ideographic space (place this after MACRON) -¦<>'¦'; # from FULLWIDTH BROKEN BAR -¥<>'¥'; # from FULLWIDTH YEN SIGN -₩<>₩; # from FULLWIDTH WON SIGN -│<>│; # to HALFWIDTH FORMS LIGHT VERTICAL -←<>←; # to HALFWIDTH LEFTWARDS ARROW -↑<>↑; # to HALFWIDTH UPWARDS ARROW -→<>→; # to HALFWIDTH RIGHTWARDS ARROW -↓<>↓; # to HALFWIDTH DOWNWARDS ARROW -■<>■; # to HALFWIDTH BLACK SQUARE -○<>○; # to HALFWIDTH WHITE CIRCLE - -# eof diff --git a/icu4j/src/com/ibm/text/resources/Transliterator_Hiragana_Katakana.utf8.txt b/icu4j/src/com/ibm/text/resources/Transliterator_Hiragana_Katakana.utf8.txt deleted file mode 100755 index df904d4e841..00000000000 --- a/icu4j/src/com/ibm/text/resources/Transliterator_Hiragana_Katakana.utf8.txt +++ /dev/null @@ -1,200 +0,0 @@ -#-------------------------------------------------------------------- -# Copyright (c) 1999-2001, International Business Machines -# Corporation and others. All Rights Reserved. -#-------------------------------------------------------------------- -# Date: Tue Jan 23 2001 -#-------------------------------------------------------------------- - -# Hiragana-Katana - -# This is largely a one-to-one mapping, but it has a -# few kinks: - -# 1. The Katakana va/vi/ve/vo (30F7-30FA) have no -# Hiragana equivalents. We use Hiragana wa/wi/we/wo -# (308F-3092) with a voicing mark (3099), which is -# semantically equivalent. However, this is a non- -# roundtripping transformation. - -# 2. The Katakana small ka/ke (30F5,30F6) have no -# Hiragana equiavlents. We convert them to normal -# Hiragana ka/ke (304B,3051). This is a one-way -# information-losing transformation and precludes -# round-tripping of 30F5 and 30F6. - -# 3. The combining marks 3099-309C are in the Hiragana -# block, but they apply to Katakana as well, so we -# leave them untouched. - -# 4. The Katakana prolonged sound mark 30FC doubles the -# preceding vowel. This is a one-way information- -# losing transformation from Katakana to Hiragana. - -# 5. The Katakana middle dot separates words in foreign -# expressions; we leave this unmodified. - -# The above points preclude successful round-trip -# transformations of arbitrary input text. However, -# they provide naturalistic results that should conform -# to user expectations. - - -# Combining equivalents va/vi/ve/vo -わ゙ <> ヷ; -ゐ゙ <> ヸ; -ゑ゙ <> ヹ; -を゙ <> ヺ; - -# One-to-one mappings, main block -# 3041:3094 <> 30A1:30F4 -# 309D,E <> 30FD,E -ぁ <> ァ; -あ <> ア; -ぃ <> ィ; -い <> イ; -ぅ <> ゥ; -う <> ウ; -ぇ <> ェ; -え <> エ; -ぉ <> ォ; -お <> オ; -か <> カ; -が <> ガ; -き <> キ; -ぎ <> ギ; -く <> ク; -ぐ <> グ; -け <> ケ; -げ <> ゲ; -こ <> コ; -ご <> ゴ; -さ <> サ; -ざ <> ザ; -し <> シ; -じ <> ジ; -す <> ス; -ず <> ズ; -せ <> セ; -ぜ <> ゼ; -そ <> ソ; -ぞ <> ゾ; -た <> タ; -だ <> ダ; -ち <> チ; -ぢ <> ヂ; -っ <> ッ; -つ <> ツ; -づ <> ヅ; -て <> テ; -で <> デ; -と <> ト; -ど <> ド; -な <> ナ; -に <> ニ; -ぬ <> ヌ; -ね <> ネ; -の <> ノ; -は <> ハ; -ば <> バ; -ぱ <> パ; -ひ <> ヒ; -び <> ビ; -ぴ <> ピ; -ふ <> フ; -ぶ <> ブ; -ぷ <> プ; -へ <> ヘ; -べ <> ベ; -ぺ <> ペ; -ほ <> ホ; -ぼ <> ボ; -ぽ <> ポ; -ま <> マ; -み <> ミ; -む <> ム; -め <> メ; -も <> モ; -ゃ <> ャ; -や <> ヤ; -ゅ <> ュ; -ゆ <> ユ; -ょ <> ョ; -よ <> ヨ; -ら <> ラ; -り <> リ; -る <> ル; -れ <> レ; -ろ <> ロ; -ゎ <> ヮ; -わ <> ワ; -ゐ <> ヰ; -ゑ <> ヱ; -を <> ヲ; -ん <> ン; -ゔ <> ヴ; -ゝ <> ヽ; -ゞ <> ヾ; - -# One-way Katakana-Hiragana xform of small K ka/ke to -# normal H ka/ke. -か < ヵ; -け < ヶ; - -# Katakana followed by a prolonged sound mark 30FC has -# its final vowel doubled. This is a Katakana-Hiragana -# one-way information-losing transformation. We -# include the small Katakana (e.g., small A 3041) and -# do not distinguish them from their large -# counterparts. It doesn't make sense to double a -# small counterpart vowel as a small Hiragana vowel, so -# we don't do so. In natural text this should never -# occur anyway. If a 30FC is seen without a preceding -# vowel sound (e.g., after n 30F3) we do not change it. - -### $long = ー; - -# The following categories are Hiragana, not Katakana -# as might be expected, since by the time we get to the -# 30FC, the preceding character will have already been -# transformed to Hiragana. - -# {The following mechanically generated from the -# Unicode 3.0 data:} - -$xa = [ \ -ぁ あ か が さ ざ \ -た だ な は ば ぱ \ -ま ゃ や ら ゎ わ \ -]; - -$xi = [ \ -ぃ い き ぎ し じ \ -ち ぢ に ひ び ぴ \ -み り ゐ \ -]; - -$xu = [ \ -ぅ う く ぐ す ず \ -っ つ づ ぬ ふ ぶ \ -ぷ む ゅ ゆ る ゔ \ -]; - -$xe = [ \ -ぇ え け げ せ ぜ \ -て で ね へ べ ぺ \ -め れ ゑ \ -]; - -$xo = [ \ -ぉ お こ ご そ ぞ \ -と ど の ほ ぼ ぽ \ -も ょ よ ろ を \ -]; - -あ < $xa {ー}; -い < $xi {ー}; -う < $xu {ー}; -え < $xe {ー}; -お < $xo {ー}; - -# eof diff --git a/icu4j/src/com/ibm/text/resources/Transliterator_Latin_Jamo.utf8.txt b/icu4j/src/com/ibm/text/resources/Transliterator_Latin_Jamo.utf8.txt deleted file mode 100755 index bbcbb8f5cb0..00000000000 --- a/icu4j/src/com/ibm/text/resources/Transliterator_Latin_Jamo.utf8.txt +++ /dev/null @@ -1,511 +0,0 @@ -#-------------------------------------------------------------------- -# Copyright (c) 1999-2001, International Business Machines -# Corporation and others. All Rights Reserved. -#-------------------------------------------------------------------- - -# Latin-Jamo - -# Transliteration from Latin characters to Korean script is done in -# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul -# transliteration is done algorithmically following Unicode 3.0 -# section 3.11. This file implements the Latin to Jamo -# transliteration using rules. - -# Jamo occupy the block 1100-11FF. Within this block there are three -# groups of characters: initial consonants or choseong (I), medial -# vowels or jungseong (M), and trailing consonants or jongseong (F). -# Standard Korean syllables are of the form I+M+F*. - -# Section 3.11 describes the use of 'filler' jamo to convert -# nonstandard syllables to standard form: the choseong filler 115F and -# the junseong filler 1160. In this transliterator, we will not use -# 115F or 1160. - -# We will, however, insert two 'null' jamo to make foreign words -# conform to Korean syllable structure. These are the null initial -# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text, -# we will use the hyphen in order to disambiguate strings, -# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G). - -# We will not use all of the characters in the jamo block. We will -# only use the 19 initials, 21 medials, and 27 finals possessing a -# jamo short name as defined in section 4.4 of the Unicode book. - -# Rules of thumb. These guidelines provide the basic framework -# for the rules. They are phrased in terms of Latin-Jamo transliteration. -# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are -# just context-free transliteration of jamo to corresponding short names, -# with the addition of hyphens to maintain round-trip integrity -# in the context of the Latin-Jamo rules. - -# A sequence of vowels: -# - Take the longest sequence you can. If there are too many, or you don't -# have a starting consonant, introduce a 110B necessary. - -# A sequence of consonants. -# - First join the double consonants: G + G -> GG -# - In the remaining list, -# -- If there is no preceding vowel, take the first consonant, and insert EU -# after it. Continue with the rest of the consonants. -# -- If there is one consonant, attach to the following vowel -# -- If there are two consonants and a following vowel, attach one to the -# preceeding vowel, and one to the following vowel. -# -- If there are more than two consonants, join the first two together if you -# can: L + G => LG -# -- If you still end up with more than 2 consonants, insert EU after the -# first one, and continue with the rest of the consonants. - -#---------------------------------------------------------------------- -# Variables - -# Some latin consonants or consonant pairs only occur as initials, and -# some only as finals, but some occur as both. This makes some jamo -# consonants ambiguous when transliterated into latin. -# Initial only: IEUNG BB DD JJ R -# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ -# Initial and Final: B C D G GG H J K M N P S SS T - - $Gi = \u1100; - $GGi = \u1101; - $Ni = \u1102; - $Di = \u1103; - $DD = \u1104; - $R = \u1105; - $Mi = \u1106; - $Bi = \u1107; - $BB = \u1108; - $Si = \u1109; - $SSi = \u110A; - $IEUNG = \u110B; # null initial, inserted during Latin-Jamo - $Ji = \u110C; - $JJ = \u110D; - $Ci = \u110E; - $Ki = \u110F; - $Ti = \u1110; - $Pi = \u1111; - $Hi = \u1112; - - $A = \u1161; - $AE = \u1162; - $YA = \u1163; - $YAE = \u1164; - $EO = \u1165; - $E = \u1166; - $YEO = \u1167; - $YE = \u1168; - $O = \u1169; - $WA = \u116A; - $WAE = \u116B; - $OE = \u116C; - $YO = \u116D; - $U = \u116E; - $WEO = \u116F; - $WE = \u1170; - $WI = \u1171; - $YU = \u1172; - $EU = \u1173; # null medial, inserted during Latin-Jamo - $YI = \u1174; - $I = \u1175; - - $Gf = \u11A8; - $GGf = \u11A9; - $GS = \u11AA; - $Nf = \u11AB; - $NJ = \u11AC; - $NH = \u11AD; - $Df = \u11AE; - $L = \u11AF; - $LG = \u11B0; - $LM = \u11B1; - $LB = \u11B2; - $LS = \u11B3; - $LT = \u11B4; - $LP = \u11B5; - $LH = \u11B6; - $Mf = \u11B7; - $Bf = \u11B8; - $BS = \u11B9; - $Sf = \u11BA; - $SSf = \u11BB; - $NG = \u11BC; - $Jf = \u11BD; - $Cf = \u11BE; - $Kf = \u11BF; - $Tf = \u11C0; - $Pf = \u11C1; - $Hf = \u11C2; - - $jamoInitial = [\u1100-\u1112]; - - $jamoMedial = [\u1161-\u1175]; - - $latinInitial = [bcdghjkmnprst]; - - # Any character in the latin transliteration of a medial - $latinMedial = [aeiouwy]; - - # The last character of the latin transliteration of a medial - $latinMedialEnd = [aeiou]; - -#---------------------------------------------------------------------- -# Jamo-Latin - -# Jamo to latin is relatively simple, since it is the latin that is -# ambiguous. Most rules are straightforward, and we encode them below -# as simple add-on back rule, e.g.: - -# $jamoMedial {bs} > $BS; - -# becomes - -# $jamoMedial {bs} <> $BS; - -# Furthermore, we don't care about the ordering for Jamo-Latin because -# we are going from single characters, so we can very easily piggyback -# on the Latin-Jamo. - -# The main issue with Jamo-Latin is when to insert hyphens. -# Hyphens are inserted to obtain correct round trip behavior. For -# example, the sequence Ki A Gf Gi E, if transliterated to "kagge", -# would then round trip to Ki A GGi E. To prevent this, we insert a -# hyphen: "kag-ge". IMPORTANT: The need for hyphens depends -# very specifically on the behavior of the Latin-Jamo rules. A change -# in the Latin-Jamo behavior can completely change the way the -# hyphen insertion must be done. - -# First try to preserve actual hyphens in the jamo text by doubling -# them. This fixes problems like: -# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol -# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional -# -- if we don't care about losing hyphens in the jamo, we can delete -# this rule. - - '--' <> '-'; - -# Triple consonants. For three consonants "axxx" we insert a -# hyphen between the first and second "x" if XXf, Xf, and Xi all -# exist, and we have A Xf XXi. This prevents the reverse -# transliteration to A XXf Xi. - - '-' < $latinMedialEnd g {} $GGi; - '-' < $latinMedialEnd s {} $SSi; - -# For vowels the rule is similar. If there is a vowel "ae" such that -# "a" by itself and "e" by itself are vowels, then we want to map A E -# to "a-e" so as not to round trip to AE. However, in the text Ki EO -# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For -# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be -# tested. NOTE: These rules used to have a left context of -# $latinInitial instead of [^$latinMedial]. The problem with this is -# sequences where an initial IEUNG is transliterated away: -# (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O) - - '-' < [^$latinMedial] [y w] e {} [$O $OE]; - '-' < [^$latinMedial] e {} [$O $OE $U]; - '-' < [^$latinMedial] [o a] {} [$E $EO $EU]; - '-' < [^$latinMedial] [w y] a {} [$E $EO $EU]; - -# Similar to the above, but with an intervening $IEUNG. - - '-' < [^$latinMedial] [y w] e {} $IEUNG [$O $OE]; - '-' < [^$latinMedial] e {} $IEUNG [$O $OE $U]; - '-' < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU]; - '-' < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU]; - -# Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E, -# where Xi also exists, must be transliterated as "ax-e" to prevent -# the round trip conversion to A Xi E. - - '-' < $latinMedialEnd b {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd c {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd d {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd g {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd h {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd j {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd k {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd m {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd n {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd p {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd s {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd t {} $IEUNG $jamoMedial; - -# Double finals followed by IEUNG. Similar to the single finals -# followed by IEUNG. Any latin consonant pair X Y, between medials, -# that we would split by Latin-Jamo, we must handle when it occurs as -# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi -# E. - - '-' < $latinMedialEnd b s {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd g g {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd g s {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd l b {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd l g {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd l h {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd l m {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd l p {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd l s {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd l t {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd n g {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd n h {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd n j {} $IEUNG $jamoMedial; - '-' < $latinMedialEnd s s {} $IEUNG $jamoMedial; - -# Split doubles. Text of the form A Xi Xf E, where XXi also occurs, -# we transliterate as "ax-xe" to prevent round trip transliteration as -# A XXi E. - - '-' < $latinMedialEnd b {} $Bi $jamoMedial; - '-' < $latinMedialEnd d {} $Di $jamoMedial; - '-' < $latinMedialEnd j {} $Ji $jamoMedial; - '-' < $latinMedialEnd g {} $Gi $jamoMedial; - '-' < $latinMedialEnd s {} $Si $jamoMedial; - -# XYY. This corresponds to the XYY rule in Latin-Jamo. By default -# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result, -# "xyy" forms that correspond to XYf Yi must be transliterated as -# "xy-y". - - '-' < $latinMedialEnd b s {} [$Si $SSi]; - '-' < $latinMedialEnd g s {} [$Si $SSi]; - '-' < $latinMedialEnd l b {} [$Bi $BB]; - '-' < $latinMedialEnd l g {} [$Gi $GGi]; - '-' < $latinMedialEnd l s {} [$Si $SSi]; - '-' < $latinMedialEnd n g {} [$Gi $GGi]; - '-' < $latinMedialEnd n j {} [$Ji $JJ]; - -# Deletion of IEUNG is handled below. - -#---------------------------------------------------------------------- -# Latin-Jamo - -# [Basic, context-free Jamo-Latin rules are embedded here too. See -# above.] - -# Split digraphs: Text of the form 'axye', where 'xy' is a final -# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and -# 'e' are medials, we want to transliterate this as A Xf Yi E rather -# than A XYf IEUNG E. We do NOT include text of the form "axxe", -# since that is handled differently below. These rules are generated -# programmatically from the jamo data. - - $jamoMedial {b s} $latinMedial > $Bf $Si; - $jamoMedial {g s} $latinMedial > $Gf $Si; - $jamoMedial {l b} $latinMedial > $L $Bi; - $jamoMedial {l g} $latinMedial > $L $Gi; - $jamoMedial {l h} $latinMedial > $L $Hi; - $jamoMedial {l m} $latinMedial > $L $Mi; - $jamoMedial {l p} $latinMedial > $L $Pi; - $jamoMedial {l s} $latinMedial > $L $Si; - $jamoMedial {l t} $latinMedial > $L $Ti; - $jamoMedial {n g} $latinMedial > $Nf $Gi; - $jamoMedial {n h} $latinMedial > $Nf $Hi; - $jamoMedial {n j} $latinMedial > $Nf $Ji; - -# Single consonants are initials: Text of the form 'axe', where 'x' -# can be an initial or a final, and 'a' and 'e' are medials, we want -# to transliterate as A Xi E rather than A Xf IEUNG E. - - $jamoMedial {b} $latinMedial > $Bi; - $jamoMedial {c} $latinMedial > $Ci; - $jamoMedial {d} $latinMedial > $Di; - $jamoMedial {g} $latinMedial > $Gi; - $jamoMedial {h} $latinMedial > $Hi; - $jamoMedial {j} $latinMedial > $Ji; - $jamoMedial {k} $latinMedial > $Ki; - $jamoMedial {m} $latinMedial > $Mi; - $jamoMedial {n} $latinMedial > $Ni; - $jamoMedial {p} $latinMedial > $Pi; - $jamoMedial {s} $latinMedial > $Si; - $jamoMedial {t} $latinMedial > $Ti; - -# Doubled initials. The sequence "axxe", where XX exists as an initial -# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want -# to transliterate as A XXi E, rather than split to A Xf Xi E. - - $jamoMedial {b b} $latinMedial > $BB; - $jamoMedial {d d} $latinMedial > $DD; - $jamoMedial {j j} $latinMedial > $JJ; - $jamoMedial {g g} $latinMedial > $GGi; - $jamoMedial {s s} $latinMedial > $SSi; - -# XYY. Because doubled consonants bind more strongly than XY -# consonants, we must handle the sequence "axyy" specially. Here XYf -# and YYi must exist. In these cases, we map to Xf YYi rather than -# XYf. - - $jamoMedial {b} s s > $Bf; - $jamoMedial {g} s s > $Gf; - $jamoMedial {l} b b > $L; - $jamoMedial {l} g g > $L; - $jamoMedial {l} s s > $L; - $jamoMedial {n} g g > $Nf; - $jamoMedial {n} j j > $Nf; - -# Finals: Attach consonant with preceding medial to preceding medial. -# Do this BEFORE mapping consonants to initials. Longer keys must -# precede shorter keys that they start with, e.g., the rule for 'bs' -# must precede 'b'. - -# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this -# block for Jamo-Latin.] - - $jamoMedial {bs} <> $BS; - $jamoMedial {b} <> $Bf; - $jamoMedial {c} <> $Cf; - $jamoMedial {d} <> $Df; - $jamoMedial {gg} <> $GGf; - $jamoMedial {gs} <> $GS; - $jamoMedial {g} <> $Gf; - $jamoMedial {h} <> $Hf; - $jamoMedial {j} <> $Jf; - $jamoMedial {k} <> $Kf; - $jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG; - $jamoMedial {lh} <> $LH; - $jamoMedial {lm} <> $LM; - $jamoMedial {lp} <> $LP; - $jamoMedial {ls} <> $LS; - $jamoMedial {lt} <> $LT; - $jamoMedial {l} <> $L; - $jamoMedial {m} <> $Mf; - $jamoMedial {ng} <> $NG; - $jamoMedial {nh} <> $NH; - $jamoMedial {nj} <> $NJ; - $jamoMedial {n} <> $Nf; - $jamoMedial {p} <> $Pf; - $jamoMedial {ss} <> $SSf; - $jamoMedial {s} <> $Sf; - $jamoMedial {t} <> $Tf; - -# Initials: Attach single consonant to following medial. Do this -# AFTER mapping finals. Longer keys must precede shorter keys that -# they start with, e.g., the rule for 'gg' must precede 'g'. - -# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within -# this block for Jamo-Latin.] - - {gg} $latinMedial <> $GGi; - {g} $latinMedial <> $Gi; - {n} $latinMedial <> $Ni; - {dd} $latinMedial <> $DD; - {d} $latinMedial <> $Di; - {r} $latinMedial <> $R; - {m} $latinMedial <> $Mi; - {bb} $latinMedial <> $BB; - {b} $latinMedial <> $Bi; - {ss} $latinMedial <> $SSi; - {s} $latinMedial <> $Si; - {jj} $latinMedial <> $JJ; - {j} $latinMedial <> $Ji; - {c} $latinMedial <> $Ci; - {k} $latinMedial <> $Ki; - {t} $latinMedial <> $Ti; - {p} $latinMedial <> $Pi; - {h} $latinMedial <> $Hi; - -# 'r' in final position. Because of the equivalency of the 'l' and -# 'r' jamo (the glyphs are the same), we try to provide the same -# equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled -# below. If we see an 'r' in an apparent final position, treat it -# like 'l'. For example, "karka" => Ki A R EU Ki A without this rule. -# Instead, we want Ki A L Ki A. - - $jamoMedial {r} $latinInitial > | l; - -# Initial + Final: If we match the next rule, we have initial then -# final consonant with no intervening medial. We insert the null -# vowel BEFORE it to create a well-formed syllable. (In the next rule -# we insert a null vowel AFTER an anomalous initial.) - - $jamoInitial {} [bcdghjklmnpst] > $EU; - -# Initial + X: This block matches an initial consonant not followed by -# a medial. We insert the null vowel after it. We handle double -# initials explicitly here; for single initial consonants we insert EU -# (as Latin) after them and let standard rules do the rest. - -# BREAKS ROUND TRIP INTEGRITY - - gg > $GGi $EU; - dd > $DD $EU; - bb > $BB $EU; - ss > $SSi $EU; - jj > $JJ $EU; - - ([bcdghjkmnprst]) > | $1 eu; - -# X + Final: Finally we have to deal with a consonant that can only be -# interpreted as a final (not an initial) and which is preceded -# neither by an initial nor a medial. It is the start of the -# syllable, but cannot be. Most of these will already be handled by -# the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng' -# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'. -# For this isolated case, we could add a null initial and medial, -# which would give "la" => IEUNG EU L IEUNG A, for example. A more -# economical solution is to transliterate isolated "l" (that is, -# initial "l") to "r". (Other similar conversions of consonants that -# occur neither as initials nor as finals are handled below.) - - l > | r; - -# Medials. If a medial is preceded by an initial, then we proceed -# normally. As usual, longer keys must precede shorter ones. - -# [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within -# this block for Jamo-Latin.] - - $jamoInitial {ae} <> $AE; - $jamoInitial {a} <> $A; - $jamoInitial {eo} <> $EO; - $jamoInitial {eu} <> $EU; - $jamoInitial {e} <> $E; - $jamoInitial {i} <> $I; - $jamoInitial {oe} <> $OE; - $jamoInitial {o} <> $O; - $jamoInitial {u} <> $U; - $jamoInitial {wae} <> $WAE; - $jamoInitial {wa} <> $WA; - $jamoInitial {weo} <> $WEO; - $jamoInitial {we} <> $WE; - $jamoInitial {wi} <> $WI; - $jamoInitial {yae} <> $YAE; - $jamoInitial {ya} <> $YA; - $jamoInitial {yeo} <> $YEO; - $jamoInitial {ye} <> $YE; - $jamoInitial {yi} <> $YI; - $jamoInitial {yo} <> $YO; - $jamoInitial {yu} <> $YU; - -# We may see an anomalous isolated 'w' or 'y'. In that case, we -# interpret it as 'wi' and 'yu', respectively. - -# BREAKS ROUND TRIP INTEGRITY - - $jamoInitial {w} > | wi; - $jamoInitial {y} > | yu; - -# Otherwise, insert a null consonant IEUNG before the medial (which is -# still an untransliterated latin vowel). - - ($latinMedial) > $IEUNG | $1; - -# Convert non-jamo latin consonants to equivalents. These occur as -# neither initials nor finals in jamo. 'l' occurs as a final, but not -# an initial; it is handled above. The following letters (left hand -# side) will never be output by Jamo-Latin. - - f > | p; - q > | k; - v > | b; - x > | ks; - z > | s; - -# Delete hyphens (Latin-Jamo). - - '-' > ; - -# Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels, -# since these may also occur in text. - - < $IEUNG; - -# eof