diff --git a/icu4c/source/data/translit/Any_Accents.txt b/icu4c/source/data/translit/Any_Accents.txt index 7a63c9cde35..283f434267d 100644 --- a/icu4c/source/data/translit/Any_Accents.txt +++ b/icu4c/source/data/translit/Any_Accents.txt @@ -1,15 +1,20 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Any_Accents.txt # Generated from CLDR # + :: NFD (NFC) ; +# to do: make reversible +# define special conversion characters. +# variants of this could use different characters, or set one or the other to null. $pre = \← ; $post = \→ ; +# Provide keyboard equivalents for common diacritics used in transliteration $pre \` $post ↔ \u0300 ; # COMBINING GRAVE ACCENT $pre \' $post ↔ \u0301 ; # COMBINING ACUTE ACCENT $pre \^ $post ↔ \u0302 ; # COMBINING CIRCUMFLEX ACCENT @@ -20,6 +25,7 @@ $pre \* $post ↔ \u030A ; # COMBINING RING ABOVE $pre \, $post ↔ \u0327 ; # COMBINING CEDILLA $pre '/' $post ↔ \u0338 ; # COMBINING LONG SOLIDUS OVERLAY $pre \. $post ↔ \u0323 ; # COMBINING DOT BELOW +# Combine common characters $pre AE $post ↔ Æ ; # LATIN CAPITAL LETTER AE $pre ae $post ↔ æ ; # LATIN SMALL LETTER AE $pre D $post ↔ Ð ; # LATIN CAPITAL LETTER ETH @@ -47,7 +53,227 @@ $pre O $post ↔ Ɔ ; # LATIN CAPITAL LETTER OPEN O $pre o $post ↔ ɔ ; # LATIN SMALL LETTER OPEN O $pre E $post ↔ Ɛ ; # LATIN CAPITAL LETTER OPEN E $pre e $post ↔ ɛ ; # LATIN SMALL LETTER OPEN E +# three that don't have uppercases $pre '?' 
$post ↔ ʔ ; # LATIN LETTER GLOTTAL STOP $pre i $post ↔ ɪ ; # LATIN LETTER SMALL CAPITAL I $pre v $post ↔ ʌ ; # LATIN SMALL LETTER TURNED V +# Additional Characters that may be added in the future +# $pre XXX $post ↔ \u0306 ; # COMBINING BREVE +# $pre XXX $post ↔ \u0307 ; # COMBINING DOT ABOVE +# $pre XXX $post ↔ \u0309 ; # COMBINING HOOK ABOVE +# $pre XXX $post ↔ \u030B ; # COMBINING DOUBLE ACUTE ACCENT +# $pre XXX $post ↔ \u030C ; # COMBINING CARON +# $pre XXX $post ↔ \u030F ; # COMBINING DOUBLE GRAVE ACCENT +# $pre XXX $post ↔ \u0311 ; # COMBINING INVERTED BREVE +# $pre XXX $post ↔ \u0313 ; # COMBINING COMMA ABOVE +# $pre XXX $post ↔ \u0314 ; # COMBINING REVERSED COMMA ABOVE +# $pre XXX $post ↔ \u031B ; # COMBINING HORN +# $pre XXX $post ↔ \u0324 ; # COMBINING DIAERESIS BELOW +# $pre XXX $post ↔ \u0325 ; # COMBINING RING BELOW +# $pre XXX $post ↔ \u0326 ; # COMBINING COMMA BELOW +# $pre XXX $post ↔ \u0328 ; # COMBINING OGONEK +# $pre XXX $post ↔ \u032D ; # COMBINING CIRCUMFLEX ACCENT BELOW +# $pre XXX $post ↔ \u032E ; # COMBINING BREVE BELOW +# $pre XXX $post ↔ \u0330 ; # COMBINING TILDE BELOW +# $pre XXX $post ↔ \u0331 ; # COMBINING MACRON BELOW +# $pre YYY $post ↔ ª ; # FEMININE ORDINAL INDICATOR +# $pre YYY $post ↔ º ; # MASCULINE ORDINAL INDICATOR +# $pre YYY $post ↔ Đ ; # LATIN CAPITAL LETTER D WITH STROKE +# $pre YYY $post ↔ đ ; # LATIN SMALL LETTER D WITH STROKE +# $pre YYY $post ↔ Ħ ; # LATIN CAPITAL LETTER H WITH STROKE +# $pre YYY $post ↔ ħ ; # LATIN SMALL LETTER H WITH STROKE +# $pre YYY $post ↔ ı ; # LATIN SMALL LETTER DOTLESS I +# $pre YYY $post ↔ ĸ ; # LATIN SMALL LETTER KRA +# $pre YYY $post ↔ Ŀ ; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +# $pre YYY $post ↔ ŀ ; # LATIN SMALL LETTER L WITH MIDDLE DOT +# $pre YYY $post ↔ Ł ; # LATIN CAPITAL LETTER L WITH STROKE +# $pre YYY $post ↔ ł ; # LATIN SMALL LETTER L WITH STROKE +# $pre YYY $post ↔ ʼn ; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +# $pre YYY $post ↔ Ŧ ; # LATIN CAPITAL LETTER T WITH STROKE 
+# $pre YYY $post ↔ ŧ ; # LATIN SMALL LETTER T WITH STROKE +# $pre YYY $post ↔ ſ ; # LATIN SMALL LETTER LONG S +# $pre YYY $post ↔ ƀ ; # LATIN SMALL LETTER B WITH STROKE +# $pre YYY $post ↔ Ɓ ; # LATIN CAPITAL LETTER B WITH HOOK +# $pre YYY $post ↔ Ƃ ; # LATIN CAPITAL LETTER B WITH TOPBAR +# $pre YYY $post ↔ ƃ ; # LATIN SMALL LETTER B WITH TOPBAR +# $pre YYY $post ↔ Ƅ ; # LATIN CAPITAL LETTER TONE SIX +# $pre YYY $post ↔ ƅ ; # LATIN SMALL LETTER TONE SIX +# $pre YYY $post ↔ Ƈ ; # LATIN CAPITAL LETTER C WITH HOOK +# $pre YYY $post ↔ ƈ ; # LATIN SMALL LETTER C WITH HOOK +# $pre YYY $post ↔ Ɖ ; # LATIN CAPITAL LETTER AFRICAN D +# $pre YYY $post ↔ Ɗ ; # LATIN CAPITAL LETTER D WITH HOOK +# $pre YYY $post ↔ Ƌ ; # LATIN CAPITAL LETTER D WITH TOPBAR +# $pre YYY $post ↔ ƌ ; # LATIN SMALL LETTER D WITH TOPBAR +# $pre YYY $post ↔ ƍ ; # LATIN SMALL LETTER TURNED DELTA +# $pre YYY $post ↔ Ǝ ; # LATIN CAPITAL LETTER REVERSED E +# $pre YYY $post ↔ Ƒ ; # LATIN CAPITAL LETTER F WITH HOOK +# $pre YYY $post ↔ ƒ ; # LATIN SMALL LETTER F WITH HOOK +# $pre YYY $post ↔ Ɠ ; # LATIN CAPITAL LETTER G WITH HOOK +# $pre YYY $post ↔ Ɣ ; # LATIN CAPITAL LETTER GAMMA +# $pre YYY $post ↔ ƕ ; # LATIN SMALL LETTER HV +# $pre YYY $post ↔ Ɩ ; # LATIN CAPITAL LETTER IOTA +# $pre YYY $post ↔ Ɨ ; # LATIN CAPITAL LETTER I WITH STROKE +# $pre YYY $post ↔ Ƙ ; # LATIN CAPITAL LETTER K WITH HOOK +# $pre YYY $post ↔ ƙ ; # LATIN SMALL LETTER K WITH HOOK +# $pre YYY $post ↔ ƚ ; # LATIN SMALL LETTER L WITH BAR +# $pre YYY $post ↔ ƛ ; # LATIN SMALL LETTER LAMBDA WITH STROKE +# $pre YYY $post ↔ Ɯ ; # LATIN CAPITAL LETTER TURNED M +# $pre YYY $post ↔ Ɲ ; # LATIN CAPITAL LETTER N WITH LEFT HOOK +# $pre YYY $post ↔ ƞ ; # LATIN SMALL LETTER N WITH LONG RIGHT LEG +# $pre YYY $post ↔ Ɵ ; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE +# $pre YYY $post ↔ Ƣ ; # LATIN CAPITAL LETTER OI +# $pre YYY $post ↔ ƣ ; # LATIN SMALL LETTER OI +# $pre YYY $post ↔ Ƥ ; # LATIN CAPITAL LETTER P WITH HOOK +# $pre YYY $post ↔ ƥ ; # LATIN 
SMALL LETTER P WITH HOOK +# $pre YYY $post ↔ Ʀ ; # LATIN LETTER YR +# $pre YYY $post ↔ Ƨ ; # LATIN CAPITAL LETTER TONE TWO +# $pre YYY $post ↔ ƨ ; # LATIN SMALL LETTER TONE TWO +# $pre YYY $post ↔ ƪ ; # LATIN LETTER REVERSED ESH LOOP +# $pre YYY $post ↔ ƫ ; # LATIN SMALL LETTER T WITH PALATAL HOOK +# $pre YYY $post ↔ Ƭ ; # LATIN CAPITAL LETTER T WITH HOOK +# $pre YYY $post ↔ ƭ ; # LATIN SMALL LETTER T WITH HOOK +# $pre YYY $post ↔ Ʈ ; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +# $pre YYY $post ↔ Ʋ ; # LATIN CAPITAL LETTER V WITH HOOK +# $pre YYY $post ↔ Ƴ ; # LATIN CAPITAL LETTER Y WITH HOOK +# $pre YYY $post ↔ ƴ ; # LATIN SMALL LETTER Y WITH HOOK +# $pre YYY $post ↔ Ƶ ; # LATIN CAPITAL LETTER Z WITH STROKE +# $pre YYY $post ↔ ƶ ; # LATIN SMALL LETTER Z WITH STROKE +# $pre YYY $post ↔ Ƹ ; # LATIN CAPITAL LETTER EZH REVERSED +# $pre YYY $post ↔ ƹ ; # LATIN SMALL LETTER EZH REVERSED +# $pre YYY $post ↔ ƺ ; # LATIN SMALL LETTER EZH WITH TAIL +# $pre YYY $post ↔ ƻ ; # LATIN LETTER TWO WITH STROKE +# $pre YYY $post ↔ Ƽ ; # LATIN CAPITAL LETTER TONE FIVE +# $pre YYY $post ↔ ƽ ; # LATIN SMALL LETTER TONE FIVE +# $pre YYY $post ↔ ƾ ; # LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE +# $pre YYY $post ↔ ƿ ; # LATIN LETTER WYNN +# $pre YYY $post ↔ ǀ ; # LATIN LETTER DENTAL CLICK +# $pre YYY $post ↔ ǁ ; # LATIN LETTER LATERAL CLICK +# $pre YYY $post ↔ ǂ ; # LATIN LETTER ALVEOLAR CLICK +# $pre YYY $post ↔ ǃ ; # LATIN LETTER RETROFLEX CLICK +# $pre YYY $post ↔ DŽ ; # LATIN CAPITAL LETTER DZ WITH CARON +# $pre YYY $post ↔ Dž ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +# $pre YYY $post ↔ dž ; # LATIN SMALL LETTER DZ WITH CARON +# $pre YYY $post ↔ LJ ; # LATIN CAPITAL LETTER LJ +# $pre YYY $post ↔ Lj ; # LATIN CAPITAL LETTER L WITH SMALL LETTER J +# $pre YYY $post ↔ lj ; # LATIN SMALL LETTER LJ +# $pre YYY $post ↔ NJ ; # LATIN CAPITAL LETTER NJ +# $pre YYY $post ↔ Nj ; # LATIN CAPITAL LETTER N WITH SMALL LETTER J +# $pre YYY $post ↔ nj ; # LATIN SMALL LETTER NJ +# 
$pre YYY $post ↔ ǝ ; # LATIN SMALL LETTER TURNED E +# $pre YYY $post ↔ Ǥ ; # LATIN CAPITAL LETTER G WITH STROKE +# $pre YYY $post ↔ ǥ ; # LATIN SMALL LETTER G WITH STROKE +# $pre YYY $post ↔ DZ ; # LATIN CAPITAL LETTER DZ +# $pre YYY $post ↔ Dz ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +# $pre YYY $post ↔ dz ; # LATIN SMALL LETTER DZ +# $pre YYY $post ↔ Ƕ ; # LATIN CAPITAL LETTER HWAIR +# $pre YYY $post ↔ Ƿ ; # LATIN CAPITAL LETTER WYNN +# $pre YYY $post ↔ Ȝ ; # LATIN CAPITAL LETTER YOGH +# $pre YYY $post ↔ ȝ ; # LATIN SMALL LETTER YOGH +# $pre YYY $post ↔ Ȣ ; # LATIN CAPITAL LETTER OU +# $pre YYY $post ↔ ȣ ; # LATIN SMALL LETTER OU +# $pre YYY $post ↔ Ȥ ; # LATIN CAPITAL LETTER Z WITH HOOK +# $pre YYY $post ↔ ȥ ; # LATIN SMALL LETTER Z WITH HOOK +# $pre YYY $post ↔ ɐ ; # LATIN SMALL LETTER TURNED A +# $pre YYY $post ↔ ɑ ; # LATIN SMALL LETTER ALPHA +# $pre YYY $post ↔ ɒ ; # LATIN SMALL LETTER TURNED ALPHA +# $pre YYY $post ↔ ɓ ; # LATIN SMALL LETTER B WITH HOOK +# $pre YYY $post ↔ ɕ ; # LATIN SMALL LETTER C WITH CURL +# $pre YYY $post ↔ ɖ ; # LATIN SMALL LETTER D WITH TAIL +# $pre YYY $post ↔ ɗ ; # LATIN SMALL LETTER D WITH HOOK +# $pre YYY $post ↔ ɘ ; # LATIN SMALL LETTER REVERSED E +# $pre YYY $post ↔ ɚ ; # LATIN SMALL LETTER SCHWA WITH HOOK +# $pre YYY $post ↔ ɜ ; # LATIN SMALL LETTER REVERSED OPEN E +# $pre YYY $post ↔ ɝ ; # LATIN SMALL LETTER REVERSED OPEN E WITH HOOK +# $pre YYY $post ↔ ɞ ; # LATIN SMALL LETTER CLOSED REVERSED OPEN E +# $pre YYY $post ↔ ɟ ; # LATIN SMALL LETTER DOTLESS J WITH STROKE +# $pre YYY $post ↔ ɠ ; # LATIN SMALL LETTER G WITH HOOK +# $pre YYY $post ↔ ɡ ; # LATIN SMALL LETTER SCRIPT G +# $pre YYY $post ↔ ɢ ; # LATIN LETTER SMALL CAPITAL G +# $pre YYY $post ↔ ɣ ; # LATIN SMALL LETTER GAMMA +# $pre YYY $post ↔ ɤ ; # LATIN SMALL LETTER RAMS HORN +# $pre YYY $post ↔ ɥ ; # LATIN SMALL LETTER TURNED H +# $pre YYY $post ↔ ɦ ; # LATIN SMALL LETTER H WITH HOOK +# $pre YYY $post ↔ ɧ ; # LATIN SMALL LETTER HENG WITH HOOK +# $pre YYY $post ↔ 
ɨ ; # LATIN SMALL LETTER I WITH STROKE +# $pre YYY $post ↔ ɩ ; # LATIN SMALL LETTER IOTA +# $pre YYY $post ↔ ɫ ; # LATIN SMALL LETTER L WITH MIDDLE TILDE +# $pre YYY $post ↔ ɬ ; # LATIN SMALL LETTER L WITH BELT +# $pre YYY $post ↔ ɭ ; # LATIN SMALL LETTER L WITH RETROFLEX HOOK +# $pre YYY $post ↔ ɮ ; # LATIN SMALL LETTER LEZH +# $pre YYY $post ↔ ɯ ; # LATIN SMALL LETTER TURNED M +# $pre YYY $post ↔ ɰ ; # LATIN SMALL LETTER TURNED M WITH LONG LEG +# $pre YYY $post ↔ ɱ ; # LATIN SMALL LETTER M WITH HOOK +# $pre YYY $post ↔ ɲ ; # LATIN SMALL LETTER N WITH LEFT HOOK +# $pre YYY $post ↔ ɳ ; # LATIN SMALL LETTER N WITH RETROFLEX HOOK +# $pre YYY $post ↔ ɴ ; # LATIN LETTER SMALL CAPITAL N +# $pre YYY $post ↔ ɵ ; # LATIN SMALL LETTER BARRED O +# $pre YYY $post ↔ ɶ ; # LATIN LETTER SMALL CAPITAL OE +# $pre YYY $post ↔ ɷ ; # LATIN SMALL LETTER CLOSED OMEGA +# $pre YYY $post ↔ ɸ ; # LATIN SMALL LETTER PHI +# $pre YYY $post ↔ ɹ ; # LATIN SMALL LETTER TURNED R +# $pre YYY $post ↔ ɺ ; # LATIN SMALL LETTER TURNED R WITH LONG LEG +# $pre YYY $post ↔ ɻ ; # LATIN SMALL LETTER TURNED R WITH HOOK +# $pre YYY $post ↔ ɼ ; # LATIN SMALL LETTER R WITH LONG LEG +# $pre YYY $post ↔ ɽ ; # LATIN SMALL LETTER R WITH TAIL +# $pre YYY $post ↔ ɾ ; # LATIN SMALL LETTER R WITH FISHHOOK +# $pre YYY $post ↔ ɿ ; # LATIN SMALL LETTER REVERSED R WITH FISHHOOK +# $pre YYY $post ↔ ʀ ; # LATIN LETTER SMALL CAPITAL R +# $pre YYY $post ↔ ʁ ; # LATIN LETTER SMALL CAPITAL INVERTED R +# $pre YYY $post ↔ ʂ ; # LATIN SMALL LETTER S WITH HOOK +# $pre YYY $post ↔ ʄ ; # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK +# $pre YYY $post ↔ ʅ ; # LATIN SMALL LETTER SQUAT REVERSED ESH +# $pre YYY $post ↔ ʆ ; # LATIN SMALL LETTER ESH WITH CURL +# $pre YYY $post ↔ ʇ ; # LATIN SMALL LETTER TURNED T +# $pre YYY $post ↔ ʈ ; # LATIN SMALL LETTER T WITH RETROFLEX HOOK +# $pre YYY $post ↔ ʉ ; # LATIN SMALL LETTER U BAR +# $pre YYY $post ↔ ʋ ; # LATIN SMALL LETTER V WITH HOOK +# $pre YYY $post ↔ ʍ ; # LATIN SMALL LETTER TURNED 
W +# $pre YYY $post ↔ ʎ ; # LATIN SMALL LETTER TURNED Y +# $pre YYY $post ↔ ʏ ; # LATIN LETTER SMALL CAPITAL Y +# $pre YYY $post ↔ ʐ ; # LATIN SMALL LETTER Z WITH RETROFLEX HOOK +# $pre YYY $post ↔ ʑ ; # LATIN SMALL LETTER Z WITH CURL +# $pre YYY $post ↔ ʓ ; # LATIN SMALL LETTER EZH WITH CURL +# $pre YYY $post ↔ ʔ ; # LATIN LETTER GLOTTAL STOP +# $pre YYY $post ↔ ʕ ; # LATIN LETTER PHARYNGEAL VOICED FRICATIVE +# $pre YYY $post ↔ ʖ ; # LATIN LETTER INVERTED GLOTTAL STOP +# $pre YYY $post ↔ ʗ ; # LATIN LETTER STRETCHED C +# $pre YYY $post ↔ ʘ ; # LATIN LETTER BILABIAL CLICK +# $pre YYY $post ↔ ʙ ; # LATIN LETTER SMALL CAPITAL B +# $pre YYY $post ↔ ʚ ; # LATIN SMALL LETTER CLOSED OPEN E +# $pre YYY $post ↔ ʛ ; # LATIN LETTER SMALL CAPITAL G WITH HOOK +# $pre YYY $post ↔ ʜ ; # LATIN LETTER SMALL CAPITAL H +# $pre YYY $post ↔ ʝ ; # LATIN SMALL LETTER J WITH CROSSED-TAIL +# $pre YYY $post ↔ ʞ ; # LATIN SMALL LETTER TURNED K +# $pre YYY $post ↔ ʟ ; # LATIN LETTER SMALL CAPITAL L +# $pre YYY $post ↔ ʠ ; # LATIN SMALL LETTER Q WITH HOOK +# $pre YYY $post ↔ ʡ ; # LATIN LETTER GLOTTAL STOP WITH STROKE +# $pre YYY $post ↔ ʢ ; # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE +# $pre YYY $post ↔ ʣ ; # LATIN SMALL LETTER DZ DIGRAPH +# $pre YYY $post ↔ ʤ ; # LATIN SMALL LETTER DEZH DIGRAPH +# $pre YYY $post ↔ ʥ ; # LATIN SMALL LETTER DZ DIGRAPH WITH CURL +# $pre YYY $post ↔ ʦ ; # LATIN SMALL LETTER TS DIGRAPH +# $pre YYY $post ↔ ʧ ; # LATIN SMALL LETTER TESH DIGRAPH +# $pre YYY $post ↔ ʨ ; # LATIN SMALL LETTER TC DIGRAPH WITH CURL +# $pre YYY $post ↔ ʩ ; # LATIN SMALL LETTER FENG DIGRAPH +# $pre YYY $post ↔ ʪ ; # LATIN SMALL LETTER LS DIGRAPH +# $pre YYY $post ↔ ʫ ; # LATIN SMALL LETTER LZ DIGRAPH +# $pre YYY $post ↔ ʬ ; # LATIN LETTER BILABIAL PERCUSSIVE +# $pre YYY $post ↔ ʭ ; # LATIN LETTER BIDENTAL PERCUSSIVE +# $pre YYY $post ↔ ʰ ; # MODIFIER LETTER SMALL H +# $pre YYY $post ↔ ʱ ; # MODIFIER LETTER SMALL H WITH HOOK +# $pre YYY $post ↔ ʲ ; # MODIFIER LETTER SMALL J +# $pre 
YYY $post ↔ ʳ ; # MODIFIER LETTER SMALL R +# $pre YYY $post ↔ ʴ ; # MODIFIER LETTER SMALL TURNED R +# $pre YYY $post ↔ ʵ ; # MODIFIER LETTER SMALL TURNED R WITH HOOK +# $pre YYY $post ↔ ʶ ; # MODIFIER LETTER SMALL CAPITAL INVERTED R +# $pre YYY $post ↔ ʷ ; # MODIFIER LETTER SMALL W +# $pre YYY $post ↔ ʸ ; # MODIFIER LETTER SMALL Y +# $pre YYY $post ↔ ˠ ; # MODIFIER LETTER SMALL GAMMA +# $pre YYY $post ↔ ˡ ; # MODIFIER LETTER SMALL L +# $pre YYY $post ↔ ˢ ; # MODIFIER LETTER SMALL S +# $pre YYY $post ↔ ˣ ; # MODIFIER LETTER SMALL X +# $pre YYY $post ↔ ˤ ; # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +# $pre YYY $post ↔ ẚ ; # LATIN SMALL LETTER A WITH RIGHT HALF RING +# $pre YYY $post ↔ ⁿ ; # SUPERSCRIPT LATIN SMALL LETTER N :: NFC (NFD) ; + diff --git a/icu4c/source/data/translit/Any_Publishing.txt b/icu4c/source/data/translit/Any_Publishing.txt index c3f4a566325..1c65a773ad5 100644 --- a/icu4c/source/data/translit/Any_Publishing.txt +++ b/icu4c/source/data/translit/Any_Publishing.txt @@ -1,23 +1,31 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Any_Publishing.txt # Generated from CLDR # + +# Test case +# "The" "(quick)" ('brown') `fox' ` jumped -- "over?" 
+# Variables $single = \' ; $space = ' ' ; $double = \" ; $back = \` ; $tab = \u0008 ; $makeRight = [[:Z:][:Ps:][:Pi:]$] ; +# fix UNIX quotes $back $back → “ ; $back → ‘ ; +# fix typewriter quotes, by context $makeRight {$double} ↔ “ ; $double ↔ ” ; $makeRight {$single} ↔ ‘ ; $single ↔ ’; +# fix multiple spaces and hyphens $space {$space} → ; '--' ↔ — ; + diff --git a/icu4c/source/data/translit/Arabic_Latin.txt b/icu4c/source/data/translit/Arab_Latn.txt similarity index 73% rename from icu4c/source/data/translit/Arabic_Latin.txt rename to icu4c/source/data/translit/Arab_Latn.txt index ef0484f34af..fd6066dda06 100644 --- a/icu4c/source/data/translit/Arabic_Latin.txt +++ b/icu4c/source/data/translit/Arab_Latn.txt @@ -1,23 +1,38 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Arabic_Latin.txt +# File: Arab_Latn.txt # Generated from CLDR # + +# Generally follows UNGEGN +# http://www.eki.ee/wgrs/rom1_ar.pdf +# Occasionally deviates in the direction of ISO 233 +# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf +# a) where required for disambiguation. +# b) with underdot instead of cedilla for letters like SAD, +# since those are explicitly in Unicode for transliteration. +# c) with extra non-Arabic-language letters, like PEH +# +# Does *not* do assimilation of "al", nor hyphenation. +# While it could be done, we need to determine whether a prefix "al" could +# occur other than as the definite article (since no space is used). 
:: [[:Arabic:][:block=ARABIC:][‎ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ; :: NFKD (NFC); $disambig = \u0331 ; $disambig2 = \u0330 ; $under = \u0323 ; $descender = ˌ; -$notAbove = [[:^ccc=0:]&[:^ccc=230:]]; +$notAbove = [[:^ccc=0:] & [:^ccc=230:]]; +# non-letters [:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR [:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR ٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR ٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR +# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate ، ↔ ',' ; # ARABIC COMMA ؛ ↔ ';' ; # ARABIC SEMICOLON ؟ ↔ '?' ; # ARABIC QUESTION MARK @@ -42,9 +57,12 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; ٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN ٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT ٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE +# letters +# long vowels \u064Eا↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF \u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW \u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH +# longer items moved here to prevent masking ث ↔ t h $disambig ; # ARABIC LETTER THEH ذ ↔ d h $disambig ; # ARABIC LETTER THAL ش ↔ s h $disambig ; # ARABIC LETTER SHEEN @@ -53,13 +71,19 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; ط ↔ t $under ; # ARABIC LETTER TAH ظ ↔ z $under ; # ARABIC LETTER ZAH غ ↔ g h $disambig ; # ARABIC LETTER GHAIN +# WARNING: special case +# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→ +# so on the return, we have to skip over (but preserve) the half-ring below (or others like it) +# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA +# non-Arabic language ژ ↔ z h $disambig ; # ARABIC LETTER JEH ڭ ↔ n $disambig g ; # ARABIC LETTER NG ۋ ↔ v $disambig ; # ARABIC LETTER VE ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH ښ ↔ s $descender; +# Arabic language ء ↔ ʾ ; # ARABIC LETTER HAMZA ا 
↔ a $under; # ARABIC LETTER ALEF ب ↔ b ; # ARABIC LETTER BEH @@ -92,13 +116,18 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; \u0650 ↔ i ; # ARABIC KASRA \u0651 ↔ \u0303 ; # ARABIC SHADDA \u0652 ↔ \u030A ; # ARABIC SUKUN +# special combining marks \u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE \u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE \u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW +# Some non-Arabic language (not in UNGEGN) پ ↔ p ; # ARABIC LETTER PEH چ ↔ c h $disambig ; # ARABIC LETTER TCHEH ڤ ↔ v ; # ARABIC LETTER VEH +# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW +# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW گ ↔ g ; # ARABIC LETTER GAF +# fallbacks | s ← c } [eiy]; | k ← c ; | i ← e ; @@ -108,3 +137,4 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; :: (lower) ; ::NFC (NFD); :: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] ); + diff --git a/icu4c/source/data/translit/Bengali_Devanagari.txt b/icu4c/source/data/translit/Beng_Deva.txt similarity index 82% rename from icu4c/source/data/translit/Bengali_Devanagari.txt rename to icu4c/source/data/translit/Beng_Deva.txt index a430e44e5ca..e1cdd74dd19 100644 --- a/icu4c/source/data/translit/Bengali_Devanagari.txt +++ b/icu4c/source/data/translit/Beng_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Bengali_Devanagari.txt +# File: Beng_Deva.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Gujarati.txt b/icu4c/source/data/translit/Beng_Gujr.txt similarity index 82% rename from icu4c/source/data/translit/Bengali_Gujarati.txt rename to icu4c/source/data/translit/Beng_Gujr.txt index c68bc550133..66f998f0d7c 100644 --- a/icu4c/source/data/translit/Bengali_Gujarati.txt +++ b/icu4c/source/data/translit/Beng_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Bengali_Gujarati.txt +# File: Beng_Gujr.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Gurmukhi.txt b/icu4c/source/data/translit/Beng_Guru.txt similarity index 82% rename from icu4c/source/data/translit/Bengali_Gurmukhi.txt rename to icu4c/source/data/translit/Beng_Guru.txt index 88de1d25a83..551cc69f5f2 100644 --- a/icu4c/source/data/translit/Bengali_Gurmukhi.txt +++ b/icu4c/source/data/translit/Beng_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Bengali_Gurmukhi.txt +# File: Beng_Guru.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Kannada.txt b/icu4c/source/data/translit/Beng_Knda.txt similarity index 82% rename from icu4c/source/data/translit/Bengali_Kannada.txt rename to icu4c/source/data/translit/Beng_Knda.txt index 4c234b5623c..f0798719c1a 100644 --- a/icu4c/source/data/translit/Bengali_Kannada.txt +++ b/icu4c/source/data/translit/Beng_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Bengali_Kannada.txt +# File: Beng_Knda.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Latin.txt b/icu4c/source/data/translit/Beng_Latn.txt similarity index 83% rename from icu4c/source/data/translit/Bengali_Latin.txt rename to icu4c/source/data/translit/Beng_Latn.txt index 89dd911eab7..59ee77c06d1 100644 --- a/icu4c/source/data/translit/Bengali_Latin.txt +++ b/icu4c/source/data/translit/Beng_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Bengali_Latin.txt +# File: Beng_Latn.txt # Generated from CLDR # + ::[[:script=bengali:][।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Malayalam.txt b/icu4c/source/data/translit/Beng_Mlym.txt similarity index 82% rename from icu4c/source/data/translit/Bengali_Malayalam.txt rename to icu4c/source/data/translit/Beng_Mlym.txt index 27adb341ee2..8a141b770a0 100644 --- a/icu4c/source/data/translit/Bengali_Malayalam.txt +++ b/icu4c/source/data/translit/Beng_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Bengali_Malayalam.txt +# File: Beng_Mlym.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Oriya.txt b/icu4c/source/data/translit/Beng_Orya.txt similarity index 83% rename from icu4c/source/data/translit/Bengali_Oriya.txt rename to icu4c/source/data/translit/Beng_Orya.txt index a1f0d8c0389..a1cd578d6fe 100644 --- a/icu4c/source/data/translit/Bengali_Oriya.txt +++ b/icu4c/source/data/translit/Beng_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Bengali_Oriya.txt +# File: Beng_Orya.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Tamil.txt b/icu4c/source/data/translit/Beng_Taml.txt similarity index 83% rename from icu4c/source/data/translit/Bengali_Tamil.txt rename to icu4c/source/data/translit/Beng_Taml.txt index 6613544da32..4696034f0aa 100644 --- a/icu4c/source/data/translit/Bengali_Tamil.txt +++ b/icu4c/source/data/translit/Beng_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Bengali_Tamil.txt +# File: Beng_Taml.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_Telugu.txt b/icu4c/source/data/translit/Beng_Telu.txt similarity index 83% rename from icu4c/source/data/translit/Bengali_Telugu.txt rename to icu4c/source/data/translit/Beng_Telu.txt index 50d7025be0a..dc8fde6660f 100644 --- a/icu4c/source/data/translit/Bengali_Telugu.txt +++ b/icu4c/source/data/translit/Beng_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Bengali_Telugu.txt +# File: Beng_Telu.txt # Generated from CLDR # + ::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]; ::NFD; ::Bengali-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Bengali_InterIndic.txt b/icu4c/source/data/translit/Bengali_InterIndic.txt index ac4598faac8..2cff14eb52a 100644 --- a/icu4c/source/data/translit/Bengali_InterIndic.txt +++ b/icu4c/source/data/translit/Bengali_InterIndic.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Bengali_InterIndic.txt # Generated from CLDR # + +# Bengali-InterIndic ো→\uE04B; # VOWEL SIGN O ৌ→\uE04C; # VOWEL SIGN AU \u0981→\uE001; # SIGN CANDRABINDU @@ -69,9 +71,11 @@ ৈ→\uE048; # VOWEL SIGN AI ো→\uE04B; ৌ→\uE04C; +# \u09CD→\uE04D; # SIGN VIRAMA ৎ→\uE083; # Khanda-ta ৗ→\uE057; # AU LENGTH MARK +# ৠ→\uE060; # LETTER VOCALIC RR ৡ→\uE061; # LETTER VOCALIC LL \u09E2→\uE062; # VOWEL SIGN VOCALIC L @@ -99,3 +103,6 @@ ৺→\uE07B; # ISSHAR ।→\uE064; # DANDA ॥→\uE065; # DOUBLE DANDA +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/Bulgarian_Latin_BGN.txt b/icu4c/source/data/translit/Bulgarian_Latin_BGN.txt deleted file mode 100644 index c40362bf20a..00000000000 --- a/icu4c/source/data/translit/Bulgarian_Latin_BGN.txt +++ /dev/null @@ -1,100 +0,0 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
-# * -# *************************************************************************** -# File: Bulgarian_Latin_BGN.txt -# Generated from CLDR -# -:: [АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯѪѢабвгдежзийклмнопрстуфхцчшщъьюяѫѣ] ; -:: NFD (NFC) ; -$upperConsonants = [БВГДЖЗЙКЛМНПРСТФХЦЧШЩЬ] ; -$lowerConsonants = [бвгджзйклмнпрстфхцчшщь] ; -$consonants = [$upperConsonants $lowerConsonants] ; -$upperVowels = [АЕИОУЪЮЯѪѢ] ; -$lowerVowels = [аеиоуъюяѫѣ] ; -$vowels = [$upperVowels $lowerVowels] ; -$lower = [$lowerConsonants $lowerVowels] ; -$bulgarian = [ $lower $upperConsonants $upperVowels ] ; -$wordBoundary = [^[:L:][:M:][:N:]] ; -А → A ; # CYRILLIC CAPITAL LETTER A -а → a ; # CYRILLIC SMALL LETTER A -Б → B ; # CYRILLIC CAPITAL LETTER BE -б → b ; # CYRILLIC SMALL LETTER BE -В → V ; # CYRILLIC CAPITAL LETTER VE -в → v ; # CYRILLIC SMALL LETTER VE -Г → G ; # CYRILLIC CAPITAL LETTER GHE -г → g ; # CYRILLIC SMALL LETTER GHE -Д → D ; # CYRILLIC CAPITAL LETTER DE -д → d ; # CYRILLIC SMALL LETTER DE -Е → E ; # CYRILLIC CAPITAL LETTER DE -е → e ; # CYRILLIC SMALL LETTER DE -Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE -Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE -ж → zh ; # CYRILLIC SMALL LETTER ZHE -З → Z ; # CYRILLIC CAPITAL LETTER ZE -з → z ; # CYRILLIC SMALL LETTER ZE -И → I ; # CYRILLIC CAPITAL LETTER I -и → i ; # CYRILLIC SMALL LETTER I -Й → Y ; # CYRILLIC CAPITAL LETTER I -й → y ; # CYRILLIC SMALL LETTER I -К → K ; # CYRILLIC CAPITAL LETTER KA -к → k ; # CYRILLIC SMALL LETTER KA -Л → L ; # CYRILLIC CAPITAL LETTER EL -л → l ; # CYRILLIC SMALL LETTER EL -М → M ; # CYRILLIC CAPITAL LETTER EM -м → m ; # CYRILLIC SMALL LETTER EM -Н → N ; # CYRILLIC CAPITAL LETTER EN -н → n ; # CYRILLIC SMALL LETTER EN -О → O ; # CYRILLIC CAPITAL LETTER O -о → o ; # CYRILLIC SMALL LETTER O -П → P ; # CYRILLIC CAPITAL LETTER PE -п → p ; # CYRILLIC SMALL LETTER PE -Р → R ; # CYRILLIC CAPITAL LETTER ER -р → r ; # CYRILLIC SMALL LETTER ER -С → S ; # CYRILLIC CAPITAL LETTER ES -с → s ; # CYRILLIC SMALL LETTER ES 
-ТС → T·S ; # CYRILLIC CAPITAL LETTER TE -Тс → T·s ; # CYRILLIC CAPITAL LETTER TE -тс → t·s ; # CYRILLIC SMALL LETTER TE -Т → T ; # CYRILLIC CAPITAL LETTER TE -т → t ; # CYRILLIC SMALL LETTER TE -У → U ; # CYRILLIC CAPITAL LETTER U -у → u ; # CYRILLIC SMALL LETTER U -Ф → F ; # CYRILLIC CAPITAL LETTER EF -ф → f ; # CYRILLIC SMALL LETTER EF -Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA -Х → KH ; # CYRILLIC CAPITAL LETTER HA -х → kh ; # CYRILLIC SMALL LETTER HA -Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE -Ц → TS ; # CYRILLIC CAPITAL LETTER TSE -ц → ts ; # CYRILLIC SMALL LETTER TSE -Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE -Ч → CH ; # CYRILLIC CAPITAL LETTER CHE -ч → ch ; # CYRILLIC SMALL LETTER CHE -ШТ → SH·T ; # CYRILLIC CAPITAL LETTER SHA -Шт → Sh·t ; # CYRILLIC CAPITAL LETTER SHA -шт → sh·t ; # CYRILLIC SMALL LETTER SHA -Ш} $lower → Sh ; # CYRILLIC CAPITAL LETTER SHA -Ш → SH ; # CYRILLIC CAPITAL LETTER SHA -ш → sh ; # CYRILLIC SMALL LETTER SHA -Щ} $lower → Sht ; # CYRILLIC CAPITAL LETTER SHCHA -Щ → SHT ; # CYRILLIC CAPITAL LETTER SHCHA -щ → sht ; # CYRILLIC SMALL LETTER SHCHA -Ъ → Ŭ ; # CYRILLIC CAPITAL LETTER HARD SIGN -ъ → ŭ ; # CYRILLIC SMALL LETTER HARD SIGN -$bulgarian { [Ъъ] } $wordBoundary > ; -Ь → ’ ; # CYRILLIC CAPITAL LETTER SOFT SIGN -ь → ’ ; # CYRILLIC SMALL LETTER SOFT SIGN -Ю} $lower → Yu ; # CYRILLIC CAPITAL LETTER YU -Ю → YU ; # CYRILLIC CAPITAL LETTER YU -ю → yu ; # CYRILLIC SMALL LETTER YU -Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA -Я → YA ; # CYRILLIC CAPITAL LETTER YA -я → ya ; # CYRILLIC SMALL LETTER YA -Ѫ → Ŭ ; # CYRILLIC CAPITAL LETTER BIG YUS -ѫ → ŭ ; # CYRILLIC SMALL LETTER BIG YUS -Ѣ} $lower → Ye ; # CYRILLIC CAPITAL LETTER YAT -Ѣ → YE ; # CYRILLIC CAPITAL LETTER YAT -ѣ → ye ; # CYRILLIC SMALL LETTER YAT diff --git a/icu4c/source/data/translit/Cyrillic_Latin.txt b/icu4c/source/data/translit/Cyrillic_Latin.txt deleted file mode 100644 index e82bfa90ade..00000000000 --- a/icu4c/source/data/translit/Cyrillic_Latin.txt +++ 
/dev/null @@ -1,129 +0,0 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. -# * -# *************************************************************************** -# File: Cyrillic_Latin.txt -# Generated from CLDR -# -:: [Ққ\u0308Ă-ăĔ-ĕĞ-ğĬ-ĭŎ-ŏŬ-ŭ\u0306Ѐ-џҐ-ҕҘ-ҙӁ-ӂӐ-ӟӢ-ӧӬ-ӵӸ-ӹḜ-ḝẮ-ặᾰᾸῐῘῠῨ] ; -:: NFD (NFC) ; -$modprime = ʹ; -$modprime2 = ʺ; -$grave = \u0300; -$acute = \u0301; -$hat = \u0302; -$breve = \u0306 ; -$dot = \u0307 ; -$caron = \u030C ; -$comma = \u0326 ; -$under = \u0331 ; -$descender = ˌ; -я ↔ a $hat ; # CYRILLIC SMALL LETTER YA -Я ↔ A $hat ; # CYRILLIC CAPITAL LETTER YA -ч ↔ c $caron ; # CYRILLIC SMALL LETTER CHE -Ч ↔ C $caron; # CYRILLIC CAPITAL LETTER CHE -э ↔ e $acute; # CYRILLIC SMALL LETTER E -Э ↔ E $acute; # CYRILLIC CAPITAL LETTER E -є ↔ e $hat; # CYRILLIC SMALL LETTER UKRAINIAN IE -Є ↔ E $hat; # CYRILLIC CAPITAL LETTER UKRAINIAN IE -ш ↔ s $caron ; # CYRILLIC SMALL LETTER SHA -Ш ↔ S $caron ; # CYRILLIC CAPITAL LETTER SHA -щ ↔ s $hat ; # CYRILLIC SMALL LETTER SHCHA -Щ ↔ S $hat; # CYRILLIC CAPITAL LETTER SHCHA -ѕ ↔ z $hat ; # CYRILLIC SMALL LETTER DZE -Ѕ ↔ Z $hat; # CYRILLIC CAPITAL LETTER DZE -ю ↔ u $hat ; # CYRILLIC SMALL LETTER YU -Ю ↔ U $hat ; # CYRILLIC CAPITAL LETTER YU -і ↔ i $acute; # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I -І ↔ I $acute; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I -ј ↔ j $caron; # CYRILLIC SMALL LETTER JE -Ј ↔ J $caron; # CYRILLIC CAPITAL LETTER JE -љ ↔ l $hat ; # CYRILLIC SMALL LETTER LJE -Љ ↔ L $hat ; # CYRILLIC CAPITAL LETTER LJE -њ ↔ n $hat ; # CYRILLIC SMALL LETTER NJE -Њ ↔ N $hat ; # CYRILLIC CAPITAL LETTER NJE -ћ ↔ c $acute ; # CYRILLIC SMALL LETTER TSHE -Ћ ↔ C $acute ; # CYRILLIC CAPITAL LETTER TSHE -џ ↔ d $hat ; # CYRILLIC SMALL LETTER DZHE -Џ ↔ D $hat ; # CYRILLIC CAPITAL LETTER DZHE -а ↔ a ; # CYRILLIC SMALL LETTER A -А ↔ A ; # CYRILLIC 
CAPITAL LETTER A -ә ↔ ə ; # CYRILLIC SMALL LETTER SCHWA -Ә ↔ Ə ; # CYRILLIC CAPITAL LETTER SCHWA -ӕ ↔ æ ; # CYRILLIC SMALL LIGATURE A IE -Ӕ ↔ Æ ; # CYRILLIC CAPITAL LIGATURE A IE -б ↔ b ; # CYRILLIC SMALL LETTER BE -Б ↔ B ; # CYRILLIC CAPITAL LETTER BE -в ↔ v ; # CYRILLIC SMALL LETTER VE -В ↔ V ; # CYRILLIC CAPITAL LETTER VE -ґ ↔ g $grave ; # CYRILLIC SMALL LETTER GHE WITH UPTURN -Ґ ↔ G $grave ; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN -ғ ↔ g $dot ; # CYRILLIC SMALL LETTER GHE WITH STROKE -Ғ ↔ G $dot; # CYRILLIC CAPITAL LETTER GHE WITH STROKE -ҕ ↔ g $breve; # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK -Ҕ ↔ G $breve; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK -г ↔ g ; # CYRILLIC SMALL LETTER GHE -Г ↔ G ; # CYRILLIC CAPITAL LETTER GHE -д ↔ d; # CYRILLIC SMALL LETTER DE -Д ↔ D; # CYRILLIC CAPITAL LETTER DE -ђ ↔ đ ; # CYRILLIC SMALL LETTER DJE -Ђ ↔ Đ ; # CYRILLIC CAPITAL LETTER DJE -ҙ ↔ z $comma ; # CYRILLIC SMALL LETTER ZE WITH DESCENDER -Ҙ ↔ Z $comma ; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER -е ↔ e ; # CYRILLIC SMALL LETTER IE -Е ↔ E; # CYRILLIC CAPITAL LETTER IE -ж ↔ z $caron; # CYRILLIC SMALL LETTER ZHE -Ж ↔ Z $caron; # CYRILLIC CAPITAL LETTER ZHE -з ↔ z ; # CYRILLIC SMALL LETTER ZE -З ↔ Z; # CYRILLIC CAPITAL LETTER ZE -и\u0306 ↔ j ; # CYRILLIC SMALL LETTER I -И\u0306 ↔ J ; # CYRILLIC CAPITAL LETTER I -и ↔ i ; # CYRILLIC SMALL LETTER I -И ↔ I ; # CYRILLIC CAPITAL LETTER I -қ ↔ k $descender ; # CYRILLIC SMALL LETTER KA WITH DESCENDER -Қ ↔ K $descender ; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER -к ↔ k ; # CYRILLIC SMALL LETTER KA -К ↔ K; # CYRILLIC CAPITAL LETTER KA -л ↔ l ; # CYRILLIC SMALL LETTER EL -Л ↔ L; # CYRILLIC CAPITAL LETTER EL -м ↔ m ; # CYRILLIC SMALL LETTER EM -М ↔ M ; # CYRILLIC CAPITAL LETTER EM -н ↔ n ; # CYRILLIC SMALL LETTER EN -Н ↔ N; # CYRILLIC CAPITAL LETTER EN -о ↔ o ; # CYRILLIC SMALL LETTER O -О ↔ O ; # CYRILLIC CAPITAL LETTER O -п ↔ p ; # CYRILLIC SMALL LETTER PE -П ↔ P ; # CYRILLIC CAPITAL LETTER PE -р ↔ r ; # CYRILLIC SMALL 
LETTER ER -Р ↔ R ; # CYRILLIC CAPITAL LETTER ER -с ↔ s ; # CYRILLIC SMALL LETTER ES -С ↔ S ; # CYRILLIC CAPITAL LETTER ES -т ↔ t ; # CYRILLIC SMALL LETTER TE -Т ↔ T ; # CYRILLIC CAPITAL LETTER TE -у ↔ u ; # CYRILLIC SMALL LETTER U -У ↔ U ; # CYRILLIC CAPITAL LETTER U -ф ↔ f ; # CYRILLIC SMALL LETTER EF -Ф ↔ F ; # CYRILLIC CAPITAL LETTER EF -х ↔ h ; # CYRILLIC SMALL LETTER HA -Х ↔ H; # CYRILLIC CAPITAL LETTER HA -ц ↔ c ; # CYRILLIC SMALL LETTER TSE -Ц ↔ C; # CYRILLIC CAPITAL LETTER TSE -Ъ ↔ $modprime2 $under ; # CYRILLIC CAPITAL LETTER HARD SIGN -ъ ↔ $modprime2 ; # CYRILLIC SMALL LETTER HARD SIGN -Ь ↔ $modprime $under ; # CYRILLIC CAPITAL LETTER SOFT SIGN -ь ↔ $modprime ; # CYRILLIC SMALL LETTER SOFT SIGN -ы ↔ y ; # CYRILLIC SMALL LETTER YERU -Ы ↔ Y ; # CYRILLIC CAPITAL LETTER YERU -$ignore = [[:Mark:]''] * ; -| k ← q ; -| K ← Q ; -| u ← w ; -| U ← W ; -| KS ← X } $ignore [:UppercaseLetter:] ; -| KS ← [:UppercaseLetter:] $ignore { X ; -| Ks ← X ; -| ks ← x ; -:: NFC (NFD) ; -:: ( [ˌ\u0308A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ\u0300-\u0302\u0306-\u0307\u030C\u0326\u0331\u0340-\u0341\u0344ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ; diff --git a/icu4c/source/data/translit/Cyrl_Latn.txt b/icu4c/source/data/translit/Cyrl_Latn.txt new file mode 100644 index 00000000000..09d01244993 --- /dev/null +++ b/icu4c/source/data/translit/Cyrl_Latn.txt @@ -0,0 +1,279 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: Cyrl_Latn.txt +# Generated from CLDR +# + +# TODO: add remaining characters +# Should add variants for Russian-English, Russian-German +# Those can use this as a base, and then remap cases +# like a $hat to ya or ja. +# :: [\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:]] ; +### WARNING, \u0308 must be added to the generated filters, in both directions ### +# MINIMAL FILTER +:: [Ққ\u0308Ă-ăĔ-ĕĞ-ğĬ-ĭŎ-ŏŬ-ŭ\u0306Ѐ-џҐ-ҕҘ-ҙӁ-ӂӐ-ӟӢ-ӧӬ-ӵӸ-ӹḜ-ḝẮ-ặᾰᾸῐῘῠῨ] ; +:: NFD (NFC) ; +$modprime = ʹ; +$modprime2 = ʺ; +$grave = \u0300; +$acute = \u0301; +$hat = \u0302; +$breve = \u0306 ; +$dot = \u0307 ; +$caron = \u030C ; +$comma = \u0326 ; +$under = \u0331 ; +$descender = ˌ; +# move up so not masked +я ↔ a $hat ; # CYRILLIC SMALL LETTER YA +Я ↔ A $hat ; # CYRILLIC CAPITAL LETTER YA +ч ↔ c $caron ; # CYRILLIC SMALL LETTER CHE +Ч ↔ C $caron; # CYRILLIC CAPITAL LETTER CHE +# ҷ ↔ XXX ; # CYRILLIC SMALL LETTER CHE WITH DESCENDER +# Ҷ ↔ XXX ; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +# ӌ ↔ XXX ; # CYRILLIC SMALL LETTER KHAKASSIAN CHE +# Ӌ ↔ XXX ; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +# ҹ ↔ XXX ; # CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE +# Ҹ ↔ XXX ; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +э ↔ e $acute; # CYRILLIC SMALL LETTER E +Э ↔ E $acute; # CYRILLIC CAPITAL LETTER E +є ↔ e $hat; # CYRILLIC SMALL LETTER UKRAINIAN IE +Є ↔ E $hat; # CYRILLIC CAPITAL LETTER UKRAINIAN IE +ш ↔ s $caron ; # CYRILLIC SMALL LETTER SHA +Ш ↔ S $caron ; # CYRILLIC CAPITAL LETTER SHA +щ ↔ s $hat ; # CYRILLIC SMALL LETTER SHCHA +Щ ↔ S $hat; # CYRILLIC CAPITAL LETTER SHCHA +ѕ ↔ z $hat ; # CYRILLIC SMALL LETTER DZE +Ѕ ↔ Z $hat; # CYRILLIC CAPITAL LETTER DZE +# ӡ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN DZE +# Ӡ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +ю ↔ u $hat ; # CYRILLIC SMALL LETTER YU +Ю ↔ U $hat ; # CYRILLIC CAPITAL LETTER YU +і ↔ i $acute; # CYRILLIC SMALL LETTER 
BYELORUSSIAN-UKRAINIAN I +І ↔ I $acute; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +ј ↔ j $caron; # CYRILLIC SMALL LETTER JE +Ј ↔ J $caron; # CYRILLIC CAPITAL LETTER JE +љ ↔ l $hat ; # CYRILLIC SMALL LETTER LJE +Љ ↔ L $hat ; # CYRILLIC CAPITAL LETTER LJE +њ ↔ n $hat ; # CYRILLIC SMALL LETTER NJE +Њ ↔ N $hat ; # CYRILLIC CAPITAL LETTER NJE +ћ ↔ c $acute ; # CYRILLIC SMALL LETTER TSHE +Ћ ↔ C $acute ; # CYRILLIC CAPITAL LETTER TSHE +џ ↔ d $hat ; # CYRILLIC SMALL LETTER DZHE +Џ ↔ D $hat ; # CYRILLIC CAPITAL LETTER DZHE +# Normal order +а ↔ a ; # CYRILLIC SMALL LETTER A +А ↔ A ; # CYRILLIC CAPITAL LETTER A +ә ↔ ə ; # CYRILLIC SMALL LETTER SCHWA +Ә ↔ Ə ; # CYRILLIC CAPITAL LETTER SCHWA +ӕ ↔ æ ; # CYRILLIC SMALL LIGATURE A IE +Ӕ ↔ Æ ; # CYRILLIC CAPITAL LIGATURE A IE +б ↔ b ; # CYRILLIC SMALL LETTER BE +Б ↔ B ; # CYRILLIC CAPITAL LETTER BE +в ↔ v ; # CYRILLIC SMALL LETTER VE +В ↔ V ; # CYRILLIC CAPITAL LETTER VE +ґ ↔ g $grave ; # CYRILLIC SMALL LETTER GHE WITH UPTURN +Ґ ↔ G $grave ; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +ғ ↔ g $dot ; # CYRILLIC SMALL LETTER GHE WITH STROKE +Ғ ↔ G $dot; # CYRILLIC CAPITAL LETTER GHE WITH STROKE +ҕ ↔ g $breve; # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK +Ҕ ↔ G $breve; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +г ↔ g ; # CYRILLIC SMALL LETTER GHE +Г ↔ G ; # CYRILLIC CAPITAL LETTER GHE +д ↔ d; # CYRILLIC SMALL LETTER DE +Д ↔ D; # CYRILLIC CAPITAL LETTER DE +ђ ↔ đ ; # CYRILLIC SMALL LETTER DJE +Ђ ↔ Đ ; # CYRILLIC CAPITAL LETTER DJE +ҙ ↔ z $comma ; # CYRILLIC SMALL LETTER ZE WITH DESCENDER +Ҙ ↔ Z $comma ; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +е ↔ e ; # CYRILLIC SMALL LETTER IE +Е ↔ E; # CYRILLIC CAPITAL LETTER IE +ж ↔ z $caron; # CYRILLIC SMALL LETTER ZHE +Ж ↔ Z $caron; # CYRILLIC CAPITAL LETTER ZHE +# җ ↔ XXX ; # CYRILLIC SMALL LETTER ZHE WITH DESCENDER +# Җ ↔ XXX ; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +з ↔ z ; # CYRILLIC SMALL LETTER ZE +З ↔ Z; # CYRILLIC CAPITAL LETTER ZE +и\u0306 ↔ j ; # CYRILLIC SMALL 
LETTER I +И\u0306 ↔ J ; # CYRILLIC CAPITAL LETTER I +и ↔ i ; # CYRILLIC SMALL LETTER I +И ↔ I ; # CYRILLIC CAPITAL LETTER I +қ ↔ k $descender ; # CYRILLIC SMALL LETTER KA WITH DESCENDER +Қ ↔ K $descender ; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +к ↔ k ; # CYRILLIC SMALL LETTER KA +К ↔ K; # CYRILLIC CAPITAL LETTER KA +# ӄ ↔ XXX ; # CYRILLIC SMALL LETTER KA WITH HOOK +# Ӄ ↔ XXX ; # CYRILLIC CAPITAL LETTER KA WITH HOOK +# ҡ ↔ XXX ; # CYRILLIC SMALL LETTER BASHKIR KA +# Ҡ ↔ XXX ; # CYRILLIC CAPITAL LETTER BASHKIR KA +# ҟ ↔ XXX ; # CYRILLIC SMALL LETTER KA WITH STROKE +# Ҟ ↔ XXX ; # CYRILLIC CAPITAL LETTER KA WITH STROKE +# ҝ ↔ XXX ; # CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE +# Ҝ ↔ XXX ; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +л ↔ l ; # CYRILLIC SMALL LETTER EL +Л ↔ L; # CYRILLIC CAPITAL LETTER EL +м ↔ m ; # CYRILLIC SMALL LETTER EM +М ↔ M ; # CYRILLIC CAPITAL LETTER EM +н ↔ n ; # CYRILLIC SMALL LETTER EN +Н ↔ N; # CYRILLIC CAPITAL LETTER EN +# ң ↔ XXX ; # CYRILLIC SMALL LETTER EN WITH DESCENDER +# Ң ↔ XXX ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER +# ӈ ↔ XXX ; # CYRILLIC SMALL LETTER EN WITH HOOK +# Ӈ ↔ XXX ; # CYRILLIC CAPITAL LETTER EN WITH HOOK +# ҥ ↔ XXX ; # CYRILLIC SMALL LIGATURE EN GHE +# Ҥ ↔ XXX ; # CYRILLIC CAPITAL LIGATURE EN GHE +о ↔ o ; # CYRILLIC SMALL LETTER O +О ↔ O ; # CYRILLIC CAPITAL LETTER O +# ө ↔ XXX ; # CYRILLIC SMALL LETTER BARRED O +# Ө ↔ XXX ; # CYRILLIC CAPITAL LETTER BARRED O +п ↔ p ; # CYRILLIC SMALL LETTER PE +П ↔ P ; # CYRILLIC CAPITAL LETTER PE +# ҧ ↔ XXX ; # CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK +# Ҧ ↔ XXX ; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +# ҁ ↔ XXX ; # CYRILLIC SMALL LETTER KOPPA +# Ҁ ↔ XXX ; # CYRILLIC CAPITAL LETTER KOPPA +р ↔ r ; # CYRILLIC SMALL LETTER ER +Р ↔ R ; # CYRILLIC CAPITAL LETTER ER +# ҏ ↔ XXX ; # CYRILLIC SMALL LETTER ER WITH TICK +# Ҏ ↔ XXX ; # CYRILLIC CAPITAL LETTER ER WITH TICK +с ↔ s ; # CYRILLIC SMALL LETTER ES +С ↔ S ; # CYRILLIC CAPITAL LETTER ES +# ҫ ↔ XXX ; # CYRILLIC 
SMALL LETTER ES WITH DESCENDER +# Ҫ ↔ XXX ; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +т ↔ t ; # CYRILLIC SMALL LETTER TE +Т ↔ T ; # CYRILLIC CAPITAL LETTER TE +# ҭ ↔ XXX ; # CYRILLIC SMALL LETTER TE WITH DESCENDER +# Ҭ ↔ XXX ; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER +у ↔ u ; # CYRILLIC SMALL LETTER U +У ↔ U ; # CYRILLIC CAPITAL LETTER U +# ү ↔ XXX ; # CYRILLIC SMALL LETTER STRAIGHT U +# Ү ↔ XXX ; # CYRILLIC CAPITAL LETTER STRAIGHT U +# ұ ↔ XXX ; # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE +# Ұ ↔ XXX ; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +# ѹ ↔ XXX ; # CYRILLIC SMALL LETTER UK +# Ѹ ↔ XXX ; # CYRILLIC CAPITAL LETTER UK +ф ↔ f ; # CYRILLIC SMALL LETTER EF +Ф ↔ F ; # CYRILLIC CAPITAL LETTER EF +х ↔ h ; # CYRILLIC SMALL LETTER HA +Х ↔ H; # CYRILLIC CAPITAL LETTER HA +# ҳ ↔ XXX ; # CYRILLIC SMALL LETTER HA WITH DESCENDER +# Ҳ ↔ XXX ; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +# һ ↔ XXX ; # CYRILLIC SMALL LETTER SHHA +# Һ ↔ XXX ; # CYRILLIC CAPITAL LETTER SHHA +# ѡ ↔ XXX ; # CYRILLIC SMALL LETTER OMEGA +# Ѡ ↔ XXX ; # CYRILLIC CAPITAL LETTER OMEGA +# ѿ ↔ XXX ; # CYRILLIC SMALL LETTER OT +# Ѿ ↔ XXX ; # CYRILLIC CAPITAL LETTER OT +# ѽ ↔ XXX ; # CYRILLIC SMALL LETTER OMEGA WITH TITLO +# Ѽ ↔ XXX ; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +# ѻ ↔ XXX ; # CYRILLIC SMALL LETTER ROUND OMEGA +# Ѻ ↔ XXX ; # CYRILLIC CAPITAL LETTER ROUND OMEGA +ц ↔ c ; # CYRILLIC SMALL LETTER TSE +Ц ↔ C; # CYRILLIC CAPITAL LETTER TSE +# ҵ ↔ XXX ; # CYRILLIC SMALL LIGATURE TE TSE +# Ҵ ↔ XXX ; # CYRILLIC CAPITAL LIGATURE TE TSE +# ҽ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN CHE +# Ҽ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +# ҿ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER +# Ҿ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +Ъ ↔ $modprime2 $under ; # CYRILLIC CAPITAL LETTER HARD SIGN +ъ ↔ $modprime2 ; # CYRILLIC SMALL LETTER HARD SIGN +Ь ↔ $modprime $under ; # CYRILLIC CAPITAL LETTER SOFT SIGN +ь ↔ $modprime ; # CYRILLIC SMALL LETTER SOFT 
SIGN +ы ↔ y ; # CYRILLIC SMALL LETTER YERU +Ы ↔ Y ; # CYRILLIC CAPITAL LETTER YERU +# ҍ ↔ XXX ; # CYRILLIC SMALL LETTER SEMISOFT SIGN +# Ҍ ↔ XXX ; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +# ѣ ↔ XXX ; # CYRILLIC SMALL LETTER YAT +# Ѣ ↔ XXX ; # CYRILLIC CAPITAL LETTER YAT +# ѥ ↔ XXX ; # CYRILLIC SMALL LETTER IOTIFIED E +# Ѥ ↔ XXX ; # CYRILLIC CAPITAL LETTER IOTIFIED E +# ѧ ↔ XXX ; # CYRILLIC SMALL LETTER LITTLE YUS +# Ѧ ↔ XXX ; # CYRILLIC CAPITAL LETTER LITTLE YUS +# ѫ ↔ XXX ; # CYRILLIC SMALL LETTER BIG YUS +# Ѫ ↔ XXX ; # CYRILLIC CAPITAL LETTER BIG YUS +# ѩ ↔ XXX ; # CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS +# Ѩ ↔ XXX ; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +# ѭ ↔ XXX ; # CYRILLIC SMALL LETTER IOTIFIED BIG YUS +# Ѭ ↔ XXX ; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +# ѯ ↔ XXX ; # CYRILLIC SMALL LETTER KSI +# Ѯ ↔ XXX ; # CYRILLIC CAPITAL LETTER KSI +# ѱ ↔ XXX ; # CYRILLIC SMALL LETTER PSI +# Ѱ ↔ XXX ; # CYRILLIC CAPITAL LETTER PSI +# ѳ ↔ XXX ; # CYRILLIC SMALL LETTER FITA +# Ѳ ↔ XXX ; # CYRILLIC CAPITAL LETTER FITA +# ѵ ↔ XXX ; # CYRILLIC SMALL LETTER IZHITSA +# Ѵ ↔ XXX ; # CYRILLIC CAPITAL LETTER IZHITSA +# ҩ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN HA +# Ҩ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN HA +# Ӏ ↔ XXX ; # CYRILLIC LETTER PALOCHKA +### а\u0306 ↔ XXX ; # CYRILLIC SMALL LETTER A +### А\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER A +### а\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER A +### А\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER A +### ә\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER SCHWA +### Ә\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER SCHWA +### г\u0301 ↔ XXX ; # CYRILLIC SMALL LETTER GHE +### Г\u0301 ↔ XXX ; # CYRILLIC CAPITAL LETTER GHE +### е\u0300 ↔ XXX ; # CYRILLIC SMALL LETTER IE +### Е\u0300 ↔ XXX ; # CYRILLIC CAPITAL LETTER IE +### е\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER IE +### Е\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER IE +### е\u0306 ↔ XXX ; # CYRILLIC SMALL LETTER IE +### Е\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER IE +### ж\u0306 ↔ XXX ; # CYRILLIC 
SMALL LETTER ZHE +### Ж\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER ZHE +### ж\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER ZHE +### Ж\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER ZHE +### з\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER ZE +### З\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER ZE +### и\u0300 ↔ XXX ; # CYRILLIC SMALL LETTER I +### И\u0300 ↔ XXX ; # CYRILLIC CAPITAL LETTER I +### и\u0304 ↔ XXX ; # CYRILLIC SMALL LETTER I +### И\u0304 ↔ XXX ; # CYRILLIC CAPITAL LETTER I +### и\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER I +### И\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER I +### і\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +### І\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +### о\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER O +### О\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER O +### ө\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER BARRED O +### Ө\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER BARRED O +### к\u0301 ↔ XXX ; # CYRILLIC SMALL LETTER KA +### К\u0301 ↔ XXX ; # CYRILLIC CAPITAL LETTER KA +### у\u0304 ↔ XXX ; # CYRILLIC SMALL LETTER U +### У\u0304 ↔ XXX ; # CYRILLIC CAPITAL LETTER U +### у\u0306 ↔ XXX ; # CYRILLIC SMALL LETTER U +### У\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER U +### у\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER U +### У\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER U +### у\u030B ↔ XXX ; # CYRILLIC SMALL LETTER U +### У\u030B ↔ XXX ; # CYRILLIC CAPITAL LETTER U +### ч\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER CHE +### Ч\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER CHE +### ы\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER YERU +### Ы\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER YERU +### э\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER E +### Э\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER E +### ѵ\u030F ↔ XXX ; # CYRILLIC SMALL LETTER IZHITSA +### Ѵ\u030F ↔ XXX ; # CYRILLIC CAPITAL LETTER IZHITSA +# Completeness +$ignore = [[:Mark:]''] * ; +| k ← q ; +| K ← Q ; +| u ← w ; +| U ← W ; +| KS ← X } $ignore [:UppercaseLetter:] ; +| KS ← [:UppercaseLetter:] $ignore { X ; +| Ks ← X ; +| ks ← x ; +:: NFC (NFD) 
; +# note: a global filter is more efficient, but MUST include all source chars!! +# :: ([\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:]]); +# MINIMAL FILTER: Latin-Cyrillic +:: ( [ˌ\u0308A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ\u0300-\u0302\u0306-\u0307\u030C\u0326\u0331\u0340-\u0341\u0344ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ; + diff --git a/icu4c/source/data/translit/Devanagari_Bengali.txt b/icu4c/source/data/translit/Deva_Beng.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Bengali.txt rename to icu4c/source/data/translit/Deva_Beng.txt index 7e5e3d7cce2..e77233d1392 100644 --- a/icu4c/source/data/translit/Devanagari_Bengali.txt +++ b/icu4c/source/data/translit/Deva_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Devanagari_Bengali.txt +# File: Deva_Beng.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Gujarati.txt b/icu4c/source/data/translit/Deva_Gujr.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Gujarati.txt rename to icu4c/source/data/translit/Deva_Gujr.txt index 1619b8b4dfd..d0bc47d188f 100644 --- a/icu4c/source/data/translit/Devanagari_Gujarati.txt +++ b/icu4c/source/data/translit/Deva_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Devanagari_Gujarati.txt +# File: Deva_Gujr.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Gurmukhi.txt b/icu4c/source/data/translit/Deva_Guru.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Gurmukhi.txt rename to icu4c/source/data/translit/Deva_Guru.txt index 27764a51623..d5560b402d5 100644 --- a/icu4c/source/data/translit/Devanagari_Gurmukhi.txt +++ b/icu4c/source/data/translit/Deva_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Devanagari_Gurmukhi.txt +# File: Deva_Guru.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Kannada.txt b/icu4c/source/data/translit/Deva_Knda.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Kannada.txt rename to icu4c/source/data/translit/Deva_Knda.txt index ec1544ba6fe..519c37bbbed 100644 --- a/icu4c/source/data/translit/Devanagari_Kannada.txt +++ b/icu4c/source/data/translit/Deva_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Devanagari_Kannada.txt +# File: Deva_Knda.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Latin.txt b/icu4c/source/data/translit/Deva_Latn.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Latin.txt rename to icu4c/source/data/translit/Deva_Latn.txt index 7ba0a76463f..37777b5cb9b 100644 --- a/icu4c/source/data/translit/Devanagari_Latin.txt +++ b/icu4c/source/data/translit/Deva_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Devanagari_Latin.txt +# File: Deva_Latn.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Malayalam.txt b/icu4c/source/data/translit/Deva_Mlym.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Malayalam.txt rename to icu4c/source/data/translit/Deva_Mlym.txt index 3c87d63b59a..3e3efff0366 100644 --- a/icu4c/source/data/translit/Devanagari_Malayalam.txt +++ b/icu4c/source/data/translit/Deva_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Devanagari_Malayalam.txt +# File: Deva_Mlym.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Oriya.txt b/icu4c/source/data/translit/Deva_Orya.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Oriya.txt rename to icu4c/source/data/translit/Deva_Orya.txt index a92ed9f1ff7..57fa8dca45d 100644 --- a/icu4c/source/data/translit/Devanagari_Oriya.txt +++ b/icu4c/source/data/translit/Deva_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Devanagari_Oriya.txt +# File: Deva_Orya.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Tamil.txt b/icu4c/source/data/translit/Deva_Taml.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Tamil.txt rename to icu4c/source/data/translit/Deva_Taml.txt index 56017631e65..e86e4edf454 100644 --- a/icu4c/source/data/translit/Devanagari_Tamil.txt +++ b/icu4c/source/data/translit/Deva_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Devanagari_Tamil.txt +# File: Deva_Taml.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_Telugu.txt b/icu4c/source/data/translit/Deva_Telu.txt similarity index 80% rename from icu4c/source/data/translit/Devanagari_Telugu.txt rename to icu4c/source/data/translit/Deva_Telu.txt index d8ce1025105..bd1dae040a3 100644 --- a/icu4c/source/data/translit/Devanagari_Telugu.txt +++ b/icu4c/source/data/translit/Deva_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Devanagari_Telugu.txt +# File: Deva_Telu.txt # Generated from CLDR # + ::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ]; ::NFD; ::Devanagari-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Devanagari_InterIndic.txt b/icu4c/source/data/translit/Devanagari_InterIndic.txt index 0eea065c0f9..baccb1e5af1 100644 --- a/icu4c/source/data/translit/Devanagari_InterIndic.txt +++ b/icu4c/source/data/translit/Devanagari_InterIndic.txt @@ -1,12 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Devanagari_InterIndic.txt # Generated from CLDR # + +# Devanagari-InterIndic +# :: NFD; +#Rules for Decomposed characters \u0901→\uE001; # SIGN CANDRABINDU \u0902→\uE002; # SIGN ANUSVARA ः→\uE003; # SIGN VISARGA @@ -113,3 +117,5 @@ ९→\uE06F; # DIGIT NINE ॰→\uE070; # Devanagari-InterIndic: ABBREVIATION SIGN ॽ→\uE082; # Devanagari Glottal Stop +# :: NFC (NFD) ; + diff --git a/icu4c/source/data/translit/Fullwidth_Halfwidth.txt b/icu4c/source/data/translit/Fullwidth_Halfwidth.txt index cb0f929cc9c..16197818e42 100644 --- a/icu4c/source/data/translit/Fullwidth_Halfwidth.txt +++ b/icu4c/source/data/translit/Fullwidth_Halfwidth.txt @@ -1,12 +1,18 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Fullwidth_Halfwidth.txt # Generated from CLDR # + +# Fullwidth-Halfwidth +# Mechanically generated from Unicode Character Database +# IDEOGRAPHIC SPACE then added, and +# FULLWIDTH MACRON changed to map to MACRON, not SPACE + COMBINING MACRON +# multicharacter ガ↔ガ; # to KATAKANA LETTER GA ギ↔ギ; # to KATAKANA LETTER GI グ↔グ; # to KATAKANA LETTER GU @@ -35,6 +41,7 @@ ヴ↔ヴ; # to KATAKANA LETTER VU ヷ↔ヷ; # to KATAKANA LETTER VA ヺ↔ヺ; # to KATAKANA LETTER VO +# single character !↔'!'; # from FULLWIDTH EXCLAMATION MARK "↔'"'; # from FULLWIDTH QUOTATION MARK #↔'#'; # from FULLWIDTH NUMBER SIGN @@ -259,3 +266,5 @@ ↓↔↓; # to HALFWIDTH DOWNWARDS ARROW ■↔■; # to HALFWIDTH BLACK SQUARE ○↔○; # to HALFWIDTH WHITE CIRCLE +# eof + diff --git a/icu4c/source/data/translit/Georgian_Latin.txt b/icu4c/source/data/translit/Geor_Latn.txt similarity index 68% rename from icu4c/source/data/translit/Georgian_Latin.txt rename to icu4c/source/data/translit/Geor_Latn.txt index 07b439ace7d..44b47dd9cbc 100644 --- a/icu4c/source/data/translit/Georgian_Latin.txt +++ b/icu4c/source/data/translit/Geor_Latn.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Georgian_Latin.txt +# File: Geor_Latn.txt # Generated from CLDR # + +# long items moved up წ ↔ tsʼ ; კ ↔ kʼ ; პ ↔ pʼ ; @@ -21,6 +23,7 @@ ძ ↔ dz ; ხ ↔ kh ; ჳ ↔ ŭi ; +# normal order ა ↔ a ; ბ ↔ b ; გ ↔ g ; @@ -30,15 +33,30 @@ ზ ↔ z ; თ ↔ t ; ი ↔ i ; +#კ ↔ kʼ ; ლ ↔ l ; მ ↔ m ; ნ ↔ n ; ო ↔ o ; +#პ ↔ pʼ ; +#ჟ ↔ zh ; რ ↔ r ; ს ↔ s ; +#ტ ↔ tʼ ; უ ↔ u ; ფ ↔ p ; ქ ↔ k ; +#ღ ↔ gh ; +#ყ ↔ qʼ ; +#შ ↔ sh ; +#ჩ ↔ ch ; +#ც ↔ ts ; +#ძ ↔ dz ; +#წ ↔ tsʼ ; +#ჭ ↔ chʼ ; +#ხ ↔ kh ; ჯ ↔ j ; ჰ ↔ h ; +#ჳ ↔ ŭi ; ჴ ↔ q ; + diff --git a/icu4c/source/data/translit/Greek_Latin.txt b/icu4c/source/data/translit/Grek_Latn.txt similarity index 76% rename from icu4c/source/data/translit/Greek_Latin.txt rename to icu4c/source/data/translit/Grek_Latn.txt index 5118c6fe6e5..e528b379d05 100644 --- a/icu4c/source/data/translit/Greek_Latin.txt +++ b/icu4c/source/data/translit/Grek_Latn.txt @@ -1,18 +1,33 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Greek_Latin.txt +# File: Grek_Latn.txt # Generated from CLDR # + +# Rules are predicated on running NFD first, and NFC afterwards +# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ; +# MINIMAL FILTER GENERATED FOR: Greek-Latin :: [;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ\u0304\u0308\u0313-\u0314\u0342-\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ; :: NFD (NFC) ; +# TEST CASES +# Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος +# ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ +# ᾳ ῃ ῳ ὃ ὄ +# ὠς ὡς ὢς ὣς +# Ὠς Ὡς Ὢς Ὣς +# ὨΣ ὩΣ ὪΣ ὫΣ +# Ạ, ạ, Ẹ, ẹ, Ọ, ọ +# Useful variables $lower = [[:latin:][:greek:] & [:Ll:]]; $glower = [[:greek:] & [:Ll:]]; $upper = [[:latin:][:greek:] & [:Lu:]] ; $accent = [:M:] ; +# NOTE: restrict to just the Greek & Latin accents that we care about +# TODO: broaden out once interation is fixed $accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ; $macron = \u0304 ; $ddot = \u0308 ; @@ -37,18 +52,27 @@ $beforeLetter = [[:M:]\']* [:L:] ; $beforeLower = $accent * $lower ; $notLetter = [^[:L:][:M:]] ; $under = \u0331; +# Fix punctuation +# preserve original \: ↔ \: $under ; \? ↔ \? $under ; \; ↔ \? ; · ↔ \: ; +# CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve \u0342 ↔ \u0302 ; +# IOTA: convert iota subscript to iota +# first make previous alpha long! $accent_minus = [[$accent]-[$iotasub$macron]]; Α } $accent_minus * $iotasub → | Α $macron ; α } $accent_minus * $iotasub → | α $macron ; +# now convert to uppercase if after uppercase, ow to lowercase $upper $accent * { $iotasub → I ; $iotasub → i ; | $1 $iotasub ← ($evowel $macron $accentMinus *) i ; | $1 $iotasub ← ($evowel $macron $accentMinus *) I ; +# BREATHING +# Convert rough breathing to h, and move before letters. +# Make A ` x = → H a x Α ($macron?) 
$rough } $beforeLower → H | α $1; Ε $rough } $beforeLower → H | ε; Η $rough } $beforeLower → H | η ; @@ -56,6 +80,7 @@ $iotasub → i ; Ο $rough } $beforeLower → H | ο ; Υ $rough } $beforeLower → H | υ ; Ω ($ddot?) $rough } $beforeLower → H | ω $1; +# Make A x ` = → H a x Α ($glower $macron?) $rough → H | α $1 ; Ε ($glower) $rough → H | ε $1 ; Η ($glower) $rough → H | η $1 ; @@ -63,14 +88,18 @@ $iotasub → i ; Ο ($glower) $rough → H | ο $1 ; Υ ($glower) $rough → H | υ $1 ; Ω ($glower $ddot?) $rough → H | ω $1 ; +#Otherwise, make x ` into h x and X ` into H X ($lcgvowel + $ddotmac? ) $rough → h | $1 ; ($gvowel + $ddotmac? ) $rough → H | $1 ; +# Go backwards with H | $1 $rough ← h ($evowel $macron $ddot? $evowel2_i $macron?) ; | $1 $rough ← h ($evowel $ddot? $evowel2 $macron?) ; | $1 $rough ← h ($evowel $macron? $ddot?) ; | $1 $rough ← H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; | $1 $rough ← H ([AEIOUY] $ddot? $evowel2 $macron?) ; | $1 $rough ← H ([AEIOUY] $macron? $ddot?) ; +# titlecase, have to fix individually +# in the future, we should add &uppercase() to make this easier | A $1 $rough ← H a ($macron $ddot? $evowel2_i $macron?) ; | E $1 $rough ← H e ($macron $ddot? $evowel2_i $macron?) ; | I $1 $rough ← H i ($macron $ddot? $evowel2_i $macron?) ; @@ -89,10 +118,18 @@ $iotasub → i ; | O $1 $rough ← H o ($macron? $ddot? ) ; | U $1 $rough ← H u ($macron? $ddot? ) ; | Y $1 $rough ← H y ($macron? $ddot? ) ; +# Now do smooth +#delete smooth breathing for Latin $smooth → ; +# insert in Greek +# the assumption is that all Marks are on letters. | $1 $smooth ← $notLetter { ([rR]) } [^hH$smooth$rough] ; | $1 $smooth ← $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; | $1 $smooth ← $notLetter { ($evowel $macron?) 
} [^$evowel2$smooth$rough] ; +# TODO: preserve smooth/rough breathing if not +# on initial vowel sequence +# need to have these up here so the rules don't mask +# remove now superfluous macron when returning Α ← A $macron ; α ← a $macron ; η ↔ e $macron ; @@ -105,6 +142,7 @@ $smooth → ; ψ ↔ ps ; ω ↔ o $macron ; Ω ↔ O $macron; +# NORMAL α ↔ a ; Α ↔ A ; β ↔ b ; @@ -145,17 +183,24 @@ $smooth → ; Ρ $rough ↔ RH ; ρ ↔ r ; Ρ ↔ R ; +# insert separator before things that turn into s [Pp] { } [ςσΣϷϸϺϻ] → \' ; +# special S variants Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L +# underbar means exception +# before a letter, initial ς } $beforeLetter ↔ s $underbar } $beforeLetter; σ } $beforeLetter ↔ s } $beforeLetter; +# otherwise, after a letter = final $afterLetter { σ ↔ $afterLetter { s $underbar; $afterLetter { ς ↔ $afterLetter { s ; +# otherwise (isolated) = initial ς ↔ s $underbar; σ ↔ s ; +# [Pp] { Σ ↔ \'S ; Σ ↔ S ; τ ↔ t ; Τ ↔ T ; @@ -166,6 +211,7 @@ $vowel { Υ ↔ U ; χ ↔ ch ; Χ } $beforeLower ↔ Ch ; Χ ↔ CH ; +# Completeness for ASCII $ignore = [[:Mark:]''] * ; | k ← c ; | ph ← f ; @@ -187,6 +233,7 @@ $rough } $ignore [:UppercaseLetter:] → H ; $ignore [:UppercaseLetter:] { $rough → H ; $rough ← H ; $rough ↔ h ; +# Completeness for Greek ϐ → | β ; ϑ → | θ ; ϒ → | Υ ; @@ -201,7 +248,12 @@ $rough ↔ h ; ϵ → | ε ; µ → | μ ; ͺ → i; +# delete any trailing ' marks used for roundtripping ← [Ππ] { \' } [Ss] ; ← [Νν] { \' } $egammaLike ; ::NFC (NFD) ; +# ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ; +# ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ; +# MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD :: ( 
[':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0300-\u0337\u0339-\u0345΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ; + diff --git a/icu4c/source/data/translit/Greek_Latin_UNGEGN.txt b/icu4c/source/data/translit/Grek_Latn_UNGEGN.txt similarity index 72% rename from icu4c/source/data/translit/Greek_Latin_UNGEGN.txt rename to icu4c/source/data/translit/Grek_Latn_UNGEGN.txt index 6c8ae5247dc..0a5034ff5db 100644 --- a/icu4c/source/data/translit/Greek_Latin_UNGEGN.txt +++ b/icu4c/source/data/translit/Grek_Latn_UNGEGN.txt @@ -1,14 +1,21 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Greek_Latin_UNGEGN.txt +# File: Grek_Latn_UNGEGN.txt # Generated from CLDR # + +# For modern Greek, based on UNGEGN rules. +# Rules are predicated on running NFD first, and NFC afterwards +# MINIMAL FILTER GENERATED FOR: Greek-Latin/UNGEGN +# WARNING: need to add accents to both filters ### +# :: [\u0301\u0304\u0306\u0308;µ·ÀÂÈÊÌÎÒÔÙÛàâèêìîòôùûĈ-ĉĜ-ĝĤ-ĥĴ-ĵŜ-ŝŴ-ŷǛ-ǜǸ-ǹ\u0300\u0302\u0313-\u0314\u0340\u0342-\u0343\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϖϰ-ϵЀЍѐѝḔ-ḕṐ-ṑẀ-ẁẐ-ẑẤ-ậẰ-ằẾ-ệỐ-ộỜ-ờỪ-ừỲ-ỳἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-῍῏-ΐῖ-Ί῝῟-῭ῲ-ῴῶ-ῼΩϷ-\u07FBϹ] ; :: [[[:Greek:][:Mn:][:Me:]] [\:-;?·;·]] ; ::NFD (NFC) ; +# Useful variables $lower = [[:latin:][:greek:] & [:Ll:]] ; $upper = [[:latin:][:greek:] & [:Lu:]] ; $accent = [[:Mn:][:Me:]] ; @@ -31,10 +38,13 @@ $under = \u0331; $caron = \u030C; $afterLetter = [:L:] [\'$accent]* ; $beforeLetter = [\'$accent]* [:L:] ; +# Fix punctuation +# preserve orginal \: ↔ \: $under ; \? ↔ \? $under ; \; ↔ \? 
; · ↔ \: ; +# Fix any ancient characters that creep in \u0342 → \u0301 ; \u0302 → \u0301 ; \u0300 → \u0301 ; @@ -42,6 +52,7 @@ $smooth → ; $rough → ; $iotasub → ; ͺ → ; +# need to have these up here so the rules don't mask η ↔ i $under ; Η ↔ I $under ; Ψ } $beforeLower ↔ Ps ; @@ -49,6 +60,7 @@ $iotasub → ; ψ ↔ ps ; ω ↔ o $under ; Ω ↔ O $under; +# at begining or end of word, convert mp to b [^[:L:]$accent] { μπ → b ; μπ } [^[:L:]$accent] → b ; [^[:L:]$accent] { [Μμ][Ππ] → B ; @@ -56,6 +68,7 @@ $iotasub → ; μπ ← b ; Μπ ← B } $beforeLower ; ΜΠ ← B ; +# handle diphthongs ending with upsilon ου ↔ ou ; ΟΥ ↔ OU ; Ου ↔ Ou ; @@ -70,6 +83,7 @@ $fmaker { Υ } $softener ↔ V $under ; $fmaker { Υ ↔ U $under ; υ ↔ y ; Υ ↔ Y ; +# NORMAL α ↔ a ; Α ↔ A ; β ↔ v ; @@ -107,17 +121,24 @@ $fmaker { Υ ↔ U $under ; Π ↔ P ; ρ ↔ r ; Ρ ↔ R ; +# insert separator before things that turn into s [Pp] { } [ςσΣϷϸϺϻ] → \' ; +# special S variants Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L +# Caron means exception +# before a letter, initial ς } $beforeLetter ↔ s $under } $beforeLetter; σ } $beforeLetter ↔ s } $beforeLetter; +# otherwise, after a letter = final $afterLetter { σ ↔ $afterLetter { s $under; $afterLetter { ς ↔ $afterLetter { s ; +# otherwise (isolated) = initial ς ↔ s $under; σ ↔ s ; +# [Pp] { Σ ↔ \'S ; Σ ↔ S ; τ ↔ t ; Τ ↔ T ; @@ -126,6 +147,8 @@ $afterLetter { ς ↔ $afterLetter { s ; χ ↔ ch ; Χ } $beforeLower ↔ Ch ; Χ ↔ CH ; +# Completeness for ASCII +# $ignore = [[:Mark:]''] * ; | ch ← h ; | k ← c ; | i ← j ; @@ -142,6 +165,7 @@ $afterLetter { ς ↔ $afterLetter { s ; | B ← U } $vowel ; | Y ← W ; | Y ← U ; +# Completeness for Greek ϐ → | β ; ϑ → | θ ; ϒ → | Υ ; @@ -155,7 +179,10 @@ $afterLetter { ς ↔ $afterLetter { s ; ϴ → | Θ ; ϵ → | ε ; µ → | μ ; +# delete any 
trailing ' marks used for roundtripping ← [Ππ] { \' } [Ss] ; ← [Νν] { \' } $egammaLike ; ::NFC (NFD) ; +# MINIMAL FILTER GENERATED FOR: Latin-Greek/UNGEGN BACKWARD :: ([[[:Latin:][:Mn:][:Me:]] ['\:?]]) ; + diff --git a/icu4c/source/data/translit/Gujarati_InterIndic.txt b/icu4c/source/data/translit/Gujarati_InterIndic.txt index d5ebb7a7ce5..b43939a6704 100644 --- a/icu4c/source/data/translit/Gujarati_InterIndic.txt +++ b/icu4c/source/data/translit/Gujarati_InterIndic.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Gujarati_InterIndic.txt # Generated from CLDR # + +# Gujarati-InterIndic +#:: NFD (NFC) ; \u0A81→\uE001; # SIGN CANDRABINDU \u0A82→\uE002; # SIGN ANUSVARA ઃ→\uE003; # SIGN VISARGA @@ -90,3 +93,6 @@ ।→\uE064; # DANDA ॥→\uE065; # DOUBLE DANDA ૰→\uE070; # ABBREVIATION SIGN +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/Gujarati_Bengali.txt b/icu4c/source/data/translit/Gujr_Beng.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Bengali.txt rename to icu4c/source/data/translit/Gujr_Beng.txt index 1e1281cc9db..98e71de16af 100644 --- a/icu4c/source/data/translit/Gujarati_Bengali.txt +++ b/icu4c/source/data/translit/Gujr_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gujarati_Bengali.txt +# File: Gujr_Beng.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Devanagari.txt b/icu4c/source/data/translit/Gujr_Deva.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Devanagari.txt rename to icu4c/source/data/translit/Gujr_Deva.txt index 6a980dc299a..ec1b5b0a8d5 100644 --- a/icu4c/source/data/translit/Gujarati_Devanagari.txt +++ b/icu4c/source/data/translit/Gujr_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gujarati_Devanagari.txt +# File: Gujr_Deva.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Gurmukhi.txt b/icu4c/source/data/translit/Gujr_Guru.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Gurmukhi.txt rename to icu4c/source/data/translit/Gujr_Guru.txt index 24bce97b8b5..5e06171b88b 100644 --- a/icu4c/source/data/translit/Gujarati_Gurmukhi.txt +++ b/icu4c/source/data/translit/Gujr_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gujarati_Gurmukhi.txt +# File: Gujr_Guru.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Kannada.txt b/icu4c/source/data/translit/Gujr_Knda.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Kannada.txt rename to icu4c/source/data/translit/Gujr_Knda.txt index 1908bd3e20d..93b10308cb8 100644 --- a/icu4c/source/data/translit/Gujarati_Kannada.txt +++ b/icu4c/source/data/translit/Gujr_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gujarati_Kannada.txt +# File: Gujr_Knda.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Latin.txt b/icu4c/source/data/translit/Gujr_Latn.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Latin.txt rename to icu4c/source/data/translit/Gujr_Latn.txt index aae1faee5ad..8847e79ca83 100644 --- a/icu4c/source/data/translit/Gujarati_Latin.txt +++ b/icu4c/source/data/translit/Gujr_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gujarati_Latin.txt +# File: Gujr_Latn.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Malayalam.txt b/icu4c/source/data/translit/Gujr_Mlym.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Malayalam.txt rename to icu4c/source/data/translit/Gujr_Mlym.txt index b7c5abac8a2..34d614fc8e2 100644 --- a/icu4c/source/data/translit/Gujarati_Malayalam.txt +++ b/icu4c/source/data/translit/Gujr_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gujarati_Malayalam.txt +# File: Gujr_Mlym.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Oriya.txt b/icu4c/source/data/translit/Gujr_Orya.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Oriya.txt rename to icu4c/source/data/translit/Gujr_Orya.txt index de0d3d82e77..dce65711961 100644 --- a/icu4c/source/data/translit/Gujarati_Oriya.txt +++ b/icu4c/source/data/translit/Gujr_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gujarati_Oriya.txt +# File: Gujr_Orya.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Tamil.txt b/icu4c/source/data/translit/Gujr_Taml.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Tamil.txt rename to icu4c/source/data/translit/Gujr_Taml.txt index 7a22cb343e0..c7042827dea 100644 --- a/icu4c/source/data/translit/Gujarati_Tamil.txt +++ b/icu4c/source/data/translit/Gujr_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gujarati_Tamil.txt +# File: Gujr_Taml.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Gujarati_Telugu.txt b/icu4c/source/data/translit/Gujr_Telu.txt similarity index 82% rename from icu4c/source/data/translit/Gujarati_Telugu.txt rename to icu4c/source/data/translit/Gujr_Telu.txt index 82b625a2c32..8b7c2f5b865 100644 --- a/icu4c/source/data/translit/Gujarati_Telugu.txt +++ b/icu4c/source/data/translit/Gujr_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gujarati_Telugu.txt +# File: Gujr_Telu.txt # Generated from CLDR # + ::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ૦-૯]; ::NFD; ::Gujarati-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_InterIndic.txt b/icu4c/source/data/translit/Gurmukhi_InterIndic.txt index 5150c6fbaea..23d6acff8fb 100644 --- a/icu4c/source/data/translit/Gurmukhi_InterIndic.txt +++ b/icu4c/source/data/translit/Gurmukhi_InterIndic.txt @@ -1,12 +1,21 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Gurmukhi_InterIndic.txt # Generated from CLDR # + +# Gurmukhi-InterIndic +#:: NFD (NFC) ; +#ਖ\u0A3C→\uE059; # LETTER KHHA +#ਗ\u0A3C→\uE05A; # LETTER GHHA +#ਜ\u0A3C→\uE05B; # LETTER ZA +#ਸ\u0A3C→\uE036; # LETTER SHA +#ਲ\u0A3C→\uE033; # LETTER LLA +#ਫ\u0A3C→\uE05E; # LETTER FA \u0A01→\uE001; # SIGN CHANDRABINDU \u0A02→\uE002; # SIGN BINDI ਅ→\uE005; # LETTER A @@ -83,3 +92,6 @@ ੴ→\uE080; # EK ONKAR ।→\uE064; # DANDA ॥→\uE065; # DOUBLE DANDA +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/Gurmukhi_Bengali.txt b/icu4c/source/data/translit/Guru_Beng.txt similarity index 83% rename from icu4c/source/data/translit/Gurmukhi_Bengali.txt rename to icu4c/source/data/translit/Guru_Beng.txt index e8b6cd016e2..a1d5298e191 100644 --- a/icu4c/source/data/translit/Gurmukhi_Bengali.txt +++ b/icu4c/source/data/translit/Guru_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business 
Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gurmukhi_Bengali.txt +# File: Guru_Beng.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Devanagari.txt b/icu4c/source/data/translit/Guru_Deva.txt similarity index 82% rename from icu4c/source/data/translit/Gurmukhi_Devanagari.txt rename to icu4c/source/data/translit/Guru_Deva.txt index 16b69fe67a0..b4b633adb11 100644 --- a/icu4c/source/data/translit/Gurmukhi_Devanagari.txt +++ b/icu4c/source/data/translit/Guru_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gurmukhi_Devanagari.txt +# File: Guru_Deva.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Gujarati.txt b/icu4c/source/data/translit/Guru_Gujr.txt similarity index 83% rename from icu4c/source/data/translit/Gurmukhi_Gujarati.txt rename to icu4c/source/data/translit/Guru_Gujr.txt index c517f11a52e..5052427ec5d 100644 --- a/icu4c/source/data/translit/Gurmukhi_Gujarati.txt +++ b/icu4c/source/data/translit/Guru_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gurmukhi_Gujarati.txt +# File: Guru_Gujr.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Kannada.txt b/icu4c/source/data/translit/Guru_Knda.txt similarity index 83% rename from icu4c/source/data/translit/Gurmukhi_Kannada.txt rename to icu4c/source/data/translit/Guru_Knda.txt index 57e7d4999fd..29c97c9e48d 100644 --- a/icu4c/source/data/translit/Gurmukhi_Kannada.txt +++ b/icu4c/source/data/translit/Guru_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gurmukhi_Kannada.txt +# File: Guru_Knda.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Latin.txt b/icu4c/source/data/translit/Guru_Latn.txt similarity index 83% rename from icu4c/source/data/translit/Gurmukhi_Latin.txt rename to icu4c/source/data/translit/Guru_Latn.txt index edc47403d21..d2ad6225af0 100644 --- a/icu4c/source/data/translit/Gurmukhi_Latin.txt +++ b/icu4c/source/data/translit/Guru_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gurmukhi_Latin.txt +# File: Guru_Latn.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Malayalam.txt b/icu4c/source/data/translit/Guru_Mlym.txt similarity index 82% rename from icu4c/source/data/translit/Gurmukhi_Malayalam.txt rename to icu4c/source/data/translit/Guru_Mlym.txt index ae24f68e522..9157c9de154 100644 --- a/icu4c/source/data/translit/Gurmukhi_Malayalam.txt +++ b/icu4c/source/data/translit/Guru_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gurmukhi_Malayalam.txt +# File: Guru_Mlym.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Oriya.txt b/icu4c/source/data/translit/Guru_Orya.txt similarity index 83% rename from icu4c/source/data/translit/Gurmukhi_Oriya.txt rename to icu4c/source/data/translit/Guru_Orya.txt index cf4a7b251ee..9ae5cb24f29 100644 --- a/icu4c/source/data/translit/Gurmukhi_Oriya.txt +++ b/icu4c/source/data/translit/Guru_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gurmukhi_Oriya.txt +# File: Guru_Orya.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Tamil.txt b/icu4c/source/data/translit/Guru_Taml.txt similarity index 83% rename from icu4c/source/data/translit/Gurmukhi_Tamil.txt rename to icu4c/source/data/translit/Guru_Taml.txt index 1872fa09af9..224bfa81744 100644 --- a/icu4c/source/data/translit/Gurmukhi_Tamil.txt +++ b/icu4c/source/data/translit/Guru_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Gurmukhi_Tamil.txt +# File: Guru_Taml.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Gurmukhi_Telugu.txt b/icu4c/source/data/translit/Guru_Telu.txt similarity index 83% rename from icu4c/source/data/translit/Gurmukhi_Telugu.txt rename to icu4c/source/data/translit/Guru_Telu.txt index e6efb5b5342..4fdf1a11123 100644 --- a/icu4c/source/data/translit/Gurmukhi_Telugu.txt +++ b/icu4c/source/data/translit/Guru_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Gurmukhi_Telugu.txt +# File: Guru_Telu.txt # Generated from CLDR # + ::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ]; ::NFD; ::Gurmukhi-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Han_Latin_Names.txt b/icu4c/source/data/translit/Han_Latin_Names.txt index 648547f4218..19977455ba3 100755 --- a/icu4c/source/data/translit/Han_Latin_Names.txt +++ b/icu4c/source/data/translit/Han_Latin_Names.txt @@ -1,19 +1,33 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Han_Latin_Names.txt # Generated from CLDR # + +# This transform is primarily intended to produce readings for Chinese surnames, or for full +# Chinese personal names - surname first - that occur at the beginning of a contiguous Han substring +# (i.e. at the beginning of text, or immediately preceded by space or other non-Han characters). +# Several Han characters have different readings in surnames, than the readings found in Han-Latin. +# ---- +# Insert marker at start of each Han sequence (including Han after space). +# Do this before ::Han-Spacedhan() to catch Han after space in original text, +# and to apply before all other rules. $startOfHanMarker = \uFDD1; [:^script=Han:] { ([:script=Han:]) → $startOfHanMarker $1; +# Need Spacedhan so the name transliterations get spaced properly ::Han-Spacedhan(); +# Convert special name readings that depend on next character 令 } \u0020? 狐 →líng; 万 } \u0020? 俟 →mò; 澹 } \u0020? 台 →tán; +# The following maps 长 to the standard Han-Latin reading zhǎng for this case, +# to override the normal Han-Latin/Names reading 长→cháng further below $startOfHanMarker{ 长 } \u0020? 孙 →zhǎng; +# Convert single characters with special name readings $startOfHanMarker{ 秘→bì; $startOfHanMarker{ 卜→bǔ; 长→cháng; @@ -48,7 +62,11 @@ $startOfHanMarker{ 员→yùn; $startOfHanMarker{ 查→zhā; 翟→zhái; 曾→zēng; +# Convert $startOfHanMarkers to space, or to nothing if they are at the beginning of text. +# Need to do this as a separate pass to get the spacing right. 
::Null(); [^$]{ $startOfHanMarker →\u0020; $startOfHanMarker →; +# Then run the normal Han-Latin transform for the rest ::Han-Latin(); + diff --git a/icu4c/source/data/translit/Han_Spacedhan.txt b/icu4c/source/data/translit/Han_Spacedhan.txt index 0126b40f23b..01e80226801 100644 --- a/icu4c/source/data/translit/Han_Spacedhan.txt +++ b/icu4c/source/data/translit/Han_Spacedhan.txt @@ -1,18 +1,27 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Han_Spacedhan.txt # Generated from CLDR # -:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc; + +# Only intended for internal use +# Make sure Han are normalized, including characters that contain them. +# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:] +# Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release! 
+:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc; :: fullwidth-halfwidth; 。 → '.'; $terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]]; $initialPunct = [:Ps:][:Pi:]; +# add space between any Han or terminal punctuation and letters, and +# between letters and Han or initial punct [[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ; [:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ; +# remove spacing between ideographs and other letters ← [:Ideographic:] { ' ' } [:Letter:] ; ← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ; + diff --git a/icu4c/source/data/translit/Hangul_Latin.txt b/icu4c/source/data/translit/Hang_Latn.txt similarity index 82% rename from icu4c/source/data/translit/Hangul_Latin.txt rename to icu4c/source/data/translit/Hang_Latn.txt index 3b5d04e6064..bcd153ac210 100644 --- a/icu4c/source/data/translit/Hangul_Latin.txt +++ b/icu4c/source/data/translit/Hang_Latn.txt @@ -1,13 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Hangul_Latin.txt +# File: Hang_Latn.txt # Generated from CLDR # + ::['ᄀ-하-ᅵᆨ-ᇂㄱ-ㄿㅁ-ㅃㅅ-ㅣ㈀-㈜㉠-㉻가-힣'ᄀ-ᆵᄆ-ᄈᄉ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ]; ::NFKD; ::ConjoiningJamo-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Han_Latin.txt b/icu4c/source/data/translit/Hani_Latn.txt similarity index 99% rename from icu4c/source/data/translit/Han_Latin.txt rename to icu4c/source/data/translit/Hani_Latn.txt index ce006384a56..ee56a5f68dd 100644 --- a/icu4c/source/data/translit/Han_Latin.txt +++ b/icu4c/source/data/translit/Hani_Latn.txt @@ -1,16 +1,22 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Han_Latin.txt +# File: Hani_Latn.txt # Generated from CLDR # + +# Warning: does not do round-trip mapping!! +# Convert CJK characters ::Han-Spacedhan(); -藏 } \u0020? 文 →zàng; -重 } \u0020? 庆 →chóng; -沈 } \u0020? 阳 →shěn; +# Convert compounds; these are added individually, not derived from Unihan kMandarin. +# Note that Han-Spacedhan() has already been applied, so there should be spaces between Han characters. +藏 } \u0020? 文 →zàng;# 藏 is zàng (not cáng) if followed by 文 wén: 藏文 language Zàngwén = Tibetan +重 } \u0020? 庆 →chóng;# 重 is chóng (not zhòng) if followed by 庆 qìng: 重庆 city Chóngqìng +沈 } \u0020? 
阳 →shěn;# 沈 is shěn (not chén) if followed by 阳 yáng: 沈阳 city Shěnyáng +# START AUTOGENERATED Han-Latin.xml ( Unihan kMandarin) [呵锕阿𠼞𥥩𨉚]→ā; [嗄]→á; [啊]→a; @@ -1604,3 +1610,31 @@ [㝾佐唨左繓𠂇𥙀𦈛𧲭𨀨]→zuǒ; [㑅㘀㘴㤰㭮䔘䟶作侳做唑坐岝岞座怍祚糳胙葃葄袏阼飵𠱯𡯨𡹥𥅁𥥏𥽿𦥬𧃘𨐳𨝨𪎲]→zuò; [咗蓙]→zuo; +# END AUTOGENERATED Han-Latin.xml (Unihan kMandarin) +# fallbacks +## | yi ← i; +## | wu ← u; +## | bi ← b; +## | ci ← c; +## | di ← d; +## | fu ← f; +## | gu ← g; +## | he ← h; +## | ji ← j; +## | ku ← k; +## | li ← l; +## | mi ← m; +## | pi ← p; +## | qi ← q; +## | l ← r; +## | si ← s; +## | ti ← t; +## | f ← v; +## | wa ← w; +## | xi ← x; +## | yi ← y; +## | zi ← z; +# filter out the half-width hangul +# :: [^ᄒ-○] fullwidth-halfwidth (); +## :: (lower) ; + diff --git a/icu4c/source/data/translit/Simplified_Traditional.txt b/icu4c/source/data/translit/Hans_Hant.txt similarity index 98% rename from icu4c/source/data/translit/Simplified_Traditional.txt rename to icu4c/source/data/translit/Hans_Hant.txt index 88f1039c865..78d8437fbc0 100644 --- a/icu4c/source/data/translit/Simplified_Traditional.txt +++ b/icu4c/source/data/translit/Hans_Hant.txt @@ -1,25 +1,38 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Simplified_Traditional.txt +# File: Hans_Hant.txt # Generated from CLDR # + +# Copyright (c) 2005-2007,2010 Apple Inc., Unicode Inc., +# and others. All Rights Reserved. 
+# For terms of use, see http://unicode.org/copyright.html#Exhibit1 +# Convert between simplified and traditional Chinese +# UTRANS_FORWARD is from simplified to traditional $SCDigit = [零一二三四五六七八九十百千万亿两] ; $TCDigit = [零一二三四五六七八九十百千萬億兩] ; +# +# complex mappings for which there is no easy solution +# so we arbitrarily pick one +# +# does not handle the surnames 于 or 於 于飞↔于飛; 于归↔于歸; 于思↔于思; 单于↔單于; 鲜于↔鮮于; +# 姜片↔薑片; 姜末↔薑末; 生姜↔生薑; 鬼子姜↔鬼子薑; 姜↔姜; 姜←薑; +# 赤皮仑↔赤皮崙; 金仑溪↔金崙溪; 下仑↔下崙; @@ -33,6 +46,10 @@ $TCDigit = [零一二三四五六七八九十百千萬億兩] ; 仑↔侖; 曲↔曲; 曲←麯; +# +# Multiliple TC to SC +# +# 茶余饭后↔茶餘飯後; 余怒未消↔餘怒未消; 余音绕梁↔餘音繞梁; @@ -69,22 +86,27 @@ $SCDigit { 余 → 餘; 余↔余; 馀→餘; 余←餘; +# 什么↔甚麼; 么↔麼; 么←麽; 么←么; +# 复数↔複數; 复分数↔複分數; 复杂↔複雜; 复制↔複製; 复↔復; 复←複; +# +# 了然↔瞭然; 了解↔瞭解; 了望↔瞭望; 明了↔明瞭; 了↔了; 了←瞭; +# 解铃系铃↔解鈴繫鈴; 系词↔繫詞; 系辞↔繫辭; @@ -97,6 +119,7 @@ $SCDigit { 余 → 餘; 系↔系; 系←係; 系←繫; +# 糊里糊涂↔糊裡糊塗; 稀里糊涂↔稀裡糊塗; 蒙在鼓里↔蒙在鼓裡; @@ -193,6 +216,10 @@ $SCDigit { 海里 → 海里; 里外↔裡外; 里←裡; 里←裏; +# +# 乾 appears in the most compounds +# 幹 is next +# then 干 干什么↔幹甚麼; 干部↔幹部; 干才↔幹才; @@ -234,6 +261,7 @@ $SCDigit { 海里 → 海里; 干↔乾; 干←幹; 干←干; +# 划时代↔劃時代; 划分↔劃分; 划分←畫分; @@ -269,6 +297,7 @@ $SCDigit { 海里 → 海里; 划↔划; 划←劃; 画↔畫; +# $SCDigit { 斗 → 斗; 斗量↔斗量; 斗胆↔斗胆; @@ -289,6 +318,7 @@ $SCDigit { 斗 → 斗; 斗←斗; 斗←闘; 斗←鬭; +# 周 } $SCDigit → 週; 周末↔週末; 周刊↔週刊; @@ -300,6 +330,8 @@ $SCDigit { 斗 → 斗; 本周↔本週; 周↔周; 周←週; +# +# 松球↔松毬; 球花↔毬花; 球果↔毬果; @@ -316,11 +348,13 @@ $SCDigit { 斗 → 斗; 肉松↔肉鬆; 松↔松; 松←鬆; +# 果↔果; 果←菓; 老板↔老闆; 板↔板; 板←闆; +# 面条↔麵條; 面粉↔麵粉; 面包↔麵包; @@ -343,14 +377,17 @@ $SCDigit { 斗 → 斗; 白面↔白麵; 面↔面; 面←麵; +# 防御↔防禦; 御敌↔禦敵; 御寒↔禦寒; 御↔御; 御←禦; +# 腼腆↔靦腆; 腼←靦; 䩄→靦; +# 宫商角徵羽↔宮商角徵羽; 征伐↔征伐; 征服↔征服; @@ -361,18 +398,21 @@ $SCDigit { 斗 → 斗; 亲征↔親征; 征↔徵; 征←征; +# 台风↔颱風; 写字台↔寫字檯; 台↔台; 台←颱; 台←臺; 台←檯; +# 胡同↔衚衕; 胡子↔鬍子; 胡须↔鬍鬚; 胡↔胡; 胡←鬍; 胡←衚; +# 须根↔鬚根; 须鲸↔鬚鯨; 须眉↔鬚眉; @@ -380,6 +420,7 @@ $SCDigit { 斗 → 斗; 触须↔觸鬚; 须↔須; 须←鬚; +# $SCDigit { 只 → 隻; 形单影只↔形單影隻; 只贺新禧←祇賀新禧; @@ -396,6 +437,7 @@ $SCDigit { 只 → 隻; 祇↔祇; 只↔只; 只←隻; +# 并发↔併發; 并拢↔併攏; 并入↔併入; @@ -405,10 +447,13 @@ $SCDigit { 只 → 隻; 
吞并↔吞併; 并↔並; 并←併; +# 当↔當; 当←噹; +# 药↔藥; 药←葯; +# 布道↔佈道; 布景↔佈景; 布局↔佈局; @@ -421,12 +466,14 @@ $SCDigit { 只 → 隻; 宣布↔宣佈; 布↔布; 布←佈; +# 开天辟地↔開天闢地; 开辟↔開闢; 辟邪↔闢邪; 辟↔辟; 辟←闢; 人言藉藉↔人言藉藉; +# 借口↔藉口; 借故↔藉故; 借使↔藉使; @@ -439,20 +486,25 @@ $SCDigit { 只 → 隻; 借↔借; 借←藉; 藉→藉; +# 尽管↔儘管; 尽↔盡; 尽←儘; +# 叶韵↔叶韻; 叶↔葉; 叶←叶; +# 伙计↔夥計; 伙伴↔夥伴; 伙↔伙; 伙←夥; +# 家具↔傢具; 家伙↔傢伙; 家↔家; 家←傢; +# 奸夫↔姦夫; 奸妇↔姦婦; 奸情↔姦情; @@ -465,6 +517,7 @@ $SCDigit { 只 → 隻; 诱奸↔誘姦; 奸↔奸; 奸←姦; +# 历书↔曆書; 历法↔曆法; 公历↔公曆; @@ -483,18 +536,22 @@ $SCDigit { 只 → 隻; 万历↔萬曆; 历↔歷; 历←曆; +# 万俟↔万俟; # surname +# 气冲冲↔氣沖沖; 气焰↔氣燄; 焰←燄; 气↔氣; 气←气; +# 细致↔細緻; 精致↔精緻; 标致↔標緻; 别致↔別緻; 致↔致; 致←緻; +# 制版↔製版; 制成↔製成; 制品↔製品; @@ -511,6 +568,7 @@ $SCDigit { 只 → 隻; 预制↔預製; 制↔制; 制←製; +# 谷贱伤农↔穀賤傷農; 谷神星↔穀神星; 鬼谷子↔鬼谷子; @@ -535,6 +593,7 @@ $SCDigit { 只 → 隻; 谷↔谷; 谷←穀; 谷←榖; +# 后妃↔后妃; 后稷↔后稷; 后土↔后土; @@ -545,6 +604,7 @@ $SCDigit { 只 → 隻; 太后↔太后; 后↔後; 后←后; +# 地方志↔地方誌; 标志↔標誌; 墓志↔墓誌; @@ -554,19 +614,24 @@ $SCDigit { 只 → 隻; 杂志↔雜誌; 志↔志; 志←誌; +# 别扭↔彆扭; 别↔別; 别←彆; +# 汇报↔彙報; 词汇↔詞彙; 字汇↔字彙; 汇↔匯; 汇←彙; +# 辞↔辭; 辞←辞; 词↔詞; +# 机↔機; 机←机; +# 发廊↔髮廊; 发妻↔髮妻; 发型↔髮型; @@ -631,11 +696,13 @@ $SCDigit { 只 → 隻; 发↔發; 发←髮; 卷←捲; +# 人云亦云↔人云亦云; 不知所云↔不知所云; 云游↔雲遊; 云↔雲; 云←云; +# 子丑寅卯↔子丑寅卯; 生旦淨末丑↔生旦净末丑; 丑时↔丑時; @@ -644,9 +711,11 @@ $SCDigit { 只 → 隻; 小丑↔小丑; 丑↔醜; 丑←丑; +# 萝卜↔蘿蔔; 卜↔卜; 卜←蔔; +# 冲茶↔沖茶; 冲淡↔沖淡; 冲服↔沖服; @@ -660,16 +729,20 @@ $SCDigit { 只 → 隻; 冲↔衝; 冲←沖; 冲←冲; +# $SCDigit { 出 } 戏 → 齣; 出游↔出遊; 出↔出; 出←齣; +# 线↔線; 线←綫; +# 核实↔覈實; 核算↔覈算; 核↔核; 核←覈; +# 回路↔迴路; 回廊↔迴廊; 回游↔回遊; @@ -678,12 +751,15 @@ $SCDigit { 出 } 戏 → 齣; 回↔回; 回←迴; 回←廻; +# 冬冬↔鼕鼕; 冬↔冬; 冬←鼕; +# 咸菜↔鹹菜; 咸↔咸; 咸←鹹; +# 清心寡欲↔清心寡慾; 克欲修行↔克慾修行; 欲不可纵↔慾不可縱; @@ -705,6 +781,7 @@ $SCDigit { 出 } 戏 → 齣; 嗜欲↔嗜慾; 欲↔欲; 欲←慾; +# 准绳↔準繩; 准时↔準時; 准头↔準頭; @@ -719,6 +796,7 @@ $SCDigit { 出 } 戏 → 齣; 准↔准; 准←準; 标↔標; +# 注册↔註冊; 注销↔註銷; 注解↔註解; @@ -728,6 +806,9 @@ $SCDigit { 出 } 戏 → 齣; 加注↔加註; 注↔注; 注←註; +# +# variants +# 凶暴↔兇暴; 凶器↔兇器; 凶手↔兇手; @@ -736,23 +817,32 @@ $SCDigit { 出 } 戏 → 齣; 逞凶↔逞兇; 凶↔凶; 凶←兇; +# 扬↔揚; 扬←䬗; 飏↔颺; +# 宴↔宴; 宴←醼; 䜩↔讌; +# 咬↔咬; 咬←齩; 咬←䶧; +# 豆↔豆; 豆←荳; +# 韭↔韭; 韭←韮; +# +# 笺↔箋; 笺←牋; +# 团↔團; 团←糰; +# 
卤鸡↔滷雞; 卤味↔滷味; 卤菜↔滷菜; @@ -760,36 +850,50 @@ $SCDigit { 出 } 戏 → 齣; 盐卤↔鹽滷; 卤↔鹵; 卤←滷; +# 呆↔呆; 呆←獃; +# 泛↔泛; 泛←氾; 泛←汎; +# 妫↔媯; 妫←嬀; +# 众↔眾; 众←衆; +# 钩↔鈎; 钩←鉤; +# 绱↔緔; 绱←鞝; +# 锐↔銳; 锐←鋭; +# 赝↔贋; 赝←贗; 赃↔贓; 赃←贜; +# 粗↔粗; 粗←麤; +# 关↔關; 关←関; +# 饥↔飢; 饥←饑; +# 款↔款; 款←欵; 胧↔朧; +# 蒙↔蒙; 蒙←懞; +# 骂↔罵; 骂←駡; 脏↔臟; @@ -819,18 +923,24 @@ $SCDigit { 出 } 戏 → 齣; 炮↔炮; 炮←砲; 炮←礮; +# 启↔啓; 启←啟; +# 茶几↔茶几; 几↔幾; 几←几; +# 德↔德; 德←悳; +# 悫↔愨; 悫←慤; +# 克↔克; 克←剋; 克←尅; +# 坛坛罐罐↔罈罈罐罐; 瓶瓶坛坛↔瓶瓶罈罈; 醋坛↔醋罈; @@ -840,6 +950,7 @@ $SCDigit { 出 } 戏 → 齣; 坛←壜; 坛←罎; 坛←罈; +# 升华↔昇華; 毕升↔畢昇; 高升↔高昇; @@ -847,19 +958,26 @@ $SCDigit { 出 } 戏 → 齣; 升↔升; 升←昇; 升←陞; +# 伪↔偽; 伪←僞; +# 收获→收穫; 获↔獲; 获←穫; +# 绦↔縧; 绦←絛; +# 绣↔繡; 绣←綉; +# 钵↔鉢; 钵←缽; +# 蜡↔蠟; 蜡←蜡; +# 采薪之忧↔采薪之憂; 兴高采烈↔興高采烈; 无精打采↔無精打采; @@ -875,6 +993,7 @@ $SCDigit { 出 } 戏 → 齣; 䌽→綵; 采↔採; 采←埰; +# 厕↔廁; 厕←厠; 捣↔搗; @@ -899,8 +1018,10 @@ $SCDigit { 出 } 戏 → 齣; 凼←氹; 床↔床; 床←牀; +# first form is more common 墙↔牆; 墙←墻; +# 奖↔獎; 奖←奬; 眦↔眥; @@ -927,8 +1048,10 @@ $SCDigit { 出 } 戏 → 齣; 酝←醞; 录↔錄; 录←録; +# 鏽 is more common 锈↔鏽; 锈←銹; +# 镢↔鐝; 䦆←钁; 阅↔閱; @@ -939,6 +1062,7 @@ $SCDigit { 出 } 戏 → 齣; 闲居↔閑居; 闲↔閒; 闲←閑; +# 游山玩水↔遊山玩水; 游伴↔遊伴; 游程↔遊程; @@ -998,6 +1122,7 @@ $SCDigit { 出 } 戏 → 齣; 夜游↔夜遊; 游↔游; 游←遊; +# 表蒙子↔錶蒙子; 表带↔錶帶; 表链↔錶鏈; @@ -1018,11 +1143,14 @@ $SCDigit { 出 } 戏 → 齣; 停表↔停錶; 表↔表; 表←錶; +# 症结↔癥結; 症↔症; 症←癥; +# 痴↔痴; 痴←癡; +# 白洋淀↔白洋淀; 荷花淀↔荷花淀; 水淀↔水淀; @@ -1030,22 +1158,26 @@ $SCDigit { 出 } 戏 → 齣; 东淀↔東淀; 淀↔澱; 淀←淀; +# 向导↔嚮導; 响应←嚮應; 向往↔嚮往; 向↔向; 向←嚮; 向←曏; +# 扎营↔紮營; 驻扎↔駐紮; 扎↔扎; 扎←紮; +# 占卜↔占卜; 占卦↔占卦; 占梦↔占夢; 占星↔占星; 占↔佔; 占←占; +# 托名↔託名; 托收↔託收; 信托↔信託; @@ -1061,14 +1193,18 @@ $SCDigit { 出 } 戏 → 齣; 托↔托; 托←託; 讬→託; +# 涌↔湧; 涌←涌; +# 累↔累; 累←纍; +# 困惫↔睏憊; 困乏↔睏乏; 困↔困; 困←睏; +# 左邻右舍↔左鄰右舍; 舍利↔舍利; 舍弟↔舍弟; @@ -1084,42 +1220,53 @@ $SCDigit { 出 } 戏 → 齣; 猪舍↔豬舍; 舍↔捨; 舍←舍; +# 杠↔槓; 杠←杠; +# 雇员↔僱員; 雇↔雇; 雇←僱; +# 刮倒↔颳倒; 刮↔刮; 刮←颳; +# 狸↔狸; 狸←貍; +# 跌交↔跌跤; 交↔交; 交←跤; +# 侄媳妇↔姪媳婦; 侄女↔姪女; 侄孙↔姪孫; 侄↔侄; 侄←姪; +# 勋↔勳; 勋←勛; +# 秋千↔鞦韆; 荡秋千↔盪鞦韆; 荡↔蕩; 荡←盪; 秋↔秋; +# 不寒而栗↔不寒而慄; 颤栗↔顫慄; 战栗↔戰慄; 栗↔栗; 栗←慄; +# 细嚼慢咽↔細嚼慢嚥; 狼吞虎咽↔狼吞虎嚥; 咽气↔嚥氣; 下咽↔下嚥; 咽↔咽; 咽←嚥; +# 吊民伐罪↔弔民伐罪; 形影相吊↔形影相弔; 
提心吊胆↔提心弔膽; @@ -1128,32 +1275,43 @@ $SCDigit { 出 } 戏 → 齣; 吊唁↔弔唁; 吊↔吊; 吊←弔; +# 英寸↔英吋; +# 方腊↔方腊; 腊↔臘; +# 乡愿↔鄉愿; 愿↔願; 愿←愿; +# 古迹↔古蹟; 史迹↔史蹟; 迹↔跡; 迹←蹟; +# 净↔淨; 净←凈; +# 侥幸↔僥倖; 侥↔僥; 幸↔幸; 幸←倖; +# 蚝↔蠔; 蚝←蚝; +# 柜柳↔柜柳; # ju3liu3 柜↔櫃; # gui4 +# 拉纤↔拉縴; 纤夫↔縴夫; 纤路↔縴路; 纤绳↔縴繩; 纤↔纖; # reading xian1 纤←縴; # reading qian4 +# +# separate readings for po1 or po4 from pu2 厚朴↔厚朴; 朴刀↔朴刀; # po1dao1 朴硝↔朴硝; # po4xiao1 @@ -1533,6 +1691,9 @@ $SCDigit { 出 } 戏 → 齣; 镌↔鐫; 镌←鎸; 于↔於; +# +# one-to-one mappings +# 亘↔亙; 铝↔鋁; 极↔極; @@ -1611,6 +1772,7 @@ $SCDigit { 出 } 戏 → 齣; 㑩↔儸; 傩↔儺; 俨↔儼; +# Preserve 丰 for traditional in some cases 丰标不凡→丰標不凡; 丰}[度情茸姿神采]→丰; 丰仪→丰儀; @@ -3985,5 +4147,7 @@ $SCDigit { 出 } 戏 → 齣; 龚↔龔; 龛↔龕; 龟↔龜; +# map some punctuation too “↔「; ”↔」; + diff --git a/icu4c/source/data/translit/Hebrew_Latin.txt b/icu4c/source/data/translit/Hebr_Latn.txt similarity index 68% rename from icu4c/source/data/translit/Hebrew_Latin.txt rename to icu4c/source/data/translit/Hebr_Latn.txt index 16dda62fc83..eb132682d82 100644 --- a/icu4c/source/data/translit/Hebrew_Latin.txt +++ b/icu4c/source/data/translit/Hebr_Latn.txt @@ -1,15 +1,36 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Hebrew_Latin.txt +# File: Hebr_Latn.txt # Generated from CLDR # + +# Transliteration table for Hebrew +# Based on the UNGEGN table at: +# http://www.eki.ee/wgrs/rom1_he.pdf +# +# Exceptions: +# - Accents are added to disambiguate letters +# - Combinations of dagesh, shin/sin dot that produce different +# letters are not yet encoded. 
+# +# To test, open: +# http://www.ibm.com/software/globalization/icu/demo/transform +# Click Edit, paste in this file, Save As hebrew-latin/XXX +# (where XXX is a username) +# Now go back to the main window, and try it out. +# Use hebrew-latin/XXX for Output 1, and (Inverse) for Output 2 +# Paste in hebrew text in Input, and hit Transliterate. +# +# For more information, see: +# http://icu.sourceforge.net/userguide/Transform.html :: [[:Hebrew:] [:^ccc=0:] [\u05B0-\u05B9\u05BB-\u05BC\u05C1-\u05C2ℵ-ℸ\u0304\u05BF] - [\u05BD]] ; :: nfkd (nfc) ; $letterAfter = [:M:]* [:L:] ; +# move longer items here to avoid masking ח ↔ h\u0331 ; צ ↔ z\u0331 } $letterAfter; ץ ↔ z\u0331 ; @@ -43,6 +64,7 @@ $letterAfter = [:M:]* [:L:] ; \u05BC ↔ \u0307 ; # dagesh just goes to overdot for now \u05C1 ↔ \u030C ; # shin dot -→ sh \u05C2 ↔ \u0302 ; # sin dot -→ s +# points $above = [^[:ccc=0:][:ccc=230:]]*; ‎\u05B2‎ → à ; ‎\u05B2‎ $1← a ($above) \u0300; @@ -62,6 +84,7 @@ $above = [^[:ccc=0:][:ccc=230:]]*; \u05B6 ↔ e ; \u05B3 ↔ o ; \u05BF ↔ \u0304 ; +# fallbacks ק ← c ; פ ← f } $letterAfter; ף ← f ; @@ -71,3 +94,4 @@ $above = [^[:ccc=0:][:ccc=230:]]*; :: (lower); :: nfc (nfd) ; :: ([[:Latin:] [:^ccc=0:] [ʻ-ʼ\u0300-\u0302\u0307\u030C\u0327\u0331\u0340-\u0341 \u0304 ]]); + diff --git a/icu4c/source/data/translit/Hira_Kana.txt b/icu4c/source/data/translit/Hira_Kana.txt new file mode 100644 index 00000000000..7ae5f60602a --- /dev/null +++ b/icu4c/source/data/translit/Hira_Kana.txt @@ -0,0 +1,188 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: Hira_Kana.txt +# Generated from CLDR +# + +# note: a global filter is more efficient, but MUST include all source chars +:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ; +:: NFKC (); +# Hiragana-Katakana +# This is largely a one-to-one mapping, but it has a +# few kinks: +# 1. The Katakana va/vi/ve/vo (30F7-30FA) have no +# Hiragana equivalents. We use Hiragana wa/wi/we/wo +# (308F-3092) with a voicing mark (3099), which is +# semantically equivalent. However, this is a non- +# roundtripping transformation. +# 2. The Katakana small ka/ke (30F5,30F6) have no +# Hiragana equivalents. We convert them to normal +# Hiragana ka/ke (304B,3051). This is a one-way +# information-losing transformation and precludes +# round-tripping of 30F5 and 30F6. +# 3. The combining marks 3099-309C are in the Hiragana +# block, but they apply to Katakana as well, so we +# leave them untouched. +# 4. The Katakana prolonged sound mark 30FC doubles the +# preceding vowel. This is a one-way information- +# losing transformation from Katakana to Hiragana. +# 5. The Katakana middle dot separates words in foreign +# expressions; we leave this unmodified. +# The above points preclude successful round-trip +# transformations of arbitrary input text. However, +# they provide naturalistic results that should conform +# to user expectations. 
+# Combining equivalents va/vi/ve/vo +わ\u3099 ↔ ヷ; +ゐ\u3099 ↔ ヸ; +ゑ\u3099 ↔ ヹ; +を\u3099 ↔ ヺ; +# One-to-one mappings, main block +# 3041:3094 ↔ 30A1:30F4 +# 309D,E ↔ 30FD,E +ぁ ↔ ァ; +あ ↔ ア; +ぃ ↔ ィ; +い ↔ イ; +ぅ ↔ ゥ; +う ↔ ウ; +ぇ ↔ ェ; +え ↔ エ; +ぉ ↔ ォ; +お ↔ オ; +か ↔ カ; +が ↔ ガ; +き ↔ キ; +ぎ ↔ ギ; +く ↔ ク; +ぐ ↔ グ; +け ↔ ケ; +げ ↔ ゲ; +こ ↔ コ; +ご ↔ ゴ; +さ ↔ サ; +ざ ↔ ザ; +し ↔ シ; +じ ↔ ジ; +す ↔ ス; +ず ↔ ズ; +せ ↔ セ; +ぜ ↔ ゼ; +そ ↔ ソ; +ぞ ↔ ゾ; +た ↔ タ; +だ ↔ ダ; +ち ↔ チ; +ぢ ↔ ヂ; +っ ↔ ッ; +つ ↔ ツ; +づ ↔ ヅ; +て ↔ テ; +で ↔ デ; +と ↔ ト; +ど ↔ ド; +な ↔ ナ; +に ↔ ニ; +ぬ ↔ ヌ; +ね ↔ ネ; +の ↔ ノ; +は ↔ ハ; +ば ↔ バ; +ぱ ↔ パ; +ひ ↔ ヒ; +び ↔ ビ; +ぴ ↔ ピ; +ふ ↔ フ; +ぶ ↔ ブ; +ぷ ↔ プ; +へ ↔ ヘ; +べ ↔ ベ; +ぺ ↔ ペ; +ほ ↔ ホ; +ぼ ↔ ボ; +ぽ ↔ ポ; +ま ↔ マ; +み ↔ ミ; +む ↔ ム; +め ↔ メ; +も ↔ モ; +ゃ ↔ ャ; +や ↔ ヤ; +ゅ ↔ ュ; +ゆ ↔ ユ; +ょ ↔ ョ; +よ ↔ ヨ; +ら ↔ ラ; +り ↔ リ; +る ↔ ル; +れ ↔ レ; +ろ ↔ ロ; +ゎ ↔ ヮ; +わ ↔ ワ; +ゐ ↔ ヰ; +ゑ ↔ ヱ; +を ↔ ヲ; +ん ↔ ン; +ゔ ↔ ヴ; +ゝ ↔ ヽ; +ゞ ↔ ヾ; +# One-way Katakana-Hiragana xform of small K ka/ke to +# normal H ka/ke. +か ← ヵ; +け ← ヶ; +# Katakana followed by a prolonged sound mark 30FC has +# its final vowel doubled. This is a Katakana-Hiragana +# one-way information-losing transformation. We +# include the small Katakana (e.g., small A 3041) and +# do not distinguish them from their large +# counterparts. It doesn't make sense to double a +# small counterpart vowel as a small Hiragana vowel, so +# we don't do so. In natural text this should never +# occur anyway. If a 30FC is seen without a preceding +# vowel sound (e.g., after n 30F3) we do not change it. +### $long = ー; +# The following categories are Hiragana, not Katakana +# as might be expected, since by the time we get to the +# 30FC, the preceding character will have already been +# transformed to Hiragana. 
+# {The following mechanically generated from the +# Unicode 3.0 data:} +$xa = [ \ +ぁ あ か が さ ざ \ +た だ な は ば ぱ \ +ま ゃ や ら ゎ わ \ +]; +$xi = [ \ +ぃ い き ぎ し じ \ +ち ぢ に ひ び ぴ \ +み り ゐ \ +]; +$xu = [ \ +ぅ う く ぐ す ず \ +っ つ づ ぬ ふ ぶ \ +ぷ む ゅ ゆ る ゔ \ +]; +$xe = [ \ +ぇ え け げ せ ぜ \ +て で ね へ べ ぺ \ +め れ ゑ \ +]; +$xo = [ \ +ぉ お こ ご そ ぞ \ +と ど の ほ ぼ ぽ \ +も ょ よ ろ を \ +]; +あ ← $xa {ー}; +い ← $xi {ー}; +う ← $xu {ー}; +え ← $xe {ー}; +お ← $xo {ー}; +:: (NFKC) ; +# note: a global filter is more efficient, but MUST include all source chars!! +:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]); +# eof + diff --git a/icu4c/source/data/translit/Hiragana_Latin.txt b/icu4c/source/data/translit/Hira_Latn.txt similarity index 87% rename from icu4c/source/data/translit/Hiragana_Latin.txt rename to icu4c/source/data/translit/Hira_Latn.txt index 64609b65e0e..8e6f2dc8c44 100644 --- a/icu4c/source/data/translit/Hiragana_Latin.txt +++ b/icu4c/source/data/translit/Hira_Latn.txt @@ -1,12 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Hiragana_Latin.txt +# File: Hira_Latn.txt # Generated from CLDR # + :: [ぁ-ゔ\u3099ゝ-ゞガギグゲゴザジズゼゾダヂヅデドバビブベボヴヷ-ヺーヾ] ; :: NFD ; :: Hiragana-Katakana; @@ -14,3 +15,4 @@ :: NFC ; :: (Lower) ; :: ([',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]) ; + diff --git a/icu4c/source/data/translit/Hiragana_Katakana.txt b/icu4c/source/data/translit/Hiragana_Katakana.txt deleted file mode 100644 index a599b6e5a5c..00000000000 --- a/icu4c/source/data/translit/Hiragana_Katakana.txt +++ /dev/null @@ -1,135 +0,0 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. -# * -# *************************************************************************** -# File: Hiragana_Katakana.txt -# Generated from CLDR -# -:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ; -:: NFKC (); -わ\u3099 ↔ ヷ; -ゐ\u3099 ↔ ヸ; -ゑ\u3099 ↔ ヹ; -を\u3099 ↔ ヺ; -ぁ ↔ ァ; -あ ↔ ア; -ぃ ↔ ィ; -い ↔ イ; -ぅ ↔ ゥ; -う ↔ ウ; -ぇ ↔ ェ; -え ↔ エ; -ぉ ↔ ォ; -お ↔ オ; -か ↔ カ; -が ↔ ガ; -き ↔ キ; -ぎ ↔ ギ; -く ↔ ク; -ぐ ↔ グ; -け ↔ ケ; -げ ↔ ゲ; -こ ↔ コ; -ご ↔ ゴ; -さ ↔ サ; -ざ ↔ ザ; -し ↔ シ; -じ ↔ ジ; -す ↔ ス; -ず ↔ ズ; -せ ↔ セ; -ぜ ↔ ゼ; -そ ↔ ソ; -ぞ ↔ ゾ; -た ↔ タ; -だ ↔ ダ; -ち ↔ チ; -ぢ ↔ ヂ; -っ ↔ ッ; -つ ↔ ツ; -づ ↔ ヅ; -て ↔ テ; -で ↔ デ; -と ↔ ト; -ど ↔ ド; -な ↔ ナ; -に ↔ ニ; -ぬ ↔ ヌ; -ね ↔ ネ; -の ↔ ノ; -は ↔ ハ; -ば ↔ バ; -ぱ ↔ パ; -ひ ↔ ヒ; -び ↔ ビ; -ぴ ↔ ピ; -ふ ↔ フ; -ぶ ↔ ブ; -ぷ ↔ プ; -へ ↔ ヘ; -べ ↔ ベ; -ぺ ↔ ペ; -ほ ↔ ホ; -ぼ ↔ ボ; -ぽ ↔ ポ; -ま ↔ マ; -み ↔ ミ; -む ↔ ム; -め ↔ メ; -も ↔ モ; -ゃ ↔ ャ; -や ↔ ヤ; -ゅ ↔ ュ; -ゆ ↔ ユ; -ょ ↔ ョ; -よ ↔ ヨ; -ら ↔ ラ; -り ↔ リ; -る ↔ ル; -れ ↔ レ; -ろ ↔ ロ; -ゎ ↔ ヮ; -わ ↔ ワ; -ゐ ↔ ヰ; -ゑ ↔ ヱ; -を ↔ ヲ; -ん ↔ ン; -ゔ ↔ ヴ; -ゝ ↔ ヽ; -ゞ ↔ ヾ; -か ← ヵ; -け ← ヶ; -$xa = [ \ -ぁ あ か が さ ざ \ -た だ な は ば ぱ \ -ま ゃ や ら ゎ わ \ -]; -$xi = [ \ -ぃ い き ぎ し じ \ -ち ぢ に ひ び ぴ \ -み り ゐ \ -]; -$xu = [ \ -ぅ う く ぐ す ず \ -っ 
つ づ ぬ ふ ぶ \ -ぷ む ゅ ゆ る ゔ \ -]; -$xe = [ \ -ぇ え け げ せ ぜ \ -て で ね へ べ ぺ \ -め れ ゑ \ -]; -$xo = [ \ -ぉ お こ ご そ ぞ \ -と ど の ほ ぼ ぽ \ -も ょ よ ろ を \ -]; -あ ← $xa {ー}; -い ← $xi {ー}; -う ← $xu {ー}; -え ← $xe {ー}; -お ← $xo {ー}; -:: (NFKC) ; -:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]); diff --git a/icu4c/source/data/translit/IPA_XSampa.txt b/icu4c/source/data/translit/IPA_XSampa.txt deleted file mode 100644 index 944c722cc62..00000000000 --- a/icu4c/source/data/translit/IPA_XSampa.txt +++ /dev/null @@ -1,177 +0,0 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. -# * -# *************************************************************************** -# File: IPA_XSampa.txt -# Generated from CLDR -# -$t = '_'; # X-SAMPA representation of IPA tie bar. -::NFD; -ʯ ↔ 'z`_w='; # LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL -ǁ ↔ '|\|\'; # LATIN LETTER LATERAL CLICK -ʄ ↔ 'J\_<'; # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK -ʛ ↔ 'G\_<'; # LATIN LETTER SMALL CAPITAL G WITH HOOK -ʮ ↔ 'z_w='; # LATIN SMALL LETTER TURNED H WITH FISHHOOK -\u1DC4 ↔ '_H_T'; # COMBINING MACRON-ACUTE -\u1DC5 ↔ '_B_L'; # COMBINING GRAVE-MACRON -\u1DC8 ↔ '_R_F'; # COMBINING GRAVE-ACUTE-GRAVE -ɓ ↔ 'b_<'; # LATIN SMALL LETTER B WITH HOOK -ɗ ↔ 'd_<'; # LATIN SMALL LETTER D WITH HOOK -ɠ ↔ 'g_<'; # LATIN SMALL LETTER G WITH HOOK -ɻ ↔ 'r\`'; # LATIN SMALL LETTER TURNED R WITH HOOK -↗ ↔ ''; # NORTH EAST ARROW -↘ ↔ ''; # SOUTH EAST ARROW -ħ ↔ 'X\'; # LATIN SMALL LETTER H WITH STROKE -ǀ ↔ '|\'; # LATIN LETTER DENTAL CLICK -ǂ ↔ '=\'; # LATIN LETTER ALVEOLAR CLICK -ǃ ↔ '!\'; # LATIN LETTER RETROFLEX CLICK -ɕ ↔ 's\'; # LATIN SMALL LETTER C WITH CURL -ɖ ↔ 'd`'; # LATIN SMALL LETTER D WITH TAIL -ɘ ↔ '@\'; # LATIN SMALL LETTER REVERSED E -ɚ ↔ '@`'; # LATIN SMALL LETTER SCHWA WITH HOOK -ɝ ↔ '3`'; # LATIN SMALL LETTER 
REVERSED OPEN E WITH HOOK -ɞ ↔ '3\'; # LATIN SMALL LETTER CLOSED REVERSED OPEN E -ɟ ↔ 'J\'; # LATIN SMALL LETTER DOTLESS J WITH STROKE -ɢ ↔ 'G\'; # LATIN LETTER SMALL CAPITAL G -ɦ ↔ 'h\'; # LATIN SMALL LETTER H WITH HOOK -ɧ ↔ 'x\'; # LATIN SMALL LETTER HENG WITH HOOK -ɭ ↔ 'l`'; # LATIN SMALL LETTER L WITH RETROFLEX HOOK -ɮ ↔ 'K\'; # LATIN SMALL LETTER LEZH -ɰ ↔ 'M\'; # LATIN SMALL LETTER TURNED M WITH LONG LEG -ɳ ↔ 'n`'; # LATIN SMALL LETTER N WITH RETROFLEX HOOK -ɴ ↔ 'N\'; # LATIN LETTER SMALL CAPITAL N -ɸ ↔ 'p\'; # LATIN SMALL LETTER PHI -ɹ ↔ 'r\'; # LATIN SMALL LETTER TURNED R -ɺ ↔ 'l\'; # LATIN SMALL LETTER TURNED R WITH LONG LEG -ɽ ↔ 'r`'; # LATIN SMALL LETTER R WITH TAIL -ʀ ↔ 'R\'; # LATIN LETTER SMALL CAPITAL R -ʂ ↔ 's`'; # LATIN SMALL LETTER S WITH HOOK -ʈ ↔ 't`'; # LATIN SMALL LETTER T WITH RETROFLEX HOOK -ʐ ↔ 'z`'; # LATIN SMALL LETTER Z WITH RETROFLEX HOOK -ʑ ↔ 'z\'; # LATIN SMALL LETTER Z WITH CURL -ʕ ↔ '?\'; # LATIN LETTER PHARYNGEAL VOICED FRICATIVE -ʘ ↔ 'O\'; # LATIN LETTER BILABIAL CLICK -ʙ ↔ 'B\'; # LATIN LETTER SMALL CAPITAL B -ʜ ↔ 'H\'; # LATIN LETTER SMALL CAPITAL H -ʝ ↔ 'j\'; # LATIN SMALL LETTER J WITH CROSSED-TAIL -ʟ ↔ 'L\'; # LATIN LETTER SMALL CAPITAL L -ʡ ↔ '>\'; # LATIN LETTER GLOTTAL STOP WITH STROKE -ʢ ↔ '<\'; # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE -ʰ ↔ '_h'; # MODIFIER LETTER SMALL H -ʷ ↔ '_w'; # MODIFIER LETTER SMALL W -ʼ ↔ '_>'; # MODIFIER LETTER APOSTROPHE -ˆ ↔ '_\'; # MODIFIER LETTER CIRCUMFLEX ACCENT -ˇ ↔ '_/'; # CARON -ˑ ↔ ':\'; # MODIFIER LETTER HALF TRIANGULAR COLON -ˠ ↔ '_G'; # MODIFIER LETTER SMALL GAMMA -ˡ ↔ '_l'; # MODIFIER LETTER SMALL L -ˤ ↔ '_?\'; # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP -\u0300 ↔ '_L'; # COMBINING GRAVE ACCENT -\u0301 ↔ '_H'; # COMBINING ACUTE ACCENT -\u0302 ↔ '_F'; # COMBINING CIRCUMFLEX ACCENT -\u0304 ↔ '_M'; # COMBINING MACRON -\u0306 ↔ '_X'; # COMBINING BREVE -\u0308 ↔ '_"'; # COMBINING DIAERESIS -\u030B ↔ '_T'; # COMBINING DOUBLE ACUTE ACCENT -\u030C ↔ '_R'; # COMBINING CARON -\u030F 
↔ '_B'; # COMBINING DOUBLE GRAVE ACCENT -\u0318 ↔ '_A'; # COMBINING LEFT TACK BELOW -\u0319 ↔ '_q'; # COMBINING RIGHT TACK BELOW -\u031A ↔ '_}'; # COMBINING LEFT ANGLE ABOVE -\u031C ↔ '_c'; # COMBINING LEFT HALF RING BELOW -\u031D ↔ '_r'; # COMBINING UP TACK BELOW -\u031E ↔ '_o'; # COMBINING DOWN TACK BELOW -\u031F ↔ '_+'; # COMBINING PLUS SIGN BELOW -\u0320 ↔ '_-'; # COMBINING MINUS SIGN BELOW -\u0324 ↔ '_t'; # COMBINING DIAERESIS BELOW -\u0325 ↔ '_0'; # COMBINING RING BELOW -\u032A ↔ '_d'; # COMBINING BRIDGE BELOW -\u032C ↔ '_v'; # COMBINING CARON BELOW -\u032F ↔ '_^'; # COMBINING INVERTED BREVE BELOW -\u0330 ↔ '_k'; # COMBINING TILDE BELOW -\u0334 ↔ '_e'; # COMBINING TILDE OVERLAY -\u0339 ↔ '_O'; # COMBINING RIGHT HALF RING BELOW -\u033A ↔ '_a'; # COMBINING INVERTED BRIDGE BELOW -\u033B ↔ '_m'; # COMBINING SQUARE BELOW -\u033C ↔ '_N'; # COMBINING SEAGULL BELOW -\u033D ↔ '_x'; # COMBINING X ABOVE -ᵻ ↔ 'I\'; # LATIN SMALL CAPITAL LETTER I WITH STROKE -ᵿ ↔ 'U\'; # LATIN SMALL CAPITAL LETTER U WITH STROKE -ⁿ ↔ '_n'; # MODIFIER LETTER LATIN SMALL LETTER N -ʋ ← 'v\'; # LATIN SMALL LETTER V WITH HOOK -ʲ ← '_j'; # MODIFIER LETTER SMALL H -\u0303 ← '_~'; # COMBINING TILDE -\u0329 ← '_='; # COMBINING VERTICAL LINE BELOW -c\u0327 ↔ C; # LATIN SMALL LETTER C WITH CEDILLA (decomposed) -æ ↔ '{'; # LATIN SMALL LETTER AE -ð ↔ D; # LATIN SMALL LETTER ETH -ø ↔ 2; # LATIN SMALL LETTER O WITH STROKE -ŋ ↔ N; # LATIN SMALL LETTER ENG -œ ↔ 9; # LATIN SMALL LIGATURE OE -ɐ ↔ 6; # LATIN SMALL LETTER TURNED A -ɑ ↔ A; # LATIN SMALL LETTER ALPHA -ɒ ↔ Q; # LATIN SMALL LETTER TURNED ALPHA -ɔ ↔ O; # LATIN SMALL LETTER OPEN O -ə ↔ '@'; # LATIN SMALL LETTER SCHWA -ɛ ↔ E; # LATIN SMALL LETTER OPEN E -ɜ ↔ 3; # LATIN SMALL LETTER REVERSED OPEN E -ɡ ↔ g; # LATIN SMALL LETTER SCRIPT G -ɣ ↔ G; # LATIN SMALL LETTER GAMMA -ɤ ↔ 7; # LATIN SMALL LETTER RAMS HORN -ɥ ↔ H; # LATIN SMALL LETTER TURNED H -ɨ ↔ 1; # LATIN SMALL LETTER I WITH STROKE -ɪ ↔ I; # LATIN LETTER SMALL CAPITAL I -ɫ ↔ 5; # LATIN SMALL 
LETTER L WITH MIDDLE TILDE -ɬ ↔ K; # LATIN SMALL LETTER L WITH BELT -ɯ ↔ M; # LATIN SMALL LETTER TURNED M -ɱ ↔ F; # LATIN SMALL LETTER M WITH HOOK -ɲ ↔ J; # LATIN SMALL LETTER N WITH LEFT HOOK -ɵ ↔ 8; # LATIN SMALL LETTER BARRED O -ɶ ↔ '&'; # LATIN LETTER SMALL CAPITAL OE -ɾ ↔ 4; # LATIN SMALL LETTER R WITH FISHHOOK -ʁ ↔ R; # LATIN LETTER SMALL CAPITAL INVERTED R -ʃ ↔ S; # LATIN SMALL LETTER ESH -ʉ ↔ '}'; # LATIN SMALL LETTER U BAR -ʊ ↔ U; # LATIN SMALL LETTER UPSILON -ʋ ↔ P; # LATIN SMALL LETTER V WITH HOOK -ʌ ↔ V; # LATIN SMALL LETTER TURNED V -ʍ ↔ W; # LATIN SMALL LETTER TURNED W -ʎ ↔ L; # LATIN SMALL LETTER TURNED Y -ʏ ↔ Y; # LATIN LETTER SMALL CAPITAL Y -ʒ ↔ Z; # LATIN SMALL LETTER EZH -ʔ ↔ '?'; # LATIN LETTER GLOTTAL STOP -ʲ ↔ \'; # MODIFIER LETTER SMALL J -ˈ ↔ '"'; # MODIFIER LETTER VERTICAL LINE -ˌ ↔ '%'; # MODIFIER LETTER LOW VERTICAL LINE -ː ↔ ':'; # MODIFIER LETTER TRIANGULAR COLON -˞ ↔ '`'; # MODIFIER LETTER RHOTIC HOOK -\u0303 ↔ '~'; # COMBINING TILDE -\u0329 ↔ '='; # COMBINING VERTICAL LINE BELOW -\u0361 ↔ $t; # COMBINING DOUBLE INVERTED BREVE -β ↔ B; # GREEK SMALL LETTER BETA -θ ↔ T; # GREEK SMALL LETTER THETA -χ ↔ X; # GREEK SMALL LETTER CHI -↑ ↔ '^'; # UPWARDS ARROW -↓ ↔ '!'; # DOWNWARDS ARROW -φ → 'p\'; # GREEK SMALL LETTER PHI -ɩ → I; # LATIN SMALL LETTER IOTA -ɷ → U; # LATIN SMALL LETTER CLOSED OMEGA -ɼ → 'r_r'; # LATIN SMALL LETTER R WITH LONG LEG -ɿ → 'z='; # LATIN SMALL LETTER REVERSED R WITH FISHHOOK -ʅ → 'z`='; # LATIN SMALL LETTER SQUAT REVERSED ESH -ʆ → S\'; # LATIN SMALL LETTER ESH WITH CURL -ʇ → '|\' ; # LATIN SMALL LETTER TURNED T -ʓ → Z\'; # LATIN SMALL LETTER EZH WITH CURL -ʖ → '|\|\'; # LATIN LETTER INVERTED GLOTTAL STOP -ʗ → '!\'; # LATIN LETTER STRETCHED C -ʚ → '3\'; # LATIN SMALL LETTER CLOSED OPEN E -ʠ → 'G\_<_0'; # LATIN SMALL LETTER Q WITH HOOK -ʣ → d $t z; # LATIN SMALL LETTER DZ DIGRAPH -ʤ → d $t Z; # LATIN SMALL LETTER DEZH DIGRAPH -ʥ → d $t 'z\'; # LATIN SMALL LETTER DZ DIGRAPH WITH CURL -ʦ → t $t s; # LATIN SMALL LETTER 
TS DIGRAPH -ʧ → t $t S; # LATIN SMALL LETTER TESH DIGRAPH -ʨ → t $t 's\'; # LATIN SMALL LETTER TC DIGRAPH WITH CURL -::NFC; diff --git a/icu4c/source/data/translit/InterIndic_Bengali.txt b/icu4c/source/data/translit/InterIndic_Bengali.txt index 41d33022930..2dcf9f5e180 100644 --- a/icu4c/source/data/translit/InterIndic_Bengali.txt +++ b/icu4c/source/data/translit/InterIndic_Bengali.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: InterIndic_Bengali.txt # Generated from CLDR # + +# InterIndic-Bengali +#:: NFD (NFC) ; \uE001→\u0981; # SIGN CANDRABINDU \uE002→ং; # SIGN ANUSVARA \uE003→ঃ; # SIGN VISARGA @@ -136,3 +139,6 @@ \uE083→ৎ; # Khanda-ta 0 → ০; # FALLBACK FOR TAMIL 1 → ১; +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Devanagari.txt b/icu4c/source/data/translit/InterIndic_Devanagari.txt index 09878154a01..c66111c0a44 100644 --- a/icu4c/source/data/translit/InterIndic_Devanagari.txt +++ b/icu4c/source/data/translit/InterIndic_Devanagari.txt @@ -1,12 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: InterIndic_Devanagari.txt # Generated from CLDR # + +# InterIndic-Devanagari +#:: NFD (NFC) ; +#Rules for Decomposed characters \uE028\uE03C → ऩ; #\uE029 \uE030\uE03C → ऱ; #\uE031 \uE033\uE03C → ऴ; #\uE034 @@ -18,6 +22,7 @@ \uE022\uE03C → ढ़; #\uE05D LETTER RHA (pronounced RRHA) \uE02B\uE03C → फ़; #\uE05E LETTER FA \uE02F\uE03C → य़; #\uE05F LETTER YYA +#Decomposed compatibility transliterations \uE012\uE057→औ; # FALLBACK FOR TAMIL AU 0 → ०; # FALLBACK FOR TAMIL 1 → १; @@ -73,9 +78,11 @@ \uE02F → य; # LETTER YA \uE030 → र; # LETTER RA \uE031 → ऱ; # LETTER RRA (Eyelash RA for Southern scripts) +#\uE031 → र; \uE032 → ल; # LETTER LA \uE033 → ळ; # LETTER LLA \uE034 → ऴ; # LETTER LLLA (LLLA for Southern scripts) +#\uE034 → ळ; \uE035 → व; # LETTER VA \uE036 → श; # LETTER SHA \uE037 → ष; # LETTER SSA @@ -148,3 +155,6 @@ \uE081→व; # FALLBACK FOR ORIYA LETTER WA \uE082→ॽ; # Devanagari Glottal Sign \uE083→त\u094D; # Bengali Khanda-ta +# :: NFC; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Gujarati.txt b/icu4c/source/data/translit/InterIndic_Gujarati.txt index 077e9d02fef..6576e94cd88 100644 --- a/icu4c/source/data/translit/InterIndic_Gujarati.txt +++ b/icu4c/source/data/translit/InterIndic_Gujarati.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: InterIndic_Gujarati.txt # Generated from CLDR # + +# InterIndic-Gujarati +#:: NFD (NFC) ; \uE001→\u0A81; # SIGN CANDRABINDU \uE002→\u0A82; # SIGN ANUSVARA \uE003→ઃ; # SIGN VISARGA @@ -136,3 +139,7 @@ \uE083→ત\u0ACD; # Bengali Khanda-ta 0 → ૦; # FALLBACK FOR TAMIL 1 → ૧; +#\uE080→; # UNMAPPED InterIndic-Gujarati: ISSHAR +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Gurmukhi.txt b/icu4c/source/data/translit/InterIndic_Gurmukhi.txt index 3cc68262535..90795004fe0 100644 --- a/icu4c/source/data/translit/InterIndic_Gurmukhi.txt +++ b/icu4c/source/data/translit/InterIndic_Gurmukhi.txt @@ -1,16 +1,22 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: InterIndic_Gurmukhi.txt # Generated from CLDR # + +# InterIndic-Gurmukhi +#:: NFD (NFC) ; $vowel = [ਅ-ਔ ਾ-\u0A4D]; $consonant = [ਕ-ਹ]; \uE001→\u0A01; # SIGN CHANDRABINDU +#rules for BINDI +# Anusvara is equivalent to BINDI when preceded by a vowel $vowel{\uE002→\u0A02; # SIGN ANUSVARA (\u0A02 = SIGN BINDI) +# else is equivalent to TIPPI $consonant{\uE002→\u0A70; # SIGN TIPPI \uE002→\u0A02; \uE003→; # FALLBACK BLOW AWAY SIGN VISARGA @@ -140,3 +146,6 @@ $consonant{\uE002→\u0A70; # SIGN TIPPI \uE083→ਤ\u0A4D; # Bengali Khanda-ta 0 → ੦; # FALLBACK FOR TAMIL 1 → ੧; +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Kannada.txt b/icu4c/source/data/translit/InterIndic_Kannada.txt index 30e51aec1a4..628ea69a182 100644 --- a/icu4c/source/data/translit/InterIndic_Kannada.txt +++ b/icu4c/source/data/translit/InterIndic_Kannada.txt @@ -1,12 +1,15 @@ # 
*************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: InterIndic_Kannada.txt # Generated from CLDR # + +# InterIndic-Kannada +#:: NFD (NFC) ; \uE033\uE03C→ೞ; # LETTER FA \uE001→ಂ; # REMAP (indicExceptions.txt): \u0C81→ಂ = SIGN CANDRABINDU→SIGN ANUSVARA \uE002→ಂ; # SIGN ANUSVARA @@ -138,3 +141,6 @@ \uE083→ತ\u0CCD; # Bengali Khanda-ta 0 → ೦; # FALLBACK FOR TAMIL 1 → ೧; +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Latin.txt b/icu4c/source/data/translit/InterIndic_Latin.txt index 0c85a5a446b..c21309cc0a0 100644 --- a/icu4c/source/data/translit/InterIndic_Latin.txt +++ b/icu4c/source/data/translit/InterIndic_Latin.txt @@ -1,15 +1,21 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: InterIndic_Latin.txt # Generated from CLDR # + +# InterIndic-Latin +#\u0E00 reserved +#consonants $chandrabindu=\uE001; $anusvara=\uE002; $visarga=\uE003; +#\u0E004 reserved +# w←vowel→ represents the stand-alone form $wa=\uE005; $waa=\uE006; $wi=\uE007; @@ -64,8 +70,11 @@ $sha=\uE036; $ssa=\uE037; $sa=\uE038; $ha=\uE039; +#\u093A Reserved +#\u093B Reserved $nukta=\uE03C; $avagraha=\uE03D; # SIGN AVAGRAHA +# ←vowel→ represents the dependent form $aa=\uE03E; $i=\uE03F; $ii=\uE040; @@ -82,6 +91,8 @@ $so=\uE04A; # VOWEL SIGN SHORT O $o=\uE04B; # ो $au=\uE04C; $virama=\uE04D; +# \u094E Reserved +# \u094F Reserved $om=\uE050; # OM \uE051→; # UNMAPPED STRESS SIGN UDATTA \uE052→; # UNMAPPED STRESS SIGN ANUDATTA @@ -90,6 +101,7 @@ $om=\uE050; # OM $lm = \uE055;# Telugu Length Mark $ailm=\uE056;# AI Length Mark $aulm=\uE057;# AU Length Mark +#Urdu compatibility forms $uka=\uE058; $ukha=\uE059; $ugha=\uE05A; @@ -114,14 +126,21 @@ $six=\uE06C; # DIGIT SIX $seven=\uE06D; # DIGIT SEVEN $eight=\uE06E; # DIGIT EIGHT $nine=\uE06F; # DIGIT NINE +# Glottal stop $dgs=\uE082; +#Khanda-ta $kta=\uE083; $depVowelAbove=[\uE03E-\uE040\uE045-\uE04C]; $depVowelBelow=[\uE041-\uE044]; +# $x was originally called '§'; $z was '%' $x=[$aa$ai$au$ii$i$uu$u$rrh$rh$lh$llh$e$o$se$ce$so$co]; $z=[bcdfghjklmnpqrstvwxyz]; $vowels=[aeiour\u0304\u0325\u0306]; $forceIndependentMatra = [^[[:L:][\u0300-\u034C]]]; +###################################################################### +# convert from Native letters to Latin letters +###################################################################### +#transliterations for anusvara $anusvara} [$ka$kha$ga$gha$nga] → n\u0307; $anusvara} [$ca$cha$ja$jha$nya] → n\u0304; $anusvara} [$tta$ttha$dda$ddha$nna] → n\u0323; @@ -129,6 +148,7 @@ $anusvara} [$ta$tha$da$dha$na] → n; $anusvara} [$pa$pha$ba$bha$ma] → m; $anusvara} [$ya$ra$lla$la$va$ssa$sha$sa$ha] → n; $anusvara→ m\u0307; +#
Urdu compatibility $ya$nukta}$x → y\u0307; $ya$nukta$virama → y\u0307; $ya$nukta → y\u0307a; @@ -186,6 +206,7 @@ $ela → l\u0331a; $uya}$x → y\u0307; $uya$virama → y\u0307; $uya → y\u0307a; +# normal consonants $ka$virama}$ha→k''; $ka}$x→k; $ka$virama→k; @@ -312,6 +333,7 @@ $sa$virama}$ssa→s''; $sa$virama}$sa→s''; $sa}$x→s; $sa$virama→s; +#for gurmukhi $sa$nukta}$x→s\u0301; $sa$nukta$virama→s\u0301; $sa$nukta→s\u0301a; @@ -325,6 +347,7 @@ $ssa→s\u0323a; $ha}$x→h; $ha$virama→h; $ha→ha; +# dependent vowels (should never occur except following consonants) $forceIndependentMatra{$aa → \u0314a\u0304; $forceIndependentMatra{$ai → \u0314ai; $forceIndependentMatra{$au → \u0314au; @@ -338,6 +361,7 @@ $forceIndependentMatra{$llh → \u0314l\u0325\u0304; $forceIndependentMatra{$lh → \u0314l\u0325; $forceIndependentMatra{$e → \u0314e\u0304; $forceIndependentMatra{$o → \u0314o\u0304; +#extra vowels $forceIndependentMatra{$ce → \u0314e\u0306; $forceIndependentMatra{$co → \u0314o\u0306; $forceIndependentMatra{$se → \u0314e; @@ -357,10 +381,12 @@ $llh → l\u0325\u0304; $lh → l\u0325; $e → e\u0304; $o → o\u0304; +#extra vowels $ce → e\u0306; $co → o\u0306; $se → e; $so → o; +#dependent vowels when following independent vowels. 
Generally Illegal only for roundtripping $waa} $x → a\u0304\u0314; $wai} $x → ai\u0314; $wau} $x → au\u0314; @@ -375,11 +401,13 @@ $wl } $x → l\u0325\u0314; $we } $x → e\u0304\u0314; $wo } $x → o\u0304\u0314; $wa } $x → a\u0314; +#extra vowels $wce} $x → e\u0306\u0314; $wco} $x → o\u0306\u0314; $wse} $x → e\u0314; $wso} $x → o\u0314; $om} $x → ''om\u0314; +# independent vowels when preceded by vowels $vowels{$waa → ''a\u0304; $vowels{$wai → ''ai; $vowels{$wau → ''au; @@ -394,10 +422,12 @@ $vowels{$wl → ''l\u0325; $vowels{$we → ''e\u0304; $vowels{$wo → ''o\u0304; $vowels{$wa → ''a; +#extra vowels $vowels{$wce → ''e\u0306; $vowels{$wco → ''o\u0306; $vowels{$wse → ''e; $vowels{$wso → ''o; +# independent vowels (otherwise) $waa → a\u0304; $wai → ai; $wau → au; @@ -412,15 +442,18 @@ $wl → l\u0325; $we → e\u0304; $wo → o\u0304; $wa → a; +#extra vowels $wce → e\u0306; $wco → o\u0306; $wse → e; $wso → o; $om → ''om; +#stress marks $avagraha → \u0315; $chandrabindu$anusvara→\u0303; $chandrabindu → m\u0310; $visarga→h\u0323; +#numbers $zero → 0; $one → 1; $two → 2; @@ -439,9 +472,11 @@ $kta→t\u0331; $danda→'.'; $doubleDanda→'.'; \uE070→; # ABBREVIATION SIGN +# LETTER RA WITH MIDDLE DIAGONAL \uE071}$x→ra; \uE071$virama→r; \uE071→ra; +# LETTER RA WITH LOWER DIAGONAL \uE072}$x→ra; \uE072$virama→r; \uE072→ra; @@ -460,3 +495,4 @@ $doubleDanda→'.'; \uE07F→; # URA \uE080→; # EK ONKAR \uE004→; # DEVANAGARI VOWEL SIGN SHORT A + diff --git a/icu4c/source/data/translit/InterIndic_Malayalam.txt b/icu4c/source/data/translit/InterIndic_Malayalam.txt index a98b8c0ff14..4752106994a 100644 --- a/icu4c/source/data/translit/InterIndic_Malayalam.txt +++ b/icu4c/source/data/translit/InterIndic_Malayalam.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# * # *************************************************************************** # File: InterIndic_Malayalam.txt # Generated from CLDR # + +# InterIndic-Malayalam +#:: NFD (NFC) ; \uE001→ം; # REMAP (indicExceptions.txt): \u0D01→ം = SIGN CANDRABINDU→SIGN ANUSVARA \uE002→ം; # SIGN ANUSVARA \uE003→ഃ; # SIGN VISARGA @@ -138,3 +141,6 @@ \uE083→ത\u0D4D; # Bengali Khanda-ta 0 → ൦; # FALLBACK FOR TAMIL 1 → ൧; +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Oriya.txt b/icu4c/source/data/translit/InterIndic_Oriya.txt index dbfc50c3485..3d7658d3041 100644 --- a/icu4c/source/data/translit/InterIndic_Oriya.txt +++ b/icu4c/source/data/translit/InterIndic_Oriya.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: InterIndic_Oriya.txt # Generated from CLDR # + +# InterIndic-Oriya +#:: NFD (NFC) ; \uE001→\u0B01; # SIGN CANDRABINDU \uE002→ଂ; # SIGN ANUSVARA \uE003→ଃ; # SIGN VISARGA @@ -136,3 +139,6 @@ \uE083→ତ\u0B4D; # Bengali Khanda-ta 0 → ୦; # FALLBACK FOR TAMIL 1 → ୧; +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Tamil.txt b/icu4c/source/data/translit/InterIndic_Tamil.txt index 253ff786a35..7f2ea976488 100644 --- a/icu4c/source/data/translit/InterIndic_Tamil.txt +++ b/icu4c/source/data/translit/InterIndic_Tamil.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: InterIndic_Tamil.txt # Generated from CLDR # + +# InterIndic-Tamil +#:: NFD (NFC) ; \uE001→\u0B82; # FALLBACK SIGN CANDRABINDU \uE002→\u0B82; # SIGN ANUSVARA \uE003→ஃ; # SIGN VISARGA @@ -137,3 +140,6 @@ \uE081→வ; # FALLBACK FOR ORIYA LETTER WA \uE082→; # Devanagari Glottal Stop \uE083→த\u0BCD; # Bengali Khanda-ta +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/InterIndic_Telugu.txt b/icu4c/source/data/translit/InterIndic_Telugu.txt index 7aca20caab3..e1c98738fcd 100644 --- a/icu4c/source/data/translit/InterIndic_Telugu.txt +++ b/icu4c/source/data/translit/InterIndic_Telugu.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: InterIndic_Telugu.txt # Generated from CLDR # + +# InterIndic-Telugu +#:: NFD (NFC) ; \uE001→ఁ; # SIGN CANDRABINDU \uE002→ం; # SIGN ANUSVARA \uE003→ః; # SIGN VISARGA @@ -137,3 +140,6 @@ \uE083→త\u0C4D; # Bengali Khanda-ta 0 → ౦; # FALLBACK FOR TAMIL 1 → ౧; +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/Jamo_Latin.txt b/icu4c/source/data/translit/Jamo_Latn.txt similarity index 79% rename from icu4c/source/data/translit/Jamo_Latin.txt rename to icu4c/source/data/translit/Jamo_Latn.txt index 362cfe28961..acf37e90613 100644 --- a/icu4c/source/data/translit/Jamo_Latin.txt +++ b/icu4c/source/data/translit/Jamo_Latn.txt @@ -1,13 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. 
All Rights Reserved. # * # *************************************************************************** -# File: Jamo_Latin.txt +# File: Jamo_Latn.txt # Generated from CLDR # + ::['ᄀ-하-ᅵᆨ-ᇂ가-힣]; ::NFD; ::ConjoiningJamo-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_InterIndic.txt b/icu4c/source/data/translit/Kannada_InterIndic.txt index a71bb6fc41f..cebcbaa3bc0 100644 --- a/icu4c/source/data/translit/Kannada_InterIndic.txt +++ b/icu4c/source/data/translit/Kannada_InterIndic.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Kannada_InterIndic.txt # Generated from CLDR # + +# Kannada-InterIndic \u0CC6ೕ→\uE047; # VOWEL SIGN EE \u0CC6\u0CCDೖ→\uE048\uE04D; # VOWEL SIGN AI \u0CC6ೖ→\uE048; # VOWEL SIGN AI @@ -90,3 +92,5 @@ ೭→\uE06D; # DIGIT SEVEN ೮→\uE06E; # DIGIT EIGHT ೯→\uE06F; # DIGIT NINE +# eof + diff --git a/icu4c/source/data/translit/Kannada_Bengali.txt b/icu4c/source/data/translit/Knda_Beng.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Bengali.txt rename to icu4c/source/data/translit/Knda_Beng.txt index ba963619a31..7ce40e76d29 100644 --- a/icu4c/source/data/translit/Kannada_Bengali.txt +++ b/icu4c/source/data/translit/Knda_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Kannada_Bengali.txt +# File: Knda_Beng.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Devanagari.txt b/icu4c/source/data/translit/Knda_Deva.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Devanagari.txt rename to icu4c/source/data/translit/Knda_Deva.txt index 4fa97a4a222..fb7b4ee8df5 100644 --- a/icu4c/source/data/translit/Kannada_Devanagari.txt +++ b/icu4c/source/data/translit/Knda_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Kannada_Devanagari.txt +# File: Knda_Deva.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Gujarati.txt b/icu4c/source/data/translit/Knda_Gujr.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Gujarati.txt rename to icu4c/source/data/translit/Knda_Gujr.txt index 4bab364deb9..76dd6a3c9f4 100644 --- a/icu4c/source/data/translit/Kannada_Gujarati.txt +++ b/icu4c/source/data/translit/Knda_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Kannada_Gujarati.txt +# File: Knda_Gujr.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Gurmukhi.txt b/icu4c/source/data/translit/Knda_Guru.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Gurmukhi.txt rename to icu4c/source/data/translit/Knda_Guru.txt index 28b53649f63..3fa910a8bd8 100644 --- a/icu4c/source/data/translit/Kannada_Gurmukhi.txt +++ b/icu4c/source/data/translit/Knda_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Kannada_Gurmukhi.txt +# File: Knda_Guru.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Latin.txt b/icu4c/source/data/translit/Knda_Latn.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Latin.txt rename to icu4c/source/data/translit/Knda_Latn.txt index 438e819bcc9..cd03aa565b6 100644 --- a/icu4c/source/data/translit/Kannada_Latin.txt +++ b/icu4c/source/data/translit/Knda_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Kannada_Latin.txt +# File: Knda_Latn.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBC-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Malayalam.txt b/icu4c/source/data/translit/Knda_Mlym.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Malayalam.txt rename to icu4c/source/data/translit/Knda_Mlym.txt index 2fbbcb750bd..d68b4d433f2 100644 --- a/icu4c/source/data/translit/Kannada_Malayalam.txt +++ b/icu4c/source/data/translit/Knda_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Kannada_Malayalam.txt +# File: Knda_Mlym.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Oriya.txt b/icu4c/source/data/translit/Knda_Orya.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Oriya.txt rename to icu4c/source/data/translit/Knda_Orya.txt index 71d8fb476a0..68354f93823 100644 --- a/icu4c/source/data/translit/Kannada_Oriya.txt +++ b/icu4c/source/data/translit/Knda_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Kannada_Oriya.txt +# File: Knda_Orya.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Tamil.txt b/icu4c/source/data/translit/Knda_Taml.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Tamil.txt rename to icu4c/source/data/translit/Knda_Taml.txt index 6a76a317b73..7e35e52b116 100644 --- a/icu4c/source/data/translit/Kannada_Tamil.txt +++ b/icu4c/source/data/translit/Knda_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Kannada_Tamil.txt +# File: Knda_Taml.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Kannada_Telugu.txt b/icu4c/source/data/translit/Knda_Telu.txt similarity index 82% rename from icu4c/source/data/translit/Kannada_Telugu.txt rename to icu4c/source/data/translit/Knda_Telu.txt index 556cc6e2f9b..f6af14c0853 100644 --- a/icu4c/source/data/translit/Kannada_Telugu.txt +++ b/icu4c/source/data/translit/Knda_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Kannada_Telugu.txt +# File: Knda_Telu.txt # Generated from CLDR # + ::[ಂ-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯]; ::NFD; ::Kannada-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_ASCII.txt b/icu4c/source/data/translit/Latin_ASCII.txt index 38f870886a6..c83e48a70e1 100644 --- a/icu4c/source/data/translit/Latin_ASCII.txt +++ b/icu4c/source/data/translit/Latin_ASCII.txt @@ -1,16 +1,32 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Latin_ASCII.txt # Generated from CLDR # + +# This handles only Latin, Common, and IDEOGRAPHIC NUMBER ZERO (Han). +# :: [[:Latin:][:Common:][:Inherited:][〇]] ; +# +# Don't want NFKD, because that would convert things like superscripts and +# subscripts, which we do not want. So the individual transforms below +# include an appropriate subset of the NFKD ones. +# Here we remove accents from Latin characters. We then recompose to permit rules +# such as mapping NOT EQUAL TO to an ASCII equivalent e.g. "!=" if we choose to. +# :: NFD() ; [:Latin:] { [:Mn:]+ → ; # maps to nothing; remove all Mn following Latin letter :: NFC() ; +# +# Some of the following mappings (noted) are from CLDR ‹character-fallback› data. 
+# (Note, here "‹character-fallback›" uses U+2039/U+203A to avoid XML issues) +# +# Latin letters and IPA +# Æ → AE ; # 00C6;LATIN CAPITAL LETTER AE (from ‹character-fallback›) Ð → D ; # 00D0;LATIN CAPITAL LETTER ETH Ø → O ; # 00D8;LATIN CAPITAL LETTER O WITH STROKE @@ -222,6 +238,7 @@ ỽ → v ; # 1EFD;LATIN SMALL LETTER MIDDLE-WELSH V Ỿ → Y ; # 1EFE;LATIN CAPITAL LETTER Y WITH LOOP ỿ → y ; # 1EFF;LATIN SMALL LETTER Y WITH LOOP +# Presentation forms ff → ff ; # FB00;LATIN SMALL LIGATURE FF (compat) fi → fi ; # FB01;LATIN SMALL LIGATURE FI (compat) fl → fl ; # FB02;LATIN SMALL LIGATURE FL (compat) @@ -229,6 +246,7 @@ ffl → ffl ; # FB04;LATIN SMALL LIGATURE FFL (compat) ſt → st ; # FB05;LATIN SMALL LIGATURE LONG S T (compat) st → st ; # FB06;LATIN SMALL LIGATURE ST (compat) +# Fullwidth A → A ; # FF21;FULLWIDTH LATIN CAPITAL LETTER A (compat) B → B ; # FF22;FULLWIDTH LATIN CAPITAL LETTER B (compat) C → C ; # FF23;FULLWIDTH LATIN CAPITAL LETTER C (compat) @@ -281,6 +299,9 @@ x → x ; # FF58;FULLWIDTH LATIN SMALL LETTER X (compat) y → y ; # FF59;FULLWIDTH LATIN SMALL LETTER Y (compat) z → z ; # FF5A;FULLWIDTH LATIN SMALL LETTER Z (compat) +# +# Currency and letterlike +# © → '(C)' ; # 00A9;COPYRIGHT SIGN (from ‹character-fallback›) ® → '(R)' ; # 00AE;REGISTERED SIGN (from ‹character-fallback›) ₠ → CE ; # 20A0;EURO-CURRENCY SIGN (from ‹character-fallback›) @@ -329,6 +350,9 @@ ⅇ → e ; # 2147;DOUBLE-STRUCK ITALIC SMALL E (compat) ⅈ → i ; # 2148;DOUBLE-STRUCK ITALIC SMALL I (compat) ⅉ → j ; # 2149;DOUBLE-STRUCK ITALIC SMALL J (compat) +# +# Squared Latin +# ㍱ → hPa ; # 3371;SQUARE HPA (compat) ㍲ → da ; # 3372;SQUARE DA (compat) ㍳ → AU ; # 3373;SQUARE AU (compat) @@ -410,6 +434,9 @@ ㏝ → Wb ; # 33DD;SQUARE WB (compat) ㏞ → 'V/m' ; # 33DE;SQUARE V OVER M (compat) (from ‹character-fallback›) ㏟ → 'A/m' ; # 33DF;SQUARE A OVER M (compat) (from ‹character-fallback›) +# +# Enclosed Latin +# ⒜ → '(a)' ; # 249C;PARENTHESIZED LATIN SMALL LETTER A (compat) ⒝ → '(b)' ; # 249D;PARENTHESIZED 
LATIN SMALL LETTER B (compat) ⒞ → '(c)' ; # 249E;PARENTHESIZED LATIN SMALL LETTER C (compat) @@ -436,6 +463,9 @@ ⒳ → '(x)' ; # 24B3;PARENTHESIZED LATIN SMALL LETTER X (compat) ⒴ → '(y)' ; # 24B4;PARENTHESIZED LATIN SMALL LETTER Y (compat) ⒵ → '(z)' ; # 24B5;PARENTHESIZED LATIN SMALL LETTER Z (compat) +# +# Roman numerals +# Ⅰ → I ; # 2160;ROMAN NUMERAL ONE (compat) Ⅱ → II ; # 2161;ROMAN NUMERAL TWO (compat) Ⅲ → III ; # 2162;ROMAN NUMERAL THREE (compat) @@ -468,6 +498,9 @@ ⅽ → c ; # 217D;SMALL ROMAN NUMERAL ONE HUNDRED (compat) ⅾ → d ; # 217E;SMALL ROMAN NUMERAL FIVE HUNDRED (compat) ⅿ → m ; # 217F;SMALL ROMAN NUMERAL ONE THOUSAND (compat) +# +# Fractions +# ¼ → ' 1/4' ; # 00BC;VULGAR FRACTION ONE QUARTER (from ‹character-fallback›) ½ → ' 1/2' ; # 00BD;VULGAR FRACTION ONE HALF (from ‹character-fallback›) ¾ → ' 3/4' ; # 00BE;VULGAR FRACTION THREE QUARTERS (from ‹character-fallback›) @@ -484,6 +517,9 @@ ⅝ → ' 5/8' ; # 215D;VULGAR FRACTION FIVE EIGHTHS (from ‹character-fallback›) ⅞ → ' 7/8' ; # 215E;VULGAR FRACTION SEVEN EIGHTHS (from ‹character-fallback›) ⅟ → ' 1/' ; # 215F;FRACTION NUMERATOR ONE (from ‹character-fallback›) +# +# Enclosed numeric +# ⑴ → '(1)' ; # 2474;PARENTHESIZED DIGIT ONE (compat) ⑵ → '(2)' ; # 2475;PARENTHESIZED DIGIT TWO (compat) ⑶ → '(3)' ; # 2476;PARENTHESIZED DIGIT THREE (compat) @@ -524,6 +560,9 @@ ⒙ → '18.' ; # 2499;NUMBER EIGHTEEN FULL STOP (compat) ⒚ → '19.' ; # 249A;NUMBER NINETEEN FULL STOP (compat) ⒛ → '20.' 
; # 249B;NUMBER TWENTY FULL STOP (compat) +# +# Other numeric (ideographic and fullwidth) +# 〇 → 0 ; # 3007;IDEOGRAPHIC NUMBER ZERO 0 → 0 ; # FF10;FULLWIDTH DIGIT ZERO (compat) 1 → 1 ; # FF11;FULLWIDTH DIGIT ONE (compat) @@ -535,6 +574,9 @@ 7 → 7 ; # FF17;FULLWIDTH DIGIT SEVEN (compat) 8 → 8 ; # FF18;FULLWIDTH DIGIT EIGHT (compat) 9 → 9 ; # FF19;FULLWIDTH DIGIT NINE (compat) +# +# Spaces +# \u00A0 → ' ' ; # 00A0;NO-BREAK SPACE \u2002 → ' ' ; # 2002;EN SPACE (compat) \u2003 → ' ' ; # 2003;EM SPACE (compat) @@ -547,6 +589,16 @@ \u200A → ' ' ; # 200A;HAIR SPACE (compat) \u205F → ' ' ; # 205F;MEDIUM MATHEMATICAL SPACE (compat) \u3000 → ' ' ; # 3000;IDEOGRAPHIC SPACE (from ‹character-fallback›) +# +# Quotes, apostrophes +# +ʹ → \' ; # 02B9;MODIFIER LETTER PRIME +ʺ → \" ; # 02BA;MODIFIER LETTER DOUBLE PRIME +ʻ → \' ; # 02BB;MODIFIER LETTER TURNED COMMA +ʼ → \' ; # 02BC;MODIFIER LETTER APOSTROPHE +ʽ → \' ; # 02BD;MODIFIER LETTER REVERSED COMMA +ˈ → \' ; # 02C8;MODIFIER LETTER VERTICAL LINE +ˋ → '`' ; # 02CB;MODIFIER LETTER GRAVE ACCENT ‘ → \' ; # 2018;LEFT SINGLE QUOTATION MARK (from ‹character-fallback›) ’ → \' ; # 2019;RIGHT SINGLE QUOTATION MARK (from ‹character-fallback›) ‚ → ',' ; # 201A;SINGLE LOW-9 QUOTATION MARK (from ‹character-fallback›) @@ -565,6 +617,9 @@ » → '>>' ; # 00BB;RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (from ‹character-fallback›) ‹ → '<' ; # 2039;SINGLE LEFT-POINTING ANGLE QUOTATION MARK › → '>' ; # 203A;SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +# +# Dashes, hyphens... 
+# \u00AD → '-' ; # 00AD;SOFT HYPHEN (from ‹character-fallback›) ‐ → '-' ; # 2010;HYPHEN (from ‹character-fallback›) ‑ → '-' ; # 2011;NON-BREAKING HYPHEN (from ‹character-fallback›) @@ -577,6 +632,15 @@ ﹘ → '-' ; # FE58;SMALL EM DASH (compat) ﹣ → '-' ; # FE63;SMALL HYPHEN-MINUS (compat) - → '-' ; # FF0D;FULLWIDTH HYPHEN-MINUS (compat) +# +# Other misc punctuation and symbols +# +˂ → '<' ; # 02C2;MODIFIER LETTER LEFT ARROWHEAD +˃ → '>' ; # 02C3;MODIFIER LETTER RIGHT ARROWHEAD +˄ → '^' ; # 02C4;MODIFIER LETTER UP ARROWHEAD +ˆ → '^' ; # 02C6;MODIFIER LETTER CIRCUMFLEX ACCENT +ː → ':' ; # 02D0;MODIFIER LETTER TRIANGULAR COLON +˜ → '~' ; # 02DC;SMALL TILDE ‖ → '||' ; # 2016;DOUBLE VERTICAL LINE ․ → '.' ; # 2024;ONE DOT LEADER (compat) ‥ → '..' ; # 2025;TWO DOT LEADER (compat) @@ -589,6 +653,7 @@ ⁈ → '?!' ; # 2048;QUESTION EXCLAMATION MARK (compat) ⁉ → '!?' ; # 2049;EXCLAMATION QUESTION MARK (compat) ⁎ → '*' ; # 204E;LOW ASTERISK +# CJK 、 → ',' ; # 3001;IDEOGRAPHIC COMMA 。 → '.' ; # 3002;IDEOGRAPHIC FULL STOP 〈 → '<' ; # 3008;LEFT ANGLE BRACKET @@ -601,6 +666,7 @@ 〙 → ']' ; # 3019;RIGHT WHITE TORTOISE SHELL BRACKET 〚 → '[' ; # 301A;LEFT WHITE SQUARE BRACKET 〛 → ']' ; # 301B;RIGHT WHITE SQUARE BRACKET +# Vertical and small forms ︐ → ',' ; # FE10;PRESENTATION FORM FOR VERTICAL COMMA (compat) ︑ → ',' ; # FE11;PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA (compat) ︒ → '.' ; # FE12;PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP (compat) @@ -646,6 +712,7 @@ ﹩ → '$' ; # FE69;SMALL DOLLAR SIGN (compat) ﹪ → '%' ; # FE6A;SMALL PERCENT SIGN (compat) ﹫ → '@' ; # FE6B;SMALL COMMERCIAL AT (compat) +# Fullwidth and halfwidth ! → '!' ; # FF01;FULLWIDTH EXCLAMATION MARK (compat) # → '#' ; # FF03;FULLWIDTH NUMBER SIGN (compat) $ → '$' ; # FF04;FULLWIDTH DOLLAR SIGN (compat) @@ -679,8 +746,13 @@ ⦆ → '))' ; # FF60;FULLWIDTH RIGHT WHITE PARENTHESIS (compat)(from ‹character-fallback›) 。 → '.' 
; # FF61;HALFWIDTH IDEOGRAPHIC FULL STOP (compat) 、 → ',' ; # FF64;HALFWIDTH IDEOGRAPHIC COMMA (compat) +# +# Other math operators (non-ASCII-range) +# × → '*' ; # 00D7;MULTIPLICATION SIGN ÷ → '/' ; # 00F7;DIVISION SIGN +˖ → '+' ; # 02D6;MODIFIER LETTER PLUS SIGN +˗ → '-' ; # 02D7;MODIFIER LETTER MINUS SIGN − → '-' ; # 2212;MINUS SIGN (from ‹character-fallback›) ∕ → '/' ; # 2215;DIVISION SLASH (from ‹character-fallback›) ∖ → '\' ; # 2216;SET MINUS (from ‹character-fallback›) @@ -693,3 +765,4 @@ ⩴ → '::=' ; # 2A74;DOUBLE COLON EQUAL (compat) ⩵ → '==' ; # 2A75;TWO CONSECUTIVE EQUALS SIGNS (compat) ⩶ → '===' ; # 2A76;THREE CONSECUTIVE EQUALS SIGNS (compat) + diff --git a/icu4c/source/data/translit/Latin_Bopomofo.txt b/icu4c/source/data/translit/Latin_Bopomofo.txt deleted file mode 100755 index 4bd0945405b..00000000000 --- a/icu4c/source/data/translit/Latin_Bopomofo.txt +++ /dev/null @@ -1,1452 +0,0 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. -# * -# *************************************************************************** -# File: Latin_Bopomofo.txt -# Generated from CLDR -# -#--- forward filter, transforms ---- -# currently in reverse it only goes back to Latin with numeric tones (not sure why); ok but would rather have marks. 
-# -:: [[:Latin:][:Mn:][1-5]]; # forward filter: only modifies Latin and tone marks/digits -# -#--- remap v (alternate for ü) to ü, one way ---- -[ln] { v → ü; -# -:: Latin-NumericPinyin(NumericPinyin-Latin); # tone marks in middle ↔ digits at end to use numeric below; this leaves the pinyin in NFC -# -#--- variables ---- -# -# basic pinyin and zhuyin consonant initials (not including vowel initials): -$pCons = [b p m f d t n l g k h j q x r z c s]; # and zh ch sh, covered for this by h already in the set -$zCons = [ㄅ-ㄙ]; -# -# pinyin and zhuyin minus basic consonant initials that cannot take tones by themselves -# (in some cases the exclusion is only for the pinyin, not the corresponding zhuyin: zh ch sh r z c s) -$pToneOK = [[a-z] - [b p f d t l g k h j q x r z c s]]; # and minus zh ch sh, covered for this by h already in the exclusion set -$zToneOK = [[ㄅ-ㄩ] - [ㄅ ㄆ ㄈ ㄉ ㄊ ㄌ ㄍ ㄎ ㄏ ㄐ ㄑ ㄒ]]; -# -# basic consonant initials that can take tones by themselves -# (in some cases this is only for the zhuyin, not the corresponding pinyin: ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ) -# $pConsToneOK = [m n]; -# $zConsToneOK = [ㄇ ㄋ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ]; -# -# basic numeric pinyin and zhuyin tones -$pTone = [1-5]; -$zTone = [ˉˊˇˋ˙]; -# -#--- clusters with initial consonant ---- -# -bang }$pTone ↔ ㄅㄤ }$zTone; - bang → ㄅㄤ˙; - bang1 ← ㄅㄤ; -beng }$pTone ↔ ㄅㄥ }$zTone; - beng → ㄅㄥ˙; - beng1 ← ㄅㄥ; -biao }$pTone ↔ ㄅㄧㄠ }$zTone; - biao → ㄅㄧㄠ˙; - biao1 ← ㄅㄧㄠ; -bian }$pTone ↔ ㄅㄧㄢ }$zTone; - bian → ㄅㄧㄢ˙; - bian1 ← ㄅㄧㄢ; -bing }$pTone ↔ ㄅㄧㄥ }$zTone; - bing → ㄅㄧㄥ˙; - bing1 ← ㄅㄧㄥ; -bai }$pTone ↔ ㄅㄞ }$zTone; - bai → ㄅㄞ˙; - bai1 ← ㄅㄞ; -bei }$pTone ↔ ㄅㄟ }$zTone; - bei → ㄅㄟ˙; - bei1 ← ㄅㄟ; -bao }$pTone ↔ ㄅㄠ }$zTone; - bao → ㄅㄠ˙; - bao1 ← ㄅㄠ; -ban }$pTone ↔ ㄅㄢ }$zTone; - ban → ㄅㄢ˙; - ban1 ← ㄅㄢ; -ben }$pTone ↔ ㄅㄣ }$zTone; - ben → ㄅㄣ˙; - ben1 ← ㄅㄣ; -bie }$pTone ↔ ㄅㄧㄝ }$zTone; - bie → ㄅㄧㄝ˙; - bie1 ← ㄅㄧㄝ; -bin }$pTone ↔ ㄅㄧㄣ }$zTone; - bin → ㄅㄧㄣ˙; - bin1 ← ㄅㄧㄣ; -bun }$pTone ↔ ㄅㄨㄣ }$zTone; - bun → ㄅㄨㄣ˙; - bun1 ← ㄅㄨㄣ; -ba }$pTone ↔ ㄅㄚ 
}$zTone; - ba → ㄅㄚ˙; - ba1 ← ㄅㄚ; -bo }$pTone ↔ ㄅㄛ }$zTone; - bo → ㄅㄛ˙; - bo1 ← ㄅㄛ; -bi }$pTone ↔ ㄅㄧ }$zTone; - bi → ㄅㄧ˙; - bi1 ← ㄅㄧ; -bu }$pTone ↔ ㄅㄨ }$zTone; - bu → ㄅㄨ˙; - bu1 ← ㄅㄨ; -# -pang }$pTone ↔ ㄆㄤ }$zTone; - pang → ㄆㄤ˙; - pang1 ← ㄆㄤ; -peng }$pTone ↔ ㄆㄥ }$zTone; - peng → ㄆㄥ˙; - peng1 ← ㄆㄥ; -piao }$pTone ↔ ㄆㄧㄠ }$zTone; - piao → ㄆㄧㄠ˙; - piao1 ← ㄆㄧㄠ; -pian }$pTone ↔ ㄆㄧㄢ }$zTone; - pian → ㄆㄧㄢ˙; - pian1 ← ㄆㄧㄢ; -ping }$pTone ↔ ㄆㄧㄥ }$zTone; - ping → ㄆㄧㄥ˙; - ping1 ← ㄆㄧㄥ; -pai }$pTone ↔ ㄆㄞ }$zTone; - pai → ㄆㄞ˙; - pai1 ← ㄆㄞ; -pei }$pTone ↔ ㄆㄟ }$zTone; - pei → ㄆㄟ˙; - pei1 ← ㄆㄟ; -pao }$pTone ↔ ㄆㄠ }$zTone; - pao → ㄆㄠ˙; - pao1 ← ㄆㄠ; -pou }$pTone ↔ ㄆㄡ }$zTone; - pou → ㄆㄡ˙; - pou1 ← ㄆㄡ; -pan }$pTone ↔ ㄆㄢ }$zTone; - pan → ㄆㄢ˙; - pan1 ← ㄆㄢ; -pen }$pTone ↔ ㄆㄣ }$zTone; - pen → ㄆㄣ˙; - pen1 ← ㄆㄣ; -pie }$pTone ↔ ㄆㄧㄝ }$zTone; - pie → ㄆㄧㄝ˙; - pie1 ← ㄆㄧㄝ; -pin }$pTone ↔ ㄆㄧㄣ }$zTone; - pin → ㄆㄧㄣ˙; - pin1 ← ㄆㄧㄣ; -pa }$pTone ↔ ㄆㄚ }$zTone; - pa → ㄆㄚ˙; - pa1 ← ㄆㄚ; -po }$pTone ↔ ㄆㄛ }$zTone; - po → ㄆㄛ˙; - po1 ← ㄆㄛ; -pi }$pTone ↔ ㄆㄧ }$zTone; - pi → ㄆㄧ˙; - pi1 ← ㄆㄧ; -pu }$pTone ↔ ㄆㄨ }$zTone; - pu → ㄆㄨ˙; - pu1 ← ㄆㄨ; -# -mang }$pTone ↔ ㄇㄤ }$zTone; - mang → ㄇㄤ˙; - mang1 ← ㄇㄤ; -meng }$pTone ↔ ㄇㄥ }$zTone; - meng → ㄇㄥ˙; - meng1 ← ㄇㄥ; -miao }$pTone ↔ ㄇㄧㄠ }$zTone; - miao → ㄇㄧㄠ˙; - miao1 ← ㄇㄧㄠ; -mian }$pTone ↔ ㄇㄧㄢ }$zTone; - mian → ㄇㄧㄢ˙; - mian1 ← ㄇㄧㄢ; -ming }$pTone ↔ ㄇㄧㄥ }$zTone; - ming → ㄇㄧㄥ˙; - ming1 ← ㄇㄧㄥ; -mai }$pTone ↔ ㄇㄞ }$zTone; - mai → ㄇㄞ˙; - mai1 ← ㄇㄞ; -mei }$pTone ↔ ㄇㄟ }$zTone; - mei → ㄇㄟ˙; - mei1 ← ㄇㄟ; -mao }$pTone ↔ ㄇㄠ }$zTone; - mao → ㄇㄠ˙; - mao1 ← ㄇㄠ; -mou }$pTone ↔ ㄇㄡ }$zTone; - mou → ㄇㄡ˙; - mou1 ← ㄇㄡ; -man }$pTone ↔ ㄇㄢ }$zTone; - man → ㄇㄢ˙; - man1 ← ㄇㄢ; -men }$pTone ↔ ㄇㄣ }$zTone; - men → ㄇㄣ˙; - men1 ← ㄇㄣ; -mie }$pTone ↔ ㄇㄧㄝ }$zTone; - mie → ㄇㄧㄝ˙; - mie1 ← ㄇㄧㄝ; -miu }$pTone ↔ ㄇㄧㄡ }$zTone; - miu → ㄇㄧㄡ˙; - miu1 ← ㄇㄧㄡ; -min }$pTone ↔ ㄇㄧㄣ }$zTone; - min → ㄇㄧㄣ˙; - min1 ← ㄇㄧㄣ; -ma }$pTone ↔ ㄇㄚ }$zTone; - ma → ㄇㄚ˙; - ma1 ← ㄇㄚ; -mo }$pTone ↔ ㄇㄛ }$zTone; - mo → ㄇㄛ˙; - mo1 ← ㄇㄛ; -me }$pTone ↔ ㄇㄜ 
}$zTone; - me → ㄇㄜ˙; - me1 ← ㄇㄜ; -mi }$pTone ↔ ㄇㄧ }$zTone; - mi → ㄇㄧ˙; - mi1 ← ㄇㄧ; -mu }$pTone ↔ ㄇㄨ }$zTone; - mu → ㄇㄨ˙; - mu1 ← ㄇㄨ; -# m handled below -# -fang }$pTone ↔ ㄈㄤ }$zTone; - fang → ㄈㄤ˙; - fang1 ← ㄈㄤ; -feng }$pTone ↔ ㄈㄥ }$zTone; - feng → ㄈㄥ˙; - feng1 ← ㄈㄥ; -fiao }$pTone ↔ ㄈㄧㄠ }$zTone; - fiao → ㄈㄧㄠ˙; - fiao1 ← ㄈㄧㄠ; -fei }$pTone ↔ ㄈㄟ }$zTone; - fei → ㄈㄟ˙; - fei1 ← ㄈㄟ; -fou }$pTone ↔ ㄈㄡ }$zTone; - fou → ㄈㄡ˙; - fou1 ← ㄈㄡ; -fan }$pTone ↔ ㄈㄢ }$zTone; - fan → ㄈㄢ˙; - fan1 ← ㄈㄢ; -fen }$pTone ↔ ㄈㄣ }$zTone; - fen → ㄈㄣ˙; - fen1 ← ㄈㄣ; -fa }$pTone ↔ ㄈㄚ }$zTone; - fa → ㄈㄚ˙; - fa1 ← ㄈㄚ; -fo }$pTone ↔ ㄈㄛ }$zTone; - fo → ㄈㄛ˙; - fo1 ← ㄈㄛ; -fu }$pTone ↔ ㄈㄨ }$zTone; - fu → ㄈㄨ˙; - fu1 ← ㄈㄨ; -# -diang }$pTone ↔ ㄉㄧㄤ }$zTone; # (not in han-latin) - diang → ㄉㄧㄤ˙; - diang1 ← ㄉㄧㄤ; -dang }$pTone ↔ ㄉㄤ }$zTone; - dang → ㄉㄤ˙; - dang1 ← ㄉㄤ; -deng }$pTone ↔ ㄉㄥ }$zTone; - deng → ㄉㄥ˙; - deng1 ← ㄉㄥ; -diao }$pTone ↔ ㄉㄧㄠ }$zTone; - diao → ㄉㄧㄠ˙; - diao1 ← ㄉㄧㄠ; -dian }$pTone ↔ ㄉㄧㄢ }$zTone; - dian → ㄉㄧㄢ˙; - dian1 ← ㄉㄧㄢ; -ding }$pTone ↔ ㄉㄧㄥ }$zTone; - ding → ㄉㄧㄥ˙; - ding1 ← ㄉㄧㄥ; -duan }$pTone ↔ ㄉㄨㄢ }$zTone; - duan → ㄉㄨㄢ˙; - duan1 ← ㄉㄨㄢ; -dong }$pTone ↔ ㄉㄨㄥ }$zTone; - dong → ㄉㄨㄥ˙; - dong1 ← ㄉㄨㄥ; -dai }$pTone ↔ ㄉㄞ }$zTone; - dai → ㄉㄞ˙; - dai1 ← ㄉㄞ; -dei }$pTone ↔ ㄉㄟ }$zTone; # (not in han-latin) - dei → ㄉㄟ˙; - dei1 ← ㄉㄟ; -dao }$pTone ↔ ㄉㄠ }$zTone; - dao → ㄉㄠ˙; - dao1 ← ㄉㄠ; -dou }$pTone ↔ ㄉㄡ }$zTone; - dou → ㄉㄡ˙; - dou1 ← ㄉㄡ; -dan }$pTone ↔ ㄉㄢ }$zTone; - dan → ㄉㄢ˙; - dan1 ← ㄉㄢ; -den }$pTone ↔ ㄉㄣ }$zTone; - den → ㄉㄣ˙; - den1 ← ㄉㄣ; -dia }$pTone ↔ ㄉㄧㄚ }$zTone; - dia → ㄉㄧㄚ˙; - dia1 ← ㄉㄧㄚ; -die }$pTone ↔ ㄉㄧㄝ }$zTone; - die → ㄉㄧㄝ˙; - die1 ← ㄉㄧㄝ; -diu }$pTone ↔ ㄉㄧㄡ }$zTone; - diu → ㄉㄧㄡ˙; - diu1 ← ㄉㄧㄡ; -din }$pTone ↔ ㄉㄧㄣ }$zTone; - din → ㄉㄧㄣ˙; - din1 ← ㄉㄧㄣ; -duo }$pTone ↔ ㄉㄨㄛ }$zTone; - duo → ㄉㄨㄛ˙; - duo1 ← ㄉㄨㄛ; -dui }$pTone ↔ ㄉㄨㄟ }$zTone; - dui → ㄉㄨㄟ˙; - dui1 ← ㄉㄨㄟ; -dun }$pTone ↔ ㄉㄨㄣ }$zTone; - dun → ㄉㄨㄣ˙; - dun1 ← ㄉㄨㄣ; -da }$pTone ↔ ㄉㄚ }$zTone; - da → ㄉㄚ˙; - da1 ← ㄉㄚ; -de }$pTone ↔ ㄉㄜ }$zTone; - de → ㄉㄜ˙; 
- de1 ← ㄉㄜ; -di }$pTone ↔ ㄉㄧ }$zTone; - di → ㄉㄧ˙; - di1 ← ㄉㄧ; -du }$pTone ↔ ㄉㄨ }$zTone; - du → ㄉㄨ˙; - du1 ← ㄉㄨ; -# -tang }$pTone ↔ ㄊㄤ }$zTone; - tang → ㄊㄤ˙; - tang1 ← ㄊㄤ; -teng }$pTone ↔ ㄊㄥ }$zTone; - teng → ㄊㄥ˙; - teng1 ← ㄊㄥ; -tiao }$pTone ↔ ㄊㄧㄠ }$zTone; - tiao → ㄊㄧㄠ˙; - tiao1 ← ㄊㄧㄠ; -tian }$pTone ↔ ㄊㄧㄢ }$zTone; - tian → ㄊㄧㄢ˙; - tian1 ← ㄊㄧㄢ; -ting }$pTone ↔ ㄊㄧㄥ }$zTone; - ting → ㄊㄧㄥ˙; - ting1 ← ㄊㄧㄥ; -tuan }$pTone ↔ ㄊㄨㄢ }$zTone; - tuan → ㄊㄨㄢ˙; - tuan1 ← ㄊㄨㄢ; -tong }$pTone ↔ ㄊㄨㄥ }$zTone; - tong → ㄊㄨㄥ˙; - tong1 ← ㄊㄨㄥ; -tai }$pTone ↔ ㄊㄞ }$zTone; - tai → ㄊㄞ˙; - tai1 ← ㄊㄞ; -tao }$pTone ↔ ㄊㄠ }$zTone; - tao → ㄊㄠ˙; - tao1 ← ㄊㄠ; -tou }$pTone ↔ ㄊㄡ }$zTone; - tou → ㄊㄡ˙; - tou1 ← ㄊㄡ; -tan }$pTone ↔ ㄊㄢ }$zTone; - tan → ㄊㄢ˙; - tan1 ← ㄊㄢ; -tie }$pTone ↔ ㄊㄧㄝ }$zTone; - tie → ㄊㄧㄝ˙; - tie1 ← ㄊㄧㄝ; -tuo }$pTone ↔ ㄊㄨㄛ }$zTone; - tuo → ㄊㄨㄛ˙; - tuo1 ← ㄊㄨㄛ; -tui }$pTone ↔ ㄊㄨㄟ }$zTone; - tui → ㄊㄨㄟ˙; - tui1 ← ㄊㄨㄟ; -tun }$pTone ↔ ㄊㄨㄣ }$zTone; - tun → ㄊㄨㄣ˙; - tun1 ← ㄊㄨㄣ; -ta }$pTone ↔ ㄊㄚ }$zTone; - ta → ㄊㄚ˙; - ta1 ← ㄊㄚ; -te }$pTone ↔ ㄊㄜ }$zTone; - te → ㄊㄜ˙; - te1 ← ㄊㄜ; -ti }$pTone ↔ ㄊㄧ }$zTone; - ti → ㄊㄧ˙; - ti1 ← ㄊㄧ; -tu }$pTone ↔ ㄊㄨ }$zTone; - tu → ㄊㄨ˙; - tu1 ← ㄊㄨ; -# -niang }$pTone ↔ ㄋㄧㄤ }$zTone; - niang → ㄋㄧㄤ˙; - niang1 ← ㄋㄧㄤ; -nang }$pTone ↔ ㄋㄤ }$zTone; - nang → ㄋㄤ˙; - nang1 ← ㄋㄤ; -neng }$pTone ↔ ㄋㄥ }$zTone; - neng → ㄋㄥ˙; - neng1 ← ㄋㄥ; -niao }$pTone ↔ ㄋㄧㄠ }$zTone; - niao → ㄋㄧㄠ˙; - niao1 ← ㄋㄧㄠ; -nian }$pTone ↔ ㄋㄧㄢ }$zTone; - nian → ㄋㄧㄢ˙; - nian1 ← ㄋㄧㄢ; -ning }$pTone ↔ ㄋㄧㄥ }$zTone; - ning → ㄋㄧㄥ˙; - ning1 ← ㄋㄧㄥ; -nuan }$pTone ↔ ㄋㄨㄢ }$zTone; - nuan → ㄋㄨㄢ˙; - nuan1 ← ㄋㄨㄢ; -nong }$pTone ↔ ㄋㄨㄥ }$zTone; - nong → ㄋㄨㄥ˙; - nong1 ← ㄋㄨㄥ; -nai }$pTone ↔ ㄋㄞ }$zTone; - nai → ㄋㄞ˙; - nai1 ← ㄋㄞ; -nei }$pTone ↔ ㄋㄟ }$zTone; - nei → ㄋㄟ˙; - nei1 ← ㄋㄟ; -nao }$pTone ↔ ㄋㄠ }$zTone; - nao → ㄋㄠ˙; - nao1 ← ㄋㄠ; -nou }$pTone ↔ ㄋㄡ }$zTone; - nou → ㄋㄡ˙; - nou1 ← ㄋㄡ; -nan }$pTone ↔ ㄋㄢ }$zTone; - nan → ㄋㄢ˙; - nan1 ← ㄋㄢ; -nen }$pTone ↔ ㄋㄣ }$zTone; - nen → ㄋㄣ˙; - nen1 ← ㄋㄣ; -nia }$pTone ↔ ㄋㄧㄚ }$zTone; # (not in han-latin) - 
nia → ㄋㄧㄚ˙; - nia1 ← ㄋㄧㄚ; -nie }$pTone ↔ ㄋㄧㄝ }$zTone; - nie → ㄋㄧㄝ˙; - nie1 ← ㄋㄧㄝ; -niu }$pTone ↔ ㄋㄧㄡ }$zTone; - niu → ㄋㄧㄡ˙; - niu1 ← ㄋㄧㄡ; -nin }$pTone ↔ ㄋㄧㄣ }$zTone; - nin → ㄋㄧㄣ˙; - nin1 ← ㄋㄧㄣ; -nuo }$pTone ↔ ㄋㄨㄛ }$zTone; - nuo → ㄋㄨㄛ˙; - nuo1 ← ㄋㄨㄛ; -nun }$pTone ↔ ㄋㄨㄣ }$zTone; - nun → ㄋㄨㄣ˙; - nun1 ← ㄋㄨㄣ; -nüe }$pTone ↔ ㄋㄩㄝ }$zTone; - nüe → ㄋㄩㄝ˙; - nüe1 ← ㄋㄩㄝ; -nue }$pTone → ㄋㄩㄝ; # (not in han-latin) one-way, handle wrong u - nue → ㄋㄩㄝ˙; -na }$pTone ↔ ㄋㄚ }$zTone; - na → ㄋㄚ˙; - na1 ← ㄋㄚ; -ne }$pTone ↔ ㄋㄜ }$zTone; - ne → ㄋㄜ˙; - ne1 ← ㄋㄜ; -ni }$pTone ↔ ㄋㄧ }$zTone; - ni → ㄋㄧ˙; - ni1 ← ㄋㄧ; -nu }$pTone ↔ ㄋㄨ }$zTone; - nu → ㄋㄨ˙; - nu1 ← ㄋㄨ; -nü }$pTone ↔ ㄋㄩ }$zTone; - nü → ㄋㄩ˙; - nü1 ← ㄋㄩ; -# n handled below -# -liang }$pTone ↔ ㄌㄧㄤ }$zTone; - liang → ㄌㄧㄤ˙; - liang1 ← ㄌㄧㄤ; -lang }$pTone ↔ ㄌㄤ }$zTone; - lang → ㄌㄤ˙; - lang1 ← ㄌㄤ; -leng }$pTone ↔ ㄌㄥ }$zTone; - leng → ㄌㄥ˙; - leng1 ← ㄌㄥ; -liao }$pTone ↔ ㄌㄧㄠ }$zTone; - liao → ㄌㄧㄠ˙; - liao1 ← ㄌㄧㄠ; -lian }$pTone ↔ ㄌㄧㄢ }$zTone; - lian → ㄌㄧㄢ˙; - lian1 ← ㄌㄧㄢ; -ling }$pTone ↔ ㄌㄧㄥ }$zTone; - ling → ㄌㄧㄥ˙; - ling1 ← ㄌㄧㄥ; -luan }$pTone ↔ ㄌㄨㄢ }$zTone; - luan → ㄌㄨㄢ˙; - luan1 ← ㄌㄨㄢ; -long }$pTone ↔ ㄌㄨㄥ }$zTone; - long → ㄌㄨㄥ˙; - long1 ← ㄌㄨㄥ; -lüan }$pTone ↔ ㄌㄩㄢ }$zTone; # (not in han-latin) - lüan → ㄌㄩㄢ˙; - lüan1 ← ㄌㄩㄢ; -lai }$pTone ↔ ㄌㄞ }$zTone; - lai → ㄌㄞ˙; - lai1 ← ㄌㄞ; -lei }$pTone ↔ ㄌㄟ }$zTone; - lei → ㄌㄟ˙; - lei1 ← ㄌㄟ; -lao }$pTone ↔ ㄌㄠ }$zTone; - lao → ㄌㄠ˙; - lao1 ← ㄌㄠ; -lou }$pTone ↔ ㄌㄡ }$zTone; - lou → ㄌㄡ˙; - lou1 ← ㄌㄡ; -lan }$pTone ↔ ㄌㄢ }$zTone; - lan → ㄌㄢ˙; - lan1 ← ㄌㄢ; -lia }$pTone ↔ ㄌㄧㄚ }$zTone; - lia → ㄌㄧㄚ˙; - lia1 ← ㄌㄧㄚ; -lie }$pTone ↔ ㄌㄧㄝ }$zTone; - lie → ㄌㄧㄝ˙; - lie1 ← ㄌㄧㄝ; -liu }$pTone ↔ ㄌㄧㄡ }$zTone; - liu → ㄌㄧㄡ˙; - liu1 ← ㄌㄧㄡ; -lin }$pTone ↔ ㄌㄧㄣ }$zTone; - lin → ㄌㄧㄣ˙; - lin1 ← ㄌㄧㄣ; -luo }$pTone ↔ ㄌㄨㄛ }$zTone; - luo → ㄌㄨㄛ˙; - luo1 ← ㄌㄨㄛ; -lun }$pTone ↔ ㄌㄨㄣ }$zTone; - lun → ㄌㄨㄣ˙; - lun1 ← ㄌㄨㄣ; -lüe }$pTone ↔ ㄌㄩㄝ }$zTone; - lüe → ㄌㄩㄝ˙; - lüe1 ← ㄌㄩㄝ; -lue }$pTone → ㄌㄩㄝ; # (not in han-latin) one-way, handle wrong u - lue → ㄌㄩㄝ˙; 
-la }$pTone ↔ ㄌㄚ }$zTone; - la → ㄌㄚ˙; - la1 ← ㄌㄚ; -lo }$pTone ↔ ㄌㄛ }$zTone; - lo → ㄌㄛ˙; - lo1 ← ㄌㄛ; -le }$pTone ↔ ㄌㄜ }$zTone; - le → ㄌㄜ˙; - le1 ← ㄌㄜ; -li }$pTone ↔ ㄌㄧ }$zTone; - li → ㄌㄧ˙; - li1 ← ㄌㄧ; -lu }$pTone ↔ ㄌㄨ }$zTone; - lu → ㄌㄨ˙; - lu1 ← ㄌㄨ; -lü }$pTone ↔ ㄌㄩ }$zTone; - lü → ㄌㄩ˙; - lü1 ← ㄌㄩ; -# -guang }$pTone ↔ ㄍㄨㄤ }$zTone; - guang → ㄍㄨㄤ˙; - guang1 ← ㄍㄨㄤ; -gang }$pTone ↔ ㄍㄤ }$zTone; - gang → ㄍㄤ˙; - gang1 ← ㄍㄤ; -geng }$pTone ↔ ㄍㄥ }$zTone; - geng → ㄍㄥ˙; - geng1 ← ㄍㄥ; -guai }$pTone ↔ ㄍㄨㄞ }$zTone; - guai → ㄍㄨㄞ˙; - guai1 ← ㄍㄨㄞ; -guan }$pTone ↔ ㄍㄨㄢ }$zTone; - guan → ㄍㄨㄢ˙; - guan1 ← ㄍㄨㄢ; -gong }$pTone ↔ ㄍㄨㄥ }$zTone; - gong → ㄍㄨㄥ˙; - gong1 ← ㄍㄨㄥ; -gai }$pTone ↔ ㄍㄞ }$zTone; - gai → ㄍㄞ˙; - gai1 ← ㄍㄞ; -gei }$pTone ↔ ㄍㄟ }$zTone; - gei → ㄍㄟ˙; - gei1 ← ㄍㄟ; -gao }$pTone ↔ ㄍㄠ }$zTone; - gao → ㄍㄠ˙; - gao1 ← ㄍㄠ; -gou }$pTone ↔ ㄍㄡ }$zTone; - gou → ㄍㄡ˙; - gou1 ← ㄍㄡ; -gan }$pTone ↔ ㄍㄢ }$zTone; - gan → ㄍㄢ˙; - gan1 ← ㄍㄢ; -gen }$pTone ↔ ㄍㄣ }$zTone; - gen → ㄍㄣ˙; - gen1 ← ㄍㄣ; -gua }$pTone ↔ ㄍㄨㄚ }$zTone; - gua → ㄍㄨㄚ˙; - gua1 ← ㄍㄨㄚ; -guo }$pTone ↔ ㄍㄨㄛ }$zTone; - guo → ㄍㄨㄛ˙; - guo1 ← ㄍㄨㄛ; -gui }$pTone ↔ ㄍㄨㄟ }$zTone; - gui → ㄍㄨㄟ˙; - gui1 ← ㄍㄨㄟ; -gun }$pTone ↔ ㄍㄨㄣ }$zTone; - gun → ㄍㄨㄣ˙; - gun1 ← ㄍㄨㄣ; -ga }$pTone ↔ ㄍㄚ }$zTone; - ga → ㄍㄚ˙; - ga1 ← ㄍㄚ; -ge }$pTone ↔ ㄍㄜ }$zTone; - ge → ㄍㄜ˙; - ge1 ← ㄍㄜ; -gi }$pTone ↔ ㄍㄧ }$zTone; - gi → ㄍㄧ˙; - gi1 ← ㄍㄧ; -gu }$pTone ↔ ㄍㄨ }$zTone; - gu → ㄍㄨ˙; - gu1 ← ㄍㄨ; -# -kuang }$pTone ↔ ㄎㄨㄤ }$zTone; - kuang → ㄎㄨㄤ˙; - kuang1 ← ㄎㄨㄤ; -kang }$pTone ↔ ㄎㄤ }$zTone; - kang → ㄎㄤ˙; - kang1 ← ㄎㄤ; -keng }$pTone ↔ ㄎㄥ }$zTone; - keng → ㄎㄥ˙; - keng1 ← ㄎㄥ; -kuai }$pTone ↔ ㄎㄨㄞ }$zTone; - kuai → ㄎㄨㄞ˙; - kuai1 ← ㄎㄨㄞ; -kuan }$pTone ↔ ㄎㄨㄢ }$zTone; - kuan → ㄎㄨㄢ˙; - kuan1 ← ㄎㄨㄢ; -kong }$pTone ↔ ㄎㄨㄥ }$zTone; - kong → ㄎㄨㄥ˙; - kong1 ← ㄎㄨㄥ; -kai }$pTone ↔ ㄎㄞ }$zTone; - kai → ㄎㄞ˙; - kai1 ← ㄎㄞ; -kao }$pTone ↔ ㄎㄠ }$zTone; - kao → ㄎㄠ˙; - kao1 ← ㄎㄠ; -kou }$pTone ↔ ㄎㄡ }$zTone; - kou → ㄎㄡ˙; - kou1 ← ㄎㄡ; -kan }$pTone ↔ ㄎㄢ }$zTone; - kan → ㄎㄢ˙; - kan1 ← ㄎㄢ; -ken }$pTone ↔ ㄎㄣ }$zTone; - ken → 
ㄎㄣ˙; - ken1 ← ㄎㄣ; -kua }$pTone ↔ ㄎㄨㄚ }$zTone; - kua → ㄎㄨㄚ˙; - kua1 ← ㄎㄨㄚ; -kuo }$pTone ↔ ㄎㄨㄛ }$zTone; - kuo → ㄎㄨㄛ˙; - kuo1 ← ㄎㄨㄛ; -kui }$pTone ↔ ㄎㄨㄟ }$zTone; - kui → ㄎㄨㄟ˙; - kui1 ← ㄎㄨㄟ; -kun }$pTone ↔ ㄎㄨㄣ }$zTone; - kun → ㄎㄨㄣ˙; - kun1 ← ㄎㄨㄣ; -ka }$pTone ↔ ㄎㄚ }$zTone; - ka → ㄎㄚ˙; - ka1 ← ㄎㄚ; -ke }$pTone ↔ ㄎㄜ }$zTone; - ke → ㄎㄜ˙; - ke1 ← ㄎㄜ; -ku }$pTone ↔ ㄎㄨ }$zTone; - ku → ㄎㄨ˙; - ku1 ← ㄎㄨ; -# -huang }$pTone ↔ ㄏㄨㄤ }$zTone; - huang → ㄏㄨㄤ˙; - huang1 ← ㄏㄨㄤ; -hang }$pTone ↔ ㄏㄤ }$zTone; - hang → ㄏㄤ˙; - hang1 ← ㄏㄤ; -heng }$pTone ↔ ㄏㄥ }$zTone; - heng → ㄏㄥ˙; - heng1 ← ㄏㄥ; -huai }$pTone ↔ ㄏㄨㄞ }$zTone; - huai → ㄏㄨㄞ˙; - huai1 ← ㄏㄨㄞ; -huan }$pTone ↔ ㄏㄨㄢ }$zTone; - huan → ㄏㄨㄢ˙; - huan1 ← ㄏㄨㄢ; -hong }$pTone ↔ ㄏㄨㄥ }$zTone; - hong → ㄏㄨㄥ˙; - hong1 ← ㄏㄨㄥ; -hai }$pTone ↔ ㄏㄞ }$zTone; - hai → ㄏㄞ˙; - hai1 ← ㄏㄞ; -hei }$pTone ↔ ㄏㄟ }$zTone; - hei → ㄏㄟ˙; - hei1 ← ㄏㄟ; -hao }$pTone ↔ ㄏㄠ }$zTone; - hao → ㄏㄠ˙; - hao1 ← ㄏㄠ; -hou }$pTone ↔ ㄏㄡ }$zTone; - hou → ㄏㄡ˙; - hou1 ← ㄏㄡ; -han }$pTone ↔ ㄏㄢ }$zTone; - han → ㄏㄢ˙; - han1 ← ㄏㄢ; -hen }$pTone ↔ ㄏㄣ }$zTone; - hen → ㄏㄣ˙; - hen1 ← ㄏㄣ; -hua }$pTone ↔ ㄏㄨㄚ }$zTone; - hua → ㄏㄨㄚ˙; - hua1 ← ㄏㄨㄚ; -huo }$pTone ↔ ㄏㄨㄛ }$zTone; - huo → ㄏㄨㄛ˙; - huo1 ← ㄏㄨㄛ; -hui }$pTone ↔ ㄏㄨㄟ }$zTone; - hui → ㄏㄨㄟ˙; - hui1 ← ㄏㄨㄟ; -hun }$pTone ↔ ㄏㄨㄣ }$zTone; - hun → ㄏㄨㄣ˙; - hun1 ← ㄏㄨㄣ; -hm }$pTone ↔ ㄏㄇ }$zTone; - hm → ㄏㄇ˙; - hm1 ← ㄏㄇ; -ha }$pTone ↔ ㄏㄚ }$zTone; - ha → ㄏㄚ˙; - ha1 ← ㄏㄚ; -ho }$pTone ↔ ㄏㄛ }$zTone; - ho → ㄏㄛ˙; - ho1 ← ㄏㄛ; -he }$pTone ↔ ㄏㄜ }$zTone; - he → ㄏㄜ˙; - he1 ← ㄏㄜ; -hu }$pTone ↔ ㄏㄨ }$zTone; - hu → ㄏㄨ˙; - hu1 ← ㄏㄨ; -# -jiang }$pTone ↔ ㄐㄧㄤ }$zTone; - jiang → ㄐㄧㄤ˙; - jiang1 ← ㄐㄧㄤ; -jiong }$pTone ↔ ㄐㄩㄥ }$zTone; - jiong → ㄐㄩㄥ˙; - jiong1 ← ㄐㄩㄥ; -jiao }$pTone ↔ ㄐㄧㄠ }$zTone; - jiao → ㄐㄧㄠ˙; - jiao1 ← ㄐㄧㄠ; -jian }$pTone ↔ ㄐㄧㄢ }$zTone; - jian → ㄐㄧㄢ˙; - jian1 ← ㄐㄧㄢ; -jing }$pTone ↔ ㄐㄧㄥ }$zTone; - jing → ㄐㄧㄥ˙; - jing1 ← ㄐㄧㄥ; -juan }$pTone ↔ ㄐㄩㄢ }$zTone; - juan → ㄐㄩㄢ˙; - juan1 ← ㄐㄩㄢ; -jia }$pTone ↔ ㄐㄧㄚ }$zTone; - jia → ㄐㄧㄚ˙; - jia1 ← ㄐㄧㄚ; -jie }$pTone ↔ ㄐㄧㄝ }$zTone; - jie → ㄐㄧㄝ˙; 
- jie1 ← ㄐㄧㄝ; -jiu }$pTone ↔ ㄐㄧㄡ }$zTone; - jiu → ㄐㄧㄡ˙; - jiu1 ← ㄐㄧㄡ; -jin }$pTone ↔ ㄐㄧㄣ }$zTone; - jin → ㄐㄧㄣ˙; - jin1 ← ㄐㄧㄣ; -jue }$pTone ↔ ㄐㄩㄝ }$zTone; - jue → ㄐㄩㄝ˙; - jue1 ← ㄐㄩㄝ; -jun }$pTone ↔ ㄐㄩㄣ }$zTone; - jun → ㄐㄩㄣ˙; - jun1 ← ㄐㄩㄣ; -ji }$pTone ↔ ㄐㄧ }$zTone; - ji → ㄐㄧ˙; - ji1 ← ㄐㄧ; -ju }$pTone ↔ ㄐㄩ }$zTone; - ju → ㄐㄩ˙; - ju1 ← ㄐㄩ; -# -qiang }$pTone ↔ ㄑㄧㄤ }$zTone; - qiang → ㄑㄧㄤ˙; - qiang1 ← ㄑㄧㄤ; -qiong }$pTone ↔ ㄑㄩㄥ }$zTone; - qiong → ㄑㄩㄥ˙; - qiong1 ← ㄑㄩㄥ; -qiao }$pTone ↔ ㄑㄧㄠ }$zTone; - qiao → ㄑㄧㄠ˙; - qiao1 ← ㄑㄧㄠ; -qian }$pTone ↔ ㄑㄧㄢ }$zTone; - qian → ㄑㄧㄢ˙; - qian1 ← ㄑㄧㄢ; -qing }$pTone ↔ ㄑㄧㄥ }$zTone; - qing → ㄑㄧㄥ˙; - qing1 ← ㄑㄧㄥ; -quan }$pTone ↔ ㄑㄩㄢ }$zTone; - quan → ㄑㄩㄢ˙; - quan1 ← ㄑㄩㄢ; -qia }$pTone ↔ ㄑㄧㄚ }$zTone; - qia → ㄑㄧㄚ˙; - qia1 ← ㄑㄧㄚ; -qie }$pTone ↔ ㄑㄧㄝ }$zTone; - qie → ㄑㄧㄝ˙; - qie1 ← ㄑㄧㄝ; -qiu }$pTone ↔ ㄑㄧㄡ }$zTone; - qiu → ㄑㄧㄡ˙; - qiu1 ← ㄑㄧㄡ; -qin }$pTone ↔ ㄑㄧㄣ }$zTone; - qin → ㄑㄧㄣ˙; - qin1 ← ㄑㄧㄣ; -que }$pTone ↔ ㄑㄩㄝ }$zTone; - que → ㄑㄩㄝ˙; - que1 ← ㄑㄩㄝ; -qun }$pTone ↔ ㄑㄩㄣ }$zTone; - qun → ㄑㄩㄣ˙; - qun1 ← ㄑㄩㄣ; -qi }$pTone ↔ ㄑㄧ }$zTone; - qi → ㄑㄧ˙; - qi1 ← ㄑㄧ; -qu }$pTone ↔ ㄑㄩ }$zTone; - qu → ㄑㄩ˙; - qu1 ← ㄑㄩ; -# -xiang }$pTone ↔ ㄒㄧㄤ }$zTone; - xiang → ㄒㄧㄤ˙; - xiang1 ← ㄒㄧㄤ; -xiong }$pTone ↔ ㄒㄩㄥ }$zTone; - xiong → ㄒㄩㄥ˙; - xiong1 ← ㄒㄩㄥ; -xiao }$pTone ↔ ㄒㄧㄠ }$zTone; - xiao → ㄒㄧㄠ˙; - xiao1 ← ㄒㄧㄠ; -xian }$pTone ↔ ㄒㄧㄢ }$zTone; - xian → ㄒㄧㄢ˙; - xian1 ← ㄒㄧㄢ; -xing }$pTone ↔ ㄒㄧㄥ }$zTone; - xing → ㄒㄧㄥ˙; - xing1 ← ㄒㄧㄥ; -xuan }$pTone ↔ ㄒㄩㄢ }$zTone; - xuan → ㄒㄩㄢ˙; - xuan1 ← ㄒㄩㄢ; -xia }$pTone ↔ ㄒㄧㄚ }$zTone; - xia → ㄒㄧㄚ˙; - xia1 ← ㄒㄧㄚ; -xie }$pTone ↔ ㄒㄧㄝ }$zTone; - xie → ㄒㄧㄝ˙; - xie1 ← ㄒㄧㄝ; -xiu }$pTone ↔ ㄒㄧㄡ }$zTone; - xiu → ㄒㄧㄡ˙; - xiu1 ← ㄒㄧㄡ; -xin }$pTone ↔ ㄒㄧㄣ }$zTone; - xin → ㄒㄧㄣ˙; - xin1 ← ㄒㄧㄣ; -xue }$pTone ↔ ㄒㄩㄝ }$zTone; - xue → ㄒㄩㄝ˙; - xue1 ← ㄒㄩㄝ; -xun }$pTone ↔ ㄒㄩㄣ }$zTone; - xun → ㄒㄩㄣ˙; - xun1 ← ㄒㄩㄣ; -xi }$pTone ↔ ㄒㄧ }$zTone; - xi → ㄒㄧ˙; - xi1 ← ㄒㄧ; -xu }$pTone ↔ ㄒㄩ }$zTone; - xu → ㄒㄩ˙; - xu1 ← ㄒㄩ; -# -zhuang }$pTone ↔ ㄓㄨㄤ }$zTone; - zhuang → ㄓㄨㄤ˙; - zhuang1 ← 
ㄓㄨㄤ; -zhang }$pTone ↔ ㄓㄤ }$zTone; - zhang → ㄓㄤ˙; - zhang1 ← ㄓㄤ; -zheng }$pTone ↔ ㄓㄥ }$zTone; - zheng → ㄓㄥ˙; - zheng1 ← ㄓㄥ; -zhuai }$pTone ↔ ㄓㄨㄞ }$zTone; - zhuai → ㄓㄨㄞ˙; - zhuai1 ← ㄓㄨㄞ; -zhuan }$pTone ↔ ㄓㄨㄢ }$zTone; - zhuan → ㄓㄨㄢ˙; - zhuan1 ← ㄓㄨㄢ; -zhong }$pTone ↔ ㄓㄨㄥ }$zTone; - zhong → ㄓㄨㄥ˙; - zhong1 ← ㄓㄨㄥ; -zhai }$pTone ↔ ㄓㄞ }$zTone; - zhai → ㄓㄞ˙; - zhai1 ← ㄓㄞ; -zhei }$pTone ↔ ㄓㄟ }$zTone; # (not in han-latin) - zhei → ㄓㄟ˙; - zhei1 ← ㄓㄟ; -zhao }$pTone ↔ ㄓㄠ }$zTone; - zhao → ㄓㄠ˙; - zhao1 ← ㄓㄠ; -zhou }$pTone ↔ ㄓㄡ }$zTone; - zhou → ㄓㄡ˙; - zhou1 ← ㄓㄡ; -zhan }$pTone ↔ ㄓㄢ }$zTone; - zhan → ㄓㄢ˙; - zhan1 ← ㄓㄢ; -zhen }$pTone ↔ ㄓㄣ }$zTone; - zhen → ㄓㄣ˙; - zhen1 ← ㄓㄣ; -zhua }$pTone ↔ ㄓㄨㄚ }$zTone; - zhua → ㄓㄨㄚ˙; - zhua1 ← ㄓㄨㄚ; -zhuo }$pTone ↔ ㄓㄨㄛ }$zTone; - zhuo → ㄓㄨㄛ˙; - zhuo1 ← ㄓㄨㄛ; -zhui }$pTone ↔ ㄓㄨㄟ }$zTone; - zhui → ㄓㄨㄟ˙; - zhui1 ← ㄓㄨㄟ; -zhun }$pTone ↔ ㄓㄨㄣ }$zTone; - zhun → ㄓㄨㄣ˙; - zhun1 ← ㄓㄨㄣ; -zha }$pTone ↔ ㄓㄚ }$zTone; - zha → ㄓㄚ˙; - zha1 ← ㄓㄚ; -zhe }$pTone ↔ ㄓㄜ }$zTone; - zhe → ㄓㄜ˙; - zhe1 ← ㄓㄜ; -zhu }$pTone ↔ ㄓㄨ }$zTone; - zhu → ㄓㄨ˙; - zhu1 ← ㄓㄨ; -zhi }$pTone ↔ ㄓ }$zTone; - zhi → ㄓ˙; - zhi1 ← ㄓ; -# -chuang }$pTone ↔ ㄔㄨㄤ }$zTone; - chuang → ㄔㄨㄤ˙; - chuang1 ← ㄔㄨㄤ; -chang }$pTone ↔ ㄔㄤ }$zTone; - chang → ㄔㄤ˙; - chang1 ← ㄔㄤ; -cheng }$pTone ↔ ㄔㄥ }$zTone; - cheng → ㄔㄥ˙; - cheng1 ← ㄔㄥ; -chuai }$pTone ↔ ㄔㄨㄞ }$zTone; - chuai → ㄔㄨㄞ˙; - chuai1 ← ㄔㄨㄞ; -chuan }$pTone ↔ ㄔㄨㄢ }$zTone; - chuan → ㄔㄨㄢ˙; - chuan1 ← ㄔㄨㄢ; -chong }$pTone ↔ ㄔㄨㄥ }$zTone; - chong → ㄔㄨㄥ˙; - chong1 ← ㄔㄨㄥ; -chai }$pTone ↔ ㄔㄞ }$zTone; - chai → ㄔㄞ˙; - chai1 ← ㄔㄞ; -chao }$pTone ↔ ㄔㄠ }$zTone; - chao → ㄔㄠ˙; - chao1 ← ㄔㄠ; -chou }$pTone ↔ ㄔㄡ }$zTone; - chou → ㄔㄡ˙; - chou1 ← ㄔㄡ; -chan }$pTone ↔ ㄔㄢ }$zTone; - chan → ㄔㄢ˙; - chan1 ← ㄔㄢ; -chen }$pTone ↔ ㄔㄣ }$zTone; - chen → ㄔㄣ˙; - chen1 ← ㄔㄣ; -chua }$pTone ↔ ㄔㄨㄚ }$zTone; - chua → ㄔㄨㄚ˙; - chua1 ← ㄔㄨㄚ; -chuo }$pTone ↔ ㄔㄨㄛ }$zTone; - chuo → ㄔㄨㄛ˙; - chuo1 ← ㄔㄨㄛ; -chui }$pTone ↔ ㄔㄨㄟ }$zTone; - chui → ㄔㄨㄟ˙; - chui1 ← ㄔㄨㄟ; -chun }$pTone ↔ ㄔㄨㄣ }$zTone; - chun → ㄔㄨㄣ˙; - chun1 ← ㄔㄨㄣ; 
-cha }$pTone ↔ ㄔㄚ }$zTone; - cha → ㄔㄚ˙; - cha1 ← ㄔㄚ; -che }$pTone ↔ ㄔㄜ }$zTone; - che → ㄔㄜ˙; - che1 ← ㄔㄜ; -chu }$pTone ↔ ㄔㄨ }$zTone; - chu → ㄔㄨ˙; - chu1 ← ㄔㄨ; -chi }$pTone ↔ ㄔ }$zTone; - chi → ㄔ˙; - chi1 ← ㄔ; -# -shuang }$pTone ↔ ㄕㄨㄤ }$zTone; - shuang → ㄕㄨㄤ˙; - shuang1 ← ㄕㄨㄤ; -shong }$pTone ↔ ㄕㄡㄥ }$zTone; # (not in han-latin) - shong → ㄕㄡㄥ˙; - shong1 ← ㄕㄡㄥ; -shang }$pTone ↔ ㄕㄤ }$zTone; - shang → ㄕㄤ˙; - shang1 ← ㄕㄤ; -sheng }$pTone ↔ ㄕㄥ }$zTone; - sheng → ㄕㄥ˙; - sheng1 ← ㄕㄥ; -shuai }$pTone ↔ ㄕㄨㄞ }$zTone; - shuai → ㄕㄨㄞ˙; - shuai1 ← ㄕㄨㄞ; -shuan }$pTone ↔ ㄕㄨㄢ }$zTone; - shuan → ㄕㄨㄢ˙; - shuan1 ← ㄕㄨㄢ; -shai }$pTone ↔ ㄕㄞ }$zTone; - shai → ㄕㄞ˙; - shai1 ← ㄕㄞ; -shei }$pTone ↔ ㄕㄟ }$zTone; # (not in han-latin) - shei → ㄕㄟ˙; - shei1 ← ㄕㄟ; -shao }$pTone ↔ ㄕㄠ }$zTone; - shao → ㄕㄠ˙; - shao1 ← ㄕㄠ; -shou }$pTone ↔ ㄕㄡ }$zTone; - shou → ㄕㄡ˙; - shou1 ← ㄕㄡ; -shan }$pTone ↔ ㄕㄢ }$zTone; - shan → ㄕㄢ˙; - shan1 ← ㄕㄢ; -shen }$pTone ↔ ㄕㄣ }$zTone; - shen → ㄕㄣ˙; - shen1 ← ㄕㄣ; -shua }$pTone ↔ ㄕㄨㄚ }$zTone; - shua → ㄕㄨㄚ˙; - shua1 ← ㄕㄨㄚ; -shuo }$pTone ↔ ㄕㄨㄛ }$zTone; - shuo → ㄕㄨㄛ˙; - shuo1 ← ㄕㄨㄛ; -shui }$pTone ↔ ㄕㄨㄟ }$zTone; - shui → ㄕㄨㄟ˙; - shui1 ← ㄕㄨㄟ; -shun }$pTone ↔ ㄕㄨㄣ }$zTone; - shun → ㄕㄨㄣ˙; - shun1 ← ㄕㄨㄣ; -sha }$pTone ↔ ㄕㄚ }$zTone; - sha → ㄕㄚ˙; - sha1 ← ㄕㄚ; -she }$pTone ↔ ㄕㄜ }$zTone; - she → ㄕㄜ˙; - she1 ← ㄕㄜ; -shu }$pTone ↔ ㄕㄨ }$zTone; - shu → ㄕㄨ˙; - shu1 ← ㄕㄨ; -shi }$pTone ↔ ㄕ }$zTone; - shi → ㄕ˙; - shi1 ← ㄕ; -# -rang }$pTone ↔ ㄖㄤ }$zTone; - rang → ㄖㄤ˙; - rang1 ← ㄖㄤ; -reng }$pTone ↔ ㄖㄥ }$zTone; - reng → ㄖㄥ˙; - reng1 ← ㄖㄥ; -ruan }$pTone ↔ ㄖㄨㄢ }$zTone; - ruan → ㄖㄨㄢ˙; - ruan1 ← ㄖㄨㄢ; -rong }$pTone ↔ ㄖㄨㄥ }$zTone; - rong → ㄖㄨㄥ˙; - rong1 ← ㄖㄨㄥ; -rao }$pTone ↔ ㄖㄠ }$zTone; - rao → ㄖㄠ˙; - rao1 ← ㄖㄠ; -rou }$pTone ↔ ㄖㄡ }$zTone; - rou → ㄖㄡ˙; - rou1 ← ㄖㄡ; -ran }$pTone ↔ ㄖㄢ }$zTone; - ran → ㄖㄢ˙; - ran1 ← ㄖㄢ; -ren }$pTone ↔ ㄖㄣ }$zTone; - ren → ㄖㄣ˙; - ren1 ← ㄖㄣ; -ruo }$pTone ↔ ㄖㄨㄛ }$zTone; - ruo → ㄖㄨㄛ˙; - ruo1 ← ㄖㄨㄛ; -rui }$pTone ↔ ㄖㄨㄟ }$zTone; - rui → ㄖㄨㄟ˙; - rui1 ← ㄖㄨㄟ; -run }$pTone ↔ ㄖㄨㄣ }$zTone; - run → 
ㄖㄨㄣ˙; - run1 ← ㄖㄨㄣ; -ra }$pTone ↔ ㄖㄚ }$zTone; - ra → ㄖㄚ˙; - ra1 ← ㄖㄚ; -re }$pTone ↔ ㄖㄜ }$zTone; - re → ㄖㄜ˙; - re1 ← ㄖㄜ; -ru }$pTone ↔ ㄖㄨ }$zTone; - ru → ㄖㄨ˙; - ru1 ← ㄖㄨ; -ri }$pTone ↔ ㄖ }$zTone; - ri → ㄖ˙; - ri1 ← ㄖ; -# -zang }$pTone ↔ ㄗㄤ }$zTone; - zang → ㄗㄤ˙; - zang1 ← ㄗㄤ; -zeng }$pTone ↔ ㄗㄥ }$zTone; - zeng → ㄗㄥ˙; - zeng1 ← ㄗㄥ; -zuan }$pTone ↔ ㄗㄨㄢ }$zTone; - zuan → ㄗㄨㄢ˙; - zuan1 ← ㄗㄨㄢ; -zong }$pTone ↔ ㄗㄨㄥ }$zTone; - zong → ㄗㄨㄥ˙; - zong1 ← ㄗㄨㄥ; -zai }$pTone ↔ ㄗㄞ }$zTone; - zai → ㄗㄞ˙; - zai1 ← ㄗㄞ; -zei }$pTone ↔ ㄗㄟ }$zTone; - zei → ㄗㄟ˙; - zei1 ← ㄗㄟ; -zao }$pTone ↔ ㄗㄠ }$zTone; - zao → ㄗㄠ˙; - zao1 ← ㄗㄠ; -zou }$pTone ↔ ㄗㄡ }$zTone; - zou → ㄗㄡ˙; - zou1 ← ㄗㄡ; -zan }$pTone ↔ ㄗㄢ }$zTone; - zan → ㄗㄢ˙; - zan1 ← ㄗㄢ; -zen }$pTone ↔ ㄗㄣ }$zTone; - zen → ㄗㄣ˙; - zen1 ← ㄗㄣ; -zuo }$pTone ↔ ㄗㄨㄛ }$zTone; - zuo → ㄗㄨㄛ˙; - zuo1 ← ㄗㄨㄛ; -zui }$pTone ↔ ㄗㄨㄟ }$zTone; - zui → ㄗㄨㄟ˙; - zui1 ← ㄗㄨㄟ; -zun }$pTone ↔ ㄗㄨㄣ }$zTone; - zun → ㄗㄨㄣ˙; - zun1 ← ㄗㄨㄣ; -za }$pTone ↔ ㄗㄚ }$zTone; - za → ㄗㄚ˙; - za1 ← ㄗㄚ; -ze }$pTone ↔ ㄗㄜ }$zTone; - ze → ㄗㄜ˙; - ze1 ← ㄗㄜ; -zu }$pTone ↔ ㄗㄨ }$zTone; - zu → ㄗㄨ˙; - zu1 ← ㄗㄨ; -zi }$pTone ↔ ㄗ }$zTone; - zi → ㄗ˙; - zi1 ← ㄗ; -# -cang }$pTone ↔ ㄘㄤ }$zTone; - cang → ㄘㄤ˙; - cang1 ← ㄘㄤ; -ceng }$pTone ↔ ㄘㄥ }$zTone; - ceng → ㄘㄥ˙; - ceng1 ← ㄘㄥ; -cuan }$pTone ↔ ㄘㄨㄢ }$zTone; - cuan → ㄘㄨㄢ˙; - cuan1 ← ㄘㄨㄢ; -cong }$pTone ↔ ㄘㄨㄥ }$zTone; - cong → ㄘㄨㄥ˙; - cong1 ← ㄘㄨㄥ; -cai }$pTone ↔ ㄘㄞ }$zTone; - cai → ㄘㄞ˙; - cai1 ← ㄘㄞ; -cao }$pTone ↔ ㄘㄠ }$zTone; - cao → ㄘㄠ˙; - cao1 ← ㄘㄠ; -cou }$pTone ↔ ㄘㄡ }$zTone; - cou → ㄘㄡ˙; - cou1 ← ㄘㄡ; -can }$pTone ↔ ㄘㄢ }$zTone; - can → ㄘㄢ˙; - can1 ← ㄘㄢ; -cen }$pTone ↔ ㄘㄣ }$zTone; - cen → ㄘㄣ˙; - cen1 ← ㄘㄣ; -cuo }$pTone ↔ ㄘㄨㄛ }$zTone; - cuo → ㄘㄨㄛ˙; - cuo1 ← ㄘㄨㄛ; -cui }$pTone ↔ ㄘㄨㄟ }$zTone; - cui → ㄘㄨㄟ˙; - cui1 ← ㄘㄨㄟ; -cun }$pTone ↔ ㄘㄨㄣ }$zTone; - cun → ㄘㄨㄣ˙; - cun1 ← ㄘㄨㄣ; -ca }$pTone ↔ ㄘㄚ }$zTone; - ca → ㄘㄚ˙; - ca1 ← ㄘㄚ; -ce }$pTone ↔ ㄘㄜ }$zTone; - ce → ㄘㄜ˙; - ce1 ← ㄘㄜ; -cu }$pTone ↔ ㄘㄨ }$zTone; - cu → ㄘㄨ˙; - cu1 ← ㄘㄨ; -ci }$pTone ↔ ㄘ }$zTone; - ci → ㄘ˙; - ci1 ← ㄘ; -# 
-sang }$pTone ↔ ㄙㄤ }$zTone; - sang → ㄙㄤ˙; - sang1 ← ㄙㄤ; -seng }$pTone ↔ ㄙㄥ }$zTone; - seng → ㄙㄥ˙; - seng1 ← ㄙㄥ; -suan }$pTone ↔ ㄙㄨㄢ }$zTone; - suan → ㄙㄨㄢ˙; - suan1 ← ㄙㄨㄢ; -song }$pTone ↔ ㄙㄨㄥ }$zTone; - song → ㄙㄨㄥ˙; - song1 ← ㄙㄨㄥ; -sai }$pTone ↔ ㄙㄞ }$zTone; - sai → ㄙㄞ˙; - sai1 ← ㄙㄞ; -sei }$pTone ↔ ㄙㄟ }$zTone; # (not in han-latin) - sei → ㄙㄟ˙; - sei1 ← ㄙㄟ; -sao }$pTone ↔ ㄙㄠ }$zTone; - sao → ㄙㄠ˙; - sao1 ← ㄙㄠ; -sou }$pTone ↔ ㄙㄡ }$zTone; - sou → ㄙㄡ˙; - sou1 ← ㄙㄡ; -san }$pTone ↔ ㄙㄢ }$zTone; - san → ㄙㄢ˙; - san1 ← ㄙㄢ; -sen }$pTone ↔ ㄙㄣ }$zTone; - sen → ㄙㄣ˙; - sen1 ← ㄙㄣ; -suo }$pTone ↔ ㄙㄨㄛ }$zTone; - suo → ㄙㄨㄛ˙; - suo1 ← ㄙㄨㄛ; -sui }$pTone ↔ ㄙㄨㄟ }$zTone; - sui → ㄙㄨㄟ˙; - sui1 ← ㄙㄨㄟ; -sun }$pTone ↔ ㄙㄨㄣ }$zTone; - sun → ㄙㄨㄣ˙; - sun1 ← ㄙㄨㄣ; -sa }$pTone ↔ ㄙㄚ }$zTone; - sa → ㄙㄚ˙; - sa1 ← ㄙㄚ; -se }$pTone ↔ ㄙㄜ }$zTone; - se → ㄙㄜ˙; - se1 ← ㄙㄜ; -su }$pTone ↔ ㄙㄨ }$zTone; - su → ㄙㄨ˙; - su1 ← ㄙㄨ; -si }$pTone ↔ ㄙ }$zTone; - si → ㄙ˙; - si1 ← ㄙ; -# -#--- vowels and vowel compounds ---- -# most exist as syllables by themselves and they are also used as finals for initial consonants -# -yuan }$pTone ↔ ㄩㄢ }$zTone; - yuan → ㄩㄢ˙; - yuan1 ← ㄩㄢ; -yong }$pTone ↔ ㄩㄥ }$zTone; - yong → ㄩㄥ˙; - yong1 ← ㄩㄥ; -yue }$pTone ↔ ㄩㄝ }$zTone; - yue → ㄩㄝ˙; - yue1 ← ㄩㄝ; -yun }$pTone ↔ ㄩㄣ }$zTone; - yun → ㄩㄣ˙; - yun1 ← ㄩㄣ; -yu }$pTone ↔ ㄩ }$zTone; - yu → ㄩ˙; - yu1 ← ㄩ; -# iu handled below -# -yang }$pTone ↔ ㄧㄤ }$zTone; - yang → ㄧㄤ˙; - yang1 ← ㄧㄤ; -ying }$pTone ↔ ㄧㄥ }$zTone; - ying → ㄧㄥ˙; - ying1 ← ㄧㄥ; -yai }$pTone ↔ ㄧㄞ }$zTone; # (not in han-latin) - yai → ㄧㄞ˙; - yai1 ← ㄧㄞ; -yao }$pTone ↔ ㄧㄠ }$zTone; - yao → ㄧㄠ˙; - yao1 ← ㄧㄠ; -you }$pTone ↔ ㄧㄡ }$zTone; - you → ㄧㄡ˙; - you1 ← ㄧㄡ; -yan }$pTone ↔ ㄧㄢ }$zTone; - yan → ㄧㄢ˙; - yan1 ← ㄧㄢ; -yin }$pTone ↔ ㄧㄣ }$zTone; - yin → ㄧㄣ˙; - yin1 ← ㄧㄣ; -ya }$pTone ↔ ㄧㄚ }$zTone; - ya → ㄧㄚ˙; - ya1 ← ㄧㄚ; -yo }$pTone ↔ ㄧㄛ }$zTone; - yo → ㄧㄛ˙; - yo1 ← ㄧㄛ; -ye }$pTone ↔ ㄧㄝ }$zTone; - ye → ㄧㄝ˙; - ye1 ← ㄧㄝ; -yi }$pTone ↔ ㄧ }$zTone; - yi → ㄧ˙; - yi1 ← ㄧ; -# i handled below -# -wong }$pTone ↔ ㄨㄨㄥ 
}$zTone; - wong → ㄨㄨㄥ˙; - wong1 ← ㄨㄨㄥ; -wang }$pTone ↔ ㄨㄤ }$zTone; - wang → ㄨㄤ˙; - wang1 ← ㄨㄤ; -weng }$pTone ↔ ㄨㄥ }$zTone; - weng → ㄨㄥ˙; - weng1 ← ㄨㄥ; -wai }$pTone ↔ ㄨㄞ }$zTone; - wai → ㄨㄞ˙; - wai1 ← ㄨㄞ; -wei }$pTone ↔ ㄨㄟ }$zTone; - wei → ㄨㄟ˙; - wei1 ← ㄨㄟ; -wan }$pTone ↔ ㄨㄢ }$zTone; - wan → ㄨㄢ˙; - wan1 ← ㄨㄢ; -wen }$pTone ↔ ㄨㄣ }$zTone; - wen → ㄨㄣ˙; - wen1 ← ㄨㄣ; -wa }$pTone ↔ ㄨㄚ }$zTone; - wa → ㄨㄚ˙; - wa1 ← ㄨㄚ; -wo }$pTone ↔ ㄨㄛ }$zTone; - wo → ㄨㄛ˙; - wo1 ← ㄨㄛ; -wu }$pTone ↔ ㄨ }$zTone; - wu → ㄨ˙; - wu1 ← ㄨ; -# u handled below -# -ang }$pTone ↔ ㄤ }$zTone; - ang → ㄤ˙; - ang1 ← ㄤ; -eng }$pTone ↔ ㄥ }$zTone; - eng → ㄥ˙; - eng1 ← ㄥ; -eh }$pTone ↔ ㄝ }$zTone; # (not in han-latin) - eh → ㄝ˙; - eh1 ← ㄝ; -ea }$pTone → ㄝ; # (not in han-latin) one-way - ea → ㄝ˙; -ai }$pTone ↔ ㄞ }$zTone; - ai → ㄞ˙; - ai1 ← ㄞ; -ei }$pTone ↔ ㄟ }$zTone; - ei → ㄟ˙; - ei1 ← ㄟ; -ao }$pTone ↔ ㄠ }$zTone; - ao → ㄠ˙; - ao1 ← ㄠ; -au }$pTone → ㄠ; # (not in han-latin) one-way, handle unicode spelling - au → ㄠ˙; -ou }$pTone ↔ ㄡ }$zTone; - ou → ㄡ˙; - ou1 ← ㄡ; -an }$pTone ↔ ㄢ }$zTone; - an → ㄢ˙; - an1 ← ㄢ; -en }$pTone ↔ ㄣ }$zTone; - en → ㄣ˙; - en1 ← ㄣ; -er }$pTone ↔ ㄦ }$zTone; - er → ㄦ˙; - er1 ← ㄦ; -a }$pTone ↔ ㄚ }$zTone; - a → ㄚ˙; - a1 ← ㄚ; -o }$pTone ↔ ㄛ }$zTone; - o → ㄛ˙; - o1 ← ㄛ; -e }$pTone ↔ ㄜ }$zTone; - e → ㄜ˙; - e1 ← ㄜ; -# -# handle unicode spellings of ㄧ,ㄨ,ㄩ above -iu }$pTone → ㄩ; # (not in han-latin) one-way, handle unicode spelling - iu → ㄩ˙; -i }$pTone → ㄧ; # (not in han-latin) one-way, handle unicode spelling - i → ㄧ˙; -u }$pTone → ㄨ; # (not in han-latin) one-way, handle unicode spelling - u → ㄨ˙; -# -#--- clusters with a single pinyin consonant that can apear in other clusters ---- -# -m }$pTone ↔ ㄇ }$zTone; - m → ㄇ˙; - m1 ← ㄇ; -# -n }$pTone ↔ ㄋ }$zTone; - n → ㄋ˙; - n1 ← ㄋ; -# -#--- fallback mappings ---- -# -# separate fallback mappings for some compound finals after consonants -# (different pinyin than the standalone mappings for these zhuyin sequences). 
-# -#------- -# would be nice to have these, need to work out how; -# something like the following, but need to avoid conflicts with mappings above: -# $pCons{ ia }$pTone ↔ $zCons{ ㄧㄚ }$zTone; # fallback mapping for unambiguous compound final -# $pCons{ ia → ㄧㄚ˙; -# ia1 ← $zCons{ ㄧㄚ -# -# the relevant mappings are: -# ia ↔ ㄧㄚ -# ie ↔ ㄧㄝ -# iao ↔ ㄧㄠ -# iu ↔ ㄧㄡ -# ian ↔ ㄧㄢ -# in ↔ ㄧㄣ -# iang ↔ ㄧㄤ -# ing ↔ ㄧㄥ -# ua ↔ ㄨㄚ -# uo ↔ ㄨㄛ -# uai ↔ ㄨㄞ -# ui ↔ ㄨㄟ -# uang ↔ ㄨㄤ -# ong ↔ ㄨㄥ -# iong ↔ ㄩㄥ -#------- -# -# separate fallback mappings for some initial consonants not handled above -# none of the mapped consonants handled here can have tones, so this is simple -b ↔ ㄅ; -p ↔ ㄆ; -# m ↔ ㄇ; # handled above -f ↔ ㄈ; -d ↔ ㄉ; -t ↔ ㄊ; -# n ↔ ㄋ; # handled above -l ↔ ㄌ; -g ↔ ㄍ; -k ↔ ㄎ; -h ↔ ㄏ; -j ↔ ㄐ; -q ↔ ㄑ; -x ↔ ㄒ; -zh → ㄓ; # reverse mapping to zhi handled above -ch → ㄔ; # reverse mapping to chi handled above -sh → ㄕ; # reverse mapping to shi handled above -r → ㄖ; # reverse mapping to ri handled above -z → ㄗ; # reverse mapping to zi handled above -c → ㄘ; # reverse mapping to ci handled above -s → ㄙ; # reverse mapping to si handled above -# -#--- tones (except for the ummarked cases handled above) ---- -# -# tone 1: pinyin \u0304 or 1 ↔ zhuyin typically unmarked or use ˉ \u02C9 -1 → ; # map to nothing -1 ← ˉ ; # transform if marked in zhuyin -# did the following with rules for each cluster, above -# 1 ← ; # map nothing in zhuyin to pinyin mark -# -# tones 2-4 (easy) -# $pToneOK{ 2 ↔ $zToneOK{ ˊ; # pinyin \u0301 or 2 ↔ zhuyin \u02CA -# $pToneOK{ 3 ↔ $zToneOK{ ˇ; # pinyin \u030C or 3 ↔ zhuyin \u02C7 -# $pToneOK{ 4 ↔ $zToneOK{ ˋ; # pinyin \u0300 or 4 ↔ zhuyin \u02CB -# actually don't need context: -2 ↔ ˊ; # pinyin \u0301 or 2 ↔ zhuyin \u02CA -3 ↔ ˇ; # pinyin \u030C or 3 ↔ zhuyin \u02C7 -4 ↔ ˋ; # pinyin \u0300 or 4 ↔ zhuyin \u02CB -# -# tone 5 (light): pinyin typically unmarked or use 5 ↔ zhuyin ˙ \u02D9 - ← ˙; # map to nothing -5 → ˙; # transform if marked in pinyin -# did the 
following with rules for each cluster above -# → ˙; # map nothing in pinyin to zhuyin mark -# -#--- reverse filter ---- -# -:: ([[ㄅ-ㄩ][ˉˊˇˋ˙]]); # reverse filter: only modifies basic Bopomofo and tone marks diff --git a/icu4c/source/data/translit/Latin_ConjoiningJamo.txt b/icu4c/source/data/translit/Latin_ConjoiningJamo.txt index b5c515a434a..c5351b40f59 100644 --- a/icu4c/source/data/translit/Latin_ConjoiningJamo.txt +++ b/icu4c/source/data/translit/Latin_ConjoiningJamo.txt @@ -1,12 +1,69 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Latin_ConjoiningJamo.txt # Generated from CLDR # + +# Follows the Ministry of Culture and Tourism romanization: see http://www.korea.net/korea/kor_loca.asp?code=A020303 +# http://www.unicode.org/cldr/transliteration_guidelines.html#Korean +#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in +#- the INDEX file. This transliterator is, by itself, not +#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or +#- inverses thereof. +# Transliteration from Latin characters to Korean script is done in +# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul +# transliteration is done algorithmically following Unicode 3.0 +# section 3.11. This file implements the Latin to Jamo +# transliteration using rules. +# Jamo occupy the block 1100-11FF. Within this block there are three +# groups of characters: initial consonants or choseong (I), medial +# vowels or jungseong (M), and trailing consonants or jongseong (F). +# Standard Korean syllables are of the form I+M+F*. 
+# Section 3.11 describes the use of 'filler' jamo to convert +# nonstandard syllables to standard form: the choseong filler 115F and +# the jungseong filler 1160. In this transliterator, we will not use +# 115F or 1160. +# We will, however, insert two 'null' jamo to make foreign words +# conform to Korean syllable structure. These are the null initial +# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text, +# we will use the separator in order to disambiguate strings, +# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G). +# We will not use all of the characters in the jamo block. We will +# only use the 19 initials, 21 medials, and 27 finals possessing a +# jamo short name as defined in section 4.4 of the Unicode book. +# Rules of thumb. These guidelines provide the basic framework +# for the rules. They are phrased in terms of Latin-Jamo transliteration. +# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are +# just context-free transliteration of jamo to corresponding short names, +# with the addition of separators to maintain round-trip integrity +# in the context of the Latin-Jamo rules. +# A sequence of vowels: +# - Take the longest sequence you can. If there are too many, or you don't +# have a starting consonant, introduce a 110B if necessary. +# A sequence of consonants. +# - First join the double consonants: G + G -→ GG +# - In the remaining list, +# -- If there is no preceding vowel, take the first consonant, and insert EU +# after it. Continue with the rest of the consonants. +# -- If there is one consonant, attach to the following vowel +# -- If there are two consonants and a following vowel, attach one to the +# preceding vowel, and one to the following vowel. +# -- If there are more than two consonants, join the first two together if you +# can: L + G =→ LG +# -- If you still end up with more than 2 consonants, insert EU after the +# first one, and continue with the rest of the consonants.
+#---------------------------------------------------------------------- +# Variables +# Some latin consonants or consonant pairs only occur as initials, and +# some only as finals, but some occur as both. This makes some jamo +# consonants ambiguous when transliterated into latin. +# Initial only: IEUNG BB DD JJ R +# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ +# Initial and Final: B C D G GG H J K M N P S SS T $Gi = ᄀ; $KKi = ᄁ; $Ni = ᄂ; @@ -77,20 +134,81 @@ $Hf = ᇂ; $jamoInitial = [ᄀ-ᄒ]; $jamoMedial = [ᅡ-ᅵ]; $latinInitial = [bcdghjklmnprst]; +# Any character in the latin transliteration of a medial $latinMedial = [aeiouwy]; +# The last character of the latin transliteration of a medial $latinMedialEnd = [aeiou]; +# Disambiguation separator $sep = \-; +#---------------------------------------------------------------------- +# Jamo-Latin +# +# Jamo to latin is relatively simple, since it is the latin that is +# ambiguous. Most rules are straightforward, and we encode them below +# as simple add-on back rule, e.g.: +# $jamoMedial {bs} → $BS; +# becomes +# $jamoMedial {bs} ↔ $BS; +# +# Furthermore, we don't care about the ordering for Jamo-Latin because +# we are going from single characters, so we can very easily piggyback +# on the Latin-Jamo. +# +# The main issue with Jamo-Latin is when to insert separators. +# Separators are inserted to obtain correct round trip behavior. For +# example, the sequence Ki A Gf Gi E, if transliterated to "kagge", +# would then round trip to Ki A GGi E. To prevent this, we insert a +# separator: "kag-ge". IMPORTANT: The need for separators depends +# very specifically on the behavior of the Latin-Jamo rules. A change +# in the Latin-Jamo behavior can completely change the way the +# separator insertion must be done. +# First try to preserve actual separators in the jamo text by doubling +# them. 
This fixes problems like: +# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) =→ dajung-yeongyeol +# =→ (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional +# -- if we don't care about losing separators in the jamo, we can delete +# this rule. $sep $sep ↔ $sep; +# Triple consonants. For three consonants "axxx" we insert a +# separator between the first and second "x" if XXf, Xf, and Xi all +# exist, and we have A Xf XXi. This prevents the reverse +# transliteration to A XXf Xi. $sep ← $latinMedialEnd s {} $SSi; +# For vowels the rule is similar. If there is a vowel "ae" such that +# "a" by itself and "e" by itself are vowels, then we want to map A E +# to "a-e" so as not to round trip to AE. However, in the text Ki EO +# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For +# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be +# tested. NOTE: These rules used to have a left context of +# $latinInitial instead of [^$latinMedial]. The problem with this is +# sequences where an initial IEUNG is transliterated away: +# (IEUNG)(A)(IEUNG)(EO) =→ aeo =→ (IEUNG)(AE)(IEUNG)(O) +# Also problems in cases like gayeo, which needs to be gaye-o +# The hard case is a chain, like aeoeu. Normally interpreted as ae oe u. So for a-eoeu, we have to insert $sep +# But, we don't insert between the o and the e. +# +# a ae +# e eo eu +# i +# o oe +# u +# ui +# wa wae we wi +# yae ya yeo ye yo yu +# These are simple, since they can't chain. Note that we don't handle extreme cases like [ga][eo][e][o] $sep ← a {} [$E $EO $EU]; $sep ← [^aow] e {} [$O $OE]; $sep ← [^aowy] e {} [$U $UI]; $sep ← [^ey] o {} [$E $EO $EU]; $sep ← [^y] u {} [$I]; +# Similar to the above, but with an intervening $IEUNG. $sep ← [^$latinMedial] [y] e {} $IEUNG [$O $OE]; $sep ← [^$latinMedial] e {} $IEUNG [$O $OE $U]; $sep ← [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU]; $sep ← [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU]; +# Single finals followed by IEUNG. 
The jamo sequence A Xf IEUNG E, +# where Xi also exists, must be transliterated as "ax-e" to prevent +# the round trip conversion to A Xi E. $sep ← $latinMedialEnd b {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd d {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd g {} $IEUNG $jamoMedial; @@ -103,6 +221,10 @@ $sep ← $latinMedialEnd p {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd s {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd t {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd l {} $IEUNG $jamoMedial; +# Double finals followed by IEUNG. Similar to the single finals +# followed by IEUNG. Any latin consonant pair X Y, between medials, +# that we would split by Latin-Jamo, we must handle when it occurs as +# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi E $sep ← $latinMedialEnd b s {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd k k {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd g s {} $IEUNG $jamoMedial; @@ -118,9 +240,16 @@ $sep ← $latinMedialEnd n h {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd n j {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd s s {} $IEUNG $jamoMedial; $sep ← $latinMedialEnd ch {} $IEUNG $jamoMedial; +# Split doubles. Text of the form A Xi Xf E, where XXi also occurs, +# we transliterate as "ax-xe" to prevent round trip transliteration as +# A XXi E. $sep ← $latinMedialEnd j {} $Ji $jamoMedial; $sep ← $latinMedialEnd k {} $Ki $jamoMedial; $sep ← $latinMedialEnd s {} $Si $jamoMedial; +# XYY. This corresponds to the XYY rule in Latin-Jamo. By default +# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result, +# "xyy" forms that correspond to XYf Yi must be transliterated as +# "xy-y". 
$sep ← $latinMedialEnd b s {} [$Si $SSi]; $sep ← $latinMedialEnd g s {} [$Si $SSi]; $sep ← $latinMedialEnd l b {} [$Bi]; @@ -128,12 +257,25 @@ $sep ← $latinMedialEnd l g {} [$Gi]; $sep ← $latinMedialEnd l s {} [$Si $SSi]; $sep ← $latinMedialEnd n g {} [$Gi]; $sep ← $latinMedialEnd n j {} [$Ji $JJi]; +# $sep ← $latinMedialEnd l {} [$PPi]; +# $sep ← $latinMedialEnd l {} [$TTi]; $sep ← $latinMedialEnd l p {} [$Pi]; $sep ← $latinMedialEnd l t {} [$Ti]; $sep ← $latinMedialEnd k {} [$KKi $Ki]; $sep ← $latinMedialEnd p {} $Pi; $sep ← $latinMedialEnd t {} $Ti; $sep ← $latinMedialEnd c {} [$Hi]; +# Deletion of IEUNG is handled below. +#---------------------------------------------------------------------- +# Latin-Jamo +# [Basic, context-free Jamo-Latin rules are embedded here too. See +# above.] +# Split digraphs: Text of the form 'axye', where 'xy' is a final +# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and +# 'e' are medials, we want to transliterate this as A Xf Yi E rather +# than A XYf IEUNG E. We do NOT include text of the form "axxe", +# since that is handled differently below. These rules are generated +# programmatically from the jamo data. $jamoMedial {b s} $latinMedial → $Bf $Si; $jamoMedial {g s} $latinMedial → $Gf $Si; $jamoMedial {l b} $latinMedial → $L $Bi; @@ -146,6 +288,9 @@ $jamoMedial {l t} $latinMedial → $L $Ti; $jamoMedial {n g} $latinMedial → $Nf $Gi; $jamoMedial {n h} $latinMedial → $Nf $Hi; $jamoMedial {n j} $latinMedial → $Nf $Ji; +# Single consonants are initials: Text of the form 'axe', where 'x' +# can be an initial or a final, and 'a' and 'e' are medials, we want +# to transliterate as A Xi E rather than A Xf IEUNG E. $jamoMedial {b} $latinMedial → $Bi; $jamoMedial {ch} $latinMedial → $CHi; $jamoMedial {d} $latinMedial → $Di; @@ -159,13 +304,22 @@ $jamoMedial {p} $latinMedial → $Pi; $jamoMedial {s} $latinMedial → $Si; $jamoMedial {t} $latinMedial → $Ti; $jamoMedial {l} $latinMedial → $Li; +# Doubled initials. 
The sequence "axxe", where XX exists as an initial +# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want +# to transliterate as A XXi E, rather than split to A Xf Xi E. $jamoMedial {p p} $latinMedial → $PPi; $jamoMedial {t t} $latinMedial → $TTi; $jamoMedial {j j} $latinMedial → $JJi; $jamoMedial {k k} $latinMedial → $KKi; $jamoMedial {s s} $latinMedial → $SSi; +# XYY. Because doubled consonants bind more strongly than XY +# consonants, we must handle the sequence "axyy" specially. Here XYf +# and YYi must exist. In these cases, we map to Xf YYi rather than +# XYf. +# However, there are two special cases. $jamoMedial {lp} p p → $LP; $jamoMedial {lt} t t → $LT; +# End special cases $jamoMedial {b} s s → $Bf; $jamoMedial {g} s s → $Gf; $jamoMedial {l} b b → $L; @@ -175,6 +329,12 @@ $jamoMedial {l} t t → $L; $jamoMedial {l} p p → $L; $jamoMedial {n} g g → $Nf; $jamoMedial {n} j j → $Nf; +# Finals: Attach consonant with preceding medial to preceding medial. +# Do this BEFORE mapping consonants to initials. Longer keys must +# precede shorter keys that they start with, e.g., the rule for 'bs' +# must precede 'b'. +# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this +# block for Jamo-Latin.] $jamoMedial {bs} ↔ $BS; $jamoMedial {b} ↔ $Bf; $jamoMedial {ch} ↔ $Cf; @@ -202,6 +362,11 @@ $jamoMedial {p} ↔ $Pf; $jamoMedial {ss} ↔ $SSf; $jamoMedial {s} ↔ $Sf; $jamoMedial {t} ↔ $Tf; +# Initials: Attach single consonant to following medial. Do this +# AFTER mapping finals. Longer keys must precede shorter keys that +# they start with, e.g., the rule for 'gg' must precede 'g'. +# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within +# this block for Jamo-Latin.] {kk} $latinMedial ↔ $KKi; {g} $latinMedial ↔ $Gi; {n} $latinMedial ↔ $Ni; @@ -221,6 +386,21 @@ $jamoMedial {t} ↔ $Tf; {t} $latinMedial ↔ $Ti; {p} $latinMedial ↔ $Pi; {h} $latinMedial ↔ $Hi; +# 'r' in final position. 
Because of the equivalency of the 'l' and +# 'r' jamo (the glyphs are the same), we try to provide the same +# equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled +# below. If we see an 'r' in an apparent final position, treat it +# like 'l'. For example, "karka" =→ Ki A R EU Ki A without this rule. +# Instead, we want Ki A L Ki A. +# Initial + Final: If we match the next rule, we have initial then +# final consonant with no intervening medial. We insert the null +# vowel BEFORE it to create a well-formed syllable. (In the next rule +# we insert a null vowel AFTER an anomalous initial.) +# Initial + X: This block matches an initial consonant not followed by +# a medial. We insert the null vowel after it. We handle double +# initials explicitly here; for single initial consonants we insert EU +# (as Latin) after them and let standard rules do the rest. +# BREAKS ROUND TRIP INTEGRITY kk → $KKi $EU; tt → $TTi $EU; pp → $PPi $EU; @@ -228,7 +408,31 @@ ss → $SSi $EU; jj → $JJi $EU; ch → $CHi $EU; ([lbdghjkmnpst]) → | $1 eu; +# X + Final: Finally we have to deal with a consonant that can only be +# interpreted as a final (not an initial) and which is preceded +# neither by an initial nor a medial. It is the start of the +# syllable, but cannot be. Most of these will already be handled by +# the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng' +# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'. +# For this isolated case, we could add a null initial and medial, +# which would give "la" =→ IEUNG EU L IEUNG A, for example. A more +# economical solution is to transliterate isolated "l" (that is, +# initial "l") to "r". (Other similar conversions of consonants that +# occur neither as initials nor as finals are handled below.) l → | r; +# Medials. If a medial is preceded by an initial, then we proceed +# normally. As usual, longer keys must precede shorter ones. +# [BASIC Jamo-Latin MEDIALS handled here. 
Order irrelevant within +# this block for Jamo-Latin.] +# +# a e i o u +# ae +# eo eu +# oe +# ui +# wa we wi +# wae +# yae ya yeo ye yo yu $jamoInitial {ae} ↔ $AE; $jamoInitial {a} ↔ $A; $jamoInitial {eo} ↔ $EO; @@ -250,9 +454,18 @@ $jamoInitial {yeo} ↔ $YEO; $jamoInitial {ye} ↔ $YE; $jamoInitial {yo} ↔ $YO; $jamoInitial {yu} ↔ $YU; +# We may see an anomalous isolated 'w' or 'y'. In that case, we +# interpret it as 'wi' and 'yu', respectively. +# BREAKS ROUND TRIP INTEGRITY $jamoInitial {w} → | wi; $jamoInitial {y} → | yu; +# Otherwise, insert a null consonant IEUNG before the medial (which is +# still an untransliterated latin vowel). ($latinMedial) → $IEUNG | $1; +# Convert non-jamo latin consonants to equivalents. These occur as +# neither initials nor finals in jamo. 'l' occurs as a final, but not +# an initial; it is handled above. The following letters (left hand +# side) will never be output by Jamo-Latin. f → | p; q → | k; v → | b; @@ -260,5 +473,14 @@ x → | ks; z → | s; r → | l; c → | k; +# Delete separators (Latin-Jamo). $sep → ; +# Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels, +# since these may also occur in text. ← $IEUNG; +#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in +#- the INDEX file. This transliterator is, by itself, not +#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or +#- inverses thereof. +# eof + diff --git a/icu4c/source/data/translit/Latin_InterIndic.txt b/icu4c/source/data/translit/Latin_InterIndic.txt index 385d91666e5..be50641ef3e 100644 --- a/icu4c/source/data/translit/Latin_InterIndic.txt +++ b/icu4c/source/data/translit/Latin_InterIndic.txt @@ -1,15 +1,22 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Latin_InterIndic.txt # Generated from CLDR # + +# Latin-InterIndic +#:: NFD; +#\u0E00 reserved +#consonants $chandrabindu=\uE001; $anusvara=\uE002; $visarga=\uE003; +#\u0E004 reserved +# w←vowel→ represents the stand-alone form $wa=\uE005; $waa=\uE006; $wi=\uE007; @@ -64,8 +71,11 @@ $sha=\uE036; $ssa=\uE037; $sa=\uE038; $ha=\uE039; +#\u093A Reserved +#\u093B Reserved $nukta=\uE03C; $avagraha=\uE03D; # SIGN AVAGRAHA +# ←vowel→ represents the dependent form $aa=\uE03E; $i=\uE03F; $ii=\uE040; @@ -82,10 +92,17 @@ $so=\uE04A; # VOWEL SIGN SHORT O $o=\uE04B; # ो $au=\uE04C; $virama=\uE04D; +# \u094E Reserved +# \u094F Reserved $om = \uE050; # OM +# \u0951→; # UNMAPPED STRESS SIGN UDATTA +# \u0952→; # UNMAPPED STRESS SIGN ANUDATTA +# \u0953→; # UNMAPPED GRAVE ACCENT +# \u0954→; # UNMAPPED ACUTE ACCENT $lm = \uE055;# Telugu Length Mark $ailm=\uE056;# AI Length Mark $aulm=\uE057;# AU Length Mark +#urdu compatibity forms $uka=\uE058; $ukha=\uE059; $ugha=\uE05A; @@ -111,6 +128,7 @@ $seven=\uE06D; # DIGIT SEVEN $eight=\uE06E; # DIGIT EIGHT $nine=\uE06F; # DIGIT NINE $dgs=\uE082; +# For all other scripts $ecp0=\uE070; $ecp1=\uE071; $ecp2=\uE072; @@ -127,10 +145,13 @@ $ecpC=\uE07C; $ecpD=\uE07D; $ecpE=\uE07E; $ecpF=\uE07F; +# Khanda-ta $kta=\uE083; +# ॰→; # nothing in Latin maps to InterIndic ABBREVIATION SIGN $depVowelAbove=[\uE03E-\uE040\uE045-\uE04C]; $depVowelBelow=[\uE041-\uE044]; $endThing=[$danda$doubleDanda]; +# $x was originally called '§'; $z was '%' $x=[$virama$aa$ai$au$ii$i$uu$u$rrh$rh$lh$e$o$se$ce$so$co]; $z=[bcdfghjklmnpqrstvwxyz]; $consonants=[[$ka-$ha]$z[क-ह][ক-হ][ਕ-ਹ][ક-હ][କ-ହ][க-ஹ][క-హ][ಕ-ಹ][ക-ഹ]]; @@ -139,6 +160,8 @@ $consonants=[[$ka-$ha]$z[क-ह][ক-হ][ਕ-ਹ][ક-હ][କ-ହ][க-ஹ][ m\u0310→$chandrabindu; h\u0323→$visarga; x→$ka$virama$sa; +# convert to independent forms at start of word or syllable: +# dependent forms for roundtrip \u0314a\u0304→$aa; \u0314ai→$ai; \u0314au→$au; @@ 
-159,6 +182,7 @@ x→$ka$virama$sa; \u0314o\u0306→$co; \u0314e→$se; \u0314o→$so; +# preceded by consonants $consonants{ a\u0304→$aa; $consonants{ ai→$ai; $consonants{ au→$au; @@ -179,6 +203,7 @@ $consonants{ e\u0306→$ce; $consonants{ o\u0306→$co; $consonants{ e→$se; $consonants{ o→$so; +# e.g. keai -→ {ka}{e}{wai}; k'ai -→ {ka}{wai}; (ai) -→ ({wai}) a\u0304→$waa; ai→$wai; au→$wau; @@ -199,6 +224,7 @@ o\u0306→$wco; e→$wse; ''om→$om; o→$wso; +# rules for anusvara n}r\u0325 → $na|$virama; n}l\u0325 → $na|$virama; n}na → $na|$virama; @@ -211,12 +237,14 @@ n}[tdn] → $anusvara; m}[pbm] → $anusvara; n}[ylvshr] → $anusvara; m\u0307 → $anusvara; +#urdu compatibility q→$uka|$virama; k\u0331h\u0331→$ukha |$virama; g\u0307→ $ugha | $virama; z → $ujha |$virama; f → $ufa|$virama; t\u0331→$kta; +# dev y\u0307→$uya|$virama; l\u0331→$ela|$virama; n\u0331→$ena|$virama; @@ -268,15 +296,21 @@ h→$ha|$virama; $danda'.'→$doubleDanda; $depVowelAbove{'~'→$anusvara; $depVowelBelow{'~'→$chandrabindu; +# convert to dependent forms after consonant with no vowel: +# e.g.
kai -→ {ka}{virama}ai -→ {ka}{ai} +#$virama aa→$aa; $virama a\u0304→$aa; $virama ai→$ai; $virama au→$au; $virama ii→$ii; $virama i\u0304→$ii; $virama i→$i; +#$virama uu→$uu; $virama u\u0304→$uu; $virama u→$u; +#$virama rrh→$rrh; $virama r\u0325\u0304→$rrh; +#$virama rh→$rh; $virama r\u0325a→$rh; $virama r\u0325→$rh; $virama l\u0325\u0304→$llh; @@ -289,16 +323,23 @@ $virama e\u0306→$ce; $virama o\u0306→$co; $virama e→$se; $virama o→$so; +# otherwise convert independent forms when separated by ': k'ai -→ {ka}{virama}{wai} +#$virama''aa→$waa; $virama''a\u0304→$waa; $virama''ai→$wai; $virama''au→$wau; +#$virama''ii→$wii; $virama''i\u0304→$wii; $virama''i→$wi; +#$virama''uu→$wuu; $virama''u\u0304→$wuu; $virama''u→$wu; +#$virama''rrh→$wrr; $virama''r\u0325\u0304→$wrr; +#$virama''rh→$wr; $virama''r\u0325→$wr; $virama''l\u0325\u0304→$wll; +#$virama''lh→$wl; $virama''l\u0325→$wl; $virama''e\u0304→$we; $virama''o\u0304→$wo; @@ -307,6 +348,7 @@ $virama''e\u0306→$wce; $virama''o\u0306→$wco; $virama''e→$wse; $virama''o→$wso; +# no virama ''a\u0304→$waa; ''ai→$wai; ''au→$wau; @@ -340,3 +382,5 @@ $virama}$endThing→; 8→$eight; 9→$nine; ''→; +#:: NFC (NFD) ; + diff --git a/icu4c/source/data/translit/Latin_NumericPinyin.txt b/icu4c/source/data/translit/Latin_NumericPinyin.txt index e907d55c6b8..7635bc9364c 100644 --- a/icu4c/source/data/translit/Latin_NumericPinyin.txt +++ b/icu4c/source/data/translit/Latin_NumericPinyin.txt @@ -1,17 +1,32 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Latin_NumericPinyin.txt # Generated from CLDR # + +# According to the pinyin definitions I've been able to find: +# 'a', 'e' are the preferred bases +# otherwise 'o' +# otherwise last vowel +# The trailing form of syllables are the following: +# "a", "ai", "ao", "an", "ang", +# "o", "ou", "ong", +# "e", "ei", "er", "en", "eng", +# "i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong", +# "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng", +# "ü", "üe", "üan", "ün" +# so the letters the tone will 'hop' are: ::NFD (NFC); $tone = [\u0304\u0301\u030C\u0300\u0306] ; +# Move the tone to the end of a syllable, and convert to number e {($tone) r} → r &Pinyin-NumericPinyin($1); ($tone) ( [i o n u {o n} {n g}]) → $2 &Pinyin-NumericPinyin($1); ($tone) → &Pinyin-NumericPinyin($1); +# The following backs up until it finds the right vowel, then deposits the tone $vowel = [aAeEiIoOuU {u\u0308} {U\u0308} vV]; $consonant = [[a-z A-Z] - [$vowel]]; $digit = [1-5]; @@ -20,3 +35,4 @@ $1 &NumericPinyin-Pinyin($3) $2 ← ([oO]) ([$vowel-[aeAE]]* $consonant*) ($digi $1 &NumericPinyin-Pinyin($3) $2 ← ($vowel) ($consonant*) ($digit); &NumericPinyin-Pinyin($1) ← [:letter:] {($digit)}; ::NFC (NFD); + diff --git a/icu4c/source/data/translit/Latin_Armenian.txt b/icu4c/source/data/translit/Latn_Armn.txt similarity index 93% rename from icu4c/source/data/translit/Latin_Armenian.txt rename to icu4c/source/data/translit/Latn_Armn.txt index c7fa26ebfcd..dea13e9cfcf 100644 --- a/icu4c/source/data/translit/Latin_Armenian.txt +++ b/icu4c/source/data/translit/Latn_Armn.txt @@ -1,12 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Armenian.txt +# File: Latn_Armn.txt # Generated from CLDR # + ::NFD(NFC); ev ↔ և ; tʻ ↔ թ ; @@ -89,3 +90,4 @@ W ↔ Ւ ; Ō ↔ Օ ; F ↔ Ֆ ; ::NFC(NFD); + diff --git a/icu4c/source/data/translit/Latin_Bengali.txt b/icu4c/source/data/translit/Latn_Beng.txt similarity index 92% rename from icu4c/source/data/translit/Latin_Bengali.txt rename to icu4c/source/data/translit/Latn_Beng.txt index 50a76139fb0..1187a96fc83 100644 --- a/icu4c/source/data/translit/Latin_Bengali.txt +++ b/icu4c/source/data/translit/Latn_Beng.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Latin_Bengali.txt +# File: Latn_Beng.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Latn_Bopo.txt b/icu4c/source/data/translit/Latn_Bopo.txt new file mode 100644 index 00000000000..b643f835610 --- /dev/null +++ b/icu4c/source/data/translit/Latn_Bopo.txt @@ -0,0 +1,1454 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: Latn_Bopo.txt +# Generated from CLDR +# + +#--- forward filter, transforms ---- +# currently in reverse it only goes back to Latin with numeric tones (not sure why); ok but would rather have marks. +# +:: [[:Latin:][:Mn:][1-5]]; # forward filter: only modifies Latin and tone marks/digits +# +#--- remap v (alternate for ü) to ü, one way ---- +[ln] { v → ü; +# +:: Latin-NumericPinyin(NumericPinyin-Latin); # tone marks in middle ↔ digits at end to use numeric below; this leaves the pinyin in NFC +# +#--- variables ---- +# +# basic pinyin and zhuyin consonant initials (not including vowel initials): +$pCons = [b p m f d t n l g k h j q x r z c s]; # and zh ch sh, covered for this by h already in the set +$zCons = [ㄅ-ㄙ]; +# +# pinyin and zhuyin minus basic consonant initials that cannot take tones by themselves +# (in some cases the exclusion is only for the pinyin, not the corresponding zhuyin: zh ch sh r z c s) +$pToneOK = [[a-z] - [b p f d t l g k h j q x r z c s]]; # and minus zh ch sh, covered for this by h already in the exclusion set +$zToneOK = [[ㄅ-ㄩ] - [ㄅ ㄆ ㄈ ㄉ ㄊ ㄌ ㄍ ㄎ ㄏ ㄐ ㄑ ㄒ]]; +# +# basic consonant initials that can take tones by themselves +# (in some cases this is only for the zhuyin, not the corresponding pinyin: ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ) +# $pConsToneOK = [m n]; +# $zConsToneOK = [ㄇ ㄋ ㄓ ㄔ ㄕ ㄖ ㄗ ㄘ ㄙ]; +# +# basic numeric pinyin and zhuyin tones +$pTone = [1-5]; +$zTone = [ˉˊˇˋ˙]; +# +#--- clusters with initial consonant ---- +# +bang }$pTone ↔ ㄅㄤ }$zTone; +bang → ㄅㄤ˙; +bang1 ← ㄅㄤ; +beng }$pTone ↔ ㄅㄥ }$zTone; +beng → ㄅㄥ˙; +beng1 ← ㄅㄥ; +biao }$pTone ↔ ㄅㄧㄠ }$zTone; +biao → ㄅㄧㄠ˙; +biao1 ← ㄅㄧㄠ; +bian }$pTone ↔ ㄅㄧㄢ }$zTone; +bian → ㄅㄧㄢ˙; +bian1 ← ㄅㄧㄢ; +bing }$pTone ↔ ㄅㄧㄥ }$zTone; +bing → ㄅㄧㄥ˙; +bing1 ← ㄅㄧㄥ; +bai }$pTone ↔ ㄅㄞ }$zTone; +bai → ㄅㄞ˙; +bai1 ← ㄅㄞ; +bei }$pTone ↔ ㄅㄟ }$zTone; +bei → ㄅㄟ˙; +bei1 ← ㄅㄟ; +bao }$pTone ↔ ㄅㄠ }$zTone; +bao → ㄅㄠ˙; +bao1 ← ㄅㄠ; +ban }$pTone ↔ ㄅㄢ 
}$zTone; +ban → ㄅㄢ˙; +ban1 ← ㄅㄢ; +ben }$pTone ↔ ㄅㄣ }$zTone; +ben → ㄅㄣ˙; +ben1 ← ㄅㄣ; +bie }$pTone ↔ ㄅㄧㄝ }$zTone; +bie → ㄅㄧㄝ˙; +bie1 ← ㄅㄧㄝ; +bin }$pTone ↔ ㄅㄧㄣ }$zTone; +bin → ㄅㄧㄣ˙; +bin1 ← ㄅㄧㄣ; +bun }$pTone ↔ ㄅㄨㄣ }$zTone; +bun → ㄅㄨㄣ˙; +bun1 ← ㄅㄨㄣ; +ba }$pTone ↔ ㄅㄚ }$zTone; +ba → ㄅㄚ˙; +ba1 ← ㄅㄚ; +bo }$pTone ↔ ㄅㄛ }$zTone; +bo → ㄅㄛ˙; +bo1 ← ㄅㄛ; +bi }$pTone ↔ ㄅㄧ }$zTone; +bi → ㄅㄧ˙; +bi1 ← ㄅㄧ; +bu }$pTone ↔ ㄅㄨ }$zTone; +bu → ㄅㄨ˙; +bu1 ← ㄅㄨ; +# +pang }$pTone ↔ ㄆㄤ }$zTone; +pang → ㄆㄤ˙; +pang1 ← ㄆㄤ; +peng }$pTone ↔ ㄆㄥ }$zTone; +peng → ㄆㄥ˙; +peng1 ← ㄆㄥ; +piao }$pTone ↔ ㄆㄧㄠ }$zTone; +piao → ㄆㄧㄠ˙; +piao1 ← ㄆㄧㄠ; +pian }$pTone ↔ ㄆㄧㄢ }$zTone; +pian → ㄆㄧㄢ˙; +pian1 ← ㄆㄧㄢ; +ping }$pTone ↔ ㄆㄧㄥ }$zTone; +ping → ㄆㄧㄥ˙; +ping1 ← ㄆㄧㄥ; +pai }$pTone ↔ ㄆㄞ }$zTone; +pai → ㄆㄞ˙; +pai1 ← ㄆㄞ; +pei }$pTone ↔ ㄆㄟ }$zTone; +pei → ㄆㄟ˙; +pei1 ← ㄆㄟ; +pao }$pTone ↔ ㄆㄠ }$zTone; +pao → ㄆㄠ˙; +pao1 ← ㄆㄠ; +pou }$pTone ↔ ㄆㄡ }$zTone; +pou → ㄆㄡ˙; +pou1 ← ㄆㄡ; +pan }$pTone ↔ ㄆㄢ }$zTone; +pan → ㄆㄢ˙; +pan1 ← ㄆㄢ; +pen }$pTone ↔ ㄆㄣ }$zTone; +pen → ㄆㄣ˙; +pen1 ← ㄆㄣ; +pie }$pTone ↔ ㄆㄧㄝ }$zTone; +pie → ㄆㄧㄝ˙; +pie1 ← ㄆㄧㄝ; +pin }$pTone ↔ ㄆㄧㄣ }$zTone; +pin → ㄆㄧㄣ˙; +pin1 ← ㄆㄧㄣ; +pa }$pTone ↔ ㄆㄚ }$zTone; +pa → ㄆㄚ˙; +pa1 ← ㄆㄚ; +po }$pTone ↔ ㄆㄛ }$zTone; +po → ㄆㄛ˙; +po1 ← ㄆㄛ; +pi }$pTone ↔ ㄆㄧ }$zTone; +pi → ㄆㄧ˙; +pi1 ← ㄆㄧ; +pu }$pTone ↔ ㄆㄨ }$zTone; +pu → ㄆㄨ˙; +pu1 ← ㄆㄨ; +# +mang }$pTone ↔ ㄇㄤ }$zTone; +mang → ㄇㄤ˙; +mang1 ← ㄇㄤ; +meng }$pTone ↔ ㄇㄥ }$zTone; +meng → ㄇㄥ˙; +meng1 ← ㄇㄥ; +miao }$pTone ↔ ㄇㄧㄠ }$zTone; +miao → ㄇㄧㄠ˙; +miao1 ← ㄇㄧㄠ; +mian }$pTone ↔ ㄇㄧㄢ }$zTone; +mian → ㄇㄧㄢ˙; +mian1 ← ㄇㄧㄢ; +ming }$pTone ↔ ㄇㄧㄥ }$zTone; +ming → ㄇㄧㄥ˙; +ming1 ← ㄇㄧㄥ; +mai }$pTone ↔ ㄇㄞ }$zTone; +mai → ㄇㄞ˙; +mai1 ← ㄇㄞ; +mei }$pTone ↔ ㄇㄟ }$zTone; +mei → ㄇㄟ˙; +mei1 ← ㄇㄟ; +mao }$pTone ↔ ㄇㄠ }$zTone; +mao → ㄇㄠ˙; +mao1 ← ㄇㄠ; +mou }$pTone ↔ ㄇㄡ }$zTone; +mou → ㄇㄡ˙; +mou1 ← ㄇㄡ; +man }$pTone ↔ ㄇㄢ }$zTone; +man → ㄇㄢ˙; +man1 ← ㄇㄢ; +men }$pTone ↔ ㄇㄣ }$zTone; +men → ㄇㄣ˙; +men1 ← ㄇㄣ; +mie }$pTone ↔ ㄇㄧㄝ }$zTone; +mie → ㄇㄧㄝ˙; +mie1 ← ㄇㄧㄝ; +miu }$pTone ↔ ㄇㄧㄡ }$zTone; 
+miu → ㄇㄧㄡ˙; +miu1 ← ㄇㄧㄡ; +min }$pTone ↔ ㄇㄧㄣ }$zTone; +min → ㄇㄧㄣ˙; +min1 ← ㄇㄧㄣ; +ma }$pTone ↔ ㄇㄚ }$zTone; +ma → ㄇㄚ˙; +ma1 ← ㄇㄚ; +mo }$pTone ↔ ㄇㄛ }$zTone; +mo → ㄇㄛ˙; +mo1 ← ㄇㄛ; +me }$pTone ↔ ㄇㄜ }$zTone; +me → ㄇㄜ˙; +me1 ← ㄇㄜ; +mi }$pTone ↔ ㄇㄧ }$zTone; +mi → ㄇㄧ˙; +mi1 ← ㄇㄧ; +mu }$pTone ↔ ㄇㄨ }$zTone; +mu → ㄇㄨ˙; +mu1 ← ㄇㄨ; +# m handled below +# +fang }$pTone ↔ ㄈㄤ }$zTone; +fang → ㄈㄤ˙; +fang1 ← ㄈㄤ; +feng }$pTone ↔ ㄈㄥ }$zTone; +feng → ㄈㄥ˙; +feng1 ← ㄈㄥ; +fiao }$pTone ↔ ㄈㄧㄠ }$zTone; +fiao → ㄈㄧㄠ˙; +fiao1 ← ㄈㄧㄠ; +fei }$pTone ↔ ㄈㄟ }$zTone; +fei → ㄈㄟ˙; +fei1 ← ㄈㄟ; +fou }$pTone ↔ ㄈㄡ }$zTone; +fou → ㄈㄡ˙; +fou1 ← ㄈㄡ; +fan }$pTone ↔ ㄈㄢ }$zTone; +fan → ㄈㄢ˙; +fan1 ← ㄈㄢ; +fen }$pTone ↔ ㄈㄣ }$zTone; +fen → ㄈㄣ˙; +fen1 ← ㄈㄣ; +fa }$pTone ↔ ㄈㄚ }$zTone; +fa → ㄈㄚ˙; +fa1 ← ㄈㄚ; +fo }$pTone ↔ ㄈㄛ }$zTone; +fo → ㄈㄛ˙; +fo1 ← ㄈㄛ; +fu }$pTone ↔ ㄈㄨ }$zTone; +fu → ㄈㄨ˙; +fu1 ← ㄈㄨ; +# +diang }$pTone ↔ ㄉㄧㄤ }$zTone; # (not in han-latin) +diang → ㄉㄧㄤ˙; +diang1 ← ㄉㄧㄤ; +dang }$pTone ↔ ㄉㄤ }$zTone; +dang → ㄉㄤ˙; +dang1 ← ㄉㄤ; +deng }$pTone ↔ ㄉㄥ }$zTone; +deng → ㄉㄥ˙; +deng1 ← ㄉㄥ; +diao }$pTone ↔ ㄉㄧㄠ }$zTone; +diao → ㄉㄧㄠ˙; +diao1 ← ㄉㄧㄠ; +dian }$pTone ↔ ㄉㄧㄢ }$zTone; +dian → ㄉㄧㄢ˙; +dian1 ← ㄉㄧㄢ; +ding }$pTone ↔ ㄉㄧㄥ }$zTone; +ding → ㄉㄧㄥ˙; +ding1 ← ㄉㄧㄥ; +duan }$pTone ↔ ㄉㄨㄢ }$zTone; +duan → ㄉㄨㄢ˙; +duan1 ← ㄉㄨㄢ; +dong }$pTone ↔ ㄉㄨㄥ }$zTone; +dong → ㄉㄨㄥ˙; +dong1 ← ㄉㄨㄥ; +dai }$pTone ↔ ㄉㄞ }$zTone; +dai → ㄉㄞ˙; +dai1 ← ㄉㄞ; +dei }$pTone ↔ ㄉㄟ }$zTone; # (not in han-latin) +dei → ㄉㄟ˙; +dei1 ← ㄉㄟ; +dao }$pTone ↔ ㄉㄠ }$zTone; +dao → ㄉㄠ˙; +dao1 ← ㄉㄠ; +dou }$pTone ↔ ㄉㄡ }$zTone; +dou → ㄉㄡ˙; +dou1 ← ㄉㄡ; +dan }$pTone ↔ ㄉㄢ }$zTone; +dan → ㄉㄢ˙; +dan1 ← ㄉㄢ; +den }$pTone ↔ ㄉㄣ }$zTone; +den → ㄉㄣ˙; +den1 ← ㄉㄣ; +dia }$pTone ↔ ㄉㄧㄚ }$zTone; +dia → ㄉㄧㄚ˙; +dia1 ← ㄉㄧㄚ; +die }$pTone ↔ ㄉㄧㄝ }$zTone; +die → ㄉㄧㄝ˙; +die1 ← ㄉㄧㄝ; +diu }$pTone ↔ ㄉㄧㄡ }$zTone; +diu → ㄉㄧㄡ˙; +diu1 ← ㄉㄧㄡ; +din }$pTone ↔ ㄉㄧㄣ }$zTone; +din → ㄉㄧㄣ˙; +din1 ← ㄉㄧㄣ; +duo }$pTone ↔ ㄉㄨㄛ }$zTone; +duo → ㄉㄨㄛ˙; +duo1 ← ㄉㄨㄛ; +dui }$pTone ↔ ㄉㄨㄟ }$zTone; +dui → ㄉㄨㄟ˙; +dui1 ← ㄉㄨㄟ; +dun }$pTone ↔ 
ㄉㄨㄣ }$zTone; +dun → ㄉㄨㄣ˙; +dun1 ← ㄉㄨㄣ; +da }$pTone ↔ ㄉㄚ }$zTone; +da → ㄉㄚ˙; +da1 ← ㄉㄚ; +de }$pTone ↔ ㄉㄜ }$zTone; +de → ㄉㄜ˙; +de1 ← ㄉㄜ; +di }$pTone ↔ ㄉㄧ }$zTone; +di → ㄉㄧ˙; +di1 ← ㄉㄧ; +du }$pTone ↔ ㄉㄨ }$zTone; +du → ㄉㄨ˙; +du1 ← ㄉㄨ; +# +tang }$pTone ↔ ㄊㄤ }$zTone; +tang → ㄊㄤ˙; +tang1 ← ㄊㄤ; +teng }$pTone ↔ ㄊㄥ }$zTone; +teng → ㄊㄥ˙; +teng1 ← ㄊㄥ; +tiao }$pTone ↔ ㄊㄧㄠ }$zTone; +tiao → ㄊㄧㄠ˙; +tiao1 ← ㄊㄧㄠ; +tian }$pTone ↔ ㄊㄧㄢ }$zTone; +tian → ㄊㄧㄢ˙; +tian1 ← ㄊㄧㄢ; +ting }$pTone ↔ ㄊㄧㄥ }$zTone; +ting → ㄊㄧㄥ˙; +ting1 ← ㄊㄧㄥ; +tuan }$pTone ↔ ㄊㄨㄢ }$zTone; +tuan → ㄊㄨㄢ˙; +tuan1 ← ㄊㄨㄢ; +tong }$pTone ↔ ㄊㄨㄥ }$zTone; +tong → ㄊㄨㄥ˙; +tong1 ← ㄊㄨㄥ; +tai }$pTone ↔ ㄊㄞ }$zTone; +tai → ㄊㄞ˙; +tai1 ← ㄊㄞ; +tao }$pTone ↔ ㄊㄠ }$zTone; +tao → ㄊㄠ˙; +tao1 ← ㄊㄠ; +tou }$pTone ↔ ㄊㄡ }$zTone; +tou → ㄊㄡ˙; +tou1 ← ㄊㄡ; +tan }$pTone ↔ ㄊㄢ }$zTone; +tan → ㄊㄢ˙; +tan1 ← ㄊㄢ; +tie }$pTone ↔ ㄊㄧㄝ }$zTone; +tie → ㄊㄧㄝ˙; +tie1 ← ㄊㄧㄝ; +tuo }$pTone ↔ ㄊㄨㄛ }$zTone; +tuo → ㄊㄨㄛ˙; +tuo1 ← ㄊㄨㄛ; +tui }$pTone ↔ ㄊㄨㄟ }$zTone; +tui → ㄊㄨㄟ˙; +tui1 ← ㄊㄨㄟ; +tun }$pTone ↔ ㄊㄨㄣ }$zTone; +tun → ㄊㄨㄣ˙; +tun1 ← ㄊㄨㄣ; +ta }$pTone ↔ ㄊㄚ }$zTone; +ta → ㄊㄚ˙; +ta1 ← ㄊㄚ; +te }$pTone ↔ ㄊㄜ }$zTone; +te → ㄊㄜ˙; +te1 ← ㄊㄜ; +ti }$pTone ↔ ㄊㄧ }$zTone; +ti → ㄊㄧ˙; +ti1 ← ㄊㄧ; +tu }$pTone ↔ ㄊㄨ }$zTone; +tu → ㄊㄨ˙; +tu1 ← ㄊㄨ; +# +niang }$pTone ↔ ㄋㄧㄤ }$zTone; +niang → ㄋㄧㄤ˙; +niang1 ← ㄋㄧㄤ; +nang }$pTone ↔ ㄋㄤ }$zTone; +nang → ㄋㄤ˙; +nang1 ← ㄋㄤ; +neng }$pTone ↔ ㄋㄥ }$zTone; +neng → ㄋㄥ˙; +neng1 ← ㄋㄥ; +niao }$pTone ↔ ㄋㄧㄠ }$zTone; +niao → ㄋㄧㄠ˙; +niao1 ← ㄋㄧㄠ; +nian }$pTone ↔ ㄋㄧㄢ }$zTone; +nian → ㄋㄧㄢ˙; +nian1 ← ㄋㄧㄢ; +ning }$pTone ↔ ㄋㄧㄥ }$zTone; +ning → ㄋㄧㄥ˙; +ning1 ← ㄋㄧㄥ; +nuan }$pTone ↔ ㄋㄨㄢ }$zTone; +nuan → ㄋㄨㄢ˙; +nuan1 ← ㄋㄨㄢ; +nong }$pTone ↔ ㄋㄨㄥ }$zTone; +nong → ㄋㄨㄥ˙; +nong1 ← ㄋㄨㄥ; +nai }$pTone ↔ ㄋㄞ }$zTone; +nai → ㄋㄞ˙; +nai1 ← ㄋㄞ; +nei }$pTone ↔ ㄋㄟ }$zTone; +nei → ㄋㄟ˙; +nei1 ← ㄋㄟ; +nao }$pTone ↔ ㄋㄠ }$zTone; +nao → ㄋㄠ˙; +nao1 ← ㄋㄠ; +nou }$pTone ↔ ㄋㄡ }$zTone; +nou → ㄋㄡ˙; +nou1 ← ㄋㄡ; +nan }$pTone ↔ ㄋㄢ }$zTone; +nan → ㄋㄢ˙; +nan1 ← ㄋㄢ; +nen }$pTone ↔ ㄋㄣ }$zTone; +nen → ㄋㄣ˙; +nen1 ← 
ㄋㄣ; +nia }$pTone ↔ ㄋㄧㄚ }$zTone; # (not in han-latin) +nia → ㄋㄧㄚ˙; +nia1 ← ㄋㄧㄚ; +nie }$pTone ↔ ㄋㄧㄝ }$zTone; +nie → ㄋㄧㄝ˙; +nie1 ← ㄋㄧㄝ; +niu }$pTone ↔ ㄋㄧㄡ }$zTone; +niu → ㄋㄧㄡ˙; +niu1 ← ㄋㄧㄡ; +nin }$pTone ↔ ㄋㄧㄣ }$zTone; +nin → ㄋㄧㄣ˙; +nin1 ← ㄋㄧㄣ; +nuo }$pTone ↔ ㄋㄨㄛ }$zTone; +nuo → ㄋㄨㄛ˙; +nuo1 ← ㄋㄨㄛ; +nun }$pTone ↔ ㄋㄨㄣ }$zTone; +nun → ㄋㄨㄣ˙; +nun1 ← ㄋㄨㄣ; +nüe }$pTone ↔ ㄋㄩㄝ }$zTone; +nüe → ㄋㄩㄝ˙; +nüe1 ← ㄋㄩㄝ; +nue }$pTone → ㄋㄩㄝ; # (not in han-latin) one-way, handle wrong u +nue → ㄋㄩㄝ˙; +na }$pTone ↔ ㄋㄚ }$zTone; +na → ㄋㄚ˙; +na1 ← ㄋㄚ; +ne }$pTone ↔ ㄋㄜ }$zTone; +ne → ㄋㄜ˙; +ne1 ← ㄋㄜ; +ni }$pTone ↔ ㄋㄧ }$zTone; +ni → ㄋㄧ˙; +ni1 ← ㄋㄧ; +nu }$pTone ↔ ㄋㄨ }$zTone; +nu → ㄋㄨ˙; +nu1 ← ㄋㄨ; +nü }$pTone ↔ ㄋㄩ }$zTone; +nü → ㄋㄩ˙; +nü1 ← ㄋㄩ; +# n handled below +# +liang }$pTone ↔ ㄌㄧㄤ }$zTone; +liang → ㄌㄧㄤ˙; +liang1 ← ㄌㄧㄤ; +lang }$pTone ↔ ㄌㄤ }$zTone; +lang → ㄌㄤ˙; +lang1 ← ㄌㄤ; +leng }$pTone ↔ ㄌㄥ }$zTone; +leng → ㄌㄥ˙; +leng1 ← ㄌㄥ; +liao }$pTone ↔ ㄌㄧㄠ }$zTone; +liao → ㄌㄧㄠ˙; +liao1 ← ㄌㄧㄠ; +lian }$pTone ↔ ㄌㄧㄢ }$zTone; +lian → ㄌㄧㄢ˙; +lian1 ← ㄌㄧㄢ; +ling }$pTone ↔ ㄌㄧㄥ }$zTone; +ling → ㄌㄧㄥ˙; +ling1 ← ㄌㄧㄥ; +luan }$pTone ↔ ㄌㄨㄢ }$zTone; +luan → ㄌㄨㄢ˙; +luan1 ← ㄌㄨㄢ; +long }$pTone ↔ ㄌㄨㄥ }$zTone; +long → ㄌㄨㄥ˙; +long1 ← ㄌㄨㄥ; +lüan }$pTone ↔ ㄌㄩㄢ }$zTone; # (not in han-latin) +lüan → ㄌㄩㄢ˙; +lüan1 ← ㄌㄩㄢ; +lai }$pTone ↔ ㄌㄞ }$zTone; +lai → ㄌㄞ˙; +lai1 ← ㄌㄞ; +lei }$pTone ↔ ㄌㄟ }$zTone; +lei → ㄌㄟ˙; +lei1 ← ㄌㄟ; +lao }$pTone ↔ ㄌㄠ }$zTone; +lao → ㄌㄠ˙; +lao1 ← ㄌㄠ; +lou }$pTone ↔ ㄌㄡ }$zTone; +lou → ㄌㄡ˙; +lou1 ← ㄌㄡ; +lan }$pTone ↔ ㄌㄢ }$zTone; +lan → ㄌㄢ˙; +lan1 ← ㄌㄢ; +lia }$pTone ↔ ㄌㄧㄚ }$zTone; +lia → ㄌㄧㄚ˙; +lia1 ← ㄌㄧㄚ; +lie }$pTone ↔ ㄌㄧㄝ }$zTone; +lie → ㄌㄧㄝ˙; +lie1 ← ㄌㄧㄝ; +liu }$pTone ↔ ㄌㄧㄡ }$zTone; +liu → ㄌㄧㄡ˙; +liu1 ← ㄌㄧㄡ; +lin }$pTone ↔ ㄌㄧㄣ }$zTone; +lin → ㄌㄧㄣ˙; +lin1 ← ㄌㄧㄣ; +luo }$pTone ↔ ㄌㄨㄛ }$zTone; +luo → ㄌㄨㄛ˙; +luo1 ← ㄌㄨㄛ; +lun }$pTone ↔ ㄌㄨㄣ }$zTone; +lun → ㄌㄨㄣ˙; +lun1 ← ㄌㄨㄣ; +lüe }$pTone ↔ ㄌㄩㄝ }$zTone; +lüe → ㄌㄩㄝ˙; +lüe1 ← ㄌㄩㄝ; +lue }$pTone → ㄌㄩㄝ; # (not in han-latin) one-way, handle wrong u +lue → ㄌㄩㄝ˙; +la }$pTone ↔ 
ㄌㄚ }$zTone; +la → ㄌㄚ˙; +la1 ← ㄌㄚ; +lo }$pTone ↔ ㄌㄛ }$zTone; +lo → ㄌㄛ˙; +lo1 ← ㄌㄛ; +le }$pTone ↔ ㄌㄜ }$zTone; +le → ㄌㄜ˙; +le1 ← ㄌㄜ; +li }$pTone ↔ ㄌㄧ }$zTone; +li → ㄌㄧ˙; +li1 ← ㄌㄧ; +lu }$pTone ↔ ㄌㄨ }$zTone; +lu → ㄌㄨ˙; +lu1 ← ㄌㄨ; +lü }$pTone ↔ ㄌㄩ }$zTone; +lü → ㄌㄩ˙; +lü1 ← ㄌㄩ; +# +guang }$pTone ↔ ㄍㄨㄤ }$zTone; +guang → ㄍㄨㄤ˙; +guang1 ← ㄍㄨㄤ; +gang }$pTone ↔ ㄍㄤ }$zTone; +gang → ㄍㄤ˙; +gang1 ← ㄍㄤ; +geng }$pTone ↔ ㄍㄥ }$zTone; +geng → ㄍㄥ˙; +geng1 ← ㄍㄥ; +guai }$pTone ↔ ㄍㄨㄞ }$zTone; +guai → ㄍㄨㄞ˙; +guai1 ← ㄍㄨㄞ; +guan }$pTone ↔ ㄍㄨㄢ }$zTone; +guan → ㄍㄨㄢ˙; +guan1 ← ㄍㄨㄢ; +gong }$pTone ↔ ㄍㄨㄥ }$zTone; +gong → ㄍㄨㄥ˙; +gong1 ← ㄍㄨㄥ; +gai }$pTone ↔ ㄍㄞ }$zTone; +gai → ㄍㄞ˙; +gai1 ← ㄍㄞ; +gei }$pTone ↔ ㄍㄟ }$zTone; +gei → ㄍㄟ˙; +gei1 ← ㄍㄟ; +gao }$pTone ↔ ㄍㄠ }$zTone; +gao → ㄍㄠ˙; +gao1 ← ㄍㄠ; +gou }$pTone ↔ ㄍㄡ }$zTone; +gou → ㄍㄡ˙; +gou1 ← ㄍㄡ; +gan }$pTone ↔ ㄍㄢ }$zTone; +gan → ㄍㄢ˙; +gan1 ← ㄍㄢ; +gen }$pTone ↔ ㄍㄣ }$zTone; +gen → ㄍㄣ˙; +gen1 ← ㄍㄣ; +gua }$pTone ↔ ㄍㄨㄚ }$zTone; +gua → ㄍㄨㄚ˙; +gua1 ← ㄍㄨㄚ; +guo }$pTone ↔ ㄍㄨㄛ }$zTone; +guo → ㄍㄨㄛ˙; +guo1 ← ㄍㄨㄛ; +gui }$pTone ↔ ㄍㄨㄟ }$zTone; +gui → ㄍㄨㄟ˙; +gui1 ← ㄍㄨㄟ; +gun }$pTone ↔ ㄍㄨㄣ }$zTone; +gun → ㄍㄨㄣ˙; +gun1 ← ㄍㄨㄣ; +ga }$pTone ↔ ㄍㄚ }$zTone; +ga → ㄍㄚ˙; +ga1 ← ㄍㄚ; +ge }$pTone ↔ ㄍㄜ }$zTone; +ge → ㄍㄜ˙; +ge1 ← ㄍㄜ; +gi }$pTone ↔ ㄍㄧ }$zTone; +gi → ㄍㄧ˙; +gi1 ← ㄍㄧ; +gu }$pTone ↔ ㄍㄨ }$zTone; +gu → ㄍㄨ˙; +gu1 ← ㄍㄨ; +# +kuang }$pTone ↔ ㄎㄨㄤ }$zTone; +kuang → ㄎㄨㄤ˙; +kuang1 ← ㄎㄨㄤ; +kang }$pTone ↔ ㄎㄤ }$zTone; +kang → ㄎㄤ˙; +kang1 ← ㄎㄤ; +keng }$pTone ↔ ㄎㄥ }$zTone; +keng → ㄎㄥ˙; +keng1 ← ㄎㄥ; +kuai }$pTone ↔ ㄎㄨㄞ }$zTone; +kuai → ㄎㄨㄞ˙; +kuai1 ← ㄎㄨㄞ; +kuan }$pTone ↔ ㄎㄨㄢ }$zTone; +kuan → ㄎㄨㄢ˙; +kuan1 ← ㄎㄨㄢ; +kong }$pTone ↔ ㄎㄨㄥ }$zTone; +kong → ㄎㄨㄥ˙; +kong1 ← ㄎㄨㄥ; +kai }$pTone ↔ ㄎㄞ }$zTone; +kai → ㄎㄞ˙; +kai1 ← ㄎㄞ; +kao }$pTone ↔ ㄎㄠ }$zTone; +kao → ㄎㄠ˙; +kao1 ← ㄎㄠ; +kou }$pTone ↔ ㄎㄡ }$zTone; +kou → ㄎㄡ˙; +kou1 ← ㄎㄡ; +kan }$pTone ↔ ㄎㄢ }$zTone; +kan → ㄎㄢ˙; +kan1 ← ㄎㄢ; +ken }$pTone ↔ ㄎㄣ }$zTone; +ken → ㄎㄣ˙; +ken1 ← ㄎㄣ; +kua }$pTone ↔ ㄎㄨㄚ }$zTone; +kua → ㄎㄨㄚ˙; +kua1 ← ㄎㄨㄚ; +kuo }$pTone ↔ ㄎㄨㄛ 
}$zTone; +kuo → ㄎㄨㄛ˙; +kuo1 ← ㄎㄨㄛ; +kui }$pTone ↔ ㄎㄨㄟ }$zTone; +kui → ㄎㄨㄟ˙; +kui1 ← ㄎㄨㄟ; +kun }$pTone ↔ ㄎㄨㄣ }$zTone; +kun → ㄎㄨㄣ˙; +kun1 ← ㄎㄨㄣ; +ka }$pTone ↔ ㄎㄚ }$zTone; +ka → ㄎㄚ˙; +ka1 ← ㄎㄚ; +ke }$pTone ↔ ㄎㄜ }$zTone; +ke → ㄎㄜ˙; +ke1 ← ㄎㄜ; +ku }$pTone ↔ ㄎㄨ }$zTone; +ku → ㄎㄨ˙; +ku1 ← ㄎㄨ; +# +huang }$pTone ↔ ㄏㄨㄤ }$zTone; +huang → ㄏㄨㄤ˙; +huang1 ← ㄏㄨㄤ; +hang }$pTone ↔ ㄏㄤ }$zTone; +hang → ㄏㄤ˙; +hang1 ← ㄏㄤ; +heng }$pTone ↔ ㄏㄥ }$zTone; +heng → ㄏㄥ˙; +heng1 ← ㄏㄥ; +huai }$pTone ↔ ㄏㄨㄞ }$zTone; +huai → ㄏㄨㄞ˙; +huai1 ← ㄏㄨㄞ; +huan }$pTone ↔ ㄏㄨㄢ }$zTone; +huan → ㄏㄨㄢ˙; +huan1 ← ㄏㄨㄢ; +hong }$pTone ↔ ㄏㄨㄥ }$zTone; +hong → ㄏㄨㄥ˙; +hong1 ← ㄏㄨㄥ; +hai }$pTone ↔ ㄏㄞ }$zTone; +hai → ㄏㄞ˙; +hai1 ← ㄏㄞ; +hei }$pTone ↔ ㄏㄟ }$zTone; +hei → ㄏㄟ˙; +hei1 ← ㄏㄟ; +hao }$pTone ↔ ㄏㄠ }$zTone; +hao → ㄏㄠ˙; +hao1 ← ㄏㄠ; +hou }$pTone ↔ ㄏㄡ }$zTone; +hou → ㄏㄡ˙; +hou1 ← ㄏㄡ; +han }$pTone ↔ ㄏㄢ }$zTone; +han → ㄏㄢ˙; +han1 ← ㄏㄢ; +hen }$pTone ↔ ㄏㄣ }$zTone; +hen → ㄏㄣ˙; +hen1 ← ㄏㄣ; +hua }$pTone ↔ ㄏㄨㄚ }$zTone; +hua → ㄏㄨㄚ˙; +hua1 ← ㄏㄨㄚ; +huo }$pTone ↔ ㄏㄨㄛ }$zTone; +huo → ㄏㄨㄛ˙; +huo1 ← ㄏㄨㄛ; +hui }$pTone ↔ ㄏㄨㄟ }$zTone; +hui → ㄏㄨㄟ˙; +hui1 ← ㄏㄨㄟ; +hun }$pTone ↔ ㄏㄨㄣ }$zTone; +hun → ㄏㄨㄣ˙; +hun1 ← ㄏㄨㄣ; +hm }$pTone ↔ ㄏㄇ }$zTone; +hm → ㄏㄇ˙; +hm1 ← ㄏㄇ; +ha }$pTone ↔ ㄏㄚ }$zTone; +ha → ㄏㄚ˙; +ha1 ← ㄏㄚ; +ho }$pTone ↔ ㄏㄛ }$zTone; +ho → ㄏㄛ˙; +ho1 ← ㄏㄛ; +he }$pTone ↔ ㄏㄜ }$zTone; +he → ㄏㄜ˙; +he1 ← ㄏㄜ; +hu }$pTone ↔ ㄏㄨ }$zTone; +hu → ㄏㄨ˙; +hu1 ← ㄏㄨ; +# +jiang }$pTone ↔ ㄐㄧㄤ }$zTone; +jiang → ㄐㄧㄤ˙; +jiang1 ← ㄐㄧㄤ; +jiong }$pTone ↔ ㄐㄩㄥ }$zTone; +jiong → ㄐㄩㄥ˙; +jiong1 ← ㄐㄩㄥ; +jiao }$pTone ↔ ㄐㄧㄠ }$zTone; +jiao → ㄐㄧㄠ˙; +jiao1 ← ㄐㄧㄠ; +jian }$pTone ↔ ㄐㄧㄢ }$zTone; +jian → ㄐㄧㄢ˙; +jian1 ← ㄐㄧㄢ; +jing }$pTone ↔ ㄐㄧㄥ }$zTone; +jing → ㄐㄧㄥ˙; +jing1 ← ㄐㄧㄥ; +juan }$pTone ↔ ㄐㄩㄢ }$zTone; +juan → ㄐㄩㄢ˙; +juan1 ← ㄐㄩㄢ; +jia }$pTone ↔ ㄐㄧㄚ }$zTone; +jia → ㄐㄧㄚ˙; +jia1 ← ㄐㄧㄚ; +jie }$pTone ↔ ㄐㄧㄝ }$zTone; +jie → ㄐㄧㄝ˙; +jie1 ← ㄐㄧㄝ; +jiu }$pTone ↔ ㄐㄧㄡ }$zTone; +jiu → ㄐㄧㄡ˙; +jiu1 ← ㄐㄧㄡ; +jin }$pTone ↔ ㄐㄧㄣ }$zTone; +jin → ㄐㄧㄣ˙; +jin1 ← ㄐㄧㄣ; +jue }$pTone ↔ ㄐㄩㄝ }$zTone; +jue → ㄐㄩㄝ˙; 
+jue1 ← ㄐㄩㄝ; +jun }$pTone ↔ ㄐㄩㄣ }$zTone; +jun → ㄐㄩㄣ˙; +jun1 ← ㄐㄩㄣ; +ji }$pTone ↔ ㄐㄧ }$zTone; +ji → ㄐㄧ˙; +ji1 ← ㄐㄧ; +ju }$pTone ↔ ㄐㄩ }$zTone; +ju → ㄐㄩ˙; +ju1 ← ㄐㄩ; +# +qiang }$pTone ↔ ㄑㄧㄤ }$zTone; +qiang → ㄑㄧㄤ˙; +qiang1 ← ㄑㄧㄤ; +qiong }$pTone ↔ ㄑㄩㄥ }$zTone; +qiong → ㄑㄩㄥ˙; +qiong1 ← ㄑㄩㄥ; +qiao }$pTone ↔ ㄑㄧㄠ }$zTone; +qiao → ㄑㄧㄠ˙; +qiao1 ← ㄑㄧㄠ; +qian }$pTone ↔ ㄑㄧㄢ }$zTone; +qian → ㄑㄧㄢ˙; +qian1 ← ㄑㄧㄢ; +qing }$pTone ↔ ㄑㄧㄥ }$zTone; +qing → ㄑㄧㄥ˙; +qing1 ← ㄑㄧㄥ; +quan }$pTone ↔ ㄑㄩㄢ }$zTone; +quan → ㄑㄩㄢ˙; +quan1 ← ㄑㄩㄢ; +qia }$pTone ↔ ㄑㄧㄚ }$zTone; +qia → ㄑㄧㄚ˙; +qia1 ← ㄑㄧㄚ; +qie }$pTone ↔ ㄑㄧㄝ }$zTone; +qie → ㄑㄧㄝ˙; +qie1 ← ㄑㄧㄝ; +qiu }$pTone ↔ ㄑㄧㄡ }$zTone; +qiu → ㄑㄧㄡ˙; +qiu1 ← ㄑㄧㄡ; +qin }$pTone ↔ ㄑㄧㄣ }$zTone; +qin → ㄑㄧㄣ˙; +qin1 ← ㄑㄧㄣ; +que }$pTone ↔ ㄑㄩㄝ }$zTone; +que → ㄑㄩㄝ˙; +que1 ← ㄑㄩㄝ; +qun }$pTone ↔ ㄑㄩㄣ }$zTone; +qun → ㄑㄩㄣ˙; +qun1 ← ㄑㄩㄣ; +qi }$pTone ↔ ㄑㄧ }$zTone; +qi → ㄑㄧ˙; +qi1 ← ㄑㄧ; +qu }$pTone ↔ ㄑㄩ }$zTone; +qu → ㄑㄩ˙; +qu1 ← ㄑㄩ; +# +xiang }$pTone ↔ ㄒㄧㄤ }$zTone; +xiang → ㄒㄧㄤ˙; +xiang1 ← ㄒㄧㄤ; +xiong }$pTone ↔ ㄒㄩㄥ }$zTone; +xiong → ㄒㄩㄥ˙; +xiong1 ← ㄒㄩㄥ; +xiao }$pTone ↔ ㄒㄧㄠ }$zTone; +xiao → ㄒㄧㄠ˙; +xiao1 ← ㄒㄧㄠ; +xian }$pTone ↔ ㄒㄧㄢ }$zTone; +xian → ㄒㄧㄢ˙; +xian1 ← ㄒㄧㄢ; +xing }$pTone ↔ ㄒㄧㄥ }$zTone; +xing → ㄒㄧㄥ˙; +xing1 ← ㄒㄧㄥ; +xuan }$pTone ↔ ㄒㄩㄢ }$zTone; +xuan → ㄒㄩㄢ˙; +xuan1 ← ㄒㄩㄢ; +xia }$pTone ↔ ㄒㄧㄚ }$zTone; +xia → ㄒㄧㄚ˙; +xia1 ← ㄒㄧㄚ; +xie }$pTone ↔ ㄒㄧㄝ }$zTone; +xie → ㄒㄧㄝ˙; +xie1 ← ㄒㄧㄝ; +xiu }$pTone ↔ ㄒㄧㄡ }$zTone; +xiu → ㄒㄧㄡ˙; +xiu1 ← ㄒㄧㄡ; +xin }$pTone ↔ ㄒㄧㄣ }$zTone; +xin → ㄒㄧㄣ˙; +xin1 ← ㄒㄧㄣ; +xue }$pTone ↔ ㄒㄩㄝ }$zTone; +xue → ㄒㄩㄝ˙; +xue1 ← ㄒㄩㄝ; +xun }$pTone ↔ ㄒㄩㄣ }$zTone; +xun → ㄒㄩㄣ˙; +xun1 ← ㄒㄩㄣ; +xi }$pTone ↔ ㄒㄧ }$zTone; +xi → ㄒㄧ˙; +xi1 ← ㄒㄧ; +xu }$pTone ↔ ㄒㄩ }$zTone; +xu → ㄒㄩ˙; +xu1 ← ㄒㄩ; +# +zhuang }$pTone ↔ ㄓㄨㄤ }$zTone; +zhuang → ㄓㄨㄤ˙; +zhuang1 ← ㄓㄨㄤ; +zhang }$pTone ↔ ㄓㄤ }$zTone; +zhang → ㄓㄤ˙; +zhang1 ← ㄓㄤ; +zheng }$pTone ↔ ㄓㄥ }$zTone; +zheng → ㄓㄥ˙; +zheng1 ← ㄓㄥ; +zhuai }$pTone ↔ ㄓㄨㄞ }$zTone; +zhuai → ㄓㄨㄞ˙; +zhuai1 ← ㄓㄨㄞ; +zhuan }$pTone ↔ ㄓㄨㄢ }$zTone; +zhuan → ㄓㄨㄢ˙; +zhuan1 ← 
ㄓㄨㄢ; +zhong }$pTone ↔ ㄓㄨㄥ }$zTone; +zhong → ㄓㄨㄥ˙; +zhong1 ← ㄓㄨㄥ; +zhai }$pTone ↔ ㄓㄞ }$zTone; +zhai → ㄓㄞ˙; +zhai1 ← ㄓㄞ; +zhei }$pTone ↔ ㄓㄟ }$zTone; # (not in han-latin) +zhei → ㄓㄟ˙; +zhei1 ← ㄓㄟ; +zhao }$pTone ↔ ㄓㄠ }$zTone; +zhao → ㄓㄠ˙; +zhao1 ← ㄓㄠ; +zhou }$pTone ↔ ㄓㄡ }$zTone; +zhou → ㄓㄡ˙; +zhou1 ← ㄓㄡ; +zhan }$pTone ↔ ㄓㄢ }$zTone; +zhan → ㄓㄢ˙; +zhan1 ← ㄓㄢ; +zhen }$pTone ↔ ㄓㄣ }$zTone; +zhen → ㄓㄣ˙; +zhen1 ← ㄓㄣ; +zhua }$pTone ↔ ㄓㄨㄚ }$zTone; +zhua → ㄓㄨㄚ˙; +zhua1 ← ㄓㄨㄚ; +zhuo }$pTone ↔ ㄓㄨㄛ }$zTone; +zhuo → ㄓㄨㄛ˙; +zhuo1 ← ㄓㄨㄛ; +zhui }$pTone ↔ ㄓㄨㄟ }$zTone; +zhui → ㄓㄨㄟ˙; +zhui1 ← ㄓㄨㄟ; +zhun }$pTone ↔ ㄓㄨㄣ }$zTone; +zhun → ㄓㄨㄣ˙; +zhun1 ← ㄓㄨㄣ; +zha }$pTone ↔ ㄓㄚ }$zTone; +zha → ㄓㄚ˙; +zha1 ← ㄓㄚ; +zhe }$pTone ↔ ㄓㄜ }$zTone; +zhe → ㄓㄜ˙; +zhe1 ← ㄓㄜ; +zhu }$pTone ↔ ㄓㄨ }$zTone; +zhu → ㄓㄨ˙; +zhu1 ← ㄓㄨ; +zhi }$pTone ↔ ㄓ }$zTone; +zhi → ㄓ˙; +zhi1 ← ㄓ; +# +chuang }$pTone ↔ ㄔㄨㄤ }$zTone; +chuang → ㄔㄨㄤ˙; +chuang1 ← ㄔㄨㄤ; +chang }$pTone ↔ ㄔㄤ }$zTone; +chang → ㄔㄤ˙; +chang1 ← ㄔㄤ; +cheng }$pTone ↔ ㄔㄥ }$zTone; +cheng → ㄔㄥ˙; +cheng1 ← ㄔㄥ; +chuai }$pTone ↔ ㄔㄨㄞ }$zTone; +chuai → ㄔㄨㄞ˙; +chuai1 ← ㄔㄨㄞ; +chuan }$pTone ↔ ㄔㄨㄢ }$zTone; +chuan → ㄔㄨㄢ˙; +chuan1 ← ㄔㄨㄢ; +chong }$pTone ↔ ㄔㄨㄥ }$zTone; +chong → ㄔㄨㄥ˙; +chong1 ← ㄔㄨㄥ; +chai }$pTone ↔ ㄔㄞ }$zTone; +chai → ㄔㄞ˙; +chai1 ← ㄔㄞ; +chao }$pTone ↔ ㄔㄠ }$zTone; +chao → ㄔㄠ˙; +chao1 ← ㄔㄠ; +chou }$pTone ↔ ㄔㄡ }$zTone; +chou → ㄔㄡ˙; +chou1 ← ㄔㄡ; +chan }$pTone ↔ ㄔㄢ }$zTone; +chan → ㄔㄢ˙; +chan1 ← ㄔㄢ; +chen }$pTone ↔ ㄔㄣ }$zTone; +chen → ㄔㄣ˙; +chen1 ← ㄔㄣ; +chua }$pTone ↔ ㄔㄨㄚ }$zTone; +chua → ㄔㄨㄚ˙; +chua1 ← ㄔㄨㄚ; +chuo }$pTone ↔ ㄔㄨㄛ }$zTone; +chuo → ㄔㄨㄛ˙; +chuo1 ← ㄔㄨㄛ; +chui }$pTone ↔ ㄔㄨㄟ }$zTone; +chui → ㄔㄨㄟ˙; +chui1 ← ㄔㄨㄟ; +chun }$pTone ↔ ㄔㄨㄣ }$zTone; +chun → ㄔㄨㄣ˙; +chun1 ← ㄔㄨㄣ; +cha }$pTone ↔ ㄔㄚ }$zTone; +cha → ㄔㄚ˙; +cha1 ← ㄔㄚ; +che }$pTone ↔ ㄔㄜ }$zTone; +che → ㄔㄜ˙; +che1 ← ㄔㄜ; +chu }$pTone ↔ ㄔㄨ }$zTone; +chu → ㄔㄨ˙; +chu1 ← ㄔㄨ; +chi }$pTone ↔ ㄔ }$zTone; +chi → ㄔ˙; +chi1 ← ㄔ; +# +shuang }$pTone ↔ ㄕㄨㄤ }$zTone; +shuang → ㄕㄨㄤ˙; +shuang1 ← ㄕㄨㄤ; +shong }$pTone ↔ ㄕㄡㄥ }$zTone; # 
(not in han-latin) +shong → ㄕㄡㄥ˙; +shong1 ← ㄕㄡㄥ; +shang }$pTone ↔ ㄕㄤ }$zTone; +shang → ㄕㄤ˙; +shang1 ← ㄕㄤ; +sheng }$pTone ↔ ㄕㄥ }$zTone; +sheng → ㄕㄥ˙; +sheng1 ← ㄕㄥ; +shuai }$pTone ↔ ㄕㄨㄞ }$zTone; +shuai → ㄕㄨㄞ˙; +shuai1 ← ㄕㄨㄞ; +shuan }$pTone ↔ ㄕㄨㄢ }$zTone; +shuan → ㄕㄨㄢ˙; +shuan1 ← ㄕㄨㄢ; +shai }$pTone ↔ ㄕㄞ }$zTone; +shai → ㄕㄞ˙; +shai1 ← ㄕㄞ; +shei }$pTone ↔ ㄕㄟ }$zTone; # (not in han-latin) +shei → ㄕㄟ˙; +shei1 ← ㄕㄟ; +shao }$pTone ↔ ㄕㄠ }$zTone; +shao → ㄕㄠ˙; +shao1 ← ㄕㄠ; +shou }$pTone ↔ ㄕㄡ }$zTone; +shou → ㄕㄡ˙; +shou1 ← ㄕㄡ; +shan }$pTone ↔ ㄕㄢ }$zTone; +shan → ㄕㄢ˙; +shan1 ← ㄕㄢ; +shen }$pTone ↔ ㄕㄣ }$zTone; +shen → ㄕㄣ˙; +shen1 ← ㄕㄣ; +shua }$pTone ↔ ㄕㄨㄚ }$zTone; +shua → ㄕㄨㄚ˙; +shua1 ← ㄕㄨㄚ; +shuo }$pTone ↔ ㄕㄨㄛ }$zTone; +shuo → ㄕㄨㄛ˙; +shuo1 ← ㄕㄨㄛ; +shui }$pTone ↔ ㄕㄨㄟ }$zTone; +shui → ㄕㄨㄟ˙; +shui1 ← ㄕㄨㄟ; +shun }$pTone ↔ ㄕㄨㄣ }$zTone; +shun → ㄕㄨㄣ˙; +shun1 ← ㄕㄨㄣ; +sha }$pTone ↔ ㄕㄚ }$zTone; +sha → ㄕㄚ˙; +sha1 ← ㄕㄚ; +she }$pTone ↔ ㄕㄜ }$zTone; +she → ㄕㄜ˙; +she1 ← ㄕㄜ; +shu }$pTone ↔ ㄕㄨ }$zTone; +shu → ㄕㄨ˙; +shu1 ← ㄕㄨ; +shi }$pTone ↔ ㄕ }$zTone; +shi → ㄕ˙; +shi1 ← ㄕ; +# +rang }$pTone ↔ ㄖㄤ }$zTone; +rang → ㄖㄤ˙; +rang1 ← ㄖㄤ; +reng }$pTone ↔ ㄖㄥ }$zTone; +reng → ㄖㄥ˙; +reng1 ← ㄖㄥ; +ruan }$pTone ↔ ㄖㄨㄢ }$zTone; +ruan → ㄖㄨㄢ˙; +ruan1 ← ㄖㄨㄢ; +rong }$pTone ↔ ㄖㄨㄥ }$zTone; +rong → ㄖㄨㄥ˙; +rong1 ← ㄖㄨㄥ; +rao }$pTone ↔ ㄖㄠ }$zTone; +rao → ㄖㄠ˙; +rao1 ← ㄖㄠ; +rou }$pTone ↔ ㄖㄡ }$zTone; +rou → ㄖㄡ˙; +rou1 ← ㄖㄡ; +ran }$pTone ↔ ㄖㄢ }$zTone; +ran → ㄖㄢ˙; +ran1 ← ㄖㄢ; +ren }$pTone ↔ ㄖㄣ }$zTone; +ren → ㄖㄣ˙; +ren1 ← ㄖㄣ; +ruo }$pTone ↔ ㄖㄨㄛ }$zTone; +ruo → ㄖㄨㄛ˙; +ruo1 ← ㄖㄨㄛ; +rui }$pTone ↔ ㄖㄨㄟ }$zTone; +rui → ㄖㄨㄟ˙; +rui1 ← ㄖㄨㄟ; +run }$pTone ↔ ㄖㄨㄣ }$zTone; +run → ㄖㄨㄣ˙; +run1 ← ㄖㄨㄣ; +ra }$pTone ↔ ㄖㄚ }$zTone; +ra → ㄖㄚ˙; +ra1 ← ㄖㄚ; +re }$pTone ↔ ㄖㄜ }$zTone; +re → ㄖㄜ˙; +re1 ← ㄖㄜ; +ru }$pTone ↔ ㄖㄨ }$zTone; +ru → ㄖㄨ˙; +ru1 ← ㄖㄨ; +ri }$pTone ↔ ㄖ }$zTone; +ri → ㄖ˙; +ri1 ← ㄖ; +# +zang }$pTone ↔ ㄗㄤ }$zTone; +zang → ㄗㄤ˙; +zang1 ← ㄗㄤ; +zeng }$pTone ↔ ㄗㄥ }$zTone; +zeng → ㄗㄥ˙; +zeng1 ← ㄗㄥ; +zuan }$pTone ↔ ㄗㄨㄢ }$zTone; +zuan → ㄗㄨㄢ˙; +zuan1 ← 
ㄗㄨㄢ; +zong }$pTone ↔ ㄗㄨㄥ }$zTone; +zong → ㄗㄨㄥ˙; +zong1 ← ㄗㄨㄥ; +zai }$pTone ↔ ㄗㄞ }$zTone; +zai → ㄗㄞ˙; +zai1 ← ㄗㄞ; +zei }$pTone ↔ ㄗㄟ }$zTone; +zei → ㄗㄟ˙; +zei1 ← ㄗㄟ; +zao }$pTone ↔ ㄗㄠ }$zTone; +zao → ㄗㄠ˙; +zao1 ← ㄗㄠ; +zou }$pTone ↔ ㄗㄡ }$zTone; +zou → ㄗㄡ˙; +zou1 ← ㄗㄡ; +zan }$pTone ↔ ㄗㄢ }$zTone; +zan → ㄗㄢ˙; +zan1 ← ㄗㄢ; +zen }$pTone ↔ ㄗㄣ }$zTone; +zen → ㄗㄣ˙; +zen1 ← ㄗㄣ; +zuo }$pTone ↔ ㄗㄨㄛ }$zTone; +zuo → ㄗㄨㄛ˙; +zuo1 ← ㄗㄨㄛ; +zui }$pTone ↔ ㄗㄨㄟ }$zTone; +zui → ㄗㄨㄟ˙; +zui1 ← ㄗㄨㄟ; +zun }$pTone ↔ ㄗㄨㄣ }$zTone; +zun → ㄗㄨㄣ˙; +zun1 ← ㄗㄨㄣ; +za }$pTone ↔ ㄗㄚ }$zTone; +za → ㄗㄚ˙; +za1 ← ㄗㄚ; +ze }$pTone ↔ ㄗㄜ }$zTone; +ze → ㄗㄜ˙; +ze1 ← ㄗㄜ; +zu }$pTone ↔ ㄗㄨ }$zTone; +zu → ㄗㄨ˙; +zu1 ← ㄗㄨ; +zi }$pTone ↔ ㄗ }$zTone; +zi → ㄗ˙; +zi1 ← ㄗ; +# +cang }$pTone ↔ ㄘㄤ }$zTone; +cang → ㄘㄤ˙; +cang1 ← ㄘㄤ; +ceng }$pTone ↔ ㄘㄥ }$zTone; +ceng → ㄘㄥ˙; +ceng1 ← ㄘㄥ; +cuan }$pTone ↔ ㄘㄨㄢ }$zTone; +cuan → ㄘㄨㄢ˙; +cuan1 ← ㄘㄨㄢ; +cong }$pTone ↔ ㄘㄨㄥ }$zTone; +cong → ㄘㄨㄥ˙; +cong1 ← ㄘㄨㄥ; +cai }$pTone ↔ ㄘㄞ }$zTone; +cai → ㄘㄞ˙; +cai1 ← ㄘㄞ; +cao }$pTone ↔ ㄘㄠ }$zTone; +cao → ㄘㄠ˙; +cao1 ← ㄘㄠ; +cou }$pTone ↔ ㄘㄡ }$zTone; +cou → ㄘㄡ˙; +cou1 ← ㄘㄡ; +can }$pTone ↔ ㄘㄢ }$zTone; +can → ㄘㄢ˙; +can1 ← ㄘㄢ; +cen }$pTone ↔ ㄘㄣ }$zTone; +cen → ㄘㄣ˙; +cen1 ← ㄘㄣ; +cuo }$pTone ↔ ㄘㄨㄛ }$zTone; +cuo → ㄘㄨㄛ˙; +cuo1 ← ㄘㄨㄛ; +cui }$pTone ↔ ㄘㄨㄟ }$zTone; +cui → ㄘㄨㄟ˙; +cui1 ← ㄘㄨㄟ; +cun }$pTone ↔ ㄘㄨㄣ }$zTone; +cun → ㄘㄨㄣ˙; +cun1 ← ㄘㄨㄣ; +ca }$pTone ↔ ㄘㄚ }$zTone; +ca → ㄘㄚ˙; +ca1 ← ㄘㄚ; +ce }$pTone ↔ ㄘㄜ }$zTone; +ce → ㄘㄜ˙; +ce1 ← ㄘㄜ; +cu }$pTone ↔ ㄘㄨ }$zTone; +cu → ㄘㄨ˙; +cu1 ← ㄘㄨ; +ci }$pTone ↔ ㄘ }$zTone; +ci → ㄘ˙; +ci1 ← ㄘ; +# +sang }$pTone ↔ ㄙㄤ }$zTone; +sang → ㄙㄤ˙; +sang1 ← ㄙㄤ; +seng }$pTone ↔ ㄙㄥ }$zTone; +seng → ㄙㄥ˙; +seng1 ← ㄙㄥ; +suan }$pTone ↔ ㄙㄨㄢ }$zTone; +suan → ㄙㄨㄢ˙; +suan1 ← ㄙㄨㄢ; +song }$pTone ↔ ㄙㄨㄥ }$zTone; +song → ㄙㄨㄥ˙; +song1 ← ㄙㄨㄥ; +sai }$pTone ↔ ㄙㄞ }$zTone; +sai → ㄙㄞ˙; +sai1 ← ㄙㄞ; +sei }$pTone ↔ ㄙㄟ }$zTone; # (not in han-latin) +sei → ㄙㄟ˙; +sei1 ← ㄙㄟ; +sao }$pTone ↔ ㄙㄠ }$zTone; +sao → ㄙㄠ˙; +sao1 ← ㄙㄠ; +sou }$pTone ↔ ㄙㄡ }$zTone; +sou → ㄙㄡ˙; +sou1 ← ㄙㄡ; 
+san }$pTone ↔ ㄙㄢ }$zTone; +san → ㄙㄢ˙; +san1 ← ㄙㄢ; +sen }$pTone ↔ ㄙㄣ }$zTone; +sen → ㄙㄣ˙; +sen1 ← ㄙㄣ; +suo }$pTone ↔ ㄙㄨㄛ }$zTone; +suo → ㄙㄨㄛ˙; +suo1 ← ㄙㄨㄛ; +sui }$pTone ↔ ㄙㄨㄟ }$zTone; +sui → ㄙㄨㄟ˙; +sui1 ← ㄙㄨㄟ; +sun }$pTone ↔ ㄙㄨㄣ }$zTone; +sun → ㄙㄨㄣ˙; +sun1 ← ㄙㄨㄣ; +sa }$pTone ↔ ㄙㄚ }$zTone; +sa → ㄙㄚ˙; +sa1 ← ㄙㄚ; +se }$pTone ↔ ㄙㄜ }$zTone; +se → ㄙㄜ˙; +se1 ← ㄙㄜ; +su }$pTone ↔ ㄙㄨ }$zTone; +su → ㄙㄨ˙; +su1 ← ㄙㄨ; +si }$pTone ↔ ㄙ }$zTone; +si → ㄙ˙; +si1 ← ㄙ; +# +#--- vowels and vowel compounds ---- +# most exist as syllables by themselves and they are also used as finals for initial consonants +# +yuan }$pTone ↔ ㄩㄢ }$zTone; +yuan → ㄩㄢ˙; +yuan1 ← ㄩㄢ; +yong }$pTone ↔ ㄩㄥ }$zTone; +yong → ㄩㄥ˙; +yong1 ← ㄩㄥ; +yue }$pTone ↔ ㄩㄝ }$zTone; +yue → ㄩㄝ˙; +yue1 ← ㄩㄝ; +yun }$pTone ↔ ㄩㄣ }$zTone; +yun → ㄩㄣ˙; +yun1 ← ㄩㄣ; +yu }$pTone ↔ ㄩ }$zTone; +yu → ㄩ˙; +yu1 ← ㄩ; +# iu handled below +# +yang }$pTone ↔ ㄧㄤ }$zTone; +yang → ㄧㄤ˙; +yang1 ← ㄧㄤ; +ying }$pTone ↔ ㄧㄥ }$zTone; +ying → ㄧㄥ˙; +ying1 ← ㄧㄥ; +yai }$pTone ↔ ㄧㄞ }$zTone; # (not in han-latin) +yai → ㄧㄞ˙; +yai1 ← ㄧㄞ; +yao }$pTone ↔ ㄧㄠ }$zTone; +yao → ㄧㄠ˙; +yao1 ← ㄧㄠ; +you }$pTone ↔ ㄧㄡ }$zTone; +you → ㄧㄡ˙; +you1 ← ㄧㄡ; +yan }$pTone ↔ ㄧㄢ }$zTone; +yan → ㄧㄢ˙; +yan1 ← ㄧㄢ; +yin }$pTone ↔ ㄧㄣ }$zTone; +yin → ㄧㄣ˙; +yin1 ← ㄧㄣ; +ya }$pTone ↔ ㄧㄚ }$zTone; +ya → ㄧㄚ˙; +ya1 ← ㄧㄚ; +yo }$pTone ↔ ㄧㄛ }$zTone; +yo → ㄧㄛ˙; +yo1 ← ㄧㄛ; +ye }$pTone ↔ ㄧㄝ }$zTone; +ye → ㄧㄝ˙; +ye1 ← ㄧㄝ; +yi }$pTone ↔ ㄧ }$zTone; +yi → ㄧ˙; +yi1 ← ㄧ; +# i handled below +# +wong }$pTone ↔ ㄨㄨㄥ }$zTone; +wong → ㄨㄨㄥ˙; +wong1 ← ㄨㄨㄥ; +wang }$pTone ↔ ㄨㄤ }$zTone; +wang → ㄨㄤ˙; +wang1 ← ㄨㄤ; +weng }$pTone ↔ ㄨㄥ }$zTone; +weng → ㄨㄥ˙; +weng1 ← ㄨㄥ; +wai }$pTone ↔ ㄨㄞ }$zTone; +wai → ㄨㄞ˙; +wai1 ← ㄨㄞ; +wei }$pTone ↔ ㄨㄟ }$zTone; +wei → ㄨㄟ˙; +wei1 ← ㄨㄟ; +wan }$pTone ↔ ㄨㄢ }$zTone; +wan → ㄨㄢ˙; +wan1 ← ㄨㄢ; +wen }$pTone ↔ ㄨㄣ }$zTone; +wen → ㄨㄣ˙; +wen1 ← ㄨㄣ; +wa }$pTone ↔ ㄨㄚ }$zTone; +wa → ㄨㄚ˙; +wa1 ← ㄨㄚ; +wo }$pTone ↔ ㄨㄛ }$zTone; +wo → ㄨㄛ˙; +wo1 ← ㄨㄛ; +wu }$pTone ↔ ㄨ }$zTone; +wu → ㄨ˙; +wu1 ← ㄨ; +# u handled below +# 
+ang }$pTone ↔ ㄤ }$zTone; +ang → ㄤ˙; +ang1 ← ㄤ; +eng }$pTone ↔ ㄥ }$zTone; +eng → ㄥ˙; +eng1 ← ㄥ; +eh }$pTone ↔ ㄝ }$zTone; # (not in han-latin) +eh → ㄝ˙; +eh1 ← ㄝ; +ea }$pTone → ㄝ; # (not in han-latin) one-way +ea → ㄝ˙; +ai }$pTone ↔ ㄞ }$zTone; +ai → ㄞ˙; +ai1 ← ㄞ; +ei }$pTone ↔ ㄟ }$zTone; +ei → ㄟ˙; +ei1 ← ㄟ; +ao }$pTone ↔ ㄠ }$zTone; +ao → ㄠ˙; +ao1 ← ㄠ; +au }$pTone → ㄠ; # (not in han-latin) one-way, handle unicode spelling +au → ㄠ˙; +ou }$pTone ↔ ㄡ }$zTone; +ou → ㄡ˙; +ou1 ← ㄡ; +an }$pTone ↔ ㄢ }$zTone; +an → ㄢ˙; +an1 ← ㄢ; +en }$pTone ↔ ㄣ }$zTone; +en → ㄣ˙; +en1 ← ㄣ; +er }$pTone ↔ ㄦ }$zTone; +er → ㄦ˙; +er1 ← ㄦ; +a }$pTone ↔ ㄚ }$zTone; +a → ㄚ˙; +a1 ← ㄚ; +o }$pTone ↔ ㄛ }$zTone; +o → ㄛ˙; +o1 ← ㄛ; +e }$pTone ↔ ㄜ }$zTone; +e → ㄜ˙; +e1 ← ㄜ; +# +# handle unicode spellings of ㄧ,ㄨ,ㄩ above +iu }$pTone → ㄩ; # (not in han-latin) one-way, handle unicode spelling +iu → ㄩ˙; +i }$pTone → ㄧ; # (not in han-latin) one-way, handle unicode spelling +i → ㄧ˙; +u }$pTone → ㄨ; # (not in han-latin) one-way, handle unicode spelling +u → ㄨ˙; +# +#--- clusters with a single pinyin consonant that can appear in other clusters ---- +# +m }$pTone ↔ ㄇ }$zTone; +m → ㄇ˙; +m1 ← ㄇ; +# +n }$pTone ↔ ㄋ }$zTone; +n → ㄋ˙; +n1 ← ㄋ; +# +#--- fallback mappings ---- +# +# separate fallback mappings for some compound finals after consonants +# (different pinyin than the standalone mappings for these zhuyin sequences). 
+# +#------- +# would be nice to have these, need to work out how; +# something like the following, but need to avoid conflicts with mappings above: +# $pCons{ ia }$pTone ↔ $zCons{ ㄧㄚ }$zTone; # fallback mapping for unambiguous compound final +# $pCons{ ia → ㄧㄚ˙; +# ia1 ← $zCons{ ㄧㄚ +# +# the relevant mappings are: +# ia ↔ ㄧㄚ +# ie ↔ ㄧㄝ +# iao ↔ ㄧㄠ +# iu ↔ ㄧㄡ +# ian ↔ ㄧㄢ +# in ↔ ㄧㄣ +# iang ↔ ㄧㄤ +# ing ↔ ㄧㄥ +# ua ↔ ㄨㄚ +# uo ↔ ㄨㄛ +# uai ↔ ㄨㄞ +# ui ↔ ㄨㄟ +# uang ↔ ㄨㄤ +# ong ↔ ㄨㄥ +# iong ↔ ㄩㄥ +#------- +# +# separate fallback mappings for some initial consonants not handled above +# none of the mapped consonants handled here can have tones, so this is simple +b ↔ ㄅ; +p ↔ ㄆ; +# m ↔ ㄇ; # handled above +f ↔ ㄈ; +d ↔ ㄉ; +t ↔ ㄊ; +# n ↔ ㄋ; # handled above +l ↔ ㄌ; +g ↔ ㄍ; +k ↔ ㄎ; +h ↔ ㄏ; +j ↔ ㄐ; +q ↔ ㄑ; +x ↔ ㄒ; +zh → ㄓ; # reverse mapping to zhi handled above +ch → ㄔ; # reverse mapping to chi handled above +sh → ㄕ; # reverse mapping to shi handled above +r → ㄖ; # reverse mapping to ri handled above +z → ㄗ; # reverse mapping to zi handled above +c → ㄘ; # reverse mapping to ci handled above +s → ㄙ; # reverse mapping to si handled above +# +#--- tones (except for the unmarked cases handled above) ---- +# +# tone 1: pinyin \u0304 or 1 ↔ zhuyin typically unmarked or use ˉ \u02C9 +1 → ; # map to nothing +1 ← ˉ ; # transform if marked in zhuyin +# did the following with rules for each cluster, above +# 1 ← ; # map nothing in zhuyin to pinyin mark +# +# tones 2-4 (easy) +# $pToneOK{ 2 ↔ $zToneOK{ ˊ; # pinyin \u0301 or 2 ↔ zhuyin \u02CA +# $pToneOK{ 3 ↔ $zToneOK{ ˇ; # pinyin \u030C or 3 ↔ zhuyin \u02C7 +# $pToneOK{ 4 ↔ $zToneOK{ ˋ; # pinyin \u0300 or 4 ↔ zhuyin \u02CB +# actually don't need context: +2 ↔ ˊ; # pinyin \u0301 or 2 ↔ zhuyin \u02CA +3 ↔ ˇ; # pinyin \u030C or 3 ↔ zhuyin \u02C7 +4 ↔ ˋ; # pinyin \u0300 or 4 ↔ zhuyin \u02CB +# +# tone 5 (light): pinyin typically unmarked or use 5 ↔ zhuyin ˙ \u02D9 +← ˙; # map to nothing +5 → ˙; # transform if marked in pinyin +# did the following 
with rules for each cluster above +# → ˙; # map nothing in pinyin to zhuyin mark +# +#--- reverse filter ---- +# +:: ([[ㄅ-ㄩ][ˉˊˇˋ˙]]); # reverse filter: only modifies basic Bopomofo and tone marks + diff --git a/icu4c/source/data/translit/Latin_Devanagari.txt b/icu4c/source/data/translit/Latn_Deva.txt similarity index 91% rename from icu4c/source/data/translit/Latin_Devanagari.txt rename to icu4c/source/data/translit/Latn_Deva.txt index 57c91435c11..70a1a2698ff 100644 --- a/icu4c/source/data/translit/Latin_Devanagari.txt +++ b/icu4c/source/data/translit/Latn_Deva.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Latin_Devanagari.txt +# File: Latn_Deva.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Gujarati.txt b/icu4c/source/data/translit/Latn_Gujr.txt similarity index 91% rename from icu4c/source/data/translit/Latin_Gujarati.txt rename to icu4c/source/data/translit/Latn_Gujr.txt index 9ab9475c006..b403477bc5f 100644 --- a/icu4c/source/data/translit/Latin_Gujarati.txt +++ b/icu4c/source/data/translit/Latn_Gujr.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business 
Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Latin_Gujarati.txt +# File: Latn_Gujr.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Gurmukhi.txt b/icu4c/source/data/translit/Latn_Guru.txt similarity index 91% rename from icu4c/source/data/translit/Latin_Gurmukhi.txt rename to icu4c/source/data/translit/Latn_Guru.txt index 95bef0103ea..2ac893241f8 100644 --- a/icu4c/source/data/translit/Latin_Gurmukhi.txt +++ b/icu4c/source/data/translit/Latn_Guru.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Gurmukhi.txt +# File: Latn_Guru.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Hangul.txt b/icu4c/source/data/translit/Latn_Hang.txt similarity index 84% rename from icu4c/source/data/translit/Latin_Hangul.txt rename to icu4c/source/data/translit/Latn_Hang.txt index 7d039d49080..03756aaef31 100644 --- a/icu4c/source/data/translit/Latin_Hangul.txt +++ b/icu4c/source/data/translit/Latn_Hang.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Hangul.txt +# File: Latn_Hang.txt # Generated from CLDR # + ::[-A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǡǦ-ǭǰǴ-ǵǸ-ǻȀ-țȞ-ȟȦ-ȳḀ-ẙẠ-ỹK-Å]; ::NFD; ::Lower; ::Latin-ConjoiningJamo; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Jamo.txt b/icu4c/source/data/translit/Latn_Jamo.txt similarity index 80% rename from icu4c/source/data/translit/Latin_Jamo.txt rename to icu4c/source/data/translit/Latn_Jamo.txt index 9401b928803..abdb9c6ef13 100644 --- a/icu4c/source/data/translit/Latin_Jamo.txt +++ b/icu4c/source/data/translit/Latn_Jamo.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Latin_Jamo.txt +# File: Latn_Jamo.txt # Generated from CLDR # + ::[[:script=Latin:][:M:]-]; ::NFD; ::Lower; ::Latin-ConjoiningJamo; ::[[:script=Latin:][:M:]] NFC; + diff --git a/icu4c/source/data/translit/Latin_Katakana.txt b/icu4c/source/data/translit/Latn_Kana.txt similarity index 65% rename from icu4c/source/data/translit/Latin_Katakana.txt rename to icu4c/source/data/translit/Latn_Kana.txt index bd0e07c1b11..4819d644817 100644 --- a/icu4c/source/data/translit/Latin_Katakana.txt +++ b/icu4c/source/data/translit/Latn_Kana.txt @@ -1,19 +1,70 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Katakana.txt +# File: Latn_Kana.txt # Generated from CLDR # + +# note: a global filter is more efficient, but MUST include all source chars +#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ; +# MINIMAL FILTER GENERATED FOR: Latin-Katakana +### WARNING -- must add width filter, both here and below!!! ### :: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ; :: [:Latin:] fullwidth-halfwidth (); :: NFD (NFC); :: Lower (); # whenever transliterating from cased to uncased script, include this +# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese +# Uses modified Hepburn. Small changes to make unambiguous. +# | Kunrei-shiki: Hepburn/MHepburn +# | ------------------------------ +# | si: shi +# | si ~ya: sha +# | si ~yu: shu +# | si ~yo: sho +# | zi: ji +# | zi ~ya: ja +# | zi ~yu: ju +# | zi ~yo: jo +# | ti: chi +# | ti ~ya: cha +# | ti ~yu: chu +# | ti ~yu: cho +# | tu: tsu +# | di: ji/dji +# | du: zu/dzu +# | hu: fu +# | For foreign words: +# | ----------------- +# | se ~i si +# | si ~e she +# | +# | ze ~i zi +# | zi ~e je +# | +# | te ~i ti +# | ti ~e che +# | te ~u tu +# | +# | de ~i di +# | de ~u du +# | de ~i di +# | +# | he ~u: hu +# | hu ~a fa +# | hu ~i fi +# | hu ~e he +# | hu ~o ho +# Most small forms are generated, but if necessary +# explicit small forms are given with ~a, ~ya, etc. 
+#------------------------------------------------------ +# Variables $vowel = [aeiou] ; $consonant = [bcdfghjklmnpqrstvwxyz] ; $macron = \u0304 ; +# Variables used for doubled-consonants with tsu $kana = [ぁ-ゔ] ; $voice = [\u3099゛]; $semivoice = [\u309A゜]; @@ -30,22 +81,38 @@ $r_start = [ラリルレロらりるれろ] ; $w_start = [ワヰヱヲわゐゑを] ; $v_start = [ワヰヱヲ]\u3099 ; $voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ; +# if ン is followed by $n_quoter, then it needs an +# apostrophe after its romaji form to disambiguate it. +# e.g., ン ア ! = ナ, so represent as "n'a", not "na". $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ; $small_y = [ャィュェョ] ; $iteration = ゝ ; +#------------------------------------------------------ +# katakana rules +# Punctuation '.' ↔ 。; ',' ↔ 、; +# ' ' } [a-z] → ; # delete spaces before latin +# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before hiragana +# Iteration Mark +# Copy previous letter § marks +# TODO +# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration +# Specials for katakana -- not shared with hiragana va ↔ ワ\u3099 ; vi ↔ ヰ\u3099 ; ve ↔ ヱ\u3099 ; vo ↔ ヲ\u3099 ; '~ka' ↔ ヵ ; '~ke' ↔ ヶ ; +# ~~~ begin shared rules ~~~ +#special ya ← '~'ャ; yi ← '~'ィ ; yu ← '~'ュ; ye ← '~'ェ; yo ← '~'ョ; +#normal a ↔ ア ; b | '~' ← ヒ \u3099} $small_y ; by } $vowel → ヒ\u3099 | '~y' ; @@ -69,6 +136,7 @@ dje ← チ\u3099ェ ; djo ← チ\u3099ョ ; dji ↔ チ\u3099 ; dj } $vowel → チ\u3099 | '~y' ; +# TODO: QUESTION: use ĵĴżŻ instead of dj, dz cha ← チャ ; chi'~i' ← チィ ; # liu chu ← チュ ; @@ -85,6 +153,7 @@ gu ↔ ク\u3099 ; ge ↔ ケ\u3099 ; go ↔ コ\u3099 ; i ↔ イ ; +# j } $vowel → シ\u3099 | '~y' ; ja ↔ シ\u3099ャ ; ji'~i' ← シ\u3099ィ ; # liu ju ↔ シ\u3099ュ ; @@ -128,6 +197,8 @@ hi ↔ ヒ ; hu ↔ ヘゥ ; he ↔ ヘ ; ho ↔ ホ ; +# f | '~' ← フ } $small_y ; +# f } $vowel → フ | '~' ; fa ↔ ファ ; fi ↔ フィ ; fe ↔ フェ ; @@ -163,8 +234,14 @@ tu ↔ テゥ ; te ↔ テ ; to ↔ ト ; tsu ↔ ツ ; +# v } $vowel → ウ\u3099 | '~' ; +#'v~a' ← ウ\u3099ァ ; # liu +#'v~i' ← ウ\u3099ィ ; # liu +#'v~e' ← ウ\u3099ェ ; # liu +#'v~o' ← ウ\u3099ォ ; # liu vu ↔ 
ウ\u3099 ; u ↔ ウ ; +# w } $vowel → ウ | '~' ; wa ↔ ワ ; wi ↔ ヰ ; wu → ウ ; @@ -175,15 +252,20 @@ yi → イ ; yu ↔ ユ ; ye → エ ; yo ↔ ヨ ; +# double consonants +#specials s } sh → ッ ; t } ch → ッ ; +#voiced j } j ↔ ッ } $j_start ; b } b ↔ ッ } [$h_start$f_start] $voice; d } d ↔ ッ } $t_start $voice; g } g ↔ ッ } $k_start $voice; p } p ↔ ッ } [$h_start$f_start] $semivoice; +# v } v ↔ ッ } [ワヰウヱヲう] $voice ; z } z ↔ ッ } $s_start $voice; v } v ↔ ッ } $v_start; +# normal k } k ↔ ッ } $k_start ; m } m ↔ ッ } $m_start ; n } n ↔ ッ } $n_start ; @@ -194,13 +276,24 @@ t } t ↔ ッ } $t_start ; s } s ↔ ッ } $s_start ; w } w ↔ ッ } $w_start; y } y ↔ ッ } $y_start; +# completeness x } x → ッ ; c } k → ッ ; c } c → ッ ; c } q → ッ ; l } l → ッ ; q } q → ッ ; +# y } y → ッ ; +# w } w → ッ ; +# prolonged vowel mark. this indicates a doubling of +# the preceding vowel sound +#a ← a { ー ; # liu +#e ← e { ー ; # liu +#i ← i { ー ; # liu +#o ← o { ー ; # liu +#u ← u { ー ; # liu $macron ↔ ー ; +# small forms '~a' ↔ ァ ; '~i' ↔ ィ ; '~u' ↔ ゥ ; @@ -213,6 +306,8 @@ $macron ↔ ー ; '~yu' ↔ ュ ; '~ye' → ェ ; '~yo' ↔ ョ ; +# iteration marks +# TODO: make more accurate j $1 ← sh (y* $vowel) {ヽ$voice ; dj $1 ← ch (y* $vowel) {ヽ$voice ; dz $1 ← ts (y* $vowel) {ヽ$voice ; @@ -230,7 +325,16 @@ dz $1 ← dz (y* $vowel) {ヽ$voice ; $1 ← ($consonant y* $vowel) {ヽ$voice? ; $1 ← (.) {ヽ $voice? ; # otherwise repeat last character ← ヽ $voice? ; # delete if no characters found +# h- rule: lengthens vowel if not followed by a vowel. +# At the point this is applied, latin [cons]?vowel sequences +# have been converted to katakana in NFD form. $voweled_basekana [\u3099 \u309A]? { h → ー ; +# one-way latin- → kana rules. these do not occur in +# well-formed romaji representing actual japanese text. +# their purpose is to make all romaji map to kana of +# some sort. +# the following are not really necessary, but produce +# slightly more natural results. 
cy → セィ ; dy → テ\u3099ィ ; hy → ヒ ; @@ -238,6 +342,8 @@ sy → セィ ; ty → ティ ; zy → セ\u3099ィ ; h → ヘ ; +# isolated consonants listed here so as not to mask +# longer rules above. ch → チ; sh → シ ; dz → ツ\u3099 ; @@ -264,12 +370,22 @@ w → ウ; ð → | d ; ø → | u ; þ → | th ; +# simple substitutions using backup c → | k ; l → | r ; q → | k ; x → | ks ; +# ~~~ END shared rules ~~~ +#------------------------------------------------------ +# Final cleanup '~' → ; # delete stray tildes between letters [:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters +# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use :: NFC (NFD) ; :: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth); +# note: a global filter is more efficient, but MUST include all source chars!! +#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]); +# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD :: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ; +# eof + diff --git a/icu4c/source/data/translit/Latin_Kannada.txt b/icu4c/source/data/translit/Latn_Knda.txt similarity index 92% rename from icu4c/source/data/translit/Latin_Kannada.txt rename to icu4c/source/data/translit/Latn_Knda.txt index e669bce68c6..2e1dad82dcb 100644 --- a/icu4c/source/data/translit/Latin_Kannada.txt +++ b/icu4c/source/data/translit/Latn_Knda.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Kannada.txt +# File: Latn_Knda.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Malayalam.txt b/icu4c/source/data/translit/Latn_Mlym.txt similarity index 91% rename from icu4c/source/data/translit/Latin_Malayalam.txt rename to icu4c/source/data/translit/Latn_Mlym.txt index 80ad8bed7db..46c5ec449a8 100644 --- a/icu4c/source/data/translit/Latin_Malayalam.txt +++ b/icu4c/source/data/translit/Latn_Mlym.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Malayalam.txt +# File: Latn_Mlym.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Oriya.txt b/icu4c/source/data/translit/Latn_Orya.txt similarity index 92% rename from icu4c/source/data/translit/Latin_Oriya.txt rename to icu4c/source/data/translit/Latn_Orya.txt index 7dccb3669eb..7a3c356e59c 100644 --- a/icu4c/source/data/translit/Latin_Oriya.txt +++ b/icu4c/source/data/translit/Latn_Orya.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Oriya.txt +# File: Latn_Orya.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Tamil.txt b/icu4c/source/data/translit/Latn_Taml.txt similarity index 92% rename from icu4c/source/data/translit/Latin_Tamil.txt rename to icu4c/source/data/translit/Latn_Taml.txt index 71da77f94dc..111aeadcf51 100644 --- a/icu4c/source/data/translit/Latin_Tamil.txt +++ b/icu4c/source/data/translit/Latn_Taml.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Tamil.txt +# File: Latn_Taml.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Telugu.txt b/icu4c/source/data/translit/Latn_Telu.txt similarity index 92% rename from icu4c/source/data/translit/Latin_Telugu.txt rename to icu4c/source/data/translit/Latn_Telu.txt index 4d0c63fef06..43824aa48b9 100644 --- a/icu4c/source/data/translit/Latin_Telugu.txt +++ b/icu4c/source/data/translit/Latn_Telu.txt @@ -1,15 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Telugu.txt +# File: Latn_Telu.txt # Generated from CLDR # + ::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064]; ::NFD; ::Lower; ::Latin-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Latin_Thaana.txt b/icu4c/source/data/translit/Latn_Thaa.txt similarity index 98% rename from icu4c/source/data/translit/Latin_Thaana.txt rename to icu4c/source/data/translit/Latn_Thaa.txt index f2fa6834471..2efe446689d 100644 --- a/icu4c/source/data/translit/Latin_Thaana.txt +++ b/icu4c/source/data/translit/Latn_Thaa.txt @@ -1,12 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Latin_Thaana.txt +# File: Latn_Thaa.txt # Generated from CLDR # + tta ↔ ޘ\u07A6 ; ḥa ↔ ޙ\u07A6 ; kha ↔ ޚ\u07A6 ; @@ -438,3 +439,4 @@ e ↔ \u07AC ; o ↔ \u07AE ; ō ↔ \u07AF ; ← \u07B0 ; + diff --git a/icu4c/source/data/translit/Latin_Thai.txt b/icu4c/source/data/translit/Latn_Thai.txt similarity index 81% rename from icu4c/source/data/translit/Latin_Thai.txt rename to icu4c/source/data/translit/Latn_Thai.txt index 402e849b014..658aaad5058 100644 --- a/icu4c/source/data/translit/Latin_Thai.txt +++ b/icu4c/source/data/translit/Latn_Thai.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Latin_Thai.txt +# File: Latn_Thai.txt # Generated from CLDR # + ::[[:Latin:][:Mn:][:Me:] \u0020\u0026 ;0-9|~«»ʹʹ‡ˌ]; ::NFD; ::Latin-ThaiLogical; ::ThaiLogical-Thai; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_InterIndic.txt b/icu4c/source/data/translit/Malayalam_InterIndic.txt index 9d2b95da177..4db1ed626dc 100644 --- a/icu4c/source/data/translit/Malayalam_InterIndic.txt +++ b/icu4c/source/data/translit/Malayalam_InterIndic.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Malayalam_InterIndic.txt # Generated from CLDR # + +# Malayalam-InterIndic +#:: NFD (NFC) ; ം→\uE002; # SIGN ANUSVARA ഃ→\uE003; # SIGN VISARGA അ→\uE005; # LETTER A @@ -82,3 +85,6 @@ ൭→\uE06D; # DIGIT SEVEN ൮→\uE06E; # DIGIT EIGHT ൯→\uE06F; # DIGIT NINE +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/Maldivian_Latin_BGN.txt b/icu4c/source/data/translit/Maldivian_Latin_BGN.txt deleted file mode 100644 index 46f8f1f0354..00000000000 --- a/icu4c/source/data/translit/Maldivian_Latin_BGN.txt +++ /dev/null @@ -1,78 +0,0 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. -# * -# *************************************************************************** -# File: Maldivian_Latin_BGN.txt -# Generated from CLDR -# -:: [ހށނރބޅކއވމފދތލގޏސޑޒޓޔޕޖޗ\u07A6\u07A7\u07A8\u07A9\u07AA\u07AB\u07AC\u07AD\u07AE\u07AF\u07B0]; -:: NFD (NFC) ; -އ\u07A6 → a ; # THAANA LETTER ALIFU + ABAFILI -އ\u07A7 → aa ; # THAANA LETTER ALIFU + AABAAFILI -އ\u07A8 → i ; # THAANA LETTER ALIFU + IBIFILI -އ\u07A9 → ee ; # THAANA LETTER ALIFU + EEBEEFILI -އ\u07AA → u ; # THAANA LETTER ALIFU + UBUFILI -އ\u07AB → oo ; # THAANA LETTER ALIFU + OOBOOFILI -އ\u07AC → e ; # THAANA LETTER ALIFU + EBEFILI -އ\u07AD → ey ; # THAANA LETTER ALIFU + EYBEYFILI -އ\u07AE → o ; # THAANA LETTER ALIFU + OBOFILI -އ\u07AF → oa ; # THAANA LETTER ALIFU + OABOAFILI -އހ → hh ; # THAANA LETTER ALIFU + HAA -އށ → shsh ; # THAANA LETTER ALIFU + SHAVIYANI -އނ → nn ; # THAANA LETTER ALIFU + NOONU -އރ → rr ; # THAANA LETTER ALIFU + RAA -އބ → bb ; # THAANA LETTER ALIFU + BAA -އޅ → lhlh ; # THAANA LETTER ALIFU + LHAVIYANI -އކ → kk ; # THAANA LETTER ALIFU + KAAFU -އވ → vv ; # THAANA LETTER ALIFU + VAAVU -އމ → mm ; # THAANA LETTER ALIFU + MEEMU -އފ → ff ; # THAANA LETTER ALIFU + 
FAAFU -އދ → dhdh ; # THAANA LETTER ALIFU + DHAALU -އތ → thth ; # THAANA LETTER ALIFU + THAA -އލ → ll ; # THAANA LETTER ALIFU + LAAMU -އގ → gg ; # THAANA LETTER ALIFU + GAAFU -އޏ → gngn ; # THAANA LETTER ALIFU + GNAVIYANI -އސ → ss ; # THAANA LETTER ALIFU + SEENU -އޑ → dd ; # THAANA LETTER ALIFU + DAVIYANI -އޒ → zz ; # THAANA LETTER ALIFU + ZAVIYANI -އޓ → tt ; # THAANA LETTER ALIFU + TAVIYANI -އޔ → yy ; # THAANA LETTER ALIFU + YAA -އޕ → pp ; # THAANA LETTER ALIFU + PAVIYANI -އޖ → jj ; # THAANA LETTER ALIFU + JAVIYANI -އޗ → chch ; # THAANA LETTER ALIFU + CHAVIYANI -ހ → h ; # THAANA LETTER HAA -ށ → sh ; # THAANA LETTER SHAVIYANI -ނ → n ; # THAANA LETTER NOONU -ރ → r ; # THAANA LETTER RAA -ބ → b ; # THAANA LETTER BAA -ޅ → lh ; # THAANA LETTER LHAVIYANI -ކ → k ; # THAANA LETTER KAAFU -ވ → v ; # THAANA LETTER VAAVU -މ → m ; # THAANA LETTER MEEMU -ފ → f ; # THAANA LETTER FAAFU -ދ → dh ; # THAANA LETTER DHAALU -ތ → th ; # THAANA LETTER THAA -ލ → l ; # THAANA LETTER LAAMU -ގ → g ; # THAANA LETTER GAAFU -ޏ → gn ; # THAANA LETTER GNAVIYANI -ސ → s ; # THAANA LETTER SEENU -ޑ → d ; # THAANA LETTER DAVIYANI -ޒ → z ; # THAANA LETTER ZAVIYANI -ޓ → t ; # THAANA LETTER TAVIYANI -ޔ → y ; # THAANA LETTER YAA -ޕ → p ; # THAANA LETTER PAVIYANI -ޖ → j ; # THAANA LETTER JAVIYANI -ޗ → ch ; # THAANA LETTER CHAVIYANI -\u07A6 → a ; # THAANA ABAFILI -\u07A7 → aa ; # THAANA AABAAFILI -\u07A8 → i ; # THAANA IBIFILI -\u07A9 → ee ; # THAANA EEBEEFILI -\u07AA → u ; # THAANA UBUFILI -\u07AB → oo ; # THAANA OOBOOFILI -\u07AC → e ; # THAANA EBEFILI -\u07AD → ey ; # THAANA EYBEYFILI -\u07AE → o ; # THAANA OBOFILI -\u07AF → oa ; # THAANA OABOAFILI -\u07B0 → ; # THAANA SUKUUN diff --git a/icu4c/source/data/translit/Malayalam_Bengali.txt b/icu4c/source/data/translit/Mlym_Beng.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Bengali.txt rename to icu4c/source/data/translit/Mlym_Beng.txt index 7811de832c0..1b1a51709fb 100644 --- a/icu4c/source/data/translit/Malayalam_Bengali.txt +++ 
b/icu4c/source/data/translit/Mlym_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Malayalam_Bengali.txt +# File: Mlym_Beng.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Devanagari.txt b/icu4c/source/data/translit/Mlym_Deva.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Devanagari.txt rename to icu4c/source/data/translit/Mlym_Deva.txt index 15c192cc09b..649d60b42e7 100644 --- a/icu4c/source/data/translit/Malayalam_Devanagari.txt +++ b/icu4c/source/data/translit/Mlym_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Malayalam_Devanagari.txt +# File: Mlym_Deva.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Gujarati.txt b/icu4c/source/data/translit/Mlym_Gujr.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Gujarati.txt rename to icu4c/source/data/translit/Mlym_Gujr.txt index a55e2bc20ca..f6489153d3e 100644 --- a/icu4c/source/data/translit/Malayalam_Gujarati.txt +++ b/icu4c/source/data/translit/Mlym_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Malayalam_Gujarati.txt +# File: Mlym_Gujr.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Gurmukhi.txt b/icu4c/source/data/translit/Mlym_Guru.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Gurmukhi.txt rename to icu4c/source/data/translit/Mlym_Guru.txt index 829d4845faf..47fcd6d2b83 100644 --- a/icu4c/source/data/translit/Malayalam_Gurmukhi.txt +++ b/icu4c/source/data/translit/Mlym_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Malayalam_Gurmukhi.txt +# File: Mlym_Guru.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Kannada.txt b/icu4c/source/data/translit/Mlym_Knda.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Kannada.txt rename to icu4c/source/data/translit/Mlym_Knda.txt index 2f03c08ad53..dd52b2d618c 100644 --- a/icu4c/source/data/translit/Malayalam_Kannada.txt +++ b/icu4c/source/data/translit/Mlym_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Malayalam_Kannada.txt +# File: Mlym_Knda.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Latin.txt b/icu4c/source/data/translit/Mlym_Latn.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Latin.txt rename to icu4c/source/data/translit/Mlym_Latn.txt index 67303514f8c..b70c99b9216 100644 --- a/icu4c/source/data/translit/Malayalam_Latin.txt +++ b/icu4c/source/data/translit/Mlym_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Malayalam_Latin.txt +# File: Mlym_Latn.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Oriya.txt b/icu4c/source/data/translit/Mlym_Orya.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Oriya.txt rename to icu4c/source/data/translit/Mlym_Orya.txt index 20709e1eef5..1771c1af934 100644 --- a/icu4c/source/data/translit/Malayalam_Oriya.txt +++ b/icu4c/source/data/translit/Mlym_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Malayalam_Oriya.txt +# File: Mlym_Orya.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Tamil.txt b/icu4c/source/data/translit/Mlym_Taml.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Tamil.txt rename to icu4c/source/data/translit/Mlym_Taml.txt index f391af5ff8d..3d2d96542c9 100644 --- a/icu4c/source/data/translit/Malayalam_Tamil.txt +++ b/icu4c/source/data/translit/Mlym_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Malayalam_Tamil.txt +# File: Mlym_Taml.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Malayalam_Telugu.txt b/icu4c/source/data/translit/Mlym_Telu.txt similarity index 81% rename from icu4c/source/data/translit/Malayalam_Telugu.txt rename to icu4c/source/data/translit/Mlym_Telu.txt index 6a45e1b3a1c..e5b84f08bb9 100644 --- a/icu4c/source/data/translit/Malayalam_Telugu.txt +++ b/icu4c/source/data/translit/Mlym_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Malayalam_Telugu.txt +# File: Mlym_Telu.txt # Generated from CLDR # + ::[ം-ഃഅ-ഌഎ-ഐഒ-നപ-ഹാ-\u0D43െ-ൈൊ-\u0D4Dൗൠ-ൡ൦-൯]; ::NFD; ::Malayalam-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_InterIndic.txt b/icu4c/source/data/translit/Oriya_InterIndic.txt index 46d19874c87..788624f0bd9 100644 --- a/icu4c/source/data/translit/Oriya_InterIndic.txt +++ b/icu4c/source/data/translit/Oriya_InterIndic.txt @@ -1,12 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Oriya_InterIndic.txt # Generated from CLDR # + +# Oriya-InterIndic +#:: NFD (NFC) ; +#ଡ\u0B3C→\uE05C;# LETTER RRA +#ଢ\u0B3C→\uE05D;# LETTER RHA େ\u0B56→\uE048;# VOWEL SIGN AI ୋ→\uE04B;# VOWEL SIGN O ୌ→\uE04C;# VOWEL SIGN AU @@ -68,11 +73,13 @@ \u0B42→\uE042; # VOWEL SIGN UU \u0B43→\uE043; # VOWEL SIGN VOCALIC R େ→\uE047; # VOWEL SIGN E +# \u0B4D→\uE04D; # SIGN VIRAMA \u0B56→\uE056; # AI LENGTH MARK ୗ→\uE057; # AU LENGTH MARK ।→\uE064; # DANDA ॥→\uE065; # DOUBLE DANDA +# ୟ→\uE05F; # LETTER YYA ୠ→\uE060; # LETTER VOCALIC RR ୡ→\uE061; # LETTER VOCALIC LL @@ -88,3 +95,6 @@ ୯→\uE06F; # DIGIT NINE ୰→\uE07B; # ISSHAR ୱ→\uE081; # LETTER WA +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/Oriya_Bengali.txt b/icu4c/source/data/translit/Orya_Beng.txt similarity index 83% rename from icu4c/source/data/translit/Oriya_Bengali.txt rename to icu4c/source/data/translit/Orya_Beng.txt index 91f64c36617..ed050f0f996 100644 --- a/icu4c/source/data/translit/Oriya_Bengali.txt +++ b/icu4c/source/data/translit/Orya_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Oriya_Bengali.txt +# File: Orya_Beng.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Devanagari.txt b/icu4c/source/data/translit/Orya_Deva.txt similarity index 82% rename from icu4c/source/data/translit/Oriya_Devanagari.txt rename to icu4c/source/data/translit/Orya_Deva.txt index 8cb6d6f8660..fa39cc45694 100644 --- a/icu4c/source/data/translit/Oriya_Devanagari.txt +++ b/icu4c/source/data/translit/Orya_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Oriya_Devanagari.txt +# File: Orya_Deva.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Gujarati.txt b/icu4c/source/data/translit/Orya_Gujr.txt similarity index 83% rename from icu4c/source/data/translit/Oriya_Gujarati.txt rename to icu4c/source/data/translit/Orya_Gujr.txt index 67a94d310f4..0c231d18b00 100644 --- a/icu4c/source/data/translit/Oriya_Gujarati.txt +++ b/icu4c/source/data/translit/Orya_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Oriya_Gujarati.txt +# File: Orya_Gujr.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Gurmukhi.txt b/icu4c/source/data/translit/Orya_Guru.txt similarity index 83% rename from icu4c/source/data/translit/Oriya_Gurmukhi.txt rename to icu4c/source/data/translit/Orya_Guru.txt index 86c4c2ba82e..13c44c4bd00 100644 --- a/icu4c/source/data/translit/Oriya_Gurmukhi.txt +++ b/icu4c/source/data/translit/Orya_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Oriya_Gurmukhi.txt +# File: Orya_Guru.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Kannada.txt b/icu4c/source/data/translit/Orya_Knda.txt similarity index 83% rename from icu4c/source/data/translit/Oriya_Kannada.txt rename to icu4c/source/data/translit/Orya_Knda.txt index f20fe430878..33eb48d46cd 100644 --- a/icu4c/source/data/translit/Oriya_Kannada.txt +++ b/icu4c/source/data/translit/Orya_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Oriya_Kannada.txt +# File: Orya_Knda.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Latin.txt b/icu4c/source/data/translit/Orya_Latn.txt similarity index 83% rename from icu4c/source/data/translit/Oriya_Latin.txt rename to icu4c/source/data/translit/Orya_Latn.txt index 8a4d3b6af2e..2fa449c7a11 100644 --- a/icu4c/source/data/translit/Oriya_Latin.txt +++ b/icu4c/source/data/translit/Orya_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Oriya_Latin.txt +# File: Orya_Latn.txt # Generated from CLDR # + ::[।-॥\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵଶ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-୰ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Malayalam.txt b/icu4c/source/data/translit/Orya_Mlym.txt similarity index 82% rename from icu4c/source/data/translit/Oriya_Malayalam.txt rename to icu4c/source/data/translit/Orya_Mlym.txt index be456501e2d..e1fd2369f46 100644 --- a/icu4c/source/data/translit/Oriya_Malayalam.txt +++ b/icu4c/source/data/translit/Orya_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Oriya_Malayalam.txt +# File: Orya_Mlym.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Tamil.txt b/icu4c/source/data/translit/Orya_Taml.txt similarity index 83% rename from icu4c/source/data/translit/Oriya_Tamil.txt rename to icu4c/source/data/translit/Orya_Taml.txt index 4e0d656526a..f3b64f853b0 100644 --- a/icu4c/source/data/translit/Oriya_Tamil.txt +++ b/icu4c/source/data/translit/Orya_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Oriya_Tamil.txt +# File: Orya_Taml.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Oriya_Telugu.txt b/icu4c/source/data/translit/Orya_Telu.txt similarity index 83% rename from icu4c/source/data/translit/Oriya_Telugu.txt rename to icu4c/source/data/translit/Orya_Telu.txt index 24965f9e5a7..1061f58447c 100644 --- a/icu4c/source/data/translit/Oriya_Telugu.txt +++ b/icu4c/source/data/translit/Orya_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Oriya_Telugu.txt +# File: Orya_Telu.txt # Generated from CLDR # + ::[\u0B01-ଃଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହ\u0B3C-\u0B43େ-ୈୋ-\u0B4D\u0B56-ୗଡ଼-ଢ଼ୟ-ୡ୦-ୱ]; ::NFD; ::Oriya-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Pinyin_NumericPinyin.txt b/icu4c/source/data/translit/Pinyin_NumericPinyin.txt index fd8e173fcb6..a0d1e3e49e8 100644 --- a/icu4c/source/data/translit/Pinyin_NumericPinyin.txt +++ b/icu4c/source/data/translit/Pinyin_NumericPinyin.txt @@ -1,14 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Pinyin_NumericPinyin.txt # Generated from CLDR # + +# Only intended for internal use \u0304 ↔ 1; \u0301 ↔ 2; \u030C ↔ 3; \u0300 ↔ 4; ← 5; + diff --git a/icu4c/source/data/translit/Russian_Latin_BGN.txt b/icu4c/source/data/translit/Russian_Latin_BGN.txt deleted file mode 100644 index 16d52cd7c3e..00000000000 --- a/icu4c/source/data/translit/Russian_Latin_BGN.txt +++ /dev/null @@ -1,114 +0,0 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
-# * -# *************************************************************************** -# File: Russian_Latin_BGN.txt -# Generated from CLDR -# -::[АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя]; -$prime = ʹ ; -$doublePrime = ʺ ; -$wordBoundary = [^[:L:][:M:][:N:]] ; -$upperConsonants = [БВГДЖЙКЛМНПРСТФХЦЧШЩЭ] ; -$lowerConsonants = [бвгджйклмнпрстфхцчшщэ] ; -$consonants = [$upperConsonants $lowerConsonants] ; -$upperVowels = [АЕЁЭИОУЫЮЯ] ; -$lowerVowels = [аеёэиоуыюя] ; -$vowels = [$upperVowels $lowerVowels] ; -$lower = [$lowerConsonants $lowerVowels] ; -$upper = [$upperConsonants $upperVowels] ; -[$upperVowels [ЙЪЬ]] { Е } $upper → YE ; # CYRILLIC CAPITAL LETTER IE -[$upperVowels [ЙЪЬ]] { Е → Ye ; # CYRILLIC CAPITAL LETTER IE -[$upperVowels $lowerVowels [ЙйЪъЬь]] { е → ye ; # CYRILLIC SMALL LETTER IE -[$upperVowels [ЙЪЬ]] { Ё } $upper → YË ; # CYRILLIC CAPITAL LETTER IO -[$upperVowels [ЙЪЬ]] { Ё → Yë ; # CYRILLIC CAPITAL LETTER IO -[$upperVowels $lowerVowels [ЙйЪъЬь]] { ё → yë ; # CYRILLIC SMALL LETTER IO -::Null; -А → A ; # CYRILLIC CAPITAL LETTER A -а → a ; # CYRILLIC SMALL LETTER A -Б → B ; # CYRILLIC CAPITAL LETTER BE -б → b ; # CYRILLIC SMALL LETTER BE -В → V ; # CYRILLIC CAPITAL LETTER VE -в → v ; # CYRILLIC SMALL LETTER VE -Г → G ; # CYRILLIC CAPITAL LETTER GHE -г → g ; # CYRILLIC SMALL LETTER GHE -Д → D ; # CYRILLIC CAPITAL LETTER DE -д → d ; # CYRILLIC SMALL LETTER DE -$wordBoundary{Е} $upper → YE ; # CYRILLIC CAPITAL LETTER IE -$wordBoundary{Е → Ye ; # CYRILLIC CAPITAL LETTER IE -Е → E ; # CYRILLIC CAPITAL LETTER IE -$wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE -е → e ; # CYRILLIC SMALL LETTER IE -$wordBoundary {Ё} $upper → YË ; # CYRILLIC CAPITAL LETTER IO -$wordBoundary {Ё} $lower → Yë ; # CYRILLIC CAPITAL LETTER IO -Ё → Ë ; # CYRILLIC CAPITAL LETTER IO -$wordBoundary{ё → yë ; # CYRILLIC SMALL LETTER IO -ё → ë ; # CYRILLIC SMALL LETTER IO -Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE -Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE -ж → 
zh ; # CYRILLIC SMALL LETTER ZHE -З → Z ; # CYRILLIC CAPITAL LETTER ZE -з → z ; # CYRILLIC SMALL LETTER ZE -И → I ; # CYRILLIC CAPITAL LETTER I -и → i ; # CYRILLIC SMALL LETTER I -Й → Y ; # CYRILLIC CAPITAL LETTER I -й → y ; # CYRILLIC SMALL LETTER I -К → K ; # CYRILLIC CAPITAL LETTER KA -к → k ; # CYRILLIC SMALL LETTER KA -Л → L ; # CYRILLIC CAPITAL LETTER EL -л → l ; # CYRILLIC SMALL LETTER EL -М → M ; # CYRILLIC CAPITAL LETTER EM -м → m ; # CYRILLIC SMALL LETTER EM -Н → N ; # CYRILLIC CAPITAL LETTER EN -н → n ; # CYRILLIC SMALL LETTER EN -О → O ; # CYRILLIC CAPITAL LETTER O -о → o ; # CYRILLIC SMALL LETTER O -П → P ; # CYRILLIC CAPITAL LETTER PE -п → p ; # CYRILLIC SMALL LETTER PE -Р → R ; # CYRILLIC CAPITAL LETTER ER -р → r ; # CYRILLIC SMALL LETTER ER -С → S ; # CYRILLIC CAPITAL LETTER ES -с → s ; # CYRILLIC SMALL LETTER ES -ТС → TS ; # CYRILLIC CAPITAL LETTER TE -Тс → Ts ; # CYRILLIC CAPITAL LETTER TE -тс → ts ; # CYRILLIC SMALL LETTER TE -Т → T ; # CYRILLIC CAPITAL LETTER TE -т → t ; # CYRILLIC SMALL LETTER TE -У → U ; # CYRILLIC CAPITAL LETTER U -у → u ; # CYRILLIC SMALL LETTER U -Ф → F ; # CYRILLIC CAPITAL LETTER EF -ф → f ; # CYRILLIC SMALL LETTER EF -Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA -Х → KH ; # CYRILLIC CAPITAL LETTER HA -х → kh ; # CYRILLIC SMALL LETTER HA -Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE -Ц → TS ; # CYRILLIC CAPITAL LETTER TSE -ц → ts ; # CYRILLIC SMALL LETTER TSE -Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE -Ч → CH ; # CYRILLIC CAPITAL LETTER CHE -ч → ch ; # CYRILLIC SMALL LETTER CHE -ШЧ → SHCH ; # CYRILLIC CAPITAL LETTER SHA -Шч → Shch ; # CYRILLIC CAPITAL LETTER SHA -шч → shch ; # CYRILLIC SMALL LETTER SHA -Ш} $lower → Sh ; # CYRILLIC CAPITAL LETTER SHA -Ш → SH ; # CYRILLIC CAPITAL LETTER SHA -ш → sh ; # CYRILLIC SMALL LETTER SHA -Щ} $lower → Shch ; # CYRILLIC CAPITAL LETTER SHCHA -Щ → SHCH ; # CYRILLIC CAPITAL LETTER SHCHA -щ → shch ; # CYRILLIC SMALL LETTER SHCHA -Ъ → $doublePrime ; # CYRILLIC CAPITAL LETTER HARD 
SIGN -ъ → $doublePrime ; # CYRILLIC SMALL LETTER HARD SIGN -Ы → Y ; # CYRILLIC CAPITAL LETTER YERU -ы → y ; # CYRILLIC SMALL LETTER YERU -Ь → $prime ; # CYRILLIC CAPITAL LETTER SOFT SIGN -ь → $prime ; # CYRILLIC SMALL LETTER SOFT SIGN -Э → E ; # CYRILLIC CAPITAL LETTER E -э → e ; # CYRILLIC SMALL LETTER E -Ю} $lower → Yu ; # CYRILLIC CAPITAL LETTER YU -Ю → YU ; # CYRILLIC CAPITAL LETTER YU -ю → yu ; # CYRILLIC SMALL LETTER YU -Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA -Я → YA ; # CYRILLIC CAPITAL LETTER YA -я → ya ; # CYRILLIC SMALL LETTER YA diff --git a/icu4c/source/data/translit/Syriac_Latin.txt b/icu4c/source/data/translit/Syrc_Latn.txt similarity index 87% rename from icu4c/source/data/translit/Syriac_Latin.txt rename to icu4c/source/data/translit/Syrc_Latn.txt index 24121d5cb57..391cce616c5 100644 --- a/icu4c/source/data/translit/Syriac_Latin.txt +++ b/icu4c/source/data/translit/Syrc_Latn.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Syriac_Latin.txt +# File: Syrc_Latn.txt # Generated from CLDR # + +# Consonants ܫ ↔ sh; ܞ → yh; ܖ ↔ dr; @@ -36,6 +38,7 @@ ܩ ↔ q; ܪ ↔ r; ܬ ↔ t; +# Vowels \u0730 → a; \u0731 → a; \u0732 ↔ a; @@ -53,4 +56,6 @@ \u073E → u; \u073C ↔ u; \u073F ↔ o; +# Punctuation ܍ → \*; + diff --git a/icu4c/source/data/translit/Tamil_InterIndic.txt b/icu4c/source/data/translit/Tamil_InterIndic.txt index 6bf5d83f7b0..df3bc223cf7 100644 --- a/icu4c/source/data/translit/Tamil_InterIndic.txt +++ b/icu4c/source/data/translit/Tamil_InterIndic.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Tamil_InterIndic.txt # Generated from CLDR # + +# Tamil-InterIndic +#:: NFD (NFC) ; ொ→\uE04A;# VOWEL SIGN O ோ→\uE04B;# VOWEL SIGN OO ௌ→\uE04C;# VOWEL SIGN AU @@ -71,3 +74,6 @@ ௱→\uE067\uE066\uE066; # UNMAPPED Tamil-InterIndic: NUMBER ONE HUNDRED ௲→\uE067\uE066\uE066\uE066;# UNMAPPED Tamil-InterIndic: NUMBER ONE THOUSAND \u0BE6→\uE066; +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/Tamil_Bengali.txt b/icu4c/source/data/translit/Taml_Beng.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Bengali.txt rename to icu4c/source/data/translit/Taml_Beng.txt index bc987cc3cf5..a7f58973d0a 100644 --- a/icu4c/source/data/translit/Tamil_Bengali.txt +++ b/icu4c/source/data/translit/Taml_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. 
All Rights Reserved. # * # *************************************************************************** -# File: Tamil_Bengali.txt +# File: Taml_Beng.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Devanagari.txt b/icu4c/source/data/translit/Taml_Deva.txt similarity index 82% rename from icu4c/source/data/translit/Tamil_Devanagari.txt rename to icu4c/source/data/translit/Taml_Deva.txt index 58a8db01ae1..479a79b1184 100644 --- a/icu4c/source/data/translit/Tamil_Devanagari.txt +++ b/icu4c/source/data/translit/Taml_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Tamil_Devanagari.txt +# File: Taml_Deva.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Gujarati.txt b/icu4c/source/data/translit/Taml_Gujr.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Gujarati.txt rename to icu4c/source/data/translit/Taml_Gujr.txt index 1ea8840d581..0fd1ba6a7cf 100644 --- a/icu4c/source/data/translit/Tamil_Gujarati.txt +++ b/icu4c/source/data/translit/Taml_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Tamil_Gujarati.txt +# File: Taml_Gujr.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Gurmukhi.txt b/icu4c/source/data/translit/Taml_Guru.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Gurmukhi.txt rename to icu4c/source/data/translit/Taml_Guru.txt index da04e686b86..2bbe846af36 100644 --- a/icu4c/source/data/translit/Tamil_Gurmukhi.txt +++ b/icu4c/source/data/translit/Taml_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Tamil_Gurmukhi.txt +# File: Taml_Guru.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Kannada.txt b/icu4c/source/data/translit/Taml_Knda.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Kannada.txt rename to icu4c/source/data/translit/Taml_Knda.txt index 962b33f2e9e..7a5cf729868 100644 --- a/icu4c/source/data/translit/Tamil_Kannada.txt +++ b/icu4c/source/data/translit/Taml_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Tamil_Kannada.txt +# File: Taml_Knda.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Latin.txt b/icu4c/source/data/translit/Taml_Latn.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Latin.txt rename to icu4c/source/data/translit/Taml_Latn.txt index b7fb5eb7ff9..73718622b5f 100644 --- a/icu4c/source/data/translit/Tamil_Latin.txt +++ b/icu4c/source/data/translit/Taml_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Tamil_Latin.txt +# File: Taml_Latn.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Malayalam.txt b/icu4c/source/data/translit/Taml_Mlym.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Malayalam.txt rename to icu4c/source/data/translit/Taml_Mlym.txt index 772d87b7296..8ba488e0eb6 100644 --- a/icu4c/source/data/translit/Tamil_Malayalam.txt +++ b/icu4c/source/data/translit/Taml_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Tamil_Malayalam.txt +# File: Taml_Mlym.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Oriya.txt b/icu4c/source/data/translit/Taml_Orya.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Oriya.txt rename to icu4c/source/data/translit/Taml_Orya.txt index c4678483142..a951df5138d 100644 --- a/icu4c/source/data/translit/Tamil_Oriya.txt +++ b/icu4c/source/data/translit/Taml_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Tamil_Oriya.txt +# File: Taml_Orya.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Tamil_Telugu.txt b/icu4c/source/data/translit/Taml_Telu.txt similarity index 83% rename from icu4c/source/data/translit/Tamil_Telugu.txt rename to icu4c/source/data/translit/Taml_Telu.txt index bda82bf7bb8..3170a78b968 100644 --- a/icu4c/source/data/translit/Tamil_Telugu.txt +++ b/icu4c/source/data/translit/Taml_Telu.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Tamil_Telugu.txt +# File: Taml_Telu.txt # Generated from CLDR # + ::[\u0BE6\u0B82-ஃஅ-ஊஎ-ஐஒ-கங-சஜஞ-டண-தந-பம-வஷ-ஹா-ூெ-ைொ-\u0BCDௗ௧-௲ஶ]; ::NFD; ::Tamil-InterIndic; ::InterIndic-Telugu; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Bengali.txt b/icu4c/source/data/translit/Telu_Beng.txt similarity index 82% rename from icu4c/source/data/translit/Telugu_Bengali.txt rename to icu4c/source/data/translit/Telu_Beng.txt index 9c354c7623e..aba3698f0ac 100644 --- a/icu4c/source/data/translit/Telugu_Bengali.txt +++ b/icu4c/source/data/translit/Telu_Beng.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Telugu_Bengali.txt +# File: Telu_Beng.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Bengali; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Devanagari.txt b/icu4c/source/data/translit/Telu_Deva.txt similarity index 82% rename from icu4c/source/data/translit/Telugu_Devanagari.txt rename to icu4c/source/data/translit/Telu_Deva.txt index 022131468d6..4789cc9141d 100644 --- a/icu4c/source/data/translit/Telugu_Devanagari.txt +++ b/icu4c/source/data/translit/Telu_Deva.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Telugu_Devanagari.txt +# File: Telu_Deva.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Devanagari; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Gujarati.txt b/icu4c/source/data/translit/Telu_Gujr.txt similarity index 82% rename from icu4c/source/data/translit/Telugu_Gujarati.txt rename to icu4c/source/data/translit/Telu_Gujr.txt index d4c24cec701..d5dbc8095ff 100644 --- a/icu4c/source/data/translit/Telugu_Gujarati.txt +++ b/icu4c/source/data/translit/Telu_Gujr.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Telugu_Gujarati.txt +# File: Telu_Gujr.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Gujarati; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Gurmukhi.txt b/icu4c/source/data/translit/Telu_Guru.txt similarity index 82% rename from icu4c/source/data/translit/Telugu_Gurmukhi.txt rename to icu4c/source/data/translit/Telu_Guru.txt index 0d79783973a..3613b42274e 100644 --- a/icu4c/source/data/translit/Telugu_Gurmukhi.txt +++ b/icu4c/source/data/translit/Telu_Guru.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Telugu_Gurmukhi.txt +# File: Telu_Guru.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Gurmukhi; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Kannada.txt b/icu4c/source/data/translit/Telu_Knda.txt similarity index 82% rename from icu4c/source/data/translit/Telugu_Kannada.txt rename to icu4c/source/data/translit/Telu_Knda.txt index ab94315ec3f..2ec397a71a5 100644 --- a/icu4c/source/data/translit/Telugu_Kannada.txt +++ b/icu4c/source/data/translit/Telu_Knda.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Telugu_Kannada.txt +# File: Telu_Knda.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Kannada; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Latin.txt b/icu4c/source/data/translit/Telu_Latn.txt similarity index 83% rename from icu4c/source/data/translit/Telugu_Latin.txt rename to icu4c/source/data/translit/Telu_Latn.txt index 8d3a0d7b53f..005a08976cd 100644 --- a/icu4c/source/data/translit/Telugu_Latin.txt +++ b/icu4c/source/data/translit/Telu_Latn.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Telugu_Latin.txt +# File: Telu_Latn.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Malayalam.txt b/icu4c/source/data/translit/Telu_Mlym.txt similarity index 82% rename from icu4c/source/data/translit/Telugu_Malayalam.txt rename to icu4c/source/data/translit/Telu_Mlym.txt index 5465ca1450b..dbe2f4199a9 100644 --- a/icu4c/source/data/translit/Telugu_Malayalam.txt +++ b/icu4c/source/data/translit/Telu_Mlym.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Telugu_Malayalam.txt +# File: Telu_Mlym.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Malayalam; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Oriya.txt b/icu4c/source/data/translit/Telu_Orya.txt similarity index 83% rename from icu4c/source/data/translit/Telugu_Oriya.txt rename to icu4c/source/data/translit/Telu_Orya.txt index 40cf346efaa..3badd8c917a 100644 --- a/icu4c/source/data/translit/Telugu_Oriya.txt +++ b/icu4c/source/data/translit/Telu_Orya.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Telugu_Oriya.txt +# File: Telu_Orya.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Oriya; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_Tamil.txt b/icu4c/source/data/translit/Telu_Taml.txt similarity index 83% rename from icu4c/source/data/translit/Telugu_Tamil.txt rename to icu4c/source/data/translit/Telu_Taml.txt index 18214687c69..e0852655cb8 100644 --- a/icu4c/source/data/translit/Telugu_Tamil.txt +++ b/icu4c/source/data/translit/Telu_Taml.txt @@ -1,14 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Telugu_Tamil.txt +# File: Telu_Taml.txt # Generated from CLDR # + ::[ఁ-ఃఅ-ఌఎ-ఐఒ-నప-ళవ-హ\u0C3E-ౄ\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56ౠ-ౡ౦-౯]; ::NFD; ::Telugu-InterIndic; ::InterIndic-Tamil; ::NFC; + diff --git a/icu4c/source/data/translit/Telugu_InterIndic.txt b/icu4c/source/data/translit/Telugu_InterIndic.txt index 8981a1e1ce4..9cfff21f96d 100644 --- a/icu4c/source/data/translit/Telugu_InterIndic.txt +++ b/icu4c/source/data/translit/Telugu_InterIndic.txt @@ -1,12 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Telugu_InterIndic.txt # Generated from CLDR # + +# Telugu-InterIndic +#:: NFD (NFC) ; \u0C46\u0C4D\u0C56→\uE048\uE04D; \u0C46\u0C56→\uE048;# VOWEL SIGN AI ఁ→\uE001; # SIGN CANDRABINDU @@ -88,3 +91,6 @@ ౭→\uE06D; # DIGIT SEVEN ౮→\uE06E; # DIGIT EIGHT ౯→\uE06F; # DIGIT NINE +# :: NFC (NFD) ; +# eof + diff --git a/icu4c/source/data/translit/ThaiLogical_Latin.txt b/icu4c/source/data/translit/ThaiLogical_Latin.txt index 4912be5a56a..84f6f12917e 100644 --- a/icu4c/source/data/translit/ThaiLogical_Latin.txt +++ b/icu4c/source/data/translit/ThaiLogical_Latin.txt @@ -1,14 +1,40 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: ThaiLogical_Latin.txt # Generated from CLDR # + +# Thai-Latin +# This set of rules follows ISO 11940 +# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf +# except that that does not mention an implicit vowel, so we use o\u0323 +# +# The transcription is fairly ugly, so we ought to also do the UNGEGN version +# see: http://www.eki.ee/wgrs/rom1_th.pdf +# and probably make that the main variant. +# +# Note: this is an internal file. The NFD/NFC is handled externally, in the index +# The insertion of spaces between words, the reversal of the vowels +# and the conversion of space to semicolon are done *outside* of these rules. +# So as far as these rules are concerned, the vowels are in logical order! 
+# insert implicit vowel (and remove it going the other way) +# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically +#$consonant = [ก-ฮ]; +#$vowel = [ะ-\u0E3Aเ-ไ\u0E47]; +#{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ; +#\uE000 → o\u0323 ; +# ← o\u0323 ; $notAbove = [^\p{ccc=0}\p{ccc=above}] ; $notBelow = [^\p{ccc=0}\p{ccc=below}] ; +# Consonants +# Warning: the 'h's need to be handled carefully! +# What we really want to say is the following, but we can't +# $notHAccent = !($notAbove* \u0304 | $notBelow* \u0323) ; +# Since the only accents we care about that could cause problems are free-standing accents below, we use instead: $freeStandingBelow = [\u0325 ]; $hAccent = [ \u0304 \u0323]; $notHAccent0 = [^$freeStandingBelow$hAccent]; @@ -40,8 +66,10 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; ธ ↔ t\u0323h ; # THAI CHARACTER THO THONG ท ← th } $notHAccent1 ; # THAI CHARACTER THO THAHAN ท ↔ th } $notHAccent0 ; # THAI CHARACTER THO THAHAN +#Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses vertical tick. ฏ ↔ t\u0329 ; # THAI CHARACTER TO PATAK ต ↔ t ; # THAI CHARACTER TO TAO +# since there is no singleton g (generated), don't worry about that.
ง ↔ ng ; # THAI CHARACTER NGO NGU ณ ↔ n\u0323 ; # THAI CHARACTER NO NEN น ↔ n ; # THAI CHARACTER NO NU @@ -67,9 +95,11 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; ฟ ↔ f ; # THAI CHARACTER FO FAN อ ↔ x ; # THAI CHARACTER O ANG ซ ↔ s ; # THAI CHARACTER SO SO +# vowels \u0E31 ↔ a\u0323 ; # THAI CHARACTER MAI HAN-AKAT า → a\u0304 ; # THAI CHARACTER SARA AA า | $1 ← a ($notAbove*) \u0304; # backward case, account for reordering +# We deviate from ISO for SARA AM for disambiguation ำ → a \u0309; # THAI CHARACTER SARA AM ำ | $1 ← a ($notAbove*) \u0309 ; # backward case, account for reordering ะ ↔ a ; # THAI CHARACTER SARA A @@ -82,6 +112,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; \u0E39 | $1 ← u ($notAbove*) \u0304 ; # backward case, account for reordering \u0E38 ↔ u ; # THAI CHARACTER SARA U ฯ ↔ ‡ ; # THAI CHARACTER PAIYANNOI +# ฿ ↔ XXX ; # THAI CURRENCY SYMBOL BAHT เ ↔ e ; # THAI CHARACTER SARA E แ ↔ æ ; # THAI CHARACTER SARA AE โ ↔ o ; # THAI CHARACTER SARA O @@ -95,6 +126,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; \u0E4B ↔ \u030C ; # THAI CHARACTER MAI CHATTAWA \u0E4C ↔ \u0312 ; # THAI CHARACTER THANTHAKHAT \u0E4E ↔ '~' ; # THAI CHARACTER YAMAKKAN +# We deviate from ISO for disambiguation \u0E4D ↔ \u030A ; # THAI CHARACTER NIKHAHIT ๏ ↔ '§' ; # THAI CHARACTER FONGMAN ๐ ↔ 0 ; # THAI DIGIT ZERO @@ -110,11 +142,15 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; ๚ ↔ '||' ; # THAI CHARACTER ANGKHANKHU ๛ ↔ » ; # THAI CHARACTER KHOMUT ๆ ↔ « ; # THAI CHARACTER MAIYAMOK +# moved down to make shorter first +#Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below. 
\u0E3A ↔ ˌ ; # THAI CHARACTER PHINTHU \u0E34 ↔ i ; # THAI CHARACTER SARA I +# fallbacks | k ← g ; | k ← h ; | c ← j ; | k ← q ; | s ← z ; :: (lower); + diff --git a/icu4c/source/data/translit/Thai_Latin.txt b/icu4c/source/data/translit/Thai_Latn.txt similarity index 81% rename from icu4c/source/data/translit/Thai_Latin.txt rename to icu4c/source/data/translit/Thai_Latn.txt index 3b8f70a9919..6f7e2364489 100644 --- a/icu4c/source/data/translit/Thai_Latin.txt +++ b/icu4c/source/data/translit/Thai_Latn.txt @@ -1,12 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Thai_Latin.txt +# File: Thai_Latn.txt # Generated from CLDR # + ::[[:thai:] ก-\u0E3Aเ-๛]; ::NFD; ::Thai-ThaiSemi; @@ -14,3 +15,4 @@ ::Thai-ThaiLogical; ::ThaiLogical-Latin; ::NFC; + diff --git a/icu4c/source/data/translit/Thai_ThaiLogical.txt b/icu4c/source/data/translit/Thai_ThaiLogical.txt index 76ac08f59db..791320fbdbc 100644 --- a/icu4c/source/data/translit/Thai_ThaiLogical.txt +++ b/icu4c/source/data/translit/Thai_ThaiLogical.txt @@ -1,16 +1,25 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: Thai_ThaiLogical.txt # Generated from CLDR # + +# This reverses the Thai LogicalOrderException vowels, and does (part of) spaces +# The rules that convert space into semicolon are in another file; +# since they have to come BEFORE the break iterator $thai = [[:thai:] ก-\u0E3Aเ-๛] ; +# First convert the semicolon back ' ' ← $thai { '; ' } $thai; +# Remove any other spaces between thai letters ← $thai { ' ' } $thai; +# Now vowels $thai_reversing = [[:Logical_Order_Exception:] & $thai]; $thai_non_reversing = [$thai - $thai_reversing ]; ( $thai_reversing ) ( $thai_non_reversing ) → $2 $1; +# other direction $2 $1 ← ( $thai_non_reversing ) ( $thai_reversing ) ; + diff --git a/icu4c/source/data/translit/Thai_ThaiSemi.txt b/icu4c/source/data/translit/Thai_ThaiSemi.txt index e0fcff18a7f..78cce79c526 100644 --- a/icu4c/source/data/translit/Thai_ThaiSemi.txt +++ b/icu4c/source/data/translit/Thai_ThaiSemi.txt @@ -1,11 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: Thai_ThaiSemi.txt # Generated from CLDR # + +# The rules that convert space into semicolon are in this file; +# since they have to come BEFORE the break iterator. 
$thai = [[:thai:] ก-\u0E3Aเ-๛] ; $thai { ' ' } $thai → '; ' ; + diff --git a/icu4c/source/data/translit/am_am_FONIPA.txt b/icu4c/source/data/translit/am_am_FONIPA.txt index 20467a7e3e4..195439d5fd3 100755 --- a/icu4c/source/data/translit/am_am_FONIPA.txt +++ b/icu4c/source/data/translit/am_am_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** @@ -8,29 +8,70 @@ # Generated from CLDR # -\u135D → ''; # U+135D ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK -\u135E → ''; # U+135E ETHIOPIC COMBINING VOWEL LENGTH MARK -\u135F → ''; # U+135F ETHIOPIC COMBINING GEMINATION MARK - +# Transforms Amharic (am) to Amharic in phonemic IPA transcription (am_FONIPA). +# +# Long vowels, long/geminated consonants: +# In the direction from am_FONIPA to am, we emit Ethiopic gemination +# and vowel length markers (U+135D, U+135E, U+135F) although +# they are rarely written in Amharic text. Exceptions include +# school books and textbooks for non-native speakers. +# Clients who do not want these markers can easily strip them off +# in a post-processing step. +# +# Labialization: +# Amharic speakers will usually say ሟ as [mʷa] instead of [mwa]; +# labializing [m] instead of saying [m] followed by a separate [w]. +# Most Amharic consonants can get labialized. To keep the phonemic +# transcription simple, we emit /m/ + /w/; otherwise, our phoneme +# set would almost double, and it would include very unusual phonemes +# such as /ɲʷ/ or /t\u0361ʃʼʷ/. +# +# References: +# [1] The Ge’ez Frontier Foundation: “Principles and Specification +# for Mnemonic Ethiopic Keyboards.” Version of January 17, 2009; +# retrieved on November 4, 2014. 
+# http://keyboards.ethiopic.org/specification/GFF-MnemonicEthiopicKeyboardSpecification.pdf +# Other than most online sources, this report uses correct IPA notation +# with the exception of /j/, which it consistently (but wrongly) +# writes as */y/. +$IPA_VOWEL = [aeəiɨou]; +$IPA_CONSONANT = [mnɲɴ p{pʼ}bt{tʼ}dk{kʼ}ɡʔʕ fvs{sʼ}zʃʒxh lr {t\u0361ʃ}{t\u0361ʃʼ}{d\u0361ʒ}]; +# Some consonants have a special syllable when labialized, such as ፗ ↔ /pʷa/. +# Amharic restricts this mostly to /a/ syllables. While the Ethiopic script +# does offer labialized syllables for other vowels, these are typically +# not written in Amharic. +$LABIALIZABLE_BEFORE_A = [p{pʼ}t{tʼ} {t\u0361ʃ}{t\u0361ʃʼ}{d\u0361ʒ}{d\u0361ʒʼ} s{sʼ}zʃʒ fv r]; +← [ ʼ \u0361 \u035C \u032F]; +::(null); +# Appendix B of [1] transcribes ሀ as /hə/. However, according to +# an Amharic-speaking person, there is no /hə/ sequence +# in Amharic; instead, it gets pronounced as /ha/. ሀ → ha; -ሁ → hu; -ሂ → hi; -ሃ → ha; -ሄ → he; -ህ → hɨ; -ሆ → ho; +ሀ ← hə; +ሁ ↔ hu; +ሂ ↔ hi; +ሃ ↔ ha; +ሄ ↔ he; +ህ ↔ hɨ; +ሆ ↔ ho; ሇ → ho; # Dizi, Me’en, Mursi, Suri /hɔ/ ([1], Appendix E); not used in Amharic. - -ለ → lə; -ሉ → lu; -ሊ → li; -ላ → la; -ሌ → le; -ል → lɨ; -ሎ → lo; +ህ ← h; +ለ ↔ lə; +ሉ ↔ lu; +ሊ ↔ li; +ላ ↔ la; +ሌ ↔ le; +ል ↔ lɨ; +ሎ ↔ lo; ⶀ → lo; # Dizi, Me’en, Mursi, Suri /lɔ/ ([1], Appendix E); not used in Amharic. -ሏ → lwa; - +ሏ ↔ lwa; +ል ← l; +# Appendix B of [1] transcribes ሐ as Voiceless pharyngeal fricative +# /ħə/. However, according to an Amharic-speaking person, Amharic +# makes no difference in pronunciation between ሐ...ሓ and ሀ...ሃ; both +# are pronounced as Voiceless glottal fricative /h/. Also, according +# to the speaker there is no /hə/ sequence in Amharic; instead, it +# gets pronounced as /ha/. 
ሐ → ha; ሑ → hu; ሒ → hi; @@ -39,22 +80,21 @@ ሕ → hɨ; ሖ → ho; ሗ → hwa; - -መ → mə; -ሙ → mu; -ሚ → mi; -ማ → ma; -ሜ → me; -ም → mɨ; -ሞ → mo; +መ ↔ mə; +ሙ ↔ mu; +ሚ ↔ mi; +ማ ↔ ma; +ሜ ↔ me; +ም ↔ mɨ; +ሞ ↔ mo; ⶁ → mo; # Dizi, Me’en, Mursi, Suri /mɔ/ ([1], Appendix E); not used in Amharic. ᎀ → mwə; # Sebatbeit /mwə/ ([1], Appendix H); not used in Amharic. ᎃ → mwu; # Sebatbeit /mwu/ ([1], Appendix H); not used in Amharic. ᎁ → mwi; # Sebatbeit /mwi/ ([1], Appendix H); not used in Amharic. -ሟ → mwa; +ሟ ↔ mwa; ᎂ → mwe; # Sebatbeit /mwe/ ([1], Appendix H); not used in Amharic. ፙ → mja; # Unclear which language; Appendix L of [1] transcribes ፙ as /mʲa/. - +ም ← m; ሠ → sə; ሡ → su; ሢ → si; @@ -63,38 +103,18 @@ ሥ → sɨ; ሦ → so; ሧ → swa; - -ረ → rə; -ሩ → ru; -ሪ → ri; -ራ → ra; -ሬ → re; -ር → rɨ; -ሮ → ro; +ረ ↔ rə; +ሩ ↔ ru; +ሪ ↔ ri; +ራ ↔ ra; +ሬ ↔ re; +ር ↔ rɨ; +ሮ ↔ ro; ⶂ → ro; # Dizi, Me’en, Mursi, Suri /rɔ/ ([1], Appendix E); not used in Amharic. -ሯ → rwa; +ሯ ↔ rwa; ፘ → rja; # Unclear which language; Appendix L of [1] transcribes ፘ as /rʲa/. - -ሰ → sə; -ሱ → su; -ሲ → si; -ሳ → sa; -ሴ → se; -ስ → sɨ; -ሶ → so; -ⶃ → so; # Dizi, Me’en, Mursi, Suri /sɔ/ ([1], Appendix E); not used in Amharic. -ሷ → swa; - -ሸ → ʃə; -ሹ → ʃu; -ሺ → ʃi; -ሻ → ʃa; -ሼ → ʃe; -ሽ → ʃɨ; -ሾ → ʃo; -ⶄ → ʃo; # Dizi, Me’en, Mursi, Suri /ʃɔ/ ([1], Appendix E); not used in Amharic. -ሿ → ʃwa; - +ር ← r; +# Amharic speakers pronounce ⶠ like ሸ. Source: [1], Appendix B. ⶠ → ʃə; ⶡ → ʃu; ⶢ → ʃi; @@ -102,21 +122,34 @@ ⶤ → ʃe; ⶥ → ʃɨ; ⶦ → ʃo; - -ቀ → kʼə; -ቁ → kʼu; -ቂ → kʼi; -ቃ → kʼa; -ቄ → kʼe; -ቅ → kʼɨ; -ቆ → kʼo; +ሸ ↔ ʃə; +ሹ ↔ ʃu; +ሺ ↔ ʃi; +ሻ ↔ ʃa; +ሼ ↔ ʃe; +ሽ ↔ ʃɨ; +ሾ ↔ ʃo; +ⶄ → ʃo; # Dizi, Me’en, Mursi, Suri /ʃɔ/ ([1], Appendix E); not used in Amharic. +ሿ ↔ ʃwa; +ሽ ← ʃ; +ቀ ↔ kʼə; +ቁ ↔ kʼu; +ቂ ↔ kʼi; +ቃ ↔ kʼa; +ቄ ↔ kʼe; +ቅ ↔ kʼɨ; +ቆ ↔ kʼo; ቇ → kʼo; # Dizi, Me’en, Mursi, Suri /kʼɔ/ ([1], Appendix E); not used in Amharic. 
-ቈ → kʼwə; -ቍ → kʼwu; -ቊ → kʼwi; -ቋ → kʼwa; -ቌ → kʼwe; - +ቈ ↔ kʼwə; +ቍ ↔ kʼwu; +ቊ ↔ kʼwi; +ቋ ↔ kʼwa; +ቌ ↔ kʼwe; +ቅ ← kʼ; +# In Awngi, Blin, Qimant, and Xamtanga, ቐ is spoken as voiced uvular fricative [ʁ]. +# Source: [1], Appendix C. However, */ʁ/ is not an Amharic phoneme. +# When reading foreign words with ቐ, Amharic speakers pronounce +# ቐ like ቀ, i.e. as velar ejective /kʼ/. ቐ → kʼə; ቑ → kʼu; ቒ → kʼi; @@ -129,7 +162,8 @@ ቚ → kʼwi; ቛ → kʼwa; ቜ → kʼwe; - +# In Sebatbeit, ⷀ is spoken as palatalized velar ejective /kʼʲ/ ([1], Appendix H). +# In Amharic, the syllable is not used, but it might appear in names. ⷀ → kʼjə; ⷁ → kʼju; ⷂ → kʼji; @@ -137,49 +171,32 @@ ⷄ → kʼje; ⷅ → kʼjɨ; ⷆ → kʼjo; - -በ → bə; -ቡ → bu; -ቢ → bi; -ባ → ba; -ቤ → be; -ብ → bɨ; -ቦ → bo; +በ ↔ bə; +ቡ ↔ bu; +ቢ ↔ bi; +ባ ↔ ba; +ቤ ↔ be; +ብ ↔ bɨ; +ቦ ↔ bo; ⶅ → bo; # Dizi, Me’en, Mursi, Suri /bɔ/ ([1], Appendix E); not used in Amharic. ᎄ → bwə; # Sebatbeit /bʷə/ ([1], Appendix H); not used in Amharic. ᎇ → bwu; # Sebatbeit /bʷu/ ([1], Appendix H); not used in Amharic. ᎅ → bwi; # Sebatbeit /bʷi/ ([1], Appendix H); not used in Amharic. ቧ → bwa; # Sebatbeit /bʷa/ ([1], Appendix H); not used in Amharic. ᎆ → bwe; # Sebatbeit /bʷe/ ([1], Appendix H); not used in Amharic. - -ቨ → və; -ቩ → vu; -ቪ → vi; -ቫ → va; -ቬ → ve; -ቭ → vɨ; -ቮ → vo; -ቯ → vwa; - -ተ → tə; -ቱ → tu; -ቲ → ti; -ታ → ta; -ቴ → te; -ት → tɨ; -ቶ → to; -ⶆ → to; # Dizi, Me’en, Mursi, Suri /tɔ/ ([1], Appendix E); not used in Amharic. -ቷ → twa; - -ቸ → t\u0361ʃə; -ቹ → t\u0361ʃu; -ቺ → t\u0361ʃi; -ቻ → t\u0361ʃa; -ቼ → t\u0361ʃe; -ች → t\u0361ʃɨ; -ቾ → t\u0361ʃo; -ቿ → t\u0361ʃwa; - +ብ ← b; +ቨ ↔ və; +ቩ ↔ vu; +ቪ ↔ vi; +ቫ ↔ va; +ቬ ↔ ve; +ቭ ↔ vɨ; +ቮ ↔ vo; +ቯ ↔ vwa; +ቭ ← v; +# Unclear which Ethiopic language uses ⶨ. It only appears in the +# “Language Neutral” list of Appendix L in [1], which transcribes it as t\u0361ʃ. +# For Amharic, we pronounce ⶨ therefore like ቸ. 
ⶨ → t\u0361ʃə; ⶩ → t\u0361ʃu; ⶪ → t\u0361ʃi; @@ -187,8 +204,11 @@ ⶬ → t\u0361ʃe; ⶭ → t\u0361ʃɨ; ⶮ → t\u0361ʃo; - - +# In Amharic, ኀ is pronounced like ሀ. +# Source: [1], section on “Phonological Redundancy” for Amharic, page 5. +# Appendix B of [1] transcribes ሀ as /hə/. However, according to +# an Amharic-speaking person, there is no /hə/ sequence in Amharic. +# Instead, ሀ (and hence also ኀ) gets pronounced as /ha/. ኀ → ha; ኁ → hu; ኂ → hi; @@ -202,49 +222,79 @@ ኊ → hwi; ኋ → hwa; ኌ → hwe; - -ነ → nə; -ኑ → nu; -ኒ → ni; -ና → na; -ኔ → ne; -ን → nɨ; -ኖ → no; +ነ ↔ nə; +ኑ ↔ nu; +ኒ ↔ ni; +ና ↔ na; +ኔ ↔ ne; +ን ↔ nɨ; +ኖ ↔ no; ⶈ → no; # Dizi, Me’en, Mursi, Suri /nɔ/ ([1], Appendix E); not used in Amharic. -ኗ → nwa; - -ኘ → ɲə; -ኙ → ɲu; -ኚ → ɲi; -ኛ → ɲa; -ኜ → ɲe; -ኝ → ɲɨ; -ኞ → ɲo; +ኗ ↔ nwa; +ን ← n; +ኘ ↔ ɲə; +ኙ ↔ ɲu; +ኚ ↔ ɲi; +ኛ ↔ ɲa; +ኜ ↔ ɲe; +ኝ ↔ ɲɨ; +ኞ ↔ ɲo; ⶉ → ɲo; # Dizi, Me’en, Mursi, Suri /ɲɔ/ ([1], Appendix E); not used in Amharic. -ኟ → ɲwa; - -አ → ʔə; -ኡ → ʔu; -ኢ → ʔi; -ኣ → ʔa; -ኤ → ʔe; -እ → ʔɨ; -ኦ → ʔo; +ኟ ↔ ɲwa; +ኝ ← ɲ; +# Amharic speakers pronounce ኸ as [h] because Amharic has no [x] sound. +# However, in transliterations of foreign (eg. Spanish) words with [x], +# several Amharic speakers have confirmed that they prefer ኻ over ሃ. +ዀ → hwə; +ዂ → hwi; +ዃ → hwa; +ዄ → hwe; +ዅ → hwɨ; +ኸ → hə; +ኹ → hu; +ኺ → hi; +ኻ → ha; +ኼ → he; +ኽ → hɨ; +ኾ → ho; +ዀ ← xwə; +ዂ ← xwi; +ዃ ← xwa; +ዄ ← xwe; +ዅ ← xwɨ; +ዅ ← xw; +ኸ ← xə; +ኹ ← xu; +ኺ ← xi; +ኻ ← xa; +ኼ ← xe; +ኽ ← xɨ; +ኾ ← xo; +ኽ ← x; +አ ↔ ʔə; +ኡ ↔ ʔu; +ኢ ↔ ʔi; +ኣ ↔ ʔa; +ኤ ↔ ʔe; +እ ↔ ʔɨ; +ኦ ↔ ʔo; ⶊ → ʔo; # Dizi, Me’en, Mursi, Suri /ɲɔ/ ([1], Appendix E); not used in Amharic. - -ከ → kə; -ኩ → ku; -ኪ → ki; -ካ → ka; -ኬ → ke; -ክ → kɨ; -ኮ → ko; -ኰ → kwə; -ኵ → kwu; -ኲ → kwi; -ኳ → kwa; -ኴ → kwe; - +እ ← ʔ; +ከ ↔ kə; +ኩ ↔ ku; +ኪ ↔ ki; +ካ ↔ ka; +ኬ ↔ ke; +ክ ↔ kɨ; +ኮ ↔ ko; +ኰ ↔ kwə; +ኵ ↔ kwu; +ኲ ↔ kwi; +ኳ ↔ kwa; +ኴ ↔ kwe; +ክ ← k; +# In Sebatbeit, ⷈ is spoken as palatalized velar plosive /kʲ/ ([1], Appendix H). 
+# Amharic speakers pronounce it as /k/ without palatalization. ⷈ → kə; ⷉ → ku; ⷊ → ki; @@ -252,7 +302,9 @@ ⷌ → ke; ⷍ → kɨ; ⷎ → ko; - +# In Sebatbeit, ⷐ is spoken as palatalized voiceless velar fricative/xʲə/ +# according to [1], Appendix H. When the syllable appears in names, +# Amharic speakers pronounce it as /kə/ without palatalization. ⷐ → kə; ⷑ → ku; ⷒ → ki; @@ -260,43 +312,45 @@ ⷔ → ke; ⷕ → kɨ; ⷖ → ko; - -ወ → wə; -ዉ → wu; -ዊ → wi; -ዋ → wa; -ዌ → we; -ው → wɨ; -ዎ → wo; +ወ ↔ wə; +ዉ ↔ wu; +ዊ ↔ wi; +ዋ ↔ wa; +ዌ ↔ we; +ው ↔ wɨ; +ዎ ↔ wo; ዏ → wo; # Dizi, Me’en, Mursi, Suri /wɔ/ ([1], Appendix E); not used in Amharic. - -ዐ → ʕə; -ዑ → ʕu; -ዒ → ʕi; -ዓ → ʕa; -ዔ → ʕe; -ዕ → ʕɨ; -ዖ → ʕo; - -ዘ → zə; -ዙ → zu; -ዚ → zi; -ዛ → za; -ዜ → ze; -ዝ → zɨ; -ዞ → zo; +ው ← w; +ዐ ↔ ʕə; +ዑ ↔ ʕu; +ዒ ↔ ʕi; +ዓ ↔ ʕa; +ዔ ↔ ʕe; +ዕ ↔ ʕɨ; +ዖ ↔ ʕo; +ዒ ← ʕ; +ዘ ↔ zə; +ዙ ↔ zu; +ዚ ↔ zi; +ዛ ↔ za; +ዜ ↔ ze; +ዝ ↔ zɨ; +ዞ ↔ zo; ⶋ → zo; # Dizi, Me’en, Mursi, Suri /zɔ/ ([1], Appendix E); not used in Amharic. -ዟ → zwa; - -ዠ → ʒə; -ዡ → ʒu; -ዢ → ʒi; -ዣ → ʒa; -ዤ → ʒe; -ዥ → ʒɨ; -ዦ → ʒo; -ዧ → ʒwa; - +ዟ ↔ zwa; +ዝ ← z; +ዠ ↔ ʒə; +ዡ ↔ ʒu; +ዢ ↔ ʒi; +ዣ ↔ ʒa; +ዤ ↔ ʒe; +ዥ ↔ ʒɨ; +ዦ ↔ ʒo; +ዧ ↔ ʒwa; +ዢ ← ʒ; +# Unclear which Ethiopic language uses ⶰ. It only appears in the +# “Language Neutral” list of Appendix L in [1], which transcribes it as ʒ. +# For Amharic, we pronounce ⶰ therefore like ዠ. ⶰ → ʒə; ⶱ → ʒu; ⶲ → ʒi; @@ -304,48 +358,52 @@ ⶴ → ʒe; ⶵ → ʒɨ; ⶶ → ʒo; - -የ → jə; -ዩ → ju; -ዪ → ji; -ያ → ja; -ዬ → je; -ይ → jɨ; -ዮ → jo; +የ ↔ jə; +ዩ ↔ ju; +ዪ ↔ ji; +ያ ↔ ja; +ዬ ↔ je; +ይ ↔ jɨ; +ዮ ↔ jo; ዯ → jo; # Dizi, Me’en, Mursi, Suri /zɔ/ ([1], Appendix E); not used in Amharic. - -ደ → də; -ዱ → du; -ዲ → di; -ዳ → da; -ዴ → de; -ድ → dɨ; -ዶ → do; +ይ ← j; +ጀ ↔ d\u0361ʒə; +ጁ ↔ d\u0361ʒu; +ጂ ↔ d\u0361ʒi; +ጃ ↔ d\u0361ʒa; +ጄ ↔ d\u0361ʒe; +ጅ ↔ d\u0361ʒɨ; +ጆ ↔ d\u0361ʒo; +ጇ ↔ d\u0361ʒwa; +ጅ ← d\u0361ʒ; +ደ ↔ də; +ዱ ↔ du; +ዲ ↔ di; +ዳ ↔ da; +ዴ ↔ de; +ድ ↔ dɨ; +ዶ ↔ do; ⶌ → do; # Dizi, Me’en, Mursi, Suri /zɔ/ ([1], Appendix E); not used in Amharic. 
-ዷ → dwa; - -ጀ → d\u0361ʒə; -ጁ → d\u0361ʒu; -ጂ → d\u0361ʒi; -ጃ → d\u0361ʒa; -ጄ → d\u0361ʒe; -ጅ → d\u0361ʒɨ; -ጆ → d\u0361ʒo; -ጇ → d\u0361ʒwa; - -ገ → ɡə; -ጉ → ɡu; -ጊ → ɡi; -ጋ → ɡa; -ጌ → ɡe; -ግ → ɡɨ; -ጎ → ɡo; -ጐ → ɡwə; -ጕ → ɡwu; -ጒ → ɡwi; -ጓ → ɡwa; -ጔ → ɡwe; - +ዷ ↔ dwa; +ድ ← d; +ገ ↔ ɡə; +ጉ ↔ ɡu; +ጊ ↔ ɡi; +ጋ ↔ ɡa; +ጌ ↔ ɡe; +ግ ↔ ɡɨ; +ጎ ↔ ɡo; +ጐ ↔ ɡwə; +ጕ ↔ ɡwu; +ጒ ↔ ɡwi; +ጓ ↔ ɡwa; +ጔ ↔ ɡwe; +ግ ← ɡ; +# In Awngi, Blin, Qimant, and Xamtanga, ጘ is spoken as voiced velar nasal [ŋ]. +# Source: [1], Appendix C. While /ŋ/ is not an Amharic phoneme, Amharic speakers +# still can pronounce it according to our source. However, when transliterating +# foreign words with [ŋ], Amharic uses the sequence ንግ /nɡ/. For example, +# the Amharic transliteration of Washington /waʃiŋtən/ is ዋሺንግተን. ጘ → ŋə; ጙ → ŋu; ጚ → ŋi; @@ -358,7 +416,17 @@ ⶔ → ŋwi; ጟ → ŋwa; ⶕ → ŋwe; - +# Since there is no uvular nasal [ɴ] in Amharic, we use the velar nasal [ŋ]. +ጘ ← ɴə; +ጙ ← ɴu; +ጚ ← ɴi; +ጛ ← ɴa; +ጜ ← ɴe; +ጝ ← ɴɨ; +ጞ ← ɴo; +ጝ ← ɴ; +# In Sebatbeit, ⷘ is spoken as palatalized voiced velar stop /ɡj/ ([1], Appendix H). +# Amharic speakers pronounce it as voiced velar stop /ɡ/ without palatalization. ⷘ → ɡə; ⷙ → ɡu; ⷚ → ɡi; @@ -366,26 +434,29 @@ ⷜ → ɡe; ⷝ → ɡɨ; ⷞ → ɡo; - -ጠ → tʼə; -ጡ → tʼu; -ጢ → tʼi; -ጣ → tʼa; -ጤ → tʼe; -ጥ → tʼɨ; -ጦ → tʼo; -ጧ → tʼwa; - -ጨ → t\u0361ʃʼə; -ጩ → t\u0361ʃʼu; -ጪ → t\u0361ʃʼi; -ጫ → t\u0361ʃʼa; -ጬ → t\u0361ʃʼe; -ጭ → t\u0361ʃʼɨ; -ጮ → t\u0361ʃʼo; +ጠ ↔ tʼə; +ጡ ↔ tʼu; +ጢ ↔ tʼi; +ጣ ↔ tʼa; +ጤ ↔ tʼe; +ጥ ↔ tʼɨ; +ጦ ↔ tʼo; +ጧ ↔ tʼwa; +ጢ ← tʼ; +ጨ ↔ t\u0361ʃʼə; +ጩ ↔ t\u0361ʃʼu; +ጪ ↔ t\u0361ʃʼi; +ጫ ↔ t\u0361ʃʼa; +ጬ ↔ t\u0361ʃʼe; +ጭ ↔ t\u0361ʃʼɨ; +ጮ ↔ t\u0361ʃʼo; ⶐ → t\u0361ʃʼo; # Dizi, Me’en, Mursi, Suri /t\u0361ʃʼɔ/ ([1], Appendix E); not used in Amharic. -ጯ → t\u0361ʃʼwa; - +ጯ ↔ t\u0361ʃʼwa; +ጪ ← t\u0361ʃʼ; +# According to Appendix B of [1], the following are used in the Bench language +# (aka Benchnon, Gimira). 
In Bench, ⶻ is pronounced as /ʈ\u0361ʂʼ/ Retroflex +# ejective affricate; with a phonemic distinction to the non-retroflex version. +# Amharic does not have retroflex phonemes, so we go with /t\u0361ʃʼ/. ⶸ → t\u0361ʃʼə; ⶹ → t\u0361ʃʼu; ⶺ → t\u0361ʃʼi; @@ -393,26 +464,46 @@ ⶼ → t\u0361ʃʼe; ⶽ → t\u0361ʃʼɨ; ⶾ → t\u0361ʃʼo; - -ጰ → pʼə; -ጱ → pʼu; -ጲ → pʼi; -ጳ → pʼa; -ጴ → pʼe; -ጵ → pʼɨ; -ጶ → pʼo; +ቸ ↔ t\u0361ʃə; +ቹ ↔ t\u0361ʃu; +ቺ ↔ t\u0361ʃi; +ቻ ↔ t\u0361ʃa; +ቼ ↔ t\u0361ʃe; +ች ↔ t\u0361ʃɨ; +ቾ ↔ t\u0361ʃo; +ቿ ↔ t\u0361ʃwa; +ች ← t\u0361ʃ; +ተ ↔ tə; +ቱ ↔ tu; +ቲ ↔ ti; +ታ ↔ ta; +ቴ ↔ te; +ት ↔ tɨ; +ቶ ↔ to; +ⶆ → to; # Dizi, Me’en, Mursi, Suri /tɔ/ ([1], Appendix E); not used in Amharic. +ቷ ↔ twa; +ት ← t; +ጰ ↔ pʼə; +ጱ ↔ pʼu; +ጲ ↔ pʼi; +ጳ ↔ pʼa; +ጴ ↔ pʼe; +ጵ ↔ pʼɨ; +ጶ ↔ pʼo; ⶑ → pʼo; # Dizi, Me’en, Mursi, Suri /pʼɔ/ ([1], Appendix E); not used in Amharic. -ጷ → pʼwa; - -ጸ → sʼə; -ጹ → sʼu; -ጺ → sʼi; -ጻ → sʼa; -ጼ → sʼe; -ጽ → sʼɨ; -ጾ → sʼo; -ጿ → sʼwa; - +ጷ ↔ pʼwa; +ጵ ← pʼ; +ጸ ↔ sʼə; +ጹ ↔ sʼu; +ጺ ↔ sʼi; +ጻ ↔ sʼa; +ጼ ↔ sʼe; +ጽ ↔ sʼɨ; +ጾ ↔ sʼo; +ጿ ↔ sʼwa; +ጽ ← sʼ; +# In Amharic, ፀ is pronounced like ጸ. +# Source: [1], section on “Phonological Redundancy” for Amharic, page 5. ፀ → sʼə; ፁ → sʼu; ፂ → sʼi; @@ -421,37 +512,54 @@ ፅ → sʼɨ; ፆ → sʼo; ፇ → sʼo; # Dizi, Me’en, Mursi, Suri /sʼɔ/ ([1], Appendix E); not used in Amharic. - -ፈ → fə; -ፉ → fu; -ፊ → fi; -ፋ → fa; -ፌ → fe; -ፍ → fɨ; -ፎ → fo; +# Amharic speakers pronounce ሰ like ሠ. Source: [1], Appendix B. +ሰ ↔ sə; +ሱ ↔ su; +ሲ ↔ si; +ሳ ↔ sa; +ሴ ↔ se; +ስ ↔ sɨ; +ሶ ↔ so; +ⶃ → so; # Dizi, Me’en, Mursi, Suri /sɔ/ ([1], Appendix E); not used in Amharic. +ሷ ↔ swa; +ስ ← s; +ፈ ↔ fə; +ፉ ↔ fu; +ፊ ↔ fi; +ፋ ↔ fa; +ፌ ↔ fe; +ፍ ↔ fɨ; +ፎ ↔ fo; ᎈ → fwə; # Sebatbeit /fwə/ ([1], Appendix H); not used in Amharic. ᎉ → fwu; # Sebatbeit /fwu/ ([1], Appendix H); not used in Amharic. ᎋ → fwi; # Sebatbeit /fwi/ ([1], Appendix H); not used in Amharic. -ፏ → fwa; +ፏ ↔ fwa; ᎊ → fwe; # Sebatbeit /fwe/ ([1], Appendix H); not used in Amharic.
ፚ → fja; # Unclear which language; Appendix L of [1] transcribes ፚ as /fja/. - -ፐ → pə; -ፑ → pu; -ፒ → pi; -ፓ → pa; -ፔ → pe; -ፕ → pɨ; -ፖ → po; +ፍ ← f; +ፐ ↔ pə; +ፑ ↔ pu; +ፒ ↔ pi; +ፓ ↔ pa; +ፔ ↔ pe; +ፕ ↔ pɨ; +ፖ ↔ po; ⶒ → po; # Dizi, Me’en, Mursi, Suri /pɔ/ ([1], Appendix E); not used in Amharic. ᎌ → pwə; # Sebatbeit /pwə/ ([1], Appendix H); not used in Amharic. ᎍ → pwu; # Sebatbeit /pwu/ ([1], Appendix H); not used in Amharic. ᎏ → pwi; # Sebatbeit /pwi/ ([1], Appendix H); not used in Amharic. -ፗ → pwa; +ፗ ↔ pwa; ᎎ → pwe; # Sebatbeit /pwe/ ([1], Appendix H); not used in Amharic. - -ኧ → ə; - +ፕ ← p; +ኧ ↔ ə; +ኡ ← u; # ኡላዓን ባዓታር ← Ulaan Baatar /ulaʕan baʕatar/ +አ ← a; # አምስተርዳም ← Amsterdam /amstərdam/ +ኤ ← e; +እ ← ɨ; +ኦ ← o; # ፖርት ኦፍ ስፔን ← Port of Spain /port of speːn/ +ኢ ← i; # ኢስላማባድ ← Islamabad /islamabad/ +# Applications will typically split words before calling our rules. +# To be resilient, we replace punctuation by whitespace in IPA. ፠ → ' '; # U+1360 ETHIOPIC SECTION MARK ፡ → ' '; # U+1361 ETHIOPIC WORDSPACE ። → ' '; # U+1362 ETHIOPIC FULL STOP @@ -461,7 +569,10 @@ ፦ → ' '; # U+1366 ETHIOPIC PREFACE COLON ፧ → ' '; # U+1367 ETHIOPIC QUESTION MARK ፨ → ' '; # U+1368 ETHIOPIC PARAGRAPH SEPARATOR - +# Likewise, Ethiopic numerals cannot be pronounced by these rules, +# so we replace them by whitespace in the output IPA notation. +# Applications will typically pre-process text before calling +# the am → am_FONIPA transform. ፩ → ' '; # U+1369 ETHIOPIC DIGIT ONE ፪ → ' '; # U+136A ETHIOPIC DIGIT TWO ፫ → ' '; # U+136B ETHIOPIC DIGIT THREE @@ -482,12 +593,111 @@ ፺ → ' '; # U+137A ETHIOPIC NUMBER NINETY ፻ → ' '; # U+137B ETHIOPIC NUMBER HUNDRED ፼ → ' '; # U+137C ETHIOPIC NUMBER TEN THOUSAND +# Transform IPA length markers to one of these: +# U+135D ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK +# U+135E ETHIOPIC COMBINING VOWEL LENGTH MARK +# U+135F ETHIOPIC COMBINING GEMINATION MARK +::null(); +← ː ; # Strip off any remaining IPA length markers.
+::(null); +($IPA_CONSONANT) ([jw]? $IPA_VOWEL) \u135D → $1 ː $2 ː; +($IPA_CONSONANT) ([jw]? $IPA_VOWEL) \u135E → $1 $2 ː; +($IPA_CONSONANT) ([jw]? $IPA_VOWEL?) \u135F → $1 ː $2; +[\u135D \u135E \u135F] → ; # Strip off any remaining length markers. +$1 wa \u135D ← ($LABIALIZABLE_BEFORE_A) ː waː; # ቷ\u135D ← [tːʷaː] +$1 wa \u135E ← ($LABIALIZABLE_BEFORE_A) waː; # ቷ\u135E ← [tʷaː] +$1 wa \u135F ← ($LABIALIZABLE_BEFORE_A) ː wa; # አቷ\u135F ← [tːʷa] +$1 \u135F $2 \u135E ← ([b $LABIALIZABLE_BEFORE_A]) ː ([jw] $IPA_VOWEL) ː; +$1 \u135F $2 ← {([b $LABIALIZABLE_BEFORE_A]) ː ([jw] $IPA_VOWEL?)}; +$1 \u135E ← ($IPA_VOWEL ː); +$1 \u135D ← (jː $IPA_VOWEL ː); +$1 \u135E ← ([jw] $IPA_VOWEL ː); +$1 \u135F ← (jː $IPA_VOWEL?); +$1 \u135D ← ($IPA_CONSONANT ː [w]? $IPA_VOWEL ː); +$1 \u135E ← ($IPA_CONSONANT [w]? $IPA_VOWEL ː); +$1 \u135F ← ($IPA_CONSONANT ː [w]? $IPA_VOWEL?); +# Insert syllable markers in a separate pass. +::null; +{($IPA_VOWEL ː?)} [[:L:]] → $1 \.; +::(null); +← [ˈˌ\. \u0303\u032F]; +aj ← ai; # Nairobi /nairobi/ ናይሮቢ, Cairo /kairo/ ካይሮ +aw ← au; # Bissau /bisːau/ ቢሳው +eji ← ei; # Beijing /beid\u0361ʒiŋ/ ቤዪጂንግ +ewo ← eo; # Montevideo /montevideo/ ሞንቴቪዴዎ +ija ← ia; # Monrovia /monrovia/ ሞንሮቪያ +ijə ← iə; # Reunion /rijunijən/ ሪዩኒየን +iw ← iu; # Vilnius /vilnius/ ቪልኒውስ, New Delhi /niu deːli/ ኒው ዴሊ +jo ← io; # Tokyo /tokio/ ቶክዮ +nɡ ← ŋɡ; # Kongo /koŋɡo/ ኮንጎ, Hungary /həŋɡari/ ሀንጋሪ +nɡ ← ŋ; # Bangkok /baŋkok/ ባንግኮክ, Beijing /beid\u0361ʒiŋ/ ቤዪጂንግ +uwa ← ua; # Kuala Lumpur /kuala lumpur/ ኩዋላ ሉምፑር, Ruanda /ruanda/ ሩዋንዳ +bwe ← bue; # Buenos Aires /buenos aires/ ብዌኖስ አይሬስ +sʼ ← t\u0361s; # Podgorica /podɡorit\u0361sa/ ፖድጎሪጻ, Vaduz /fadut\u0361s/ ፋዱጽ +uwi ← ui; # Port Luis /port luis/ ፖርት ሉዊስ +uwe ← ue; # Lithuania /lituenia/ ሊቱዌኒያ, Venezuela /venɨzuela/ ቬንዙዌላ +::(null); +ʔə ← \. ə; +ʔu ← \. u; +ʔi ← \. i; +ʔa ← \. a; +ʔe ← \. e; +ʔɨ ← \. ɨ; +ʔo ← \. o; +$1 w ← {($IPA_VOWEL ː?) 
\u032F} $IPA_VOWEL; # /ewowa/ ← /e\u032Fo\u032Fa/ +::(null); +n ← [n {n\u033C} {n\u033C\u030A} {m\u033A} {n\u030A} {n\u0325} ⁿ ᵑ]; +m ← [ɱ {m\u0325} {m\u032A} ᵐ]; +ɲ ← [{ɳ\u030A} {ɳ\u0325} ɳ {ɲ\u030A} {ɲ\u0325} ɲ]; +ŋ ← [{ŋ\u030A} {ŋ\u0325} ŋ]; +ɴ ← [{ɴ\u030A} {ɴ\u0325} ɴ]; +p ← [{t\u033C} {p\u033A}]; +pʼ ← [ʘ ɋ]; +b ← [{d\u033C} {b\u033A} {ɾ\u033C} ɓ]; +t ← [{t\u032A} ʈ]; +tʼ ← [ǁ ʖ]; +d ← [ɖ ɗ ᶑ]; +k ← q; +kʼ ← [ǃ ʗ]; +ɡ ← [g ɢ ɣ ɠ ʛ]; +nɡ ← ᵑɡ; +ʔ ← ʡ; +s ← [θ {θ\u0331} {θ\u031E} {θ\u033C} {ɸ\u033A}]; +z ← [ð {ð\u0320} {ð\u033C} {β\u033A}]; +sʼ ← [{t\u0361s} {t\u035Cs} ʦ]; +t\u0361ʃ ← [{t\u035Cʃ} ʧ {t\u0361ɕ} {t\u035Cɕ} ʨ {ʈ\u0361ʂ} c]; +t\u0361ʃʼ ← [ǀ ʇ ǂ ʄ]; +d\u0361ʒ ← [ʤ ʣ {d\u0361z} {d\u035Cz} {d\u0361ɕ} ʥ {d\u0361ʑ} {d\u035Cʑ} {ɖ\u0361ʐ} {d\u0361ʐ} ɟ]; +pf ← [{p\u032A} {p\u0346} ȹ {p\u0361f} {p\u032Af} {p\u032A\u035Cf}]; +bv ← [{b\u032A} {b\u0346} ȸ {b\u0361v} {b\u032A\u0361v}]; +ʃ ← [ʂ ɕ]; +ʒ ← [ʐ ʑ]; +r ← [ɾ ɽ ʁ]; +rːʒ ← r\u031Dː; +rʒ ← r\u031D; +v ← β; +x ← [ç x χ]; +ʕ ← ʕ\u031D; +h ← ɦ; +j ← [ʝ ʲ]; +lj ← ʎ [iɨ]? [jʝʲ]?; +t\u0361ʃl ← [{t\u0361ɬ} {tɬ}]; +ʃl ← ɬ; +w ← {u\u032F} $IPA_VOWEL; +w ← ʷ; +ʼː ← ːʼ; # /pʼː/ ← /pːʼ/; /sʼː/ ← /sːʼ/; etc. 
+::(null); +i ← y; +ɨ ← [ɪ ʉ]; +u ← [ʊ ɯ]; +ə ← [ɛ æ ɘ]; +o ← [ɔ ø]; +a ← ɑ; +ʼ ← ʰ; +← [ʱ]; +$1ːʲ ← ([pbtd])ʲː; # [bːʲeː] ← [bʲːeː] +$1ːʷ ← ([pbtd])ʷː; # [bːʷeː] ← [bʷːeː] +::(NFC); +← [ \u0303 \u0330 \u030B \u0301 \u0304 \u0300 \u030F \u030C \u0302 ˥ ˦ ˧ ˨ ˩ ꜜ ꜛ ↗ ↘ ]; +::(NFD); -::NULL; -{i} [[:L:]] → i \.; -{ɨ} [[:L:]] → ɨ \.; -{u} [[:L:]] → u \.; -{e} [[:L:]] → e \.; -{o} [[:L:]] → o \.; -{ə} [[:L:]] → ə \.; -{a} [[:L:]] → a \.; diff --git a/icu4c/source/data/translit/Amharic_Latin_BGN.txt b/icu4c/source/data/translit/am_am_Latn_BGN.txt similarity index 70% rename from icu4c/source/data/translit/Amharic_Latin_BGN.txt rename to icu4c/source/data/translit/am_am_Latn_BGN.txt index 44d68056b10..c845a955d6b 100644 --- a/icu4c/source/data/translit/Amharic_Latin_BGN.txt +++ b/icu4c/source/data/translit/am_am_Latn_BGN.txt @@ -1,18 +1,39 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Amharic_Latin_BGN.txt +# File: am_am_Latn_BGN.txt # Generated from CLDR # + +######################################################################## +# BGN/PCGN 1967 System +# +# The BGN/PCGN system for Amharic was designed for use in romanizing +# names written in Amharic characters. The Roman letters and letter +# combinations shown as equivalents to the Amharic characters reflect +# modern Amharic pronunciation. Different consonant characters in three +# groups are pronounced alike in modern Amharic and are therefore +# romanized identically for use in geographic names. 
+# +# https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/320088/Amharic_Romanization.pdf +# +# Originally prepared by Michael Everson +######################################################################## +# MINIMAL FILTER: Amharic-Latin :: [ሀ-᎙] ; :: NFD (NFC) ; $ejective = ’; $glottal = ’; $pharyngeal = ‘; +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/trac/ticket/2034 $wordBoundary = [^[:L:][:M:][:N:]] ; +######################################################################## +# Start of Syllabic Transformations +######################################################################## ሀ → hā ; # ETHIOPIC SYLLABLE HA ሁ → hu ; # ETHIOPIC SYLLABLE HU ሂ → hī ; # ETHIOPIC SYLLABLE HI @@ -28,6 +49,12 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ል → li ; # ETHIOPIC SYLLABLE LE ሎ → lo ; # ETHIOPIC SYLLABLE LO ሏ → lwa ; # ETHIOPIC SYLLABLE LWA +######################################################################## +# BGN Page 3 Rule 2: +# +# For documentation purposes the characters romanized with h in rows +# 1, 3, 13 and 18 may be romanized with h, h\u0323, h\u032E, and h\u0331, respectively. 
+######################################################################## ሐ → h\u0323ā ; # ETHIOPIC SYLLABLE HHA ሑ → h\u0323u ; # ETHIOPIC SYLLABLE HHU ሒ → h\u0323ī ; # ETHIOPIC SYLLABLE HHI @@ -35,6 +62,9 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ሔ → h\u0323ē ; # ETHIOPIC SYLLABLE HHEE ሕ → h\u0323i ; # ETHIOPIC SYLLABLE HHE ሖ → h\u0323o ; # ETHIOPIC SYLLABLE HHO +######################################################################## +# End of Rule 2 +######################################################################## መ → me ; # ETHIOPIC SYLLABLE MA ሙ → mu ; # ETHIOPIC SYLLABLE MU ሚ → mī ; # ETHIOPIC SYLLABLE MI @@ -43,6 +73,14 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ም → mi ; # ETHIOPIC SYLLABLE ME ሞ → mo ; # ETHIOPIC SYLLABLE MO ሟ → mwa ; # ETHIOPIC SYLLABLE MWA +######################################################################## +# BGN Page 3 Rule 2: +# +# The characters romanized with s in rows 5 and 7 may, instead, be +# romanized with š and s, respectively; and the characters romanized +# with ts’ in rows 30 and 31 may, instead, be romanized with ts’ and +# t\u035Fs’ respectively. 
+######################################################################## ሠ → še ; # ETHIOPIC SYLLABLE SZA ሡ → šu ; # ETHIOPIC SYLLABLE SZU ሢ → šī ; # ETHIOPIC SYLLABLE SZI @@ -50,6 +88,9 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ሤ → šē ; # ETHIOPIC SYLLABLE SZEE ሥ → ši ; # ETHIOPIC SYLLABLE SZE ሦ → šo ; # ETHIOPIC SYLLABLE SZO +######################################################################## +# End of Rule 2 +######################################################################## ረ → re ; # ETHIOPIC SYLLABLE RA ሩ → ru ; # ETHIOPIC SYLLABLE RU ሪ → rī ; # ETHIOPIC SYLLABLE RI @@ -81,11 +122,20 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ቄ → k $ejective ē ; # ETHIOPIC SYLLABLE QEE ቅ → k $ejective i ; # ETHIOPIC SYLLABLE QE ቆ → k $ejective o ; # ETHIOPIC SYLLABLE QO +# +# No rule yet for ቇ U+1247 ETHIOPIC SYLLABLE QOA ቈ → k $ejective o ; # ETHIOPIC SYLLABLE QWA ቍ → k $ejective wi ; # ETHIOPIC SYLLABLE QWE ቋ → k $ejective wa ; # ETHIOPIC SYLLABLE QWAA ቌ → k $ejective wē ; # ETHIOPIC SYLLABLE QWEE ቊ → k $ejective wī ; # ETHIOPIC SYLLABLE QWI +######################################################################## +# BGN Page 3 Rule 3: +# +# The character ቐ which occurs only in the writing system of the Tigre +# and Tigrinya languages, should be romanized with k’ in geographic +# names but may be romanized with k\u0331 in documentation.
+######################################################################## ቐ → k\u0331 $ejective e ; # ETHIOPIC SYLLABLE QHA ቑ → k\u0331 $ejective u ; # ETHIOPIC SYLLABLE QHU ቒ → k\u0331 $ejective ī ; # ETHIOPIC SYLLABLE QHI @@ -98,6 +148,9 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ቛ → k\u0331 $ejective wa ; # ETHIOPIC SYLLABLE QHWAA ቜ → k\u0331 $ejective wē ; # ETHIOPIC SYLLABLE QHWEE ቝ → k\u0331 $ejective wi ; # ETHIOPIC SYLLABLE QHWE +######################################################################## +# End of Rule 3 +######################################################################## በ → be ; # ETHIOPIC SYLLABLE BA ቡ → bu ; # ETHIOPIC SYLLABLE BU ቢ → bī ; # ETHIOPIC SYLLABLE BI @@ -122,6 +175,12 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ች → chi ; # ETHIOPIC SYLLABLE CE ቾ → cho ; # ETHIOPIC SYLLABLE CO ቿ → chwa ; # ETHIOPIC SYLLABLE CWA +######################################################################## +# BGN Page 3 Rule 2: +# +# For documentation purposes the characters romanized with h in rows +# 1, 3, 13 and 18 may be romanized with h, h\u0323, h\u032E, and h\u0331, respectively. 
+######################################################################## ኀ → h\u032Eā ; # ETHIOPIC SYLLABLE XA ኁ → h\u032Eu ; # ETHIOPIC SYLLABLE XU ኂ → h\u032Eī ; # ETHIOPIC SYLLABLE XI @@ -129,11 +188,15 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ኄ → h\u032Eē ; # ETHIOPIC SYLLABLE XEE ኅ → h\u032Ei ; # ETHIOPIC SYLLABLE XE ኆ → h\u032Eo ; # ETHIOPIC SYLLABLE XO +# No rule yet for ኇ U+1287 ETHIOPIC SYLLABLE XOA ኈ → h\u032Eo; # ETHIOPIC SYLLABLE XWA ኊ → h\u032Ewī ; # ETHIOPIC SYLLABLE XWI ኋ → h\u032Ewa ; # ETHIOPIC SYLLABLE XWAA ኌ → h\u032Ewē ; # ETHIOPIC SYLLABLE XWEE ኍ → h\u032Ewi ; # ETHIOPIC SYLLABLE XWE +######################################################################## +# End of Rule 2 +######################################################################## ነ → ne ; # ETHIOPIC SYLLABLE NA ኑ → nu ; # ETHIOPIC SYLLABLE NU ኒ → nī ; # ETHIOPIC SYLLABLE NI @@ -150,6 +213,13 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ኝ → nyi ; # ETHIOPIC SYLLABLE NYE ኞ → nyo ; # ETHIOPIC SYLLABLE NYO ኟ → nywa ; # ETHIOPIC SYLLABLE NYWA +######################################################################## +# BGN Page 3 Rule 5: +# +# The vowel characters in row 16 should be Romanized ā, u, ī, a, ē, i, +# and o initially and ’ā, ’u, ’ī, ’a, ’ē, ’i, and ’o in all other +# positions. 
+######################################################################## $wordBoundary{አ → ā ; # ETHIOPIC SYLLABLE GLOTTAL A $wordBoundary{ኡ → u ; # ETHIOPIC SYLLABLE GLOTTAL U $wordBoundary{ኢ → ī ; # ETHIOPIC SYLLABLE GLOTTAL I @@ -166,6 +236,9 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA እ → $glottal i ; # ETHIOPIC SYLLABLE GLOTTAL E ኦ → $glottal o ; # ETHIOPIC SYLLABLE GLOTTAL O ኧ → $glottal e ; # ETHIOPIC SYLLABLE GLOTTAL WA +######################################################################## +# End of Rule 5 +######################################################################## ከ → ke ; # ETHIOPIC SYLLABLE KA ኩ → ku ; # ETHIOPIC SYLLABLE KU ኪ → kī ; # ETHIOPIC SYLLABLE KI @@ -173,11 +246,18 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ኬ → kē ; # ETHIOPIC SYLLABLE KEE ክ → ki ; # ETHIOPIC SYLLABLE KE ኮ → ko ; # ETHIOPIC SYLLABLE KO +# No rule yet for ኯ U+12AF ETHIOPIC SYLLABLE KOA ኰ → ko ; # ETHIOPIC SYLLABLE KWA ኲ → kwī ; # ETHIOPIC SYLLABLE KWI ኳ → kwa ; # ETHIOPIC SYLLABLE KWAA ኴ → kwē ; # ETHIOPIC SYLLABLE KWEE ኵ → kwi ; # ETHIOPIC SYLLABLE KWE +######################################################################## +# BGN Page 3 Rule 2: +# +# For documentation purposes the characters romanized with h in rows +# 1, 3, 13 and 18 may be romanized with h, h\u0323, h\u032E, and h\u0331, respectively. 
+######################################################################## ኸ → h\u0331e ; # ETHIOPIC SYLLABLE KXA ኹ → h\u0331u ; # ETHIOPIC SYLLABLE KXU ኺ → h\u0331ī ; # ETHIOPIC SYLLABLE KXI @@ -185,6 +265,14 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ኼ → h\u0331ē ; # ETHIOPIC SYLLABLE KXEE ኽ → h\u0331i ; # ETHIOPIC SYLLABLE KXE ኾ → h\u0331o ; # ETHIOPIC SYLLABLE KXO +# No rule yet for ዀ U+12C0 ETHIOPIC SYLLABLE KXWA +# No rule yet for ዂ U+12C2 ETHIOPIC SYLLABLE KXWI +# No rule yet for ዃ U+12C3 ETHIOPIC SYLLABLE KXWAA +# No rule yet for ዄ U+12C4 ETHIOPIC SYLLABLE KXWEE +# No rule yet for ዅ U+12C5 ETHIOPIC SYLLABLE KXWE +######################################################################## +# End of Rule 2 +######################################################################## ወ → we ; # ETHIOPIC SYLLABLE WA ዉ → wu ; # ETHIOPIC SYLLABLE WU ዊ → wī ; # ETHIOPIC SYLLABLE WI @@ -192,6 +280,7 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ዌ → wē ; # ETHIOPIC SYLLABLE WEE ው → wi ; # ETHIOPIC SYLLABLE WE ዎ → wo ; # ETHIOPIC SYLLABLE WO +# No rule yet for ዏ U+12CF ETHIOPIC SYLLABLE WOA ዐ → $pharyngeal ā ; # ETHIOPIC SYLLABLE PHARYNGEAL A ዑ → $pharyngeal u ; # ETHIOPIC SYLLABLE PHARYNGEAL U ዒ → $pharyngeal ī ; # ETHIOPIC SYLLABLE PHARYNGEAL I @@ -230,6 +319,7 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ድ → di ; # ETHIOPIC SYLLABLE DE ዶ → do ; # ETHIOPIC SYLLABLE DO ዷ → dwa ; # ETHIOPIC SYLLABLE DWA +# No rule yet for ዸ U+12F8 ETHIOPIC SYLLABLE DDA ... 
ጀ → je ; # ETHIOPIC SYLLABLE JA ጁ → ju ; # ETHIOPIC SYLLABLE JU ጂ → jī ; # ETHIOPIC SYLLABLE JI @@ -245,11 +335,15 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ጌ → gē ; # ETHIOPIC SYLLABLE GEE ግ → gi ; # ETHIOPIC SYLLABLE GE ጎ → go ; # ETHIOPIC SYLLABLE GO +# No rule yet for ጏ U+130F ETHIOPIC SYLLABLE GOA ጐ → go ; # ETHIOPIC SYLLABLE GWA ጒ → gwī ; # ETHIOPIC SYLLABLE GWI ጓ → gwa ; # ETHIOPIC SYLLABLE GWAA ጔ → gwē ; # ETHIOPIC SYLLABLE GWEE ጕ → gwi ; # ETHIOPIC SYLLABLE GWE +# No rule yet for ጘ U+1318 ETHIOPIC SYLLABLE GGA +# ...ጙጚጛጜጝጞ... +# No rule yet for ጟ U+131F ETHIOPIC SYLLABLE GGWAA ጠ → t $ejective e ; # ETHIOPIC SYLLABLE THA ጡ → t $ejective u ; # ETHIOPIC SYLLABLE THU ጢ → t $ejective ī ; # ETHIOPIC SYLLABLE THI @@ -274,6 +368,14 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ጵ → p $ejective i ; # ETHIOPIC SYLLABLE PHE ጶ → p $ejective o ; # ETHIOPIC SYLLABLE PHO ጷ → p $ejective wa ; # ETHIOPIC SYLLABLE PHWA +######################################################################## +# BGN Page 3 Rule 2: +# +# The characters romanized with s in rows 5 and 7 may, instead, be +# romanized with š and s, respectively; and the characters romanized +# with ts’ in rows 30 and 31 may, instead, be romanized with ts’ and +# t\u035Fs’ respectively. 
+######################################################################## ጸ → ts $ejective e ; # ETHIOPIC SYLLABLE TSA ጹ → ts $ejective u ; # ETHIOPIC SYLLABLE TSU ጺ → ts $ejective ī ; # ETHIOPIC SYLLABLE TSI @@ -289,6 +391,10 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ፄ → t\u035Fs $ejective ē ; # ETHIOPIC SYLLABLE TZEE ፅ → t\u035Fs $ejective i ; # ETHIOPIC SYLLABLE TZE ፆ → t\u035Fs $ejective o ; # ETHIOPIC SYLLABLE TZO +# No rule yet for ፇ U+1347 ETHIOPIC SYLLABLE TZOA +######################################################################## +# End of Rule 2 +######################################################################## ፈ → fe ; # ETHIOPIC SYLLABLE FA ፉ → fu ; # ETHIOPIC SYLLABLE FU ፊ → fī ; # ETHIOPIC SYLLABLE FI @@ -308,6 +414,7 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ፘ → rya ; # ETHIOPIC SYLLABLE RYA ፙ → mya ; # ETHIOPIC SYLLABLE MYA ፚ → fya ; # ETHIOPIC SYLLABLE FYA +# No rule yet for ፚ U+135A ETHIOPIC SYLLABLE FYA ቨ → ve ; # ETHIOPIC SYLLABLE VA ቩ → vu ; # ETHIOPIC SYLLABLE VU ቪ → vī ; # ETHIOPIC SYLLABLE VI @@ -316,6 +423,11 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ቭ → vi ; # ETHIOPIC SYLLABLE VE ቮ → vo ; # ETHIOPIC SYLLABLE VO ቯ → vwa ; # ETHIOPIC SYLLABLE VWA +######################################################################## +# Start of Numeric Transformations +# +# The BGN table on page 3 does not include ፼. 
+######################################################################## ፩ → 1 ; # ETHIOPIC DIGIT ONE ፪ → 2 ; # ETHIOPIC DIGIT TWO ፫ → 3 ; # ETHIOPIC DIGIT THREE @@ -335,3 +447,4 @@ $wordBoundary{ኧ → e ; # ETHIOPIC SYLLABLE GLOTTAL WA ፹ → 80 ; # ETHIOPIC NUMBER EIGHTY ፺ → 90 ; # ETHIOPIC NUMBER NINETY ፻ → 100 ; # ETHIOPIC NUMBER HUNDRED + diff --git a/icu4c/source/data/translit/am_ar.txt b/icu4c/source/data/translit/am_ar.txt new file mode 100644 index 00000000000..3fdcb4b5f32 --- /dev/null +++ b/icu4c/source/data/translit/am_ar.txt @@ -0,0 +1,14 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: am_ar.txt +# Generated from CLDR +# + +::am-am_FONIPA; +ɨ → ə; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/am_fa.txt b/icu4c/source/data/translit/am_fa.txt new file mode 100644 index 00000000000..bc0746fbb5a --- /dev/null +++ b/icu4c/source/data/translit/am_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: am_fa.txt +# Generated from CLDR +# + +::am-am_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/Arabic_Latin_BGN.txt b/icu4c/source/data/translit/ar_ar_Latn_BGN.txt similarity index 63% rename from icu4c/source/data/translit/Arabic_Latin_BGN.txt rename to icu4c/source/data/translit/ar_ar_Latn_BGN.txt index 6248e62b46a..39e752b1847 100644 --- a/icu4c/source/data/translit/Arabic_Latin_BGN.txt +++ b/icu4c/source/data/translit/ar_ar_Latn_BGN.txt @@ -1,22 +1,59 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Arabic_Latin_BGN.txt +# File: ar_ar_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1956 System +# +# This system was adopted by the BGN in 1946 and by the PCGN +# in 1956 and has been applied in the systematic romanization +# of geographic names in Bahrain, Egypt, Iraq, Jordan, +# Kuwait, Lebanon, Libya, Oman, Qatar, Saudi Arabia, Sudan, +# Syria, Tunisia, the United Arab Emirates, and Yemen, all +# of which has been covered by published BGN engineers. 
+# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Arabic-Latin +# :: [[:arabic:][:block=ARABIC:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهوىي\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652٠١٢٣٤٥٦٧٨٩ٱ]] ; :: NFKD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $alef = ’; $ayin = ‘; $disambig = \u0331 ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# non-letters [:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR [:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR ٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR ٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR +# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate ، ↔ ',' ; # ARABIC COMMA ؛ ↔ ';' ; # ARABIC SEMICOLON ؟ ↔ '?' ; # ARABIC QUESTION MARK @@ -41,10 +78,46 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN ٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT ٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# +######################################################################## +# +# BGN Page 8 Rule 5 +# +# The character sequences ت , كه , ته , and سه may be romanized t·h, k·h, +# d·h, and s·h in order to differentiate those romanizations from the +# digraphs th, kh, dh, and sh. 
+# +######################################################################## +# ته → t·h ; # ARABIC LETTER TEH + HEH كه → k·h ; # ARABIC LETTER KAF + HEH ده → d·h ; # ARABIC LETTER DAL + HEH سه → s·h ; # ARABIC LETTER SEEN + HEH +# +# +######################################################################## +# +# End Rule 5 +# +######################################################################## +######################################################################## +# +# +# BGN Page 8 Rule 9 +# +# Doubled consonant sounds are represented in Arabic script by placing +# a shaddah ( \u0651 ) over a consonant character. In romanization the letter +# should be doubled. [The remainder of this rule deals with the definite +# article and is lexical.] +# +######################################################################## +# ب\u0651 → bb ; # ARABIC LETTER BEH + SHADDA ت\u0651 → tt ; # ARABIC LETTER TEH + SHADDA ث\u0651 → thth ; # ARABIC LETTER THEH + SHADDA @@ -72,6 +145,20 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ه\u0651 → hh ; # ARABIC LETTER HEH + SHADDA و\u0651 → ww ; # ARABIC LETTER WAW + SHADDA ى\u0651 → yy ; # ARABIC LETTER YEH + SHADDA +# +# +######################################################################## +# +# End Rule 9 +# +######################################################################## +# +######################################################################## +# +# Start of Transformations +# +######################################################################## +# $wordBoundary{ء → ; # ARABIC LETTER HAMZA ء → $alef ; # ARABIC LETTER HAMZA $wordBoundary{ا → ; # ARABIC LETTER ALEF @@ -121,3 +208,7 @@ $wordBoundary{آ → ā ; # ARABIC LETTER ALEF WITH MADDA ABOVE \u064D → iⁿ ; # ARABIC KASRATAN \u064C → uⁿ ; # ARABIC DAMMATAN ::NFC (NFD) ; +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/Azerbaijani_Latin_BGN.txt b/icu4c/source/data/translit/az_Cyrl_az_BGN.txt
similarity index 54% rename from icu4c/source/data/translit/Azerbaijani_Latin_BGN.txt rename to icu4c/source/data/translit/az_Cyrl_az_BGN.txt index 500fdc13a07..5e90ca8dc75 100644 --- a/icu4c/source/data/translit/Azerbaijani_Latin_BGN.txt +++ b/icu4c/source/data/translit/az_Cyrl_az_BGN.txt @@ -1,14 +1,46 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Azerbaijani_Latin_BGN.txt +# File: az_Cyrl_az_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1993 Agreement +# +# Azerbaijani is the official language of Azerbaijan. In 1991, the +# Azerbaijani government adopted a Roman alphabet to replace the +# existing Cyrillic alphabet. The Azerbaijani Cyrillic alphabet +# contains nine letters not present in the Russian alphabet: +# Ғғ, Әә, Јј, Ҝҝ, Өө, Үү, Һһ, Ҹҹ, and ’. Four obsolete letters +# Йй, Ээ, Юю and Яя are also given. 
+# +# The Azerbaijani Alphabet as defined by the BGN (Page 13): +# +# АБВГҒДЕӘЖЗИЫЈКҜЛМНОӨПРСТУҮФХҺЧҸШЙЭЮЯ +# абвгғдеәжзиыјкҝлмноөпрстуүфхһчҹш’йэюя +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: AzerbaijaniCyrl-Latin +# :: [АБВГҒДЕӘЖЗИЫКҜЛМНОӨПРСТУҮФХҺЧҸШЙЭЮЯабвгғдеәжзиыкҝлмноөпрстуүфхһчҹш’йэюя] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ’ ; $wordBoundary = [^[:L:][:M:][:N:]] ; $upperConsonants = [БВГҒДЖЗЈКҜЛМНПРСТФХҺЧҸШЙ] ; @@ -18,6 +50,20 @@ $upperVowels = [АЕӘИЫОӨУҮЭЮЯ] ; $lowerVowels = [аеәиыоөуүэюя] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE @@ -32,8 +78,33 @@ $lower = [$lowerConsonants $lowerVowels] ; д → d ; # CYRILLIC SMALL LETTER DE Е → E ; # CYRILLIC CAPITAL LETTER DE е → e ; # CYRILLIC SMALL LETTER DE +# +######################################################################## +# +# BGN Page 14 Note 1 +# +# The special letter Ə ə, known as schwa, should be reproduced in that +# form whenever encountered. In those instances when it cannot be +# reproduced, however, the letter Ä ä may be substituted for it. 
+# +######################################################################## +# Ә → Ə; # CYRILLIC CAPITAL LETTER SCHWA ә → ə; # CYRILLIC SMALL LETTER SCHWA +# +# +# Alternative rule when schwa is not available. To apply uncomment the +# following by removing the '#' mark at the start of the line and insert +# before the two rule lines above. +# +# Ә → Ä; # CYRILLIC CAPITAL LETTER SCHWA +# ә → ä; # CYRILLIC SMALL LETTER SCHWA +# +######################################################################## +# +# End BGN Page 14 Note 1 +# +######################################################################## Ж → J ; # CYRILLIC CAPITAL LETTER ZHE ж → j ; # CYRILLIC SMALL LETTER ZHE З → Z ; # CYRILLIC CAPITAL LETTER ZE @@ -82,6 +153,25 @@ $lower = [$lowerConsonants $lowerVowels] ; ҹ → c ; # CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE Ш → Ş ; # CYRILLIC CAPITAL LETTER SHA ш → ş ; # CYRILLIC SMALL LETTER SHA +# +######################################################################## +# +# BGN Page 13 Rule 33, maps the symbol onto itself and +# is ignored here for computational efficiency. +# +# $prime → $prime ; # RIGHT SINGLE QUOTATION MARK +# +######################################################################## +# +######################################################################## +# +# BGN Page 14 Note 2: +# +# The obsolete characters й, э, ю, and я should be romanized ẏ, ė, +# yu\u0307, and yȧ. +# +######################################################################## +# Й → Ẏ ; # CYRILLIC CAPITAL LETTER HARD SIGN й → ẏ ; # CYRILLIC SMALL LETTER HARD SIGN Э → Ė ; # CYRILLIC CAPITAL LETTER SOFT SIGN @@ -92,3 +182,11 @@ $lower = [$lowerConsonants $lowerVowels] ; Я} $lower → Yȧ ; # CYRILLIC CAPITAL LETTER YA Я → YȦ ; # CYRILLIC CAPITAL LETTER YA я → yȧ ; # CYRILLIC SMALL LETTER YA +# +# +######################################################################## +# +# End BGN Page 14 Note 2. 
+# +######################################################################## + diff --git a/icu4c/source/data/translit/az_Lower.txt b/icu4c/source/data/translit/az_Lower.txt index 2407873a4ab..dc6e5efc809 100755 --- a/icu4c/source/data/translit/az_Lower.txt +++ b/icu4c/source/data/translit/az_Lower.txt @@ -1,13 +1,22 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: az_Lower.txt # Generated from CLDR # + +# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri +# 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE İ→i; +# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. +# This matches the behavior of the canonically equivalent I-dot_above +# 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE +# When lowercasing, unless an I is before a dot_above, it turns into a dotless i. +# 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; I→ı ; ::Any-Lower(); + diff --git a/icu4c/source/data/translit/az_Title.txt b/icu4c/source/data/translit/az_Title.txt index 238a477ac9a..a9cb16f99f7 100755 --- a/icu4c/source/data/translit/az_Title.txt +++ b/icu4c/source/data/translit/az_Title.txt @@ -1,14 +1,20 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: az_Title.txt # Generated from CLDR # + +# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri +# Make any string of letters after a cased letter be lower, with rules for i [:cased:] [:case-ignorable:]* { İ → i; [:cased:] [:case-ignorable:]* { I → ı; [:cased:] [:case-ignorable:]* { (.) → &Any-Lower($1) ; +# Otherwise all lowercase go to upper (titlecase stay as is) i→İ ; ([:Lowercase:]) → &Any-Upper($1) ; +# do later I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; + diff --git a/icu4c/source/data/translit/az_Upper.txt b/icu4c/source/data/translit/az_Upper.txt index f4635ea0a5c..44139310bb9 100644 --- a/icu4c/source/data/translit/az_Upper.txt +++ b/icu4c/source/data/translit/az_Upper.txt @@ -1,11 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: az_Upper.txt # Generated from CLDR # + +# Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved. 
i→İ; ::Any-Upper(); + diff --git a/icu4c/source/data/translit/Belarusian_Latin_BGN.txt b/icu4c/source/data/translit/be_be_Latn_BGN.txt similarity index 52% rename from icu4c/source/data/translit/Belarusian_Latin_BGN.txt rename to icu4c/source/data/translit/be_be_Latn_BGN.txt index 057d771f026..1161e0074f0 100644 --- a/icu4c/source/data/translit/Belarusian_Latin_BGN.txt +++ b/icu4c/source/data/translit/be_be_Latn_BGN.txt @@ -1,14 +1,45 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Belarusian_Latin_BGN.txt +# File: be_be_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1979 System +# +# The BGN/PCGN system for Belarusian (formerly Byelorussian) was +# designed for use in romanizing names written in the Belarusian +# Cyrillic alphabet. The Belarusian alphabet contains three +# letters not present in the Russian alphabet: Іі, Ўў, ’. +# One obsolete letter Ґґ is included. 
+# +# The Belarusian Alphabet as defined by the BGN (Page 23): +# +# АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯҐ +# абвгдеёжзійклмнопрстуўфхцчшыьэюя’ґ +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Belarusian-Latin +# :: [АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЩЪЫЬЭЮЯҐабвгдеёжзійклмнопрстуўфхцчшщъыьэюя’ґ] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ʹ ; $doublePrime = ʺ ; $upperConsonants = [БВГДЖЗЙКЛМНПРСТЎФХЦЧШЬҐ] ; @@ -18,7 +49,22 @@ $upperVowels = [АЕЁІОУЫЭЮЯ] ; $lowerVowels = [аеёіоуыэюя] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE @@ -38,20 +84,56 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE ж → zh ; # CYRILLIC SMALL LETTER ZHE +# +# +######################################################################## +# +# BGN Page 23 Note 1 +# +# The character sequences зг, цг, сг, тс, and кг may be romanized z·h, +# k·h, s·h, t·s and ts·h in order to differentiate those romanizations +# from the digraphs zh, kh, sh, ts, and the letter sequence 
tsh, which +# are used to render characters ж, х, ш, ц, and the character sequence тш. +# +######################################################################## +# ЗГ → Z·H ; # CYRILLIC CAPITAL LETTER ZE Зг → Z·h ; # CYRILLIC CAPITAL LETTER ZE зг → z·h ; # CYRILLIC SMALL LETTER ZE З → Z ; # CYRILLIC CAPITAL LETTER ZE з → z ; # CYRILLIC SMALL LETTER ZE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## І → I ; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I і → i ; # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I Й → Y ; # CYRILLIC CAPITAL LETTER I й → y ; # CYRILLIC SMALL LETTER I +# +######################################################################## +# +# BGN Page 23 Rule 1 +# +# кг becomes k·h +# +######################################################################## +# КГ → K·H ; # CYRILLIC CAPITAL LETTER KA Кг → K·h ; # CYRILLIC CAPITAL LETTER KA кг → k·h ; # CYRILLIC SMALL LETTER KA К → K ; # CYRILLIC CAPITAL LETTER KA к → k ; # CYRILLIC SMALL LETTER KA +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## Л → L ; # CYRILLIC CAPITAL LETTER EL л → l ; # CYRILLIC SMALL LETTER EL М → M ; # CYRILLIC CAPITAL LETTER EM @@ -64,16 +146,48 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; п → p ; # CYRILLIC SMALL LETTER PE Р → R ; # CYRILLIC CAPITAL LETTER ER р → r ; # CYRILLIC SMALL LETTER ER +# +######################################################################## +# +# BGN Page 23 Rule 1 +# +# сг becomes s·h +# +######################################################################## +# СГ → S·H ; # CYRILLIC CAPITAL LETTER ES Сг → S·h ; # CYRILLIC CAPITAL LETTER ES сг → s·h ; # CYRILLIC SMALL LETTER ES С → S ; # CYRILLIC CAPITAL LETTER ES с → s ; # CYRILLIC SMALL LETTER ES +# +# 
+######################################################################## +# +# End Rule 1 +# +######################################################################## +# +######################################################################## +# +# BGN Page 23 Rule 1 +# +# тс becomes t·s +# +######################################################################## +# ТС → T·S ; # CYRILLIC CAPITAL LETTER TE Тс → T·s ; # CYRILLIC CAPITAL LETTER TE тс → t·s ; # CYRILLIC SMALL LETTER TE Т → T ; # CYRILLIC CAPITAL LETTER TE т → t ; # CYRILLIC SMALL LETTER TE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## У → U ; # CYRILLIC CAPITAL LETTER U у → u ; # CYRILLIC SMALL LETTER U Ў → W ; # CYRILLIC CAPITAL LETTER SHORT U @@ -83,12 +197,28 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA Х → KH ; # CYRILLIC CAPITAL LETTER HA х → kh ; # CYRILLIC SMALL LETTER HA +# +######################################################################## +# +# BGN Page 23 Rule 1 +# +# цг becomes ts·h +# +######################################################################## +# ЦГ → TS·H ; # CYRILLIC CAPITAL LETTER TSE Цг → Ts·h ; # CYRILLIC CAPITAL LETTER TSE цг → ts·h ; # CYRILLIC SMALL LETTER TSE Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE Ц → TS ; # CYRILLIC CAPITAL LETTER TSE ц → ts ; # CYRILLIC SMALL LETTER TSE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE Ч → CH ; # CYRILLIC CAPITAL LETTER CHE ч → ch ; # CYRILLIC SMALL LETTER CHE @@ -105,5 +235,22 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Я → YA ; # CYRILLIC CAPITAL LETTER YA я → ya ; # CYRILLIC SMALL LETTER YA ’ → $doublePrime ; # LEFT SINGLE QUOTATION MARK +# 
+######################################################################## +# +# BGN Page 23 Note 2 +# +# The obsolete character ґ should be romanized g. +# +######################################################################## +# Ґ → G ; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN ґ → g ; # CYRILLIC SMALL LETTER GHE WITH UPTURN +# +# +######################################################################## +# +# End Note 2 +# +######################################################################## + diff --git a/icu4c/source/data/translit/bg_bg_Latn_BGN.txt b/icu4c/source/data/translit/bg_bg_Latn_BGN.txt new file mode 100644 index 00000000000..16f62722e45 --- /dev/null +++ b/icu4c/source/data/translit/bg_bg_Latn_BGN.txt @@ -0,0 +1,246 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: bg_bg_Latn_BGN.txt +# Generated from CLDR +# + +# +######################################################################## +# BGN/PCGN 1952 System +# +# This system was adopted by the BGN in 1949 and by the PCGN in 1952. +# It reflects the much simplified Bulgarian orthography as officially +# revised in February 1945. The Bulgarian alphabet contains all of +# the characters present in the Russian alphabet with the exception +# of Ёё, Ыы, and Ээ. Two obsolete letters Ѫѫ and Ѣѣ are also given. 
+# +# The Bulgarian Alphabet as defined by the BGN (Page 15): +# +# АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯѪѢ +# абвгдежзийклмнопрстуфхцчшщъьюяѫѣ +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Bulgarian-Latin +# +:: [АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯѪѢабвгдежзийклмнопрстуфхцчшщъьюяѫѣ] ; +:: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# +$upperConsonants = [БВГДЖЗЙКЛМНПРСТФХЦЧШЩЬ] ; +$lowerConsonants = [бвгджзйклмнпрстфхцчшщь] ; +$consonants = [$upperConsonants $lowerConsonants] ; +$upperVowels = [АЕИОУЪЮЯѪѢ] ; +$lowerVowels = [аеиоуъюяѫѣ] ; +$vowels = [$upperVowels $lowerVowels] ; +$lower = [$lowerConsonants $lowerVowels] ; +$bulgarian = [ $lower $upperConsonants $upperVowels ] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# +$wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# +А → A ; # CYRILLIC CAPITAL LETTER A +а → a ; # CYRILLIC SMALL LETTER A +Б → B ; # CYRILLIC CAPITAL LETTER BE +б → b ; # CYRILLIC SMALL LETTER BE +В → V ; # CYRILLIC CAPITAL LETTER VE +в → v ; # CYRILLIC SMALL LETTER VE +Г → G ; # CYRILLIC CAPITAL LETTER GHE +г → g ; # CYRILLIC SMALL LETTER GHE +Д → D ; # CYRILLIC CAPITAL LETTER DE +д → d ; # CYRILLIC SMALL LETTER DE +Е → E ; # CYRILLIC CAPITAL LETTER IE +е → e ; # CYRILLIC SMALL LETTER IE +Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE +Ж → 
ZH ; # CYRILLIC CAPITAL LETTER ZHE +ж → zh ; # CYRILLIC SMALL LETTER ZHE +З → Z ; # CYRILLIC CAPITAL LETTER ZE +з → z ; # CYRILLIC SMALL LETTER ZE +И → I ; # CYRILLIC CAPITAL LETTER I +и → i ; # CYRILLIC SMALL LETTER I +Й → Y ; # CYRILLIC CAPITAL LETTER I +й → y ; # CYRILLIC SMALL LETTER I +К → K ; # CYRILLIC CAPITAL LETTER KA +к → k ; # CYRILLIC SMALL LETTER KA +Л → L ; # CYRILLIC CAPITAL LETTER EL +л → l ; # CYRILLIC SMALL LETTER EL +М → M ; # CYRILLIC CAPITAL LETTER EM +м → m ; # CYRILLIC SMALL LETTER EM +Н → N ; # CYRILLIC CAPITAL LETTER EN +н → n ; # CYRILLIC SMALL LETTER EN +О → O ; # CYRILLIC CAPITAL LETTER O +о → o ; # CYRILLIC SMALL LETTER O +П → P ; # CYRILLIC CAPITAL LETTER PE +п → p ; # CYRILLIC SMALL LETTER PE +Р → R ; # CYRILLIC CAPITAL LETTER ER +р → r ; # CYRILLIC SMALL LETTER ER +С → S ; # CYRILLIC CAPITAL LETTER ES +с → s ; # CYRILLIC SMALL LETTER ES +# +# +######################################################################## +# +# BGN Page 16 Note 4 +# +# тс becomes t·s +# +######################################################################## +# +ТС → T·S ; # CYRILLIC CAPITAL LETTER TE +Тс → T·s ; # CYRILLIC CAPITAL LETTER TE +тс → t·s ; # CYRILLIC SMALL LETTER TE +Т → T ; # CYRILLIC CAPITAL LETTER TE +т → t ; # CYRILLIC SMALL LETTER TE +# +# +######################################################################## +# +# End Note 4 +# +######################################################################## +У → U ; # CYRILLIC CAPITAL LETTER U +у → u ; # CYRILLIC SMALL LETTER U +Ф → F ; # CYRILLIC CAPITAL LETTER EF +ф → f ; # CYRILLIC SMALL LETTER EF +Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA +Х → KH ; # CYRILLIC CAPITAL LETTER HA +х → kh ; # CYRILLIC SMALL LETTER HA +Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE +Ц → TS ; # CYRILLIC CAPITAL LETTER TSE +ц → ts ; # CYRILLIC SMALL LETTER TSE +Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE +Ч → CH ; # CYRILLIC CAPITAL LETTER CHE +ч → ch ; # CYRILLIC SMALL LETTER CHE +# 
+######################################################################## +# +# Implied rule from BGN Russian-Latin transliteration (Page 94 Note 3.6). +# +# шт becomes sh·t +# +######################################################################## +# +ШТ → SH·T ; # CYRILLIC CAPITAL LETTER SHA +Шт → Sh·t ; # CYRILLIC CAPITAL LETTER SHA +шт → sh·t ; # CYRILLIC SMALL LETTER SHA +Ш} $lower → Sh ; # CYRILLIC CAPITAL LETTER SHA +Ш → SH ; # CYRILLIC CAPITAL LETTER SHA +ш → sh ; # CYRILLIC SMALL LETTER SHA +Щ} $lower → Sht ; # CYRILLIC CAPITAL LETTER SHCHA +Щ → SHT ; # CYRILLIC CAPITAL LETTER SHCHA +щ → sht ; # CYRILLIC SMALL LETTER SHCHA +# +# +######################################################################## +# +# End Implied rule +# +######################################################################## +Ъ → Ŭ ; # CYRILLIC CAPITAL LETTER HARD SIGN +ъ → ŭ ; # CYRILLIC SMALL LETTER HARD SIGN +# +######################################################################## +# +# BGN Page 16 Note 1 +# +# In modern Bulgarian orthography, the character ъ does not occur in +# word-final position. It should be omitted in romanization when found +# on older sources. +# +# The following rule removes all Ъъ at the end of a word. It is assumed +# that when the condition is met, the text must be from an older source. +# Comment out with a '#' at the start of a line to disable. 
+# +# +######################################################################## +# +$bulgarian { [Ъъ] } $wordBoundary > ; +# +# +######################################################################## +# +# End BGN Page 16 Note 1 +# +######################################################################## +Ь → ’ ; # CYRILLIC CAPITAL LETTER SOFT SIGN +ь → ’ ; # CYRILLIC SMALL LETTER SOFT SIGN +Ю} $lower → Yu ; # CYRILLIC CAPITAL LETTER YU +Ю → YU ; # CYRILLIC CAPITAL LETTER YU +ю → yu ; # CYRILLIC SMALL LETTER YU +Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA +Я → YA ; # CYRILLIC CAPITAL LETTER YA +я → ya ; # CYRILLIC SMALL LETTER YA +# +######################################################################## +# +# BGN Page 16 Note 2 +# +# The obsolete character Ѫ, which was replaced by Ъ in 1945, should be +# romanized Ŭ. +# +######################################################################## +# +Ѫ → Ŭ ; # CYRILLIC CAPITAL LETTER BIG YUS +ѫ → ŭ ; # CYRILLIC SMALL LETTER BIG YUS +# +# +######################################################################## +# +# End BGN Page 16 Note 2 +# +######################################################################## +# +######################################################################## +# +# BGN Page 16 Note 3 +# +# The obsolete character Ѣ, replaced in 1945 by Я or Е according to local +# pronunciation, should be romanized as e or ya, accordingly, if the +# pronunciation is known; otherwise as ye. +# +######################################################################## +# +Ѣ} $lower → Ye ; # CYRILLIC CAPITAL LETTER YAT +Ѣ → YE ; # CYRILLIC CAPITAL LETTER YAT +ѣ → ye ; # CYRILLIC SMALL LETTER YAT +# +# +# Alternative rule where appropriate for local pronunciation. To apply +# uncomment the following by removing the '#' mark at the start of the +# line and insert before the three rule lines above. 
+# +# Ѣ} $lower → e ; # CYRILLIC CAPITAL LETTER YAT +# Ѣ → E ; # CYRILLIC CAPITAL LETTER YAT +# ѣ → e ; # CYRILLIC SMALL LETTER YAT +# +######################################################################## +# +# End BGN Page 16 Note 3 +# +######################################################################## + diff --git a/icu4c/source/data/translit/ch_am.txt b/icu4c/source/data/translit/ch_am.txt new file mode 100644 index 00000000000..da5df90fde9 --- /dev/null +++ b/icu4c/source/data/translit/ch_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ch_am.txt +# Generated from CLDR +# + +::ch-ch_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/ch_ar.txt b/icu4c/source/data/translit/ch_ar.txt new file mode 100644 index 00000000000..8e9e7510649 --- /dev/null +++ b/icu4c/source/data/translit/ch_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: ch_ar.txt +# Generated from CLDR +# + +::ch-ch_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/ch_ch_FONIPA.txt b/icu4c/source/data/translit/ch_ch_FONIPA.txt index e688ebf7373..2c4c0dcd7c4 100755 --- a/icu4c/source/data/translit/ch_ch_FONIPA.txt +++ b/icu4c/source/data/translit/ch_ch_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** @@ -8,15 +8,31 @@ # Generated from CLDR # +# Transformation from Chamorro (ch) to its IPA transcription (ch_FONIPA). +# +# http://en.wikipedia.org/wiki/Chamorro_language#Orthography +# http://www.omniglot.com/writing/chamorro.htm +# http://guampedia.com/chamorro-orthography-rules/ +# http://finochamoru.blogspot.com/2009/04/leksion-chamoru-pronunsiasion.html +# +# Recorded sound samples: http://www.chamorro.com/fino/fino.html +# +# http://guampedia.com/chamorro-orthography-rules/ lists in section 3.b) +# graphemes that would be used for loanwords/proper names. Most examples +# are Spanish. Our rules thus generate the Spanish sounds [θ], [x], [β] +# and [w] even though these sounds are not used by the Chamorro language. ::Lower; ::NFC; - \' → ʔ; ’ → ʔ; - +# The IPA chart from Omniglot appears to be mixing up [æ] and [ɑ] when +# explaining how to pronounce ‹a› and ‹å›. The language course on +# finochamoru.blogspot.com copies the pronunciation chart from Omniglot, +# but then explains that ‹å› gets pronounced like in English ‹father›, +# which would be [ɑ]. Also, the sound samples on www.chamorro.com pronounce +# ‹a› as [æ] and ‹å› as [ɑ]. 
a → æ; å → ɑ; - b → b; ch → t\u0361s; {c} [eéií] → θ; # loanwords @@ -49,11 +65,11 @@ w → w; # loanwords {x} h?[aáåeéiíoóuú$] → ks; # loanwords {x} [^aáåeéiíoóuú$] → s; # loanwords x → ks ; # loanwords - +# Wikipedia [http://en.wikipedia.org/wiki/Chamorro_language#Orthography] +# writes that ‹y› gets pronounced as [d\u0361z], while Omniglot says [d\u0361ʒ]. y → d\u0361z; - \- → \.; # hyphen is a syllable boundary, eg ‹sena-ta› - +# Handle geminated consonants. ::Null; bb → bː; dd → dː; @@ -68,5 +84,5 @@ pp → pː; rr → rː; ss → sː; tt → tː; - ::NFC; + diff --git a/icu4c/source/data/translit/ch_fa.txt b/icu4c/source/data/translit/ch_fa.txt new file mode 100644 index 00000000000..f0f2d81dd4a --- /dev/null +++ b/icu4c/source/data/translit/ch_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ch_fa.txt +# Generated from CLDR +# + +::ch-ch_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/cs_FONIPA_ja.txt b/icu4c/source/data/translit/cs_FONIPA_ja.txt index 64b466de7b6..ae92021f875 100644 --- a/icu4c/source/data/translit/cs_FONIPA_ja.txt +++ b/icu4c/source/data/translit/cs_FONIPA_ja.txt @@ -1,15 +1,20 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: cs_FONIPA_ja.txt # Generated from CLDR # + +# Transforms a Phonemic IPA transcription of Czech (cs_FONIPA) to Katakana. 
$word_boundary = [-\ $] ; $vowel = [aeiouw] ; # Vowels and glides $not_vowel = [^$vowel] ; +# +# +# First pass: Collapse phonetic distinctions not preserved in Katakana. d\u0361ʒ → | ʒ ; d\u0361z → | z ; ɛ → | e; @@ -22,8 +27,15 @@ t \u0361 ʃ → | ʧ; t \u0361 → t; [i{i\u032F}ɪ]+ → i; [uw{u\u032F}]+ → u; +# +# nn → n ; +# +# ::Null; +# +# +# Main pass: Phoneme to Katakana conversion. '.' → ; a → ア; ba → バ; @@ -78,12 +90,16 @@ i → イ ; ɟo → ジョ; # not backed by data ɟe → ジェ; ɟu → ジュ; # not backed by data +# +# ja → ヤ; ji → イ; jo → ヨ; je → イェ; ju → ユ; # not backed by data j → イ; +# +# ka → カ; ke → ケ; ki → キ; @@ -100,6 +116,8 @@ lu → ル ; l → ル ; ma → マ ; me → メ ; +# +#mɲe → ミェ; mi → ミ ; mo → モ ; mu → ム ; @@ -125,12 +143,16 @@ po → ポ ; pp → ッ | p; pu → プ ; p → プ ; +# +# r\u031Da → ジャ; r\u031De → ジェ; r\u031Di → ジ; r\u031Do → ジョ; # not backed by data r\u031Du → ジュ; # not backed by data r\u031D → ルシ; +# +# ra → ラ ; re → レ ; ri → リ ; @@ -162,6 +184,8 @@ tsu → ツ ; ts → ツ ; tt → ッ | t; t → ト ; +# +# ʧa → チャ ; ʧe → チェ ; ʧi → チ ; @@ -170,6 +194,8 @@ t → ト ; ʧ } k → チ ; ʧ → チュ ; u → ウ ; +# +# va → ヴァ; ve → ヴェ; vi → ヴィ; @@ -178,12 +204,16 @@ vu → ヴ; vje → ヴィエ ; v } $word_boundary → フ; v → ヴ; +# +# xa → ハ ; xe → ヘ ; xi → ヒ ; xo → ホ ; xu → フ ; x → フ ; +# +# za → ザ; ze → ゼ; zi → ジ; @@ -198,5 +228,10 @@ z → ズ; ʒu → ジュ; # not backed by data ʒ } k → シュ; ʒ → ジュ; +# +# ː → ー; # Long vowel ' ' → ・; +# +# + diff --git a/icu4c/source/data/translit/cs_FONIPA_ko.txt b/icu4c/source/data/translit/cs_FONIPA_ko.txt index 28fe05abc38..010aac4f425 100644 --- a/icu4c/source/data/translit/cs_FONIPA_ko.txt +++ b/icu4c/source/data/translit/cs_FONIPA_ko.txt @@ -1,12 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: cs_FONIPA_ko.txt # Generated from CLDR # + +# Transliteration of Phonemic Czech (cs_FONIPA) to Korean (ko). +# +# First pass: Phonemic Czech (cs_FONIPA) to Latinized Korean (ko_Latn). $vowel = [aeɛiɪoux]; $start = [\u0020$]; $end = [\u0020$]; @@ -86,4 +90,8 @@ z → jeu ; # pozdniː → pojeudeuni ɟ → ti ; ː → ; \u0020 → ; # space +# +# +# Second pass. :: Latin-Hangul (); + diff --git a/icu4c/source/data/translit/cs_am.txt b/icu4c/source/data/translit/cs_am.txt new file mode 100644 index 00000000000..33787e34aaa --- /dev/null +++ b/icu4c/source/data/translit/cs_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: cs_am.txt +# Generated from CLDR +# + +::cs-cs_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/cs_ar.txt b/icu4c/source/data/translit/cs_ar.txt new file mode 100644 index 00000000000..6749ca3248b --- /dev/null +++ b/icu4c/source/data/translit/cs_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: cs_ar.txt +# Generated from CLDR +# + +::cs-cs_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/cs_cs_FONIPA.txt b/icu4c/source/data/translit/cs_cs_FONIPA.txt index ffbcf93bc61..2477210b829 100644 --- a/icu4c/source/data/translit/cs_cs_FONIPA.txt +++ b/icu4c/source/data/translit/cs_cs_FONIPA.txt @@ -1,14 +1,30 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: cs_cs_FONIPA.txt # Generated from CLDR # + +# Transformation from Czech to Czech in IPA transcription (cs_FONIPA). +# The transcription is not fully phonemic since we mark allophonic variations +# of /m/, /n/, /x/ and /ɦ/. +# +# http://en.wikipedia.org/wiki/Czech_alphabet +# http://en.wikipedia.org/wiki/Czech_language#Phonology +# http://en.wikipedia.org/wiki/Czech_orthography +# +# Transform input to normalized form NFC, and to lowercase. ::NFC; ::Lower; +# +# +# +# +# Digraphs. +# ch } [ bdďjlmnňrřvwzž ] → ɣ ; ch → x ; dě → ɟɛ ; @@ -17,6 +33,9 @@ tě → cɛ ; ně → ɲɛ ; dž → d \u0361 ʒ; # affricate indicated by ligature tie dz → d \u0361 z; # affricate indicated by ligature tie +# +# +# a → a ; á → aː ; b → b ; @@ -61,3 +80,4 @@ y → ɪ ; ý → iː ; z → z ; ž → ʒ ; + diff --git a/icu4c/source/data/translit/cs_fa.txt b/icu4c/source/data/translit/cs_fa.txt new file mode 100644 index 00000000000..bfbde2464b8 --- /dev/null +++ b/icu4c/source/data/translit/cs_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: cs_fa.txt +# Generated from CLDR +# + +::cs-cs_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/cs_ja.txt b/icu4c/source/data/translit/cs_ja.txt index 0d03a7cdad2..a7189fe5ea7 100644 --- a/icu4c/source/data/translit/cs_ja.txt +++ b/icu4c/source/data/translit/cs_ja.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: cs_ja.txt # Generated from CLDR # + ::cs-cs_FONIPA; ::cs_FONIPA-ja; + diff --git a/icu4c/source/data/translit/cs_ko.txt b/icu4c/source/data/translit/cs_ko.txt index fb50c8910d9..31e0ab8df1a 100644 --- a/icu4c/source/data/translit/cs_ko.txt +++ b/icu4c/source/data/translit/cs_ko.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: cs_ko.txt # Generated from CLDR # + ::cs-cs_FONIPA; ::cs_FONIPA-ko; + diff --git a/icu4c/source/data/translit/dsb_dsb_FONIPA.txt b/icu4c/source/data/translit/dsb_dsb_FONIPA.txt index cdae5cf6d15..1cb3762684d 100755 --- a/icu4c/source/data/translit/dsb_dsb_FONIPA.txt +++ b/icu4c/source/data/translit/dsb_dsb_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** @@ -8,9 +8,11 @@ # Generated from CLDR # +# Transforms Lower Sorbian (dsb) to its IPA transcription (dsb_FONIPA). +# http://en.wikipedia.org/wiki/Sorbian_alphabet +# Transform input to normalized form NFC, and to lowercase. ::NFC; ::Lower; - a → a ; b\u0301 → bʲ ; # old spelling bj → bʲ ; # modern spelling @@ -56,13 +58,12 @@ y → ɨ ; ž → ʒ ; ź → ʑ ; z → z ; - ::NFC; - +# Assimilation. b } [k] → p ; d } [k] → t ; ʃt\u0361ɕ → ɕt\u0361ɕ ; - +# Final de-voicing. b } [$] → p ; d \u0361 z } [$] → t \u0361 s ; d } [$] → t ; @@ -72,3 +73,4 @@ w } [$] → f ; ʑ } [$] → ɕ ; z } [$] → s ; ʒ } [$] → ʃ ; + diff --git a/icu4c/source/data/translit/dv_dv_Latn_BGN.txt b/icu4c/source/data/translit/dv_dv_Latn_BGN.txt new file mode 100644 index 00000000000..ca0404fdb59 --- /dev/null +++ b/icu4c/source/data/translit/dv_dv_Latn_BGN.txt @@ -0,0 +1,175 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: dv_dv_Latn_BGN.txt +# Generated from CLDR +# + +# +######################################################################## +# BGN/PCGN 1988 Agreement, with modifications 2009 +# +# This romanization system supersedes the one which was approved by +# the BGN and the PCGN in 1972. This official system was submitted +# to the PCGN by the Maldivian government in 1987 and approved by BGN +# and PCGN in 1988. The system presented here reflects the 1988 Agreement +# with minor modifications introduced by the government of the Maldives +# in 2009. +# +######################################################################## +# +::[[:block=thaana:]\uFDF2] ; +:: NFD (NFC) ; +$wordBoundary = [^[:L:][:M:][:N:]] ; +$vowel = [\u07A6-\u07AF] ; +$sukun = \u07B0 ; +$sign = [$sukun $vowel] ; +$rule4 = [އށ] $sukun ; # see note 4 +### Consonants +# HAA +$rule4 ހ → hh ; +ހ → h ; +# NOONU +# See note 5: "romanized n’ when appearing without any vowel or auxiliary sign" +$rule4 ނ } $sign → nn ; +$rule4 ނ → nn\' ; +ނ } $sign → n ; +ނ → n\' ; +# RAA +$rule4 ރ → rr ; +ރ → r ; +# BAA +$rule4 ބ → bb ; +ބ → b ; +# LHAVIYANI +$rule4 ޅ → hlh ; +ޅ → lh; +# KAAFU +$rule4 ކ → kk ; +ކ → k ; +# VAAVU +$rule4 ވ → vv ; +ވ → v ; +# MEEMU +$rule4 މ → mm ; +މ → m ; +# FAAFU +$rule4 ފ → ff ; +ފ → f; +# DHAALU +$rule4 ދ → hdh ; +ދ → dh; +# THAA +# See note 6: "romanized iy when appearing in combination with a supercircle" +$rule4 ތ $sukun → hiy ; +$rule4 ތ → hth ; +\u07A8 ތ $sukun → iy ; +ތ $sukun → iy ; +ތ → th ; +# LAAMU +$rule4 ލ → ll ; +ލ → l ; +# GAAFU +$rule4 ގ → gg ; +ގ → g ; +# GNAVIYANI +$rule4 ޏ → hgn ; +ޏ → gn ; +# SEENU +$rule4 ސ → ss ; +ސ → s ; +# DAVIYANI +$rule4 ޑ → dd ; +ޑ → d ; +# ZAVIYANI +$rule4 ޒ → zz ; +ޒ → z ; +# TAVIYANI +$rule4 ޓ → tt ; +ޓ → t ; +# YAA +$rule4 ޔ → yy ; +ޔ → y ; +# PAVIYANI +$rule4 ޕ → pp ; +ޕ → p ; +# JAVIYANI +$rule4 ޖ → jj ; +ޖ → j ; +# CHAVIYANI +$rule4 ޗ → hch ; +ޗ → ch ; 
+### Borrowed Consonants (See Rule 7) +# SAADHU +$rule4 ޞ → şş ; +ޞ → ş ; +# SHEENU +$rule4 ޝ → hsh ; +ޝ → sh ; +# ZAA +$rule4 ޜ → zz ; +ޜ → z; +# KHAA +$rule4 ޚ → hkh ; +ޚ → kh; +# HHAA +$rule4 ޙ → ḩḩ ; +ޙ → ḩ ; +# THAALU +$rule4 ޛ → hdh ; +ޛ → dh ; +# TTAA +$rule4 ޘ → hth ; +ޘ → th ; +# WAAVU +$rule4 ޥ → ww ; +ޥ → w ; +# QAAFU +$rule4 ޤ → qq ; +ޤ → q ; +# GHAINU +$rule4 ޣ → hgh ; +ޣ → gh ; +# AINU +$rule4 ޢ → \'\' ; +ޢ → \' ; +# ZO +$rule4 ޡ → z\u0327z\u0327 ; +ޡ → z\u0327 ; +# TO +$rule4 ޠ → ţţ ; +ޠ → ţ ; +# DAADHU +$rule4 ޟ → ḑḑ ; +ޟ → ḑ ; +# NOTE: not in Maldivian BGN system, but for completeness of Thaana block +# NAA +$rule4 ޱ → n\u0332n\u0332 ; +ޱ → n\u0332 ; +# Rule 4 in word-final position +$rule4 } $wordBoundary → h; +# SHAVIYANI (placed last to avoid masking) +$rule4 ށ → hsh; +ށ → sh; +# Otherwise, these signs are not romanized elsewhere +$rule4 → ; +\u07B0 → ; +އ → ; +# NOTE: not in Maldivian BGN system, but common in names (e.g. Abdullah) +($vowel) \uFDF2 → | $1 llāh ; +\uFDF2 → allāh; +### Vowels +\u07A6 → a; # ABAFILI +\u07A7 → aa; # AABAAFILI +\u07AC → e; # EBEFILI +\u07AD → ey; # EYBEYFILI +\u07A8 → i; # IBIFILI +\u07A9 → ee; # EEBEEFILI +\u07AE → o; # OBOFILI +\u07AF → oa; # OABOAFILI +\u07AA → u; # UBUFILI +\u07AB → oo; # OOBOOFILI + diff --git a/icu4c/source/data/translit/el_Lower.txt b/icu4c/source/data/translit/el_Lower.txt index ac96ae3fb7f..2666c9067f4 100644 --- a/icu4c/source/data/translit/el_Lower.txt +++ b/icu4c/source/data/translit/el_Lower.txt @@ -1,14 +1,21 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: el_Lower.txt # Generated from CLDR # + +# Special case for final form of sigma. 
::NFD(); +# C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable characters, +# and C is not followed by a sequence consisting of zero or more case-ignorable characters and then a cased letter. +# 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA +# With translit rules, easiest is to handle the negative condition first, mapping in that case to the regular sigma. Σ } [:case-ignorable:]* [:cased:] → σ; [:cased:] [:case-ignorable:]* { Σ → ς; ::Any-Lower; ::NFC(); + diff --git a/icu4c/source/data/translit/el_Title.txt b/icu4c/source/data/translit/el_Title.txt index 748b32e85e2..5ed25ef8dd7 100644 --- a/icu4c/source/data/translit/el_Title.txt +++ b/icu4c/source/data/translit/el_Title.txt @@ -1,15 +1,21 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: el_Title.txt # Generated from CLDR # + ::NFD(); +# Remove \0301 following Greek, with possible intervening 0308 marks. +# [[:Greek:] & [:Ll:]] [\u0308]? { \u0301 → ; +# Make any string of letters after a cased letter be lower, with rules for sigma [:cased:] [:case-ignorable:]* { Σ } [:case-ignorable:]* [:cased:] → σ; [:cased:] [:case-ignorable:]* { Σ → ς; [:cased:] [:case-ignorable:]* { (.) 
→ &Any-Lower($1) ; +# Otherwise all lowercase go to upper (titlecase stay as is) ([:Lowercase:]) → &Any-Title($1) ; ::NFC(); + diff --git a/icu4c/source/data/translit/el_Upper.txt b/icu4c/source/data/translit/el_Upper.txt index e8d40f2ec0e..7280342d999 100644 --- a/icu4c/source/data/translit/el_Upper.txt +++ b/icu4c/source/data/translit/el_Upper.txt @@ -1,14 +1,20 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: el_Upper.txt # Generated from CLDR # + +# Copyright (C) 2011-2013, Apple Inc. and others. All Rights Reserved. +# Remove \0301 following Greek, with possible intervening 0308 marks. ::NFD(); +# For uppercasing (not titlecasing!) remove all greek accents from greek letters. +# This is done in two groups, to account for canonical ordering. [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; ::NFC(); ::Any-Upper(); + diff --git a/icu4c/source/data/translit/Greek_Latin_BGN.txt b/icu4c/source/data/translit/el_el_Latn_BGN.txt similarity index 58% rename from icu4c/source/data/translit/Greek_Latin_BGN.txt rename to icu4c/source/data/translit/el_el_Latn_BGN.txt index a81e88596d6..85434ee8eed 100644 --- a/icu4c/source/data/translit/Greek_Latin_BGN.txt +++ b/icu4c/source/data/translit/el_el_Latn_BGN.txt @@ -1,14 +1,50 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Greek_Latin_BGN.txt +# File: el_el_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1962 System +# +# This system is a simplified version of the system devised by the PCGN +# in 1941 and later adopted by the BGN. In 1962 the two organizations +# agreed to joint adoption of certain changes in the original system, +# specifically the omission of special rules for the treatment of Greek +# geographic names of Albanian, Bulgarian, Italian, Macedonian, and +# Turkish origin. That revision eliminated the need to consider the +# origin of names and removed ambiguity from the romanization of Greek +# expressions of possible non-Greek origin. This system is based on +# the pronunciation of modern Greek and is not intended for use in +# the romanization of classical Greek. +# +# The Greek Alphabet as defined by the BGN (Pages 29-31): +# +# ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ +# αβγδεζηθικλμνξοπρσςτυφχψω +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Greek-Latin +# :: [ΆΈΉΊΌΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώἀἁἂἃἄἅἆἇἈἉἊἋἌἍἎἏἐἑἒἓἔἕἘἙἚἛἜἝἠἡἢἣἤἥἦἧἨἩἪἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἺἻἼἽἾἿὀὁὂὃὄὅὈὉὊὋὌὍὐὑὒὓὔὕὖὗὙὛὝὟὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰάὲέὴήὶίὸόὺύὼώᾀᾁᾂᾃᾄᾅᾆᾇᾈᾉᾊᾋᾌᾍᾎᾏᾐᾑᾒᾓᾔᾕᾖᾗᾘᾙᾚᾛᾜᾝᾞᾟᾠᾡᾢᾣᾤᾥᾦᾧᾨᾩᾪᾫᾬᾭᾮᾯᾲᾳᾴᾶᾷᾺΆᾼῂῃῄῆῇῈΈῊΉῌῖῚΊῤῥῦῪΎῲῳῴῶῷῸΌῺΏῼ῾] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $upperConsonants = [ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨ] ; $lowerConsonants = [βγδζθκλμνξπρσςτφχψ] ; $consonants = [$upperConsonants $lowerConsonants] ; @@ -16,7 +52,53 @@ $upperVowels = [ΑΕΗΙΟΥΩ] ; $lowerVowels = [αεηιουω] ; 
$vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# +######################################################################## +# +# BGN Page 32 Rule 1: +# +# The apostrophe and reversed apostrophe, one or the other of which is +# written in Greek in front of all initial uppercase vowel characters, +# above all initial lowercase vowel characters, and above the second +# character of all initial two-vowel character sequences, should not +# be romanized, e.g., Ἀθῆναι → Athínai, Ἠράκλειον → Iráklion, +# Οἰνόφυτα → Oinófita. These apostrophes must be distinguished from +# accent marks when they occur together, e.g. Ἄβατον → Ávaton, +# Ἤλια → Ília, Οἴτη → Oíti. The reversed apostrophe is sometimes found +# also with ρ and should, likewise, not be romanized: ῥέμα → réma. +# +# BGN Page 32 Rule 2a: +# +# Stress is shown in Greek by the use of the tilde or circumflex, +# the acute accent, or the grave accent; all of those marks should +# be represented in romanization by an acute accent, e.g., +# Ἀθῆναι → Athínai, Νδία → Día, Ζεμενὸν → Zemenón. +# +# BGN Page 32 Rule 4: +# +# The character ι (ióta) is sometimes found written under, or, +# in uppercase, to the right of the vowel characters α, η, and ω. +# This "subscript iota" should not be romanized, e.g., +# Μυρτῷον Πέλαγος or ΜΥΡΤῼΟΝ ΠΕΛΑΓΟΣ [but not ΜΥΡΤΩΙΟΝ ΠΕΛΑΓΟΣ] +# → Mirtóön Pélagos.
+# +######################################################################## +# [ἈἉᾼᾈᾉ] → Α ; # GREEK CAPITAL LETTER ALPHA [ἀἁᾳᾀᾁ] → α ; # GREEK SMALL LETTER ALPHA [ἊἋἌἍἎἏᾊᾋᾌᾍᾎᾏᾺΆ] → Ά ; # GREEK CAPITAL LETTER ALPHA WITH TONOS @@ -47,6 +129,29 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; [ὢὣὤὥὦὧὼώᾢᾣᾤᾥᾦᾧῲῴῶῷ] → ώ ; # GREEK SMALL LETTER OMEGA WITH TONOS Ῥ → Ρ ; # GREEK CAPITAL LETTER RHO [ῤῥ] → ρ ; # GREEK SMALL LETTER RHO +# +# +######################################################################## +# +# End of Rules 1, 2a, and 4 +# +######################################################################## +# +######################################################################## +# +# BGN Page 32 Rules 2b and 2c: +# +# If the stressed vowel is written as a sequence of two vowel characters +# in Greek, the second vowel character should carry the accent; +# similarly, in Romanization the acute accent should be placed over the +# second vowel letter, e.g., Οἰνοῦσαι → Oinoúsai, Οἴτη → Oíti, +# Θεσπιαὶ → Thespiaí. +# +# Where a syllable containing one of the combinations αυ, ευ, or ηυ +# carries the stress, this is marked in Greek on the character υ. +# In romanization it should be shown on the preceding vowel +# letter, e.g., Πειραιεύς → Piraiévs, Αὔρα → Ávra. +# Αί → Aí ; αί → aí ; Οί → Oí ; @@ -59,6 +164,24 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; εύ → έυ ; Ηύ → Ήυ ; ηύ → ήυ ; +# +# +######################################################################## +# +# End of Rules 2b and 2c +# +######################################################################## +# +######################################################################## +# +# BGN Page 32 Rule 3: +# +# The dieresis should be shown in romanization where it occurs in Greek, +# e.g., Μαρινέϊκα → Marinéïka, Ἀχαΐα → Akhaï\u0301a; and over the second vowel +# letter in romanization of the following combinations of Greek vowel +# characters: αε, e.g., Ἀετὸς → Aëtos; αη, e.g., Ἀηδὼν → Aïdhon; οη, +# e.g.
Οἰνόη → Oinóï; ωο, e.g., Ἠρῶον → Iróön. +# [ΪΫ] → Ï ; [ϊϋ] → ï ; [ΐΰ] → ï\u0301 ; @@ -78,6 +201,20 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; όη → óï ; Ώο → Óö ; ώο → óö ; +# +# +######################################################################## +# +# End of Rule 3 +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# ΑΙ → AI ; # GREEK CAPITAL LETTER ALPHA + CAPITAL IOTA Αι → Ai ; # GREEK CAPITAL LETTER ALPHA + SMALL IOTA αι → ai ; # GREEK SMALL LETTER ALPHA + SMALL IOTA @@ -99,25 +236,95 @@ $wordBoundary{γκ → g ; # GREEK SMALL LETTER GAMMA + SMALL KAPPA ΓΚ → NG ; # GREEK CAPITAL LETTER GAMMA + CAPITAL KAPPA Γκ → Ng ; # GREEK CAPITAL LETTER GAMMA + SMALL KAPPA γκ → ng ; # GREEK SMALL LETTER GAMMA + SMALL KAPPA +# +# +######################################################################## +# +# BGN Page 29 Rule 3a: +# +# The character γ should be romanized g before α, ο, ου, ω, and +# consonants other than γ, ξ, and χ. +# +######################################################################## +# Γ}[ΑΟΩ [$upperConsonants - [ΓΞΧ]]] → G ; # GREEK CAPITAL LETTER GAMMA Γ}[αοω [$lowerConsonants - [γξχ]]] → G ; # GREEK CAPITAL LETTER GAMMA Γ}ΟΥ → G ; # GREEK CAPITAL LETTER GAMMA Γ}ου → G ; # GREEK CAPITAL LETTER GAMMA γ}[αοω [$lowerConsonants - [γξχ]]] → g ; # GREEK SMALL LETTER GAMMA γ}ου → g ; # GREEK SMALL LETTER GAMMA +# +# +######################################################################## +# +# End of Rule 3a +# +######################################################################## +# +######################################################################## +# +# BGN Page 29 Rule 3b: +# +# The character γ should be romanized y before αι, ε, ει, η, ι, οι, υ, +# and υι. 
+# +######################################################################## +# Γ}[ΑΕΟΥ]Ι → Y ; # GREEK CAPITAL LETTER GAMMA Γ}[ΕΗΙΥ] → Y ; # GREEK CAPITAL LETTER GAMMA Γ}[αεου]ι → Y ; # GREEK CAPITAL LETTER GAMMA Γ}[εηιυ] → Y ; # GREEK CAPITAL LETTER GAMMA γ}[αεου]ι → y ; # GREEK SMALL LETTER GAMMA γ}[εηιυ] → y ; # GREEK SMALL LETTER GAMMA +# +# +######################################################################## +# +# End of Rule 3b +# +######################################################################## +# +######################################################################## +# +# BGN Page 29 Rule 3c: +# +# The character γ should be romanized n before ξ and χ. +# +######################################################################## +# Γ}[ΞΧ] → N ; # GREEK CAPITAL LETTER GAMMA Γ}[ξχ] → N ; # GREEK CAPITAL LETTER GAMMA γ}[ξχ] → n ; # GREEK SMALL LETTER GAMMA +# +# +######################################################################## +# +# End of Rule 3c +# +######################################################################## +# Γ → G ; # GREEK CAPITAL LETTER GAMMA γ → g ; # GREEK SMALL LETTER GAMMA +# +# +######################################################################## +# +# BGN Page 29 Rule 4a: +# +# The character δ should be romanized d when between ν and ρ. 
+# +######################################################################## +# Ν{Δ}Ρ → D ; # GREEK CAPITAL LETTER DELTA ν{δ}ρ → d ; # GREEK SMALL LETTER GAMMA +# +# +######################################################################## +# +# End of Rule 4a +# +######################################################################## +# Δ} $lower → Dh ; # GREEK CAPITAL LETTER PSI Δ → DH ; # GREEK CAPITAL LETTER DELTA δ → dh ; # GREEK SMALL LETTER DELTA @@ -191,6 +398,14 @@ $wordBoundary{ντ → d ; # GREEK SMALL LETTER NU + SMALL TAU ς → s ; # GREEK SMALL LETTER FINAL SIGMA Τ → T ; # GREEK CAPITAL LETTER TAU τ → t ; # GREEK SMALL LETTER TAU +# +# +######################################################################## +# +# End Rule 3.5 +# +######################################################################## +# Υ → I ; # GREEK CAPITAL LETTER UPSILON υ → i ; # GREEK SMALL LETTER UPSILON Ύ → Í ; # GREEK CAPITAL LETTER UPSILON WITH TONOS @@ -207,3 +422,7 @@ $wordBoundary{ντ → d ; # GREEK SMALL LETTER NU + SMALL TAU ω → o ; # GREEK SMALL LETTER OMEGA Ώ → Ó ; # GREEK CAPITAL LETTER OMEGA WITH TONOS ώ → ó ; # GREEK SMALL LETTER OMEGA WITH TONOS +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/eo_am.txt b/icu4c/source/data/translit/eo_am.txt new file mode 100644 index 00000000000..a4ec15d9c88 --- /dev/null +++ b/icu4c/source/data/translit/eo_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: eo_am.txt +# Generated from CLDR +# + +::eo-eo_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/eo_ar.txt b/icu4c/source/data/translit/eo_ar.txt new file mode 100644 index 00000000000..0396efd1a76 --- /dev/null +++ b/icu4c/source/data/translit/eo_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: eo_ar.txt +# Generated from CLDR +# + +::eo-eo_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/eo_eo_FONIPA.txt b/icu4c/source/data/translit/eo_eo_FONIPA.txt index 76d3b70c11f..97efd94bebf 100755 --- a/icu4c/source/data/translit/eo_eo_FONIPA.txt +++ b/icu4c/source/data/translit/eo_eo_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** @@ -8,9 +8,9 @@ # Generated from CLDR # +# https://en.wikipedia.org/wiki/Esperanto_phonology ::NFC; ::Lower; - [\-\'’] → ; # eg. vorto-provizo, famili’ aj → ai\u032F; aŭ → au\u032F; @@ -47,10 +47,10 @@ r → r; s → s; t → t; uj → ui\u032F; -ŭ → v; # eg. ŭa! +ŭ → w; # eg. ŭa! 
ú → u; u → u; v → v; z → z; - ::NFC; + diff --git a/icu4c/source/data/translit/eo_fa.txt b/icu4c/source/data/translit/eo_fa.txt new file mode 100644 index 00000000000..4bf3af8b3e5 --- /dev/null +++ b/icu4c/source/data/translit/eo_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: eo_fa.txt +# Generated from CLDR +# + +::eo-eo_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/es_419_am.txt b/icu4c/source/data/translit/es_419_am.txt new file mode 100644 index 00000000000..9ac0e188dd9 --- /dev/null +++ b/icu4c/source/data/translit/es_419_am.txt @@ -0,0 +1,14 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: es_419_am.txt +# Generated from CLDR +# + +::es-es_FONIPA; +::es_FONIPA-es_419_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/es_419_ar.txt b/icu4c/source/data/translit/es_419_ar.txt new file mode 100644 index 00000000000..5f762d0383e --- /dev/null +++ b/icu4c/source/data/translit/es_419_ar.txt @@ -0,0 +1,23 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: es_419_ar.txt +# Generated from CLDR +# + +$Boundary = [^[:L:][:M:][:N:]]; +$Vowel = [i e o u a]; +::es-es_FONIPA; +::es_FONIPA-es_419_FONIPA; +# In Arabic transcription of Spanish, un-stressed [e] should be treated +# like [ə] which gets stripped off. However, we currently do not +# have a good way of finding stress in Spanish words. In the long term, +# it would be _much_ better to look at stress markers, but for now +# we do this trivial heuristic to find unstressed [e] in the first +# syllable. +$Boundary [^$Vowel] {e} [^$Vowel]* $Vowel → ə; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/es_419_fa.txt b/icu4c/source/data/translit/es_419_fa.txt new file mode 100644 index 00000000000..f09252891bd --- /dev/null +++ b/icu4c/source/data/translit/es_419_fa.txt @@ -0,0 +1,23 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: es_419_fa.txt +# Generated from CLDR +# + +$Boundary = [^[:L:][:M:][:N:]]; +$Vowel = [i e o u a]; +::es-es_FONIPA; +::es_FONIPA-es_419_FONIPA; +# In Farsi transcription of Spanish, un-stressed [e] should be treated +# like [ə] which gets stripped off. However, we currently do not +# have a good way of finding stress in Spanish words. In the long term, +# it would be _much_ better to look at stress markers, but for now +# we do this trivial heuristic to find unstressed [e] in the first +# syllable.
+$Boundary [^$Vowel] {e} [^$Vowel]* $Vowel → ə; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/es_419_ja.txt b/icu4c/source/data/translit/es_419_ja.txt index 18f5eecc06b..0900463b8e5 100644 --- a/icu4c/source/data/translit/es_419_ja.txt +++ b/icu4c/source/data/translit/es_419_ja.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_419_ja.txt # Generated from CLDR # + ::es-es_FONIPA; ::es_FONIPA-es_419_FONIPA; ::es_FONIPA-ja; + diff --git a/icu4c/source/data/translit/es_419_zh.txt b/icu4c/source/data/translit/es_419_zh.txt index 8f6a670c68c..8041844a850 100644 --- a/icu4c/source/data/translit/es_419_zh.txt +++ b/icu4c/source/data/translit/es_419_zh.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_419_zh.txt # Generated from CLDR # + ::es-es_FONIPA; ::es_FONIPA-es_419_FONIPA; ::es_FONIPA-zh; + diff --git a/icu4c/source/data/translit/es_FONIPA_am.txt b/icu4c/source/data/translit/es_FONIPA_am.txt index c999a52ee7d..680b8e88868 100644 --- a/icu4c/source/data/translit/es_FONIPA_am.txt +++ b/icu4c/source/data/translit/es_FONIPA_am.txt @@ -1,25 +1,42 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others.
All Rights Reserved. # * # *************************************************************************** # File: es_FONIPA_am.txt # Generated from CLDR # + +# Phonemic transcription of Spanish into Amharic. +# First pass: Collapse phonetic distinctions not preserved in Amharic. β → b; ð → d; ɣ → g; ʎ → ʝ; +# +# ŋ → n; θ → s; ɾ → r; +# +# i\u032F → i; u\u032F → u; +# +# j → i; [^gk] { w → u; +# +# +# Main pass: Phoneme to Ethiopic conversion. ::Null; +# +# '.' → ; +# +# +# Degemination. bb → | b; dd → | d; ff → | f; @@ -38,26 +55,38 @@ tt → | t; ʧʧ → | ʧ; xx → | x; ʒʒ → | ʒ; +# +# a → አ ; +# +# ba → ባ ; be → ቤ ; bi → ቢ ; bo → ቦ ; bu → ቡ ; b → ብ ; +# +# da → ዳ ; de → ዴ ; di → ዲ ; do → ዶ ; du → ዱ ; d → ድ ; +# +# e → ኤ ; +# +# fa → ፋ ; fe → ፌ ; fi → ፊ ; fo → ፎ ; fu → ፉ ; f → ፍ ; +# +# ga → ጋ ; ge → ጌ ; gi → ጊ ; @@ -68,13 +97,19 @@ gwe → ጔ ; gwi → ጒ ; gwo → ጉዎ ; g → ግ ; +# +# i → ኢ ; +# +# ʝa → ያ ; ʝe → ዬ ; ʝi → ዪ ; ʝo → ዮ ; ʝu → ዩ ; ʝ → ይ ; +# +# ka → ካ ; ke → ኬ ; ki → ኪ ; @@ -85,84 +120,120 @@ kwe → ኴ ; kwi → ኲ ; kwo → ኩዎ ; k → ክ ; +# +# la → ላ ; le → ሌ ; li → ሊ ; lo → ሎ ; lu → ሉ ; l → ል ; +# +# ma → ማ ; me → ሜ ; mi → ሚ ; mo → ሞ ; mu → ሙ ; m → ም ; +# +# na → ና ; ne → ኔ ; ni → ኒ ; no → ኖ ; nu → ኑ ; n → ን ; +# +# ɲa → ኛ ; ɲe → ኜ ; ɲi → ኚ ; ɲo → ኞ ; ɲu → ኙ ; ɲ → ኝ ; +# +# o → ኦ ; +# +# pa → ፓ ; pe → ፔ ; pi → ፒ ; po → ፖ ; pu → ፑ ; p → ፕ ; +# +# ra → ራ ; re → ሬ ; ri → ሪ ; ro → ሮ ; ru → ሩ ; r → ር ; +# +# sa → ሳ ; se → ሴ ; si → ሲ ; so → ሶ ; su → ሱ ; s → ስ ; +# +# +# Not used in Iberian Spanish, but occurs e.g. in Galician. ʃa → ሻ ; ʃe → ሼ ; ʃi → ሺ ; ʃo → ሾ ; ʃu → ሹ ; ʃ → ሽ ; +# +# ta → ታ ; te → ቴ ; ti → ቲ ; to → ቶ ; tu → ቱ ; t → ት ; +# +# ʧa → ቻ ; ʧe → ቼ ; ʧi → ቺ ; ʧo → ቾ ; ʧu → ቹ ; ʧ → ች ; +# +# u → ኡ ; +# +# +# Not strictly used in Spanish, but needed for Amharic. va → ቫ ; ve → ቬ ; vi → ቪ ; vo → ቮ ; vu → ቩ ; v → ቭ ; +# +# xa → ኻ ; xe → ኼ ; xi → ኺ ; xo → ኾ ; xu → ኹ ; x → ኽ ; +# +# +# Not used in Iberian Spanish, but occurs in e.g. Catalan.
ʒa → ዣ ; ʒe → ዤ ; ʒi → ዢ ; ʒo → ዦ ; ʒu → ዡ ; ʒ → ዥ ; +# +# ::NFC; + diff --git a/icu4c/source/data/translit/es_FONIPA_es_419_FONIPA.txt b/icu4c/source/data/translit/es_FONIPA_es_419_FONIPA.txt index b166a02ce22..4b4b5ee3ba6 100644 --- a/icu4c/source/data/translit/es_FONIPA_es_419_FONIPA.txt +++ b/icu4c/source/data/translit/es_FONIPA_es_419_FONIPA.txt @@ -1,11 +1,15 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_FONIPA_es_419_FONIPA.txt # Generated from CLDR # + +# Conflate sounds of Castilian Spanish to produce Latin American Spanish. +# This operates on a phonemic IPA transcription of Spanish (es_FONIPA). ʎ → ʝ; [sθ]+ → s; + diff --git a/icu4c/source/data/translit/es_FONIPA_ja.txt b/icu4c/source/data/translit/es_FONIPA_ja.txt index 6e3506cec79..ed4617ff65c 100644 --- a/icu4c/source/data/translit/es_FONIPA_ja.txt +++ b/icu4c/source/data/translit/es_FONIPA_ja.txt @@ -1,15 +1,20 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_FONIPA_ja.txt # Generated from CLDR # + +# Phonemic transcription of Spanish into Katakana. $word_boundary = [-\ $] ; $vowel = [aeijouw] ; # Vowels and glides $not_vowel = [^$vowel] ; +# +# +# First pass: Collapse phonetic distinctions not preserved in Katakana. 
β → | b; ð → | d; ɣ → | g; @@ -18,8 +23,15 @@ $not_vowel = [^$vowel] ; ɾ → | r; [ij{i\u032F}]+ → i; [uw{u\u032F}]+ → u; +# +# nn → n ; +# +# ::Null; +# +# +# Main pass: Phoneme to Katakana conversion. '.' → ; a → ア; ba → バ; @@ -35,6 +47,9 @@ de → デ; di → ディ; do → ド; du → ドゥ; +# +# +# 'd' at the end of a word is usually ignored. d } $word_boundary → ー; d → ド; e → エ; @@ -143,4 +158,7 @@ xi → ヒ ; xo → ホ ; xu → フ ; x → フ ; +# +# ::NFC; + diff --git a/icu4c/source/data/translit/es_FONIPA_zh.txt b/icu4c/source/data/translit/es_FONIPA_zh.txt index cece3beef36..96fb4f234ac 100644 --- a/icu4c/source/data/translit/es_FONIPA_zh.txt +++ b/icu4c/source/data/translit/es_FONIPA_zh.txt @@ -1,15 +1,19 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_FONIPA_zh.txt # Generated from CLDR # + +# Tranforms Spanish to Mandarin Chinese. The input Spanish string must be in +# phonemic IPA transcription (es_FONIPA); the output is in Simplified Chinese. $word_boundary = [-\ $]; $vowel = [aeijouw]; # Vowels and glides $not_vowel = [^$vowel]; +# First pass: Collapse phonetic distinctions not preserved in Mandarin. ð → | d; ɣ → | g; ŋ → | n; @@ -35,13 +39,20 @@ s[θs] → s; # GB/T 17693.5-2009, 5.3.4 [^ʧ] { jo → io; # GB/T 17693.5-2009 表 1, 注 7 ::Null; j } an $not_vowel → i ; # GB/T 17693.5-2009 表 1, 注 8 +# GB/T 17693.5-2009 表 1, 注 8 also says that should be treated as if +# it was plus . This is not borne out by the observed data, which +# suggests that plus is the more appropriate choice in some +# situations. 
[g.$] { wai\u032F → wai ; wai\u032F → uai\u032F ; [g.$] { wau\u032F → wau ; wau\u032F → uau\u032F ; jau\u032F → iau\u032F ; +# Even though "ao" is not a diphthong in Spanish, Mandarin treats it as one. [^jw] { ao } [^n] → au\u032F ; [^jw] { ao } n $vowel → au\u032F ; +# Main pass: Phoneme to Hanzi conversion. +# This generally follows GB/T 17693.5-2009 表 1, unless otherwise noted. ::Null; '.' → ; ai\u032F → 艾 ; @@ -145,6 +156,11 @@ fwen } $not_vowel → 丰 ; fwe → 富埃 ; fwi → 富伊 ; fwo → 福 ; +# The choice of 弗 vs. 夫 sounds simple according to the GB/T standard, but the +# data suggest otherwise. Ideally, 弗 should occur at the beginning of a +# morpheme (e.g. in "villafranca" 比利亚弗兰卡) and 夫 everywhere else. Since +# we don't have morpheme boundaries, we'll fudge it by writing 夫 at the end of +# a word and 弗 everywhere else. f } $word_boundary → 夫 ; f → 弗 ; gai\u032F → 盖 ; @@ -410,6 +426,9 @@ tje → 铁 ; tju → 蒂乌 ; ton } $not_vowel → 通 ; to → 托 ; +# The rules for /ts/ (tz in the orthography) are nonstandard and derived +# entirely from the observed data. They apply mostly to native toponyms +# in Mexico. tsa → 察 ; tsen } $not_vowel → 岑 ; tse → 采 ; @@ -487,12 +506,26 @@ xwe → 胡埃 ; xwi → 惠 ; xwo → 霍 ; x → 赫 ; +# 尔 simplification pass. The idea is to drop most occurences of 尔 +# corresponding to (not to or ) from a word if there is another /l/ +# sound nearby. There is a vague pattern like this in the data, but the details +# remain to be determined. At the moment, this does nothing, it just puts 尔 in +# for every in a syllable coda. ::Null; $r = [R利拉]; +# +# +# R } . $r → ; +# R } .. $r → ; +# R } ... $r → ; +# R } .... $r → ; R → 尔 ; +# Dong-nan-xi-hai pass. Per GB/T 17693.5-2009 表 1, 注 4, replace confusing +# characters at the beginning and end of a word. 
::Null; $word_boundary { 东 → 栋 ; $word_boundary { 南 → 楠 ; $word_boundary { 西 → 锡 ; 海 } $word_boundary → 亥 ; ::NFC; + diff --git a/icu4c/source/data/translit/es_am.txt b/icu4c/source/data/translit/es_am.txt index 6982569fc94..abbe20bff0b 100644 --- a/icu4c/source/data/translit/es_am.txt +++ b/icu4c/source/data/translit/es_am.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_am.txt # Generated from CLDR # + ::es-es_FONIPA; ::es_FONIPA-am; + diff --git a/icu4c/source/data/translit/es_ar.txt b/icu4c/source/data/translit/es_ar.txt new file mode 100644 index 00000000000..109cbdcdebc --- /dev/null +++ b/icu4c/source/data/translit/es_ar.txt @@ -0,0 +1,22 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: es_ar.txt +# Generated from CLDR +# + +$Boundary = [^[:L:][:M:][:N:]]; +$Vowel = [i e o u a]; +::es-es_FONIPA; +# In Arabic transcription of Spanish, un-stressed [e] should be treated +# like [ə] which gets stripped off. However, we currently do not +# have a good way of finding stress in Spanish words. In the long term, +# it would be _much_ better to look at stress markers, but for now +# we do this trivial heuristic to find unstressed [e] in the first +# syllable.
+$Boundary [^$Vowel] {e} [^$Vowel]* $Vowel → ə; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/es_es_FONIPA.txt b/icu4c/source/data/translit/es_es_FONIPA.txt index a375cd76308..2fb220cc47d 100644 --- a/icu4c/source/data/translit/es_es_FONIPA.txt +++ b/icu4c/source/data/translit/es_es_FONIPA.txt @@ -1,17 +1,37 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_es_FONIPA.txt # Generated from CLDR # + +# Transformation from Spanish to Spanish in IPA transcription (es_FONIPA). +# Not fully phonemic, since we mark up allophonic variants of voiced stops, +# e.g. we break down /b/ into [b] and [β]. +# +# See e.g. "Ortografía de la lengua española" by Real Academia Española, +# available in PDF format on the web (non-stable URL). +# +# Definitions. $bow = [-\ $] ; # Beginning of word. $consonant = [bβdðfgɣʝklʎmnŋɲθprɾstʧx] ; $syll = '.' ; # Syllable boundary. +# +# +# +# ::NFC; ::Lower; +# +# +# +# +# +# Word-initial cluster simplification. +# $bow { ct → | t ; # ctónico (?) $bow { cz → | z ; # czar $bow { gn → | n ; # gnomo @@ -19,24 +39,40 @@ $bow { mn → | n ; # mnemotécnico $bow { ps → | s ; # psicología $bow { pt → | t ; # pterodáctilo $bow { x → | s ; # xilófono +# +# +# +# Vowels and glides. +# $bow { i → i ; $consonant { i } [ aáeé oóuú] → j ; [aeo] { i } [^aáeé oóuú] → i\u032F ; i } [ aáeé oóuú] → ʝ ; i → i ; +# +# [aeo] { y } [^aáeéiíoóuú] → i\u032F ; y } [ aáeéiíoóuú] → ʝ ; y → i ; +# +# [aeo] { u } [^aáeéiíoó ] → u\u032F ; u } [ aáeéiíoó ] → w ; ü } [ eéií ] → w ; u → u ; ü → u ; # Should not be needed, but just in case. +# +# [aá] → a ; [eé] → e ; í → i ; [oó] → o ; ú → u ; +# +# +# +# Consonants.
+# b → β ; cch → ʧ ; ch → ʧ ; @@ -73,10 +109,40 @@ x } h?[aáeéiíoóuú$] → ks ; x } [^aáeéiíoóuú$] → s ; x → ks ; z → θ ; +# +# +# +# Second pass: phoneme-to-phone rules. Differentiation of /β/ into [b] and [β], +# place assimilation of [n], etc. +# ::Null; +# +# [-\ ] → ; +# +# [mnɲŋ $] { β → b ; [mnɲŋlʎ$] { ð → d ; [mnɲŋ $] { ɣ → g ; +# +# n } [gɣk] → ŋ ; +# +# +# Optional: Place assimilation of n before labial consonants. +# +# n } [bβpfm] → m ; +# +# Optional: Voicing of [s]. +# +# s } [bβdð] → z ; +# s } [gɣ][^ei] → z ; +# s } [mnɲŋlʎrɾ] → z ; +# +# Optional: Lenition of [k] before [θ]. +# +# k } θ → ɣ ; +# +# ::NFC; + diff --git a/icu4c/source/data/translit/es_fa.txt b/icu4c/source/data/translit/es_fa.txt new file mode 100644 index 00000000000..b545c78103c --- /dev/null +++ b/icu4c/source/data/translit/es_fa.txt @@ -0,0 +1,22 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: es_fa.txt +# Generated from CLDR +# + +$Boundary = [^[:L:][:M:][:N:]]; +$Vowel = [i e o u a]; +::es-es_FONIPA; +# In Farsi transcription of Spanish, un-stressed [e] should be treated +# like [ə] which gets stripped off. However, we currently do have not +# have a good way of finding stress in Spanish words. In the long term, +# it would be _much_ better to look at stress markers, but for now +# we do this trivial heuristics to find unstressed [e] in the first +# syllable. 
+$Boundary [^$Vowel] {e} [^$Vowel]* $Vowel → ə; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/es_ja.txt b/icu4c/source/data/translit/es_ja.txt index 69f5985952c..02718efe621 100644 --- a/icu4c/source/data/translit/es_ja.txt +++ b/icu4c/source/data/translit/es_ja.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: es_ja.txt # Generated from CLDR # + ::es-es_FONIPA; ::es_FONIPA-ja; + diff --git a/icu4c/source/data/translit/es_zh.txt b/icu4c/source/data/translit/es_zh.txt index f89a8b690f7..975357ace7a 100644 --- a/icu4c/source/data/translit/es_zh.txt +++ b/icu4c/source/data/translit/es_zh.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# * # *************************************************************************** # File: es_zh.txt # Generated from CLDR # + ::es-es_FONIPA; ::es_FONIPA-zh; + diff --git a/icu4c/source/data/translit/Persian_Latin_BGN.txt b/icu4c/source/data/translit/fa_fa_Latn_BGN.txt similarity index 65% rename from icu4c/source/data/translit/Persian_Latin_BGN.txt rename to icu4c/source/data/translit/fa_fa_Latn_BGN.txt index 6082a862d55..0d486403aea 100644 --- a/icu4c/source/data/translit/Persian_Latin_BGN.txt +++ b/icu4c/source/data/translit/fa_fa_Latn_BGN.txt @@ -1,22 +1,56 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Persian_Latin_BGN.txt +# File: fa_fa_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1956 System +# +# This system was adopted by the BGN in 1946 and by the PCGN in 1958. +# It is used for the romanization of geographic names in Iran and +# for Persian-language names in Afghanistan. 
+# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Persian-Latin +# :: [[:arabic:][:block=ARABIC:][ءآابةتثجحخدذرزسشصضطظعغفقكلمنهویي\u064E\u064F\u0650\u0651\u0652٠١٢٣٤٥٦٧٨٩پچژگی]] ; :: NFKD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $alef = ’; $ayin = ‘; $disambig = \u0331 ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# non-letters [:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR [:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR ٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR ٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR +# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate ، ↔ ',' ; # ARABIC COMMA ؛ ↔ ';' ; # ARABIC SEMICOLON ؟ ↔ '?' ; # ARABIC QUESTION MARK @@ -41,10 +75,46 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ۷ ↔ 7 ; # EXTENDED ARABIC-INDIC DIGIT SEVEN ۸ ↔ 8 ; # EXTENDED ARABIC-INDIC DIGIT EIGHT ۹ ↔ 9 ; # EXTENDED ARABIC-INDIC DIGIT NINE +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# +######################################################################## +# +# BGN Page 89 Rule 4 +# +# The character sequences كه , زه , سه , and گه may be romanized k·h, z·h, +# s·h, and g·h in order to differentiate those romanizations from the +# digraphs kh, zh, sh, and gh. 
+# +######################################################################## +# كه → k·h ; # ARABIC LETTER KAF + HEH زه → z·h ; # ARABIC LETTER ZAIN + HEH سه → s·h ; # ARABIC LETTER SEEN + HEH گه → g·h ; # ARABIC LETTER GAF + HEH +# +# +######################################################################## +# +# End Rule 4 +# +######################################################################## +# +######################################################################## +# +# BGN Page 91 Rule 7 +# +# Doubles consonant sounds are represented in Arabic script by +# placing a shaddah ( \u0651 ) over a consonant character. In romanization +# the letter should be doubled. [The remainder of this rule deals with +# the definite article and is lexical.] +# +######################################################################## +# ب\u0651 → bb ; # ARABIC LETTER BEH + SHADDA پ\u0651 → pp ; # ARABIC LETTER PEH + SHADDA ت\u0651 → tt ; # ARABIC LETTER TEH + SHADDA @@ -75,6 +145,20 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ه\u0651 → hh ; # ARABIC LETTER HEH + SHADDA و\u0651 → ww ; # ARABIC LETTER WAW + SHADDA ی\u0651 → yy ; # ARABIC LETTER FARSI YEH + SHADDA +# +# +######################################################################## +# +# End Rule 7 +# +######################################################################## +# +######################################################################## +# +# Start of Transformations +# +######################################################################## +# $wordBoundary{ء → ; # ARABIC LETTER HAMZA ء → $alef ; # ARABIC LETTER HAMZA $wordBoundary{ا → ; # ARABIC LETTER ALEF @@ -122,3 +206,7 @@ $wordBoundary{ا → ; # ARABIC LETTER ALEF \u064F → o ; # ARABIC DAMMA \u0652 → ; # ARABIC SUKUN ::NFC (NFD) ; +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/Hebrew_Latin_BGN.txt b/icu4c/source/data/translit/he_he_Latn_BGN.txt similarity index 57% rename 
from icu4c/source/data/translit/Hebrew_Latin_BGN.txt rename to icu4c/source/data/translit/he_he_Latn_BGN.txt index a21cc17c71c..fe8105f4291 100644 --- a/icu4c/source/data/translit/Hebrew_Latin_BGN.txt +++ b/icu4c/source/data/translit/he_he_Latn_BGN.txt @@ -1,17 +1,62 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Hebrew_Latin_BGN.txt +# File: he_he_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1981 System +# +# The BGN/PCGN system for Hebrew was designed for use in romanizing +# names written in the Hebrew alphabet. The Roman letters and letter +# combinations shown as equivalents to the Hebrew characters reflect +# the eastern variety of Hebrew, i.e., the language spoken in +# the Republic of Armenia. 
+# +# The Hebrew Alphabet as defined by the BGN (Page 33-35): +# +# אבגדהוזחטיכךלמםנןסעפףצץקרששת +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Hebrew-Latin +# :: [ \u05B0\u05B1\u05B2\u05B3\u05B4\u05B5\u05B6\u05B7\u05B8\u05B9\u05BB\u05BC\u05C1\u05C2אבגדהוזחטיךכלםמןנסעףפץצקרשת׳] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $alef = ’; $ayin = ‘; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# ב\u05BC → b ; # HEBREW LETTER BET + DAGESH פ\u05BC → P ; # HEBREW LETTER PE + DAGESH ג\u05BC → g ; # HEBREW LETTER GIMEL + DAGESH @@ -26,6 +71,16 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; כ\u05BC → k ; # HEBREW LETTER KAF + DAGESH ך\u05B0 → kh ; # HEBREW LETTER FINAL KAF + SHEVA ת\u05BC → t ; # HEBREW LETTER TAV + DAGESH +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# א → $alef ; # HEBREW LETTER ALEF ב → v ; # HEBREW LETTER BET ג → g ; # HEBREW LETTER GIMEL @@ -61,3 +116,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; \u05B3 → o ; # HEBREW LETTER HATAF QAMATS \u05B9 → o ; # HEBREW POINT HOLAM 
\u05BB → u ; # HEBREW POINT QUBUTS +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/hy_AREVMDA_am.txt b/icu4c/source/data/translit/hy_AREVMDA_am.txt new file mode 100644 index 00000000000..30e20a74874 --- /dev/null +++ b/icu4c/source/data/translit/hy_AREVMDA_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: hy_AREVMDA_am.txt +# Generated from CLDR +# + +::hy_AREVMDA-hy_AREVMDA_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/hy_AREVMDA_ar.txt b/icu4c/source/data/translit/hy_AREVMDA_ar.txt new file mode 100644 index 00000000000..92e8c057d72 --- /dev/null +++ b/icu4c/source/data/translit/hy_AREVMDA_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: hy_AREVMDA_ar.txt +# Generated from CLDR +# + +::hy_AREVMDA-hy_AREVMDA_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/hy_AREVMDA_fa.txt b/icu4c/source/data/translit/hy_AREVMDA_fa.txt new file mode 100644 index 00000000000..c3eb596f08f --- /dev/null +++ b/icu4c/source/data/translit/hy_AREVMDA_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: hy_AREVMDA_fa.txt +# Generated from CLDR +# + +::hy_AREVMDA-hy_AREVMDA_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/hy_AREVMDA_hy_AREVMDA_FONIPA.txt b/icu4c/source/data/translit/hy_AREVMDA_hy_AREVMDA_FONIPA.txt new file mode 100644 index 00000000000..472824b9f60 --- /dev/null +++ b/icu4c/source/data/translit/hy_AREVMDA_hy_AREVMDA_FONIPA.txt @@ -0,0 +1,89 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: hy_AREVMDA_hy_AREVMDA_FONIPA.txt +# Generated from CLDR +# + +# https://en.wikipedia.org/wiki/Western_Armenian#Phonology +# http://www.omniglot.com/writing/armenian.htm +# https://en.wikipedia.org/wiki/Classical_Armenian_orthography +::lower(); +$wordBoundary = [^[:L:][:M:][:N:]]; +$vowel = [աեէըիոևօւ]; +'՚' → ; # կ՚ուտէ → /ɡudɛ/ +մ → m; +ն → n; +պ → b; +տ → d; +կ → ɡ; +բ → pʰ; +դ → tʰ; +գ → kʰ; +փ → pʰ; +{թիւն} $wordBoundary → tʰjun; # միութիւն → /mijutʰjun/, գիտութիւն → /kʰidutʰjun/ +թ → tʰ; +ք → kʰ; +ծ → d\u0361z; +ճ → d\u0361ʒ; +ձ → t\u0361sʰ; +ջ → t\u0361ʃʰ; +ց → t\u0361sʰ; +չ → t\u0361ʃʰ; +ֆ → f; +ս → s; +շ → ʃ; +խ → χ; +հ → h; +վ → v; +ւ → v; +զ → z; +ժ → ʒ; +ղ → ʁ; +լ → l; +$wordBoundary {յ} → h; # յետոյ → /hɛdo/, յատակ → /hɑdɑɡ/ +յ → j; +ռ → ɾ; +ր → ɾ; +$wordBoundary {իւ} → ju; # իւղ → /juʁ/ +իու → iju; # միութիւն → /mijutʰjun/ +իւ → ʏ; # հիւր → /hʏɾ/ +{իայ} $wordBoundary → ja; +իա → ijɑ; # միասին → /mijɑsin/ +ի → i; +{եայ} $wordBoundary → jɑ; # առօրեայ → /ɑɾoɾjɑ/ +եա → jɑ; # Եանիքեան → /jɑnikʰjɑn/ +եօ → jo; # եօթը → /jotʰə/ +ով → ov; # երազով → /jɛɾɑzov/ +{ոյ} $wordBoundary → o; # երեկոյ → /jɛɾɛɡo/ +{ոյ} $vowel → oj; # գոյական → /kʰojɑɡɑn/ +ոյ → uj; # քոյր → /kʰujɾ/ +{ու} $vowel → v; # 
վաղուընէ → /vɑʁvənɛ/, պահուըտիլ → /bɑhvədil/ +ու → u; # մուկ → /muɡ/ +$wordBoundary {ո} → vo; # ոսկի → /vosɡi/ +ո → o; # ցորեն → /t\u0361sʰoɾɛn/ +$vowel {ե} → jɛ; # հայելի → /hɑjɛli/ +$wordBoundary {ե} → jɛ; # երազ → /jɛɾɑz/ +ե → ɛ; +# և is Eastern Armenian, but let's be resilient and pronounce something. +$wordBoundary {և} → jɛv; +և → ɛv; +{էայ} $wordBoundary → ɛjɑ; +էա → ɛjɑ; # էակ → /ɛjɑɡ/ +էի → ɛji; # էի → /ɛji/, կուզէին → /ɡuzɛjin/ +էու → ɛju; # էութիւն → /ɛjutʰjun/ +էօ → œ; # Էօժենի → /œʒɛni/ +էյ → ej; # թէյ → /tʰej/ +է → ɛ; +ը → ə; +օ → o; +{այ} $wordBoundary → ɑ; # ծառայ → /d\u0361zɑɾɑ/, կը դողայ → /ɡə tʰoʁɑ/ +ա → ɑ; +::NULL; +jj → j; # Գայեանէ → /kʰɑjjɑnɛ/ → /kʰɑjɑnɛ/ +nɡ → ŋɡ; # Քոնկօ → /kʰonɡo/ → /kʰoŋɡo/ +nk → ŋk; # օրէնք → /oɾɛnkʰ/ → /oɾɛŋkʰ/ + diff --git a/icu4c/source/data/translit/hy_am.txt b/icu4c/source/data/translit/hy_am.txt new file mode 100644 index 00000000000..d308cf7e22c --- /dev/null +++ b/icu4c/source/data/translit/hy_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: hy_am.txt +# Generated from CLDR +# + +::hy-hy_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/hy_ar.txt b/icu4c/source/data/translit/hy_ar.txt new file mode 100644 index 00000000000..cd509fdfefb --- /dev/null +++ b/icu4c/source/data/translit/hy_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: hy_ar.txt +# Generated from CLDR +# + +::hy-hy_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/hy_fa.txt b/icu4c/source/data/translit/hy_fa.txt new file mode 100644 index 00000000000..7f121cb9a41 --- /dev/null +++ b/icu4c/source/data/translit/hy_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: hy_fa.txt +# Generated from CLDR +# + +::hy-hy_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/hy_hy_FONIPA.txt b/icu4c/source/data/translit/hy_hy_FONIPA.txt new file mode 100644 index 00000000000..77dd6784e2b --- /dev/null +++ b/icu4c/source/data/translit/hy_hy_FONIPA.txt @@ -0,0 +1,61 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: hy_hy_FONIPA.txt +# Generated from CLDR +# + +# https://en.wikipedia.org/wiki/Armenian_language#Phonology +::lower(); +$wordBoundary = [^[:L:][:M:][:N:]]; +$vowel = [աեէըիոևօւ]; +# Special cases +ով → ov; +մ → m; +ն → n; +պ → p; +տ → t; +կ → k; +բ → b; +դ → d; +գ → ɡ; +փ → pʰ; +թ → tʰ; +ք → kʰ; +ծ → t\u0361s; +ճ → t\u0361ʃ; +ձ → d\u0361z; +ջ → d\u0361ʒ; +ց → t\u0361sʰ; +չ → t\u0361ʃʰ; +ֆ → f; +ս → s; +շ → ʃ; +խ → x; +հ → h; +վ → v; +ւ → v; +զ → z; +ժ → ʒ; +ղ → ɣ; +լ → l; +յ → j; +ռ → r; +ր → ɾ; +ի → i; +ու → u; +$wordBoundary {ո} → vo; +ո → o; +$vowel {ե} → jɛ; +$wordBoundary {ե} → jɛ; +ե → ɛ; +$wordBoundary {և} → jɛv; +և → ɛv; +է → ɛ; +ը → ə; +օ → o; +ա → a; + diff --git a/icu4c/source/data/translit/Armenian_Latin_BGN.txt b/icu4c/source/data/translit/hy_hy_Latn_BGN.txt similarity index 62% rename from icu4c/source/data/translit/Armenian_Latin_BGN.txt rename to icu4c/source/data/translit/hy_hy_Latn_BGN.txt index c065d7d0d02..c98925d8242 100644 --- a/icu4c/source/data/translit/Armenian_Latin_BGN.txt +++ b/icu4c/source/data/translit/hy_hy_Latn_BGN.txt @@ -1,14 +1,45 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Armenian_Latin_BGN.txt +# File: hy_hy_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1981 System +# +# The BGN/PCGN system for Armenian was designed for use in romanizing +# names written in the Armenian alphabet. 
The Roman letters and letter +# combinations shown as equivalents to the Armenian characters reflect +# the eastern variety of Armenian, i.e., the language spoken in +# the Republic of Armenia. +# +# The Armenian Alphabet as defined by the BGN (Page 11): +# +# ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՐՑՓՔՕՖ +# աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցփքևօֆ +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Armenian-Latin +# :: [ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖաբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆև] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $upperConsonants = [ԲԳԴԶԹԺԼԽԾԿՀՁՂՃՄՅՆՇՉՊՋՌՍՎՐՑՓՔՖ] ; $lowerConsonants = [բգդզթժլխծկհձղճմյնշչպջռսվտրցփքֆ] ; $consonants = [$upperConsonants $lowerConsonants] ; @@ -17,7 +48,26 @@ $lowerVowels = [աեէըիոևօւ] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; $aspirate = ’ ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# BGN Page 12 Rule 3: +# +# The character և should be romanized yev initially, after the vowel +# characters ա, ե, է, ը, ի, ո, ւ, and օ. +# In all other instances, it should be romanized ev. 
+# +######################################################################## +# $wordBoundary{ԵՒ}$wordBoundary → YEV ; # ARMENIAN CAPITAL ECH + CAPITAL YIWN $wordBoundary{ԵՒ → YEV ; # ARMENIAN CAPITAL ECH + CAPITAL YIWN $vowels{ԵՒ → YEV ; # ARMENIAN CAPITAL ECH + CAPITAL YIWN @@ -34,6 +84,20 @@ $wordBoundary{և}$wordBoundary → yev ; # ARMENIAN SMALL LIGATURE ECH YIWN $wordBoundary{և → yev ; # ARMENIAN SMALL LIGATURE ECH YIWN $vowels{և → yev ; # ARMENIAN SMALL LIGATURE ECH YIWN և → ev ; # ARMENIAN SMALL LIGATURE ECH YIWN +# +# +######################################################################## +# +# End of Rule 3 +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# Ա → A ; # ARMENIAN CAPITAL LETTER AYB ա → a ; # ARMENIAN SMALL LETTER AYB Բ → B ; # ARMENIAN CAPITAL LETTER BEN @@ -42,6 +106,18 @@ $vowels{և → yev ; # ARMENIAN SMALL LIGATURE ECH YIWN գ → g ; # ARMENIAN SMALL LETTER GIM Դ → D ; # ARMENIAN CAPITAL LETTER DA դ → d ; # ARMENIAN SMALL LETTER DA +# +# +######################################################################## +# +# BGN Page 12 Rule 1: +# +# The character ե should be romanized ye initially, after the vowel +# characters ա, ե, է, ը, ի, ո, ւ, and օ. +# In all other instances, it should be romanized e. 
+# +######################################################################## +# $upperVowels{Ե → YE ; # ARMENIAN CAPITAL LETTER ECH $lowerVowels{Ե → Ye ; # ARMENIAN CAPITAL LETTER ECH $wordBoundary{Ե → Ye ; # ARMENIAN CAPITAL LETTER ECH @@ -49,6 +125,14 @@ $wordBoundary{Ե → Ye ; # ARMENIAN CAPITAL LETTER ECH $vowels{ե → ye ; # ARMENIAN SMALL LETTER ECH $wordBoundary{ե → ye ; # ARMENIAN SMALL LETTER ECH ե → e ; # ARMENIAN SMALL LETTER ECH +# +# +######################################################################## +# +# End of Rule 1 +# +######################################################################## +# Զ → Z ; # ARMENIAN CAPITAL LETTER ZA զ → z ; # ARMENIAN SMALL LETTER ZA Է → E ; # ARMENIAN CAPITAL LETTER EH @@ -92,9 +176,29 @@ $wordBoundary{ե → ye ; # ARMENIAN SMALL LETTER ECH Շ} $lower → Sh ; # ARMENIAN CAPITAL LETTER SHA Շ → SH ; # ARMENIAN CAPITAL LETTER SHA շ → sh ; # ARMENIAN SMALL LETTER SHA +# +# +######################################################################## +# +# Transliteration Case 34: +# Ու} $lower → U ; # ARMENIAN CAPITAL LETTER VO + SMALL YIWN ՈՒ → U ; # ARMENIAN CAPITAL LETTER VO + CAPITAL YIWN ու → u ; # ARMENIAN SMALL LETTER VO + SMALL YIWN +# +# +######################################################################## +# +######################################################################## +# +# BGN Page 12 Rule 2: +# +# The character ո should be romanized vo initially except in the +# word ով, which should be romanized ov. In all other instances, it +# should be romanized o. 
+# +######################################################################## +# ՈՎ → OV ; Ով → Ov ; ով → ov ; @@ -103,6 +207,14 @@ $wordBoundary{Ո → VO ; # ARMENIAN CAPITAL LETTER VO Ո → O ; # ARMENIAN CAPITAL LETTER ECH $wordBoundary{ո → vo ; # ARMENIAN SMALL LETTER VO ո → o ; # ARMENIAN SMALL LETTER VO +# +# +######################################################################## +# +# End of Rule 2 +# +######################################################################## +# Չ} $lower → Ch $aspirate ; # ARMENIAN CAPITAL LETTER CHA Չ → CH $aspirate ; # ARMENIAN CAPITAL LETTER CHA չ → ch $aspirate ; # ARMENIAN SMALL LETTER CHA @@ -124,6 +236,17 @@ $wordBoundary{ո → vo ; # ARMENIAN SMALL LETTER VO Ց} $lower → Ts $aspirate ; # ARMENIAN CAPITAL LETTER CHEH Ց → TS $aspirate ; # ARMENIAN CAPITAL LETTER CO ց → ts $aspirate ; # ARMENIAN SMALL LETTER CO +# +# +######################################################################## +# +# The BGN does not show YIWN on its own. +# +#Ւ → W ; # ARMENIAN CAPITAL LETTER YIWN +#ւ → w ; # ARMENIAN SMALL LETTER YIWN +# +######################################################################## +# Փ → P $aspirate ; # ARMENIAN CAPITAL LETTER PIWR փ → p $aspirate ; # ARMENIAN SMALL LETTER PIWR Ք → K $aspirate ; # ARMENIAN CAPITAL LETTER KEH @@ -132,3 +255,7 @@ $wordBoundary{ո → vo ; # ARMENIAN SMALL LETTER VO օ → o ; # ARMENIAN SMALL LETTER OH Ֆ → F ; # ARMENIAN CAPITAL LETTER FEH ֆ → f ; # ARMENIAN SMALL LETTER FEH +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/ia_am.txt b/icu4c/source/data/translit/ia_am.txt new file mode 100644 index 00000000000..25e92c502b9 --- /dev/null +++ b/icu4c/source/data/translit/ia_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: ia_am.txt +# Generated from CLDR +# + +::ia-ia_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/ia_ar.txt b/icu4c/source/data/translit/ia_ar.txt new file mode 100644 index 00000000000..767ca8ce45d --- /dev/null +++ b/icu4c/source/data/translit/ia_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ia_ar.txt +# Generated from CLDR +# + +::ia-ia_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/ia_fa.txt b/icu4c/source/data/translit/ia_fa.txt new file mode 100644 index 00000000000..f2740a131ed --- /dev/null +++ b/icu4c/source/data/translit/ia_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ia_fa.txt +# Generated from CLDR +# + +::ia-ia_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/ia_ia_FONIPA.txt b/icu4c/source/data/translit/ia_ia_FONIPA.txt index 3486205145e..b81a25a43ff 100755 --- a/icu4c/source/data/translit/ia_ia_FONIPA.txt +++ b/icu4c/source/data/translit/ia_ia_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** @@ -8,15 +8,20 @@ # Generated from CLDR # +# Transformation from Interlingua (ia) to its IPA transcription (ia_FONIPA). +# http://en.wikipedia.org/wiki/Interlingua#Interlingua_alphabet +# http://www.omniglot.com/writing/interlingua.htm ::NFC; ::Lower; - +# Interlingua has five falling diphthongs. +# http://en.wikipedia.org/wiki/Interlingua#Orthography_and_pronunciation ai → ai\u032F; au → au\u032F; ei → ei\u032F; # rare eu → eu\u032F; oi → oi\u032F; # rare - +# ‹g› is usually [ɡ], but it is [d\u0361ʒ] in -age, -agi-, and -egi-. +# http://www.omniglot.com/writing/interlingua.htm $vowel = [aeiouy]; $end_of_word = [$ ]; {age} $end_of_word → ad\u0361ʒe; @@ -26,11 +31,10 @@ agi → ad\u0361ʒi; egi → ed\u0361ʒi; gg → ɡ; g → ɡ; - +# Omniglot: “The sounds of g and k assimilate a preceding n as in English.” {n} [gkqx] → ŋ; nn → n; n → n; - a → a; bb → b; b → b; @@ -73,3 +77,4 @@ w → v; x → ks; y → i; z → z; + diff --git a/icu4c/source/data/translit/it_am.txt b/icu4c/source/data/translit/it_am.txt index ada57676c0f..d8a06981019 100644 --- a/icu4c/source/data/translit/it_am.txt +++ b/icu4c/source/data/translit/it_am.txt @@ -1,19 +1,29 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: it_am.txt # Generated from CLDR # + +# Italian to Amharic Transliteration ::NFD(NFC); ::Lower(); ::[:Latin:] fullwidth-halfwidth(); +# +# +# Variables. $vowel = [aeiou]; $consonant = [bcdfghjklmnpqrstvwxyz]; +# +# +# Ignore apostrophe.
($consonant) \' → | $1; \' → ; +# +# cqu → ኩ ; cc → | c; ca → ካ; @@ -30,6 +40,8 @@ chu → ኩ; che → ኬ; cho → ኮ; c } $consonant → ክ; +# +# gg → | g; ghi → ጊ; ghe → ጌ; @@ -41,6 +53,8 @@ gnu → ኙ; gne → ኜ; gno → ኞ; gn } $consonant → ኝ; +# +# ga → ጋ; gia → ጂያ; giu → ጂዩ; @@ -50,6 +64,8 @@ gu → ጉ; ge → ጄ; go → ጎ; g } $consonant → ግ; +# +# rr → | r; ra → ራ; ri → ሪ; @@ -57,6 +73,8 @@ ru → ሩ; re → ሬ; ro → ሮ; r } $consonant → ር; +# +# ll → | l; la → ላ; li → ሊ; @@ -64,6 +82,8 @@ lu → ሉ; le → ሌ; lo → ሎ; l } $consonant → ል; +# +# tt → | t; ta → ታ; ti → ቲ; @@ -77,6 +97,8 @@ tho → ቶ; tzu → ፁ; tz → | zz; t } $consonant → ት; +# +# dd → | d; da → ዳ; di → ዲ; @@ -84,6 +106,8 @@ du → ዱ; de → ዴ; do → ዶ; d } $consonant → ድ; +# +# mm → | m; ma → ማ; mi → ሚ; @@ -91,6 +115,8 @@ mu → ሙ; me → ሜ; mo → ሞ; m } $consonant → ም; +# +# nn → | n; na → ና; ni → ኒ; @@ -98,6 +124,8 @@ nu → ኑ; ne → ኔ; no → ኖ; n } $consonant → ን; +# +# ff → | f; fa → ፋ; fi → ፊ; @@ -105,6 +133,8 @@ fu → ፉ; fe → ፌ; fo → ፎ; f } $consonant → ፍ; +# +# bb → | b; ba → ባ; bi → ቢ; @@ -112,6 +142,8 @@ bu → ቡ; be → ቤ; bo → ቦ; b } $consonant → ብ; +# +# pp → | p; pa → ፓ; pi → ፒ; @@ -119,6 +151,8 @@ pu → ፑ; pe → ፔ; po → ፖ; p } $consonant → ፕ; +# +# vv → | v; va → ቫ; vi → ቪ; @@ -126,8 +160,13 @@ vu → ቩ; ve → ቬ; vo → ቮ; v } $consonant → ቩ; +# +# sa } nt[ao] → ሣ; ss → | \~s; +# +# +# 's' is voiced before [bdglmnrv]. sb → ዝ | b; sd → ዝ | d; sg → ዝ | g; @@ -136,46 +175,71 @@ sm → ዝ | m; sn → ዝ | n; sr → ዝ | r; sv → ዝ | v; +# +# +# Force 's' after a consonant to be unvoiced. ($consonant) s } $vowel → | $1 \~ s; \~sa → ሣ; \~si → ሢ; \~su → ሡ; \~se → ሤ; \~so → ሦ; +# +# +# 's' at the beginning is usually unvoiced. [:^Letter:] { sa → ሣ; [:^Letter:] { si → ሢ; [:^Letter:] { su → ሡ; [:^Letter:] { se → ሤ; [:^Letter:] { so → ሦ; +# +# +# Otherwise voiced 's' are common. sa → ዛ; si → ዚ; su → ዙ; se → ዜ; so → ዞ; +# +# scia → ሺያ; sci → ሺ; sce → ሼ; +# +# zz → | \~z; +# +# Force 'z' after a consonant to be unvoiced.
($consonant) z → | $1 \~z; \~za → ጻ; \~zi → ጺ; \~zu → ጹ; \~ze → ጼ; \~zo → ጾ; +# +# +# Otherwise voiced 'z' are common except for 'zi'. za → ዛ; [:^Letter:] { zi → ዚ; zi → ዚ; zu → ዙ; ze → ዜ; zo → ዞ; +# +# ja → ያ; je → ዬ; j → | i; +# +# +# Standalone vowels and consonants. a → አ; i → ዒ; u → ዑ; e → ዔ; o → ዖ; +# +# b → ብ; c → ክ; d → ድ; @@ -195,5 +259,8 @@ v → ው; x → | cs; y → | i; z → ዝ; +# +# [:nonspacing mark:] → ; ::NFC(NFD); + diff --git a/icu4c/source/data/translit/it_ja.txt b/icu4c/source/data/translit/it_ja.txt index 27ec5e89820..1ae73f87612 100644 --- a/icu4c/source/data/translit/it_ja.txt +++ b/icu4c/source/data/translit/it_ja.txt @@ -1,19 +1,32 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: it_ja.txt # Generated from CLDR # + +# Italian to Katakana Transliteration Table for ICU +# Based on: +# "現代イタリア語入門" (大学書林, 1974. ISBN:978-4475017176) +# http://ja.wikipedia.org/wiki/%E3%82%A4%E3%82%BF%E3%83%AA%E3%82%A2%E8%AA%9E ::NFD(NFC); ::Lower(); ::[:Latin:] fullwidth-halfwidth(); +# +# +# Variables. $vowel = [aeiou]; $consonant = [bcdfghjklmnpqrstvwxyz]; +# +# +# Ignore apostrophe. 
($consonant) \' → | $1; \' → ; +# +# cqu → ック; cc → ッ | c; ca → カ; @@ -23,11 +36,15 @@ ci → チ; cu → ク; ce → チェ; co → コ; +# +# cha → シャ; chi → キ; chu → チュ; che → ケ; cho → チョ; +# +# gg → ッ | g; ghi → ギ; ghe → ゲ; @@ -38,6 +55,8 @@ gni → ニ; gnu → ヌ; gne → ニェ; gno → ニョ; +# +# ga → ガ; gia → ジャ; giu → ジュ; @@ -46,18 +65,24 @@ gi → ジ; gu → グ; ge → ジェ; go → ゴ; +# +# rr → ッ | r; ra → ラ; ri → リ; ru → ル; re → レ; ro → ロ; +# +# ll → ッ | l; la → ラ; li → リ; lu → ル; le → レ; lo → ロ; +# +# tt → ッ | t; ta → タ; ti → ティ; @@ -70,49 +95,68 @@ to → ト; tho → ト; tzu → | ッツ; tz → | zz; +# +# dd → ッ | d; da → ダ; di → ディ; du → ドゥ; de → デ; do → ド; +# +# ma → マ; mi → ミ; mu → ム; me → メ; mo → モ; m } $consonant → ン; +# +# na → ナ; ni → ニ; nu → ヌ; ne → ネ; no → ノ; +# +# ff → ッ | f; fa → ファ; fi → フィ; fu → フ; fe → フェ; fo → フォ; +# +# bb → ッ | b; ba → バ; bi → ビ; bu → ブ; be → ベ; bo → ボ; +# +# pp → ッ | p; pa → パ; pi → ピ; pu → プ; pe → ペ; po → ポ; +# +# vv → ッ | v; va → ヴァ; vi → ヴィ; vu → ヴ; ve → ヴェ; vo → ヴォ; +# +# sa } nt[ao] → サ; ss → ッ | \~s; +# +# +# 's' is voiced before [bdglmnrv]. sb → ズ | b; sd → ズ | d; sg → ズ | g; @@ -121,46 +165,71 @@ sm → ズ | m; sn → ズ | n; sr → ズ | r; sv → ズ | v; +# +# +# Force 's' after a consonant to be unvoiced. ($consonant) s } $vowel → | $1 \~ s; \~sa → サ; \~si → シ; \~su → ス; \~se → セ; \~so → ソ; +# +# +# 's' at the beginning is usually unvoiced. [:^Letter:] { sa → サ; [:^Letter:] { si → シ; [:^Letter:] { su → ス; [:^Letter:] { se → セ; [:^Letter:] { so → ソ; +# +# +# Otherwise voiced 's' are common. sa → ザ; si → ジ; su → ズ; se → ゼ; so → ゾ; +# +# scia → シャ; sci → シ; sce → シェ; +# +# zz → ッ | \~z; +# +# Force 'z' after a consonant to be unvoiced. ($consonant) z → | $1 \~z; \~za → ツァ; \~zi → ツィ; \~zu → ツ; \~ze → ツェ; \~zo → ツォ; +# +# +# Otherwise voiced 'z' are common except for 'zi'. za → ザ; [:^Letter:] { zi → ジ; zi → ツィ; zu → ズ; ze → ゼ; zo → ゾ; +# +# ja → ヤ; je → イェ; j → | i; +# +# +# Standalone vowels and consonants.
a → ア; i → イ; u → ウ; e → エ; o → オ; +# +# b → ブ; c → ク; d → ド; @@ -180,7 +249,20 @@ v → ヴ; x → | cs; y → | i; z → ツ; +# +# +# word delimiter of transliterated foreign phrase is '・'. ' ' → ・; +# +# +# Latin hyphen should be transliterated to U+30A0 (KATAKANA-HIRAGANA +# DOUBLE HYPHEN), ideally. But since the character isn't supported by +# many fonts or software, we use U+FF1D (FULLWIDTH EQUALS SIGN), +# which is widely used as "double hyphen". +# \- → =; +# +# [:nonspacing mark:] → ; ::NFC(NFD); + diff --git a/icu4c/source/data/translit/Katakana_Latin_BGN.txt b/icu4c/source/data/translit/ja_Hrkt_ja_Latn_BGN.txt old mode 100755 new mode 100644 similarity index 82% rename from icu4c/source/data/translit/Katakana_Latin_BGN.txt rename to icu4c/source/data/translit/ja_Hrkt_ja_Latn_BGN.txt index 38f8738d12f..4a589294165 --- a/icu4c/source/data/translit/Katakana_Latin_BGN.txt +++ b/icu4c/source/data/translit/ja_Hrkt_ja_Latn_BGN.txt @@ -1,16 +1,62 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Katakana_Latin_BGN.txt +# File: ja_Hrkt_ja_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN Agreement +# +# The modified Hepburn system for the romanization of Japanese has been +# in use by the U.S. Board on Geographic Names since about 1930 and has +# been used extensively in the romanization of Japanese geographic names. +# The system is well adapted to the general needs of speakers of English +# and is the most widely used system for the romanization of Japanese.
+# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Japanese-Latin +# :: [あいうえおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろわゐゑをんゔアイウエオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロワヰヱヲンヴ] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## $apostrophe = ’; +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# +######################################################################## +# +# BGN Page 45 Rule 2: +# +# A small-script tsu form (ッ or っ) is inserted between kana symbols +# to indicate a double consonant and is romanized as k before k; +# as s before s or sh; as t before t, ts, or ch; and as p before p. 
+# +######################################################################## +# ッ}[カキクケコ] → k ; # KATAKANA LETTER SMALL TU っ}[かきくけこ] → k ; # HIRAGANA LETTER SMALL TU ッ}[サシスセソ] → s ; # KATAKANA LETTER SMALL TU @@ -19,6 +65,20 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; っ}[たちつてと] → t ; # HIRAGANA LETTER SMALL TU ッ}[パピプペポ] → p ; # KATAKANA LETTER SMALL TU っ}[ぱぴぷぺぽ] → p ; # HIRAGANA LETTER SMALL TU +# +# +######################################################################## +# +# End of Rule 2 +# +######################################################################## +# +######################################################################## +# +# Start of Syllabic Transformations +# +######################################################################## +# ア → a ; # KATAKANA LETTER A イ → i ; # KATAKANA LETTER I ウ → u ; # KATAKANA LETTER U @@ -110,10 +170,31 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ヰ → i ; # KATAKANA LETTER WI ヱ → e ; # KATAKANA LETTER WE ヲ → o ; # KATAKANA LETTER WO +# +# +######################################################################## +# +# BGN Page 45 Rule 3: +# +# The character ン should be romanized m before b, p, or m. +# The character ん should be romanized m before b, p, or m. +# The character ン should be romanized n’ before y or a vowel letter. +# The character ん should be romanized n’ before y or a vowel letter. 
+# +######################################################################## +# ン}[バビブベボパピプペポマミムメモ] → m ; # KATAKANA LETTER N ん}[ばびぶべぼぱぴぷぺぽまみむめも] → m ; # HIRAGANA LETTER N ン}[ヤユヨアイウエオ] → n $apostrophe ; # KATAKANA LETTER N ん}[やゆよあいうえお] → n $apostrophe ; # HIRAGANA LETTER N +# +# +######################################################################## +# +# End of Rule 3 +# +######################################################################## +# ン → n ; # KATAKANA LETTER N ガ → ga ; # KATAKANA LETTER GA ギョウ → gyō ; # KATAKANA LETTER GI + SMALL YO + U @@ -309,3 +390,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ぽう → pō ; # HIRAGANA LETTER PO + U ぽ → po ; # HIRAGANA LETTER PO ゔ → v ; # HIRAGANA LETTER VU +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/ja_Latn_ko.txt b/icu4c/source/data/translit/ja_Latn_ko.txt index 11857253c0b..2eb824aa088 100644 --- a/icu4c/source/data/translit/ja_Latn_ko.txt +++ b/icu4c/source/data/translit/ja_Latn_ko.txt @@ -1,19 +1,39 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: ja_Latn_ko.txt # Generated from CLDR # + +# Japanese (Rōmaji) to Korean (Hangul) transliteration table for ICU. +# Can be run in sequence after e.g. Katakana-Latin. +# +# Based on 문교부 고시 제85-11호 (1986. 1. 7.) 외래어 표기법 +# For background info, see http://ko.wikisource.org/wiki/문교부_고시_제85-11호 +# and http://ko.wikipedia.org/wiki/외래어_표기법 (外來語表記法) ::NFD(NFC); ::[:Latin:] Lower(); $consonant = [ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑᄒ]; $lengthMarker = [\u0302\u0304]; +# +# +# Drop hyphens and apostrophes. [\-\'] → ; +# +# +# Turn long /e:/ into diphthong /ei/. 
e $lengthMarker → | e i ; +# +# +# Ignore vowel length everywhere else. $lengthMarker → ; +# +# +# Vowels. +# [^$consonant] { ( [aiueoyw] ) → ᄋ | $1 ; # Supply a required null initial. a → ᅡ ; i\~e → | ie ; # イェ @@ -25,6 +45,10 @@ u\~o → ᅯ ; # ウォ u → ᅮ ; e → ᅦ ; o → ᅩ ; +# +# +# Geminates. +# kk → ᆺ | k ; ss → ᆺ | s ; tt → ᆺ | t ; @@ -40,12 +64,20 @@ dd → ᆺ | d ; bb → ᆺ | b ; vv → ᆺ | v ; pp → ᆺ | p ; +# +# +# Consonants. +# ' ' { k → | g ; # Beginning of a word (after space). ^k → | g ; # Beginning of the string. k → ᄏ ; +# +# sh → | sy ; su → 스 ; s → ᄉ ; +# +# te\~ → | t ; # テュ to\~ → | t ; # トゥ tsu\~ → | ch ; # ツァ, ツィ, etc. @@ -57,27 +89,45 @@ t → ᄐ ; ' ' { ch → | j ; ^ch → | j ; ch → ᄎ ; +# +# n } [\ \'bcdfghjkmnprstwz] → ᆫ ; n$ → ᆫ ; n → ᄂ ; +# +# h → ᄒ ; fu\~ → | p ; # フュ fu → | hu ; f → | p ; +# +# m } [bmp] → ᆫ ; m → ᄆ ; +# +# ya → ᅣ ; yi → ᅵ ; # Added for convenience, after shi. yu → ᅲ ; ye → ᅨ ; yo → ᅭ ; +# +# r → ᄅ ; +# +# wa → ᅪ ; w → ; +# +# g → ᄀ ; +# +# zu → 즈 ; z → | j ; j → ᄌ ; +# +# de\~ → | d ; # デュ dji\~ → | j ; # ヂァ, ヂゥ, etc. dji → | ji ; # ヂ @@ -86,8 +136,15 @@ dzu\~ → | j ; # ヅァ, ヅィ, etc. dzu → | zu ; # ヅ dz → | j ; d → ᄃ ; +# +# b → ᄇ ; vu\~ → | b ; # ヴァ, etc. v → | b ; +# +# p → ᄑ ; +# +# ::NFC(NFD); + diff --git a/icu4c/source/data/translit/ja_Latn_ru.txt b/icu4c/source/data/translit/ja_Latn_ru.txt index 6a5f3fb3af0..d2fe3213ed7 100644 --- a/icu4c/source/data/translit/ja_Latn_ru.txt +++ b/icu4c/source/data/translit/ja_Latn_ru.txt @@ -1,29 +1,70 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: ja_Latn_ru.txt # Generated from CLDR # + +# Japanese (Rōmaji) to Russian (Cyrillic) Polivanov transliteration for ICU. 
+# Can be run in sequence after e.g. Katakana-Latin. +# +# These rules cannot be used to target Bulgarian, Serbian, Tajik, or Ukrainian. +# +# TODO: Cyrillization needs to respect morpheme/Kanji boundaries. +# 中井 becomes Накаи, but 北海道 becomes Хоккайдо. We need boundary +# markup in the input in order to do that properly. +# ::NFD(NFC); ::[:Latin:] Lower(); +# +# $lengthMarker = [\u0302\u0304]; +# +# +# Delete apostrophes. Apostrophes after "n" are consumed below. \' → ; +# +# +# Turn long /e:/ into diphthong /ei/. +# Note that /ei/ across a morpheme boundary (e.g. 武井 Takei) becomes эи. e $lengthMarker → эй ; +# +# +# Turn long /i:/ into two vowels /ii/. i $lengthMarker → | i i ; +# +# +# Ignore vowel length everywhere else. $lengthMarker → ; +# +# +# Vowels. +# +# TODO(mjansche): Enable diphthongs once we have Kanji boundaries. +## ai → ай ; a → а ; i\~e → | ye ; i → и ; u\~ → в ; # ウィ etc. +# +## ui → уй ; u → у ; e → э ; o → о ; +# +# +# Consonants. +# k → к ; +# +# sh → | sy ; s → с ; +# +# ch → | ty ; c } ch → t ; te\~ → | t ; # テュ @@ -31,25 +72,45 @@ to\~ → | t ; # トゥ tsu\~ → | ts ; # ツァ, ツィ, etc. ts → ц ; t → т ; +# +# \~tsu → | tsu ; +# +# n } [bpm] → м ; # 群馬 → Гумма n\' → нъ ; n → н ; +# +# h → х ; fu\~ → | f ; # フュ f → ф ; +# +# m → м ; +# +# ya → я ; yi → и ; # Added for convenience, after sh, ch, j. yu → ю ; ye → е ; # ?? unobserved yo → ё ; +# +# r → р ; +# +# wa → ва ; w → ; +# +# g → г ; +# +# j → | zy ; z → дз ; +# +# de\~ → | d ; # デュ dji\~ → | z ; # ヂャ, ヂュ, etc. dj → | j ; # ヂ @@ -57,8 +118,15 @@ do\~ → | d ; # ドゥ dzu\~ → | z ; # ヅァ, ヅィ, etc. dz → | z ; # ヅ d → д ; +# +# b → б ; vu\~ → | v ; # ヴァ, etc. v → в ; # ?? 
unobserved +# +# p → п ; +# +# ::NFC(NFD); + diff --git a/icu4c/source/data/translit/Georgian_Latin_BGN.txt b/icu4c/source/data/translit/ka_ka_Latn_BGN.txt similarity index 50% rename from icu4c/source/data/translit/Georgian_Latin_BGN.txt rename to icu4c/source/data/translit/ka_ka_Latn_BGN.txt index d5f02e61575..9c850b7bc94 100644 --- a/icu4c/source/data/translit/Georgian_Latin_BGN.txt +++ b/icu4c/source/data/translit/ka_ka_Latn_BGN.txt @@ -1,15 +1,57 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Georgian_Latin_BGN.txt +# File: ka_ka_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1981 System +# +# The BGN/PCGN system for Georgian was designed for use in romanizing +# names written in the Georgian alphabet. The alphabet shown here is +# known as the Mkhedruli alphabet and is the alphabet presently +# used in the Republic of Georgia. 
+# +# The Georgian Alphabet as defined by the BGN (Page 27): +# +# აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Georgian-Latin +# :: [აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# ა → a ; # GEORGIAN LETTER AN ბ → b ; # GEORGIAN LETTER BAN გ → g ; # GEORGIAN LETTER GAN @@ -43,3 +85,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ხ → kh ; # GEORGIAN LETTER XAN ჯ → j ; # GEORGIAN LETTER JHAN ჰ → h ; # GEORGIAN LETTER HAE +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/kk_am.txt b/icu4c/source/data/translit/kk_am.txt new file mode 100644 index 00000000000..b01cf23feb9 --- /dev/null +++ b/icu4c/source/data/translit/kk_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: kk_am.txt +# Generated from CLDR +# + +::kk-kk_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/kk_ar.txt b/icu4c/source/data/translit/kk_ar.txt new file mode 100644 index 00000000000..9e6336e90f7 --- /dev/null +++ b/icu4c/source/data/translit/kk_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: kk_ar.txt +# Generated from CLDR +# + +::kk-kk_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/kk_fa.txt b/icu4c/source/data/translit/kk_fa.txt new file mode 100644 index 00000000000..2e58e87c517 --- /dev/null +++ b/icu4c/source/data/translit/kk_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: kk_fa.txt +# Generated from CLDR +# + +::kk-kk_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/kk_kk_FONIPA.txt b/icu4c/source/data/translit/kk_kk_FONIPA.txt new file mode 100644 index 00000000000..24be67e0138 --- /dev/null +++ b/icu4c/source/data/translit/kk_kk_FONIPA.txt @@ -0,0 +1,68 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: kk_kk_FONIPA.txt +# Generated from CLDR +# + +# http://en.wikipedia.org/wiki/Kazakh_language#Phonology +# +# Output: +# m n ŋ +# p b t d k ɡ q ɢ +# f v x h +# s z ʃ ʒ ɕ t\u0361s t\u0361ɕ +# j w l ɾ +# ʉ ʊ ɘ ə ɛ æ ɑ +# i\u032Fɘ y\u032Fʉ u\u032Fʊ +::NFC; +::Lower; +ә → æ; +а → ɑ; +п → p; +б → b; +д → d; +е → i\u032Fɘ; +г → ɡ; +ғ → ɢ; +һ → h; +і → ɘ; +й → j; +к → k; +қ → q; +л → l; +м → m; +н → n; +ң → ŋ; +р → ɾ; +с → s; +т → t; +у → w; +з → z; +ш → ʃ; +ж → ʒ; +ы → ə; +ө → y\u032Fʉ; +о → u\u032Fʊ; +ү → ʉ; +ұ → ʊ; +# Some characters that are not really Kazakh, but appear frequently +# in Kazakh-language text as part of loanwords. +в → v; +и → i; +ц → t\u0361s; +ч → t\u0361ɕ; +щ → ɕ; +х → x; +ф → f; +э → ɛ; +ю → ju; +я → jɑ; +ё → jo; +ъ →; +ь →; +\- → ' '; + diff --git a/icu4c/source/data/translit/Kazakh_Latin_BGN.txt b/icu4c/source/data/translit/kk_kk_Latn_BGN.txt similarity index 53% rename from icu4c/source/data/translit/Kazakh_Latin_BGN.txt rename to icu4c/source/data/translit/kk_kk_Latn_BGN.txt index ed695acbdda..4e58f187a3d 100644 --- a/icu4c/source/data/translit/Kazakh_Latin_BGN.txt +++ b/icu4c/source/data/translit/kk_kk_Latn_BGN.txt @@ -1,14 +1,44 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Kazakh_Latin_BGN.txt +# File: kk_kk_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1979 System +# +# The BGN/PCGN system for Kazakh Cyrillic was designed for use in +# romanizing names written in the Kazakh Cyrillic alphabet. 
+# The Kazakh Cyrillic alphabet contains nine letters not present +# in the Russian alphabet: Әә, Ғғ, Ққ, Ңң, Өө, Ұұ, Үү, Һһ, and Іі. +# +# The Kazakh Cyrillic Alphabet as defined by the BGN (Page 47): +# +# АӘБВГҒДЕЁЖЗИЙКҚЛМНҢОӨПРСТУҰҮФХҺЦЧШЩЪЫІЬЭЮЯ +# аәбвгғдеёжзийкқлмнңоөпрстуұүфхһцчшщъыіьэюя +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: KazakhCyrl-Latin +# :: [АӘБВГҒДЕЁЖЗИЙКҚЛМНҢОӨПРСТУҰҮФХҺЦЧШЩЪЫІЬЭЮЯаәбвгғдеёжзийкқлмнңоөпрстуұүфхһцчшщъыіьэюя] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ʹ ; $doublePrime = ʺ ; $upperConsonants = [БВГҒДЖЗЙКҚЛМНҢПРСТФХҺЦЧШЩЪЬ] ; @@ -18,7 +48,22 @@ $upperVowels = [АӘЕЁИОӨУҰҮЫІЭЮЯ] ; $lowerVowels = [аәеёиоөуұүыіэюя] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Ә → Ä ; # CYRILLIC CAPITAL LETTER SCHWA @@ -27,11 +72,33 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; б → b ; # CYRILLIC SMALL LETTER BE В → V ; # CYRILLIC CAPITAL LETTER VE в → v ; # CYRILLIC SMALL LETTER VE +# +# +######################################################################## +# +# BGN Page 48 Rule 1 +# +# The character sequences гһ, зһ, 
кһ, нг, сһ and цһ may be romanized +# g·h, z·h, k·h, n·g, s·h and ts·h in order to differentiate those +# romanizations from the digraphs gh, zh, kh, ng, sh, and the letter +# sequence tsh, which are used to render the characters г, ж, х, ң, ш, +# and the character sequence тш. +# +######################################################################## +# ГҺ → G·H ; # CYRILLIC CAPITAL LETTER GHE Гһ → G·h ; # CYRILLIC CAPITAL LETTER GHE гһ → g·h ; # CYRILLIC SMALL LETTER GHE Г → G ; # CYRILLIC CAPITAL LETTER GHE г → g ; # CYRILLIC SMALL LETTER GHE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Ғ} $lower → Gh ; # CYRILLIC CAPITAL LETTER GHE WITH STROKE Ғ → GH ; # CYRILLIC CAPITAL LETTER GHE WITH STROKE ғ → gh ; # CYRILLIC SMALL LETTER GHE WITH STROKE @@ -45,31 +112,85 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE ж → zh ; # CYRILLIC SMALL LETTER ZHE +# +# +######################################################################## +# +# BGN Page 48 Rule 1 +# +# зһ becomes z·h +# +######################################################################## +# ЗҺ → Z·H ; # CYRILLIC CAPITAL LETTER ZE Зһ → Z·h ; # CYRILLIC CAPITAL LETTER ZE зһ → z·h ; # CYRILLIC SMALL LETTER ZE З → Z ; # CYRILLIC CAPITAL LETTER ZE з → z ; # CYRILLIC SMALL LETTER ZE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# И → Ī ; # CYRILLIC CAPITAL LETTER I и → ī ; # CYRILLIC SMALL LETTER I Й → Y ; # CYRILLIC CAPITAL LETTER I й → y ; # CYRILLIC SMALL LETTER I +# +# +######################################################################## +# +# BGN Page 48 Rule 1 +# +# кһ becomes k·h +# +######################################################################## +# КҺ → K·H 
; # CYRILLIC CAPITAL LETTER KA Кһ → K·h ; # CYRILLIC CAPITAL LETTER KA кһ → k·h ; # CYRILLIC SMALL LETTER KA К → K ; # CYRILLIC CAPITAL LETTER KA к → k ; # CYRILLIC SMALL LETTER KA +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Қ → Q ; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER қ → q ; # CYRILLIC SMALL LETTER KA WITH DESCENDER Л → L ; # CYRILLIC CAPITAL LETTER EL л → l ; # CYRILLIC SMALL LETTER EL М → M ; # CYRILLIC CAPITAL LETTER EM м → m ; # CYRILLIC SMALL LETTER EM +# +# +######################################################################## +# +# BGN Page 48 Rule 1 +# +# нг becomes n·g +# +######################################################################## +# НГ → N·G ; # CYRILLIC CAPITAL LETTER EN Нг → N·g ; # CYRILLIC CAPITAL LETTER EN нг → n·g ; # CYRILLIC SMALL LETTER EN Н → N ; # CYRILLIC CAPITAL LETTER EN н → n ; # CYRILLIC SMALL LETTER EN +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Ң} $lower → Ng ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER Ң → NG ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER ң → ng ; # CYRILLIC SMALL LETTER EN WITH DESCENDER @@ -81,11 +202,29 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; п → p ; # CYRILLIC SMALL LETTER PE Р → R ; # CYRILLIC CAPITAL LETTER ER р → r ; # CYRILLIC SMALL LETTER ER +# +# +######################################################################## +# +# BGN Page 48 Rule 1 +# +# сһ becomes s·h +# +######################################################################## +# СҺ → S·H ; # CYRILLIC CAPITAL LETTER ES Сһ → S·h ; # CYRILLIC CAPITAL LETTER ES сһ → s·h ; # CYRILLIC SMALL LETTER ES С → S ; # CYRILLIC CAPITAL LETTER ES с → s ; # CYRILLIC SMALL LETTER ES +# +# +######################################################################## +# +# End Rule 1 
+# +######################################################################## +# Т → T ; # CYRILLIC CAPITAL LETTER TE т → t ; # CYRILLIC SMALL LETTER TE У → Ū ; # CYRILLIC CAPITAL LETTER U @@ -101,15 +240,43 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; х → kh ; # CYRILLIC SMALL LETTER HA Һ → H ; # CYRILLIC CAPITAL LETTER SHHA һ → h ; # CYRILLIC SMALL LETTER SHHA +# +# +######################################################################## +# +# BGN Page 48 Rule 1 +# +# цһ becomes ts·h +# +######################################################################## +# ЦҺ → TS·H ; # CYRILLIC CAPITAL LETTER GHE Цһ → Ts·h ; # CYRILLIC CAPITAL LETTER GHE цһ → ts·h ; # CYRILLIC SMALL LETTER GHE Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE Ц → TS ; # CYRILLIC CAPITAL LETTER TSE ц → ts ; # CYRILLIC SMALL LETTER TSE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE Ч → CH ; # CYRILLIC CAPITAL LETTER CHE ч → ch ; # CYRILLIC SMALL LETTER CHE +# +# +######################################################################## +# +# Implied rule from BGN Russian-Latin transliteration (Page 94 Note 3.6). 
+# +# шч becomes sh·ch +# +######################################################################## +# ШЧ → SH·CH ; # CYRILLIC CAPITAL LETTER SHA Шч → Sh·ch ; # CYRILLIC CAPITAL LETTER SHA шч → sh·ch ; # CYRILLIC SMALL LETTER SHA @@ -119,10 +286,43 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Щ} $lower → Shch ; # CYRILLIC CAPITAL LETTER SHCHA Щ → SHCH ; # CYRILLIC CAPITAL LETTER SHCHA щ → shch ; # CYRILLIC SMALL LETTER SHCHA +# +# +######################################################################## +# +# End Implied rule +# +######################################################################## +# Ъ → $doublePrime ; # CYRILLIC CAPITAL LETTER HARD SIGN ъ → $doublePrime ; # CYRILLIC SMALL LETTER HARD SIGN +# +# +######################################################################## +# +# BGN Page 48 Note 2 +# +# The character Ыы may be romanized Ɨɨ instead of Yy, if so desired. +# +######################################################################## +# Ы → Y ; # CYRILLIC CAPITAL LETTER YERU ы → y ; # CYRILLIC SMALL LETTER YERU +# +# +# Alternative rule to implement the option described here. To apply +# uncomment the following by removing the '#' mark at the start of the +# line and insert before the two rule lines above. 
+# +#Ы → Ɨ ; # CYRILLIC CAPITAL LETTER YERU +#ы → ɨ ; # CYRILLIC SMALL LETTER YERU +# +######################################################################## +# +# End BGN Page 48 Note 2 +# +######################################################################## +# І → I ; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I і → i ; # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I Ь → $prime ; # CYRILLIC CAPITAL LETTER SOFT SIGN @@ -135,3 +335,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA Я → YA ; # CYRILLIC CAPITAL LETTER YA я → ya ; # CYRILLIC SMALL LETTER YA +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/Korean_Latin_BGN.txt b/icu4c/source/data/translit/ko_ko_Latn_BGN.txt similarity index 87% rename from icu4c/source/data/translit/Korean_Latin_BGN.txt rename to icu4c/source/data/translit/ko_ko_Latn_BGN.txt index 682493d99de..0e4ef0bfd42 100644 --- a/icu4c/source/data/translit/Korean_Latin_BGN.txt +++ b/icu4c/source/data/translit/ko_ko_Latn_BGN.txt @@ -1,18 +1,72 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Korean_Latin_BGN.txt +# File: ko_ko_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN Agreement +# +# This system was devised by G. M. McCune and E. O. Reischauer, and +# was originally published in the Transactions of the Korea Branch of +# the Royal Asiatic Society, Volume XXIX, 1939. It has been used by +# the BGN since 1943, and was later adopted for use by the PCGN.
A +# main characteristic of this system is the attempt to represent +# approximate Korean pronunciation, while systematically converting +# the Hangul characters to corresponding Roman-script letters. Since +# Korean pronunciation is often inconsistently represented in Hangul, +# the McCune-Reischauer conversion tables are rather elaborate, and +# reverse conversion (from Roman script back to Hangul) is not possible. +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Korean-Latin +# :: [ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑᄒᄭᄯᄲᄶᅡᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆫᆮᆯᆰᆱᆲᆷᆸᆺᆼᆽᆾᆿᇀᇁ] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $aspirate = ’; $apostrophe = ’; $vowels = [ᅡᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵ] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# +######################################################################## +# +# BGN Page 60 Rule 1: +# +# Romanization of Hangul consonants and consonant clusters within words. 
+# +######################################################################## +# ᆨᄀ → kk ; # HANGUL JONGSEONG KIYEOK + CHOSEONG KIYEOK ᆨᄂ → ngn ; # HANGUL JONGSEONG KIYEOK + CHOSEONG NIEUN ᆨᄃ → kt ; # HANGUL JONGSEONG KIYEOK + CHOSEONG TIKEUT @@ -222,6 +276,19 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ᆲᄈ → lpp ; # HANGUL JONGSEONG RIEUL-PIEUP + CHOSEONG SSANGPIEUP ᆲᄊ → pss ; # HANGUL JONGSEONG RIEUL-PIEUP + CHOSEONG SSANGSIOS ᆲᄍ → ptch ; # HANGUL JONGSEONG RIEUL-PIEUP + CHOSEONG SSANGCIEUC +# +# +######################################################################## +# +# End of Rule 1 +# +######################################################################## +# +######################################################################## +# +# Start of Transformations +# +######################################################################## $wordBoundary{ᄀ → k ; # HANGUL CHOSEONG KIYEOK $wordBoundary{ᄂ → n ; # HANGUL CHOSEONG NIEUN $wordBoundary{ᄃ → t ; # HANGUL CHOSEONG TIKEUT @@ -281,3 +348,7 @@ $wordBoundary{ᄶ → tch ; # HANGUL CHOSEONG SIOS-CIEUC ᇁ}$wordBoundary → p ; # HANGUL JONGSEONG PHIEUPH ᆰ}$wordBoundary → k ; # HANGUL JONGSEONG RIEUL-KIYEOK ᆲ}$wordBoundary → p ; # HANGUL JONGSEONG RIEUL-PIEUP +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/ky_am.txt b/icu4c/source/data/translit/ky_am.txt new file mode 100644 index 00000000000..0b89baaa985 --- /dev/null +++ b/icu4c/source/data/translit/ky_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: ky_am.txt +# Generated from CLDR +# + +::ky-ky_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/ky_ar.txt b/icu4c/source/data/translit/ky_ar.txt new file mode 100644 index 00000000000..b807e9f7111 --- /dev/null +++ b/icu4c/source/data/translit/ky_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ky_ar.txt +# Generated from CLDR +# + +::ky-ky_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/ky_fa.txt b/icu4c/source/data/translit/ky_fa.txt new file mode 100644 index 00000000000..ae1627edf9b --- /dev/null +++ b/icu4c/source/data/translit/ky_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ky_fa.txt +# Generated from CLDR +# + +::ky-ky_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/ky_ky_FONIPA.txt b/icu4c/source/data/translit/ky_ky_FONIPA.txt index a59dc1069d9..c5cf34053ed 100755 --- a/icu4c/source/data/translit/ky_ky_FONIPA.txt +++ b/icu4c/source/data/translit/ky_ky_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** @@ -8,11 +8,11 @@ # Generated from CLDR # +# Transformation from Kyrgyz (ky) to its IPA transcription (ky_FONIPA). +# http://en.wikipedia.org/wiki/Kyrgyz_alphabet#Correspondence_chart ::Lower; ::NFC; - $consonant_sound = [bdfɡklmnŋpqrʁsʃtvzʒχ]; - аа → ɑː; а → ɑ; б → b; @@ -38,9 +38,9 @@ $consonant_sound = [bdfɡklmnŋpqrʁsʃtvzʒχ]; $consonant_sound {к} [$] → q; # eg. даңк, калк, кырк [ŋ] {к} → q; к → k; - +# TODO(sascha): Verify whether /lʲ/ is really phonemic in Kyrgyz; +# is there really a minimal pair with /l/ versus /lʲ/? [eøy] ː? {л}к → lʲ; # eg. мүлк, күлкү, өлкө, эзелки - лл → lː; л → l; мм → mː; @@ -76,3 +76,5 @@ $consonant_sound {к} [$] → q; # eg. даңк, калк, кырк э → e; ю → ju; я → jɑ; +\- → ' '; + diff --git a/icu4c/source/data/translit/Kirghiz_Latin_BGN.txt b/icu4c/source/data/translit/ky_ky_Latn_BGN.txt similarity index 58% rename from icu4c/source/data/translit/Kirghiz_Latin_BGN.txt rename to icu4c/source/data/translit/ky_ky_Latn_BGN.txt index 6ea9a4085b3..4c034e90e0a 100644 --- a/icu4c/source/data/translit/Kirghiz_Latin_BGN.txt +++ b/icu4c/source/data/translit/ky_ky_Latn_BGN.txt @@ -1,14 +1,44 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Kirghiz_Latin_BGN.txt +# File: ky_ky_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1979 System +# +# The BGN/PCGN system for Kirghiz Cyrillic was designed for use in +# romanizing names written in the Kirghiz Cyrillic alphabet. +# The Kirghiz Cyrillic alphabet contains three letters not present +# in the Russian alphabet: Ңң, Өө, and Үү. 
+# +# The Kirghiz Cyrillic Alphabet as defined by the BGN (Page 55): +# +# АБВГДЕЁЖЗИЙКЛМНҢОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯ +# абвгдеёжзийклмнңоөпрстуүфхцчшщъыьэюя +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: KirghizCyrl-Latin +# :: [АБВГДЕЁЖЗИЙКЛМНҢОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнңоөпрстуүфхцчшщъыьэюя] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ʹ ; $doublePrime = ʺ ; $upperConsonants = [БВГДЖЗЙКЛМНҢПРСТФХЦЧШЩЪЬ] ; @@ -18,7 +48,22 @@ $upperVowels = [АЕЁИОӨУҮЫЭЮЯ] ; $lowerVowels = [аеёиоөуүыэюя] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE @@ -48,11 +93,31 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; л → l ; # CYRILLIC SMALL LETTER EL М → M ; # CYRILLIC CAPITAL LETTER EM м → m ; # CYRILLIC SMALL LETTER EM +# +# +######################################################################## +# +# BGN Page 56 Rule 1 +# +# The character sequence нг may be romanized n·g in order to differentiate +# that romanization from the digraph ng, which is used to render the +# character ң.
+# +######################################################################## +# НГ → N·G ; # CYRILLIC CAPITAL LETTER EN Нг → N·g ; # CYRILLIC CAPITAL LETTER EN нг → n·g ; # CYRILLIC SMALL LETTER EN Н → N ; # CYRILLIC CAPITAL LETTER EN н → n ; # CYRILLIC SMALL LETTER EN +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Ң} $lower → Ng ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER Ң → NG ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER ң → ng ; # CYRILLIC SMALL LETTER EN WITH DESCENDER @@ -83,6 +148,16 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE Ч → CH ; # CYRILLIC CAPITAL LETTER CHE ч → ch ; # CYRILLIC SMALL LETTER CHE +# +# +######################################################################## +# +# Implied rule from BGN Russian-Latin transliteration (Page 94 Note 3.6). +# +# шч becomes sh·ch +# +######################################################################## +# ШЧ → SH·CH ; # CYRILLIC CAPITAL LETTER SHA Шч → Sh·ch ; # CYRILLIC CAPITAL LETTER SHA шч → sh·ch ; # CYRILLIC SMALL LETTER SHA @@ -92,10 +167,43 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Щ} $lower → Shch ; # CYRILLIC CAPITAL LETTER SHCHA Щ → SHCH ; # CYRILLIC CAPITAL LETTER SHCHA щ → shch ; # CYRILLIC SMALL LETTER SHCHA +# +# +######################################################################## +# +# End Rule 3.6 +# +######################################################################## +# Ъ → $doublePrime ; # CYRILLIC CAPITAL LETTER HARD SIGN ъ → $doublePrime ; # CYRILLIC SMALL LETTER HARD SIGN +# +# +######################################################################## +# +# BGN Page 56 Note 2 +# +# The character Ыы may be romanized Ɨɨ instead of Yy, if so desired. 
+# +######################################################################## +# Ы → Y ; # CYRILLIC CAPITAL LETTER YERU ы → y ; # CYRILLIC SMALL LETTER YERU +# +# +# Alternative rule to implement the option described here. To apply +# uncomment the following by removing the '#' mark at the start of the +# line and insert before the two rule lines above. +# +#Ы → Ɨ ; # CYRILLIC CAPITAL LETTER YERU +#ы → ɨ ; # CYRILLIC SMALL LETTER YERU +# +######################################################################## +# +# End BGN Page 56 Note 2 +# +######################################################################## +# Ь → $prime ; # CYRILLIC CAPITAL LETTER SOFT SIGN ь → $prime ; # CYRILLIC SMALL LETTER SOFT SIGN Э → E ; # CYRILLIC CAPITAL LETTER E @@ -106,3 +214,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA Я → YA ; # CYRILLIC CAPITAL LETTER YA я → ya ; # CYRILLIC SMALL LETTER YA +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/la_la_FONIPA.txt b/icu4c/source/data/translit/la_la_FONIPA.txt index 7a73519cb55..3a7a3c71aea 100755 --- a/icu4c/source/data/translit/la_la_FONIPA.txt +++ b/icu4c/source/data/translit/la_la_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** @@ -8,12 +8,18 @@ # Generated from CLDR # +# Transformation from Latin (la) to its IPA transcription (la_FONIPA). +# http://en.wikipedia.org/wiki/Latin_spelling_and_pronunciation +# http://en.wikipedia.org/wiki/Wikipedia:IPA_for_Latin +# +# These rules follow the Wikipedia description of the presumed pronunciation +# of Classical Latin. 
This is different from Medieval Latin, and it is also +# different from the ecclesiastical pronunciation used by the Roman Catholic +# church. ::Lower; ::NFC; - $vowel = [aáàăāeéèĕēiíìĭīoóòŏōuúùŭūæœ]; $end_of_word = [$ ]; - ae → aj; av → aw; æ → aj; # 19th century English orthography @@ -60,10 +66,9 @@ xs → ks; # Old Latin spelling x → ks; y → y; # Greek loanwords z → d\u0361z; # eg. zerum - ::Null; - - +# Gemination of double consonants. +# http://en.wikipedia.org/wiki/Latin_spelling_and_pronunciation#Double_consonants bb → bː; dd → dː; ɡɡ → ɡː; @@ -76,7 +81,8 @@ pp → pː; rr → rː; ss → sː; tt → tː; - +# Velarization of [l]. +# http://en.wikipedia.org/wiki/Latin_spelling_and_pronunciation#cite_note-20 {l} [^aeɛiouː] → ɫ; - ::NFC; + diff --git a/icu4c/source/data/translit/lt_Lower.txt b/icu4c/source/data/translit/lt_Lower.txt index 6b118baea70..f321ae2b262 100755 --- a/icu4c/source/data/translit/lt_Lower.txt +++ b/icu4c/source/data/translit/lt_Lower.txt @@ -1,12 +1,22 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: lt_Lower.txt # Generated from CLDR # + +# Introduce an explicit dot above when lowercasing capital Is and Js +# whenever there are more accents above. 
+# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) +# 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I +# 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J +# 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK +# 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE +# 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE +# 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE ::NFD(); I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; @@ -16,3 +26,4 @@ I \u0301 → i \u0307 \u0301; I \u0303 → i \u0307 \u0303; ::Any-Lower(); ::NFC(); + diff --git a/icu4c/source/data/translit/lt_Title.txt b/icu4c/source/data/translit/lt_Title.txt index 050edc91ff8..56fa8e9e8cc 100755 --- a/icu4c/source/data/translit/lt_Title.txt +++ b/icu4c/source/data/translit/lt_Title.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: lt_Title.txt # Generated from CLDR # + +# Make any string of letters after a cased letter be lower ::NFD(); [:cased:] [:case-ignorable:]* {I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; [:cased:] [:case-ignorable:]* {J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; @@ -15,6 +17,8 @@ [:cased:] [:case-ignorable:]* {I \u0301 → i \u0307 \u0301; [:cased:] [:case-ignorable:]* {I \u0303 → i \u0307 \u0303; [:cased:] [:case-ignorable:]* { (.) 
→ &Any-Lower($1) ; +# Otherwise all lowercase go to upper (titlecase stay as is) [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; ([:Lowercase:]) → &Any-Upper($1) ; ::NFC(); + diff --git a/icu4c/source/data/translit/lt_Upper.txt b/icu4c/source/data/translit/lt_Upper.txt index 2fbc1377bbe..f070c4ff7cb 100644 --- a/icu4c/source/data/translit/lt_Upper.txt +++ b/icu4c/source/data/translit/lt_Upper.txt @@ -1,13 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: lt_Upper.txt # Generated from CLDR # + +# Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved. +# Remove \u0307 following soft-dotteds (i, j, and the like), with possible intervening non-230 marks. ::NFD(); [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; ::Any-Upper(); ::NFC(); + diff --git a/icu4c/source/data/translit/Macedonian_Latin_BGN.txt b/icu4c/source/data/translit/mk_mk_Latn_BGN.txt similarity index 55% rename from icu4c/source/data/translit/Macedonian_Latin_BGN.txt rename to icu4c/source/data/translit/mk_mk_Latn_BGN.txt index e0289da2a8b..b69d4de4c74 100644 --- a/icu4c/source/data/translit/Macedonian_Latin_BGN.txt +++ b/icu4c/source/data/translit/mk_mk_Latn_BGN.txt @@ -1,14 +1,45 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Macedonian_Latin_BGN.txt +# File: mk_mk_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1981 System +# +# Macedonian was officially established as a literary language in +# Yugoslavia during World War II and is now the official language +# of Macedonia. Its alphabet is identical to Serbian, except +# that the letters Ђђ and Ћћ are replaced by Ѓѓ and Ќќ, and +# the letter Ѕѕ and the apostrophe are added. +# +# The Macedonian Alphabet as defined by the BGN (Page 69): +# +# АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ +# абвгдѓежзѕијклљмнњопрстќуфхцчџш’ +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Macedonian-Latin +# :: [АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш’] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ’ ; $upperConsonants = [БВГДЃЖЗЅЈКЛЉМНЊПРСТЌФХЦЧЏШ] ; $lowerConsonants = [бвгдѓжзѕјклљмнњпрстќфхцчџш’] ; @@ -17,7 +48,22 @@ $upperVowels = [АЕИОУ] ; $lowerVowels = [аеиоу] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; #
CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE @@ -28,10 +74,29 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; г → g ; # CYRILLIC SMALL LETTER GHE Д → D ; # CYRILLIC CAPITAL LETTER DE д → d ; # CYRILLIC SMALL LETTER DE +# +# +######################################################################## +# +# BGN Page 70 Rule 1: +# +# The character ѓ should be romanized g when it occurs before е +# and и. In all other instances, it should be romanized đ (Đ). +# +######################################################################## +# Ѓ}[ЕеИи] → G ; # CYRILLIC CAPITAL LETTER GJE ѓ}[ЕеИи] → g ; # CYRILLIC SMALL LETTER GJE Ѓ → Đ ; # CYRILLIC CAPITAL LETTER GJE ѓ → đ ; # CYRILLIC SMALL LETTER GJE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Е → E ; # CYRILLIC CAPITAL LETTER DE е → e ; # CYRILLIC SMALL LETTER DE Ж → Ž ; # CYRILLIC CAPITAL LETTER ZHE @@ -44,7 +109,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; И → I ; # CYRILLIC CAPITAL LETTER I и → i ; # CYRILLIC SMALL LETTER I Ј → J ; # CYRILLIC CAPITAL LETTER JE -ј → J ; # CYRILLIC SMALL LETTER JE +ј → j ; # CYRILLIC SMALL LETTER JE К → K ; # CYRILLIC CAPITAL LETTER KA к → k ; # CYRILLIC SMALL LETTER KA Л → L ; # CYRILLIC CAPITAL LETTER EL @@ -69,10 +134,29 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; с → s ; # CYRILLIC SMALL LETTER ES Т → T ; # CYRILLIC CAPITAL LETTER TE т → t ; # CYRILLIC SMALL LETTER TE +# +# +######################################################################## +# +# BGN Page 70 Rule 2: +# +# The character ќ should be romanized k when it occurs before е +# and и. In all other instances, it should be romanized c\u0301. 
+# +######################################################################## +# Ќ}[ЕеИи] → K ; # CYRILLIC CAPITAL LETTER KJE ќ}[ЕеИи] → k ; # CYRILLIC SMALL LETTER KJE Ќ → C\u0301 ; # CYRILLIC CAPITAL LETTER KJE ќ → c\u0301 ; # CYRILLIC SMALL LETTER KJE +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# У → U ; # CYRILLIC CAPITAL LETTER U у → u ; # CYRILLIC SMALL LETTER U Ф → F ; # CYRILLIC CAPITAL LETTER EF @@ -88,3 +172,14 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; џ → dž ; # CYRILLIC SMALL LETTER SHA Ш → Š ; # CYRILLIC CAPITAL LETTER SHA ш → š ; # CYRILLIC SMALL LETTER SHA +# +# +######################################################################## +# +# BGN Page 69 Rule 32, maps the symbol onto itself and +# is ignored here for computational efficiency. +# +# $prime → $prime ; # RIGHT SINGLE QUOTATION MARK +# +######################################################################## + diff --git a/icu4c/source/data/translit/Mongolian_Latin_BGN.txt b/icu4c/source/data/translit/mn_mn_Latn_BGN.txt similarity index 58% rename from icu4c/source/data/translit/Mongolian_Latin_BGN.txt rename to icu4c/source/data/translit/mn_mn_Latn_BGN.txt index 9bb8e740524..33c9b9f1300 100644 --- a/icu4c/source/data/translit/Mongolian_Latin_BGN.txt +++ b/icu4c/source/data/translit/mn_mn_Latn_BGN.txt @@ -1,14 +1,43 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Mongolian_Latin_BGN.txt +# File: mn_mn_Latn_BGN.txt # Generated from CLDR # -:: [АБВГДЕЁЖЗИЙКЛМНОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмноөпрстуүфхцчшщъыьэюя] ; -:: NFD (NFC) ; + +######################################################################## +# BGN/PCGN 1964 System +# +# The BGN/PCGN system for Mongolian was adopted by the BGN in 1957 +# and by the PCGN in 1964 for use in romanizing names written in +# the Mongolian Cyrillic alphabet. The Mongolian Cyrillic alphabet +# contains two letters not present in the Russian alphabet, Өө +# and Үү. Names written in the indigenous Mongolian alphabet, which +# is still utilized in the Inner Mongolia Autonomous Region of China, +# are not romanized by BGN and PCGN. Instead, for such names, +# BGN and PCGN utilize the Roman-script spellings appearing in +# official sources published by the People's Republic of China +# +# The Mongolian Alphabet as defined by the BGN (Page 73): +# +# АБВГДЕЁЖЗИЙКЛМНОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯ +# абвгдеёжзийклмноөпрстуүфхцчшщъыьэюя +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Mongolian-Latin, works both in NFC and NFD +::[АБВГДЕЁЖЗИЙКЛМНОӨПРСТУҮФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмноөпрстуүфхцчшщъыьэюя\u0308]; +::NFC; +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ʹ ; $upperConsonants = [БВГДЖЙКЛМНПРСТФХЦЧШЩЭ] ; $lowerConsonants = [бвгджйклмнпрстфхцчшщэ] ; @@ -17,7 +46,12 @@ $upperVowels = [АЕЁЭИОУЫЮЯ] ; $lowerVowels = [аеёэиоуыюя] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest 
$wordBoundary = [^[:L:][:M:][:N:]] ; +######################################################################## +# Start of Alphabetic Transformations +######################################################################## А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE @@ -77,6 +111,10 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE Ч → CH ; # CYRILLIC CAPITAL LETTER CHE ч → ch ; # CYRILLIC SMALL LETTER CHE +######################################################################## +# Implied rule from BGN Russian-Latin transliteration (Page 94 Note 3.6). +# шч becomes sh·ch +######################################################################## ШЧ → SH·CH ; # CYRILLIC CAPITAL LETTER SHA Шч → Sh·ch ; # CYRILLIC CAPITAL LETTER SHA шч → sh·ch ; # CYRILLIC SMALL LETTER SHA @@ -86,6 +124,9 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Щ} $lower → Shch ; # CYRILLIC CAPITAL LETTER SHCHA Щ → SHCH ; # CYRILLIC CAPITAL LETTER SHCHA щ → shch ; # CYRILLIC SMALL LETTER SHCHA +######################################################################## +# End Implied rule +######################################################################## Ъ → $prime ; # CYRILLIC CAPITAL LETTER HARD SIGN ъ → $prime ; # CYRILLIC SMALL LETTER HARD SIGN Ы → Ï ; # CYRILLIC CAPITAL LETTER YERU @@ -94,9 +135,26 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ь → ĭ ; # CYRILLIC SMALL LETTER SOFT SIGN Э → E ; # CYRILLIC CAPITAL LETTER E э → e ; # CYRILLIC SMALL LETTER E +######################################################################## +# +# BGN Page 74 Rule 7 +# +# In monosyllables, the character ю is romanized yu or yü depending on +# pronunciation; in polysyllables, it is romanized yu when followed by +# a, o, or u, but yü when followed by i, e, ö, or ü. +# +# This rule is lexical and has not been implemented in this file.
+# +######################################################################## Ю} $lower → Yu ; # CYRILLIC CAPITAL LETTER YU Ю → YU ; # CYRILLIC CAPITAL LETTER YU ю → yu ; # CYRILLIC SMALL LETTER YU +######################################################################## +# +# End Rule 7 +# +######################################################################## Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA Я → YA ; # CYRILLIC CAPITAL LETTER YA я → ya ; # CYRILLIC SMALL LETTER YA + diff --git a/icu4c/source/data/translit/mn_mn_Latn_MNS.txt b/icu4c/source/data/translit/mn_mn_Latn_MNS.txt new file mode 100644 index 00000000000..b18855d3a4c --- /dev/null +++ b/icu4c/source/data/translit/mn_mn_Latn_MNS.txt @@ -0,0 +1,95 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: mn_mn_Latn_MNS.txt +# Generated from CLDR +# + +# Transliteration of Mongolian Cyrillic Characters into Mongolian Latin +# Characters according to Mongolian National Standard MNS 5217:2012. 
+# http://estandard.gov.mn/file.php?sid=2579 +::[[:Cyrl:]]; +$lower = [[:Ll:]]; +А → A; +а → a; +Б → B; +б → b; +В → V; +в → v; +Г → G; +г → g; +Д → D; +д → d; +Е} $lower → Ye; +Е → YE; +е → ye; +Ё} $lower → Yo; +Ё → YO; +ё → yo; +Ж → J; +ж → j; +З → Z; +з → z; +К → K; +к → k; +И → I; +и → i; +Й → I; +й → i; +Л → L; +л → l; +М → M; +м → m; +Н → N; +н → n; +О → O; +о → o; +Ө → Ö; +ө → ö; +П → P; +п → p; +Р → R; +р → r; +С → S; +с → s; +Т → T; +т → t; +У → U; +у → u; +Ү → Ü; +ү → ü; +Ф → F; +ф → f; +Х} $lower → Kh; +Х → KH; +х → kh; +Ц} $lower → Ts; +Ц → TS; +ц → ts; +Ч} $lower → Ch; +Ч → CH; +ч → ch; +Ш} $lower → Sh; +Ш → SH; +ш → sh; +Щ} $lower → Sh; +Щ → SH; +щ → sh; +Ъ → I; +ъ → i; +Ы → Y; +ы → y; +Ь → I; +ь → i; +Э → E; +э → e; +Ю} $lower → Yu; +Ю → YU; +ю → yu; +Я} $lower → Ya; +Я → YA; +я → ya; + diff --git a/icu4c/source/data/translit/my_am.txt b/icu4c/source/data/translit/my_am.txt new file mode 100644 index 00000000000..7b3adc00b19 --- /dev/null +++ b/icu4c/source/data/translit/my_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: my_am.txt +# Generated from CLDR +# + +::my-my_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/my_ar.txt b/icu4c/source/data/translit/my_ar.txt new file mode 100644 index 00000000000..7637e4acaab --- /dev/null +++ b/icu4c/source/data/translit/my_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: my_ar.txt +# Generated from CLDR +# + +::my-my_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/my_fa.txt b/icu4c/source/data/translit/my_fa.txt new file mode 100644 index 00000000000..a8e7213cc87 --- /dev/null +++ b/icu4c/source/data/translit/my_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: my_fa.txt +# Generated from CLDR +# + +::my-my_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/my_my_FONIPA.txt b/icu4c/source/data/translit/my_my_FONIPA.txt new file mode 100644 index 00000000000..d090bbe1fe1 --- /dev/null +++ b/icu4c/source/data/translit/my_my_FONIPA.txt @@ -0,0 +1,334 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: my_my_FONIPA.txt +# Generated from CLDR +# + +# Pronunciation rules for Burmese. +# +# The following rules are lexical and heuristic: lexical in the sense +# that they generate phoneme strings which may further undergo +# post-lexical phonological processes, in particular voicing, to +# result in actual surface forms; heuristic in the sense that they try +# to resolve ambiguities, especially around reduced vowels, in a +# systematic way that may be incorrect in many situations. Vowel +# reduction depends on many factors, such as morphemic structure, +# which are not available here. 
+# +# Definitions +# +# Dependent vowel signs +$vs_AA = \u102B; +$vs_aa = \u102C; +$vs_i = \u102D; +$vs_ii = \u102E; +$vs_u = \u102F; +$vs_uu = \u1030; +$vs_e = \u1031; +$vs_ai = \u1032; +# Various signs +$anusvara = \u1036; +$visarga = \u1038; +$virama = \u1039; +$asat = \u103A; +# Dependent (medial) consonant signs +$med_y = \u103B; +$med_r = \u103C; +$med_w = \u103D; +$med_h = \u103E; +# Independent letters and letter-like punctuation symbols +$independent = [\u1000-\u102A \u103F \u104C-\u104F \u1050-\u1055]; +$creaky = \u0330; +$high = \u0301; +$low = \u0300; +$coda = [$creaky $high $low ɴ ʔ ə]; # TODO: remove if unused +# +# Preprocessing +# +::NFC; +# Replace U+102B TALL AA with U+102C AA. Their pronunciation is identical. +$vs_AA → $vs_aa; +# Unstack kinzi (င\u103A plus U+1039 VIRAMA) into plain င\u103A. +# Hmm, what would happen if the syllable ending in kinzi had non-low tone? +င\u103A $virama → င\u103A; +# Unstack everything else, i.e. replace U+1039 VIRAMA with U+103A ASAT. +$virama → $asat; +# Unstack U+103F GREAT SA. +ဿ → သ\u103Aသ; +# Insert a syllable boundary marker /./ before every independent letter. +::Null; +[^.$] { } $independent ([\u1037\u103B-\u103E])* [^\u103A] → \.; +# Insert default inherent vowel: /a\u0330/ at the end, /ə/ everywhere else. +::Null; +([\u1000-\u1021\u103F] [\u103B-\u103E]*) } [$] → $1 a $creaky; +([\u1000-\u1021\u103F] [\u103B-\u103E]*) } \. → $1 ə; +# Allow for additional coda consonants. +# +# This only covers a few of the cases in which full coda consonants +# can appear in loanwords. The general situation is somewhat rare and +# is more easily dealt with in a formalism that can impose structural +# constraints on syllables more easily. +::Null; +$asat ($visarga)? [\u1000-\u102A] { $asat → ; +# Deal with ၎င\u103Aး early. 
+၎င\u103Aး → lə\.ɡa $high ʊ\u032Fɴ; +# +# Rhymes +# +::Null; +က\u103A → ɛʔ; +ဂ\u103A → ɛʔ; # in မဂ\u1039ဂဇင\u103Aး ~ မဂ\u103Aဂဇင\u103Aး /mɛʔ.ɡə.zɪ\u0301ɴ/ +င\u1037\u103A → ɪ $creaky ɴ; +င\u103Aး → ɪ $high ɴ; +င\u103A → ɪ $low ɴ; +စ\u103A → ɪʔ; # maybe sometimes /eɪ\u032Fʔ/ +ဉ\u1037\u103A → ɪ $creaky ɴ; +ဉ\u103Aး → ɪ $high ɴ; +ဉ\u103A → ɪ $low ɴ; +ည\u1037\u103A → ɛ $creaky; +ည\u103Aး → ɛ $high; +ည\u103A → ɛ $low; +ဏ\u1037\u103A → a $creaky ɴ; +ဏ\u103Aး → a $high ɴ; +ဏ\u103A → a $low ɴ; +တ\u103A → aʔ; +န\u1037\u103A → a $creaky ɴ; +န\u103Aး → a $high ɴ; +န\u103A → a $low ɴ; +ပ\u103A → aʔ; +မ\u1037\u103A → a $creaky ɴ; +မ\u103Aး → a $high ɴ; +မ\u103A → a $low ɴ; +ယ\u1037\u103A → ɛ $creaky; +ယ\u103Aး → ɛ $high; +ယ\u103A → ɛ $low; +သ\u103A → aʔ; +$vs_aa ဉ\u1037\u103A → ɪ $creaky ɴ; +$vs_aa ဉ\u103Aး → ɪ $high ɴ; +$vs_aa ဉ\u103A → ɪ $low ɴ; +$vs_aa တ\u103A → aʔ; +$vs_aa ဏ\u1037\u103A → a $creaky ɴ; +$vs_aa ဏ\u103Aး → a $high ɴ; +$vs_aa ဏ\u103A → a $low ɴ; +$vs_aa န\u1037\u103A → a $creaky ɴ; +$vs_aa န\u103Aး → a $high ɴ; +$vs_aa န\u103A → a $low ɴ; +$vs_aa ပ\u103A → aʔ; # in ကလာပ\u103Aစည\u103Aး /kə.laʔ.sɛ\u0301/ (club cell) +$vs_aa ယ\u1037\u103A → ɛ $creaky; +$vs_aa ယ\u103Aး → ɛ $high; +$vs_aa ယ\u103A → ɛ $low; +$vs_aa \u1037 → a $creaky; # redundant creaky tone +$vs_aa း → a $high; +$vs_aa → a $low; +$vs_i က\u103A → eɪ\u032Fʔ; +$vs_i စ\u103A → eɪ\u032Fʔ; +$vs_i တ\u103A → eɪ\u032Fʔ; +$vs_i န\u1037\u103A → e $creaky ɪ\u032Fɴ; +$vs_i န\u103Aး → e $high ɪ\u032Fɴ; +$vs_i န\u103A → e $low ɪ\u032Fɴ; +$vs_i ပ\u103A → eɪ\u032Fʔ; +$vs_i မ\u1037\u103A → e $creaky ɪ\u032Fɴ; +$vs_i မ\u103Aး → e $high ɪ\u032Fɴ; +$vs_i မ\u103A → e $low ɪ\u032Fɴ; +$vs_i $vs_u က\u103A → aɪ\u032Fʔ; +$vs_i $vs_u င\u1037\u103A → a $creaky ɪ\u032Fɴ; +$vs_i $vs_u င\u103Aး → a $high ɪ\u032Fɴ; +$vs_i $vs_u င\u103A → a $low ɪ\u032Fɴ; +$vs_i $vs_u ဏ\u1037\u103A → a $creaky ɪ\u032Fɴ; +$vs_i $vs_u ဏ\u103Aး → a $high ɪ\u032Fɴ; +$vs_i $vs_u ဏ\u103A → a $low ɪ\u032Fɴ; +$vs_i $vs_u ယ\u1037\u103A → o $creaky; +$vs_i 
$vs_u ယ\u103Aး → o $high; +$vs_i $vs_u ယ\u103A → o $low; # in က\u102D\u102Fယ\u103A /kò/ +$vs_i $vs_u \u1037 → o $creaky; +$vs_i $vs_u း → o $high; +$vs_i $vs_u → o $low; +$vs_i $anusvara \u1037 → e $creaky ɪ\u032Fɴ; +$vs_i $anusvara း → e $high ɪ\u032Fɴ; +$vs_i $anusvara → e $low ɪ\u032Fɴ; +$vs_i → i $creaky; +$vs_ii \u1037 → i $creaky; # this does not usually occur +$vs_ii း → i $high; +$vs_ii → i $low; +$vs_u က\u103A → oʊ\u032Fʔ; +$vs_u ဂ\u103A → oʊ\u032Fʔ; +$vs_u ဏ\u1037\u103A → o $creaky ʊ\u032Fɴ; +$vs_u ဏ\u103Aး → o $high ʊ\u032Fɴ; +$vs_u ဏ\u103A → o $low ʊ\u032Fɴ; +$vs_u တ\u103A → oʊ\u032Fʔ; +$vs_u န\u1037\u103A → o $creaky ʊ\u032Fɴ; +$vs_u န\u103Aး → o $high ʊ\u032Fɴ; +$vs_u န\u103A → o $low ʊ\u032Fɴ; +$vs_u ပ\u103A → oʊ\u032Fʔ; +$vs_u မ\u1037\u103A → o $creaky ʊ\u032Fɴ; +$vs_u မ\u103Aး → o $high ʊ\u032Fɴ; +$vs_u မ\u103A → o $low ʊ\u032Fɴ; +$vs_u $anusvara \u1037 → o $creaky ʊ\u032Fɴ; +$vs_u $anusvara း → o $high ʊ\u032Fɴ; +$vs_u $anusvara → o $low ʊ\u032Fɴ; +$vs_u → u $creaky; +$vs_uu \u1037 → u $creaky; # this does not usually occur +$vs_uu း → u $high; +$vs_uu → u $low; +$vs_e တ\u103A → ɪʔ; +$vs_e $vs_aa က\u103A → aʊ\u032Fʔ; +$vs_e $vs_aa င\u1037\u103A → a $creaky ʊ\u032Fɴ; +$vs_e $vs_aa င\u103Aး → a $high ʊ\u032Fɴ; +$vs_e $vs_aa င\u103A → a $low ʊ\u032Fɴ; +$vs_e $vs_aa \u1037 → ɔ $creaky; +$vs_e $vs_aa း → ɔ $high; # redundant high tone; this does not usually occur +$vs_e $vs_aa \u103A → ɔ $low; +$vs_e $vs_aa → ɔ $high; +$vs_e \u1037 → e $creaky; +$vs_e း → e $high; +$vs_e → e $low; +$vs_ai \u1037 → ɛ $creaky; +$vs_ai း → ɛ $high; # redundant high tone; this does not usually occur +$vs_ai → ɛ $high; +$anusvara \u1037 → a $creaky ɴ; +$anusvara း → a $high ɴ; +$anusvara → a $low ɴ; +$med_w တ\u103A → ʊʔ; +$med_w န\u1037\u103A → ʊ $creaky ɴ; +$med_w န\u103Aး → ʊ $high ɴ; +$med_w န\u103A → ʊ $low ɴ; +$med_w ပ\u103A → ʊʔ; +$med_w မ\u1037\u103A → ʊ $creaky ɴ; +$med_w မ\u103Aး → ʊ $high ɴ; +$med_w မ\u103A → ʊ $low ɴ; +# +# Medials +# +::Null; +# Palatalization 
of the velar stops before MEDIAL YA and MEDIAL RA: +# velar + /j/ ==> modern palatals. +ကျ → t\u0361ɕ; +ချ → t\u0361ɕʰ; +ဂျ → d\u0361ʑ; +ဃျ → d\u0361ʑ; +ကြ → t\u0361ɕ; +ခြ → t\u0361ɕʰ; +ဂြ → d\u0361ʑ; +ဃြ → d\u0361ʑ; +# Remove redundant MEDIAL YA and MEDIAL RA after initial YA. +ယ { [$med_y $med_r] → ; +# Reorder the medials so that U+103E SIGN MEDIAL HA comes before any +# other medials. +# First, push U+103E MEDIAL HA before U+103D MEDIAL WA. +\u103D \u103E → \u103E \u103D; +::Null; +# Now MEDIAL WA comes last. +# Produce the palatal ʃ from (SA|LA)+YA+HA. +သျ\u103E → ʃ; +လျ\u103E → ʃ; +# Second, push U+103E MEDIAL HA before U+103C MEDIAL RA. +\u103C \u103E → \u103E \u103C; +::Null; +# Finally, push U+103E MEDIAL HA before U+103B MEDIAL YA. +\u103B \u103E → \u103E \u103B; +::Null; +# Consume MEDIAL HA and apply devoicing. +င\u103E → ŋ\u030A; +ဉ\u103E → ɲ\u0325; +ည\u103E → ɲ\u0325; +ဏ\u103E → n\u0325; +န\u103E → n\u0325; +မ\u103E → m\u0325; +ယ\u103E → ʃ; +ရ\u103E → ʃ; +လ\u103E → l\u0325; +ဝ\u103E → w\u0325; +ဠ\u103E → l\u0325; +# Drop any remaining U+103E MEDIAL HA. +\u103E → ; +# Simplify medial cluster /jw/ to /w/, i.e. drop U+103B MEDIAL YA and +# U+103C MEDIAL RA before U+103D MEDIAL WA. 
# TODO: revisit this +\u103B } \u103D → ; +\u103C } \u103D → ; +\u103B → j; +\u103C → j; +\u103D → w; +# +# Initials +# +# Velars +က → k; +ခ → kʰ; +ဂ → ɡ; +ဃ → ɡ; +င → ŋ; +# Historic palatals +စ → s; +ဆ → sʰ; +ဇ → z; +ဈ → z; +ဉ → ɲ; +ည → ɲ; +# Alveolars +ဋ → t; +ဌ → tʰ; +ဍ → d; +ဎ → d; +ဏ → n; +# Historic dentals ==> alveolars +တ → t; +ထ → tʰ; +ဒ → d; +ဓ → d; +န → n; +# Labials +ပ → p; +ဖ → pʰ; +ဗ → b; +ဘ → b; +မ → m; +# Other letters +ယ → j; +ရ → j; # historic /r/ +လ\u103A → ; # final, typically not pronounced in native words +လ → l; +ဝ → w; +သ → θ; # historic /s/ ==> modern dental +ဟ → h; +ဠ → l; +အ → ʔ; +# Independent vowels +ဣ\u1037 → ʔḭ; # redundant creaky tone; this does not usually occur +ဣး → ʔí; # this does not usually occur +ဣ → ʔḭ; +ဤ\u1037 → ʔḭ; # this does not usually occur +ဤး → ʔí; # this does not usually occur +ဤ → ʔì; +ဥ\u1037 → ʔṵ; # redundant creaky tone; this does not usually occur +ဥး → ʔú; # this does not usually occur +ဥ → ʔṵ; +ဦ\u1037 → ʔṵ; # this does not usually occur +ဦး → ʔú; +ဦ → ʔù; +ဧ\u1037 → ʔḛ; # this does not usually occur +ဧး → ʔé; +ဧ → ʔè; +ဩ\u1037 → ʔɔ\u0330; # this does not usually occur +ဩး → ʔɔ\u0301; # redundant high tone; this does not usually occur +ဩ → ʔɔ\u0301; +ဪ\u1037 → ʔɔ\u0330; # this does not usually occur +ဪး → ʔɔ\u0301; # this does not usually occur +ဪ → ʔɔ\u0300; +# Various signs +၌ → n\u0325aɪ\u032Fʔ; +၍ → jwḛ; +# ၎င\u103Aး was handled earlier. +၏ → ʔḭ; +# +# Postprocessing +# +# Delete any remaining U+103A ASAT. +$asat → ; +# Delete zero-width space, non-joiner, joiner. 
+[\u200B-\u200D] → ; +::NFC; + diff --git a/icu4c/source/data/translit/nl_Title.txt b/icu4c/source/data/translit/nl_Title.txt index 6226039d8bd..778a57d4dd0 100644 --- a/icu4c/source/data/translit/nl_Title.txt +++ b/icu4c/source/data/translit/nl_Title.txt @@ -1,11 +1,16 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: nl_Title.txt # Generated from CLDR # + +# Copyright (C) 2011-2013, Apple Inc. and others. All Rights Reserved. +# Special titlecasing for Dutch initial "ij". ::Any-Title(); +# Fix up Ij at the beginning of a "word" (per Any-Title, not UAX #29) [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; + diff --git a/icu4c/source/data/translit/pl_FONIPA_ja.txt b/icu4c/source/data/translit/pl_FONIPA_ja.txt index 20f849b325f..bd9e904abdc 100644 --- a/icu4c/source/data/translit/pl_FONIPA_ja.txt +++ b/icu4c/source/data/translit/pl_FONIPA_ja.txt @@ -1,15 +1,21 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: pl_FONIPA_ja.txt # Generated from CLDR # + +# Transforms a Phonemic IPA transcription of Polish (pl_FONIPA) to Katakana. +# $word_boundary = [-\ $] ; $vowel = [aeiouw] ; # Vowels and glides $not_vowel = [^$vowel] ; +# +# +# First pass: Collapse phonetic distinctions not preserved in Katakana. 
ç → | h; ɡ → | g; ʎ → | l; @@ -17,14 +23,23 @@ $not_vowel = [^$vowel] ; d \u0361 ʑ → | ʑ; d \u0361 ʐ → | ʐ; d \u0361 z → | z; +# +# ɛ\u0303 → | en; ɛ → | e; [ɨʲ] → | i; ɔ\u0303 → | on; ɔ → | o; +# +# :: Null (); +# +# +# Main pass: Phoneme to Katakana conversion. '.' → ; a → ア; +# +# ba → バ; bb → ッ | b; be → ベ; @@ -35,14 +50,20 @@ bo → ボ; bu → ブ; b } $word_boundary → プ; b → ブ; +# +# ca → チャ ; ce → チェ ; ci → チ ; cu → チュ ; co → チョ ; c → チ ; +# +# ^ d \u0361 ɕ → dɕ; d \u0361 ɕ → ッ | dɕ; +# +# da → ダ; dd → ッ | d; de → デ; @@ -52,7 +73,11 @@ du → ドゥ; dɕ → チ; d } $word_boundary → ト; d → ド; +# +# e → エ; +# +# fa → ファ; fe → フェ; ff → ッ | f; @@ -60,6 +85,8 @@ fi → フィ; fo → フォ; fu → フ; f → フ; +# +# ha → ハ; hi → ヒ; hu → フ; @@ -67,6 +94,8 @@ he → ヘ; ho → ホ; h } $word_boundary → ; h → フ; +# +# ga → ガ; ge → グエ; gi → ギ; @@ -75,13 +104,19 @@ go → ゴ; gu → グ; g } $word_boundary → ク; g → グ; +# +# i → イ ; +# +# ja → ヤ; ji → イ; jo → ヨ; je → イェ; ju → ユ; j → イ; +# +# ka → カ; ke → ケ; ki → キ; @@ -89,6 +124,8 @@ kk → ッ | k; ko → コ; ku → ク; k → ク; +# +# la → ラ ; le → レ ; li → リ ; @@ -96,6 +133,8 @@ lho → ロ ; lo → ロ ; lu → ル ; l → ル ; +# +# ma → マ ; me → メ ; mi → ミ ; @@ -103,19 +142,27 @@ mo → モ ; mu → ム ; m } [bp] → ン ; m → ム ; +# +# na → ナ ; ne → ネ ; ni → ニ ; no → ノ ; nu → ヌ ; n → ン ; +# +# ɲa → ニャ ; ɲe → ニエ ; ɲi → ニ ; ɲo → ニョ ; ɲu → ニュ ; ɲ → ン ; +# +# o → オ ; +# +# pa → パ ; pe → ペ ; pio → ピョ ; @@ -124,18 +171,24 @@ po → ポ ; pp → ッ | p; pu → プ ; p → プ ; +# +# ra → ラ ; re → レ ; ri → リ ; ro → ロ ; ru → ル ; r → ル; +# +# sa → サ ; se → セ ; si → シ ; so → ソ ; su → ス ; s → ス ; +# +# ɕa → シャ; # not backed by data ɕe → シェ; ɕu → シュ; # not backed by data @@ -143,31 +196,50 @@ s → ス ; ɕvi → シフィ; ɕi → シ; ɕ → シ; +# +# ʂa → シャ; ʂe → シェ; ʂu → シュ; ʂo → ショ; # not backed by data ʂi → シ; ʂ → シュ; +# +# +#tʂa → ツァ; +#tʂi → トシ; +#tʂu → チュ; +#tʂe → トシェ; +#tʂ } $word_boundary → チ; +#tʂ → チュ; +# tɕa → チャ; tɕe → チェ; tɕi → チ; tɕu → チュ; tɕo → チョ; tɕ → チ; +# +# ta → タ; te → テ ; ti → ティ ; to → ト ; tu → トゥ ; +# +# tsa → ツァ ; 
tse → ツェ ; ts[ij] → ツィ ; tso → ツォ ; tsu → ツ ; ts → ツ ; +# +# ^tt → ト | t; tt → ッ | t; +# +# t \u0361 ʂa → ツァ; t \u0361 ʂi → チ; t \u0361 ʂu → チュ; @@ -177,7 +249,11 @@ t \u0361 ʂ } $word_boundary → チ; t \u0361 ʂ → チュ; t \u0361 → | t; t → ト ; +# +# u → ウ ; +# +# va → バ; ve → ベ; vi → ビ; @@ -185,18 +261,24 @@ vo → ボ; vu → ブ; v } $word_boundary → フ; v → ブ; +# +# wa → ワ; wu → ウ; wi → ウィ; we → ウェ; wo → ウォ; w → ウ; +# +# xa → ハ ; xe → ヘ ; xi → ヒ ; xo → ホ ; xu → フ ; x → フ ; +# +# ʐa → ジャ; ʐe → ジェ; ʐi → ジ; @@ -204,12 +286,16 @@ x → フ ; ʐu → ジュ; ʐ } $word_boundary → ジュ; ʐ → ジ; +# +# ʑa → ジャ; ʑi → ジ; ʑo → ジオ; ʑe → ジェ; ʑu → ジュ; ʑ → ジ; +# +# za → ザ; ze → ジェ; zi → ジ; @@ -217,3 +303,8 @@ zo → ゾ; zu → ズ; z } $word_boundary → ス; z → ズ; +# +# +#' ' → ・; +# + diff --git a/icu4c/source/data/translit/pl_am.txt b/icu4c/source/data/translit/pl_am.txt new file mode 100644 index 00000000000..17277b992f4 --- /dev/null +++ b/icu4c/source/data/translit/pl_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: pl_am.txt +# Generated from CLDR +# + +::pl-pl_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/pl_ar.txt b/icu4c/source/data/translit/pl_ar.txt new file mode 100644 index 00000000000..73685274bf2 --- /dev/null +++ b/icu4c/source/data/translit/pl_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: pl_ar.txt +# Generated from CLDR +# + +::pl-pl_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/pl_fa.txt b/icu4c/source/data/translit/pl_fa.txt new file mode 100644 index 00000000000..182b6c25155 --- /dev/null +++ b/icu4c/source/data/translit/pl_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: pl_fa.txt +# Generated from CLDR +# + +::pl-pl_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/pl_ja.txt b/icu4c/source/data/translit/pl_ja.txt index 5578c0a9e43..cee877222bf 100644 --- a/icu4c/source/data/translit/pl_ja.txt +++ b/icu4c/source/data/translit/pl_ja.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: pl_ja.txt # Generated from CLDR # + ::pl-pl_FONIPA; ::pl_FONIPA-ja; + diff --git a/icu4c/source/data/translit/pl_pl_FONIPA.txt b/icu4c/source/data/translit/pl_pl_FONIPA.txt index af37bd2060e..145f7002017 100644 --- a/icu4c/source/data/translit/pl_pl_FONIPA.txt +++ b/icu4c/source/data/translit/pl_pl_FONIPA.txt @@ -1,16 +1,28 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: pl_pl_FONIPA.txt # Generated from CLDR # + +# Polish orthography to phonemic transcription. +# http://en.wikipedia.org/wiki/Polish_phonology +# +# Transform input to normalized form NFC, and to lowercase. :: NFC () ; :: Lower () ; +# +# +# Definitions. $voiceless = [cfhkpst]; $vowel = [ aąeęioóuy ]; +# +# +# Digraphs and Trigraphs. +# ch } i → ç ; ch → x ; ci } $vowel → t \u0361 ɕ ; @@ -36,6 +48,8 @@ trw → trf ; tw → tf ; zi } $vowel → ʑ ; zi → ʑ i ; +# +# a → a ; ą } [bp] → ɔm ; ą } [kg] → ɔŋ ; @@ -84,11 +98,19 @@ y → ɨ ; ź → ʑ ; ż } $voiceless → ʂ ; ż → ʐ ; +# +# +# Second pass: Phoneme-to-phone rules. :: Null ; +# +# tʐ → tʂ ; pʐ → pʂ ; xʐ → xʂ ; ʐt\u0361ʂ → ʂt\u0361ʂ ; +# +# +# Final de-voicing. b } [$] → p ; d \u0361 z } [$] → t \u0361 s ; # rydz → rɨt\u0361s d } [$] → t ; @@ -97,3 +119,4 @@ v } [$] → f ; ʐ } [$] → ʂ ; ʑ } [$] → ɕ ; z } [$] → s ; + diff --git a/icu4c/source/data/translit/Pashto_Latin_BGN.txt b/icu4c/source/data/translit/ps_ps_Latn_BGN.txt similarity index 70% rename from icu4c/source/data/translit/Pashto_Latin_BGN.txt rename to icu4c/source/data/translit/ps_ps_Latn_BGN.txt index f6c0d2fddcf..f9771509d99 100644 --- a/icu4c/source/data/translit/Pashto_Latin_BGN.txt +++ b/icu4c/source/data/translit/ps_ps_Latn_BGN.txt @@ -1,22 +1,56 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** -# File: Pashto_Latin_BGN.txt +# File: ps_ps_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1968 System +# +# This system was adopted in 1968 for the romanization of Pashto +# geographic names in Afghanistan. Persian names in Afghanistan are +# romanized in accordance with the Romanization System for Persian +# (BGN/PCGN 1958 System), shown on pages 87-92). +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Pashto-Latin +# :: [ءآابةتثجحخدذرزسشصضطظعغفقكلمنهوىي\u064E\u064F\u0650\u0651\u0652\u0654٠١٢٣٤٥٦٧٨٩ټپځڅچډړږژښگڰڼیۍې] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $alef = ’; $ayin = ‘; $disambig = \u0331 ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## [:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR [:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR ٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR ٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR +# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate ، ↔ ',' ; # ARABIC COMMA ؛ ↔ ';' ; # ARABIC SEMICOLON ؟ ↔ '?' 
; # ARABIC QUESTION MARK @@ -41,10 +75,46 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ۷ ↔ 7 ; # EXTENDED ARABIC-INDIC DIGIT SEVEN ۸ ↔ 8 ; # EXTENDED ARABIC-INDIC DIGIT EIGHT ۹ ↔ 9 ; # EXTENDED ARABIC-INDIC DIGIT NINE +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# +######################################################################## +# +# BGN Page 89 Rule 4 +# +# The character sequences كه , زه , سه , and گه may be romanized k·h, z·h, +# s·h, and g·h in order to differentiate those romanizations from the +# digraphs kh, zh, sh, and gh. +# +######################################################################## +# كه → k·h ; # ARABIC LETTER KAF + HEH زه → z·h ; # ARABIC LETTER ZAIN + HEH سه → s·h ; # ARABIC LETTER SEEN + HEH گه → g·h ; # ARABIC LETTER GAF + HEH +# +# +######################################################################## +# +# End Rule 4 +# +######################################################################## +# +######################################################################## +# +# BGN Page 91 Rule 7 +# +# Doubled consonant sounds are represented in Arabic script by +# placing a shaddah ( \u0651 ) over a consonant character. In romanization +# the letter should be doubled. [The remainder of this rule deals with +# the definite article and is lexical.] 
+# +######################################################################## +# ب\u0651 → bb ; # ARABIC LETTER BEH + SHADDA پ\u0651 → pp ; # ARABIC LETTER PEH + SHADDA ت\u0651 → tt ; # ARABIC LETTER TEH + SHADDA @@ -86,6 +156,20 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; و\u0651 → ww ; # ARABIC LETTER WAW + SHADDA \u0651ی → yy ; # ARABIC LETTER FARSI YEH + SHADDA ى\u0651 → yy ; # ARABIC LETTER YEH + SHADDA +# +# +######################################################################## +# +# End Rule 7 +# +######################################################################## +# +######################################################################## +# +# Start of Transformations +# +######################################################################## +# $wordBoundary{ء → ; # ARABIC LETTER HAMZA ء → $alef ; # ARABIC LETTER HAMZA $wordBoundary{ا → ; # ARABIC LETTER ALEF @@ -135,7 +219,7 @@ $wordBoundary{ا → ; # ARABIC LETTER ALEF ى → y ; # ARABIC LETTER YEH ې → e ; # ARABIC LETTER E \u064Eا → ā ; # ARABIC FATHA + ALEF -\u064Eى\u0652 → ay ; # ARABIC FATHA + FARSI YEH + SUKUN +\u064Eى\u0652 → ay ; # ARABIC FATHA + FARSI YEH + SUKUN \u064Eى → á ; # ARABIC FATHA + ALEF MAKSURA \u064E\u0652ۍ → êy ; # ARABIC FATHA + SUKUN + YEH WITH TAIL \u064E\u0652 → ê ; # ARABIC FATHA + SUKUN @@ -150,3 +234,7 @@ $wordBoundary{ا → ; # ARABIC LETTER ALEF \u064Fو → ū ; # ARABIC DAMMA + WAW \u064F → u ; # ARABIC DAMMA \u0652 → ; # ARABIC SUKUN +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/rm_SURSILV_am.txt b/icu4c/source/data/translit/rm_SURSILV_am.txt new file mode 100644 index 00000000000..ea54b83aab0 --- /dev/null +++ b/icu4c/source/data/translit/rm_SURSILV_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: rm_SURSILV_am.txt +# Generated from CLDR +# + +::rm_SURSILV-rm_FONIPA_SURSILV; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/rm_SURSILV_ar.txt b/icu4c/source/data/translit/rm_SURSILV_ar.txt new file mode 100644 index 00000000000..d6191f7473c --- /dev/null +++ b/icu4c/source/data/translit/rm_SURSILV_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: rm_SURSILV_ar.txt +# Generated from CLDR +# + +::rm_SURSILV-rm_FONIPA_SURSILV; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/rm_SURSILV_fa.txt b/icu4c/source/data/translit/rm_SURSILV_fa.txt new file mode 100644 index 00000000000..8fe5548d817 --- /dev/null +++ b/icu4c/source/data/translit/rm_SURSILV_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: rm_SURSILV_fa.txt +# Generated from CLDR +# + +::rm_SURSILV-rm_FONIPA_SURSILV; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/rm_SURSILV_rm_FONIPA_SURSILV.txt b/icu4c/source/data/translit/rm_SURSILV_rm_FONIPA_SURSILV.txt new file mode 100644 index 00000000000..9affbdee104 --- /dev/null +++ b/icu4c/source/data/translit/rm_SURSILV_rm_FONIPA_SURSILV.txt @@ -0,0 +1,105 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: rm_SURSILV_rm_FONIPA_SURSILV.txt +# Generated from CLDR +# + +# Transforms Rumantsch Sursilvan (rm-sursilv) to IPA (rm-fonipa-sursilv). +# +# References +# ---------- +# [1] https://als.wikipedia.org/wiki/Surselvisch#Orthographi +# with links to sound recordings on Wikimedia Commons +# +# Output +# ------ +# m n ɲ +# p b t d c ɟ k ɡ +# f v s z ʃ ʒ h +# t\u0361ʃ t\u0361s +# r l j ʎ +# w +# i u e ʊ ɛ ɔ a +# ɪa\u032F ɪa\u032Fʊ\u032F ɪʊ\u032F ɪɛ\u032F u ʊa\u032F ʊa\u032Fʊ\u032F ʊɛ\u032F ʊɛ\u032Fɪ\u032F ʊɔ\u032F +# ɛɪ\u032F ɛʊ\u032F aɪ\u032F aʊ\u032F +::Lower; +ai → aɪ\u032F ; +au → aʊ\u032F ; +a → a ; +b → b ; +{c} [ei] → t\u0361s ; +c → k ; +d → d ; +ei → ɛɪ\u032F ; # can also be /aɪ\u032F/ or /ɔɪ\u032F/ in some regions +eu → ɛʊ\u032F ; +e → e ; # can also be /ɛ/; needs a dictionary +é → e ; +è → ɛ ; +f → f ; +ge → ɟ ; +gh → ɡ ; +gi → ɟ ; +{gl} [aeou] → ɡl ; +{gl} → ʎ ; +gn → ɲ ; +g → ɡ ; +h → ; +iau → ɪa\u032Fʊ\u032F ; +ia → ɪa\u032F ; +ie → ɪɛ\u032F ; +iu → ɪʊ\u032F ; +i → i ; +j → j ; +k → k ; +l → l ; +m → m ; +n → n ; +o → ɔ ; +p → p ; +q → k ; +r → r ; +{sch} [aeiou] → ʒ ; # can also be /ʃ/; needs a dictionary +{sch} → ʃ ; # can also be /ʒ/; needs a dictionary +{s} [cptnm] → ʃ ; +{s} [gbdv] → ʒ ; +s → s ; # can also be /z/; needs a dictionary +tg → c ; +tsch → t\u0361ʃ ; +t → t ; +uau → ʊa\u032Fʊ\u032F ; +ua → ʊa\u032F ; +uei → ʊɛ\u032Fɪ\u032F ; +ue → ʊɛ\u032F ; +uo → ʊɔ\u032F ; +u → u ; +v → v ; +w → v ; +x → ks ; +y → i ; +z → t\u0361s ; +::NULL; +mm+ → mː; +nn+ → nː; +ɲɲ+ → ɲː; +pp+ → pː; +bb+ → bː; +tt+ → tː; +dd+ → dː; +cc+ → cː ; +ɟɟ+ → ɟː ; +kk+ → kː ; +ɡɡ+ → ɡː ; +ff+ → fː ; +vv+ → vː ; +ss+ → sː ; +zz+ → zː ; +ʃʃ+ → ʃː ; +ʒʒ+ → ʒː ; +rr+ → rː ; +ll+ → lː ; +jj+ → jː ; + diff --git a/icu4c/source/data/translit/ro_FONIPA_ja.txt b/icu4c/source/data/translit/ro_FONIPA_ja.txt index c9d31ddf0f4..dcb5c2b08b6 100644 --- a/icu4c/source/data/translit/ro_FONIPA_ja.txt +++ 
b/icu4c/source/data/translit/ro_FONIPA_ja.txt @@ -1,21 +1,27 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: ro_FONIPA_ja.txt # Generated from CLDR # + +# Transforms an IPA transcription of Romanian (ro_FONIPA) to Japanese Katakana. $word_boundary = [-\ $] ; $vowel = [aeiouw] ; # Vowels and glides $not_vowel = [^$vowel] ; +# +# +# First pass: Collapse phonetic distinctions that are not preserved in Katakana. [ǎə] → | a ; e\u032F → | e ; [jy] → | i ; o\u032F → | o ; [ɨȋ] → | u ; ul } $word_boundary → u ; +ŋ → | nɡ; ɡ → | g ; ș → | ʃ ; t\u0361 → | t ; @@ -24,52 +30,75 @@ x → | ks ; dʒ → | ʒ ; d\u0361ʒ → | ʒ ; :: Null (); +# +# +# Main pass: Phoneme to Katakana conversion. a → ア; +# +# ba → バ; be → ベ; bi → ビ; bo → ボ; bu → ブ; b → ブ; +# +# da → ダ ; de → デ ; di → ディ ; do → ド ; du → ドゥ ; d → ド ; +# +# e → エ ; +# +# fa → ファ ; fe → フェ ; fi → フィ ; fo → フォ ; fu → フ ; f → フ ; +# +# ga → ガ; ge → ゲ; gi → ギ; go → ゴ; gu → グ; g → グ; +# +# ha → ハ ; hwe → フェ ; he → ヘ ; hi → ヒ ; # not backed by data ho → ホ ; hu → フ ; +# +# ^ { ia → ヤ ; i → イ ; +# +# ka → カ ; ke → ケ ; ki → キ ; ko → コ ; ku → ク ; k → ク ; +# +# la → ラ ; le → レ ; li → リ ; lo → ロ ; lu → ル ; l → ル ; +# +# ma → マ ; me → メ ; mi → ミ ; @@ -77,31 +106,43 @@ mo → モ ; mu → ム ; m } [bp] → ン ; m → ム ; +# +# na → ナ ; ne → ネ ; ni → ニ ; no → ノ ; nu → ヌ ; n → ン ; +# +# o → オ ; +# +# pa → パ ; pe → ペ ; pi → ピ ; po → ポ ; pu → プ ; p → プ ; +# +# ra → ラ ; re → レ ; ri → リ ; ro → ロ ; ru → ル ; r → ル; +# +# sa → サ ; se → セ ; si → シ ; so → ソ ; su → ス ; s → ス ; +# +# ʃa → シャ ; ʃe → シェ ; ʃio → ショ ; @@ -109,11 +150,15 @@ s → ス ; ʃo → ショ ; ʃu → シュ ; ʃ → シュ ; +# +# ta → タ ; te → テ ; ti → ティ ; to → ト ; tu → トゥ ; +# +# tʃa → チャ ; tʃea → チャ ; tʃe 
→ チェ ; @@ -121,6 +166,8 @@ tʃiu → チュ ; tʃi → チ ; tʃo → チョ ; tʃu → チュ ; +# +# tsa → ツァ ; tse → ツェ ; tsi → ツィ; @@ -128,25 +175,35 @@ tso → ツォ ; tsu → ツ ; ts → ツ ; t → ト ; +# +# u → ウ ; +# +# va → バ ; ve → ベ ; vu → ブ ; vi → ビ ; vo → ボ ; v → ヴ ; +# +# wa → ワ ; we → エ ; # not backed by data wi → イ ; # not backed by data wo → オ ; # not backed by data wu → ウ ; # not backed by data w → ウ ; +# +# za → ザ ; ze → ゼ ; zi → ジ ; zo → ゾ ; zu → ズ ; z → ズ ; +# +# ʒa → ジャ ; ʒea → ジャ ; ʒe → ジェ ; @@ -155,4 +212,9 @@ z → ズ ; ʒo → ジョ ; ʒu → ジュ ; ʒ → ジュ ; +# +# ' ' → ・; +# +# + diff --git a/icu4c/source/data/translit/ro_am.txt b/icu4c/source/data/translit/ro_am.txt new file mode 100644 index 00000000000..66416d44cb6 --- /dev/null +++ b/icu4c/source/data/translit/ro_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ro_am.txt +# Generated from CLDR +# + +::ro-ro_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/ro_ar.txt b/icu4c/source/data/translit/ro_ar.txt new file mode 100644 index 00000000000..474348eca42 --- /dev/null +++ b/icu4c/source/data/translit/ro_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: ro_ar.txt +# Generated from CLDR +# + +::ro-ro_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/ro_fa.txt b/icu4c/source/data/translit/ro_fa.txt new file mode 100644 index 00000000000..36a78b15232 --- /dev/null +++ b/icu4c/source/data/translit/ro_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ro_fa.txt +# Generated from CLDR +# + +::ro-ro_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/ro_ja.txt b/icu4c/source/data/translit/ro_ja.txt index 6097ea81a28..a2be0c68a6e 100644 --- a/icu4c/source/data/translit/ro_ja.txt +++ b/icu4c/source/data/translit/ro_ja.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: ro_ja.txt # Generated from CLDR # + ::ro-ro_FONIPA; ::ro_FONIPA-ja; + diff --git a/icu4c/source/data/translit/ro_ro_FONIPA.txt b/icu4c/source/data/translit/ro_ro_FONIPA.txt index 0f1258c1629..7524ed2e9e5 100644 --- a/icu4c/source/data/translit/ro_ro_FONIPA.txt +++ b/icu4c/source/data/translit/ro_ro_FONIPA.txt @@ -1,15 +1,26 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: ro_ro_FONIPA.txt # Generated from CLDR # + +# Romanian orthography to phonemic transcription. +# http://en.wikipedia.org/wiki/Romanian_phonology +# +# TODO: Currently this transform does not palatalize consonants. +$VowelEI = [e i î]; +$VowelAOU = [a â ă o u]; +$Vowel = [$VowelEI $VowelAOU]; +$Boundary = [^[:L:][:M:][:N:]]; :: NFC () ; :: Lower () ; +# Special cases. eoai → eo\u032Faj ; # eg. leoaică → /leo\u032Fajkə/, not /le\u032Fo\u032Faikə/ +# Triphthongs. eai → e\u032Faj ; eau → e\u032Faw ; eoa → e\u032Fo\u032Fa ; @@ -18,7 +29,7 @@ ia\-i → jaj ; iau → jaw ; iei → jej ; ieu → jew ; -[\uffff] { eu → jew ; +$Boundary {eu} → jew ; ioa → jo\u032Fa ; ioi → joj ; i\-oi → joj ; @@ -27,10 +38,11 @@ oai → o\u032Faj ; uai → waj ; uau → waw ; uăi → wəj ; +# Diphthongs. ai → aj ; âi → ɨj ; ăi → əj ; -au } r → au ; +au} r → au ; au → aw ; âu → ɨw ; ău → əw ; @@ -45,7 +57,7 @@ i\-a → ja ; ie → je ; ii → ij ; io → jo ; -iu } [aâăeiîou$] → iw ; +iu} [$Vowel $Boundary] → iw ; iu → ju ; oa → o\u032Fa ; oi → oj ; @@ -61,13 +73,15 @@ a → a ; ă → ə ; b → b ; ch → k ; -c } [ei] → t \u0361 ʃ ; +{c [ei]} $VowelAOU → t\u0361ʃ ; +{c} [ei] → t\u0361ʃ ; c → k ; d → d ; e → e ; f → f ; gh → ɡ ; -g } [ei] → d \u0361 ʒ ; +{g [ei]} $VowelAOU → d\u0361ʒ ; +{g} [ei] → d\u0361ʒ ; g → ɡ ; h → h ; i → i ; @@ -76,14 +90,43 @@ j → ʒ ; k → k ; l → l ; m → m ; +ng → ŋ ; n → n ; o → o ; p → p ; +q → k ; r → r ; s → s ; ş → ʃ ; +ș → ʃ ; t → t ; -ţ → t \u0361 s ; +ţ → t\u0361s ; +ț → t\u0361s ; u → u ; v → v ; +x → ks ; +y → i ; z → z ; +[:P:]+ → ' '; +# Romanian does not have any gemination. 
+# https://en.wikipedia.org/wiki/Gemination#Latin_and_Romance_languages +::null; +pp+ → p; +bb+ → b; +tt+ → t; +dd+ → d; +kk+ → k; +dd+ → d; +ɡɡ+ → ɡ; +ff+ → f; +vv+ → v; +hh+ → h; +ss+ → s; +zz+ → z; +ʃʃ+ → ʃ; +ʒʒ+ → ʒ; +rr+ → r; +ll+ → l; +jj+ → j; +ww+ → w; + diff --git a/icu4c/source/data/translit/root.txt b/icu4c/source/data/translit/root.txt index abe1324a178..464075c1b99 100644 --- a/icu4c/source/data/translit/root.txt +++ b/icu4c/source/data/translit/root.txt @@ -1,6 +1,6 @@ // *************************************************************************** // * -// * Copyright (C) 2004-2015, International Business Machines +// * Copyright (C) 2004-2016, International Business Machines // * Corporation; Unicode, Inc.; and others. All Rights Reserved. // * // *************************************************************************** @@ -15,25 +15,31 @@ root { Digit-Tone { alias {"NumericPinyin-Pinyin"} } - Hans-Hant { - alias {"Simplified-Traditional"} - } - Hant-Hans { - alias {"Traditional-Simplified"} - } Amharic-Latin/BGN { + alias {"am-am_Latn/BGN"} + } + am-Latn-t-am-m0-bgn { + alias {"am-am_Latn/BGN"} + } + am-am_Latn/BGN { file { - resource:process(transliterator) {"Amharic_Latin_BGN.txt"} + resource:process(transliterator) {"am_am_Latn_BGN.txt"} direction {"FORWARD"} } } + und-t-d0-accents { + alias {"Any-Accents"} + } Any-Accents { file { resource:process(transliterator) {"Any_Accents.txt"} direction {"FORWARD"} } } + und-t-s0-accents { + alias {"Accents-Any"} + } Accents-Any { file { resource:process(transliterator) {"Any_Accents.txt"} @@ -41,12 +47,18 @@ root { } } + und-t-d0-publish { + alias {"Any-Publishing"} + } Any-Publishing { file { resource:process(transliterator) {"Any_Publishing.txt"} direction {"FORWARD"} } } + und-t-s0-publish { + alias {"Publishing-Any"} + } Publishing-Any { file { resource:process(transliterator) {"Any_Publishing.txt"} @@ -55,63 +67,117 @@ root { } Arabic-Latin { + alias {"Arab-Latn"} + } + und-Latn-t-und-arab { + alias 
{"Arab-Latn"} + } + Arab-Latn { file { - resource:process(transliterator) {"Arabic_Latin.txt"} + resource:process(transliterator) {"Arab_Latn.txt"} direction {"FORWARD"} } } Latin-Arabic { + alias {"Latn-Arab"} + } + und-Arab-t-und-latn { + alias {"Latn-Arab"} + } + Latn-Arab { file { - resource:process(transliterator) {"Arabic_Latin.txt"} + resource:process(transliterator) {"Arab_Latn.txt"} direction {"REVERSE"} } } Arabic-Latin/BGN { + alias {"ar-ar_Latn/BGN"} + } + ar-Latn-t-ar-m0-bgn { + alias {"ar-ar_Latn/BGN"} + } + ar-ar_Latn/BGN { file { - resource:process(transliterator) {"Arabic_Latin_BGN.txt"} + resource:process(transliterator) {"ar_ar_Latn_BGN.txt"} direction {"FORWARD"} } } Armenian-Latin/BGN { + alias {"hy-hy_Latn/BGN"} + } + hy-Latn-t-hy-m0-bgn { + alias {"hy-hy_Latn/BGN"} + } + hy-hy_Latn/BGN { file { - resource:process(transliterator) {"Armenian_Latin_BGN.txt"} + resource:process(transliterator) {"hy_hy_Latn_BGN.txt"} direction {"FORWARD"} } } Azerbaijani-Latin/BGN { + alias {"az_Cyrl-az/BGN"} + } + az-t-az-cyrl-m0-bgn { + alias {"az_Cyrl-az/BGN"} + } + az_Cyrl-az/BGN { file { - resource:process(transliterator) {"Azerbaijani_Latin_BGN.txt"} + resource:process(transliterator) {"az_Cyrl_az_BGN.txt"} direction {"FORWARD"} } } Belarusian-Latin/BGN { + alias {"be-be_Latn/BGN"} + } + be-Latn-t-be-m0-bgn { + alias {"be-be_Latn/BGN"} + } + be-be_Latn/BGN { file { - resource:process(transliterator) {"Belarusian_Latin_BGN.txt"} + resource:process(transliterator) {"be_be_Latn_BGN.txt"} direction {"FORWARD"} } } Bengali-Devanagari { + alias {"Beng-Deva"} + } + und-Deva-t-und-beng { + alias {"Beng-Deva"} + } + Beng-Deva { file { - resource:process(transliterator) {"Bengali_Devanagari.txt"} + resource:process(transliterator) {"Beng_Deva.txt"} direction {"FORWARD"} } } Bengali-Gujarati { + alias {"Beng-Gujr"} + } + und-Gujr-t-und-beng { + alias {"Beng-Gujr"} + } + Beng-Gujr { file { - resource:process(transliterator) {"Bengali_Gujarati.txt"} + 
resource:process(transliterator) {"Beng_Gujr.txt"} direction {"FORWARD"} } } Bengali-Gurmukhi { + alias {"Beng-Guru"} + } + und-Guru-t-und-beng { + alias {"Beng-Guru"} + } + Beng-Guru { file { - resource:process(transliterator) {"Bengali_Gurmukhi.txt"} + resource:process(transliterator) {"Beng_Guru.txt"} direction {"FORWARD"} } } @@ -124,84 +190,156 @@ root { } Bengali-Kannada { + alias {"Beng-Knda"} + } + und-Knda-t-und-beng { + alias {"Beng-Knda"} + } + Beng-Knda { file { - resource:process(transliterator) {"Bengali_Kannada.txt"} + resource:process(transliterator) {"Beng_Knda.txt"} direction {"FORWARD"} } } Bengali-Latin { + alias {"Beng-Latn"} + } + und-Latn-t-und-beng { + alias {"Beng-Latn"} + } + Beng-Latn { file { - resource:process(transliterator) {"Bengali_Latin.txt"} + resource:process(transliterator) {"Beng_Latn.txt"} direction {"FORWARD"} } } Bengali-Malayalam { + alias {"Beng-Mlym"} + } + und-Mlym-t-und-beng { + alias {"Beng-Mlym"} + } + Beng-Mlym { file { - resource:process(transliterator) {"Bengali_Malayalam.txt"} + resource:process(transliterator) {"Beng_Mlym.txt"} direction {"FORWARD"} } } Bengali-Oriya { + alias {"Beng-Orya"} + } + und-Orya-t-und-beng { + alias {"Beng-Orya"} + } + Beng-Orya { file { - resource:process(transliterator) {"Bengali_Oriya.txt"} + resource:process(transliterator) {"Beng_Orya.txt"} direction {"FORWARD"} } } Bengali-Tamil { + alias {"Beng-Taml"} + } + und-Taml-t-und-beng { + alias {"Beng-Taml"} + } + Beng-Taml { file { - resource:process(transliterator) {"Bengali_Tamil.txt"} + resource:process(transliterator) {"Beng_Taml.txt"} direction {"FORWARD"} } } Bengali-Telugu { + alias {"Beng-Telu"} + } + und-Telu-t-und-beng { + alias {"Beng-Telu"} + } + Beng-Telu { file { - resource:process(transliterator) {"Bengali_Telugu.txt"} + resource:process(transliterator) {"Beng_Telu.txt"} direction {"FORWARD"} } } Bulgarian-Latin/BGN { + alias {"bg-bg_Latn/BGN"} + } + bg-Latn-t-bg-m0-bgn { + alias {"bg-bg_Latn/BGN"} + } + bg-bg_Latn/BGN { 
file { - resource:process(transliterator) {"Bulgarian_Latin_BGN.txt"} + resource:process(transliterator) {"bg_bg_Latn_BGN.txt"} direction {"FORWARD"} } } Cyrillic-Latin { + alias {"Cyrl-Latn"} + } + und-Latn-t-und-cyrl { + alias {"Cyrl-Latn"} + } + Cyrl-Latn { file { - resource:process(transliterator) {"Cyrillic_Latin.txt"} + resource:process(transliterator) {"Cyrl_Latn.txt"} direction {"FORWARD"} } } Latin-Cyrillic { + alias {"Latn-Cyrl"} + } + und-Cyrl-t-und-latn { + alias {"Latn-Cyrl"} + } + Latn-Cyrl { file { - resource:process(transliterator) {"Cyrillic_Latin.txt"} + resource:process(transliterator) {"Cyrl_Latn.txt"} direction {"REVERSE"} } } Devanagari-Bengali { + alias {"Deva-Beng"} + } + und-Beng-t-und-deva { + alias {"Deva-Beng"} + } + Deva-Beng { file { - resource:process(transliterator) {"Devanagari_Bengali.txt"} + resource:process(transliterator) {"Deva_Beng.txt"} direction {"FORWARD"} } } Devanagari-Gujarati { + alias {"Deva-Gujr"} + } + und-Gujr-t-und-deva { + alias {"Deva-Gujr"} + } + Deva-Gujr { file { - resource:process(transliterator) {"Devanagari_Gujarati.txt"} + resource:process(transliterator) {"Deva_Gujr.txt"} direction {"FORWARD"} } } Devanagari-Gurmukhi { + alias {"Deva-Guru"} + } + und-Guru-t-und-deva { + alias {"Deva-Guru"} + } + Deva-Guru { file { - resource:process(transliterator) {"Devanagari_Gurmukhi.txt"} + resource:process(transliterator) {"Deva_Guru.txt"} direction {"FORWARD"} } } @@ -214,53 +352,95 @@ root { } Devanagari-Kannada { + alias {"Deva-Knda"} + } + und-Knda-t-und-deva { + alias {"Deva-Knda"} + } + Deva-Knda { file { - resource:process(transliterator) {"Devanagari_Kannada.txt"} + resource:process(transliterator) {"Deva_Knda.txt"} direction {"FORWARD"} } } Devanagari-Latin { + alias {"Deva-Latn"} + } + und-Latn-t-und-deva { + alias {"Deva-Latn"} + } + Deva-Latn { file { - resource:process(transliterator) {"Devanagari_Latin.txt"} + resource:process(transliterator) {"Deva_Latn.txt"} direction {"FORWARD"} } } 
Devanagari-Malayalam { + alias {"Deva-Mlym"} + } + und-Mlym-t-und-deva { + alias {"Deva-Mlym"} + } + Deva-Mlym { file { - resource:process(transliterator) {"Devanagari_Malayalam.txt"} + resource:process(transliterator) {"Deva_Mlym.txt"} direction {"FORWARD"} } } Devanagari-Oriya { + alias {"Deva-Orya"} + } + und-Orya-t-und-deva { + alias {"Deva-Orya"} + } + Deva-Orya { file { - resource:process(transliterator) {"Devanagari_Oriya.txt"} + resource:process(transliterator) {"Deva_Orya.txt"} direction {"FORWARD"} } } Devanagari-Tamil { + alias {"Deva-Taml"} + } + und-Taml-t-und-deva { + alias {"Deva-Taml"} + } + Deva-Taml { file { - resource:process(transliterator) {"Devanagari_Tamil.txt"} + resource:process(transliterator) {"Deva_Taml.txt"} direction {"FORWARD"} } } Devanagari-Telugu { + alias {"Deva-Telu"} + } + und-Telu-t-und-deva { + alias {"Deva-Telu"} + } + Deva-Telu { file { - resource:process(transliterator) {"Devanagari_Telugu.txt"} + resource:process(transliterator) {"Deva_Telu.txt"} direction {"FORWARD"} } } + und-t-d0-hwidth { + alias {"Fullwidth-Halfwidth"} + } Fullwidth-Halfwidth { file { resource:process(transliterator) {"Fullwidth_Halfwidth.txt"} direction {"FORWARD"} } } + und-t-d0-fwidth { + alias {"Halfwidth-Fullwidth"} + } Halfwidth-Fullwidth { file { resource:process(transliterator) {"Fullwidth_Halfwidth.txt"} @@ -269,75 +449,141 @@ root { } Georgian-Latin { + alias {"Geor-Latn"} + } + und-Latn-t-und-geor { + alias {"Geor-Latn"} + } + Geor-Latn { file { - resource:process(transliterator) {"Georgian_Latin.txt"} + resource:process(transliterator) {"Geor_Latn.txt"} direction {"FORWARD"} } } Latin-Georgian { + alias {"Latn-Geor"} + } + und-Geor-t-und-latn { + alias {"Latn-Geor"} + } + Latn-Geor { file { - resource:process(transliterator) {"Georgian_Latin.txt"} + resource:process(transliterator) {"Geor_Latn.txt"} direction {"REVERSE"} } } Georgian-Latin/BGN { + alias {"ka-ka_Latn/BGN"} + } + ka-Latn-t-ka-m0-bgn { + alias {"ka-ka_Latn/BGN"} + } + 
ka-ka_Latn/BGN { file { - resource:process(transliterator) {"Georgian_Latin_BGN.txt"} + resource:process(transliterator) {"ka_ka_Latn_BGN.txt"} direction {"FORWARD"} } } Greek-Latin { + alias {"Grek-Latn"} + } + und-Latn-t-und-grek { + alias {"Grek-Latn"} + } + Grek-Latn { file { - resource:process(transliterator) {"Greek_Latin.txt"} + resource:process(transliterator) {"Grek_Latn.txt"} direction {"FORWARD"} } } Latin-Greek { + alias {"Latn-Grek"} + } + und-Grek-t-und-latn { + alias {"Latn-Grek"} + } + Latn-Grek { file { - resource:process(transliterator) {"Greek_Latin.txt"} + resource:process(transliterator) {"Grek_Latn.txt"} direction {"REVERSE"} } } Greek-Latin/BGN { + alias {"el-el_Latn/BGN"} + } + el-Latn-t-el-m0-bgn { + alias {"el-el_Latn/BGN"} + } + el-el_Latn/BGN { file { - resource:process(transliterator) {"Greek_Latin_BGN.txt"} + resource:process(transliterator) {"el_el_Latn_BGN.txt"} direction {"FORWARD"} } } Greek-Latin/UNGEGN { + alias {"Grek-Latn/UNGEGN"} + } + und-Latn-t-und-grek-m0-ungegn { + alias {"Grek-Latn/UNGEGN"} + } + Grek-Latn/UNGEGN { file { - resource:process(transliterator) {"Greek_Latin_UNGEGN.txt"} + resource:process(transliterator) {"Grek_Latn_UNGEGN.txt"} direction {"FORWARD"} } } Latin-Greek/UNGEGN { + alias {"Latn-Grek/UNGEGN"} + } + und-Grek-t-und-latn-m0-ungegn { + alias {"Latn-Grek/UNGEGN"} + } + Latn-Grek/UNGEGN { file { - resource:process(transliterator) {"Greek_Latin_UNGEGN.txt"} + resource:process(transliterator) {"Grek_Latn_UNGEGN.txt"} direction {"REVERSE"} } } Gujarati-Bengali { + alias {"Gujr-Beng"} + } + und-Beng-t-und-gujr { + alias {"Gujr-Beng"} + } + Gujr-Beng { file { - resource:process(transliterator) {"Gujarati_Bengali.txt"} + resource:process(transliterator) {"Gujr_Beng.txt"} direction {"FORWARD"} } } Gujarati-Devanagari { + alias {"Gujr-Deva"} + } + und-Deva-t-und-gujr { + alias {"Gujr-Deva"} + } + Gujr-Deva { file { - resource:process(transliterator) {"Gujarati_Devanagari.txt"} + resource:process(transliterator) 
{"Gujr_Deva.txt"} direction {"FORWARD"} } } Gujarati-Gurmukhi { + alias {"Gujr-Guru"} + } + und-Guru-t-und-gujr { + alias {"Gujr-Guru"} + } + Gujr-Guru { file { - resource:process(transliterator) {"Gujarati_Gurmukhi.txt"} + resource:process(transliterator) {"Gujr_Guru.txt"} direction {"FORWARD"} } } @@ -350,64 +596,118 @@ root { } Gujarati-Kannada { + alias {"Gujr-Knda"} + } + und-Knda-t-und-gujr { + alias {"Gujr-Knda"} + } + Gujr-Knda { file { - resource:process(transliterator) {"Gujarati_Kannada.txt"} + resource:process(transliterator) {"Gujr_Knda.txt"} direction {"FORWARD"} } } Gujarati-Latin { + alias {"Gujr-Latn"} + } + und-Latn-t-und-gujr { + alias {"Gujr-Latn"} + } + Gujr-Latn { file { - resource:process(transliterator) {"Gujarati_Latin.txt"} + resource:process(transliterator) {"Gujr_Latn.txt"} direction {"FORWARD"} } } Gujarati-Malayalam { + alias {"Gujr-Mlym"} + } + und-Mlym-t-und-gujr { + alias {"Gujr-Mlym"} + } + Gujr-Mlym { file { - resource:process(transliterator) {"Gujarati_Malayalam.txt"} + resource:process(transliterator) {"Gujr_Mlym.txt"} direction {"FORWARD"} } } Gujarati-Oriya { + alias {"Gujr-Orya"} + } + und-Orya-t-und-gujr { + alias {"Gujr-Orya"} + } + Gujr-Orya { file { - resource:process(transliterator) {"Gujarati_Oriya.txt"} + resource:process(transliterator) {"Gujr_Orya.txt"} direction {"FORWARD"} } } Gujarati-Tamil { + alias {"Gujr-Taml"} + } + und-Taml-t-und-gujr { + alias {"Gujr-Taml"} + } + Gujr-Taml { file { - resource:process(transliterator) {"Gujarati_Tamil.txt"} + resource:process(transliterator) {"Gujr_Taml.txt"} direction {"FORWARD"} } } Gujarati-Telugu { + alias {"Gujr-Telu"} + } + und-Telu-t-und-gujr { + alias {"Gujr-Telu"} + } + Gujr-Telu { file { - resource:process(transliterator) {"Gujarati_Telugu.txt"} + resource:process(transliterator) {"Gujr_Telu.txt"} direction {"FORWARD"} } } Gurmukhi-Bengali { + alias {"Guru-Beng"} + } + und-Beng-t-und-guru { + alias {"Guru-Beng"} + } + Guru-Beng { file { - 
resource:process(transliterator) {"Gurmukhi_Bengali.txt"} + resource:process(transliterator) {"Guru_Beng.txt"} direction {"FORWARD"} } } Gurmukhi-Devanagari { + alias {"Guru-Deva"} + } + und-Deva-t-und-guru { + alias {"Guru-Deva"} + } + Guru-Deva { file { - resource:process(transliterator) {"Gurmukhi_Devanagari.txt"} + resource:process(transliterator) {"Guru_Deva.txt"} direction {"FORWARD"} } } Gurmukhi-Gujarati { + alias {"Guru-Gujr"} + } + und-Gujr-t-und-guru { + alias {"Guru-Gujr"} + } + Guru-Gujr { file { - resource:process(transliterator) {"Gurmukhi_Gujarati.txt"} + resource:process(transliterator) {"Guru_Gujr.txt"} direction {"FORWARD"} } } @@ -420,54 +720,99 @@ root { } Gurmukhi-Kannada { + alias {"Guru-Knda"} + } + und-Knda-t-und-guru { + alias {"Guru-Knda"} + } + Guru-Knda { file { - resource:process(transliterator) {"Gurmukhi_Kannada.txt"} + resource:process(transliterator) {"Guru_Knda.txt"} direction {"FORWARD"} } } Gurmukhi-Latin { + alias {"Guru-Latn"} + } + und-Latn-t-und-guru { + alias {"Guru-Latn"} + } + Guru-Latn { file { - resource:process(transliterator) {"Gurmukhi_Latin.txt"} + resource:process(transliterator) {"Guru_Latn.txt"} direction {"FORWARD"} } } Gurmukhi-Malayalam { + alias {"Guru-Mlym"} + } + und-Mlym-t-und-guru { + alias {"Guru-Mlym"} + } + Guru-Mlym { file { - resource:process(transliterator) {"Gurmukhi_Malayalam.txt"} + resource:process(transliterator) {"Guru_Mlym.txt"} direction {"FORWARD"} } } Gurmukhi-Oriya { + alias {"Guru-Orya"} + } + und-Orya-t-und-guru { + alias {"Guru-Orya"} + } + Guru-Orya { file { - resource:process(transliterator) {"Gurmukhi_Oriya.txt"} + resource:process(transliterator) {"Guru_Orya.txt"} direction {"FORWARD"} } } Gurmukhi-Tamil { + alias {"Guru-Taml"} + } + und-Taml-t-und-guru { + alias {"Guru-Taml"} + } + Guru-Taml { file { - resource:process(transliterator) {"Gurmukhi_Tamil.txt"} + resource:process(transliterator) {"Guru_Taml.txt"} direction {"FORWARD"} } } Gurmukhi-Telugu { + alias {"Guru-Telu"} + } + 
und-Telu-t-und-guru { + alias {"Guru-Telu"} + } + Guru-Telu { file { - resource:process(transliterator) {"Gurmukhi_Telugu.txt"} + resource:process(transliterator) {"Guru_Telu.txt"} direction {"FORWARD"} } } Han-Latin { + alias {"Hani-Latn"} + } + und-Latn-t-und-hani { + alias {"Hani-Latn"} + } + Hani-Latn { file { - resource:process(transliterator) {"Han_Latin.txt"} + resource:process(transliterator) {"Hani_Latn.txt"} direction {"FORWARD"} } } + und-Latn-t-und-hani-m0-prprname { + alias {"Han-Latin/Names"} + } Han-Latin/Names { file { resource:process(transliterator) {"Han_Latin_Names.txt"} @@ -489,67 +834,127 @@ root { } Hangul-Latin { + alias {"Hang-Latn"} + } + und-Latn-t-und-hang { + alias {"Hang-Latn"} + } + Hang-Latn { file { - resource:process(transliterator) {"Hangul_Latin.txt"} + resource:process(transliterator) {"Hang_Latn.txt"} direction {"FORWARD"} } } Hebrew-Latin { + alias {"Hebr-Latn"} + } + und-Latn-t-und-hebr { + alias {"Hebr-Latn"} + } + Hebr-Latn { file { - resource:process(transliterator) {"Hebrew_Latin.txt"} + resource:process(transliterator) {"Hebr_Latn.txt"} direction {"FORWARD"} } } Latin-Hebrew { + alias {"Latn-Hebr"} + } + und-Hebr-t-und-latn { + alias {"Latn-Hebr"} + } + Latn-Hebr { file { - resource:process(transliterator) {"Hebrew_Latin.txt"} + resource:process(transliterator) {"Hebr_Latn.txt"} direction {"REVERSE"} } } Hebrew-Latin/BGN { + alias {"he-he_Latn/BGN"} + } + he-Latn-t-he-m0-bgn { + alias {"he-he_Latn/BGN"} + } + he-he_Latn/BGN { file { - resource:process(transliterator) {"Hebrew_Latin_BGN.txt"} + resource:process(transliterator) {"he_he_Latn_BGN.txt"} direction {"FORWARD"} } } Hiragana-Katakana { + alias {"Hira-Kana"} + } + und-Kana-t-und-hira { + alias {"Hira-Kana"} + } + Hira-Kana { file { - resource:process(transliterator) {"Hiragana_Katakana.txt"} + resource:process(transliterator) {"Hira_Kana.txt"} direction {"FORWARD"} } } Katakana-Hiragana { + alias {"Kana-Hira"} + } + und-Hira-t-und-kana { + alias {"Kana-Hira"} + } 
+ Kana-Hira { file { - resource:process(transliterator) {"Hiragana_Katakana.txt"} + resource:process(transliterator) {"Hira_Kana.txt"} direction {"REVERSE"} } } Hiragana-Latin { + alias {"Hira-Latn"} + } + und-Latn-t-und-hira { + alias {"Hira-Latn"} + } + Hira-Latn { file { - resource:process(transliterator) {"Hiragana_Latin.txt"} + resource:process(transliterator) {"Hira_Latn.txt"} direction {"FORWARD"} } } Latin-Hiragana { + alias {"Latn-Hira"} + } + und-Hira-t-und-latn { + alias {"Latn-Hira"} + } + Latn-Hira { file { - resource:process(transliterator) {"Hiragana_Latin.txt"} + resource:process(transliterator) {"Hira_Latn.txt"} direction {"REVERSE"} } } IPA-XSampa { + alias {"und_FONIPA-und_FONXSAMP"} + } + und-fonxsamp-t-und-fonipa { + alias {"und_FONIPA-und_FONXSAMP"} + } + und_FONIPA-und_FONXSAMP { file { - resource:process(transliterator) {"IPA_XSampa.txt"} + resource:process(transliterator) {"und_FONIPA_und_FONXSAMP.txt"} direction {"FORWARD"} } } XSampa-IPA { + alias {"und_FONXSAMP-und_FONIPA"} + } + und-fonipa-t-und-fonxsamp { + alias {"und_FONXSAMP-und_FONIPA"} + } + und_FONXSAMP-und_FONIPA { file { - resource:process(transliterator) {"IPA_XSampa.txt"} + resource:process(transliterator) {"und_FONIPA_und_FONXSAMP.txt"} direction {"REVERSE"} } } @@ -625,36 +1030,66 @@ root { } Jamo-Latin { + alias {"Jamo-Latn"} + } + und-Latn-t-und-jamo { + alias {"Jamo-Latn"} + } + Jamo-Latn { file { - resource:process(transliterator) {"Jamo_Latin.txt"} + resource:process(transliterator) {"Jamo_Latn.txt"} direction {"FORWARD"} } } Kannada-Bengali { + alias {"Knda-Beng"} + } + und-Beng-t-und-knda { + alias {"Knda-Beng"} + } + Knda-Beng { file { - resource:process(transliterator) {"Kannada_Bengali.txt"} + resource:process(transliterator) {"Knda_Beng.txt"} direction {"FORWARD"} } } Kannada-Devanagari { + alias {"Knda-Deva"} + } + und-Deva-t-und-knda { + alias {"Knda-Deva"} + } + Knda-Deva { file { - resource:process(transliterator) {"Kannada_Devanagari.txt"} + 
resource:process(transliterator) {"Knda_Deva.txt"} direction {"FORWARD"} } } Kannada-Gujarati { + alias {"Knda-Gujr"} + } + und-Gujr-t-und-knda { + alias {"Knda-Gujr"} + } + Knda-Gujr { file { - resource:process(transliterator) {"Kannada_Gujarati.txt"} + resource:process(transliterator) {"Knda_Gujr.txt"} direction {"FORWARD"} } } Kannada-Gurmukhi { + alias {"Knda-Guru"} + } + und-Guru-t-und-knda { + alias {"Knda-Guru"} + } + Knda-Guru { file { - resource:process(transliterator) {"Kannada_Gurmukhi.txt"} + resource:process(transliterator) {"Knda_Guru.txt"} direction {"FORWARD"} } } @@ -667,68 +1102,128 @@ root { } Kannada-Latin { + alias {"Knda-Latn"} + } + und-Latn-t-und-knda { + alias {"Knda-Latn"} + } + Knda-Latn { file { - resource:process(transliterator) {"Kannada_Latin.txt"} + resource:process(transliterator) {"Knda_Latn.txt"} direction {"FORWARD"} } } Kannada-Malayalam { + alias {"Knda-Mlym"} + } + und-Mlym-t-und-knda { + alias {"Knda-Mlym"} + } + Knda-Mlym { file { - resource:process(transliterator) {"Kannada_Malayalam.txt"} + resource:process(transliterator) {"Knda_Mlym.txt"} direction {"FORWARD"} } } Kannada-Oriya { + alias {"Knda-Orya"} + } + und-Orya-t-und-knda { + alias {"Knda-Orya"} + } + Knda-Orya { file { - resource:process(transliterator) {"Kannada_Oriya.txt"} + resource:process(transliterator) {"Knda_Orya.txt"} direction {"FORWARD"} } } Kannada-Tamil { + alias {"Knda-Taml"} + } + und-Taml-t-und-knda { + alias {"Knda-Taml"} + } + Knda-Taml { file { - resource:process(transliterator) {"Kannada_Tamil.txt"} + resource:process(transliterator) {"Knda_Taml.txt"} direction {"FORWARD"} } } Kannada-Telugu { + alias {"Knda-Telu"} + } + und-Telu-t-und-knda { + alias {"Knda-Telu"} + } + Knda-Telu { file { - resource:process(transliterator) {"Kannada_Telugu.txt"} + resource:process(transliterator) {"Knda_Telu.txt"} direction {"FORWARD"} } } Katakana-Latin/BGN { + alias {"ja_Hrkt-ja_Latn/BGN"} + } + ja-Latn-t-ja-hrkt-m0-bgn { + alias {"ja_Hrkt-ja_Latn/BGN"} + } + 
ja_Hrkt-ja_Latn/BGN { file { - resource:process(transliterator) {"Katakana_Latin_BGN.txt"} + resource:process(transliterator) {"ja_Hrkt_ja_Latn_BGN.txt"} direction {"FORWARD"} } } Kazakh-Latin/BGN { + alias {"kk-kk_Latn/BGN"} + } + kk-Latn-t-kk-m0-bgn { + alias {"kk-kk_Latn/BGN"} + } + kk-kk_Latn/BGN { file { - resource:process(transliterator) {"Kazakh_Latin_BGN.txt"} + resource:process(transliterator) {"kk_kk_Latn_BGN.txt"} direction {"FORWARD"} } } Kirghiz-Latin/BGN { + alias {"ky-ky_Latn/BGN"} + } + ky-Latn-t-ky-m0-bgn { + alias {"ky-ky_Latn/BGN"} + } + ky-ky_Latn/BGN { file { - resource:process(transliterator) {"Kirghiz_Latin_BGN.txt"} + resource:process(transliterator) {"ky_ky_Latn_BGN.txt"} direction {"FORWARD"} } } Korean-Latin/BGN { + alias {"ko-ko_Latn/BGN"} + } + ko-Latn-t-ko-m0-bgn { + alias {"ko-ko_Latn/BGN"} + } + ko-ko_Latn/BGN { file { - resource:process(transliterator) {"Korean_Latin_BGN.txt"} + resource:process(transliterator) {"ko_ko_Latn_BGN.txt"} direction {"FORWARD"} } } + und-t-d0-ascii { + alias {"Latin-ASCII"} + } + und-Latn-t-s0-ascii { + alias {"Latin-ASCII"} + } Latin-ASCII { file { resource:process(transliterator) {"Latin_ASCII.txt"} @@ -743,34 +1238,64 @@ root { } Latin-Armenian { + alias {"Latn-Armn"} + } + und-Armn-t-und-latn { + alias {"Latn-Armn"} + } + Latn-Armn { file { - resource:process(transliterator) {"Latin_Armenian.txt"} + resource:process(transliterator) {"Latn_Armn.txt"} direction {"FORWARD"} } } Armenian-Latin { + alias {"Armn-Latn"} + } + und-Latn-t-und-armn { + alias {"Armn-Latn"} + } + Armn-Latn { file { - resource:process(transliterator) {"Latin_Armenian.txt"} + resource:process(transliterator) {"Latn_Armn.txt"} direction {"REVERSE"} } } Latin-Bengali { + alias {"Latn-Beng"} + } + und-Beng-t-und-latn { + alias {"Latn-Beng"} + } + Latn-Beng { file { - resource:process(transliterator) {"Latin_Bengali.txt"} + resource:process(transliterator) {"Latn_Beng.txt"} direction {"FORWARD"} } } Latin-Bopomofo { + alias 
{"Latn-Bopo"} + } + und-Bopo-t-und-latn { + alias {"Latn-Bopo"} + } + Latn-Bopo { file { - resource:process(transliterator) {"Latin_Bopomofo.txt"} + resource:process(transliterator) {"Latn_Bopo.txt"} direction {"FORWARD"} } } Bopomofo-Latin { + alias {"Bopo-Latn"} + } + und-Latn-t-und-bopo { + alias {"Bopo-Latn"} + } + Bopo-Latn { file { - resource:process(transliterator) {"Latin_Bopomofo.txt"} + resource:process(transliterator) {"Latn_Bopo.txt"} direction {"REVERSE"} } } @@ -789,29 +1314,53 @@ root { } Latin-Devanagari { + alias {"Latn-Deva"} + } + und-Deva-t-und-latn { + alias {"Latn-Deva"} + } + Latn-Deva { file { - resource:process(transliterator) {"Latin_Devanagari.txt"} + resource:process(transliterator) {"Latn_Deva.txt"} direction {"FORWARD"} } } Latin-Gujarati { + alias {"Latn-Gujr"} + } + und-Gujr-t-und-latn { + alias {"Latn-Gujr"} + } + Latn-Gujr { file { - resource:process(transliterator) {"Latin_Gujarati.txt"} + resource:process(transliterator) {"Latn_Gujr.txt"} direction {"FORWARD"} } } Latin-Gurmukhi { + alias {"Latn-Guru"} + } + und-Guru-t-und-latn { + alias {"Latn-Guru"} + } + Latn-Guru { file { - resource:process(transliterator) {"Latin_Gurmukhi.txt"} + resource:process(transliterator) {"Latn_Guru.txt"} direction {"FORWARD"} } } Latin-Hangul { + alias {"Latn-Hang"} + } + und-Hang-t-und-latn { + alias {"Latn-Hang"} + } + Latn-Hang { file { - resource:process(transliterator) {"Latin_Hangul.txt"} + resource:process(transliterator) {"Latn_Hang.txt"} direction {"FORWARD"} } } @@ -824,45 +1373,81 @@ root { } Latin-Jamo { + alias {"Latn-Jamo"} + } + und-Jamo-t-und-latn { + alias {"Latn-Jamo"} + } + Latn-Jamo { file { - resource:process(transliterator) {"Latin_Jamo.txt"} + resource:process(transliterator) {"Latn_Jamo.txt"} direction {"FORWARD"} } } Latin-Kannada { + alias {"Latn-Knda"} + } + und-Knda-t-und-latn { + alias {"Latn-Knda"} + } + Latn-Knda { file { - resource:process(transliterator) {"Latin_Kannada.txt"} + resource:process(transliterator) 
{"Latn_Knda.txt"} direction {"FORWARD"} } } Latin-Katakana { + alias {"Latn-Kana"} + } + und-Kana-t-und-latn { + alias {"Latn-Kana"} + } + Latn-Kana { file { - resource:process(transliterator) {"Latin_Katakana.txt"} + resource:process(transliterator) {"Latn_Kana.txt"} direction {"FORWARD"} } } Katakana-Latin { + alias {"Kana-Latn"} + } + und-Latn-t-und-kana { + alias {"Kana-Latn"} + } + Kana-Latn { file { - resource:process(transliterator) {"Latin_Katakana.txt"} + resource:process(transliterator) {"Latn_Kana.txt"} direction {"REVERSE"} } } Latin-Malayalam { + alias {"Latn-Mlym"} + } + und-Mlym-t-und-latn { + alias {"Latn-Mlym"} + } + Latn-Mlym { file { - resource:process(transliterator) {"Latin_Malayalam.txt"} + resource:process(transliterator) {"Latn_Mlym.txt"} direction {"FORWARD"} } } + und-pinyin-t-d0-npinyin { + alias {"Latin-NumericPinyin"} + } Latin-NumericPinyin { file { resource:process(transliterator) {"Latin_NumericPinyin.txt"} direction {"FORWARD"} } } + und-pinyin-t-s0-npinyin { + alias {"NumericPinyin-Latin"} + } NumericPinyin-Latin { file { resource:process(transliterator) {"Latin_NumericPinyin.txt"} @@ -871,64 +1456,118 @@ root { } Latin-Oriya { + alias {"Latn-Orya"} + } + und-Orya-t-und-latn { + alias {"Latn-Orya"} + } + Latn-Orya { file { - resource:process(transliterator) {"Latin_Oriya.txt"} + resource:process(transliterator) {"Latn_Orya.txt"} direction {"FORWARD"} } } Latin-Tamil { + alias {"Latn-Taml"} + } + und-Taml-t-und-latn { + alias {"Latn-Taml"} + } + Latn-Taml { file { - resource:process(transliterator) {"Latin_Tamil.txt"} + resource:process(transliterator) {"Latn_Taml.txt"} direction {"FORWARD"} } } Latin-Telugu { + alias {"Latn-Telu"} + } + und-Telu-t-und-latn { + alias {"Latn-Telu"} + } + Latn-Telu { file { - resource:process(transliterator) {"Latin_Telugu.txt"} + resource:process(transliterator) {"Latn_Telu.txt"} direction {"FORWARD"} } } Latin-Thai { + alias {"Latn-Thai"} + } + und-Thai-t-und-latn { + alias {"Latn-Thai"} + } + 
Latn-Thai { file { - resource:process(transliterator) {"Latin_Thai.txt"} + resource:process(transliterator) {"Latn_Thai.txt"} direction {"FORWARD"} } } Macedonian-Latin/BGN { + alias {"mk-mk_Latn/BGN"} + } + mk-Latn-t-mk-m0-bgn { + alias {"mk-mk_Latn/BGN"} + } + mk-mk_Latn/BGN { file { - resource:process(transliterator) {"Macedonian_Latin_BGN.txt"} + resource:process(transliterator) {"mk_mk_Latn_BGN.txt"} direction {"FORWARD"} } } Malayalam-Bengali { + alias {"Mlym-Beng"} + } + und-Beng-t-und-mlym { + alias {"Mlym-Beng"} + } + Mlym-Beng { file { - resource:process(transliterator) {"Malayalam_Bengali.txt"} + resource:process(transliterator) {"Mlym_Beng.txt"} direction {"FORWARD"} } } Malayalam-Devanagari { + alias {"Mlym-Deva"} + } + und-Deva-t-und-mlym { + alias {"Mlym-Deva"} + } + Mlym-Deva { file { - resource:process(transliterator) {"Malayalam_Devanagari.txt"} + resource:process(transliterator) {"Mlym_Deva.txt"} direction {"FORWARD"} } } Malayalam-Gujarati { + alias {"Mlym-Gujr"} + } + und-Gujr-t-und-mlym { + alias {"Mlym-Gujr"} + } + Mlym-Gujr { file { - resource:process(transliterator) {"Malayalam_Gujarati.txt"} + resource:process(transliterator) {"Mlym_Gujr.txt"} direction {"FORWARD"} } } Malayalam-Gurmukhi { + alias {"Mlym-Guru"} + } + und-Guru-t-und-mlym { + alias {"Mlym-Guru"} + } + Mlym-Guru { file { - resource:process(transliterator) {"Malayalam_Gurmukhi.txt"} + resource:process(transliterator) {"Mlym_Guru.txt"} direction {"FORWARD"} } } @@ -941,78 +1580,144 @@ root { } Malayalam-Kannada { + alias {"Mlym-Knda"} + } + und-Knda-t-und-mlym { + alias {"Mlym-Knda"} + } + Mlym-Knda { file { - resource:process(transliterator) {"Malayalam_Kannada.txt"} + resource:process(transliterator) {"Mlym_Knda.txt"} direction {"FORWARD"} } } Malayalam-Latin { + alias {"Mlym-Latn"} + } + und-Latn-t-und-mlym { + alias {"Mlym-Latn"} + } + Mlym-Latn { file { - resource:process(transliterator) {"Malayalam_Latin.txt"} + resource:process(transliterator) {"Mlym_Latn.txt"} direction 
{"FORWARD"} } } Malayalam-Oriya { + alias {"Mlym-Orya"} + } + und-Orya-t-und-mlym { + alias {"Mlym-Orya"} + } + Mlym-Orya { file { - resource:process(transliterator) {"Malayalam_Oriya.txt"} + resource:process(transliterator) {"Mlym_Orya.txt"} direction {"FORWARD"} } } Malayalam-Tamil { + alias {"Mlym-Taml"} + } + und-Taml-t-und-mlym { + alias {"Mlym-Taml"} + } + Mlym-Taml { file { - resource:process(transliterator) {"Malayalam_Tamil.txt"} + resource:process(transliterator) {"Mlym_Taml.txt"} direction {"FORWARD"} } } Malayalam-Telugu { + alias {"Mlym-Telu"} + } + und-Telu-t-und-mlym { + alias {"Mlym-Telu"} + } + Mlym-Telu { file { - resource:process(transliterator) {"Malayalam_Telugu.txt"} + resource:process(transliterator) {"Mlym_Telu.txt"} direction {"FORWARD"} } } Maldivian-Latin/BGN { + alias {"dv-dv_Latn/BGN"} + } + dv-Latn-t-dv-m0-bgn { + alias {"dv-dv_Latn/BGN"} + } + dv-dv_Latn/BGN { file { - resource:process(transliterator) {"Maldivian_Latin_BGN.txt"} + resource:process(transliterator) {"dv_dv_Latn_BGN.txt"} direction {"FORWARD"} } } Mongolian-Latin/BGN { + alias {"mn-mn_Latn/BGN"} + } + mn-Latn-t-mn-m0-bgn { + alias {"mn-mn_Latn/BGN"} + } + mn-mn_Latn/BGN { file { - resource:process(transliterator) {"Mongolian_Latin_BGN.txt"} + resource:process(transliterator) {"mn_mn_Latn_BGN.txt"} direction {"FORWARD"} } } Oriya-Bengali { + alias {"Orya-Beng"} + } + und-Beng-t-und-orya { + alias {"Orya-Beng"} + } + Orya-Beng { file { - resource:process(transliterator) {"Oriya_Bengali.txt"} + resource:process(transliterator) {"Orya_Beng.txt"} direction {"FORWARD"} } } Oriya-Devanagari { + alias {"Orya-Deva"} + } + und-Deva-t-und-orya { + alias {"Orya-Deva"} + } + Orya-Deva { file { - resource:process(transliterator) {"Oriya_Devanagari.txt"} + resource:process(transliterator) {"Orya_Deva.txt"} direction {"FORWARD"} } } Oriya-Gujarati { + alias {"Orya-Gujr"} + } + und-Gujr-t-und-orya { + alias {"Orya-Gujr"} + } + Orya-Gujr { file { - resource:process(transliterator) 
{"Oriya_Gujarati.txt"} + resource:process(transliterator) {"Orya_Gujr.txt"} direction {"FORWARD"} } } Oriya-Gurmukhi { + alias {"Orya-Guru"} + } + und-Guru-t-und-orya { + alias {"Orya-Guru"} + } + Orya-Guru { file { - resource:process(transliterator) {"Oriya_Gurmukhi.txt"} + resource:process(transliterator) {"Orya_Guru.txt"} direction {"FORWARD"} } } @@ -1025,50 +1730,92 @@ root { } Oriya-Kannada { + alias {"Orya-Knda"} + } + und-Knda-t-und-orya { + alias {"Orya-Knda"} + } + Orya-Knda { file { - resource:process(transliterator) {"Oriya_Kannada.txt"} + resource:process(transliterator) {"Orya_Knda.txt"} direction {"FORWARD"} } } Oriya-Latin { + alias {"Orya-Latn"} + } + und-Latn-t-und-orya { + alias {"Orya-Latn"} + } + Orya-Latn { file { - resource:process(transliterator) {"Oriya_Latin.txt"} + resource:process(transliterator) {"Orya_Latn.txt"} direction {"FORWARD"} } } Oriya-Malayalam { + alias {"Orya-Mlym"} + } + und-Mlym-t-und-orya { + alias {"Orya-Mlym"} + } + Orya-Mlym { file { - resource:process(transliterator) {"Oriya_Malayalam.txt"} + resource:process(transliterator) {"Orya_Mlym.txt"} direction {"FORWARD"} } } Oriya-Tamil { + alias {"Orya-Taml"} + } + und-Taml-t-und-orya { + alias {"Orya-Taml"} + } + Orya-Taml { file { - resource:process(transliterator) {"Oriya_Tamil.txt"} + resource:process(transliterator) {"Orya_Taml.txt"} direction {"FORWARD"} } } Oriya-Telugu { + alias {"Orya-Telu"} + } + und-Telu-t-und-orya { + alias {"Orya-Telu"} + } + Orya-Telu { file { - resource:process(transliterator) {"Oriya_Telugu.txt"} + resource:process(transliterator) {"Orya_Telu.txt"} direction {"FORWARD"} } } Pashto-Latin/BGN { + alias {"ps-ps_Latn/BGN"} + } + ps-Latn-t-ps-m0-bgn { + alias {"ps-ps_Latn/BGN"} + } + ps-ps_Latn/BGN { file { - resource:process(transliterator) {"Pashto_Latin_BGN.txt"} + resource:process(transliterator) {"ps_ps_Latn_BGN.txt"} direction {"FORWARD"} } } Persian-Latin/BGN { + alias {"fa-fa_Latn/BGN"} + } + fa-Latn-t-fa-m0-bgn { + alias 
{"fa-fa_Latn/BGN"} + } + fa-fa_Latn/BGN { file { - resource:process(transliterator) {"Persian_Latin_BGN.txt"} + resource:process(transliterator) {"fa_fa_Latn_BGN.txt"} direction {"FORWARD"} } } @@ -1087,69 +1834,129 @@ root { } Russian-Latin/BGN { + alias {"ru-ru_Latn/BGN"} + } + ru-Latn-t-ru-m0-bgn { + alias {"ru-ru_Latn/BGN"} + } + ru-ru_Latn/BGN { file { - resource:process(transliterator) {"Russian_Latin_BGN.txt"} + resource:process(transliterator) {"ru_ru_Latn_BGN.txt"} direction {"FORWARD"} } } Serbian-Latin/BGN { + alias {"sr-sr_Latn/BGN"} + } + sr-Latn-t-sr-m0-bgn { + alias {"sr-sr_Latn/BGN"} + } + sr-sr_Latn/BGN { file { - resource:process(transliterator) {"Serbian_Latin_BGN.txt"} + resource:process(transliterator) {"sr_sr_Latn_BGN.txt"} direction {"FORWARD"} } } Simplified-Traditional { + alias {"Hans-Hant"} + } + und-Hant-t-und-hans { + alias {"Hans-Hant"} + } + Hans-Hant { file { - resource:process(transliterator) {"Simplified_Traditional.txt"} + resource:process(transliterator) {"Hans_Hant.txt"} direction {"FORWARD"} } } Traditional-Simplified { + alias {"Hant-Hans"} + } + und-Hans-t-und-hant { + alias {"Hant-Hans"} + } + Hant-Hans { file { - resource:process(transliterator) {"Simplified_Traditional.txt"} + resource:process(transliterator) {"Hans_Hant.txt"} direction {"REVERSE"} } } Syriac-Latin { + alias {"Syrc-Latn"} + } + und-Latn-t-und-syrc { + alias {"Syrc-Latn"} + } + Syrc-Latn { file { - resource:process(transliterator) {"Syriac_Latin.txt"} + resource:process(transliterator) {"Syrc_Latn.txt"} direction {"FORWARD"} } } Latin-Syriac { + alias {"Latn-Syrc"} + } + und-Syrc-t-und-latn { + alias {"Latn-Syrc"} + } + Latn-Syrc { file { - resource:process(transliterator) {"Syriac_Latin.txt"} + resource:process(transliterator) {"Syrc_Latn.txt"} direction {"REVERSE"} } } Tamil-Bengali { + alias {"Taml-Beng"} + } + und-Beng-t-und-taml { + alias {"Taml-Beng"} + } + Taml-Beng { file { - resource:process(transliterator) {"Tamil_Bengali.txt"} + 
resource:process(transliterator) {"Taml_Beng.txt"} direction {"FORWARD"} } } Tamil-Devanagari { + alias {"Taml-Deva"} + } + und-Deva-t-und-taml { + alias {"Taml-Deva"} + } + Taml-Deva { file { - resource:process(transliterator) {"Tamil_Devanagari.txt"} + resource:process(transliterator) {"Taml_Deva.txt"} direction {"FORWARD"} } } Tamil-Gujarati { + alias {"Taml-Gujr"} + } + und-Gujr-t-und-taml { + alias {"Taml-Gujr"} + } + Taml-Gujr { file { - resource:process(transliterator) {"Tamil_Gujarati.txt"} + resource:process(transliterator) {"Taml_Gujr.txt"} direction {"FORWARD"} } } Tamil-Gurmukhi { + alias {"Taml-Guru"} + } + und-Guru-t-und-taml { + alias {"Taml-Guru"} + } + Taml-Guru { file { - resource:process(transliterator) {"Tamil_Gurmukhi.txt"} + resource:process(transliterator) {"Taml_Guru.txt"} direction {"FORWARD"} } } @@ -1162,64 +1969,118 @@ root { } Tamil-Kannada { + alias {"Taml-Knda"} + } + und-Knda-t-und-taml { + alias {"Taml-Knda"} + } + Taml-Knda { file { - resource:process(transliterator) {"Tamil_Kannada.txt"} + resource:process(transliterator) {"Taml_Knda.txt"} direction {"FORWARD"} } } Tamil-Latin { + alias {"Taml-Latn"} + } + und-Latn-t-und-taml { + alias {"Taml-Latn"} + } + Taml-Latn { file { - resource:process(transliterator) {"Tamil_Latin.txt"} + resource:process(transliterator) {"Taml_Latn.txt"} direction {"FORWARD"} } } Tamil-Malayalam { + alias {"Taml-Mlym"} + } + und-Mlym-t-und-taml { + alias {"Taml-Mlym"} + } + Taml-Mlym { file { - resource:process(transliterator) {"Tamil_Malayalam.txt"} + resource:process(transliterator) {"Taml_Mlym.txt"} direction {"FORWARD"} } } Tamil-Oriya { + alias {"Taml-Orya"} + } + und-Orya-t-und-taml { + alias {"Taml-Orya"} + } + Taml-Orya { file { - resource:process(transliterator) {"Tamil_Oriya.txt"} + resource:process(transliterator) {"Taml_Orya.txt"} direction {"FORWARD"} } } Tamil-Telugu { + alias {"Taml-Telu"} + } + und-Telu-t-und-taml { + alias {"Taml-Telu"} + } + Taml-Telu { file { - 
resource:process(transliterator) {"Tamil_Telugu.txt"} + resource:process(transliterator) {"Taml_Telu.txt"} direction {"FORWARD"} } } Telugu-Bengali { + alias {"Telu-Beng"} + } + und-Beng-t-und-telu { + alias {"Telu-Beng"} + } + Telu-Beng { file { - resource:process(transliterator) {"Telugu_Bengali.txt"} + resource:process(transliterator) {"Telu_Beng.txt"} direction {"FORWARD"} } } Telugu-Devanagari { + alias {"Telu-Deva"} + } + und-Deva-t-und-telu { + alias {"Telu-Deva"} + } + Telu-Deva { file { - resource:process(transliterator) {"Telugu_Devanagari.txt"} + resource:process(transliterator) {"Telu_Deva.txt"} direction {"FORWARD"} } } Telugu-Gujarati { + alias {"Telu-Gujr"} + } + und-Gujr-t-und-telu { + alias {"Telu-Gujr"} + } + Telu-Gujr { file { - resource:process(transliterator) {"Telugu_Gujarati.txt"} + resource:process(transliterator) {"Telu_Gujr.txt"} direction {"FORWARD"} } } Telugu-Gurmukhi { + alias {"Telu-Guru"} + } + und-Guru-t-und-telu { + alias {"Telu-Guru"} + } + Telu-Guru { file { - resource:process(transliterator) {"Telugu_Gurmukhi.txt"} + resource:process(transliterator) {"Telu_Guru.txt"} direction {"FORWARD"} } } @@ -1232,56 +2093,104 @@ root { } Telugu-Kannada { + alias {"Telu-Knda"} + } + und-Knda-t-und-telu { + alias {"Telu-Knda"} + } + Telu-Knda { file { - resource:process(transliterator) {"Telugu_Kannada.txt"} + resource:process(transliterator) {"Telu_Knda.txt"} direction {"FORWARD"} } } Telugu-Latin { + alias {"Telu-Latn"} + } + und-Latn-t-und-telu { + alias {"Telu-Latn"} + } + Telu-Latn { file { - resource:process(transliterator) {"Telugu_Latin.txt"} + resource:process(transliterator) {"Telu_Latn.txt"} direction {"FORWARD"} } } Telugu-Malayalam { + alias {"Telu-Mlym"} + } + und-Mlym-t-und-telu { + alias {"Telu-Mlym"} + } + Telu-Mlym { file { - resource:process(transliterator) {"Telugu_Malayalam.txt"} + resource:process(transliterator) {"Telu_Mlym.txt"} direction {"FORWARD"} } } Telugu-Oriya { + alias {"Telu-Orya"} + } + und-Orya-t-und-telu { 
+ alias {"Telu-Orya"} + } + Telu-Orya { file { - resource:process(transliterator) {"Telugu_Oriya.txt"} + resource:process(transliterator) {"Telu_Orya.txt"} direction {"FORWARD"} } } Telugu-Tamil { + alias {"Telu-Taml"} + } + und-Taml-t-und-telu { + alias {"Telu-Taml"} + } + Telu-Taml { file { - resource:process(transliterator) {"Telugu_Tamil.txt"} + resource:process(transliterator) {"Telu_Taml.txt"} direction {"FORWARD"} } } Latin-Thaana { + alias {"Latn-Thaa"} + } + und-Thaa-t-und-latn { + alias {"Latn-Thaa"} + } + Latn-Thaa { file { - resource:process(transliterator) {"Latin_Thaana.txt"} + resource:process(transliterator) {"Latn_Thaa.txt"} direction {"FORWARD"} } } Thaana-Latin { + alias {"Thaa-Latn"} + } + und-Latn-t-und-thaa { + alias {"Thaa-Latn"} + } + Thaa-Latn { file { - resource:process(transliterator) {"Latin_Thaana.txt"} + resource:process(transliterator) {"Latn_Thaa.txt"} direction {"REVERSE"} } } Thai-Latin { + alias {"Thai-Latn"} + } + und-Latn-t-und-thai { + alias {"Thai-Latn"} + } + Thai-Latn { file { - resource:process(transliterator) {"Thai_Latin.txt"} + resource:process(transliterator) {"Thai_Latn.txt"} direction {"FORWARD"} } } @@ -1326,33 +2235,86 @@ root { } Turkmen-Latin/BGN { + alias {"tk_Cyrl-tk/BGN"} + } + tk-t-tk-cyrl-m0-bgn { + alias {"tk_Cyrl-tk/BGN"} + } + tk_Cyrl-tk/BGN { file { - resource:process(transliterator) {"Turkmen_Latin_BGN.txt"} + resource:process(transliterator) {"tk_Cyrl_tk_BGN.txt"} direction {"FORWARD"} } } Ukrainian-Latin/BGN { + alias {"uk-uk_Latn/BGN"} + } + uk-Latn-t-uk-m0-bgn { + alias {"uk-uk_Latn/BGN"} + } + uk-uk_Latn/BGN { file { - resource:process(transliterator) {"Ukrainian_Latin_BGN.txt"} + resource:process(transliterator) {"uk_uk_Latn_BGN.txt"} direction {"FORWARD"} } } Uzbek-Latin/BGN { + alias {"uz_Cyrl-uz/BGN"} + } + uz-t-uz-cyrl-m0-bgn { + alias {"uz_Cyrl-uz/BGN"} + } + uz_Cyrl-uz/BGN { file { - resource:process(transliterator) {"Uzbek_Latin_BGN.txt"} + resource:process(transliterator) 
{"uz_Cyrl_uz_BGN.txt"} direction {"FORWARD"} } } + am-fonipa-t-am { + alias {"am-am_FONIPA"} + } am-am_FONIPA { file { resource:process(transliterator) {"am_am_FONIPA.txt"} direction {"FORWARD"} } } + am-t-am-fonipa { + alias {"am_FONIPA-am"} + } + am_FONIPA-am { + file { + resource:process(transliterator) {"am_am_FONIPA.txt"} + direction {"REVERSE"} + } + } + ar-t-am { + alias {"am-ar"} + } + am-ar { + file { + resource:process(transliterator) {"am_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-am { + alias {"am-fa"} + } + am-fa { + file { + resource:process(transliterator) {"am_fa.txt"} + direction {"FORWARD"} + } + } + + az-t-d0-lower { + alias {"az-Lower"} + } az-Lower { file { resource:process(transliterator) {"az_Lower.txt"} @@ -1360,6 +2322,9 @@ root { } } + az-t-d0-title { + alias {"az-Title"} + } az-Title { file { resource:process(transliterator) {"az_Title.txt"} @@ -1367,6 +2332,9 @@ root { } } + az-t-d0-upper { + alias {"az-Upper"} + } az-Upper { file { resource:process(transliterator) {"az_Upper.txt"} @@ -1374,6 +2342,29 @@ root { } } + am-t-ch { + alias {"ch-am"} + } + ch-am { + file { + resource:process(transliterator) {"ch_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-ch { + alias {"ch-ar"} + } + ch-ar { + file { + resource:process(transliterator) {"ch_ar.txt"} + direction {"FORWARD"} + } + } + + ch-fonipa-t-ch { + alias {"ch-ch_FONIPA"} + } ch-ch_FONIPA { file { resource:process(transliterator) {"ch_ch_FONIPA.txt"} @@ -1381,6 +2372,39 @@ root { } } + fa-t-ch { + alias {"ch-fa"} + } + ch-fa { + file { + resource:process(transliterator) {"ch_fa.txt"} + direction {"FORWARD"} + } + } + + am-t-cs { + alias {"cs-am"} + } + cs-am { + file { + resource:process(transliterator) {"cs_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-cs { + alias {"cs-ar"} + } + cs-ar { + file { + resource:process(transliterator) {"cs_ar.txt"} + direction {"FORWARD"} + } + } + + cs-fonipa-t-cs { + alias {"cs-cs_FONIPA"} + } cs-cs_FONIPA { file { 
resource:process(transliterator) {"cs_cs_FONIPA.txt"} @@ -1388,6 +2412,19 @@ root { } } + fa-t-cs { + alias {"cs-fa"} + } + cs-fa { + file { + resource:process(transliterator) {"cs_fa.txt"} + direction {"FORWARD"} + } + } + + ja-t-cs { + alias {"cs-ja"} + } cs-ja { file { resource:process(transliterator) {"cs_ja.txt"} @@ -1395,6 +2432,9 @@ root { } } + ko-t-cs { + alias {"cs-ko"} + } cs-ko { file { resource:process(transliterator) {"cs_ko.txt"} @@ -1402,6 +2442,9 @@ root { } } + ja-t-cs-fonipa { + alias {"cs_FONIPA-ja"} + } cs_FONIPA-ja { file { resource:process(transliterator) {"cs_FONIPA_ja.txt"} @@ -1409,6 +2452,9 @@ root { } } + ko-t-cs-fonipa { + alias {"cs_FONIPA-ko"} + } cs_FONIPA-ko { file { resource:process(transliterator) {"cs_FONIPA_ko.txt"} @@ -1416,6 +2462,9 @@ root { } } + dsb-fonipa-t-dsb { + alias {"dsb-dsb_FONIPA"} + } dsb-dsb_FONIPA { file { resource:process(transliterator) {"dsb_dsb_FONIPA.txt"} @@ -1423,6 +2472,9 @@ root { } } + el-t-d0-lower { + alias {"el-Lower"} + } el-Lower { file { resource:process(transliterator) {"el_Lower.txt"} @@ -1430,6 +2482,9 @@ root { } } + el-t-d0-title { + alias {"el-Title"} + } el-Title { file { resource:process(transliterator) {"el_Title.txt"} @@ -1437,6 +2492,9 @@ root { } } + el-t-d0-upper { + alias {"el-Upper"} + } el-Upper { file { resource:process(transliterator) {"el_Upper.txt"} @@ -1444,6 +2502,29 @@ root { } } + am-t-eo { + alias {"eo-am"} + } + eo-am { + file { + resource:process(transliterator) {"eo_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-eo { + alias {"eo-ar"} + } + eo-ar { + file { + resource:process(transliterator) {"eo_ar.txt"} + direction {"FORWARD"} + } + } + + eo-fonipa-t-eo { + alias {"eo-eo_FONIPA"} + } eo-eo_FONIPA { file { resource:process(transliterator) {"eo_eo_FONIPA.txt"} @@ -1451,6 +2532,19 @@ root { } } + fa-t-eo { + alias {"eo-fa"} + } + eo-fa { + file { + resource:process(transliterator) {"eo_fa.txt"} + direction {"FORWARD"} + } + } + + am-t-es { + alias {"es-am"} + } es-am 
{ file { resource:process(transliterator) {"es_am.txt"} @@ -1458,6 +2552,19 @@ root { } } + ar-t-es { + alias {"es-ar"} + } + es-ar { + file { + resource:process(transliterator) {"es_ar.txt"} + direction {"FORWARD"} + } + } + + es-fonipa-t-es { + alias {"es-es_FONIPA"} + } es-es_FONIPA { file { resource:process(transliterator) {"es_es_FONIPA.txt"} @@ -1465,6 +2572,19 @@ root { } } + fa-t-es { + alias {"es-fa"} + } + es-fa { + file { + resource:process(transliterator) {"es_fa.txt"} + direction {"FORWARD"} + } + } + + ja-t-es { + alias {"es-ja"} + } es-ja { file { resource:process(transliterator) {"es_ja.txt"} @@ -1472,6 +2592,9 @@ root { } } + zh-t-es { + alias {"es-zh"} + } es-zh { file { resource:process(transliterator) {"es_zh.txt"} @@ -1479,6 +2602,39 @@ root { } } + am-t-es-419 { + alias {"es_419-am"} + } + es_419-am { + file { + resource:process(transliterator) {"es_419_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-es-419 { + alias {"es_419-ar"} + } + es_419-ar { + file { + resource:process(transliterator) {"es_419_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-es-419 { + alias {"es_419-fa"} + } + es_419-fa { + file { + resource:process(transliterator) {"es_419_fa.txt"} + direction {"FORWARD"} + } + } + + ja-t-es-419 { + alias {"es_419-ja"} + } es_419-ja { file { resource:process(transliterator) {"es_419_ja.txt"} @@ -1486,6 +2642,9 @@ root { } } + zh-t-es-419 { + alias {"es_419-zh"} + } es_419-zh { file { resource:process(transliterator) {"es_419_zh.txt"} @@ -1493,6 +2652,9 @@ root { } } + am-t-es-fonipa { + alias {"es_FONIPA-am"} + } es_FONIPA-am { file { resource:process(transliterator) {"es_FONIPA_am.txt"} @@ -1500,6 +2662,9 @@ root { } } + es-419-fonipa-t-es-fonipa { + alias {"es_FONIPA-es_419_FONIPA"} + } es_FONIPA-es_419_FONIPA { file { resource:process(transliterator) {"es_FONIPA_es_419_FONIPA.txt"} @@ -1507,6 +2672,9 @@ root { } } + ja-t-es-fonipa { + alias {"es_FONIPA-ja"} + } es_FONIPA-ja { file { resource:process(transliterator) 
{"es_FONIPA_ja.txt"} @@ -1514,6 +2682,9 @@ root { } } + zh-t-es-fonipa { + alias {"es_FONIPA-zh"} + } es_FONIPA-zh { file { resource:process(transliterator) {"es_FONIPA_zh.txt"} @@ -1521,6 +2692,119 @@ root { } } + am-t-hy { + alias {"hy-am"} + } + hy-am { + file { + resource:process(transliterator) {"hy_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-hy { + alias {"hy-ar"} + } + hy-ar { + file { + resource:process(transliterator) {"hy_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-hy { + alias {"hy-fa"} + } + hy-fa { + file { + resource:process(transliterator) {"hy_fa.txt"} + direction {"FORWARD"} + } + } + + hy-fonipa-t-hy { + alias {"hy-hy_FONIPA"} + } + hy-hy_FONIPA { + file { + resource:process(transliterator) {"hy_hy_FONIPA.txt"} + direction {"FORWARD"} + } + } + + am-t-hy-arevmda { + alias {"hy_AREVMDA-am"} + } + hy_AREVMDA-am { + file { + resource:process(transliterator) {"hy_AREVMDA_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-hy-arevmda { + alias {"hy_AREVMDA-ar"} + } + hy_AREVMDA-ar { + file { + resource:process(transliterator) {"hy_AREVMDA_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-hy-arevmda { + alias {"hy_AREVMDA-fa"} + } + hy_AREVMDA-fa { + file { + resource:process(transliterator) {"hy_AREVMDA_fa.txt"} + direction {"FORWARD"} + } + } + + hy-arevmda-fonipa-t-hy-arevmda { + alias {"hy_AREVMDA-hy_AREVMDA_FONIPA"} + } + hy_AREVMDA-hy_AREVMDA_FONIPA { + file { + resource:process(transliterator) {"hy_AREVMDA_hy_AREVMDA_FONIPA.txt"} + direction {"FORWARD"} + } + } + + am-t-ia { + alias {"ia-am"} + } + ia-am { + file { + resource:process(transliterator) {"ia_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-ia { + alias {"ia-ar"} + } + ia-ar { + file { + resource:process(transliterator) {"ia_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-ia { + alias {"ia-fa"} + } + ia-fa { + file { + resource:process(transliterator) {"ia_fa.txt"} + direction {"FORWARD"} + } + } + + ia-fonipa-t-ia { + alias {"ia-ia_FONIPA"} + } ia-ia_FONIPA { file { 
resource:process(transliterator) {"ia_ia_FONIPA.txt"} @@ -1528,6 +2812,9 @@ root { } } + am-t-it { + alias {"it-am"} + } it-am { file { resource:process(transliterator) {"it_am.txt"} @@ -1535,6 +2822,9 @@ root { } } + ja-t-it { + alias {"it-ja"} + } it-ja { file { resource:process(transliterator) {"it_ja.txt"} @@ -1542,6 +2832,9 @@ root { } } + ko-t-ja-latn { + alias {"ja_Latn-ko"} + } ja_Latn-ko { file { resource:process(transliterator) {"ja_Latn_ko.txt"} @@ -1549,6 +2842,9 @@ root { } } + ru-t-ja-latn { + alias {"ja_Latn-ru"} + } ja_Latn-ru { file { resource:process(transliterator) {"ja_Latn_ru.txt"} @@ -1556,6 +2852,79 @@ root { } } + am-t-kk { + alias {"kk-am"} + } + kk-am { + file { + resource:process(transliterator) {"kk_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-kk { + alias {"kk-ar"} + } + kk-ar { + file { + resource:process(transliterator) {"kk_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-kk { + alias {"kk-fa"} + } + kk-fa { + file { + resource:process(transliterator) {"kk_fa.txt"} + direction {"FORWARD"} + } + } + + kk-fonipa-t-kk { + alias {"kk-kk_FONIPA"} + } + kk-kk_FONIPA { + file { + resource:process(transliterator) {"kk_kk_FONIPA.txt"} + direction {"FORWARD"} + } + } + + am-t-ky { + alias {"ky-am"} + } + ky-am { + file { + resource:process(transliterator) {"ky_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-ky { + alias {"ky-ar"} + } + ky-ar { + file { + resource:process(transliterator) {"ky_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-ky { + alias {"ky-fa"} + } + ky-fa { + file { + resource:process(transliterator) {"ky_fa.txt"} + direction {"FORWARD"} + } + } + + ky-fonipa-t-ky { + alias {"ky-ky_FONIPA"} + } ky-ky_FONIPA { file { resource:process(transliterator) {"ky_ky_FONIPA.txt"} @@ -1563,6 +2932,9 @@ root { } } + la-fonipa-t-la { + alias {"la-la_FONIPA"} + } la-la_FONIPA { file { resource:process(transliterator) {"la_la_FONIPA.txt"} @@ -1570,6 +2942,9 @@ root { } } + lt-t-d0-lower { + alias {"lt-Lower"} + } lt-Lower { file { 
resource:process(transliterator) {"lt_Lower.txt"} @@ -1577,6 +2952,9 @@ root { } } + lt-t-d0-title { + alias {"lt-Title"} + } lt-Title { file { resource:process(transliterator) {"lt_Title.txt"} @@ -1584,6 +2962,9 @@ root { } } + lt-t-d0-upper { + alias {"lt-Upper"} + } lt-Upper { file { resource:process(transliterator) {"lt_Upper.txt"} @@ -1591,6 +2972,59 @@ root { } } + mn-Latn-t-mn-m0-mns { + alias {"mn-mn_Latn/MNS"} + } + mn-mn_Latn/MNS { + file { + resource:process(transliterator) {"mn_mn_Latn_MNS.txt"} + direction {"FORWARD"} + } + } + + am-t-my { + alias {"my-am"} + } + my-am { + file { + resource:process(transliterator) {"my_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-my { + alias {"my-ar"} + } + my-ar { + file { + resource:process(transliterator) {"my_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-my { + alias {"my-fa"} + } + my-fa { + file { + resource:process(transliterator) {"my_fa.txt"} + direction {"FORWARD"} + } + } + + my-fonipa-t-my { + alias {"my-my_FONIPA"} + } + my-my_FONIPA { + file { + resource:process(transliterator) {"my_my_FONIPA.txt"} + direction {"FORWARD"} + } + } + + nl-t-d0-title { + alias {"nl-Title"} + } nl-Title { file { resource:process(transliterator) {"nl_Title.txt"} @@ -1598,6 +3032,39 @@ root { } } + am-t-pl { + alias {"pl-am"} + } + pl-am { + file { + resource:process(transliterator) {"pl_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-pl { + alias {"pl-ar"} + } + pl-ar { + file { + resource:process(transliterator) {"pl_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-pl { + alias {"pl-fa"} + } + pl-fa { + file { + resource:process(transliterator) {"pl_fa.txt"} + direction {"FORWARD"} + } + } + + ja-t-pl { + alias {"pl-ja"} + } pl-ja { file { resource:process(transliterator) {"pl_ja.txt"} @@ -1605,6 +3072,9 @@ root { } } + pl-fonipa-t-pl { + alias {"pl-pl_FONIPA"} + } pl-pl_FONIPA { file { resource:process(transliterator) {"pl_pl_FONIPA.txt"} @@ -1612,6 +3082,9 @@ root { } } + ja-t-pl-fonipa { + alias 
{"pl_FONIPA-ja"} + } pl_FONIPA-ja { file { resource:process(transliterator) {"pl_FONIPA_ja.txt"} @@ -1619,6 +3092,79 @@ root { } } + am-t-rm-sursilv { + alias {"rm_SURSILV-am"} + } + rm_SURSILV-am { + file { + resource:process(transliterator) {"rm_SURSILV_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-rm-sursilv { + alias {"rm_SURSILV-ar"} + } + rm_SURSILV-ar { + file { + resource:process(transliterator) {"rm_SURSILV_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-rm-sursilv { + alias {"rm_SURSILV-fa"} + } + rm_SURSILV-fa { + file { + resource:process(transliterator) {"rm_SURSILV_fa.txt"} + direction {"FORWARD"} + } + } + + rm-fonipa-sursilv-t-rm-sursilv { + alias {"rm_SURSILV-rm_FONIPA_SURSILV"} + } + rm_SURSILV-rm_FONIPA_SURSILV { + file { + resource:process(transliterator) {"rm_SURSILV_rm_FONIPA_SURSILV.txt"} + direction {"FORWARD"} + } + } + + am-t-ro { + alias {"ro-am"} + } + ro-am { + file { + resource:process(transliterator) {"ro_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-ro { + alias {"ro-ar"} + } + ro-ar { + file { + resource:process(transliterator) {"ro_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-ro { + alias {"ro-fa"} + } + ro-fa { + file { + resource:process(transliterator) {"ro_fa.txt"} + direction {"FORWARD"} + } + } + + ja-t-ro { + alias {"ro-ja"} + } ro-ja { file { resource:process(transliterator) {"ro_ja.txt"} @@ -1626,6 +3172,9 @@ root { } } + ro-fonipa-t-ro { + alias {"ro-ro_FONIPA"} + } ro-ro_FONIPA { file { resource:process(transliterator) {"ro_ro_FONIPA.txt"} @@ -1633,6 +3182,9 @@ root { } } + ja-t-ro-fonipa { + alias {"ro_FONIPA-ja"} + } ro_FONIPA-ja { file { resource:process(transliterator) {"ro_FONIPA_ja.txt"} @@ -1640,6 +3192,9 @@ root { } } + ja-t-ru { + alias {"ru-ja"} + } ru-ja { file { resource:process(transliterator) {"ru_ja.txt"} @@ -1647,6 +3202,9 @@ root { } } + zh-t-ru { + alias {"ru-zh"} + } ru-zh { file { resource:process(transliterator) {"ru_zh.txt"} @@ -1654,6 +3212,142 @@ root { } } + Latin-Russian/BGN { + 
alias {"ru_Latn-ru/BGN"} + } + ru-t-ru-latn-m0-bgn { + alias {"ru_Latn-ru/BGN"} + } + ru_Latn-ru/BGN { + file { + resource:process(transliterator) {"ru_Latn_ru_BGN.txt"} + direction {"FORWARD"} + } + } + + am-t-sat { + alias {"sat-am"} + } + sat-am { + file { + resource:process(transliterator) {"sat_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-sat { + alias {"sat-ar"} + } + sat-ar { + file { + resource:process(transliterator) {"sat_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-sat { + alias {"sat-fa"} + } + sat-fa { + file { + resource:process(transliterator) {"sat_fa.txt"} + direction {"FORWARD"} + } + } + + sat-fonipa-t-sat-olck { + alias {"sat_Olck-sat_FONIPA"} + } + sat_Olck-sat_FONIPA { + file { + resource:process(transliterator) {"sat_Olck_sat_FONIPA.txt"} + direction {"FORWARD"} + } + } + + am-t-si { + alias {"si-am"} + } + si-am { + file { + resource:process(transliterator) {"si_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-si { + alias {"si-ar"} + } + si-ar { + file { + resource:process(transliterator) {"si_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-si { + alias {"si-fa"} + } + si-fa { + file { + resource:process(transliterator) {"si_fa.txt"} + direction {"FORWARD"} + } + } + + si-fonipa-t-si { + alias {"si-si_FONIPA"} + } + si-si_FONIPA { + file { + resource:process(transliterator) {"si_si_FONIPA.txt"} + direction {"FORWARD"} + } + } + + si-Latn-t-si { + alias {"si-si_Latn"} + } + si-si_Latn { + file { + resource:process(transliterator) {"si_si_Latn.txt"} + direction {"FORWARD"} + } + } + + am-t-sk { + alias {"sk-am"} + } + sk-am { + file { + resource:process(transliterator) {"sk_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-sk { + alias {"sk-ar"} + } + sk-ar { + file { + resource:process(transliterator) {"sk_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-sk { + alias {"sk-fa"} + } + sk-fa { + file { + resource:process(transliterator) {"sk_fa.txt"} + direction {"FORWARD"} + } + } + + ja-t-sk { + alias {"sk-ja"} + } sk-ja { file 
{ resource:process(transliterator) {"sk_ja.txt"} @@ -1661,6 +3355,9 @@ root { } } + sk-fonipa-t-sk { + alias {"sk-sk_FONIPA"} + } sk-sk_FONIPA { file { resource:process(transliterator) {"sk_sk_FONIPA.txt"} @@ -1668,6 +3365,9 @@ root { } } + ja-t-sk-fonipa { + alias {"sk_FONIPA-ja"} + } sk_FONIPA-ja { file { resource:process(transliterator) {"sk_FONIPA_ja.txt"} @@ -1675,6 +3375,39 @@ root { } } + am-t-tlh { + alias {"tlh-am"} + } + tlh-am { + file { + resource:process(transliterator) {"tlh_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-tlh { + alias {"tlh-ar"} + } + tlh-ar { + file { + resource:process(transliterator) {"tlh_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-tlh { + alias {"tlh-fa"} + } + tlh-fa { + file { + resource:process(transliterator) {"tlh_fa.txt"} + direction {"FORWARD"} + } + } + + tlh-fonipa-t-tlh { + alias {"tlh-tlh_FONIPA"} + } tlh-tlh_FONIPA { file { resource:process(transliterator) {"tlh_tlh_FONIPA.txt"} @@ -1682,6 +3415,9 @@ root { } } + tr-t-d0-lower { + alias {"tr-Lower"} + } tr-Lower { file { resource:process(transliterator) {"tr_Lower.txt"} @@ -1689,6 +3425,9 @@ root { } } + tr-t-d0-title { + alias {"tr-Title"} + } tr-Title { file { resource:process(transliterator) {"tr_Title.txt"} @@ -1696,6 +3435,9 @@ root { } } + tr-t-d0-upper { + alias {"tr-Upper"} + } tr-Upper { file { resource:process(transliterator) {"tr_Upper.txt"} @@ -1703,12 +3445,38 @@ root { } } + ar-t-und-fonipa { + alias {"und_FONIPA-ar"} + } + und_FONIPA-ar { + file { + resource:process(transliterator) {"und_FONIPA_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-und-fonipa { + alias {"und_FONIPA-fa"} + } + und_FONIPA-fa { + file { + resource:process(transliterator) {"und_FONIPA_fa.txt"} + direction {"FORWARD"} + } + } + + uz-Latn-t-uz-cyrl { + alias {"uz_Cyrl-uz_Latn"} + } uz_Cyrl-uz_Latn { file { resource:process(transliterator) {"uz_Cyrl_uz_Latn.txt"} direction {"FORWARD"} } } + uz-Cyrl-t-uz-latn { + alias {"uz_Latn-uz_Cyrl"} + } uz_Latn-uz_Cyrl { file { 
resource:process(transliterator) {"uz_Cyrl_uz_Latn.txt"} @@ -1716,6 +3484,49 @@ root { } } + am-t-xh { + alias {"xh-am"} + } + xh-am { + file { + resource:process(transliterator) {"xh_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-xh { + alias {"xh-ar"} + } + xh-ar { + file { + resource:process(transliterator) {"xh_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-xh { + alias {"xh-fa"} + } + xh-fa { + file { + resource:process(transliterator) {"xh_fa.txt"} + direction {"FORWARD"} + } + } + + xh-fonipa-t-xh { + alias {"xh-xh_FONIPA"} + } + xh-xh_FONIPA { + file { + resource:process(transliterator) {"xh_xh_FONIPA.txt"} + direction {"FORWARD"} + } + } + + yo-BJ-t-yo { + alias {"yo-yo_BJ"} + } yo-yo_BJ { file { resource:process(transliterator) {"yo_yo_BJ.txt"} @@ -1723,6 +3534,9 @@ root { } } + ru-t-zh-latn-pinyin { + alias {"zh_Latn_PINYIN-ru"} + } zh_Latn_PINYIN-ru { file { resource:process(transliterator) {"zh_Latn_PINYIN_ru.txt"} @@ -1730,6 +3544,46 @@ root { } } + am-t-zu { + alias {"zu-am"} + } + zu-am { + file { + resource:process(transliterator) {"zu_am.txt"} + direction {"FORWARD"} + } + } + + ar-t-zu { + alias {"zu-ar"} + } + zu-ar { + file { + resource:process(transliterator) {"zu_ar.txt"} + direction {"FORWARD"} + } + } + + fa-t-zu { + alias {"zu-fa"} + } + zu-fa { + file { + resource:process(transliterator) {"zu_fa.txt"} + direction {"FORWARD"} + } + } + + zu-fonipa-t-zu { + alias {"zu-zu_FONIPA"} + } + zu-zu_FONIPA { + file { + resource:process(transliterator) {"zu_zu_FONIPA.txt"} + direction {"FORWARD"} + } + } + } TransliteratorNamePattern { // Format for the display name of a Transliterator. 
diff --git a/icu4c/source/data/translit/ru_Latn_ru_BGN.txt b/icu4c/source/data/translit/ru_Latn_ru_BGN.txt new file mode 100644 index 00000000000..5f03d619302 --- /dev/null +++ b/icu4c/source/data/translit/ru_Latn_ru_BGN.txt @@ -0,0 +1,106 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ru_Latn_ru_BGN.txt +# Generated from CLDR +# + +# BGN/PCGN 1947 System for Russian, in direction ru_Latn → ru +# http://geonames.nga.mil/gns/html/Romanization/Romanization_Russian.pdf +$prime = ʹ; +$doublePrime = ʺ; +$wordBoundary = [^[:L:][:M:][:N:]]; +$upperConsonant = [БВГДЖЙКЛМНПРСТФХЦЧШЩЭ]; +$lowerConsonant = [бвгджйклмнпрстфхцчшщэ]; +$consonant = [$upperConsonant $lowerConsonant]; +::NFC; +[:Upper:] {$prime} [^[:Lower:]] → Ь; +$prime → ь; +[:Upper:] {$doublePrime} [^[:Lower:]] → Ъ; +$doublePrime → ъ; +K[Hh] → Х; +k[Hh] → х; +T·S → ТС; +T·s → Тс; +t·S → тС; +t·s → тс; +T[Ss] → Ц; +t[Ss] → ц; +C[Hh] → Ч; +c[Hh] → ч; +S[Hh]·C[Hh] → ШЧ; +S[Hh]·c[Hh] → Шч; +s[Hh]·C[Hh] → шЧ; +s[Hh]·c[Hh] → шч; +S[Hh][Cc][Hh] → Щ; +s[Hh][Cc][Hh] → щ; +S[Hh] → Ш; +s[Hh] → ш; +Y[Ee] → Е; +y[Ee] → е; +Y[Ëë] → Ё; +y[Ëë] → ё; +Y[Uu] → Ю; +y[Uu] → ю; +Y[Aa] → Я; +y[Aa] → я; +{yy} $wordBoundary → ый; +$wordBoundary {Y} [^aeëiouyAEËIOUY] → Ы; +$wordBoundary {y} [^aeëiouyAEËIOUY] → ы; +$consonant {Y} → Ы; +$consonant {y} → ы; +Y → Й; +y → й; +$wordBoundary {E} → Э; +$wordBoundary {e} → э; +·E → Э; +·e → э; +E → Е; +e → е; +A → А; +a → а; +B → Б; +b → б; +V → В; +v → в; +G → Г; +g → г; +D → Д; +d → д; +Ë → Ё; +ë → ё; +Z[Hh] → Ж; +z[Hh] → ж; +Z → З; +z → з; +I → И; +i → и; +K → К; +k → к; +L → Л; +l → л; +M → М; +m → м; +N → Н; +n → н; +O → О; +o → о; +P → П; +p → п; +R → Р; +r → р; +S → С; +s → с; +T → Т; +t → т; +U → У; +u → у; +F → Ф; +f → ф; +·Y → Ы; 
+·y → ы; +· → ; + diff --git a/icu4c/source/data/translit/ru_ja.txt b/icu4c/source/data/translit/ru_ja.txt index b1fa876acee..ccec4992701 100644 --- a/icu4c/source/data/translit/ru_ja.txt +++ b/icu4c/source/data/translit/ru_ja.txt @@ -1,19 +1,27 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: ru_ja.txt # Generated from CLDR # + +# Transliteration from Russian into Japanese (Katakana). $word_boundary = [-\ $]; $vowel = [аеийоуыьэюяёъ]; $not_vowel = [^$vowel]; +# +# ::NFC; ::Lower; +# +# $vowel { го } $word_boundary → во; ::Null; +# +# а → ア ; ба → バ ; бе → ベ ; @@ -439,4 +447,7 @@ $vowel { го } $word_boundary → во; ю → ユ ; я → ヤ ; ё → ョ ; +# +# ::NFC; + diff --git a/icu4c/source/data/translit/ru_ru_Latn_BGN.txt b/icu4c/source/data/translit/ru_ru_Latn_BGN.txt new file mode 100644 index 00000000000..8c751391c4c --- /dev/null +++ b/icu4c/source/data/translit/ru_ru_Latn_BGN.txt @@ -0,0 +1,246 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: ru_ru_Latn_BGN.txt +# Generated from CLDR +# + +# BGN/PCGN 1947 System +# +# The BGN/PCGN system for Russian was adopted by the BGN in 1944 and +# by the PCGN in 1947 for use in romanizing names written in the +# Russian Cyrillic alphabet. 
+# +# The Russian Alphabet as defined by the BGN (Page 93): +# АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ +# абвгдеёжзийклмнопрстуфхцчшщъыьэюя +# +# Originally prepared by Michael Everson everson@evertype.com +# Fixed by Frank Yung-Fong Tang ftang@google.com +# +# Test Data from http://en.wikipedia.org/wiki/BGN/PCGN_romanization_of_Russian +######################################################################## +# MINIMAL FILTER: Russian-Latin +::[АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя]; +# BUG(ftang) remove the following line. Otherwise the rule for +# Й й Ё ё will break since the rule is written in NFC but +# the line decomposes the text. +# :: NFD (NFC) ; +######################################################################## +# Define All Transformation Variables +######################################################################## +$prime = ʹ ; +$doublePrime = ʺ ; +$wordBoundary = [^[:L:][:M:][:N:]] ; +$upperConsonants = [БВГДЖЙКЛМНПРСТФХЦЧШЩЭ] ; +$lowerConsonants = [бвгджйклмнпрстфхцчшщэ] ; +$consonants = [$upperConsonants $lowerConsonants] ; +$upperVowels = [АЕЁЭИОУЫЮЯ] ; +$lowerVowels = [аеёэиоуыюя] ; +$vowels = [$upperVowels $lowerVowels] ; +$lower = [$lowerConsonants $lowerVowels] ; +$upper = [$upperConsonants $upperVowels] ; +######################################################################## +# Rules moved to front to avoid masking +######################################################################## +$lowerVowels { ы → ·y ; +$upperVowels { [Ыы] → ·Y ; +[$consonants - [Йй]]{Э → ·E ; +[$consonants - [Йй]]{э → ·e ; +[$upperVowels [ЙЪЬ]] { Е } $upper → YE ; # CYRILLIC CAPITAL LETTER IE +[$upperVowels [ЙЪЬ]] { Е → Ye ; # CYRILLIC CAPITAL LETTER IE +[$upperVowels $lowerVowels [ЙйЪъЬь]] { е → ye ; # CYRILLIC SMALL LETTER IE +[$upperVowels [ЙЪЬ]] { Ё } $upper → YË ; # CYRILLIC CAPITAL LETTER IO +[$upperVowels [ЙЪЬ]] { Ё → Yë ; # CYRILLIC CAPITAL LETTER IO +[$upperVowels $lowerVowels [ЙйЪъЬь]] { ё → yë ; # CYRILLIC SMALL 
LETTER IO +# Since in the above rule we use the context before the characters, +# we have to perform them in a separate pass before we change the vowels +# the ::Null forces a separate pass. +::Null; +######################################################################## +# Start of Alphabetic Transformations +######################################################################## +А → A ; # CYRILLIC CAPITAL LETTER A +а → a ; # CYRILLIC SMALL LETTER A +Б → B ; # CYRILLIC CAPITAL LETTER BE +б → b ; # CYRILLIC SMALL LETTER BE +В → V ; # CYRILLIC CAPITAL LETTER VE +в → v ; # CYRILLIC SMALL LETTER VE +Г → G ; # CYRILLIC CAPITAL LETTER GHE +г → g ; # CYRILLIC SMALL LETTER GHE +Д → D ; # CYRILLIC CAPITAL LETTER DE +д → d ; # CYRILLIC SMALL LETTER DE +######################################################################## +# BGN Page 94 Rule 1: +# # The character e should be romanized ye +# initially, after the vowel # characters a, e, ё, и, о, у, ы, э, ю, +# and я, and after й, ъ, and ь. +# In all other instances, it should +# be romanized e. 
+######################################################################## +# BUG(ftang)- the following two lines said BEFORE the vowels, instead of AFTER +# Е}[$upperVowels [ЙЪЬ]] → YE ; # CYRILLIC CAPITAL LETTER IE +# Е}[$lowerVowels [йъь]] → Ye ; # CYRILLIC CAPITAL LETTER IE +$wordBoundary{Е} $upper → YE ; # CYRILLIC CAPITAL LETTER IE +$wordBoundary{Е → Ye ; # CYRILLIC CAPITAL LETTER IE +Е → E ; # CYRILLIC CAPITAL LETTER IE +# +# BUG(ftang)- the following line said BEFORE the vowels, instead of AFTER +# е}[$upperVowels $lowerVowels [ЙйЪъЬь]] → ye ; # CYRILLIC SMALL LETTER IE +$wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE +е → e ; # CYRILLIC SMALL LETTER IE +######################################################################## +# End of Rule 1 +######################################################################## +######################################################################## +# BGN Page 94 Rule 2: +# +# The character ё is not considered a separate character of the +# Russian alphabet and the dieresis is generally not shown. When the +# dieresis is shown, the character should be romanized yë initially, +# after the vowel characters a, e, ё, и, о, у, ы, э, ю, and я, and +# after й, ъ, and ь, In all other instances, it should be romanized +# ё. When the dieresis is not shown, the character may still be +# romanized in the preceding manner or, alternatively, in accordance +# with note 1. 
+######################################################################## +# BUG(ftang)- the following two lines said BEFORE the vowels, instead of AFTER +# Ё}[$upperVowels [ЙЪЬ]] → YË ; # CYRILLIC CAPITAL LETTER IO +# Ё}[$lowerVowels [йъь]] → Yë ; # CYRILLIC CAPITAL LETTER IO +$wordBoundary {Ё} $upper → YË ; # CYRILLIC CAPITAL LETTER IO +$wordBoundary {Ё} $lower → Yë ; # CYRILLIC CAPITAL LETTER IO +Ё → Ë ; # CYRILLIC CAPITAL LETTER IO +# BUG(ftang)- the following line said BEFORE the vowels, instead of AFTER +# ё}[$upperVowels $lowerVowels [ЙйЪъЬь]] → yë ; # CYRILLIC SMALL LETTER IO +$wordBoundary{ё → yë ; # CYRILLIC SMALL LETTER IO +ё → ë ; # CYRILLIC SMALL LETTER IO +######################################################################## +# End of Rule 2 +######################################################################## +Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE +Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE +ж → zh ; # CYRILLIC SMALL LETTER ZHE +######################################################################## +# BGN Page 94 Rule 3.4 +# э after any consonant character except +# й becomes ·е +######################################################################## +З → Z ; # CYRILLIC CAPITAL LETTER ZE +з → z ; # CYRILLIC SMALL LETTER ZE +# BUG(ftang) The following two lines said those consonant becomes ·е +# [$consonants - [Йй]]}Э → ·Е ; +# [$consonants - [Йй]]}э → ·е ; +######################################################################## +# End of Rule 3.4 +######################################################################## +И → I ; # CYRILLIC CAPITAL LETTER I +и → i ; # CYRILLIC SMALL LETTER I +######################################################################## +# BGN Page 94 Rule 3: +# +# Unusual Russian character sequences occurring primarily in +# non-Russian-language names may be romanized as shown below in order +# to provide differentiation from regularly-occurring digraphs and +# character sequences. 
+# +# BGN Page 94 Rule 3.1 +# й before а, у, ы, or э becomes у· +######################################################################## +Й}[АаУуЫыЭэ] → Y· ; # CYRILLIC CAPITAL LETTER I +й}[АаУуЫыЭэ] → y· ; # CYRILLIC SMALL LETTER I +Й → Y ; # CYRILLIC CAPITAL LETTER I +й → y ; # CYRILLIC SMALL LETTER I +######################################################################## +# End Rule 3.1 +######################################################################## +К → K ; # CYRILLIC CAPITAL LETTER KA +к → k ; # CYRILLIC SMALL LETTER KA +Л → L ; # CYRILLIC CAPITAL LETTER EL +л → l ; # CYRILLIC SMALL LETTER EL +М → M ; # CYRILLIC CAPITAL LETTER EM +м → m ; # CYRILLIC SMALL LETTER EM +Н → N ; # CYRILLIC CAPITAL LETTER EN +н → n ; # CYRILLIC SMALL LETTER EN +О → O ; # CYRILLIC CAPITAL LETTER O +о → o ; # CYRILLIC SMALL LETTER O +П → P ; # CYRILLIC CAPITAL LETTER PE +п → p ; # CYRILLIC SMALL LETTER PE +Р → R ; # CYRILLIC CAPITAL LETTER ER +р → r ; # CYRILLIC SMALL LETTER ER +С → S ; # CYRILLIC CAPITAL LETTER ES +с → s ; # CYRILLIC SMALL LETTER ES +######################################################################## +# BGN Page 94 Rule 3.5 +# тс becomes t·s +######################################################################## +ТС → T·S ; # CYRILLIC CAPITAL LETTER TE +Тс → T·s ; # CYRILLIC CAPITAL LETTER TE +тс → t·s ; # CYRILLIC SMALL LETTER TE +Т → T ; # CYRILLIC CAPITAL LETTER TE +т → t ; # CYRILLIC SMALL LETTER TE +######################################################################## +# End Rule 3.5 +######################################################################## +У → U ; # CYRILLIC CAPITAL LETTER U +у → u ; # CYRILLIC SMALL LETTER U +Ф → F ; # CYRILLIC CAPITAL LETTER EF +ф → f ; # CYRILLIC SMALL LETTER EF +Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA +Х → KH ; # CYRILLIC CAPITAL LETTER HA +х → kh ; # CYRILLIC SMALL LETTER HA +Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE +Ц → TS ; # CYRILLIC CAPITAL LETTER TSE +ц → ts ; # CYRILLIC SMALL LETTER 
TSE +Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE +Ч → CH ; # CYRILLIC CAPITAL LETTER CHE +ч → ch ; # CYRILLIC SMALL LETTER CHE +######################################################################## +# BGN Page 94 Rule 3.6 +# шч becomes sh·ch +######################################################################## +ШЧ → SH·CH ; # CYRILLIC CAPITAL LETTER SHA +Шч → Sh·ch ; # CYRILLIC CAPITAL LETTER SHA +шч → sh·ch ; # CYRILLIC SMALL LETTER SHA +Ш} $lower → Sh ; # CYRILLIC CAPITAL LETTER SHA +Ш → SH ; # CYRILLIC CAPITAL LETTER SHA +ш → sh ; # CYRILLIC SMALL LETTER SHA +Щ} $lower → Shch ; # CYRILLIC CAPITAL LETTER SHCHA +Щ → SHCH ; # CYRILLIC CAPITAL LETTER SHCHA +щ → shch ; # CYRILLIC SMALL LETTER SHCHA +######################################################################## +# End Rule 3.6 +######################################################################## +Ъ → $doublePrime ; # CYRILLIC CAPITAL LETTER HARD SIGN +ъ → $doublePrime ; # CYRILLIC SMALL LETTER HARD SIGN +######################################################################## +# BGN Page 94 Rule 3.2 +# ы before а, у, ы, or э becomes у· +# +# BGN Page 94 Rule 3.3 +# ы after any vowel character becomes ·у +######################################################################## +# +# BUG(ftang) the following line said the vowels will change +# $vowels}Ы → ·Y ; # CYRILLIC CAPITAL LETTER I +# $vowels}ы → ·y ; # CYRILLIC CAPITAL LETTER I +Ы}[АаУуЫыЭэ] → Y· ; # CYRILLIC CAPITAL LETTER YERU +ы}[ауыэ] → y· ; # CYRILLIC SMALL LETTER YERU +Ы → Y ; # CYRILLIC CAPITAL LETTER YERU +ы → y ; # CYRILLIC SMALL LETTER YERU +######################################################################## +# End Rule 3.2 and 3.3 +######################################################################## +Ь → $prime ; # CYRILLIC CAPITAL LETTER SOFT SIGN +ь → $prime ; # CYRILLIC SMALL LETTER SOFT SIGN +Э → E ; # CYRILLIC CAPITAL LETTER E +э → e ; # CYRILLIC SMALL LETTER E +Ю} $lower → Yu ; # CYRILLIC CAPITAL LETTER YU +Ю → 
YU ; # CYRILLIC CAPITAL LETTER YU +ю → yu ; # CYRILLIC SMALL LETTER YU +Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA +Я → YA ; # CYRILLIC CAPITAL LETTER YA +я → ya ; # CYRILLIC SMALL LETTER YA + diff --git a/icu4c/source/data/translit/ru_zh.txt b/icu4c/source/data/translit/ru_zh.txt index a89dbb4331f..d556557a314 100644 --- a/icu4c/source/data/translit/ru_zh.txt +++ b/icu4c/source/data/translit/ru_zh.txt @@ -1,17 +1,26 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: ru_zh.txt # Generated from CLDR # + +# Transliteration of Russian into Mandarin written in simplified Chinese. +# +# TODO(mjansche): Implement exception rules from GB/T 17693.4-2009 section 5.3. +# TODO(anyone): Simplify. $bow = [-\ $]; # Word boundary. $vowel = [аеийоуыьэюяё]; $not_vowel = [^$vowel]; +# +# ::NFC; ::Lower; +# +# аа → а; бб → б; вв → в; @@ -36,11 +45,20 @@ $vowel { нь → н; чч → ч; шш → ш; щщ → щ; +# +# +## иа → я; ## TODO: Figure out if/when this applies. +# ::Null; +# +# +# Special exceptions, per GB/T 17693.4-2009 表 1, 注 8: бург } $bow → 堡 ; град } $bow → 格勒 ; город } $bow → 哥罗德 ; цов } $bow → 佐夫 ; +# +# аи → 艾 ; ай → 艾 ; ан } $not_vowel → 安 ; @@ -448,6 +466,8 @@ $vowel { нь → н; лян } $not_vowel → 良 ; ля → 利亚 ; лё → 廖 ; +# +## $not_vowel { л → 勒 ; ## FIXME: Figure out if/when this applies. л → 尔 ; маи → 迈 ; май → 迈 ; @@ -574,6 +594,8 @@ $vowel { нь → н; рян } $not_vowel → 良 ; ря → 里亚 ; рё → 廖 ; +# +## $not_vowel { р → 勒 ; ## FIXME: Figure out if/when this applies. р → 尔 ; саи → 赛 ; сай → 赛 ; @@ -965,6 +987,10 @@ $vowel { нь → н; ян } $not_vowel → 扬 ; я → 亚 ; ё → 约 ; +# +# +# Dong-nan-xi-hai pass. 
Per GB/T 17693.4-2009 表 1, 注 4, replace confusing +# characters at the beginning and end of a word. :: Null (); $bow { 耶 → 叶 ; $bow { 夫 → 弗 ; @@ -973,4 +999,7 @@ $bow { 南 → 楠 ; $bow { 西 → 锡 ; 江 } $bow → 姜 ; 海 } $bow → 亥 ; +# +# ::NFC; + diff --git a/icu4c/source/data/translit/sat_Olck_sat_FONIPA.txt b/icu4c/source/data/translit/sat_Olck_sat_FONIPA.txt new file mode 100644 index 00000000000..c57d5181a4f --- /dev/null +++ b/icu4c/source/data/translit/sat_Olck_sat_FONIPA.txt @@ -0,0 +1,183 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: sat_Olck_sat_FONIPA.txt +# Generated from CLDR +# + +# Santali (Ol Chiki) → Santali (International Phonetic Alphabet) +# Output +# ------ +# m mː n nː ɳ ɳː ɲ ɲː ŋ ŋː +# p pʰ pʼ b bʰ t tʰ tʼ d dʰ ʈ ʈʰ ɖ ɖʰ c cʰ cʼ k kʰ kʼ ɡ ʔ +# s sː h +# d\u0361ʒ +# ɽ r +# l lː +# w wː w\u0303 w\u0303ː +# +# i iː ĩ ĩː u uː ũ ũː +# e eː ẽ ẽː ə əː ə\u0303 ə\u0303ː o oː õ õː +# ɛ ɛː ɛ\u0303 ɛ\u0303ː ɔ ɔː ɔ\u0303 ɔ\u0303ː +# a aː ã ãː +# References +# ---------- +# [1] Michael Everson: Final proposal to encode the Ol Chiki script +# in the UCS. ISO/IEC JTC1/SC2/WG2 Working Group Document N2984R, +# September 21, 2005. http://std.dkuug.dk/jtc1/sc2/wg2/docs/n2984.pdf +# +# [2] George L. Campbell: Compendium of the World's Languages. +# Volume 2: Ladakhi to Zuni. ISBN 0-415-20297-3. Taylor & Francis, 2000. +# Pages 1454 to 1458. +# Notes +# ----- +# According to [1] (page 3), ᱽ can only follow the four ejective +# consonants ᱵ /pʼ/, ᱡ /cʼ/, ᱫ /tʼ/, and ᱜ /kʼ/; these become +# ᱵᱽ /b/, ᱫᱽ /d/, ᱡᱽ /d\u0361ʒ/, and ᱜᱽ /ɡ/. In online texts, however, +# we have occasionally encountered ᱽ following non-ejective plosives, +# for example after ᱯ /p/. These might possibly be typos. 
Our rules +# try to be resilient and handle ᱯᱽ as /b/. +# +# According to [1] (page 2), U+1C7C PHAARKAA follows the four “glottal” +# consonants ᱵ /pʼ/, ᱡ /cʼ/, ᱫ /tʼ/, and ᱜ /kʼ/ (these are actually +# ejective, not glottal). In online texts, however, we have frequently +# encountered ᱼ following non-ejective consonants. +$inword = [[:L:][:M:]]; +# Some online texts use a decomposed form of U+1C7A MU-GAAHLAA TTUDDAG. +ᱹᱸ → ᱺ ; +ᱸᱹ → ᱺ ; +::null(); +# To simplify the rules below, enforce a uniform ordering of marks. +ᱻᱹ → ᱹᱻ ; +ᱻᱸ → ᱸᱻ ; +ᱻᱺ → ᱺᱻ ; +ᱼᱹ → ᱹᱼ ; +ᱼᱸ → ᱸᱼ ; +ᱼᱺ → ᱺᱼ ; +::null(); +# Some online texts use U+1C7C PHAARKAA instead of U+1C7B RELAA for indicating +# long phonemes, presumably because the graphemes look similar in some fonts. +# Since phaarkaa is used for voicing ejectives and plosives (which cannot +# be lengthened), we rewrite phaarkaa to relaa. +[ᱚᱟᱤᱩᱮᱳᱶᱢᱝᱞᱱ] [ᱹᱸᱺ]* {ᱼ} → ᱻ ; +::null(); +ᱚᱹᱻ → ɔː ; +ᱚᱹ → ɔ ; +ᱚᱸᱻ → ɔ\u0303ː ; +ᱚᱸ → ɔ\u0303 ; +ᱚᱺᱻ → ɔ\u0303ː ; +ᱚᱺ → ɔ\u0303 ; +ᱚᱻ → ɔː ; +ᱚ → ɔ ; +ᱛᱼ → t ; +ᱛᱷ → tʰ ; +ᱛᱽ → d ; +$inword {ᱛ} → d ; +ᱛ → t ; +ᱜᱼ → kʼ ; +ᱜᱷ → kʰ ; +ᱜᱽ → ɡ ; +$inword {ᱜ} → ɡ ; +ᱜ → kʼ ; +ᱝᱻ → ŋː ; +ᱝ → ŋ ; +ᱞᱻ → lː ; +ᱞ → l ; +ᱟᱹᱻ → əː ; +ᱟᱹ → ə ; +ᱟᱸᱻ → ãː ; +ᱟᱸ → ã ; +ᱟᱺᱻ → ə\u0303ː ; +ᱟᱺ → ə\u0303 ; +ᱟᱻ → aː ; +ᱟ → a ; +ᱠᱼ → k ; +ᱠᱷ → kʰ ; +ᱠᱽ → ɡ ; +ᱠ → k ; +ᱡᱼ → cʼ ; +ᱡᱷ → cʰ ; +ᱡᱽ → d\u0361ʒ ; +$inword {ᱡ} → d\u0361ʒ ; +ᱡ → cʼ ; +ᱢᱻ → mː ; +ᱢ → m ; +# According to [1], ᱣ is sometimes /v/ and sometimes /w/. +# TODO: Find out if there is a rule for this. +ᱣᱸ → w\u0303 ; +ᱣ → w ; +ᱤᱹᱻ → iː ; +ᱤᱹ → i ; +ᱤᱸᱻ → ĩː ; +ᱤᱸ → ĩ ; +ᱤᱺᱻ → ĩː ; +ᱤᱺ → ĩ ; +ᱤᱻ → iː ; +ᱤ → i ; +ᱥᱻ → sː ; +ᱥ → s ; +# According to [1], ᱦ is sometimes /h/ and sometimes /ʔ/. +# TODO: Find out if there is a rule for this. 
+ᱦ → h ; +ᱧᱻ → ɲː ; +ᱧ → ɲ ; +ᱨᱻ → r ; +ᱨ → r ; +ᱩᱹᱻ → uː ; +ᱩᱹ → u ; +ᱩᱸᱻ → ũː ; +ᱩᱸ → ũ ; +ᱩᱺᱻ → ũː ; +ᱩᱺ → ũ ; +ᱩᱻ → uː ; +ᱩ → u ; +ᱪᱼ → c ; +ᱪᱷ → cʰ ; +ᱪᱽ → d\u0361ʒ ; +ᱪ → c ; +ᱫᱼ → tʼ ; +ᱫᱷ → tʰ ; +ᱫᱽ → d ; +$inword {ᱫ} → d ; +ᱫ → tʼ ; +ᱬᱻ → ɳː ; +ᱬ → ɳ ; +# TODO: ᱵᱷᱭᱨᱚᱵ → bʰhrɔb seems unlikely; would be good to verify. +ᱭ → h ; +ᱮᱹᱻ → ɛː ; +ᱮᱹ → ɛ ; +ᱮᱺᱻ → ɛ\u0303ː ; +ᱮᱺ → ɛ\u0303 ; +ᱮᱸᱻ → ẽː ; +ᱮᱸ → ẽ ; +ᱮᱻ → eː ; +ᱮ → e ; +ᱯᱼ → p ; +ᱯᱷ → pʰ ; +ᱯᱽ → b ; +ᱯ → p ; +ᱰᱷ → ɖʰ ; +ᱰ → ɖ ; +ᱱᱻ → nː ; +ᱱ → n ; +ᱲᱻ → ɽ ; +ᱲ → ɽ ; +ᱳᱸᱻ → õː ; +ᱳᱸ → õ ; +ᱳᱻ → oː ; +ᱳ → o ; +ᱴᱼ → ʈ ; +ᱴᱷ → ʈʰ ; +ᱴᱽ → ɖ ; +ᱴ → ʈ ; +ᱵᱼ → pʼ ; +ᱵᱷ → bʰ ; +ᱵᱽ → b ; +$inword {ᱵ} → b ; +ᱵ → pʼ ; +ᱶᱻ → w\u0303ː ; +ᱶ → w\u0303 ; + diff --git a/icu4c/source/data/translit/sat_am.txt b/icu4c/source/data/translit/sat_am.txt new file mode 100644 index 00000000000..cd9a584af81 --- /dev/null +++ b/icu4c/source/data/translit/sat_am.txt @@ -0,0 +1,15 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: sat_am.txt +# Generated from CLDR +# + +# TODO: Add other scripts (eg. sat-Beng) once we can transcribe them to IPA. +# Do this in a separate rule for "sat-sat_FONIPA", so it can be reused. +::sat_Olck-sat_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/sat_ar.txt b/icu4c/source/data/translit/sat_ar.txt new file mode 100644 index 00000000000..9a17b95c823 --- /dev/null +++ b/icu4c/source/data/translit/sat_ar.txt @@ -0,0 +1,15 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: sat_ar.txt +# Generated from CLDR +# + +# TODO: Add other scripts (eg. sat-Beng) once we can transcribe them to IPA. +# Do this in a separate rule for "sat-sat_FONIPA", so it can be reused. +::sat_Olck-sat_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/sat_fa.txt b/icu4c/source/data/translit/sat_fa.txt new file mode 100644 index 00000000000..f35e2ff0138 --- /dev/null +++ b/icu4c/source/data/translit/sat_fa.txt @@ -0,0 +1,15 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: sat_fa.txt +# Generated from CLDR +# + +# TODO: Add other scripts (eg. sat-Beng) once we can transcribe them to IPA. +# Do this in a separate rule for "sat-sat_FONIPA", so it can be reused. +::sat_Olck-sat_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/si_am.txt b/icu4c/source/data/translit/si_am.txt new file mode 100644 index 00000000000..5862015ad0d --- /dev/null +++ b/icu4c/source/data/translit/si_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: si_am.txt +# Generated from CLDR +# + +::si-si_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/si_ar.txt b/icu4c/source/data/translit/si_ar.txt new file mode 100644 index 00000000000..d23442b3ff3 --- /dev/null +++ b/icu4c/source/data/translit/si_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: si_ar.txt +# Generated from CLDR +# + +::si-si_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/si_fa.txt b/icu4c/source/data/translit/si_fa.txt new file mode 100644 index 00000000000..b28c18c90e7 --- /dev/null +++ b/icu4c/source/data/translit/si_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: si_fa.txt +# Generated from CLDR +# + +::si-si_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/si_si_FONIPA.txt b/icu4c/source/data/translit/si_si_FONIPA.txt new file mode 100644 index 00000000000..58e98139230 --- /dev/null +++ b/icu4c/source/data/translit/si_si_FONIPA.txt @@ -0,0 +1,166 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: si_si_FONIPA.txt +# Generated from CLDR +# + +# Sinhala pronunciation rules +# +# Output +# k ɡ ŋ ᵑɡ c ɟ ɲ ʈ ɖ ⁿɖ t d n ⁿd p b m ᵐb j r l w ʃ s h f +# ə əː a aː æ æː i iː u uː e eː o oː +# +# References +# [1] Asanka Wasala, Ruvan Weerasinghe, and Kumudu Gamage: +# Sinhala Grapheme-to-Phoneme Conversion and Rules for Schwa Epenthesis. +# Proceedings of the COLING/ACL 2006 Main Conference Poster Sessions, +# pages 890–897. http://www.aclweb.org/anthology/P06-2114 +# Simplify ya + yansaya to plain ya after a consonant. +[\u0D9A-\u0DC6] \u0DCA (\u200D)? { ය\u0DCA‍ය → ය; +# Delete ZWNJ and ZWJ to simplify further processing. +\u200C → ; +\u200D → ; +# Insert a schwa after every consonant that is not followed by a dependent vowel +# or virama. +::Null; +([\u0D9A-\u0DC6]) } [^\u0DCA-\u0DDF \u0DF2\u0DF3] → $1 ə; +# Pronunciation rules proper. +::Null; +# fප is an alternative spelling of ෆ. +# This occurs e.g. in ඩේව\u0DD2ඩ\u0DCA කොපර\u0DCAfප\u0DD3ල\u0DCAඩ\u0DCA (David Copperfield) +# [see http://bradshawofthefuture.blogspot.com/2013/02/f.html]. +[Ff]ප → f; +# zස is seemingly the only way to unambiguously indicate a voiced /z/ sound. +# This occurs in e.g. ඇල\u0DCAzසය\u0DD2ම' රෝගය (Alzheimer's disease) +# [see https://si.wikipedia.org/wiki/ඇල\u0DCAzසය\u0DD2ම%27_රෝගය] +# or in zස\u0DD3බ\u0DCA‍රා (zebra) [see https://si.wikipedia.org/wiki/‍zස\u0DD3බ\u0DCA‍රා]. 
+[Zz]ස → z; +ං → ŋ; +o → ŋ; # common substitution for anusvaraya +ඃ ([\u0D9A-\u0DC6]) → | $1 \u0DCA $1; # TODO: check which consonants geminate +ඃ → h; +අ → a; +ආ → aː; +ඇ → æ; +ඈ → æː; +ඉ → i; +ඊ → iː; +උ → u; +ඌ → uː; +ඍ → ri; +ඎ → ruː; +ඏ → ilu; +ඐ → iluː; +එ → e; +ඒ → eː; +ඓ → aj; +ඔ → o; +ඕ → oː; +ඖ → aw; # TODO: check if this is correct +ක → k; +ඛ → k; +ග → ɡ; +ඝ → ɡ; +ඞ → ŋ; +ඟ → ᵑɡ; +ච → c; +ඡ → c; +ජ → ɟ; +ඣ → ɟ; +ඤ → ɲ; +ඥ → kɲ; # TODO: double-check +ඦ → ɟ; +ට → ʈ; +ඨ → ʈ; +ඩ → ɖ; +ඪ → ɖ; +ණ → n; +ඬ → ⁿɖ; +ත → t; +ථ → t; +ද → d; +ධ → d; +න → n; +ඳ → ⁿd; +ප → p; +ඵ → p; +බ → b; +භ → b; +ම → m; +ඹ → ᵐb; +ය → j; +ර → r; +ල → l; +ව → w; +ශ → ʃ; +ෂ → ʃ; +ස → s; +හ → h; +ළ → l; +ෆ → f; +\u0DCA → ; # delete virama +ා → aː; +ැ → æ; +ෑ → æː; +\u0DD2 → i; +\u0DD3 → iː; +\u0DD4 → u; +\u0DD6 → uː; +ෘ → ru; +ෙ → e; +ේ → eː; +ෛ → aj; +ො → o; +ෝ → oː; +ෞ → aw; # TODO: check if this is correct +ෟ → lu; +ෲ → ruː; +ෳ → luː; +# Heuristics for turning /ə/ into /a/. Based on [1]. +$c=[k ɡ ŋ {ᵑɡ} c ɟ ɲ ʈ ɖ {ⁿɖ} t d n {ⁿd} p b m {ᵐb} j r l w ʃ s z h f]; +$s=[:^L:]; +# Rule #1 +::Null; +$s sv { ə → ə; # exception (a) +$s k { ə } r → ə; # exception (b) +$s $c { ə } $s → ə; # exception (c) +$s $c $c { ə → a; +$s $c { ə → a; +# Rule #2 +::Null; +$c r { ə } $c → a; # clause (a) and (b) +$c r { a } h → a; # clause (d), exception +$c r { a } $c → ə; # clause (c) +# Rule #3 +# The paper is unclear about what this rule means. The interpretation here +# assumes that "preceded" in the paper is a typo and should be read "followed". +::Null; +[a e æ o ə] h { ə → a; +# Rules #4 through #7 +::Null; +ə } $c $c → a; # Rule #4 +ə } [rbɖʈ] $s → ə; # Rule #5 exception +ə } $c $s → a; # Rule #5 +ə } ji $s → a; # Rule #6 +k { ə } [rl] u → a; # Rule #7 +# Rule #8 +# Note that the paper doesn't say explicitly that this rule should be +# anchored at the beginning of a word, but the remarks before the rules +# seem to imply this. +::Null; +$s k { a } l[aeo]ːj → ə; # Typo in paper: /j/ was /y/. 
+$s k { a } le[mh][ui] → ə; +$s k { alə } h[ui] → əle; +$s k { a } lə → ə; +# Diphthongs +::Null; +www+ → ww; # යෞව\u0DCAවන +[i {iː} e {eː} æ {æː} o {oː} a {aː}] { wu → w; +əji → aj; +iji → iː; # perhaps: ij +[u {uː} e {eː} æ {æː} o {oː} a {aː}] { ji → j; + diff --git a/icu4c/source/data/translit/si_si_Latn.txt b/icu4c/source/data/translit/si_si_Latn.txt new file mode 100644 index 00000000000..393a440ae66 --- /dev/null +++ b/icu4c/source/data/translit/si_si_Latn.txt @@ -0,0 +1,103 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: si_si_Latn.txt +# Generated from CLDR +# + +# Based on http://en.wiktionary.org/wiki/Wiktionary:Sinhalese_transliteration +::[[:Sinh:][\u200C\u200D]]; +::NFKC; +# Delete ZWNJ and ZWJ to simplify further processing. +\u200C → ; +\u200D → ; +# Insert "a" after every consonant that is not followed by a dependent vowel +# or virama. 
+::Null; +([\u0D9A-\u0DC6]) } [^\u0DCA-\u0DDF] → $1 a; +::Null; +ක → k; +ට → ṭ; +ත → t; +ප → p; +ග → g; +ඩ → ḍ; +ද → d; +බ → b; +[Zz]ස → z; +ස → s; +ච → c; +ම → m; +ල → l; +ව → v; +ණ → ṇ; +හ → h; +ජ → j; +න → n; +ර → r; +ය → y; +ළ → ḷ; +අ → a; +එ → e; +ඉ → i; +ඔ → o; +උ → u; +ඇ → æ; +ෙ → e; +\u0DD2 → i; +ො → o; +\u0DD4 → u; +ැ → æ; +ආ → ā; +ඒ → ē; +ඊ → ī; +ඕ → ō; +ඌ → ū; +ඈ → ǣ; +ා → ā; +ේ → ē; +\u0DD3 → ī; +ෝ → ō; +\u0DD6 → ū; +ෑ → ǣ; +ඟ → n\u0306g; +ඬ → n\u0306ḍ; +ඳ → n\u0306d; +ඹ → m\u0306b; +ඛ → kh; +ඨ → ṭh; +ථ → th; +ඵ → ph; +ඝ → gh; +ඪ → ḍh; +ධ → dh; +භ → bh; +ශ → ś; +ඡ → ch; +ඤ → ñ; +ඞ → ṅ; +ඦ → n\u0306j; +ෂ → ṣ; +ඣ → jh; +ඥ → gn; +ෆ → f; +[Ff]ප → f; +ඓ → ai; +ඍ → ṛ; +ඏ → ḷ; +ෛ → ai; +ෘ → ṛ; +ෟ → ḷ; +ඖ → au; +ඎ → ṝ; +ඐ → ḹ; +ෞ → au; +ෲ → ṝ; +ෳ → ḹ; +ඃ → ḥ; +ං → ṁ; +\u0DCA → ; + diff --git a/icu4c/source/data/translit/sk_FONIPA_ja.txt b/icu4c/source/data/translit/sk_FONIPA_ja.txt index ea8796f705e..adbf0106500 100644 --- a/icu4c/source/data/translit/sk_FONIPA_ja.txt +++ b/icu4c/source/data/translit/sk_FONIPA_ja.txt @@ -1,14 +1,20 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: sk_FONIPA_ja.txt # Generated from CLDR # + +# Phonemic transcription of Slovak into Katakana. $vowel = [aeiouw] ; # Vowels and glides $not_vowel = [^$vowel] ; +# +# +# First pass: Collapse phonetic distinctions that are not preserved in Katakana +# t \' → | t ʃ ; t \u0361 → | t ; d \u0361 ʒ → | ʒ ; @@ -25,6 +31,9 @@ u\u032F → | u ; ɱ → | m ; ʎ → | l; ::Null; +([bcdfghklmnprstvzʃʒ]) ː → $1 $1; +::Null; +# Main pass: Phoneme to Katakana conversion. 
a → ア; ba → バ; be → ベ; @@ -32,31 +41,43 @@ bi → ビ; bo → ボ; bu → ブ; b → ブ; +# +# ca → チャ ; # not backed by data ce → チェ ; ci → チ ; cu → チュ ; # not backed by data co → チョ ; # not backed by data c → チ ; +# +# da → ダ ; de → デ ; di → ディ ; do → ド ; du → ドゥ ; d → ド ; +# +# e → エ ; +# +# fa → ファ ; fe → フェ ; fi → フィ ; fo → フォ ; fu → フ ; f → フ ; +# +# ga → ガ; ge → ゲ; gi → ギ; go → ゴ; gu → グ; g → グ; +# +# ha → ハ ; hwe → フェ ; he → ヘ ; @@ -64,31 +85,43 @@ hi → ヒ ; # not backed by data ho → ホ ; hu → フ ; h → フ ; +# +# ^ { ia → ヤ ; i → イ ; +# +# ja → ヤ ; je → イェ ; ji → イ ; # not backed by data jo → ヨ ; ju → ユ ; +# +# ka → カ ; ke → ケ ; ki → キ ; ko → コ ; ku → ク ; k → ク ; +# +# l \' a → リヤ ; l \' e → レ ; # not backed by data l \' i → リ ; # not backed by data l \' o → リヨ ; # not backed by data l \' u → リユ ; # not backed by data l \' → リ ; +# +# la → ラ ; le → レ ; li → リ ; lo → ロ ; lu → ル ; l → ル ; +# +# ma → マ ; me → メ ; mi → ミ ; @@ -96,37 +129,51 @@ mo → モ ; mu → ム ; m } [bp] → ン ; m → ム ; +# +# ɲa → ニャ ; ɲe → ネ ; ɲi → ニ ; ɲo → ニョ ; # not backed by data ɲu → ニュ ; # not backed by data ɲ → ニ ; +# +# na → ナ ; ne → ネ ; ni → ニ ; no → ノ ; nu → ヌ ; n → ン ; +# +# o → オ ; +# +# pa → パ ; pe → ペ ; pi → ピ ; po → ポ ; pu → プ ; p → プ ; +# +# ra → ラ ; re → レ ; ri → リ ; ro → ロ ; ru → ル ; r → ル; +# +# sa → サ ; se → セ ; si → シ ; so → ソ ; su → ス ; s → ス ; +# +# ʃa → シャ ; ʃe → シェ ; ʃio → ショ ; @@ -134,11 +181,15 @@ s → ス ; ʃo → ショ ; ʃu → シュ ; ʃ → シュ ; +# +# ta → タ ; te → テ ; ti → ティ ; to → ト ; tu → トゥ ; +# +# tʃa → チャ ; tʃea → チャ ; tʃe → チェ ; @@ -147,6 +198,8 @@ tʃi → チ ; tʃo → チョ ; tʃu → チュ ; tʃ → チュ ; +# +# tsa → チャ ; tse → ツェ ; tsi → ツィ; @@ -154,31 +207,43 @@ tso → ツォ ; tsu → ツ ; ts → ツ ; t → ト ; +# +# u → ウ ; +# +# va → バ ; ve → ベ ; vu → ブ ; vi → ビ ; vo → ボ ; v → ヴ ; +# +# wa → ワ ; we → エ ; # not backed by data wi → イ ; # not backed by data wo → オ ; # not backed by data wu → ウ ; # not backed by data w → ウ ; +# +# xa → ハ ; xe → ヘ ; # not backed by data xi → ヒ ; # not backed by data xo → ホ ; xu → フ ; # 
not backed by data x → フ ; +# +# za → ザ ; ze → ゼ ; zi → ジ ; zo → ゾ ; zu → ズ ; z → ズ ; +# +# ʒa → ジャ ; ʒea → ジャ ; ʒe → ジェ ; @@ -187,5 +252,10 @@ z → ズ ; ʒo → ジョ ; ʒu → ジュ ; ʒ → ジュ ; +# +# ː → ー ; ' ' → ・; +# +# + diff --git a/icu4c/source/data/translit/sk_am.txt b/icu4c/source/data/translit/sk_am.txt new file mode 100644 index 00000000000..5c4dc7be053 --- /dev/null +++ b/icu4c/source/data/translit/sk_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: sk_am.txt +# Generated from CLDR +# + +::sk-sk_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/sk_ar.txt b/icu4c/source/data/translit/sk_ar.txt new file mode 100644 index 00000000000..17985be1ca2 --- /dev/null +++ b/icu4c/source/data/translit/sk_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: sk_ar.txt +# Generated from CLDR +# + +::sk-sk_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/sk_fa.txt b/icu4c/source/data/translit/sk_fa.txt new file mode 100644 index 00000000000..ba50a524bde --- /dev/null +++ b/icu4c/source/data/translit/sk_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: sk_fa.txt +# Generated from CLDR +# + +::sk-sk_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/sk_ja.txt b/icu4c/source/data/translit/sk_ja.txt index f7dab51f761..87a1a5653c1 100644 --- a/icu4c/source/data/translit/sk_ja.txt +++ b/icu4c/source/data/translit/sk_ja.txt @@ -1,11 +1,13 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: sk_ja.txt # Generated from CLDR # + ::sk-sk_FONIPA; ::sk_FONIPA-ja; + diff --git a/icu4c/source/data/translit/sk_sk_FONIPA.txt b/icu4c/source/data/translit/sk_sk_FONIPA.txt index ee24e286e81..405f22e0fa9 100644 --- a/icu4c/source/data/translit/sk_sk_FONIPA.txt +++ b/icu4c/source/data/translit/sk_sk_FONIPA.txt @@ -1,20 +1,38 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: sk_sk_FONIPA.txt # Generated from CLDR # + +# Slovak orthography to phonemic transcription. +# http://en.wikipedia.org/wiki/Slovak_language +# http://en.wikipedia.org/wiki/Slovak_alphabet +# +# Letters that trigger softening; also triggered at end of word. +# Softening also occurs before "ch", but that starts in "c" so it +# is included in the set below. $soften = [ptťkcčsš$] ; +# +# +# Transform input to normalized form NFC, and to lowercase. 
+ :: NFC () ; :: Lower () ; +# +# +# digraphs ch → x ; +# +# dž } $soften → t \u0361 ʃ ; # affricate indicated by ligature tie dz } $soften → t \u0361 s ; dž → d \u0361 ʒ ; dz → d \u0361 z ; +# +# a → a ; á → aː ; ä → ɛ ; @@ -38,9 +56,10 @@ i → ɪ ; í → iː ; j → j ; k → k ; +ľ → ʎ ; +l\' → ʎ ; l → l ; ĺ → l\u0329ː ; -ľ → ʎ ; m } [fv] → ɱ ; m → m ; n } [kg] → ŋ ; @@ -57,8 +76,9 @@ r → r ; s → s ; š → ʃ ; t } [ie] → c ; -t → t ; +t\' → c ; ť → c ; +t → t ; u → u ; ú → uː ; vz } $soften → fs ; @@ -73,3 +93,18 @@ z } $soften → s ; z → z ; ž } $soften → ʃ ; ž → ʒ ; +::null; +bb → bː; +cc → cː; +dd → dː; +ff → fː; +nn → nː; +pp → pː; +rr → rː; +ss → sː; +tt → tː; +vv → vː; +zz → zː; +ʃʃ → ʃː; +ʒʒ → ʒː; + diff --git a/icu4c/source/data/translit/Serbian_Latin_BGN.txt b/icu4c/source/data/translit/sr_sr_Latn_BGN.txt similarity index 70% rename from icu4c/source/data/translit/Serbian_Latin_BGN.txt rename to icu4c/source/data/translit/sr_sr_Latn_BGN.txt index 29d3ab34d13..63281816306 100644 --- a/icu4c/source/data/translit/Serbian_Latin_BGN.txt +++ b/icu4c/source/data/translit/sr_sr_Latn_BGN.txt @@ -1,14 +1,41 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Serbian_Latin_BGN.txt +# File: sr_sr_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN Agreement +# +# Serbian is transliterated as Croatian.
+# +# The Serbian Alphabet as defined by the BGN (Page 95): +# +# АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ +# абвгдђежзијклљмнњопрстћуфхцчџш +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Serbian-Latin +# :: [АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $upperConsonants = [БВГДЂЖЗЈКЛЉМНЊПРСТЋФХЦЧЏШ] ; $lowerConsonants = [бвгдђжзјклљмнњпрстћфхцчџш] ; $consonants = [$upperConsonants $lowerConsonants] ; @@ -16,7 +43,22 @@ $upperVowels = [АЕИОУ] ; $lowerVowels = [аеиоу] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE @@ -80,3 +122,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; џ → dž ; # CYRILLIC SMALL LETTER SHA Ш → Š ; # CYRILLIC CAPITAL LETTER SHA ш → š ; # CYRILLIC SMALL LETTER SHA +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/Turkmen_Latin_BGN.txt b/icu4c/source/data/translit/tk_Cyrl_tk_BGN.txt similarity index 50% rename from icu4c/source/data/translit/Turkmen_Latin_BGN.txt rename to 
icu4c/source/data/translit/tk_Cyrl_tk_BGN.txt index ea78ac8f187..fee9396907a 100644 --- a/icu4c/source/data/translit/Turkmen_Latin_BGN.txt +++ b/icu4c/source/data/translit/tk_Cyrl_tk_BGN.txt @@ -1,13 +1,44 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Turkmen_Latin_BGN.txt +# File: tk_Cyrl_tk_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1979 System +# +# The BGN/PCGN system for Turkmen was designed for use in +# romanizing names written in the Turkmen alphabet. +# The Turkmen alphabet contains five letters not present +# in the Russian alphabet: Җҗ, Ңң, Өө, Үү, and Әә. +# +# The Turkmen Cyrillic Alphabet as defined by the BGN (Page 103): +# +# АБВГДЕЁЖҖЗИЙКЛМНҢОӨПРСТУҮФХЦЧШЩЪЫЬЭӘЮЯ +# абвгдеёжзҗийклмнңоөпрстуүфхцчшщъыьэәюя +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Turkmen-Latin +# +# :: [АБВГДЕЁЖҖЗИЙКЛМНҢОӨПРСТУҮФХЦЧШЩЪЫЬЭӘЮЯабвгдеёжзҗийклмнңоөпрстуүфхцчшщъыьэәюя] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ʹ ; $doublePrime = ʺ ; $upperConsonants = [БВГДЖҖЗЙКЛМНҢПРСТФХЦЧШЩЪЬ] ; @@ -17,7 +48,22 @@ $upperVowels = [АЕЁИОӨУҮЫЭӘЮЯ] ; $lowerVowels = [аеёиоөуүыэәюя] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# 
http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE @@ -28,6 +74,18 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; г → g ; # CYRILLIC SMALL LETTER GHE Д → D ; # CYRILLIC CAPITAL LETTER DE д → d ; # CYRILLIC SMALL LETTER DE +# +# +######################################################################## +# +# BGN Page 104 Rule 1: +# +# The character e should be romanized ye initially, after the vowel +# characters a, e, ё, и, о, ө, у, ү, ы, э, ю, and я, and after й, ъ, and ь. +# In all other instances, it should be romanized e. +# +######################################################################## +# Е}[[$upperVowels - [Ә]] [ЙЪЬ]] → YE ; # CYRILLIC CAPITAL LETTER IE Е}[[$lowerVowels - [ә]] [йъь]] → Ye ; # CYRILLIC CAPITAL LETTER IE $wordBoundary{Е → Ye ; # CYRILLIC CAPITAL LETTER IE @@ -35,17 +93,46 @@ $wordBoundary{Е → Ye ; # CYRILLIC CAPITAL LETTER IE е}[[$upperVowels - [Ә]] [$lowerVowels - [ә]] [ЙйЪъЬь]] → ye ; # CYRILLIC SMALL LETTER IE $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE е → e ; # CYRILLIC SMALL LETTER IE +# +# +######################################################################## +# +# End of Rule 1 +# +######################################################################## +# Ё} $lower → Yo ; # CYRILLIC CAPITAL LETTER IO Ё → YO ; # CYRILLIC CAPITAL LETTER IO ё → yo ; # CYRILLIC SMALL LETTER IO Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE ж → zh ; # CYRILLIC SMALL LETTER ZHE +# +# 
+######################################################################## +# +# BGN Page 104 Rule 2 +# +# The character sequences зх, нг, сх, and цх may be romanized z·h, +# n·g, s·h, and ts·h in order to differentiate those romanizations from +# the digraphs zh, ng, sh, and the letter sequence tsh, which are used +# to render the characters ж, ң, ш, and the character sequence тш. +# +######################################################################## +# ЗХ → Z·H ; # CYRILLIC CAPITAL LETTER ZE Зх → Z·h ; # CYRILLIC CAPITAL LETTER ZE зх → z·h ; # CYRILLIC SMALL LETTER ZE З → Z ; # CYRILLIC CAPITAL LETTER ZE з → z ; # CYRILLIC SMALL LETTER ZE +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# И → И ; # CYRILLIC CAPITAL LETTER I и → и ; # CYRILLIC SMALL LETTER I Й → Y ; # CYRILLIC CAPITAL LETTER I @@ -56,11 +143,29 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE л → l ; # CYRILLIC SMALL LETTER EL М → M ; # CYRILLIC CAPITAL LETTER EM м → m ; # CYRILLIC SMALL LETTER EM +# +# +######################################################################## +# +# BGN Page 104 Rule 2 +# +# нг becomes n·g +# +######################################################################## +# НГ → N·G ; # CYRILLIC CAPITAL LETTER EN Нг → N·g ; # CYRILLIC CAPITAL LETTER EN нг → n·g ; # CYRILLIC SMALL LETTER EN Н → N ; # CYRILLIC CAPITAL LETTER EN н → n ; # CYRILLIC SMALL LETTER EN +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# Ң} $lower → Ng ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER Ң → NG ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER ң → ng ; # CYRILLIC SMALL LETTER EN WITH DESCENDER @@ -72,11 +177,29 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE п → p ; # CYRILLIC SMALL LETTER PE Р → R ; # CYRILLIC CAPITAL LETTER ER р
→ r ; # CYRILLIC SMALL LETTER ER +# +# +######################################################################## +# +# BGN Page 104 Rule 2 +# +# сх becomes s·h +# +######################################################################## +# СХ → S·H ; # CYRILLIC CAPITAL LETTER ES Сх → S·h ; # CYRILLIC CAPITAL LETTER ES сх → s·h ; # CYRILLIC SMALL LETTER ES С → S ; # CYRILLIC CAPITAL LETTER ES с → s ; # CYRILLIC SMALL LETTER ES +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# Т → T ; # CYRILLIC CAPITAL LETTER TE т → t ; # CYRILLIC SMALL LETTER TE У → U ; # CYRILLIC CAPITAL LETTER U @@ -87,15 +210,43 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE ф → f ; # CYRILLIC SMALL LETTER EF Х → H ; # CYRILLIC CAPITAL LETTER HA х → h ; # CYRILLIC SMALL LETTER HA +# +# +######################################################################## +# +# BGN Page 104 Rule 2 +# +# цх becomes ts·h +# +######################################################################## +# ЦХ → TS·H ; # CYRILLIC CAPITAL LETTER GHE Цх → Ts·h ; # CYRILLIC CAPITAL LETTER GHE цх → ts·h ; # CYRILLIC SMALL LETTER GHE Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE Ц → TS ; # CYRILLIC CAPITAL LETTER TSE ц → ts ; # CYRILLIC SMALL LETTER TSE +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE Ч → CH ; # CYRILLIC CAPITAL LETTER CHE ч → ch ; # CYRILLIC SMALL LETTER CHE +# +# +######################################################################## +# +# Implied rule from BGN Russian-Latin transliteration (Page 94 Note 3.6). 
+# +# шч becomes sh·ch +# +######################################################################## +# ШЧ → SH·CH ; # CYRILLIC CAPITAL LETTER SHA Шч → Sh·ch ; # CYRILLIC CAPITAL LETTER SHA шч → sh·ch ; # CYRILLIC SMALL LETTER SHA @@ -105,10 +256,43 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE Щ} $lower → Shch ; # CYRILLIC CAPITAL LETTER SHCHA Щ → SHCH ; # CYRILLIC CAPITAL LETTER SHCHA щ → shch ; # CYRILLIC SMALL LETTER SHCHA +# +# +######################################################################## +# +# End Implied rule +# +######################################################################## +# Ъ → $doublePrime ; # CYRILLIC CAPITAL LETTER HARD SIGN ъ → $doublePrime ; # CYRILLIC SMALL LETTER HARD SIGN +# +# +######################################################################## +# +# BGN Page 104 Note 3 +# +# The character Ыы may be romanized Ɨɨ instead of Yy, if so desired. +# +######################################################################## +# Ы → Y ; # CYRILLIC CAPITAL LETTER YERU ы → y ; # CYRILLIC SMALL LETTER YERU +# +# +# Alternative rule to implement the option described here. To apply +# uncomment the following by removing the '#' mark at the start of the +# line and insert before the two rule lines above. 
+# +#Ы → Ɨ ; # CYRILLIC CAPITAL LETTER YERU +#ы → ɨ ; # CYRILLIC SMALL LETTER YERU +# +######################################################################## +# +# End BGN Page 104 Note 3 +# +######################################################################## +# Ь → $prime ; # CYRILLIC CAPITAL LETTER SOFT SIGN ь → $prime ; # CYRILLIC SMALL LETTER SOFT SIGN Э → E ; # CYRILLIC CAPITAL LETTER E @@ -121,3 +305,7 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA Я → YA ; # CYRILLIC CAPITAL LETTER YA я → ya ; # CYRILLIC SMALL LETTER YA +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/tlh_am.txt b/icu4c/source/data/translit/tlh_am.txt new file mode 100644 index 00000000000..6d98695b29c --- /dev/null +++ b/icu4c/source/data/translit/tlh_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: tlh_am.txt +# Generated from CLDR +# + +::tlh-tlh_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/tlh_ar.txt b/icu4c/source/data/translit/tlh_ar.txt new file mode 100644 index 00000000000..96c888b6766 --- /dev/null +++ b/icu4c/source/data/translit/tlh_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
+# * +# *************************************************************************** +# File: tlh_ar.txt +# Generated from CLDR +# + +::tlh-tlh_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/tlh_fa.txt b/icu4c/source/data/translit/tlh_fa.txt new file mode 100644 index 00000000000..c50b897e7f4 --- /dev/null +++ b/icu4c/source/data/translit/tlh_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: tlh_fa.txt +# Generated from CLDR +# + +::tlh-tlh_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/tlh_tlh_FONIPA.txt b/icu4c/source/data/translit/tlh_tlh_FONIPA.txt index dacf0adc5a8..c23253c119d 100755 --- a/icu4c/source/data/translit/tlh_tlh_FONIPA.txt +++ b/icu4c/source/data/translit/tlh_tlh_FONIPA.txt @@ -1,6 +1,6 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** @@ -8,10 +8,14 @@ # Generated from CLDR # +# Transformation from Klingon (tlh) to its IPA transcription (tlh_FONIPA). +# http://en.wikipedia.org/wiki/Klingon_language#Phonology +# http://www.kli.org/tlh/sounds.html ::NFC; - tlh → t\u0361ɬ; - +# Diphthongs as per http://www.kli.org/tlh/sounds.html. For those that +# exist in English, we follow the American English pronunciation.
+# http://en.wikipedia.org/wiki/Diphthong#English aw → aʊ\u032F; # similar to English ‹cow› ew → ɛʊ\u032F; # does not exist in English Iw → ɪʊ\u032F; # des not exist English @@ -20,11 +24,9 @@ ey → eɪ\u032F; # similar to English ‹may› Iy → ɪː; # KLI says like English ‹key›, which is not a diphthong oy → oɪ\u032F; # KLI says English ‹boy›, which would be [ɔɪ\u032F], but ‹o› is [o] uy → uɪ\u032F; # similar to English ‹gooey› but in on syllable - ch → t\u0361ʃ; gh → ɣ; ng → ŋ; - p → pʰ; t → tʰ; q → qʰ; @@ -43,13 +45,11 @@ r → r; w → w; l → l; y → j; - a → ɑ; e → ɛ; I → ɪ; o → o; u → u; - \- → ; - ::NFC; + diff --git a/icu4c/source/data/translit/tr_Lower.txt b/icu4c/source/data/translit/tr_Lower.txt index da10a1e8743..7b86e1d87b1 100644 --- a/icu4c/source/data/translit/tr_Lower.txt +++ b/icu4c/source/data/translit/tr_Lower.txt @@ -1,13 +1,22 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: tr_Lower.txt # Generated from CLDR # + +# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri +# 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE İ→i; +# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. +# This matches the behavior of the canonically equivalent I-dot_above +# 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE +# When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 
+# 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; I→ı ; ::Any-Lower(); + diff --git a/icu4c/source/data/translit/tr_Title.txt b/icu4c/source/data/translit/tr_Title.txt index 541f908a1e7..c623e7b2ff8 100644 --- a/icu4c/source/data/translit/tr_Title.txt +++ b/icu4c/source/data/translit/tr_Title.txt @@ -1,14 +1,20 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: tr_Title.txt # Generated from CLDR # + +# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri +# Make any string of letters after a cased letter be lower, with rules for i [:cased:] [:case-ignorable:]* { İ → i; [:cased:] [:case-ignorable:]* { I → ı; [:cased:] [:case-ignorable:]* { (.) → &Any-Lower($1) ; +# Otherwise all lowercase go to upper (titlecase stay as is) i→İ ; ([:Lowercase:]) → &Any-Upper($1) ; +# do later I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; + diff --git a/icu4c/source/data/translit/tr_Upper.txt b/icu4c/source/data/translit/tr_Upper.txt index 7af47b8ccbc..0b0e1254959 100644 --- a/icu4c/source/data/translit/tr_Upper.txt +++ b/icu4c/source/data/translit/tr_Upper.txt @@ -1,11 +1,17 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: tr_Upper.txt # Generated from CLDR # + +# Copyright (C) 2011-2013, Apple Inc.; Unicode, Inc.; and others. All Rights Reserved. 
+# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri +# When uppercasing, i turns into a dotted capital I +# 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I i→İ; ::Any-Upper(); + diff --git a/icu4c/source/data/translit/Ukrainian_Latin_BGN.txt b/icu4c/source/data/translit/uk_uk_Latn_BGN.txt similarity index 52% rename from icu4c/source/data/translit/Ukrainian_Latin_BGN.txt rename to icu4c/source/data/translit/uk_uk_Latn_BGN.txt index e6349978bda..88a42d097b6 100644 --- a/icu4c/source/data/translit/Ukrainian_Latin_BGN.txt +++ b/icu4c/source/data/translit/uk_uk_Latn_BGN.txt @@ -1,14 +1,44 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Ukrainian_Latin_BGN.txt +# File: uk_uk_Latn_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1979 System +# +# The BGN/PCGN system for Ukrainian was designed for use in romanizing +# names written in the Ukrainian alphabet. The Ukrainian alphabet +# contains five letters not present in the Russian alphabet: +# Ґґ, Єє, Іі, Її, and ’. 
+# +# The Ukrainian Alphabet as defined by the BGN (Page 105): +# +# АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЮЯЬ +# абвгґдеєжзиіїйклмнопрстуфхцчшщюяь’ +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Ukrainian-Latin +# :: [АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЮЯЬабвгґдеєжзиіїйклмнопрстуфхцчшщюяь’] ; :: NFC ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ʹ ; $doublePrime = ʺ ; $upperConsonants = [БВГҐДЖЗЙКЛМНПРСТФХЦЧШЩЬ] ; @@ -18,17 +48,49 @@ $upperVowels = [АЕЄИІЇОУЮЯ] ; $lowerVowels = [аеєиіїоуюя] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE б → b ; # CYRILLIC SMALL LETTER BE В → V ; # CYRILLIC CAPITAL LETTER VE в → v ; # CYRILLIC SMALL LETTER VE +# +# +######################################################################## +# +# Comment. The BGN gives h as the transliteration for both г and ґ. +# This is an error: г is h and ґ is g. 
+# +######################################################################## +# Г → H ; # CYRILLIC CAPITAL LETTER GHE г → h ; # CYRILLIC SMALL LETTER GHE Ґ → G ; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN ґ → g ; # CYRILLIC SMALL LETTER GHE WITH UPTURN +# +# +######################################################################## +# +# End Comment. +# +######################################################################## +# Д → D ; # CYRILLIC CAPITAL LETTER DE д → d ; # CYRILLIC SMALL LETTER DE Е → E ; # CYRILLIC CAPITAL LETTER IE @@ -39,11 +101,33 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE ж → zh ; # CYRILLIC SMALL LETTER ZHE +# +# +######################################################################## +# +# BGN Page 105 Rule 1 +# +# The character sequences зг, кг, сг, тс, and цг may be romanized +# z·h, k·h, s·h, t·s, and ts·h in order to differentiate those +# romanizations from the digraphs zh, kh, sh, ts, and the letter +# sequence tsh, which are used to render the characters ж, х, ш, ц +# and the character sequence тш. 
+# +######################################################################## +# ЗГ → Z·H ; # CYRILLIC CAPITAL LETTER ZE Зг → Z·h ; # CYRILLIC CAPITAL LETTER ZE зг → z·h ; # CYRILLIC SMALL LETTER ZE З → Z ; # CYRILLIC CAPITAL LETTER ZE з → z ; # CYRILLIC SMALL LETTER ZE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# И → Y ; # CYRILLIC CAPITAL LETTER I и → y ; # CYRILLIC SMALL LETTER I І → I ; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I @@ -53,11 +137,29 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; ї → yi ; # CYRILLIC SMALL LETTER YI Й → Y ; # CYRILLIC CAPITAL LETTER I й → y ; # CYRILLIC SMALL LETTER I +# +# +######################################################################## +# +# BGN Page 105 Rule 1 +# +# кг becomes k·h +# +######################################################################## +# КГ → K·H ; # CYRILLIC CAPITAL LETTER KA Кг → K·h ; # CYRILLIC CAPITAL LETTER KA кг → k·h ; # CYRILLIC SMALL LETTER KA К → K ; # CYRILLIC CAPITAL LETTER KA к → k ; # CYRILLIC SMALL LETTER KA +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Л → L ; # CYRILLIC CAPITAL LETTER EL л → l ; # CYRILLIC SMALL LETTER EL М → M ; # CYRILLIC CAPITAL LETTER EM @@ -70,16 +172,50 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; п → p ; # CYRILLIC SMALL LETTER PE Р → R ; # CYRILLIC CAPITAL LETTER ER р → r ; # CYRILLIC SMALL LETTER ER +# +# +######################################################################## +# +# BGN Page 105 Rule 1 +# +# сг becomes s·h +# +######################################################################## +# СГ → S·H ; # CYRILLIC CAPITAL LETTER ES Сг → S·h ; # CYRILLIC CAPITAL LETTER ES сг → s·h ; # CYRILLIC SMALL LETTER ES С → S ; # CYRILLIC CAPITAL LETTER ES с → s ; # CYRILLIC SMALL LETTER ES +# +# 
+######################################################################## +# +# End Rule 1 +# +######################################################################## +# +######################################################################## +# +# BGN Page 105 Rule 1 +# +# тс becomes t·s +# +######################################################################## +# ТС → T·S ; # CYRILLIC CAPITAL LETTER TE Тс → T·s ; # CYRILLIC CAPITAL LETTER TE тс → t·s ; # CYRILLIC SMALL LETTER TE Т → T ; # CYRILLIC CAPITAL LETTER TE т → t ; # CYRILLIC SMALL LETTER TE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# У → U ; # CYRILLIC CAPITAL LETTER U у → u ; # CYRILLIC SMALL LETTER U Ф → F ; # CYRILLIC CAPITAL LETTER EF @@ -87,15 +223,43 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA Х → KH ; # CYRILLIC CAPITAL LETTER HA х → kh ; # CYRILLIC SMALL LETTER HA +# +# +######################################################################## +# +# BGN Page 105 Rule 1 +# +# цг becomes ts·h +# +######################################################################## +# ЦГ → TS·H ; # CYRILLIC CAPITAL LETTER TSE Цг → Ts·h ; # CYRILLIC CAPITAL LETTER TSE цг → ts·h ; # CYRILLIC SMALL LETTER TSE Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE Ц → TS ; # CYRILLIC CAPITAL LETTER TSE ц → ts ; # CYRILLIC SMALL LETTER TSE +# +# +######################################################################## +# +# End Rule 1 +# +######################################################################## +# Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE Ч → CH ; # CYRILLIC CAPITAL LETTER CHE ч → ch ; # CYRILLIC SMALL LETTER CHE +# +# +######################################################################## +# +# BGN Page 94 Rule 3.6 +# +# шч becomes sh·ch +# +######################################################################## +# ШЧ → SH·CH 
; # CYRILLIC CAPITAL LETTER SHA Шч → Sh·ch ; # CYRILLIC CAPITAL LETTER SHA шч → sh·ch ; # CYRILLIC SMALL LETTER SHA @@ -105,6 +269,14 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Щ} $lower → Shch ; # CYRILLIC CAPITAL LETTER SHCHA Щ → SHCH ; # CYRILLIC CAPITAL LETTER SHCHA щ → shch ; # CYRILLIC SMALL LETTER SHCHA +# +# +######################################################################## +# +# End Rule 3.6 +# +######################################################################## +# Ю} $lower → Yu ; # CYRILLIC CAPITAL LETTER YU Ю → YU ; # CYRILLIC CAPITAL LETTER YU ю → yu ; # CYRILLIC SMALL LETTER YU @@ -114,3 +286,7 @@ $wordBoundary = [^[:L:][:M:][:N:]] ; Ь → $prime ; # CYRILLIC CAPITAL LETTER SOFT SIGN ь → $prime ; # CYRILLIC SMALL LETTER SOFT SIGN ’ → $doublePrime ; # LEFT SINGLE QUOTATION MARK +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/und_FONIPA_ar.txt b/icu4c/source/data/translit/und_FONIPA_ar.txt new file mode 100644 index 00000000000..55674fbe919 --- /dev/null +++ b/icu4c/source/data/translit/und_FONIPA_ar.txt @@ -0,0 +1,123 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: und_FONIPA_ar.txt +# Generated from CLDR +# + +# Vowels +# ------ +# In these rules, we produce ي و ا both for short and for long vowels. +# This would be wrong for writing Arabic, but when transliterating +# foreign words and names, it is strongly preferred to vowel marks. +# However, we emit short schwa [ə] and a few other, schwa-like vowels. 
+$IVowel = [i ɪ e {e\u031E}]; +$UVowel = [y {ɨ} {ʉ} ɯ u ʏ {ɪ\u0308} {ʊ\u0308} {ɯ\u033D} {ʊ} ø ɤ o {ø\u031E} {ɤ\u031E} {o\u031E} ɞ ɔ w {w\u0325} ʍ ʷ]; +$AVowel = [ɛ œ ɜ ʌ æ ɐ a ɶ {ä} {ɒ\u0308} ɑ ɒ]; +$SchwaVowel = [ɘ ɵ ə {ɵ\u031E}]; +$Vowel = [$IVowel $UVowel $AVowel $SchwaVowel]; +$Click = [ʘ ɋ ǀ ʇ ǃ ʗ ǂ ʄ ǁ ʖ]; +$Boundary = [^[:L:][:M:][:N:]]; +::NFD; +[ʰ ʱ ʼ \u0303 \u0330 \u030B \u0301 \u0304 \u0300 \u030F \u030C \u0302 ˥ ˦ ˧ ˨ ˩ ꜜ ꜛ ↗ ↘ \u0361 \u035C \u032F] → ; +ʲ → j; +ᵐ → m; +ⁿ → n; +ᵑ → ŋ; +::NFC; +# TODO: Diphthongs probably need more work. +# Romanian [sekujesk] → [sekujask], for emitting سيكوياسك not سيكويسك +$UVowel [j $IVowel] [e {e\u031E} $SchwaVowel] → uia; +# Kazakh Аягөз [ɑjɑɡy\u032Fʉz] → [ɑjɑɡiuz], to emit TODO +yʉ → iu; +::NULL; +# Vowels +$Boundary {ʔ? $IVowel ː} → إ\u0650ي; +$Boundary {ʔ? $IVowel} → إ\u0650; +{$IVowel ʔ} $Boundary → ئ; +{$IVowel ː ʔ} $Boundary → يء; +{$IVowel ː ʔ} [$Vowel] → ئ; +$IVowel ː? → ي; +$Boundary {ʔ? $UVowel ː} → أو; +$Boundary {ʔ? $UVowel} → أ; +{$UVowel ʔ} $Boundary → ؤ; +{$UVowel ː ʔ} $Boundary → وء; +$UVowel ː? → و; +$Boundary {ʔ? $AVowel ː} → آ; +$Boundary {ʔ? $AVowel} → أ; +{$AVowel ʔ} $Boundary → أ; +{$AVowel ː ʔ} $Boundary → اء; +$AVowel ː? ʔ $AVowel ː? → اءا; +$AVowel ː? → ا; +$Boundary {ʔ? $SchwaVowel ː} → إ\u0650ي; +$Boundary {ʔ? $SchwaVowel} → أ; +$SchwaVowel ː → ي; +$SchwaVowel → ; +# TODO: Handle glottal stop. +ʔ → ; +# Shadda for long (geminated) consonants +ː → \u0651; +# Affricates +[{t\u0361ʃ} ʧ] → ت\u0652ش; +# Clicks +[ɡ g ɠ k] $Click → ك\u0652ش; +$Click → ت\u0652ش; +# Nasal stops +[{m\u0325} m ɱ] → م; +[{n\u033C\u030A} {n\u033C} {n\u0325} n {ɳ\u030A} ɳ {ɲ\u030A} {ɲ\u0325} ɲ] → ن; +[{ŋ\u030A} ŋ {ɴ\u0325} ɴ] k → نك; +[{ŋ\u030A} ŋ {ɴ\u0325} ɴ] [ɡ g ɠ]? 
→ ن\u0652غ; +# Non-nasal stops +[p b {p\u032A} {b\u032A} ɓ] → ب; +[{d\u033C} d ɗ ᶑ] → د; +[{t\u033C} t] → ت; +[ʈ] → ط; +[ɖ] → ض; +c → ت\u0652ش; +ɟ → دج; +k → ك; +[ɡ g ɠ] → غ; +[q ɢ ʡ ʛ] → ق; +# Sibilant fricatives +s → س; +z → ز; +[ʃ ʂ ɕ ʄ] → ش; +[ʒ ʐ ʑ] → ج; +# Non-sibilant fricatives +[ɸ f v] → ف; +β → ب; +[{θ\u033C} θ {θ\u0331}] → ث; +[{ð\u033C} ð {ð\u0320}] → ذ; +ç → ش; +ʝ $IVowel? ː? → ي; +[x χ] → خ; +[ɣ ʁ] → غ; +ħ → ح; +ʕ → ع; +[h ɦ {ʔ\u031E}] → ه; +# Approximants, trills, flaps +ʋ → و; +ʙ → بر; +{r\u031D} → رش; +[{ɹ\u0325} {ɹ} {ɻ\u030A} {ɻ} {ɾ\u0325} ɾ {ɽ\u030A} ɽ {r\u033C} {r\u0325} r] → ر; +[{ʀ\u0325} ʀ] → غ; +ʜ → ح; +ʢ → ع; +j $IVowel? ː? → ي; +# Laterals +ɬ → ش\u0652ل; +ɮ → ج\u0652ل; +{[{ʎ\u0325} ʎ]} [^ $IVowel j ʝ] → لي; +[{l\u033C} {l\u0325} l {ɭ\u030A} ɭ {ʎ\u0325} ʎ] → ل; +[ʟ {ʟ\u0320}] → غ; +# Independent pass for misc cleanup. +::NULL; +# Strip off syllable markers +\. → ; +# Sequences of three or more ووو look very confusing; we shorten them. +# Polish Darłowo [darwɔvɔ] → داروو → داروووو +ووو+ → وو; + diff --git a/icu4c/source/data/translit/und_FONIPA_fa.txt b/icu4c/source/data/translit/und_FONIPA_fa.txt new file mode 100644 index 00000000000..9a397217810 --- /dev/null +++ b/icu4c/source/data/translit/und_FONIPA_fa.txt @@ -0,0 +1,118 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: und_FONIPA_fa.txt +# Generated from CLDR +# + +# Vowels +# ------ +# In these rules, we produce ی و ا both for short and for long vowels. +# This would be wrong for writing Farsi or Arabic, but when transliterating +# foreign words and names, it is strongly preferred to vowel marks. 
+# Short schwa [ə] and a few other, schwa-like vowels get omitted entirely +# unless at the end of the word, in which case we emit ه whose Farsi +# word-final pronunciation comes close to [ə]. At the beginning of words, +# Farsi speakers prefer to see آ for [ɑ] and a few other similar-sounding +# dark vowels; note that this use of آ is quite different from Arabic. +$IVowel = [i ɪ e {e\u031E}]; +$UVowel = [y {ɨ} {ʉ} ɯ u ʏ {ɪ\u0308} {ʊ\u0308} {ɯ\u033D} {ʊ} ø ɤ o {ø\u031E} {ɤ\u031E} {o\u031E} ɔ w {w\u0325} ʍ ʷ]; +$AVowel = [ɛ œ ɜ æ ɶ]; +$DarkAVowel = [ʌ a ɑ ɒ ɐ ɞ {ä} {ɒ\u0308}]; # آ instead of ا at beginning of words +$SchwaVowel = [ɘ ɵ ə {ɵ\u031E}]; +$Click = [ʘ ɋ ǀ ʇ ǃ ʗ ǂ ʄ ǁ ʖ]; +$Boundary = [^[:L:][:M:][:N:]]; +::NFD; +[ʰ ʱ ʼ \u0303 \u0330 \u030B \u0301 \u0304 \u0300 \u030F \u030C \u0302 ˥ ˦ ˧ ˨ ˩ ꜜ ꜛ ↗ ↘ \u0361 \u035C \u032F] → ; +ʲ → j; +ᵐ → m; +ⁿ → n; +ᵑ → ŋ; +::NFC; +# TODO: Diphthongs probably need more work. +# Romanian [sekujesk] → [sekujask], for emitting سیکویاسک not سیکویسک +$UVowel [j $IVowel] [e {e\u031E} $SchwaVowel] → uia; +# Kazakh Аягөз [ɑjɑɡy\u032Fʉz] → [ɑjɑɡiuz], to emit آیاگیوز not آیاگووز +yʉ → iu; +::NULL; +# Vowels +$Boundary {$SchwaVowel ː?} → ای; +$SchwaVowel ː → ی; +{[$SchwaVowel e {e\u031E}]} [^[:L:][:M:][:N:][\.]] → ه; +$SchwaVowel → ; +$Boundary {$IVowel ː?} → ای; +$IVowel ː? j? → ی; +$Boundary {$UVowel ː?} → او; +$UVowel ː? → و; +$Boundary {$AVowel ː?} → ا; +$AVowel ː? → ا; +$Boundary {$DarkAVowel ː?} → آ; +$DarkAVowel ː? → ا; +# Shadda for long (geminated) consonants +ː → \u0651; +# Affricates +[{t\u0361ʃ} ʧ] → چ; +# Clicks +[ɡ g ɠ k] $Click → کچ; +[n ɲ]? $Click → نچ; +# Nasal stops +[{m\u0325} m ɱ] → م; +[{n\u033C\u030A} {n\u033C} {n\u0325} n {ɳ\u030A} ɳ {ɲ\u030A} {ɲ\u0325} ɲ] → ن; +[{ŋ\u030A} ŋ {ɴ\u0325} ɴ] k → نک; +[{ŋ\u030A} ŋ {ɴ\u0325} ɴ] [ɡ g]? 
→ نگ; +# Non-nasal stops +[p {p\u032A}] → پ; +[b {b\u032A} ɓ] → ب; +[{d\u033C} d ɗ ᶑ] → د; +[{t\u033C} t] → ت; +[ʈ] → ط; +[ɖ] → ض; +c → چ; +ɟ → دج; +k → ک; +[ɡ g ɠ] → گ; +[q ɢ ʡ ʛ] → ق; +ʔ → ; +# Sibilant fricatives +s → س; +z → ز; +[ʃ ʂ ɕ ʄ] → ش; +[ʒ ʐ ʑ] → ژ; +# Non-sibilant fricatives +[ɸ f] → ف; +[β v] → و; +[{θ\u033C} θ {θ\u0331}] → ث; +[{ð\u033C} ð {ð\u0320}] → ذ; +ç → ش; +ʝ $IVowel? ː? → ی; +[x χ] → خ; +[ɣ ʁ] → غ; +ħ → ح; +ʕ → ع; +[h ɦ {ʔ\u031E}] → ه; +# Approximants, trills, flaps +ʋ → و; +ʙ → بر; +{r\u031D} → رژ; +[{ɹ\u0325} {ɹ} {ɻ\u030A} {ɻ} {ɾ\u0325} ɾ {ɽ\u030A} ɽ {r\u033C} {r\u0325} r] → ر; +[{ʀ\u0325} ʀ] → غ; +ʜ → ح; +ʢ → ع; +j $IVowel? ː? → ی; +# Laterals +ɬ → شل; +ɮ → ژل; +{[{ʎ\u0325} ʎ]} [^ $IVowel j ʝ] → لی; +[{l\u033C} {l\u0325} l {ɭ\u030A} ɭ {ʎ\u0325} ʎ] → ل; +[ʟ {ʟ\u0320}] → غ; +# Independent pass for misc cleanup. +::NULL; +# Strip off syllable markers +\. → ; +# Sequences of three or more ووو look very confusing; we shorten them. +# Polish Darłowo [darwɔvɔ] → داروو → داروووو +ووو+ → وو; + diff --git a/icu4c/source/data/translit/und_FONIPA_und_FONXSAMP.txt b/icu4c/source/data/translit/und_FONIPA_und_FONXSAMP.txt new file mode 100644 index 00000000000..44df2fa0953 --- /dev/null +++ b/icu4c/source/data/translit/und_FONIPA_und_FONXSAMP.txt @@ -0,0 +1,237 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: und_FONIPA_und_FONXSAMP.txt +# Generated from CLDR +# + +# Conversion between IPA and X-SAMPA phonetic transcription. +# +# See http://www.phon.ucl.ac.uk/home/sampa/x-sampa.htm for a description of +# X-SAMPA, an ASCII encoding of the International Phonetic Alphabet. 
+# +# The following obsolete or extended IPA symbols have no X-SAMPA equivalents +# and remain unaffected by this transform: +# +# ʞ LATIN SMALL LETTER TURNED K +# ʩ LATIN SMALL LETTER FENG DIGRAPH +# ʪ LATIN SMALL LETTER LS DIGRAPH +# ʫ LATIN SMALL LETTER LZ DIGRAPH +# ʬ LATIN LETTER BILABIAL PERCUSSIVE +# ʭ LATIN LETTER BIDENTAL PERCUSSIVE +# +# An IPA tie bar is transformed to an X-SAMPA underscore, per the official +# X-SAMPA guidelines. This can result in certain ambiguities: For example, the +# labial-velar nasal (http://en.wikipedia.org/wiki/Labial-velar_nasal) can be +# either written as [ŋ\u0361m] or [m\u0361ŋ] in IPA. However, neither version can be +# represented unambiguously in X-SAMPA: IPA [ŋ\u0361m] becomes X-SAMPA [N_m], which +# is also used to represent a hypothetical laminal (_m) velar nasal, IPA [ŋ\u033B]; +# and IPA [m\u0361ŋ] becomes X-SAMPA [m_N], which can also represent a linguolabial +# (_N) nasal, IPA [m\u033C], which is more appropriately written [n\u033C]. To avoid +# unintended ambiguities, it may therefore be advisable to write affricates +# without tie bars. +# +$t = '_'; # X-SAMPA representation of IPA tie bar. 
+::NFD(NFC); +# 5-character X-SAMPA representations +ʯ ↔ 'z`_w='; # LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL +# Alternative 5-character X-SAMPA representations +n\u031F\u030A ← 'n_+_0'; +n\u0320\u030A ← 'n_-_0'; +n\u032A\u030A ← 'n_d_0'; +n\u033A\u030A ← 'n_a_0'; +n\u033B\u030A ← 'n_m_0'; +n\u033C\u030A ← 'n_N_0'; +ɻ\u030A ← 'r\`_0'; +# 4-character X-SAMPA representations +ǁ ↔ '|\|\'; # LATIN LETTER LATERAL CLICK +ʄ ↔ 'J\_<'; # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK +ʛ ↔ 'G\_<'; # LATIN LETTER SMALL CAPITAL G WITH HOOK +ʮ ↔ 'z_w='; # LATIN SMALL LETTER TURNED H WITH FISHHOOK +\u1DC4 ↔ '_H_T'; # COMBINING MACRON-ACUTE +\u1DC5 ↔ '_B_L'; # COMBINING GRAVE-MACRON +\u1DC8 ↔ '_R_F'; # COMBINING GRAVE-ACUTE-GRAVE +# Alternative 4-character X-SAMPA representations +ɭ\u030A ← 'l`_0'; +ɰ\u030A ← 'M\_0'; +ɳ\u030A ← 'n`_0'; +ɽ\u030A ← 'r`_0'; +# 3-character X-SAMPA representations +ɓ ↔ 'b_<'; # LATIN SMALL LETTER B WITH HOOK +ɗ ↔ 'd_<'; # LATIN SMALL LETTER D WITH HOOK +ɠ ↔ 'g_<'; # LATIN SMALL LETTER G WITH HOOK +ɻ ↔ 'r\`'; # LATIN SMALL LETTER TURNED R WITH HOOK +↗ ↔ ''; # NORTH EAST ARROW +↘ ↔ ''; # SOUTH EAST ARROW +# Alternative 3-character X-SAMPA representations +j\u030A ← 'j_0'; +ŋ\u030A ← 'N_0'; +ɥ\u030A ← 'H_0'; +ɱ\u030A ← 'F_0'; +ɲ\u030A ← 'J_0'; +# 2-character X-SAMPA representations +ħ ↔ 'X\'; # LATIN SMALL LETTER H WITH STROKE +ǀ ↔ '|\'; # LATIN LETTER DENTAL CLICK +ǂ ↔ '=\'; # LATIN LETTER ALVEOLAR CLICK +ǃ ↔ '!\'; # LATIN LETTER RETROFLEX CLICK +ɕ ↔ 's\'; # LATIN SMALL LETTER C WITH CURL +ɖ ↔ 'd`'; # LATIN SMALL LETTER D WITH TAIL +ɘ ↔ '@\'; # LATIN SMALL LETTER REVERSED E +ɚ ↔ '@`'; # LATIN SMALL LETTER SCHWA WITH HOOK +ɝ ↔ '3`'; # LATIN SMALL LETTER REVERSED OPEN E WITH HOOK +ɞ ↔ '3\'; # LATIN SMALL LETTER CLOSED REVERSED OPEN E +ɟ ↔ 'J\'; # LATIN SMALL LETTER DOTLESS J WITH STROKE +ɢ ↔ 'G\'; # LATIN LETTER SMALL CAPITAL G +ɦ ↔ 'h\'; # LATIN SMALL LETTER H WITH HOOK +ɧ ↔ 'x\'; # LATIN SMALL LETTER HENG WITH HOOK +ɭ ↔ 'l`'; # LATIN SMALL 
LETTER L WITH RETROFLEX HOOK +ɮ ↔ 'K\'; # LATIN SMALL LETTER LEZH +ɰ ↔ 'M\'; # LATIN SMALL LETTER TURNED M WITH LONG LEG +ɳ ↔ 'n`'; # LATIN SMALL LETTER N WITH RETROFLEX HOOK +ɴ ↔ 'N\'; # LATIN LETTER SMALL CAPITAL N +ɸ ↔ 'p\'; # LATIN SMALL LETTER PHI +ɹ ↔ 'r\'; # LATIN SMALL LETTER TURNED R +ɺ ↔ 'l\'; # LATIN SMALL LETTER TURNED R WITH LONG LEG +ɽ ↔ 'r`'; # LATIN SMALL LETTER R WITH TAIL +ʀ ↔ 'R\'; # LATIN LETTER SMALL CAPITAL R +ʂ ↔ 's`'; # LATIN SMALL LETTER S WITH HOOK +ʈ ↔ 't`'; # LATIN SMALL LETTER T WITH RETROFLEX HOOK +ʐ ↔ 'z`'; # LATIN SMALL LETTER Z WITH RETROFLEX HOOK +ʑ ↔ 'z\'; # LATIN SMALL LETTER Z WITH CURL +ʕ ↔ '?\'; # LATIN LETTER PHARYNGEAL VOICED FRICATIVE +ʘ ↔ 'O\'; # LATIN LETTER BILABIAL CLICK +ʙ ↔ 'B\'; # LATIN LETTER SMALL CAPITAL B +ʜ ↔ 'H\'; # LATIN LETTER SMALL CAPITAL H +ʝ ↔ 'j\'; # LATIN SMALL LETTER J WITH CROSSED-TAIL +ʟ ↔ 'L\'; # LATIN LETTER SMALL CAPITAL L +ʡ ↔ '>\'; # LATIN LETTER GLOTTAL STOP WITH STROKE +ʢ ↔ '<\'; # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE +ʰ ↔ '_h'; # MODIFIER LETTER SMALL H +ʷ ↔ '_w'; # MODIFIER LETTER SMALL W +ʼ ↔ '_>'; # MODIFIER LETTER APOSTROPHE +ˆ ↔ '_\'; # MODIFIER LETTER CIRCUMFLEX ACCENT +ˇ ↔ '_/'; # CARON +ˑ ↔ ':\'; # MODIFIER LETTER HALF TRIANGULAR COLON +ˠ ↔ '_G'; # MODIFIER LETTER SMALL GAMMA +ˡ ↔ '_l'; # MODIFIER LETTER SMALL L +ˤ ↔ '_?\'; # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +\u0300 ↔ '_L'; # COMBINING GRAVE ACCENT +\u0301 ↔ '_H'; # COMBINING ACUTE ACCENT +\u0302 ↔ '_F'; # COMBINING CIRCUMFLEX ACCENT +\u0304 ↔ '_M'; # COMBINING MACRON +\u0306 ↔ '_X'; # COMBINING BREVE +\u0308 ↔ '_"'; # COMBINING DIAERESIS +\u030B ↔ '_T'; # COMBINING DOUBLE ACUTE ACCENT +\u030C ↔ '_R'; # COMBINING CARON +\u030F ↔ '_B'; # COMBINING DOUBLE GRAVE ACCENT +\u0318 ↔ '_A'; # COMBINING LEFT TACK BELOW +\u0319 ↔ '_q'; # COMBINING RIGHT TACK BELOW +\u031A ↔ '_}'; # COMBINING LEFT ANGLE ABOVE +\u031C ↔ '_c'; # COMBINING LEFT HALF RING BELOW +\u031D ↔ '_r'; # COMBINING UP TACK BELOW +\u031E ↔ '_o'; # COMBINING 
DOWN TACK BELOW +\u031F ↔ '_+'; # COMBINING PLUS SIGN BELOW +\u0320 ↔ '_-'; # COMBINING MINUS SIGN BELOW +\u0324 ↔ '_t'; # COMBINING DIAERESIS BELOW +\u0325 ↔ '_0'; # COMBINING RING BELOW +\u032A ↔ '_d'; # COMBINING BRIDGE BELOW +\u032C ↔ '_v'; # COMBINING CARON BELOW +\u032F ↔ '_^'; # COMBINING INVERTED BREVE BELOW +\u0330 ↔ '_k'; # COMBINING TILDE BELOW +\u0334 ↔ '_e'; # COMBINING TILDE OVERLAY +\u0339 ↔ '_O'; # COMBINING RIGHT HALF RING BELOW +\u033A ↔ '_a'; # COMBINING INVERTED BRIDGE BELOW +\u033B ↔ '_m'; # COMBINING SQUARE BELOW +\u033C ↔ '_N'; # COMBINING SEAGULL BELOW +\u033D ↔ '_x'; # COMBINING X ABOVE +ᵻ ↔ 'I\'; # LATIN SMALL CAPITAL LETTER I WITH STROKE +ᵿ ↔ 'U\'; # LATIN SMALL CAPITAL LETTER U WITH STROKE +ⁿ ↔ '_n'; # MODIFIER LETTER LATIN SMALL LETTER N +# Alternative 2-character X-SAMPA representations +ʋ ← 'v\'; # LATIN SMALL LETTER V WITH HOOK +ʲ ← '_j'; # MODIFIER LETTER SMALL J +\u0303 ← '_~'; # COMBINING TILDE +\u0329 ← '_='; # COMBINING VERTICAL LINE BELOW +# 1-character X-SAMPA representations +c\u0327 ↔ C; # LATIN SMALL LETTER C WITH CEDILLA (decomposed) +æ ↔ '{'; # LATIN SMALL LETTER AE +ð ↔ D; # LATIN SMALL LETTER ETH +ø ↔ 2; # LATIN SMALL LETTER O WITH STROKE +ŋ ↔ N; # LATIN SMALL LETTER ENG +œ ↔ 9; # LATIN SMALL LIGATURE OE +ɐ ↔ 6; # LATIN SMALL LETTER TURNED A +ɑ ↔ A; # LATIN SMALL LETTER ALPHA +ɒ ↔ Q; # LATIN SMALL LETTER TURNED ALPHA +ɔ ↔ O; # LATIN SMALL LETTER OPEN O +ə ↔ '@'; # LATIN SMALL LETTER SCHWA +ɛ ↔ E; # LATIN SMALL LETTER OPEN E +ɜ ↔ 3; # LATIN SMALL LETTER REVERSED OPEN E +ɡ ↔ g; # LATIN SMALL LETTER SCRIPT G +ɣ ↔ G; # LATIN SMALL LETTER GAMMA +ɤ ↔ 7; # LATIN SMALL LETTER RAMS HORN +ɥ ↔ H; # LATIN SMALL LETTER TURNED H +ɨ ↔ 1; # LATIN SMALL LETTER I WITH STROKE +ɪ ↔ I; # LATIN LETTER SMALL CAPITAL I +ɫ ↔ 5; # LATIN SMALL LETTER L WITH MIDDLE TILDE +ɬ ↔ K; # LATIN SMALL LETTER L WITH BELT +ɯ ↔ M; # LATIN SMALL LETTER TURNED M +ɱ ↔ F; # LATIN SMALL LETTER M WITH HOOK +ɲ ↔ J; # LATIN SMALL LETTER N WITH LEFT HOOK +ɵ ↔ 8; # 
LATIN SMALL LETTER BARRED O +ɶ ↔ '&'; # LATIN LETTER SMALL CAPITAL OE +ɾ ↔ 4; # LATIN SMALL LETTER R WITH FISHHOOK +ʁ ↔ R; # LATIN LETTER SMALL CAPITAL INVERTED R +ʃ ↔ S; # LATIN SMALL LETTER ESH +ʉ ↔ '}'; # LATIN SMALL LETTER U BAR +ʊ ↔ U; # LATIN SMALL LETTER UPSILON +ʋ ↔ P; # LATIN SMALL LETTER V WITH HOOK +ʌ ↔ V; # LATIN SMALL LETTER TURNED V +ʍ ↔ W; # LATIN SMALL LETTER TURNED W +ʎ ↔ L; # LATIN SMALL LETTER TURNED Y +ʏ ↔ Y; # LATIN LETTER SMALL CAPITAL Y +ʒ ↔ Z; # LATIN SMALL LETTER EZH +ʔ ↔ '?'; # LATIN LETTER GLOTTAL STOP +ʲ ↔ \'; # MODIFIER LETTER SMALL J +ˈ ↔ '"'; # MODIFIER LETTER VERTICAL LINE +ˌ ↔ '%'; # MODIFIER LETTER LOW VERTICAL LINE +ː ↔ ':'; # MODIFIER LETTER TRIANGULAR COLON +˞ ↔ '`'; # MODIFIER LETTER RHOTIC HOOK +\u0303 ↔ '~'; # COMBINING TILDE +\u0329 ↔ '='; # COMBINING VERTICAL LINE BELOW +\u0361 ↔ $t; # COMBINING DOUBLE INVERTED BREVE +β ↔ B; # GREEK SMALL LETTER BETA +θ ↔ T; # GREEK SMALL LETTER THETA +χ ↔ X; # GREEK SMALL LETTER CHI +↑ ↔ '^'; # UPWARDS ARROW +↓ ↔ '!'; # DOWNWARDS ARROW +# Compatibility rules for variant or obsolete IPA symbols +g → g; # LATIN SMALL LETTER G (redundant, for additional clarity) +ȵ → J; # LATIN SMALL LETTER N WITH CURL +ɩ → I; # LATIN SMALL LETTER IOTA +ɷ → U; # LATIN SMALL LETTER CLOSED OMEGA +ɼ → 'r_r'; # LATIN SMALL LETTER R WITH LONG LEG +ɿ → 'z='; # LATIN SMALL LETTER REVERSED R WITH FISHHOOK +ʅ → 'z`='; # LATIN SMALL LETTER SQUAT REVERSED ESH +ʆ → S\'; # LATIN SMALL LETTER ESH WITH CURL +ʇ → '|\' ; # LATIN SMALL LETTER TURNED T +ʓ → Z\'; # LATIN SMALL LETTER EZH WITH CURL +ʖ → '|\|\'; # LATIN LETTER INVERTED GLOTTAL STOP +ʗ → '!\'; # LATIN LETTER STRETCHED C +ʚ → '3\'; # LATIN SMALL LETTER CLOSED OPEN E +ʠ → 'G\_<_0'; # LATIN SMALL LETTER Q WITH HOOK +ʣ → d $t z; # LATIN SMALL LETTER DZ DIGRAPH +ʤ → d $t Z; # LATIN SMALL LETTER DEZH DIGRAPH +ʥ → d $t 'z\'; # LATIN SMALL LETTER DZ DIGRAPH WITH CURL +ʦ → t $t s; # LATIN SMALL LETTER TS DIGRAPH +ʧ → t $t S; # LATIN SMALL LETTER TESH DIGRAPH +ʨ → t $t 's\'; 
# LATIN SMALL LETTER TC DIGRAPH WITH CURL +˔ → '_r'; # MODIFIER LETTER UP TACK +˕ → '_o'; # MODIFIER LETTER DOWN TACK +\u030A → '_0'; # COMBINING RING ABOVE +φ → 'p\'; # GREEK SMALL LETTER PHI +ꞎ → 'K`'; # LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT +::NFC(NFD); + diff --git a/icu4c/source/data/translit/Uzbek_Latin_BGN.txt b/icu4c/source/data/translit/uz_Cyrl_uz_BGN.txt similarity index 55% rename from icu4c/source/data/translit/Uzbek_Latin_BGN.txt rename to icu4c/source/data/translit/uz_Cyrl_uz_BGN.txt index d6e947c4f7d..05cb31aebe9 100644 --- a/icu4c/source/data/translit/Uzbek_Latin_BGN.txt +++ b/icu4c/source/data/translit/uz_Cyrl_uz_BGN.txt @@ -1,14 +1,44 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** -# File: Uzbek_Latin_BGN.txt +# File: uz_Cyrl_uz_BGN.txt # Generated from CLDR # + +# +######################################################################## +# BGN/PCGN 1979 System +# +# The BGN/PCGN system for Uzbek was designed for use in +# romanizing names written in the Uzbek alphabet. +# The Uzbek alphabet contains four letters not present +# in the Russian alphabet: Ўў, Ққ, Ғғ, and Ҳҳ. 
+# +# The Uzbek Alphabet as defined by the BGN (Page 107): +# +# АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЪЬЭЮЯЎҚҒҲ +# абвгдеёжзийклмнопрстуфхцчшъьэюяўқғҳ +# +# Originally prepared by Michael Everson +######################################################################## +# +# MINIMAL FILTER: Uzbek-Latin +# :: [АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЪЬЭЮЯЎҚҒҲабвгдеёжзийклмнопрстуфхцчшъьэюяўқғҳ] ; :: NFD (NFC) ; +# +# +######################################################################## +# +######################################################################## +# +# Define All Transformation Variables +# +######################################################################## +# $prime = ʹ ; $doublePrime = ʺ ; $upperConsonants = [БВГДЖЗЙКЛМНПРСТФХЦЧШЪЬҚҒҲ] ; @@ -18,22 +48,78 @@ $upperVowels = [АЕЁИОУЭЮЯЎ] ; $lowerVowels = [аеёиоуэюяў] ; $vowels = [$upperVowels $lowerVowels] ; $lower = [$lowerConsonants $lowerVowels] ; +# +# +# Use this $wordBoundary until bug 2034 is fixed in ICU: +# http://bugs.icu-project.org/cgi-bin/icu-bugs/transliterate?id=2034;expression=boundary;user=guest +# $wordBoundary = [^[:L:][:M:][:N:]] ; +# +# +######################################################################## +# +######################################################################## +# +# Rules moved to front to avoid masking +# +######################################################################## +# $lowerVowels ы → y ; $upperVowels[Ыы] → Y ; +# +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# А → A ; # CYRILLIC CAPITAL LETTER A а → a ; # CYRILLIC SMALL LETTER A Б → B ; # CYRILLIC CAPITAL LETTER BE б → b ; # CYRILLIC SMALL LETTER BE В → W ; # CYRILLIC CAPITAL LETTER VE в → w ; # CYRILLIC SMALL LETTER VE +# +# +######################################################################## +# +# BGN Page 108 Rule 2 +# +# The character sequences гҳ, 
кҳ, сҳ, and цҳ may be romanized g·h, +# k·h, s·h, and ts·h in order to differentiate those romanizations from +# the digraphs gh, kh, sh, and the letter sequence tsh, which are used +# to render the chаracters г, х, ш, and the character sequence тш. +# +######################################################################## +# ГҲ → G·H ; # CYRILLIC CAPITAL LETTER GHE Гҳ → G·h ; # CYRILLIC CAPITAL LETTER GHE гҳ → g·h ; # CYRILLIC SMALL LETTER GHE Г → G ; # CYRILLIC CAPITAL LETTER GHE г → g ; # CYRILLIC SMALL LETTER GHE +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# Д → D ; # CYRILLIC CAPITAL LETTER DE д → d ; # CYRILLIC SMALL LETTER DE +# +# +######################################################################## +# +# BGN Page 108 Rule 1: +# +# The character e should be romanized ye initially, after the vowel +# characters a, e, ё, и, о, у, э, ю, я, and ў, and after й and ь. +# In all other instances, it should be romanized e. 
+# +######################################################################## +# Е}[$upperVowels [ЙЬ]] → YE ; # CYRILLIC CAPITAL LETTER IE Е}[$lowerVowels [йь]] → Ye ; # CYRILLIC CAPITAL LETTER IE $wordBoundary{Е → Ye ; # CYRILLIC CAPITAL LETTER IE @@ -41,6 +127,14 @@ $wordBoundary{Е → Ye ; # CYRILLIC CAPITAL LETTER IE е}[$upperVowels $lowerVowels [ЙйЬь]] → ye ; # CYRILLIC SMALL LETTER IE $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE е → e ; # CYRILLIC SMALL LETTER IE +# +# +######################################################################## +# +# End of Rule 1 +# +######################################################################## +# Ё} $lower → Yo ; # CYRILLIC CAPITAL LETTER IO Ё → YO ; # CYRILLIC CAPITAL LETTER IO ё → yo ; # CYRILLIC SMALL LETTER IO @@ -52,11 +146,29 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE и → i ; # CYRILLIC SMALL LETTER I Й → Y ; # CYRILLIC CAPITAL LETTER I й → y ; # CYRILLIC SMALL LETTER I +# +# +######################################################################## +# +# BGN Page 108 Rule 2 +# +# кҳ becomes k·h +# +######################################################################## +# КҲ → K·H ; # CYRILLIC CAPITAL LETTER KA Кҳ → K·h ; # CYRILLIC CAPITAL LETTER KA кҳ → k·h ; # CYRILLIC SMALL LETTER KA К → K ; # CYRILLIC CAPITAL LETTER KA к → k ; # CYRILLIC SMALL LETTER KA +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# Л → L ; # CYRILLIC CAPITAL LETTER EL л → l ; # CYRILLIC SMALL LETTER EL М → M ; # CYRILLIC CAPITAL LETTER EM @@ -69,11 +181,29 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE п → p ; # CYRILLIC SMALL LETTER PE Р → R ; # CYRILLIC CAPITAL LETTER ER р → r ; # CYRILLIC SMALL LETTER ER +# +# +######################################################################## +# +# BGN Page 108 Rule 2 +# +# сҳ becomes s·h +# 
+######################################################################## +# СҲ → S·H ; # CYRILLIC CAPITAL LETTER ES Сҳ → S·h ; # CYRILLIC CAPITAL LETTER ES сҳ → s·h ; # CYRILLIC SMALL LETTER ES С → S ; # CYRILLIC CAPITAL LETTER ES с → s ; # CYRILLIC SMALL LETTER ES +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# Т → T ; # CYRILLIC CAPITAL LETTER TE т → t ; # CYRILLIC SMALL LETTER TE У → Ū ; # CYRILLIC CAPITAL LETTER U @@ -83,12 +213,30 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA Х → KH ; # CYRILLIC CAPITAL LETTER HA х → kh ; # CYRILLIC SMALL LETTER HA +# +# +######################################################################## +# +# BGN Page 108 Rule 2 +# +# цҳ becomes ts·h +# +######################################################################## +# ЦҲ → TS·H ; # CYRILLIC CAPITAL LETTER GHE Цҳ → Ts·h ; # CYRILLIC CAPITAL LETTER GHE цҳ → ts·h ; # CYRILLIC SMALL LETTER GHE Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE Ц → TS ; # CYRILLIC CAPITAL LETTER TSE ц → ts ; # CYRILLIC SMALL LETTER TSE +# +# +######################################################################## +# +# End Rule 2 +# +######################################################################## +# Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE Ч → CH ; # CYRILLIC CAPITAL LETTER CHE ч → ch ; # CYRILLIC SMALL LETTER CHE @@ -116,3 +264,7 @@ $wordBoundary{е → ye ; # CYRILLIC SMALL LETTER IE ғ → gh ; # CYRILLIC SMALL LETTER GHE WITH STROKE Ҳ → H ; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER ҳ → h ; # CYRILLIC SMALL LETTER HA WITH DESCENDER +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/uz_Cyrl_uz_Latn.txt b/icu4c/source/data/translit/uz_Cyrl_uz_Latn.txt index 6997a5dbbfd..d80a3f1c335 100755 --- 
a/icu4c/source/data/translit/uz_Cyrl_uz_Latn.txt +++ b/icu4c/source/data/translit/uz_Cyrl_uz_Latn.txt @@ -1,12 +1,14 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: uz_Cyrl_uz_Latn.txt # Generated from CLDR # + +# Rules are predicated on running NFD first, and NFC afterwards :: NFD (NFC) ; $letters = [[:sc=Latn:][:sc=Cyrl:] & [:L:]] ; $latinVowel = [AEIOUaeiouĬĭʼËë{oʻ}{Oʻ}] ; @@ -15,24 +17,30 @@ $lower = [$letters & [:Ll:]] ; У \u0306 ↔ Oʻ ; ғ ↔ gʻ ; Ғ ↔ Gʻ ; +# For many cases, use a following character to determine the difference between XY and Xy ш ↔ sh ; { Ш } $lower ↔ Sh ; Ш ↔ SH ; ч ↔ ch ; { Ч } $lower ↔ Ch ; Ч ↔ CH ; +# нг ↔ ng ; as separate letters works +# Нг ↔ Ng ; as separate letters works +# If we have a lowercase letter on either side, use the lowercase hard sign ъ ↔ { ʼ } $lower ; ъ ← $lower { ʼ } ; Ъ ↔ ʼ ; е\u0308 ↔ yo ; Е\u0308 } $lower ↔ Yo ; Е\u0308 ↔ YO ; +# е → 'ye' at the beginning of a syllable, after a vowel, ъ or ь, otherwise 'e' [:^L:] { е ↔ ye ; [:^L:] { Е } $lower ↔ Ye ; [:^L:] {Е ↔ YE ; $latinVowel { е → ye ; $latinVowel { Е } $lower → Ye ; $latinVowel { Е → YE ; +# handle these specially, since ьЬ otherwise disappear. ье → ye ; { ьЕ } $lower → Ye ; ьЕ → YE ; @@ -101,3 +109,4 @@ $latinVowel { Е → YE ; ь → ; Ь → ; ::NFC (NFD) ; + diff --git a/icu4c/source/data/translit/xh_am.txt b/icu4c/source/data/translit/xh_am.txt new file mode 100644 index 00000000000..efd637d4966 --- /dev/null +++ b/icu4c/source/data/translit/xh_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. 
All Rights Reserved. +# * +# *************************************************************************** +# File: xh_am.txt +# Generated from CLDR +# + +::xh-xh_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/xh_ar.txt b/icu4c/source/data/translit/xh_ar.txt new file mode 100644 index 00000000000..55b068a7b81 --- /dev/null +++ b/icu4c/source/data/translit/xh_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: xh_ar.txt +# Generated from CLDR +# + +::xh-xh_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/xh_fa.txt b/icu4c/source/data/translit/xh_fa.txt new file mode 100644 index 00000000000..9433be2a465 --- /dev/null +++ b/icu4c/source/data/translit/xh_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: xh_fa.txt +# Generated from CLDR +# + +::xh-xh_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/xh_xh_FONIPA.txt b/icu4c/source/data/translit/xh_xh_FONIPA.txt new file mode 100644 index 00000000000..30919aa75d2 --- /dev/null +++ b/icu4c/source/data/translit/xh_xh_FONIPA.txt @@ -0,0 +1,94 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
+# * +# *************************************************************************** +# File: xh_xh_FONIPA.txt +# Generated from CLDR +# + +# Pronunciation rules for isiXhosa. +# +# Author: mjansche@google.com (Martin Jansche) +# +# These rules transcribe isiXhosa into the phoneme inventory used within the +# NCHLT Speech Corpus (https://sites.google.com/site/nchltspeechcorpus/home). +# +# The rules were tested using the NCHLT-inlang isiXhosa pronunciation dictionary +# (http://rma.nwu.ac.za/index.php/resource-catalogue/nchlt-inlang-dictionaries.html). +# They correctly account for 14,999 out of 15,000 entries in the dictionary. +# +# The NCHLT 2013 phone set does not distinguish short and long vowels and does +# not indicate tone in any way. Transcription of tone is out of scope without a +# dictionary, since tone is generally not indicated in the orthography. Nasal +# clicks are not treated as separated phonemes in the NCHLT 2013 phone set and +# are transcribed as a sequence of nasal plus click instead. +# +# One minor notational deviation from the NCHLT 2013 phone set is that we use a +# tie bar within the complex (slack voiced) clicks, e.g. ɡ\u0361ǀ instead of ɡǀ, to +# avoid ambiguity and make the phoneme inventory uniquely decodable. 
+::Lower; +nyh → ɲʰ; +n { tsh → t\u0361ʃʼ; +tsh → t\u0361ʃʰ; +tyh → cʰ; +bh → bʰ; +ch → ǀʰ; +dl → ɮ; +dy → ɟ; +gc → ɡ\u0361ǀ; +gq → ɡ\u0361ǃ; +gr → ɣ; +gx → ɡ\u0361ǁ; +hl → ɬ; +kh → kʰ; +kr → k\u0361x; +mh } [^l] → mʰ; # denotes /mɬ/ instead +nh → nʰ; +ny → ɲ; +ph → pʰ; +qh → ǃʰ; +sh → ʃ; +th → tʰ; +tl → t\u0361ɬʼ; +ts → t\u0361sʼ; +ty → cʼ; +xh → ǁʰ; +aa → | a; +ee → | e; +ii → | i; +kc → | c; +kq → | q; +mm → | m; +oo → | o; +rh → | r; +uu → | u; +a → a; +b → ɓ; +c → ǀ; +d → d; +e → ɛ; +f → f; +g → ɡ; +h → h; +i → i; +j → d\u0361ʒ; +k → kʼ; +l → l; +m → m; +n } g → ŋ; +n → n; +o → ɔ; +p → pʼ; +q → ǃ; +r → r; +s → s; +t → tʼ; +u → u; +v → v; +w → w; +x → ǁ; +y → j; +z → z; + diff --git a/icu4c/source/data/translit/yo_yo_BJ.txt b/icu4c/source/data/translit/yo_yo_BJ.txt index b5d9759bc00..f512ff4692d 100644 --- a/icu4c/source/data/translit/yo_yo_BJ.txt +++ b/icu4c/source/data/translit/yo_yo_BJ.txt @@ -1,17 +1,45 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. 
# * # *************************************************************************** # File: yo_yo_BJ.txt # Generated from CLDR # + +# +######################################################################## +# +# Latin Characters that must be changed: +# +# ẸỌṢ +# ẹọṣ +# +# Originally prepared by Jonathan Lai ( jali01@ca.ibm.com ) +######################################################################## +# +# MINIMAL FILTER: yo-yo-BJ +# :: [ẸỌṢẹọṣ] ; :: NFC ; +# +# +######################################################################## +# +######################################################################## +# +# Start of Alphabetic Transformations +# +######################################################################## +# Ẹ→Ɛ;# LATIN CAPITAL LETTER E WITH DOT BELOW ẹ→ɛ;# LATIN SMALL LETTER E WITH DOT BELOW Ọ→Ɔ ;# LATIN CAPITAL LETTER O WITH DOT BELOW ọ→ɔ;# LATIN SMALL LETTER O WITH DOT BELOW Ṣ→Sh;# LATIN CAPITAL LETTER S WITH DOT BELOW ṣ→sh;# LATIN SMALL LETTER S WITH DOT BELOW +# +# +######################################################################## + diff --git a/icu4c/source/data/translit/zh_Latn_PINYIN_ru.txt b/icu4c/source/data/translit/zh_Latn_PINYIN_ru.txt index a4aaa397d91..5aa44e9c3cd 100644 --- a/icu4c/source/data/translit/zh_Latn_PINYIN_ru.txt +++ b/icu4c/source/data/translit/zh_Latn_PINYIN_ru.txt @@ -1,21 +1,44 @@ # *************************************************************************** # * -# * Copyright (C) 2004-2015, International Business Machines +# * Copyright (C) 2004-2016, International Business Machines # * Corporation; Unicode, Inc.; and others. All Rights Reserved. # * # *************************************************************************** # File: zh_Latn_PINYIN_ru.txt # Generated from CLDR # + +# Cyrillization of Mandarin Chinese from Pinyin into Russian (Palladius system). 
+# +# References: +# http://ru.wikipedia.org/wiki/Транскрипционная_система_Палладия +# http://www.omniglot.com/writing/mandarin_pts.htm +# http://www.pinyin.info/romanization/russian/index.html +# These differ in the treatment of some syllables (e.g. ) from the rules +# below. +# +# Further commentary: +# http://languagelog.ldc.upenn.edu/nll/?p=604 +# +# Remove tone marks. :: NFD (NFC); [\u0304\u0301\u030C\u0300\u0306] → ; :: NFC (NFD); +# +# +# Syllabify. Add apostrophes to disambiguate whether <n> and <ng> belong to the +# coda or onset of a syllable. :: Null (); ng } [aeou] → n\'g; ng → ng\'; n } [aeiouü] → \'n; n → n\'; +# +# +# Main pass. :: Null (); +# +# Ai → Ай; A → А; B → Б; @@ -57,9 +80,13 @@ Yu → Ю | v; Zh → Чж; Zi → Цзы; Z → Цз; +# +# [$] { n → н; \'n → н; \' → ; +# +# ai → ай; a → а; b → б; @@ -100,7 +127,12 @@ ui → уй; uo → о; u → у; ü → ю | v; +# +# We use the dummy symbol <v> to signal that an <ü> or equivalent character has +# just been processed. ve → э; +# +#alternative: ve → е; v } [an] → ; v → й; wu → у; @@ -118,4 +150,8 @@ yu → ю | v; zh → чж; zi → цзы; z → цз; +# +# +# Final pass: Make sure that the output consists entirely of Cyrillic letters. :: NFC (); + diff --git a/icu4c/source/data/translit/zu_am.txt b/icu4c/source/data/translit/zu_am.txt new file mode 100644 index 00000000000..77ed18128a9 --- /dev/null +++ b/icu4c/source/data/translit/zu_am.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
+# * +# *************************************************************************** +# File: zu_am.txt +# Generated from CLDR +# + +::zu-zu_FONIPA; +::am_FONIPA-am; + diff --git a/icu4c/source/data/translit/zu_ar.txt b/icu4c/source/data/translit/zu_ar.txt new file mode 100644 index 00000000000..a92040fe6a2 --- /dev/null +++ b/icu4c/source/data/translit/zu_ar.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: zu_ar.txt +# Generated from CLDR +# + +::zu-zu_FONIPA; +::und_FONIPA-ar; + diff --git a/icu4c/source/data/translit/zu_fa.txt b/icu4c/source/data/translit/zu_fa.txt new file mode 100644 index 00000000000..b4d60509d80 --- /dev/null +++ b/icu4c/source/data/translit/zu_fa.txt @@ -0,0 +1,13 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: zu_fa.txt +# Generated from CLDR +# + +::zu-zu_FONIPA; +::und_FONIPA-fa; + diff --git a/icu4c/source/data/translit/zu_zu_FONIPA.txt b/icu4c/source/data/translit/zu_zu_FONIPA.txt new file mode 100644 index 00000000000..c56327477bf --- /dev/null +++ b/icu4c/source/data/translit/zu_zu_FONIPA.txt @@ -0,0 +1,81 @@ +# *************************************************************************** +# * +# * Copyright (C) 2004-2016, International Business Machines +# * Corporation; Unicode, Inc.; and others. All Rights Reserved. +# * +# *************************************************************************** +# File: zu_zu_FONIPA.txt +# Generated from CLDR +# + +# Pronunciation rules for isiZulu. 
+# +# Author: mjansche@google.com (Martin Jansche) +# +# These rules transcribe isiZulu into the phoneme inventory used within the +# NCHLT Speech Corpus (https://sites.google.com/site/nchltspeechcorpus/home). +# +# The rules were tested using the NCHLT-inlang isiZulu pronunciation dictionary +# (http://rma.nwu.ac.za/index.php/resource-catalogue/nchlt-inlang-dictionaries.html). +# They correctly account for all 15,000 entries in the dictionary. +# +# The NCHLT 2013 phone set does not indicate tone in any way. Transcription of +# tone is out of scope without a dictionary, since tone is generally not +# indicated in the orthography. Nasal clicks are not treated as separate +# phonemes in the NCHLT 2013 phone set and are transcribed as a sequence of +# nasal plus click instead. +# +# One minor notational deviation from the NCHLT 2013 phone set is that we use a +# tie bar within the complex (depressor) clicks, e.g. ɡ\u0361ǀ instead of ɡǀ, to +# avoid ambiguity and make the phoneme inventory uniquely decodable. +::Lower; +tsh → t\u0361ʃʼ; +bh → b; +ch → ǀʰ; +dl → ɮ; +gc → ɡ\u0361ǀ; +gq → ɡ\u0361ǃ; +gx → ɡ\u0361ǁ; +hh → ɦ; # To investigate: /ɦ/ and /h/ may be switched in the NCHLT dictionary. +hl → ɬ; +kh → kʰ; +kl → k\u0361ɬ; +ny → ɲ; +ph → pʰ; +qh → ǃʰ; +n { sh → t\u0361sʼ; +sh → ʃ; +th → tʰ; +xh → ǁʰ; +a → a; +m { b → b; +b → ɓ; +c → ǀ; +d → d; +e → ɛ; +f → f; +g → ɡ; +h → h; +i → i; +j → d\u0361ʒ; +k → k; +l → l; +m → m; +[$] { n } gc → n; +n } [gk] → ŋ; +n } j → ɲ; +n → n; +o → ɔ; +p → pʼ; +q → ǃ; +n { s → t\u0361sʼ; +s → s; +t → tʼ; +u → u; +v → v; +w → w; +x → ǁ; +y → j; +n { z → d\u0361z; +z → z; +