mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-16 18:25:57 +00:00
ICU-1291 fixed roundtripping with new filters, Katakana
X-SVN-Rev: 7241
This commit is contained in:
parent
f55499cf92
commit
5b8a915180
2 changed files with 18 additions and 12 deletions
|
@ -3,16 +3,18 @@
|
|||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Latin_Katakana.txt,v $
|
||||
# $Date: 2001/11/30 22:36:12 $
|
||||
# $Revision: 1.17 $
|
||||
# $Date: 2001/12/01 00:51:28 $
|
||||
# $Revision: 1.18 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# note: a global filter is more efficient, but MUST include all source chars
|
||||
#:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
|
||||
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
|
||||
:: [',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B] ;
|
||||
### WARNING -- must add width filter, both here and below!!! ###
|
||||
:: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;
|
||||
|
||||
:: NFD (NFC); # use NFKD to get the fullwidth latin characters
|
||||
:: fullwidth-halfwidth ();
|
||||
:: NFD (NFC);
|
||||
:: Lower (); # whenever transliterating from cased to uncased script, include this
|
||||
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
|
||||
|
||||
|
@ -486,11 +488,12 @@ x > | ks ;
|
|||
[:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
|
||||
# [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
||||
|
||||
:: NFC (NFD) ; # use NFKD to get the halfwidth katakana characters
|
||||
:: NFC (NFD) ;
|
||||
:: (halfwidth-fullwidth);
|
||||
|
||||
# note: a global filter is more efficient, but MUST include all source chars!!
|
||||
#:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
|
||||
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
|
||||
:: ( [~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE] ) ;
|
||||
:: ( [[\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;
|
||||
|
||||
# eof
|
||||
|
|
|
@ -3,16 +3,18 @@
|
|||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/resources/Attic/Transliterator_Latin_Katakana.txt,v $
|
||||
# $Date: 2001/11/30 22:36:12 $
|
||||
# $Revision: 1.17 $
|
||||
# $Date: 2001/12/01 00:51:28 $
|
||||
# $Revision: 1.18 $
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# note: a global filter is more efficient, but MUST include all source chars
|
||||
#:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
|
||||
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
|
||||
:: [',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B] ;
|
||||
### WARNING -- must add width filter, both here and below!!! ###
|
||||
:: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;
|
||||
|
||||
:: NFD (NFC); # use NFKD to get the fullwidth latin characters
|
||||
:: fullwidth-halfwidth ();
|
||||
:: NFD (NFC);
|
||||
:: Lower (); # whenever transliterating from cased to uncased script, include this
|
||||
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
|
||||
|
||||
|
@ -486,11 +488,12 @@ x > | ks ;
|
|||
[:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
|
||||
# [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
||||
|
||||
:: NFC (NFD) ; # use NFKD to get the halfwidth katakana characters
|
||||
:: NFC (NFD) ;
|
||||
:: (halfwidth-fullwidth);
|
||||
|
||||
# note: a global filter is more efficient, but MUST include all source chars!!
|
||||
#:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
|
||||
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
|
||||
:: ( [~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE] ) ;
|
||||
:: ( [[\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;
|
||||
|
||||
# eof
|
||||
|
|
Loading…
Add table
Reference in a new issue