mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
Add Hiragana-Katakana transliterator
X-SVN-Rev: 1715
This commit is contained in:
parent
6d8fd93c2d
commit
820a963b3b
7 changed files with 309 additions and 8 deletions
icu4j/src/com/ibm
icu
test/translit
text/resources
tools/translit
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||
* $Date: 2000/06/29 21:59:36 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2000/06/30 00:00:21 $
|
||||
* $Revision: 1.24 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -604,6 +604,42 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the Hiragana-Katakana transliterator.
|
||||
*/
|
||||
public void TestHiraganaKatakana() {
|
||||
Transliterator hk = Transliterator.getInstance("Hiragana-Katakana");
|
||||
Transliterator kh = Transliterator.getInstance("Katakana-Hiragana");
|
||||
|
||||
// Array of 3n items
|
||||
// Each group of three is "hk"|"kh"|"both", <Hiragana>, <Katakana>
|
||||
String[] DATA = {
|
||||
"both",
|
||||
"\u3042\u3090\u3099\u3092\u3050",
|
||||
"\u30A2\u30F8\u30F2\u30B0",
|
||||
|
||||
"kh",
|
||||
"\u307C\u3051\u3060\u3042\u3093\u30FC",
|
||||
"\u30DC\u30F6\u30C0\u30FC\u30F3\u30FC",
|
||||
};
|
||||
|
||||
for (int i=0; i<DATA.length; i+=3) {
|
||||
switch (DATA[i].charAt(0)) {
|
||||
case 'h': // Hiragana-Katakana
|
||||
expect(hk, DATA[i+1], DATA[i+2]);
|
||||
break;
|
||||
case 'k': // Katakana-Hiragana
|
||||
expect(kh, DATA[i+2], DATA[i+1]);
|
||||
break;
|
||||
case 'b': // both
|
||||
expect(hk, DATA[i+1], DATA[i+2]);
|
||||
expect(kh, DATA[i+2], DATA[i+1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -58,6 +58,7 @@ $NAME_MAP = <<'END';
|
|||
{ "Latin-Hebrew", "Hebrew-Latin", "lhebrew" }
|
||||
{ "Latin-Jamo", "Jamo-Latin", "ljamo" }
|
||||
{ "Latin-Kana", "Kana-Latin", "lkana" }
|
||||
{ "Hiragana-Katakana", "Katakana-Hiragana", "kana" }
|
||||
|
||||
// Other miscellaneous rules
|
||||
{ "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/LocaleElements.java,v $
|
||||
* $Date: 2000/03/10 04:07:27 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2000/06/30 00:00:09 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -64,6 +64,9 @@ public class LocaleElements extends ListResourceBundle {
|
|||
"Latin-Kana",
|
||||
"*Kana-Latin",
|
||||
|
||||
"Hiragana-Katakana",
|
||||
"*Katakana-Hiragana",
|
||||
|
||||
"StraightQuotes-CurlyQuotes",
|
||||
"*CurlyQuotes-StraightQuotes",
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
||||
* $Date: 2000/06/29 21:59:36 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2000/06/30 00:00:21 $
|
||||
* $Revision: 1.24 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -604,6 +604,42 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the Hiragana-Katakana transliterator.
|
||||
*/
|
||||
public void TestHiraganaKatakana() {
|
||||
Transliterator hk = Transliterator.getInstance("Hiragana-Katakana");
|
||||
Transliterator kh = Transliterator.getInstance("Katakana-Hiragana");
|
||||
|
||||
// Array of 3n items
|
||||
// Each group of three is "hk"|"kh"|"both", <Hiragana>, <Katakana>
|
||||
String[] DATA = {
|
||||
"both",
|
||||
"\u3042\u3090\u3099\u3092\u3050",
|
||||
"\u30A2\u30F8\u30F2\u30B0",
|
||||
|
||||
"kh",
|
||||
"\u307C\u3051\u3060\u3042\u3093\u30FC",
|
||||
"\u30DC\u30F6\u30C0\u30FC\u30F3\u30FC",
|
||||
};
|
||||
|
||||
for (int i=0; i<DATA.length; i+=3) {
|
||||
switch (DATA[i].charAt(0)) {
|
||||
case 'h': // Hiragana-Katakana
|
||||
expect(hk, DATA[i+1], DATA[i+2]);
|
||||
break;
|
||||
case 'k': // Katakana-Hiragana
|
||||
expect(kh, DATA[i+2], DATA[i+1]);
|
||||
break;
|
||||
case 'b': // both
|
||||
expect(hk, DATA[i+1], DATA[i+2]);
|
||||
expect(kh, DATA[i+2], DATA[i+1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/resources/Attic/LocaleElements.java,v $
|
||||
* $Date: 2000/03/10 04:07:27 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2000/06/30 00:00:09 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -64,6 +64,9 @@ public class LocaleElements extends ListResourceBundle {
|
|||
"Latin-Kana",
|
||||
"*Kana-Latin",
|
||||
|
||||
"Hiragana-Katakana",
|
||||
"*Katakana-Hiragana",
|
||||
|
||||
"StraightQuotes-CurlyQuotes",
|
||||
"*CurlyQuotes-StraightQuotes",
|
||||
|
||||
|
|
221
icu4j/src/com/ibm/text/resources/TransliterationRule_Hiragana_Katakana.java
Executable file
221
icu4j/src/com/ibm/text/resources/TransliterationRule_Hiragana_Katakana.java
Executable file
|
@ -0,0 +1,221 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (C) 1997-2000, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/resources/Attic/TransliterationRule_Hiragana_Katakana.java,v $
|
||||
* $Date: 2000/06/30 00:00:09 $
|
||||
* $Revision: 1.1 $
|
||||
*******************************************************************************
|
||||
* Date Name Description
|
||||
* 06/29/00 aliu Creation.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
public class TransliterationRule_Hiragana_Katakana extends ListResourceBundle {
|
||||
/**
|
||||
* Overrides ListResourceBundle
|
||||
*/
|
||||
public Object[][] getContents() {
|
||||
return new Object[][] {
|
||||
{ "Rule", "" +
|
||||
|
||||
// Hiragana-Katakana
|
||||
|
||||
// This is largely a one-to-one mapping, but it has a
|
||||
// few kinks:
|
||||
|
||||
// 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
|
||||
// Hiragana equivalents. We use Hiragana wa/wi/we/wo
|
||||
// (308F-3092) with a voicing mark (3099), which is
|
||||
// semantically equivalent. However, this is a non-
|
||||
// roundtripping transformation.
|
||||
|
||||
// 2. The Katakana small ka/ke (30F5,30F6) have no
|
||||
// Hiragana equivalents. We convert them to normal
|
||||
// Hiragana ka/ke (304B,3051). This is a one-way
|
||||
// information-losing transformation and precludes
|
||||
// round-tripping of 30F5 and 30F6.
|
||||
|
||||
// 3. The combining marks 3099-309C are in the Hiragana
|
||||
// block, but they apply to Katakana as well, so we
|
||||
// leave them untouched.
|
||||
|
||||
// 4. The Katakana prolonged sound mark 30FC doubles the
|
||||
// preceding vowel. This is a one-way information-
|
||||
// losing transformation from Katakana to Hiragana.
|
||||
|
||||
// 5. The Katakana middle dot separates words in foreign
|
||||
// expressions; we leave this unmodified.
|
||||
|
||||
// The above points preclude successful round-trip
|
||||
// transformations of arbitrary input text. However,
|
||||
// they provide naturalistic results that should conform
|
||||
// to natural language expectations.
|
||||
|
||||
|
||||
// Combining equivalents
|
||||
"\u308F\u3099 <> \u30F7;" +
|
||||
"\u3090\u3099 <> \u30F8;" +
|
||||
"\u3091\u3099 <> \u30F9;" +
|
||||
"\u3092\u3099 <> \u30FA;" +
|
||||
|
||||
// One-to-one mappings, main block
|
||||
// 3041:3094 <> 30A1:30F4
|
||||
// 309D,E <> 30FD,E
|
||||
"\u3041 <> \u30A1;" +
|
||||
"\u3042 <> \u30A2;" +
|
||||
"\u3043 <> \u30A3;" +
|
||||
"\u3044 <> \u30A4;" +
|
||||
"\u3045 <> \u30A5;" +
|
||||
"\u3046 <> \u30A6;" +
|
||||
"\u3047 <> \u30A7;" +
|
||||
"\u3048 <> \u30A8;" +
|
||||
"\u3049 <> \u30A9;" +
|
||||
"\u304A <> \u30AA;" +
|
||||
"\u304B <> \u30AB;" +
|
||||
"\u304C <> \u30AC;" +
|
||||
"\u304D <> \u30AD;" +
|
||||
"\u304E <> \u30AE;" +
|
||||
"\u304F <> \u30AF;" +
|
||||
"\u3050 <> \u30B0;" +
|
||||
"\u3051 <> \u30B1;" +
|
||||
"\u3052 <> \u30B2;" +
|
||||
"\u3053 <> \u30B3;" +
|
||||
"\u3054 <> \u30B4;" +
|
||||
"\u3055 <> \u30B5;" +
|
||||
"\u3056 <> \u30B6;" +
|
||||
"\u3057 <> \u30B7;" +
|
||||
"\u3058 <> \u30B8;" +
|
||||
"\u3059 <> \u30B9;" +
|
||||
"\u305A <> \u30BA;" +
|
||||
"\u305B <> \u30BB;" +
|
||||
"\u305C <> \u30BC;" +
|
||||
"\u305D <> \u30BD;" +
|
||||
"\u305E <> \u30BE;" +
|
||||
"\u305F <> \u30BF;" +
|
||||
"\u3060 <> \u30C0;" +
|
||||
"\u3061 <> \u30C1;" +
|
||||
"\u3062 <> \u30C2;" +
|
||||
"\u3063 <> \u30C3;" +
|
||||
"\u3064 <> \u30C4;" +
|
||||
"\u3065 <> \u30C5;" +
|
||||
"\u3066 <> \u30C6;" +
|
||||
"\u3067 <> \u30C7;" +
|
||||
"\u3068 <> \u30C8;" +
|
||||
"\u3069 <> \u30C9;" +
|
||||
"\u306A <> \u30CA;" +
|
||||
"\u306B <> \u30CB;" +
|
||||
"\u306C <> \u30CC;" +
|
||||
"\u306D <> \u30CD;" +
|
||||
"\u306E <> \u30CE;" +
|
||||
"\u306F <> \u30CF;" +
|
||||
"\u3070 <> \u30D0;" +
|
||||
"\u3071 <> \u30D1;" +
|
||||
"\u3072 <> \u30D2;" +
|
||||
"\u3073 <> \u30D3;" +
|
||||
"\u3074 <> \u30D4;" +
|
||||
"\u3075 <> \u30D5;" +
|
||||
"\u3076 <> \u30D6;" +
|
||||
"\u3077 <> \u30D7;" +
|
||||
"\u3078 <> \u30D8;" +
|
||||
"\u3079 <> \u30D9;" +
|
||||
"\u307A <> \u30DA;" +
|
||||
"\u307B <> \u30DB;" +
|
||||
"\u307C <> \u30DC;" +
|
||||
"\u307D <> \u30DD;" +
|
||||
"\u307E <> \u30DE;" +
|
||||
"\u307F <> \u30DF;" +
|
||||
"\u3080 <> \u30E0;" +
|
||||
"\u3081 <> \u30E1;" +
|
||||
"\u3082 <> \u30E2;" +
|
||||
"\u3083 <> \u30E3;" +
|
||||
"\u3084 <> \u30E4;" +
|
||||
"\u3085 <> \u30E5;" +
|
||||
"\u3086 <> \u30E6;" +
|
||||
"\u3087 <> \u30E7;" +
|
||||
"\u3088 <> \u30E8;" +
|
||||
"\u3089 <> \u30E9;" +
|
||||
"\u308A <> \u30EA;" +
|
||||
"\u308B <> \u30EB;" +
|
||||
"\u308C <> \u30EC;" +
|
||||
"\u308D <> \u30ED;" +
|
||||
"\u308E <> \u30EE;" +
|
||||
"\u308F <> \u30EF;" +
|
||||
"\u3090 <> \u30F0;" +
|
||||
"\u3091 <> \u30F1;" +
|
||||
"\u3092 <> \u30F2;" +
|
||||
"\u3093 <> \u30F3;" +
|
||||
"\u3094 <> \u30F4;" +
|
||||
"\u309D <> \u30FD;" +
|
||||
"\u309E <> \u30FE;" +
|
||||
|
||||
// Fallback; this is a one-way Katakana-Hiragana xform.
|
||||
"\u304B < \u30F5;" +
|
||||
"\u3051 < \u30F6;" +
|
||||
|
||||
// Anything followed by a prolonged sound mark 30FC has
|
||||
// its final vowel doubled. This is a Katakana-Hiragana
|
||||
// one-way information-losing transformation. We
|
||||
// include the small kana (e.g., Hiragana small A 3041) and
|
||||
// do not distinguish them from their large
|
||||
// counterparts. It doesn't make sense to double a
|
||||
// small counterpart vowel as a small Hiragana vowel, so
|
||||
// we don't do so. In natural text this should never
|
||||
// occur anyway. If a 30FC is seen without a preceding
|
||||
// vowel sound (e.g., after n 30F3) we do not change it.
|
||||
|
||||
"$long = \u30FC;" +
|
||||
|
||||
// The following categories are Hiragana, not Katakana
|
||||
// as might be expected, since by the time we get to the
|
||||
// 30FC, the preceding character will have already been
|
||||
// transformed to Hiragana.
|
||||
|
||||
// {The following mechanically generated from the
|
||||
// Unicode 3.0 data:}
|
||||
|
||||
"$xa = [" +
|
||||
"\u3041 \u3042 \u304B \u304C \u3055 \u3056" +
|
||||
"\u305F \u3060 \u306A \u306F \u3070 \u3071" +
|
||||
"\u307E \u3083 \u3084 \u3089 \u308E \u308F" +
|
||||
"];" +
|
||||
|
||||
"$xi = [" +
|
||||
"\u3043 \u3044 \u304D \u304E \u3057 \u3058" +
|
||||
"\u3061 \u3062 \u306B \u3072 \u3073 \u3074" +
|
||||
"\u307F \u308A \u3090" +
|
||||
"];" +
|
||||
|
||||
"$xu = [" +
|
||||
"\u3045 \u3046 \u304F \u3050 \u3059 \u305A" +
|
||||
"\u3063 \u3064 \u3065 \u306C \u3075 \u3076" +
|
||||
"\u3077 \u3080 \u3085 \u3086 \u308B \u3094" +
|
||||
"];" +
|
||||
|
||||
"$xe = [" +
|
||||
"\u3047 \u3048 \u3051 \u3052 \u305B \u305C" +
|
||||
"\u3066 \u3067 \u306D \u3078 \u3079 \u307A" +
|
||||
"\u3081 \u308C \u3091" +
|
||||
"];" +
|
||||
|
||||
"$xo = [" +
|
||||
"\u3049 \u304A \u3053 \u3054 \u305D \u305E" +
|
||||
"\u3068 \u3069 \u306E \u307B \u307C \u307D" +
|
||||
"\u3082 \u3087 \u3088 \u308D \u3092" +
|
||||
"];" +
|
||||
|
||||
"\u3042 < $xa {$long};" +
|
||||
"\u3044 < $xi {$long};" +
|
||||
"\u3046 < $xu {$long};" +
|
||||
"\u3048 < $xe {$long};" +
|
||||
"\u304A < $xo {$long};" +
|
||||
|
||||
""
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
|
@ -58,6 +58,7 @@ $NAME_MAP = <<'END';
|
|||
{ "Latin-Hebrew", "Hebrew-Latin", "lhebrew" }
|
||||
{ "Latin-Jamo", "Jamo-Latin", "ljamo" }
|
||||
{ "Latin-Kana", "Kana-Latin", "lkana" }
|
||||
{ "Hiragana-Katakana", "Katakana-Hiragana", "kana" }
|
||||
|
||||
// Other miscellaneous rules
|
||||
{ "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
|
||||
|
|
Loading…
Add table
Reference in a new issue