mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-114 Transliterator framework first working version
X-SVN-Rev: 194
This commit is contained in:
parent
a2f31432aa
commit
bd14077b79
35 changed files with 14712 additions and 1 deletions
1806
icu4c/data/translit/expcon.txt
Normal file
1806
icu4c/data/translit/expcon.txt
Normal file
File diff suppressed because it is too large
Load diff
128
icu4c/data/translit/kbdescl1.txt
Normal file
128
icu4c/data/translit/kbdescl1.txt
Normal file
|
@ -0,0 +1,128 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// KeyboardEscape-Latin1
|
||||
|
||||
kbdescl1 {
|
||||
Rule {
|
||||
"esc=''\n"
|
||||
"grave=`\n"
|
||||
"acute=''\n"
|
||||
"hat=^\n"
|
||||
"tilde=~\n"
|
||||
"umlaut=:\n"
|
||||
"ring=.\n"
|
||||
"cedilla=,\n"
|
||||
"slash=/\n"
|
||||
"super=^\n"
|
||||
|
||||
// Make keyboard entry of {esc} possible
|
||||
// and of backslash
|
||||
"'\\'{esc}>{esc}\n"
|
||||
"'\\\\'>'\\'\n"
|
||||
|
||||
// Long keys
|
||||
"cur{esc}>\u00A4\n"
|
||||
"sec{esc}>\u00A7\n"
|
||||
"not{esc}>\u00AC\n"
|
||||
"mul{esc}>\u00D7\n"
|
||||
"div{esc}>\u00F7\n"
|
||||
|
||||
" {esc}>\u00A0\n" // non-breaking space
|
||||
"!{esc}>\u00A1\n" // inverted exclamation
|
||||
"c/{esc}>\u00A2\n" // cent sign
|
||||
"lb{esc}>\u00A3\n" // pound sign
|
||||
"'|'{esc}>\u00A6\n" // broken vertical bar
|
||||
":{esc}>\u00A8\n" // umlaut
|
||||
"{super}a{esc}>\u00AA\n" // feminine ordinal
|
||||
"'<<'{esc}>\u00AB\n"
|
||||
"r{esc}>\u00AE\n"
|
||||
"--{esc}>\u00AF\n"
|
||||
"-{esc}>\u00AD\n"
|
||||
"+-{esc}>\u00B1\n"
|
||||
"{super}2{esc}>\u00B2\n"
|
||||
"{super}3{esc}>\u00B3\n"
|
||||
"{acute}{esc}>\u00B4\n"
|
||||
"m{esc}>\u00B5\n"
|
||||
"para{esc}>\u00B6\n"
|
||||
"dot{esc}>\u00B7\n"
|
||||
"{cedilla}{esc}>\u00B8\n"
|
||||
"{super}1{esc}>\u00B9\n"
|
||||
"{super}o{esc}>\u00BA\n" // masculine ordinal
|
||||
"'>>'{esc}>\u00BB\n"
|
||||
"1/4{esc}>\u00BC\n"
|
||||
"1/2{esc}>\u00BD\n"
|
||||
"3/4{esc}>\u00BE\n"
|
||||
"?{esc}>\u00BF\n"
|
||||
"A{grave}{esc}>\u00C0\n"
|
||||
"A{acute}{esc}>\u00C1\n"
|
||||
"A{hat}{esc}>\u00C2\n"
|
||||
"A{tilde}{esc}>\u00C3\n"
|
||||
"A{umlaut}{esc}>\u00C4\n"
|
||||
"A{ring}{esc}>\u00C5\n"
|
||||
"AE{esc}>\u00C6\n"
|
||||
"C{cedilla}{esc}>\u00C7\n"
|
||||
"E{grave}{esc}>\u00C8\n"
|
||||
"E{acute}{esc}>\u00C9\n"
|
||||
"E{hat}{esc}>\u00CA\n"
|
||||
"E{umlaut}{esc}>\u00CB\n"
|
||||
"I{grave}{esc}>\u00CC\n"
|
||||
"I{acute}{esc}>\u00CD\n"
|
||||
"I{hat}{esc}>\u00CE\n"
|
||||
"I{umlaut}{esc}>\u00CF\n"
|
||||
"D-{esc}>\u00D0\n"
|
||||
"N{tilde}{esc}>\u00D1\n"
|
||||
"O{grave}{esc}>\u00D2\n"
|
||||
"O{acute}{esc}>\u00D3\n"
|
||||
"O{hat}{esc}>\u00D4\n"
|
||||
"O{tilde}{esc}>\u00D5\n"
|
||||
"O{umlaut}{esc}>\u00D6\n"
|
||||
"O{slash}{esc}>\u00D8\n"
|
||||
"U{grave}{esc}>\u00D9\n"
|
||||
"U{acute}{esc}>\u00DA\n"
|
||||
"U{hat}{esc}>\u00DB\n"
|
||||
"U{umlaut}{esc}>\u00DC\n"
|
||||
"Y{acute}{esc}>\u00DD\n"
|
||||
"TH{esc}>\u00DE\n"
|
||||
"ss{esc}>\u00DF\n"
|
||||
"a{grave}{esc}>\u00E0\n"
|
||||
"a{acute}{esc}>\u00E1\n"
|
||||
"a{hat}{esc}>\u00E2\n"
|
||||
"a{tilde}{esc}>\u00E3\n"
|
||||
"a{umlaut}{esc}>\u00E4\n"
|
||||
"a{ring}{esc}>\u00E5\n"
|
||||
"ae{esc}>\u00E6\n"
|
||||
"c{cedilla}{esc}>\u00E7\n"
|
||||
"c{esc}>\u00A9\n" // copyright - after c{cedilla}
|
||||
"e{grave}{esc}>\u00E8\n"
|
||||
"e{acute}{esc}>\u00E9\n"
|
||||
"e{hat}{esc}>\u00EA\n"
|
||||
"e{umlaut}{esc}>\u00EB\n"
|
||||
"i{grave}{esc}>\u00EC\n"
|
||||
"i{acute}{esc}>\u00ED\n"
|
||||
"i{hat}{esc}>\u00EE\n"
|
||||
"i{umlaut}{esc}>\u00EF\n"
|
||||
"d-{esc}>\u00F0\n"
|
||||
"n{tilde}{esc}>\u00F1\n"
|
||||
"o{grave}{esc}>\u00F2\n"
|
||||
"o{acute}{esc}>\u00F3\n"
|
||||
"o{hat}{esc}>\u00F4\n"
|
||||
"o{tilde}{esc}>\u00F5\n"
|
||||
"o{umlaut}{esc}>\u00F6\n"
|
||||
"o{slash}{esc}>\u00F8\n"
|
||||
"o{esc}>\u00B0\n"
|
||||
"u{grave}{esc}>\u00F9\n"
|
||||
"u{acute}{esc}>\u00FA\n"
|
||||
"u{hat}{esc}>\u00FB\n"
|
||||
"u{umlaut}{esc}>\u00FC\n"
|
||||
"y{acute}{esc}>\u00FD\n"
|
||||
"y{esc}>\u00A5\n" // yen sign
|
||||
"th{esc}>\u00FE\n"
|
||||
"y{umlaut}{esc}>\u00FF\n" // y with diaeresis
|
||||
}
|
||||
}
|
240
icu4c/data/translit/larabic.txt
Normal file
240
icu4c/data/translit/larabic.txt
Normal file
|
@ -0,0 +1,240 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Arabic
|
||||
|
||||
larabic {
|
||||
Rule {
|
||||
// To Do: finish adding shadda, add sokoon
|
||||
|
||||
"alefmadda=\u0622\n"
|
||||
"alefuhamza=\u0623\n"
|
||||
"wauuhamza=\u0624\n"
|
||||
"alefhamza=\u0625\n"
|
||||
"yehuhamza=\u0626\n"
|
||||
"alef=\u0627\n"
|
||||
"beh=\u0628\n"
|
||||
"tehmarbuta=\u0629\n"
|
||||
"teh=\u062A\n"
|
||||
"theh=\u062B\n"
|
||||
"geem=\u062C\n"
|
||||
"hah=\u062D\n"
|
||||
"kha=\u062E\n"
|
||||
"dal=\u062F\n"
|
||||
"dhal=\u0630\n"
|
||||
"reh=\u0631\n"
|
||||
"zain=\u0632\n"
|
||||
"seen=\u0633\n"
|
||||
"sheen=\u0634\n"
|
||||
"sad=\u0635\n"
|
||||
"dad=\u0636\n"
|
||||
"tah=\u0637\n"
|
||||
"zah=\u0638\n"
|
||||
"ein=\u0639\n"
|
||||
"ghein=\u063A\n"
|
||||
"feh=\u0641\n"
|
||||
"qaaf=\u0642\n"
|
||||
"kaf=\u0643\n"
|
||||
"lam=\u0644\n"
|
||||
"meem=\u0645\n"
|
||||
"noon=\u0646\n"
|
||||
"heh=\u0647\n"
|
||||
"wau=\u0648\n"
|
||||
"yehmaqsura=\u0649\n"
|
||||
"yeh=\u064A\n"
|
||||
"peh=\u06A4\n"
|
||||
|
||||
"hamza=\u0621\n"
|
||||
"fathatein=\u064B\n"
|
||||
"dammatein=\u064C\n"
|
||||
"kasratein=\u064D\n"
|
||||
"fatha=\u064E\n"
|
||||
"damma=\u064F\n"
|
||||
"kasra=\u0650\n"
|
||||
"shadda=\u0651\n"
|
||||
"sokoon=\u0652\n"
|
||||
|
||||
// convert English to Arabic
|
||||
"Arabic>"
|
||||
"\u062a\u062a\u0645\u062a\u0639\u0020"
|
||||
"\u0627\u0644\u0644\u063a\u0629\u0020"
|
||||
"\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"
|
||||
"\u0628\u0628\u0646\u0638\u0645\u0020"
|
||||
"\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"
|
||||
"\u062c\u0645\u064a\u0644\u0629\n"
|
||||
|
||||
"ai>{alefmadda}\n"
|
||||
"ae>{alefuhamza}\n"
|
||||
"ao>{alefhamza}\n"
|
||||
"aa>{alef}\n"
|
||||
"an>{fathatein}\n"
|
||||
"a>{fatha}\n"
|
||||
"b>{beh}\n"
|
||||
"c>{kaf}\n"
|
||||
"{dhal}]dh>{shadda}\n"
|
||||
"dh>{dhal}\n"
|
||||
"{dad}]dd>{shadda}\n"
|
||||
"dd>{dad}\n"
|
||||
"{dal}]d>{shadda}\n"
|
||||
"d>{dal}\n"
|
||||
"e>{ein}\n"
|
||||
"f>{feh}\n"
|
||||
"gh>{ghein}\n"
|
||||
"g>{geem}\n"
|
||||
"hh>{hah}\n"
|
||||
"h>{heh}\n"
|
||||
"ii>{kasratein}\n"
|
||||
"i>{kasra}\n"
|
||||
"j>{geem}\n"
|
||||
"kh>{kha}\n"
|
||||
"k>{kaf}\n"
|
||||
"l>{lam}\n"
|
||||
"m>{meem}\n"
|
||||
"n>{noon}\n"
|
||||
"o>{hamza}\n"
|
||||
"p>{peh}\n"
|
||||
"q>{qaaf}\n"
|
||||
"r>{reh}\n"
|
||||
"sh>{sheen}\n"
|
||||
"ss>{sad}\n"
|
||||
"s>{seen}\n"
|
||||
"th>{theh}\n"
|
||||
"tm>{tehmarbuta}\n"
|
||||
"tt>{tah}\n"
|
||||
"t>{teh}\n"
|
||||
"uu>{dammatein}\n"
|
||||
"u>{damma}\n"
|
||||
"v>{beh}\n"
|
||||
"we>{wauuhamza}\n"
|
||||
"w>{wau}\n"
|
||||
"x>{kaf}{shadda}{seen}\n"
|
||||
"ye>{yehuhamza}\n"
|
||||
"ym>{yehmaqsura}\n"
|
||||
"y>{yeh}\n"
|
||||
"zz>{zah}\n"
|
||||
"z>{zain}\n"
|
||||
|
||||
"0>\u0660\n" // Arabic digit 0
|
||||
"1>\u0661\n" // Arabic digit 1
|
||||
"2>\u0662\n" // Arabic digit 2
|
||||
"3>\u0663\n" // Arabic digit 3
|
||||
"4>\u0664\n" // Arabic digit 4
|
||||
"5>\u0665\n" // Arabic digit 5
|
||||
"6>\u0666\n" // Arabic digit 6
|
||||
"7>\u0667\n" // Arabic digit 7
|
||||
"8>\u0668\n" // Arabic digit 8
|
||||
"9>\u0669\n" // Arabic digit 9
|
||||
"%>\u066A\n" // Arabic %
|
||||
".>\u066B\n" // Arabic decimal separator
|
||||
",>\u066C\n" // Arabic thousands separator
|
||||
"*>\u066D\n" // Arabic five-pointed star
|
||||
|
||||
"`0>0\n" // Escaped forms of the above
|
||||
"`1>1\n"
|
||||
"`2>2\n"
|
||||
"`3>3\n"
|
||||
"`4>4\n"
|
||||
"`5>5\n"
|
||||
"`6>6\n"
|
||||
"`7>7\n"
|
||||
"`8>8\n"
|
||||
"`9>9\n"
|
||||
"`%>%\n"
|
||||
"`.>.\n"
|
||||
"`,>,\n"
|
||||
"`*>*\n"
|
||||
"``>`\n"
|
||||
|
||||
"''>\n"
|
||||
|
||||
// now Arabic to English
|
||||
|
||||
"''ai<a]{alefmadda}\n"
|
||||
"ai<{alefmadda}\n"
|
||||
"''ae<a]{alefuhamza}\n"
|
||||
"ae<{alefuhamza}\n"
|
||||
"''ao<a]{alefhamza}\n"
|
||||
"ao<{alefhamza}\n"
|
||||
"''aa<a]{alef}\n"
|
||||
"aa<{alef}\n"
|
||||
"''an<a]{fathatein}\n"
|
||||
"an<{fathatein}\n"
|
||||
"''a<a]{fatha}\n"
|
||||
"a<{fatha}\n"
|
||||
"b<{beh}\n"
|
||||
"''dh<d]{dhal}\n"
|
||||
"dh<{dhal}\n"
|
||||
"''dd<d]{dad}\n"
|
||||
"dd<{dad}\n"
|
||||
"''d<d]{dal}\n"
|
||||
"d<{dal}\n"
|
||||
"''e<a]{ein}\n"
|
||||
"''e<w]{ein}\n"
|
||||
"''e<y]{ein}\n"
|
||||
"e<{ein}\n"
|
||||
"f<{feh}\n"
|
||||
"gh<{ghein}\n"
|
||||
"''hh<d]{hah}\n"
|
||||
"''hh<t]{hah}\n"
|
||||
"''hh<k]{hah}\n"
|
||||
"''hh<s]{hah}\n"
|
||||
"hh<{hah}\n"
|
||||
"''h<d]{heh}\n"
|
||||
"''h<t]{heh}\n"
|
||||
"''h<k]{heh}\n"
|
||||
"''h<s]{heh}\n"
|
||||
"h<{heh}\n"
|
||||
"''ii<i]{kasratein}\n"
|
||||
"ii<{kasratein}\n"
|
||||
"''i<i]{kasra}\n"
|
||||
"i<{kasra}\n"
|
||||
"j<{geem}\n"
|
||||
"kh<{kha}\n"
|
||||
"x<{kaf}{shadda}{seen}\n"
|
||||
"k<{kaf}\n"
|
||||
"l<{lam}\n"
|
||||
"''m<y]{meem}\n"
|
||||
"''m<t]{meem}\n"
|
||||
"m<{meem}\n"
|
||||
"n<{noon}\n"
|
||||
"''o<a]{hamza}\n"
|
||||
"o<{hamza}\n"
|
||||
"p<{peh}\n"
|
||||
"q<{qaaf}\n"
|
||||
"r<{reh}\n"
|
||||
"sh<{sheen}\n"
|
||||
"''ss<s]{sad}\n"
|
||||
"ss<{sad}\n"
|
||||
"''s<s]{seen}\n"
|
||||
"s<{seen}\n"
|
||||
"th<{theh}\n"
|
||||
"tm<{tehmarbuta}\n"
|
||||
"''tt<t]{tah}\n"
|
||||
"tt<{tah}\n"
|
||||
"''t<t]{teh}\n"
|
||||
"t<{teh}\n"
|
||||
"''uu<u]{dammatein}\n"
|
||||
"uu<{dammatein}\n"
|
||||
"''u<u]{damma}\n"
|
||||
"u<{damma}\n"
|
||||
"we<{wauuhamza}\n"
|
||||
"w<{wau}\n"
|
||||
"ye<{yehuhamza}\n"
|
||||
"ym<{yehmaqsura}\n"
|
||||
"''y<y]{yeh}\n"
|
||||
"y<{yeh}\n"
|
||||
"''zz<z]{zah}\n"
|
||||
"zz<{zah}\n"
|
||||
"''z<z]{zain}\n"
|
||||
"z<{zain}\n"
|
||||
|
||||
"dh<dh]{shadda}\n"
|
||||
"dd<dd]{shadda}\n"
|
||||
"''d<d]{shadda}\n"
|
||||
}
|
||||
}
|
411
icu4c/data/translit/ldevan.txt
Normal file
411
icu4c/data/translit/ldevan.txt
Normal file
|
@ -0,0 +1,411 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Devanagari
|
||||
|
||||
ldevan {
|
||||
Rule {
|
||||
//#####################################################################
|
||||
// Keyboard Transliteration Table
|
||||
//#####################################################################
|
||||
// Conversions should be:
|
||||
// 1. complete
|
||||
// * convert every sequence of Latin letters (a to z plus apostrophe)
|
||||
// to a sequence of Native letters
|
||||
// * convert every sequence of Native letters to Latin letters
|
||||
// 2. reversable
|
||||
// * any string of Native converted to Latin and back should be the same
|
||||
// * this is not true for English converted to Native & back, e.g.:
|
||||
// k -> {kaf} -> k
|
||||
// c -> {kaf} -> k
|
||||
//#####################################################################
|
||||
// Sequences of Latin letters may convert to a single Native letter.
|
||||
// When this is the case, an apostrophe can be used to indicate separate
|
||||
// letters.$
|
||||
// E.g. sh -> {shin}
|
||||
// s'h -> {sin}{heh}
|
||||
// ss -> {sad}
|
||||
// s's -> {sin}{shadda}
|
||||
//#####################################################################
|
||||
// To Do:
|
||||
// finish adding shadda, add sokoon, fix uppercase
|
||||
// make two transliteration tables: one with vowels, one without
|
||||
//#####################################################################
|
||||
// Modifications
|
||||
// Devanagari Transliterator: broken up with consonsants/vowels
|
||||
//#####################################################################
|
||||
// Unicode character name definitions
|
||||
//#####################################################################
|
||||
|
||||
//consonants
|
||||
"candrabindu=\u0901\n"
|
||||
"bindu=\u0902\n"
|
||||
"visarga=\u0903\n"
|
||||
|
||||
// w<vowel> represents the stand-alone form
|
||||
"wa=\u0905\n"
|
||||
"waa=\u0906\n"
|
||||
"wi=\u0907\n"
|
||||
"wii=\u0908\n"
|
||||
"wu=\u0909\n"
|
||||
"wuu=\u090A\n"
|
||||
"wr=\u090B\n"
|
||||
"wl=\u090C\n"
|
||||
"we=\u090F\n"
|
||||
"wai=\u0910\n"
|
||||
"wo=\u0913\n"
|
||||
"wau=\u0914\n"
|
||||
|
||||
"ka=\u0915\n"
|
||||
"kha=\u0916\n"
|
||||
"ga=\u0917\n"
|
||||
"gha=\u0918\n"
|
||||
"nga=\u0919\n"
|
||||
|
||||
"ca=\u091A\n"
|
||||
"cha=\u091B\n"
|
||||
"ja=\u091C\n"
|
||||
"jha=\u091D\n"
|
||||
"nya=\u091E\n"
|
||||
|
||||
"tta=\u091F\n"
|
||||
"ttha=\u0920\n"
|
||||
"dda=\u0921\n"
|
||||
"ddha=\u0922\n"
|
||||
"nna=\u0923\n"
|
||||
|
||||
"ta=\u0924\n"
|
||||
"tha=\u0925\n"
|
||||
"da=\u0926\n"
|
||||
"dha=\u0927\n"
|
||||
"na=\u0928\n"
|
||||
|
||||
"pa=\u092A\n"
|
||||
"pha=\u092B\n"
|
||||
"ba=\u092C\n"
|
||||
"bha=\u092D\n"
|
||||
"ma=\u092E\n"
|
||||
|
||||
"ya=\u092F\n"
|
||||
"ra=\u0930\n"
|
||||
"rra=\u0931\n"
|
||||
"la=\u0932\n" // DEVANAGARI LETTER LA (U+0933 is LLA)
|
||||
"va=\u0935\n"
|
||||
|
||||
"sha=\u0936\n"
|
||||
"ssa=\u0937\n"
|
||||
"sa=\u0938\n"
|
||||
"ha=\u0939\n"
|
||||
|
||||
// <vowel> represents the dependent form
|
||||
"aa=\u093E\n"
|
||||
"i=\u093F\n"
|
||||
"ii=\u0940\n"
|
||||
"u=\u0941\n"
|
||||
"uu=\u0942\n"
|
||||
"rh=\u0943\n"
|
||||
"lh=\u0962\n" // VOWEL SIGN VOCALIC L, dependent form of wl (U+090C)
|
||||
"e=\u0947\n"
|
||||
"ai=\u0948\n"
|
||||
"o=\u094B\n"
|
||||
"au=\u094C\n"
|
||||
|
||||
"virama=\u094D\n"
|
||||
|
||||
"wrr=\u0960\n"
|
||||
"rrh=\u0944\n" // VOWEL SIGN VOCALIC RR, dependent form of wrr (U+0960)
|
||||
|
||||
"danda=\u0964\n"
|
||||
"doubleDanda=\u0965\n"
|
||||
"depVowelAbove=[\u093E-\u0940\u0945-\u094C]\n"
|
||||
"depVowelBelow=[\u0941-\u0944]\n"
|
||||
// Ech: Double escape U+0000, so UnicodeString doesn't consider it
|
||||
// to be the end of the string. This is only necessary for U+0000
|
||||
// right now. [liu]
|
||||
"endThing=[{danda}{doubleDanda}\\u0000-\u08FF\u0980-\uFFFF]\n"
|
||||
|
||||
"&=[{virama}{aa}{ai}{au}{ii}{i}{uu}{u}{rrh}{rh}{lh}{e}{o}]\n"
|
||||
"%=[bcdfghjklmnpqrstvwxyz]\n"
|
||||
|
||||
//#####################################################################
|
||||
// convert from Latin letters to Native letters
|
||||
//#####################################################################
|
||||
//Hindi>\u092d\u093e\u0930\u0924--\u0020\u0926\u0947\u0936\u0020\u092c\u0928\u094d\u0927\u0941\u002e
|
||||
|
||||
// special forms with no good conversion
|
||||
|
||||
"mm>{bindu}\n"
|
||||
"x>{visarga}\n"
|
||||
|
||||
// convert to independent forms at start of word or syllable:
|
||||
// e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
|
||||
// Moved up [LIU]
|
||||
|
||||
"aa>{waa}\n"
|
||||
"ai>{wai}\n"
|
||||
"au>{wau}\n"
|
||||
"ii>{wii}\n"
|
||||
"i>{wi}\n"
|
||||
"uu>{wuu}\n"
|
||||
"u>{wu}\n"
|
||||
"rrh>{wrr}\n"
|
||||
"rh>{wr}\n"
|
||||
"lh>{wl}\n"
|
||||
"e>{we}\n"
|
||||
"o>{wo}\n"
|
||||
"a>{wa}\n"
|
||||
|
||||
// normal consonants
|
||||
|
||||
"kh>{kha}|{virama}\n"
|
||||
"k>{ka}|{virama}\n"
|
||||
"q>{ka}|{virama}\n"
|
||||
"gh>{gha}|{virama}\n"
|
||||
"g>{ga}|{virama}\n"
|
||||
"ng>{nga}|{virama}\n"
|
||||
"ch>{cha}|{virama}\n"
|
||||
"c>{ca}|{virama}\n"
|
||||
"jh>{jha}|{virama}\n"
|
||||
"j>{ja}|{virama}\n"
|
||||
"ny>{nya}|{virama}\n"
|
||||
"tth>{ttha}|{virama}\n"
|
||||
"tt>{tta}|{virama}\n"
|
||||
"ddh>{ddha}|{virama}\n"
|
||||
"dd>{dda}|{virama}\n"
|
||||
"nn>{nna}|{virama}\n"
|
||||
"th>{tha}|{virama}\n"
|
||||
"t>{ta}|{virama}\n"
|
||||
"dh>{dha}|{virama}\n"
|
||||
"d>{da}|{virama}\n"
|
||||
"n>{na}|{virama}\n"
|
||||
"ph>{pha}|{virama}\n"
|
||||
"p>{pa}|{virama}\n"
|
||||
"bh>{bha}|{virama}\n"
|
||||
"b>{ba}|{virama}\n"
|
||||
"m>{ma}|{virama}\n"
|
||||
"y>{ya}|{virama}\n"
|
||||
"r>{ra}|{virama}\n"
|
||||
"l>{la}|{virama}\n"
|
||||
"v>{va}|{virama}\n"
|
||||
"f>{va}|{virama}\n"
|
||||
"w>{va}|{virama}\n"
|
||||
"sh>{sha}|{virama}\n"
|
||||
"ss>{ssa}|{virama}\n"
|
||||
"s>{sa}|{virama}\n"
|
||||
"z>{sa}|{virama}\n"
|
||||
"h>{ha}|{virama}\n"
|
||||
|
||||
".>{danda}\n"
|
||||
"{danda}.>{doubleDanda}\n"
|
||||
"{depVowelAbove}]~>{bindu}\n"
|
||||
"{depVowelBelow}]~>{candrabindu}\n"
|
||||
|
||||
// convert to dependent forms after consonant with no vowel:
|
||||
// e.g. kai -> {ka}{virama}ai -> {ka}{ai}
|
||||
|
||||
"{virama}aa>{aa}\n"
|
||||
"{virama}ai>{ai}\n"
|
||||
"{virama}au>{au}\n"
|
||||
"{virama}ii>{ii}\n"
|
||||
"{virama}i>{i}\n"
|
||||
"{virama}uu>{uu}\n"
|
||||
"{virama}u>{u}\n"
|
||||
"{virama}rrh>{rrh}\n"
|
||||
"{virama}rh>{rh}\n"
|
||||
"{virama}lh>{lh}\n"
|
||||
"{virama}e>{e}\n"
|
||||
"{virama}o>{o}\n"
|
||||
"{virama}a>\n"
|
||||
|
||||
// otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
|
||||
|
||||
"{virama}''aa>{waa}\n"
|
||||
"{virama}''ai>{wai}\n"
|
||||
"{virama}''au>{wau}\n"
|
||||
"{virama}''ii>{wii}\n"
|
||||
"{virama}''i>{wi}\n"
|
||||
"{virama}''uu>{wuu}\n"
|
||||
"{virama}''u>{wu}\n"
|
||||
"{virama}''rrh>{wrr}\n"
|
||||
"{virama}''rh>{wr}\n"
|
||||
"{virama}''lh>{wl}\n"
|
||||
"{virama}''e>{we}\n"
|
||||
"{virama}''o>{wo}\n"
|
||||
"{virama}''a>{wa}\n"
|
||||
|
||||
"{virama}[{endThing}>\n"
|
||||
|
||||
// convert any left-over apostrophes used for separation
|
||||
|
||||
"''>\n"
|
||||
|
||||
//#####################################################################
|
||||
// convert from Native letters to Latin letters
|
||||
//#####################################################################
|
||||
|
||||
// special forms with no good conversion
|
||||
|
||||
"mm<{bindu}\n"
|
||||
"x<{visarga}\n"
|
||||
|
||||
// normal consonants
|
||||
|
||||
"kh<{kha}[&\n"
|
||||
"kha<{kha}\n"
|
||||
"k''<{ka}{virama}[{ha}\n"
|
||||
"k<{ka}[&\n"
|
||||
"ka<{ka}\n"
|
||||
"gh<{gha}[&\n"
|
||||
"gha<{gha}\n"
|
||||
"g''<{ga}{virama}[{ha}\n"
|
||||
"g<{ga}[&\n"
|
||||
"ga<{ga}\n"
|
||||
"ng<{nga}[&\n"
|
||||
"nga<{nga}\n"
|
||||
"ch<{cha}[&\n"
|
||||
"cha<{cha}\n"
|
||||
"c''<{ca}{virama}[{ha}\n"
|
||||
"c<{ca}[&\n"
|
||||
"ca<{ca}\n"
|
||||
"jh<{jha}[&\n"
|
||||
"jha<{jha}\n"
|
||||
"j''<{ja}{virama}[{ha}\n"
|
||||
"j<{ja}[&\n"
|
||||
"ja<{ja}\n"
|
||||
"ny<{nya}[&\n"
|
||||
"nya<{nya}\n"
|
||||
"tth<{ttha}[&\n"
|
||||
"ttha<{ttha}\n"
|
||||
"tt''<{tta}{virama}[{ha}\n"
|
||||
"tt<{tta}[&\n"
|
||||
"tta<{tta}\n"
|
||||
"ddh<{ddha}[&\n"
|
||||
"ddha<{ddha}\n"
|
||||
"dd''<{dda}{virama}[{ha}\n" // explicit virama before {ha}: the apostrophe keeps "dd" + "h" from reading back as "ddh"
|
||||
"dd<{dda}[&\n"
|
||||
"dda<{dda}\n"
|
||||
"dh<{dha}[&\n"
|
||||
"dha<{dha}\n"
|
||||
"d''<{da}{virama}[{ha}\n"
|
||||
"d''<{da}{virama}[{ddha}\n"
|
||||
"d''<{da}{virama}[{dda}\n"
|
||||
"d''<{da}{virama}[{dha}\n"
|
||||
"d''<{da}{virama}[{da}\n"
|
||||
"d<{da}[&\n"
|
||||
"da<{da}\n"
|
||||
"th<{tha}[&\n"
|
||||
"tha<{tha}\n"
|
||||
"t''<{ta}{virama}[{ha}\n"
|
||||
"t''<{ta}{virama}[{ttha}\n"
|
||||
"t''<{ta}{virama}[{tta}\n"
|
||||
"t''<{ta}{virama}[{tha}\n"
|
||||
"t''<{ta}{virama}[{ta}\n"
|
||||
"t<{ta}[&\n"
|
||||
"ta<{ta}\n"
|
||||
"n''<{na}{virama}[{ga}\n"
|
||||
"n''<{na}{virama}[{ya}\n"
|
||||
"n<{na}[&\n"
|
||||
"na<{na}\n"
|
||||
"ph<{pha}[&\n"
|
||||
"pha<{pha}\n"
|
||||
"p''<{pa}{virama}[{ha}\n"
|
||||
"p<{pa}[&\n"
|
||||
"pa<{pa}\n"
|
||||
"bh<{bha}[&\n"
|
||||
"bha<{bha}\n"
|
||||
"b''<{ba}{virama}[{ha}\n"
|
||||
"b<{ba}[&\n"
|
||||
"ba<{ba}\n"
|
||||
"m''<{ma}{virama}[{ma}\n"
|
||||
"m''<{ma}{virama}[{bindu}\n"
|
||||
"m<{ma}[&\n"
|
||||
"ma<{ma}\n"
|
||||
"y<{ya}[&\n"
|
||||
"ya<{ya}\n"
|
||||
"r''<{ra}{virama}[{ha}\n"
|
||||
"r<{ra}[&\n"
|
||||
"ra<{ra}\n"
|
||||
"l''<{la}{virama}[{ha}\n"
|
||||
"l<{la}[&\n"
|
||||
"la<{la}\n"
|
||||
"v<{va}[&\n"
|
||||
"va<{va}\n"
|
||||
"sh<{sha}[&\n"
|
||||
"sha<{sha}\n"
|
||||
"ss<{ssa}[&\n"
|
||||
"ssa<{ssa}\n"
|
||||
"s''<{sa}{virama}[{ha}\n"
|
||||
"s''<{sa}{virama}[{sha}\n"
|
||||
"s''<{sa}{virama}[{ssa}\n"
|
||||
"s''<{sa}{virama}[{sa}\n"
|
||||
"s<{sa}[&\n"
|
||||
"sa<{sa}\n"
|
||||
"h<{ha}[&\n"
|
||||
"ha<{ha}\n"
|
||||
|
||||
// dependent vowels (should never occur except following consonants)
|
||||
|
||||
"aa<{aa}\n"
|
||||
"ai<{ai}\n"
|
||||
"au<{au}\n"
|
||||
"ii<{ii}\n"
|
||||
"i<{i}\n"
|
||||
"uu<{uu}\n"
|
||||
"u<{u}\n"
|
||||
"rrh<{rrh}\n"
|
||||
"rh<{rh}\n"
|
||||
"lh<{lh}\n"
|
||||
"e<{e}\n"
|
||||
"o<{o}\n"
|
||||
|
||||
// independent vowels (when following consonants)
|
||||
|
||||
"''aa<a]{waa}\n"
|
||||
"''aa<%]{waa}\n"
|
||||
"''ai<a]{wai}\n"
|
||||
"''ai<%]{wai}\n"
|
||||
"''au<a]{wau}\n"
|
||||
"''au<%]{wau}\n"
|
||||
"''ii<a]{wii}\n"
|
||||
"''ii<%]{wii}\n"
|
||||
"''i<a]{wi}\n"
|
||||
"''i<%]{wi}\n"
|
||||
"''uu<a]{wuu}\n"
|
||||
"''uu<%]{wuu}\n"
|
||||
"''u<a]{wu}\n"
|
||||
"''u<%]{wu}\n"
|
||||
"''rrh<%]{wrr}\n"
|
||||
"''rh<%]{wr}\n"
|
||||
"''lh<%]{wl}\n"
|
||||
"''e<%]{we}\n"
|
||||
"''o<%]{wo}\n"
|
||||
"''a<a]{wa}\n"
|
||||
"''a<%]{wa}\n"
|
||||
|
||||
|
||||
// independent vowels (otherwise)
|
||||
|
||||
"aa<{waa}\n"
|
||||
"ai<{wai}\n"
|
||||
"au<{wau}\n"
|
||||
"ii<{wii}\n"
|
||||
"i<{wi}\n"
|
||||
"uu<{wuu}\n"
|
||||
"u<{wu}\n"
|
||||
"rrh<{wrr}\n"
|
||||
"rh<{wr}\n"
|
||||
"lh<{wl}\n"
|
||||
"e<{we}\n"
|
||||
"o<{wo}\n"
|
||||
"a<{wa}\n"
|
||||
|
||||
// blow away any remaining viramas
|
||||
|
||||
"<{virama}\n"
|
||||
}
|
||||
}
|
380
icu4c/data/translit/lgreek.txt
Normal file
380
icu4c/data/translit/lgreek.txt
Normal file
|
@ -0,0 +1,380 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Greek
|
||||
|
||||
lgreek {
|
||||
Rule {
|
||||
// Greek Letters
|
||||
|
||||
"grAl=\u0391\n"
|
||||
"grBe=\u0392\n"
|
||||
"grGa=\u0393\n"
|
||||
"grDe=\u0394\n"
|
||||
"grEp=\u0395\n"
|
||||
"grZe=\u0396\n"
|
||||
"grEt=\u0397\n"
|
||||
"grTh=\u0398\n"
|
||||
"grIo=\u0399\n"
|
||||
"grKa=\u039A\n"
|
||||
"grLa=\u039B\n"
|
||||
"grMu=\u039C\n"
|
||||
"grNu=\u039D\n"
|
||||
"grKs=\u039E\n"
|
||||
"grOm=\u039F\n"
|
||||
"grPi=\u03A0\n"
|
||||
"grRh=\u03A1\n"
|
||||
"grSi=\u03A3\n"
|
||||
"grTa=\u03A4\n"
|
||||
"grUp=\u03A5\n"
|
||||
"grPh=\u03A6\n"
|
||||
"grKh=\u03A7\n"
|
||||
"grPs=\u03A8\n"
|
||||
"grOme=\u03A9\n"
|
||||
|
||||
"gral=\u03B1\n"
|
||||
"grbe=\u03B2\n"
|
||||
"grga=\u03B3\n"
|
||||
"grde=\u03B4\n"
|
||||
"grep=\u03B5\n"
|
||||
"grze=\u03B6\n"
|
||||
"gret=\u03B7\n"
|
||||
"grth=\u03B8\n"
|
||||
"grio=\u03B9\n"
|
||||
"grka=\u03BA\n"
|
||||
"grla=\u03BB\n"
|
||||
"grmu=\u03BC\n"
|
||||
"grnu=\u03BD\n"
|
||||
"grks=\u03BE\n"
|
||||
"grom=\u03BF\n"
|
||||
"grpi=\u03C0\n"
|
||||
"grrh=\u03C1\n"
|
||||
"grsi=\u03C3\n"
|
||||
"grta=\u03C4\n"
|
||||
"grup=\u03C5\n"
|
||||
"grph=\u03C6\n"
|
||||
"grkh=\u03C7\n"
|
||||
"grps=\u03C8\n"
|
||||
"grome=\u03C9\n"
|
||||
|
||||
//forms
|
||||
"grfinal=\u03C2\n"
|
||||
|
||||
"grAcAl=\u0386\n"
|
||||
"grAcEp=\u0388\n"
|
||||
"grAcEt=\u0389\n"
|
||||
"grAcIo=\u038A\n"
|
||||
"grAcOm=\u038C\n"
|
||||
"grAcUp=\u038E\n"
|
||||
"grAcOme=\u038F\n"
|
||||
"grDiIo=\u03AA\n"
|
||||
"grDiUp=\u03AB\n"
|
||||
|
||||
"gracal=\u03AC\n"
|
||||
"gracep=\u03AD\n"
|
||||
"gracet=\u03AE\n"
|
||||
"gracio=\u03AF\n"
|
||||
"gracom=\u03CC\n"
|
||||
"gracup=\u03CD\n"
|
||||
"gracome=\u03CE\n"
|
||||
"grdiio=\u03CA\n"
|
||||
"grdiup=\u03CB\n"
|
||||
|
||||
//gracdiio=\u00FD
|
||||
//gracdiup=\u00FE
|
||||
|
||||
"letter=[[:Lu:][:Ll:]]\n"
|
||||
|
||||
// convert Roman to Native
|
||||
"Greek>\u039c\u0397\u039d\u0399\u039d\u0020\u0391\u0395\u0399\u0394\u0395\u002c\u0020\u0398\u0395\u0391\u002c\u0020--\u0397\u039b\u0397\u0399\u0391\u0394\u0395\u03a9\u0020\u0391\u03a7\u0399\u039b\u0397\u039f\u03a3\n"
|
||||
|
||||
"AV`>{grAl}{grAcUp}\n"
|
||||
"EV`>{grEp}{grAcUp}\n"
|
||||
"AV>{grAl}{grUp}\n"
|
||||
"EV>{grEp}{grUp}\n"
|
||||
"NG>{grGa}{grGa}\n"
|
||||
"NK>{grGa}{grKa}\n"
|
||||
"NX>{grGa}{grKs}\n"
|
||||
"NCH>{grGa}{grKh}\n"
|
||||
|
||||
//+ "final = [ .;]\n" // Syntax error, unused anyway - Liu
|
||||
|
||||
"A`>{grAcAl}\n"
|
||||
"EE`>{grAcEt}\n"
|
||||
"E`>{grAcEp}\n"
|
||||
"I`>{grAcIo}\n"
|
||||
"U`>{grAcUp}\n"
|
||||
"OO`>{grAcOme}\n"
|
||||
"O`>{grAcOm}\n"
|
||||
"''I>{grDiIo}\n"
|
||||
"''U>{grDiUp}\n"
|
||||
"A>{grAl}\n"
|
||||
"B>{grBe}\n"
|
||||
"C[I>{grSi}\n"
|
||||
"C[E>{grSi}\n"
|
||||
"C[Y>{grSi}\n"
|
||||
"CH>{grKh}\n"
|
||||
"C>{grKa}\n"
|
||||
"D>{grDe}\n"
|
||||
"EE>{grEt}\n"
|
||||
"E>{grEp}\n"
|
||||
"F>{grPh}\n"
|
||||
"G>{grGa}\n"
|
||||
"H>{grKh}\n"
|
||||
"I>{grIo}\n"
|
||||
"J>{grIo}\n"
|
||||
"KS>{grKs}\n"
|
||||
"KH>{grKh}\n"
|
||||
"K>{grKa}\n"
|
||||
"L>{grLa}\n"
|
||||
"M>{grMu}\n"
|
||||
"N>{grNu}\n"
|
||||
"OO>{grOme}\n"
|
||||
"O>{grOm}\n"
|
||||
"PS>{grPs}\n"
|
||||
"PH>{grPh}\n"
|
||||
"P>{grPi}\n"
|
||||
"Q>{grKa}\n"
|
||||
"R>{grRh}\n"
|
||||
"S>{grSi}\n"
|
||||
"TH>{grTh}\n"
|
||||
"T>{grTa}\n"
|
||||
"W>{grUp}{grUp}\n"
|
||||
"U>{grUp}\n"
|
||||
"V>{grUp}\n"
|
||||
"X>{grKs}\n"
|
||||
"Y>{grUp}\n"
|
||||
"Z>{grZe}\n"
|
||||
|
||||
//now Native to Roman
|
||||
|
||||
"AV<{grAl}{grUp}\n"
|
||||
"EV<{grEp}{grUp}\n"
|
||||
"AV`<{grAl}{grAcUp}\n"
|
||||
"EV`<{grEp}{grAcUp}\n"
|
||||
"N''<{grNu}[{grGa}\n"
|
||||
"NG<{grGa}{grGa}\n"
|
||||
"N''<{grNu}[{grKa}\n"
|
||||
"NK<{grGa}{grKa}\n"
|
||||
"N''<{grNu}[{grKs}\n"
|
||||
"NX<{grGa}{grKs}\n"
|
||||
"N''<{grNu}[{grKh}\n"
|
||||
"NCH<{grGa}{grKh}\n"
|
||||
|
||||
"A<{grAl}\n"
|
||||
"B<{grBe}\n"
|
||||
"G<{grGa}\n"
|
||||
"D<{grDe}\n"
|
||||
"E''<{grEp}[{grEp}\n"
|
||||
"E''<{grEp}[{grEt}\n"
|
||||
"E''<{grEp}[{grAcEp}\n"
|
||||
"E''<{grEp}[{grAcEt}\n"
|
||||
"E<{grEp}\n"
|
||||
"Z<{grZe}\n"
|
||||
"EE<{grEt}\n"
|
||||
"TH<{grTh}\n"
|
||||
"I<{grIo}\n"
|
||||
"K<{grKa}\n"
|
||||
"L<{grLa}\n"
|
||||
"M<{grMu}\n"
|
||||
"N<{grNu}\n"
|
||||
"X<{grKs}\n"
|
||||
"O''<{grOm}[{grOm}\n"
|
||||
"O''<{grOm}[{grOme}\n"
|
||||
"O''<{grOm}[{grAcOm}\n"
|
||||
"O''<{grOm}[{grAcOme}\n"
|
||||
"O<{grOm}\n"
|
||||
"P''<{grPi}[{grSi}\n"
|
||||
"P''<{grPi}[{grfinal}\n"
|
||||
"P<{grPi}\n"
|
||||
"R<{grRh}\n"
|
||||
"S<{grSi}\n"
|
||||
"T<{grTa}\n"
|
||||
"W<{grUp}{grUp}\n"
|
||||
|
||||
"V<{grUp}[{grAcAl}\n"
|
||||
"V<{grUp}[{grAcEp}\n"
|
||||
"V<{grUp}[{grAcEt}\n"
|
||||
"V<{grUp}[{grAcIo}\n"
|
||||
"V<{grUp}[{grAcOm}\n"
|
||||
"V<{grUp}[{grAcUp}\n"
|
||||
"V<{grUp}[{grAcOme}\n"
|
||||
|
||||
"V<{grUp}[{grAl}\n"
|
||||
"V<{grUp}[{grEp}\n"
|
||||
"V<{grUp}[{grEt}\n"
|
||||
"V<{grUp}[{grIo}\n"
|
||||
"V<{grUp}[{grOm}\n"
|
||||
//{grUp}[{grUp}<V
|
||||
"V<{grUp}[{grOme}\n"
|
||||
|
||||
"U<{grUp}\n"
|
||||
"PH<{grPh}\n"
|
||||
"CH<{grKh}\n"
|
||||
"PS<{grPs}\n"
|
||||
"OO<{grOme}\n"
|
||||
//forms
|
||||
"A`<{grAcAl}\n"
|
||||
"E`<{grAcEp}\n"
|
||||
"EE`<{grAcEt}\n"
|
||||
"I`<{grAcIo}\n"
|
||||
"O`<{grAcOm}\n"
|
||||
"U`<{grAcUp}\n"
|
||||
"OO`<{grAcOme}\n"
|
||||
"''I<{grDiIo}\n"
|
||||
"''U<{grDiUp}\n"
|
||||
|
||||
//{gracdiio}<XX
|
||||
//{gracdiup}<XX
|
||||
//{grfinal}<XX
|
||||
|
||||
"av`>{gral}{gracup}\n"
|
||||
"ev`>{grep}{gracup}\n"
|
||||
"av>{gral}{grup}\n"
|
||||
"ev>{grep}{grup}\n"
|
||||
"ng>{grga}{grga}\n"
|
||||
"nk>{grga}{grka}\n"
|
||||
"nx>{grga}{grks}\n"
|
||||
"nch>{grga}{grkh}\n"
|
||||
|
||||
"a`>{gracal}\n"
|
||||
"ee`>{gracet}\n"
|
||||
"e`>{gracep}\n"
|
||||
"i`>{gracio}\n"
|
||||
"u`>{gracup}\n"
|
||||
"oo`>{gracome}\n"
|
||||
"o`>{gracom}\n"
|
||||
"''i>{grdiio}\n"
|
||||
"''u>{grdiup}\n"
|
||||
"a>{gral}\n"
|
||||
"b>{grbe}\n"
|
||||
"c[i>{grsi}\n"
|
||||
"c[e>{grsi}\n"
|
||||
"c[y>{grsi}\n"
|
||||
"ch>{grkh}\n"
|
||||
"c>{grka}\n"
|
||||
"d>{grde}\n"
|
||||
"ee>{gret}\n"
|
||||
"e>{grep}\n"
|
||||
"f>{grph}\n"
|
||||
"g>{grga}\n"
|
||||
"h>{grkh}\n"
|
||||
"i>{grio}\n"
|
||||
"j>{grio}\n"
|
||||
"ks>{grks}\n"
|
||||
"kh>{grkh}\n"
|
||||
"k>{grka}\n"
|
||||
"l>{grla}\n"
|
||||
"m>{grmu}\n"
|
||||
"n>{grnu}\n"
|
||||
"oo>{grome}\n"
|
||||
"o>{grom}\n"
|
||||
"ps>{grps}\n"
|
||||
"ph>{grph}\n"
|
||||
"p>{grpi}\n"
|
||||
"q>{grka}\n"
|
||||
"r>{grrh}\n"
|
||||
"s>|{grfinal}\n"
|
||||
"{grfinal}[{letter}>{grsi}\n"
|
||||
"th>{grth}\n"
|
||||
"t>{grta}\n"
|
||||
"w>{grup}{grup}\n"
|
||||
"u>{grup}\n"
|
||||
"v>{grup}\n"
|
||||
"x>{grks}\n"
|
||||
"y>{grup}\n"
|
||||
"z>{grze}\n"
|
||||
|
||||
|
||||
//forms
|
||||
"''>\n"
|
||||
//now native to roman
|
||||
|
||||
"av<{gral}{grup}\n"
|
||||
"ev<{grep}{grup}\n"
|
||||
"av`<{gral}{gracup}\n"
|
||||
"ev`<{grep}{gracup}\n"
|
||||
"n''<{grnu}[{grga}\n"
|
||||
"ng<{grga}{grga}\n"
|
||||
"n''<{grnu}[{grka}\n"
|
||||
"nk<{grga}{grka}\n"
|
||||
"n''<{grnu}[{grks}\n"
|
||||
"nx<{grga}{grks}\n"
|
||||
"n''<{grnu}[{grkh}\n"
|
||||
"nch<{grga}{grkh}\n"
|
||||
|
||||
"a<{gral}\n"
|
||||
"b<{grbe}\n"
|
||||
"g<{grga}\n"
|
||||
"d<{grde}\n"
|
||||
"e''<{grep}[{grep}\n"
|
||||
"e''<{grep}[{gret}\n"
|
||||
"e''<{grep}[{gracep}\n"
|
||||
"e''<{grep}[{gracet}\n"
|
||||
"e<{grep}\n"
|
||||
"z<{grze}\n"
|
||||
"ee<{gret}\n"
|
||||
"th<{grth}\n"
|
||||
"i<{grio}\n"
|
||||
"k<{grka}\n"
|
||||
"l<{grla}\n"
|
||||
"m<{grmu}\n"
|
||||
"n<{grnu}\n"
|
||||
"x<{grks}\n"
|
||||
"o''<{grom}[{grom}\n"
|
||||
"o''<{grom}[{grome}\n"
|
||||
"o''<{grom}[{gracom}\n"
|
||||
"o''<{grom}[{gracome}\n"
|
||||
"o<{grom}\n"
|
||||
"p''<{grpi}[{grsi}\n"
|
||||
"p''<{grpi}[{grfinal}\n"
|
||||
"p<{grpi}\n"
|
||||
"r<{grrh}\n"
|
||||
"s<{grsi}\n"
|
||||
"s<{grfinal}\n"
|
||||
"t<{grta}\n"
|
||||
"w<{grup}{grup}\n"
|
||||
|
||||
"v<{grup}[{gracal}\n"
|
||||
"v<{grup}[{gracep}\n"
|
||||
"v<{grup}[{gracet}\n"
|
||||
"v<{grup}[{gracio}\n"
|
||||
"v<{grup}[{gracom}\n"
|
||||
"v<{grup}[{gracup}\n"
|
||||
"v<{grup}[{gracome}\n"
|
||||
|
||||
"v<{grup}[{gral}\n"
|
||||
"v<{grup}[{grep}\n"
|
||||
"v<{grup}[{gret}\n"
|
||||
"v<{grup}[{grio}\n"
|
||||
"v<{grup}[{grom}\n"
|
||||
//{grup}[{grup}<v
|
||||
"v<{grup}[{grome}\n"
|
||||
|
||||
"u<{grup}\n"
|
||||
"ph<{grph}\n"
|
||||
"ch<{grkh}\n"
|
||||
"ps<{grps}\n"
|
||||
"oo<{grome}\n"
|
||||
//forms
|
||||
"a`<{gracal}\n"
|
||||
"e`<{gracep}\n"
|
||||
"ee`<{gracet}\n"
|
||||
"i`<{gracio}\n"
|
||||
"o`<{gracom}\n"
|
||||
"u`<{gracup}\n"
|
||||
"oo`<{gracome}\n"
|
||||
"''i<{grdiio}\n"
|
||||
"''u<{grdiup}\n"
|
||||
"<''\n"
|
||||
|
||||
//{gracdiio}<xx
|
||||
//{gracdiup}<xx
|
||||
//{grfinal}<xx
|
||||
}
|
||||
}
|
2267
icu4c/data/translit/lhalfwid.txt
Normal file
2267
icu4c/data/translit/lhalfwid.txt
Normal file
File diff suppressed because it is too large
Load diff
279
icu4c/data/translit/lhebrew.txt
Normal file
279
icu4c/data/translit/lhebrew.txt
Normal file
|
@ -0,0 +1,279 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Hebrew
|
||||
|
||||
lhebrew {
|
||||
Rule {
|
||||
//variable names, derived from the Unicode names.
|
||||
|
||||
"POINT_SHEVA=\u05B0\n"
|
||||
"POINT_HATAF_SEGOL=\u05B1\n"
|
||||
"POINT_HATAF_PATAH=\u05B2\n"
|
||||
"POINT_HATAF_QAMATS=\u05B3\n"
|
||||
"POINT_HIRIQ=\u05B4\n"
|
||||
"POINT_TSERE=\u05B5\n"
|
||||
"POINT_SEGOL=\u05B6\n"
|
||||
"POINT_PATAH=\u05B7\n"
|
||||
"POINT_QAMATS=\u05B8\n"
|
||||
"POINT_HOLAM=\u05B9\n"
|
||||
"POINT_QUBUTS=\u05BB\n"
|
||||
"POINT_DAGESH_OR_MAPIQ=\u05BC\n"
|
||||
"POINT_METEG=\u05BD\n"
|
||||
"PUNCTUATION_MAQAF=\u05BE\n"
|
||||
"POINT_RAFE=\u05BF\n"
|
||||
"PUNCTUATION_PASEQ=\u05C0\n"
|
||||
"POINT_SHIN_DOT=\u05C1\n"
|
||||
"POINT_SIN_DOT=\u05C2\n"
|
||||
"PUNCTUATION_SOF_PASUQ=\u05C3\n"
|
||||
"ALEF=\u05D0\n"
|
||||
"BET=\u05D1\n"
|
||||
"GIMEL=\u05D2\n"
|
||||
"DALET=\u05D3\n"
|
||||
"HE=\u05D4\n"
|
||||
"VAV=\u05D5\n"
|
||||
"ZAYIN=\u05D6\n"
|
||||
"HET=\u05D7\n"
|
||||
"TET=\u05D8\n"
|
||||
"YOD=\u05D9\n"
|
||||
"FINAL_KAF=\u05DA\n"
|
||||
"KAF=\u05DB\n"
|
||||
"LAMED=\u05DC\n"
|
||||
"FINAL_MEM=\u05DD\n"
|
||||
"MEM=\u05DE\n"
|
||||
"FINAL_NUN=\u05DF\n"
|
||||
"NUN=\u05E0\n"
|
||||
"SAMEKH=\u05E1\n"
|
||||
"AYIN=\u05E2\n"
|
||||
"FINAL_PE=\u05E3\n"
|
||||
"PE=\u05E4\n"
|
||||
"FINAL_TSADI=\u05E5\n"
|
||||
"TSADI=\u05E6\n"
|
||||
"QOF=\u05E7\n"
|
||||
"RESH=\u05E8\n"
|
||||
"SHIN=\u05E9\n"
|
||||
"TAV=\u05EA\n"
|
||||
"YIDDISH_DOUBLE_VAV=\u05F0\n"
|
||||
"YIDDISH_VAV_YOD=\u05F1\n"
|
||||
"YIDDISH_DOUBLE_YOD=\u05F2\n"
|
||||
"PUNCTUATION_GERESH=\u05F3\n"
|
||||
"PUNCTUATION_GERSHAYIM=\u05F4\n"
|
||||
|
||||
//wildcards
|
||||
//The values can be anything we don't use in this file: start at E000.
|
||||
|
||||
"letter=[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]\n"
|
||||
|
||||
"softvowel=[eiyEIY]\n"
|
||||
|
||||
"vowellike=[{ALEF}{AYIN}{YOD}{VAV}]\n"
|
||||
|
||||
//?>{POINT_SHEVA}
|
||||
//?>{POINT_HATAF_SEGOL}
|
||||
//?>{POINT_HATAF_PATAH}
|
||||
//?>{POINT_HATAF_QAMATS}
|
||||
//?>{POINT_HIRIQ}
|
||||
//?>{POINT_TSERE}
|
||||
//?>{POINT_SEGOL}
|
||||
//?>{POINT_PATAH}
|
||||
//?>{POINT_QAMATS}
|
||||
//?>{POINT_HOLAM}
|
||||
//?>{POINT_QUBUTS}
|
||||
//?>{POINT_DAGESH_OR_MAPIQ}
|
||||
//?>{POINT_METEG}
|
||||
//?>{PUNCTUATION_MAQAF}
|
||||
//?>{POINT_RAFE}
|
||||
//?>{PUNCTUATION_PASEQ}
|
||||
//?>{POINT_SHIN_DOT}
|
||||
//?>{POINT_SIN_DOT}
|
||||
//?>{PUNCTUATION_SOF_PASUQ}
|
||||
|
||||
"a>{ALEF}\n"
|
||||
"A>{ALEF}\n"
|
||||
|
||||
"b>{BET}\n"
|
||||
"B>{BET}\n"
|
||||
|
||||
"c[{softvowel}>{SAMEKH}\n"
|
||||
"C[{softvowel}>{SAMEKH}\n"
|
||||
"c[{letter}>{KAF}\n"
|
||||
"C[{letter}>{KAF}\n"
|
||||
"c>{FINAL_KAF}\n"
|
||||
"C>{FINAL_KAF}\n"
|
||||
|
||||
"d>{DALET}\n"
|
||||
"D>{DALET}\n"
|
||||
|
||||
"e>{AYIN}\n"
|
||||
"E>{AYIN}\n"
|
||||
|
||||
"f[{letter}>{PE}\n"
|
||||
"f>{FINAL_PE}\n"
|
||||
"F[{letter}>{PE}\n"
|
||||
"F>{FINAL_PE}\n"
|
||||
|
||||
"g>{GIMEL}\n"
|
||||
"G>{GIMEL}\n"
|
||||
|
||||
"h>{HE}\n"
|
||||
"H>{HE}\n"
|
||||
|
||||
"i>{YOD}\n"
|
||||
"I>{YOD}\n"
|
||||
|
||||
"j>{DALET}{SHIN}\n"
|
||||
"J>{DALET}{SHIN}\n"
|
||||
|
||||
"kH>{HET}\n"
|
||||
"kh>{HET}\n"
|
||||
"KH>{HET}\n"
|
||||
"Kh>{HET}\n"
|
||||
"k[{letter}>{KAF}\n"
|
||||
"K[{letter}>{KAF}\n"
|
||||
"k>{FINAL_KAF}\n"
|
||||
"K>{FINAL_KAF}\n"
|
||||
|
||||
"l>{LAMED}\n"
|
||||
"L>{LAMED}\n"
|
||||
|
||||
"m[{letter}>{MEM}\n"
|
||||
"m>{FINAL_MEM}\n"
|
||||
"M[{letter}>{MEM}\n"
|
||||
"M>{FINAL_MEM}\n"
|
||||
|
||||
"n[{letter}>{NUN}\n"
|
||||
"n>{FINAL_NUN}\n"
|
||||
"N[{letter}>{NUN}\n"
|
||||
"N>{FINAL_NUN}\n"
|
||||
|
||||
"o>{VAV}\n"
|
||||
"O>{VAV}\n"
|
||||
|
||||
"p[{letter}>{PE}\n"
|
||||
"p>{FINAL_PE}\n"
|
||||
"P[{letter}>{PE}\n"
|
||||
"P>{FINAL_PE}\n"
|
||||
|
||||
"q>{QOF}\n"
|
||||
"Q>{QOF}\n"
|
||||
|
||||
"r>{RESH}\n"
|
||||
"R>{RESH}\n"
|
||||
|
||||
"sH>{SHIN}\n"
|
||||
"sh>{SHIN}\n"
|
||||
"SH>{SHIN}\n"
|
||||
"Sh>{SHIN}\n"
|
||||
"s>{SAMEKH}\n"
|
||||
"S>{SAMEKH}\n"
|
||||
|
||||
"th>{TAV}\n"
|
||||
"tH>{TAV}\n"
|
||||
"TH>{TAV}\n"
|
||||
"Th>{TAV}\n"
|
||||
"tS[{letter}>{TSADI}\n"
|
||||
"ts[{letter}>{TSADI}\n"
|
||||
"Ts[{letter}>{TSADI}\n"
|
||||
"TS[{letter}>{TSADI}\n"
|
||||
"tS>{FINAL_TSADI}\n"
|
||||
"ts>{FINAL_TSADI}\n"
|
||||
"Ts>{FINAL_TSADI}\n"
|
||||
"TS>{FINAL_TSADI}\n"
|
||||
"t>{TET}\n"
|
||||
"T>{TET}\n"
|
||||
|
||||
"u>{VAV}\n"
|
||||
"U>{VAV}\n"
|
||||
|
||||
"v>{VAV}\n"
|
||||
"V>{VAV}\n"
|
||||
|
||||
"w>{VAV}\n"
|
||||
"W>{VAV}\n"
|
||||
|
||||
"x>{KAF}{SAMEKH}\n"
|
||||
"X>{KAF}{SAMEKH}\n"
|
||||
|
||||
"y>{YOD}\n"
|
||||
"Y>{YOD}\n"
|
||||
|
||||
"z>{ZAYIN}\n"
|
||||
"Z>{ZAYIN}\n"
|
||||
|
||||
//#?>{YIDDISH_DOUBLE_VAV}
|
||||
//?>{YIDDISH_VAV_YOD}
|
||||
//?>{YIDDISH_DOUBLE_YOD}
|
||||
//?>{PUNCTUATION_GERESH}
|
||||
//?>{PUNCTUATION_GERSHAYIM}
|
||||
|
||||
"''>\n"
|
||||
|
||||
//{POINT_SHEVA}>@
|
||||
//{POINT_HATAF_SEGOL}>@
|
||||
//{POINT_HATAF_PATAH}>@
|
||||
//{POINT_HATAF_QAMATS}>@
|
||||
//{POINT_HIRIQ}>@
|
||||
//{POINT_TSERE}>@
|
||||
//{POINT_SEGOL}>@
|
||||
//{POINT_PATAH}>@
|
||||
//{POINT_QAMATS}>@
|
||||
//{POINT_HOLAM}>@
|
||||
//{POINT_QUBUTS}>@
|
||||
//{POINT_DAGESH_OR_MAPIQ}>@
|
||||
//{POINT_METEG}>@
|
||||
//{PUNCTUATION_MAQAF}>@
|
||||
//{POINT_RAFE}>@
|
||||
//{PUNCTUATION_PASEQ}>@
|
||||
//{POINT_SHIN_DOT}>@
|
||||
//{POINT_SIN_DOT}>@
|
||||
//{PUNCTUATION_SOF_PASUQ}>@
|
||||
|
||||
"a<{ALEF}\n"
|
||||
"e<{AYIN}\n"
|
||||
"b<{BET}\n"
|
||||
"d<{DALET}\n"
|
||||
"k<{FINAL_KAF}\n"
|
||||
"m<{FINAL_MEM}\n"
|
||||
"n<{FINAL_NUN}\n"
|
||||
"p<{FINAL_PE}\n"
|
||||
"ts<{FINAL_TSADI}\n"
|
||||
"g<{GIMEL}\n"
|
||||
"kh<{HET}\n"
|
||||
"h<{HE}\n"
|
||||
"k''<{KAF}[{HE}\n"
|
||||
"k<{KAF}\n"
|
||||
"l<{LAMED}\n"
|
||||
"m<{MEM}\n"
|
||||
"n<{NUN}\n"
|
||||
"p<{PE}\n"
|
||||
"q<{QOF}\n"
|
||||
"r<{RESH}\n"
|
||||
"s''<{SAMEKH}[{HE}\n"
|
||||
"s<{SAMEKH}\n"
|
||||
"sh<{SHIN}\n"
|
||||
"th<{TAV}\n"
|
||||
"t''<{TET}[{HE}\n"
|
||||
"t''<{TET}[{HE}\n"
|
||||
"t''<{TET}[{SAMEKH}\n"
|
||||
"t''<{TET}[{SHIN}\n"
|
||||
"t<{TET}\n"
|
||||
"ts<{TSADI}\n"
|
||||
"v<{VAV}[{vowellike}\n"
|
||||
"u<{VAV}\n"
|
||||
"y<{YOD}\n"
|
||||
"z<{ZAYIN}\n"
|
||||
|
||||
//{YIDDISH_DOUBLE_VAV}>@
|
||||
//{YIDDISH_VAV_YOD}>@
|
||||
//{YIDDISH_DOUBLE_YOD}>@
|
||||
//{PUNCTUATION_GERESH}>@
|
||||
//{PUNCTUATION_GERSHAYIM}>@
|
||||
|
||||
"<''\n"
|
||||
}
|
||||
}
|
877
icu4c/data/translit/lkana.txt
Normal file
877
icu4c/data/translit/lkana.txt
Normal file
|
@ -0,0 +1,877 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Kana
|
||||
|
||||
// Rewritten April 1999 to implement Hepburn (Hebon-shiki)
|
||||
// transliteration. Reference: CJKV Information Processing, Lunde,
|
||||
// 1999, pp. 30-35.
|
||||
// @author Alan Liu
|
||||
|
||||
lkana {
|
||||
Rule {
|
||||
//------------------------------------------------------------
|
||||
// Variables
|
||||
//------------------------------------------------------------
|
||||
|
||||
// Hiragana. These are named according to the
|
||||
// regularized Nippon romanization (the naming system
|
||||
// used by Unicode). Thus \u3062 is called "di", not
|
||||
// "ji". "x_" is the small form of "_", e.g. "xa" is
|
||||
// small "a".
|
||||
|
||||
"xa=\u3041\n"
|
||||
"a=\u3042\n"
|
||||
"xi=\u3043\n"
|
||||
"i=\u3044\n"
|
||||
"xu=\u3045\n"
|
||||
"u=\u3046\n"
|
||||
"xe=\u3047\n"
|
||||
"e=\u3048\n"
|
||||
"xo=\u3049\n"
|
||||
"o=\u304A\n"
|
||||
|
||||
"ka=\u304B\n"
|
||||
"ga=\u304C\n"
|
||||
"ki=\u304D\n"
|
||||
"gi=\u304E\n"
|
||||
"ku=\u304F\n"
|
||||
"gu=\u3050\n"
|
||||
"ke=\u3051\n"
|
||||
"ge=\u3052\n"
|
||||
"ko=\u3053\n"
|
||||
"go=\u3054\n"
|
||||
|
||||
"sa=\u3055\n"
|
||||
"za=\u3056\n"
|
||||
"si=\u3057\n"
|
||||
"zi=\u3058\n"
|
||||
"su=\u3059\n"
|
||||
"zu=\u305A\n"
|
||||
"se=\u305B\n"
|
||||
"ze=\u305C\n"
|
||||
"so=\u305D\n"
|
||||
"zo=\u305E\n"
|
||||
|
||||
"ta=\u305F\n"
|
||||
"da=\u3060\n"
|
||||
"ti=\u3061\n"
|
||||
"di=\u3062\n"
|
||||
"xtu=\u3063\n"
|
||||
"tu=\u3064\n"
|
||||
"du=\u3065\n"
|
||||
"te=\u3066\n"
|
||||
"de=\u3067\n"
|
||||
"to=\u3068\n"
|
||||
"do=\u3069\n"
|
||||
|
||||
"na=\u306A\n"
|
||||
"ni=\u306B\n"
|
||||
"nu=\u306C\n"
|
||||
"ne=\u306D\n"
|
||||
"no=\u306E\n"
|
||||
|
||||
"ha=\u306F\n"
|
||||
"ba=\u3070\n"
|
||||
"pa=\u3071\n"
|
||||
"hi=\u3072\n"
|
||||
"bi=\u3073\n"
|
||||
"pi=\u3074\n"
|
||||
"hu=\u3075\n"
|
||||
"bu=\u3076\n"
|
||||
"pu=\u3077\n"
|
||||
"he=\u3078\n"
|
||||
"be=\u3079\n"
|
||||
"pe=\u307A\n"
|
||||
"ho=\u307B\n"
|
||||
"bo=\u307C\n"
|
||||
"po=\u307D\n"
|
||||
|
||||
"ma=\u307E\n"
|
||||
"mi=\u307F\n"
|
||||
"mu=\u3080\n"
|
||||
"me=\u3081\n"
|
||||
"mo=\u3082\n"
|
||||
|
||||
"xya=\u3083\n"
|
||||
"ya=\u3084\n"
|
||||
"xyu=\u3085\n"
|
||||
"yu=\u3086\n"
|
||||
"xyo=\u3087\n"
|
||||
"yo=\u3088\n"
|
||||
|
||||
"ra=\u3089\n"
|
||||
"ri=\u308A\n"
|
||||
"ru=\u308B\n"
|
||||
"re=\u308C\n"
|
||||
"ro=\u308D\n"
|
||||
|
||||
"xwa=\u308E\n"
|
||||
"wa=\u308F\n"
|
||||
"wi=\u3090\n"
|
||||
"we=\u3091\n"
|
||||
"wo=\u3092\n"
|
||||
|
||||
"n=\u3093\n"
|
||||
"vu=\u3094\n"
|
||||
|
||||
// Katakana. "X_" is the small form of "_", e.g. "XA"
|
||||
// is small "A".
|
||||
|
||||
"XA=\u30A1\n"
|
||||
"A=\u30A2\n"
|
||||
"XI=\u30A3\n"
|
||||
"I=\u30A4\n"
|
||||
"XU=\u30A5\n"
|
||||
"U=\u30A6\n"
|
||||
"XE=\u30A7\n"
|
||||
"E=\u30A8\n"
|
||||
"XO=\u30A9\n"
|
||||
"O=\u30AA\n"
|
||||
|
||||
"KA=\u30AB\n"
|
||||
"GA=\u30AC\n"
|
||||
"KI=\u30AD\n"
|
||||
"GI=\u30AE\n"
|
||||
"KU=\u30AF\n"
|
||||
"GU=\u30B0\n"
|
||||
"KE=\u30B1\n"
|
||||
"GE=\u30B2\n"
|
||||
"KO=\u30B3\n"
|
||||
"GO=\u30B4\n"
|
||||
|
||||
"SA=\u30B5\n"
|
||||
"ZA=\u30B6\n"
|
||||
"SI=\u30B7\n"
|
||||
"ZI=\u30B8\n"
|
||||
"SU=\u30B9\n"
|
||||
"ZU=\u30BA\n"
|
||||
"SE=\u30BB\n"
|
||||
"ZE=\u30BC\n"
|
||||
"SO=\u30BD\n"
|
||||
"ZO=\u30BE\n"
|
||||
|
||||
"TA=\u30BF\n"
|
||||
"DA=\u30C0\n"
|
||||
"TI=\u30C1\n"
|
||||
"DI=\u30C2\n"
|
||||
"XTU=\u30C3\n"
|
||||
"TU=\u30C4\n"
|
||||
"DU=\u30C5\n"
|
||||
"TE=\u30C6\n"
|
||||
"DE=\u30C7\n"
|
||||
"TO=\u30C8\n"
|
||||
"DO=\u30C9\n"
|
||||
|
||||
"NA=\u30CA\n"
|
||||
"NI=\u30CB\n"
|
||||
"NU=\u30CC\n"
|
||||
"NE=\u30CD\n"
|
||||
"NO=\u30CE\n"
|
||||
|
||||
"HA=\u30CF\n"
|
||||
"BA=\u30D0\n"
|
||||
"PA=\u30D1\n"
|
||||
"HI=\u30D2\n"
|
||||
"BI=\u30D3\n"
|
||||
"PI=\u30D4\n"
|
||||
"HU=\u30D5\n"
|
||||
"BU=\u30D6\n"
|
||||
"PU=\u30D7\n"
|
||||
"HE=\u30D8\n"
|
||||
"BE=\u30D9\n"
|
||||
"PE=\u30DA\n"
|
||||
"HO=\u30DB\n"
|
||||
"BO=\u30DC\n"
|
||||
"PO=\u30DD\n"
|
||||
|
||||
"MA=\u30DE\n"
|
||||
"MI=\u30DF\n"
|
||||
"MU=\u30E0\n"
|
||||
"ME=\u30E1\n"
|
||||
"MO=\u30E2\n"
|
||||
|
||||
"XYA=\u30E3\n"
|
||||
"YA=\u30E4\n"
|
||||
"XYU=\u30E5\n"
|
||||
"YU=\u30E6\n"
|
||||
"XYO=\u30E7\n"
|
||||
"YO=\u30E8\n"
|
||||
|
||||
"RA=\u30E9\n"
|
||||
"RI=\u30EA\n"
|
||||
"RU=\u30EB\n"
|
||||
"RE=\u30EC\n"
|
||||
"RO=\u30ED\n"
|
||||
|
||||
"XWA=\u30EE\n"
|
||||
"WA=\u30EF\n"
|
||||
"WI=\u30F0\n"
|
||||
"WE=\u30F1\n"
|
||||
"WO=\u30F2\n"
|
||||
|
||||
"N=\u30F3\n"
|
||||
"VU=\u30F4\n"
|
||||
|
||||
"XKA=\u30F5\n"
|
||||
"XKE=\u30F6\n"
|
||||
|
||||
"VA=\u30F7\n"
|
||||
"VI=\u30F8\n"
|
||||
"VE=\u30F9\n"
|
||||
"VO=\u30FA\n"
|
||||
|
||||
"DOT=\u30FB\n" // Middle dot
|
||||
"LONG=\u30FC\n" // Prolonged sound mark
|
||||
|
||||
// Categories and programmatic variables
|
||||
|
||||
"vowel=[aiueo]\n"
|
||||
"small=\uE000\n"
|
||||
"hvr=\uE001\n"
|
||||
"hv=[{xya}{xi}{xyu}{xe}{xyo}]\n"
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Rules
|
||||
//------------------------------------------------------------
|
||||
/*
|
||||
// Hepburn equivalents
|
||||
|
||||
shi>|si
|
||||
ji>|zi
|
||||
chi>|ti
|
||||
// ji>|di // By default we use the ji-zi mapping
|
||||
tsu>|tu
|
||||
fu>|hu
|
||||
|
||||
sh[{vowel}>|sy
|
||||
ja>|zya
|
||||
// ji = zi
|
||||
ju>|zyu
|
||||
je>|zye
|
||||
jo>|zyo
|
||||
cha>|tya
|
||||
// chi = ti
|
||||
chu>|tyu
|
||||
che>|tye
|
||||
cho>|tyo
|
||||
// j[{vowel} = dy{vowel}, but we use zy{vowel} by default
|
||||
|
||||
// Historically, m preceded b, p, or m; now n is used
|
||||
// in all cases
|
||||
m[b>n
|
||||
m[p>n
|
||||
m[m>n
|
||||
|
||||
// Compatibility
|
||||
|
||||
// 'f' group
|
||||
fa>{fu}{xa}
|
||||
fi>{fu}{xi}
|
||||
// fu = hu
|
||||
fe>{fu}{xe}
|
||||
fo>{fu}{xo}
|
||||
|
||||
// 'jy' group; these will not round-trip, except for "jyi"
|
||||
// See also the 'j' group.
|
||||
jya>|zya
|
||||
jyi>{zi}{xyi}
|
||||
jyu>|zyu
|
||||
jye>|zye
|
||||
jyo>|zyo
|
||||
|
||||
// Nippon romanized forms
|
||||
|
||||
a>{a}
|
||||
i>{i}
|
||||
u>{u}
|
||||
e>{e}
|
||||
o>{o}
|
||||
ka>{ka}
|
||||
ki>{ki}
|
||||
ku>{ku}
|
||||
ke>{ke}
|
||||
ko>{ko}
|
||||
ga>{ga}
|
||||
gi>{gi}
|
||||
gu>{gu}
|
||||
ge>{ge}
|
||||
go>{go}
|
||||
sa>{sa}
|
||||
si>{si}
|
||||
su>{su}
|
||||
se>{se}
|
||||
so>{so}
|
||||
za>{za}
|
||||
zi>{zi}
|
||||
zu>{zu}
|
||||
ze>{ze}
|
||||
zo>{zo}
|
||||
ta>{ta}
|
||||
ti>{ti}
|
||||
tu>{tu}
|
||||
te>{te}
|
||||
to>{to}
|
||||
da>{da}
|
||||
di>{di}
|
||||
du>{du}
|
||||
de>{de}
|
||||
do>{do}
|
||||
na>{na}
|
||||
ni>{ni}
|
||||
nu>{nu}
|
||||
ne>{ne}
|
||||
no>{no}
|
||||
ha>{ha}
|
||||
hi>{hi}
|
||||
hu>{hu}
|
||||
he>{he}
|
||||
ho>{ho}
|
||||
ba>{ba}
|
||||
bi>{bi}
|
||||
bu>{bu}
|
||||
be>{be}
|
||||
bo>{bo}
|
||||
pa>{pa}
|
||||
pi>{pi}
|
||||
pu>{pu}
|
||||
pe>{pe}
|
||||
po>{po}
|
||||
ma>{ma}
|
||||
mi>{mi}
|
||||
mu>{mu}
|
||||
me>{me}
|
||||
mo>{mo}
|
||||
ya>{ya}
|
||||
yu>{yu}
|
||||
yo>{yo}
|
||||
ra>{ra}
|
||||
ri>{ri}
|
||||
ru>{ru}
|
||||
re>{re}
|
||||
ro>{ro}
|
||||
wa>{wa}
|
||||
wi>{wi}
|
||||
// No "wu"
|
||||
we>{we}
|
||||
wo>{wo} // Reverse {wo} to "o", not "wo"
|
||||
n''>{n}
|
||||
n>{n}
|
||||
|
||||
// Palatalized Nippon romanized syllables
|
||||
|
||||
ky[{vowel}>{ki}|{small}
|
||||
gy[{vowel}>{gi}|{small}
|
||||
sy[{vowel}>{si}|{small}
|
||||
zy[{vowel}>{zi}|{small}
|
||||
ty[{vowel}>{ti}|{small}
|
||||
dy[{vowel}>{di}|{small}
|
||||
ny[{vowel}>{ni}|{small}
|
||||
my[{vowel}>{mi}|{small}
|
||||
hy[{vowel}>{hi}|{small}
|
||||
by[{vowel}>{bi}|{small}
|
||||
py[{vowel}>{pi}|{small}
|
||||
ry[{vowel}>{ri}|{small}
|
||||
|
||||
// Doubled consonants
|
||||
|
||||
c[c>{xtu}
|
||||
k[k>{xtu}
|
||||
g[g>{xtu}
|
||||
s[s>{xtu}
|
||||
z[z>{xtu}
|
||||
j[j>{xtu}
|
||||
t[t>{xtu}
|
||||
d[d>{xtu}
|
||||
h[h>{xtu}
|
||||
f[f>{xtu}
|
||||
p[p>{xtu}
|
||||
b[b>{xtu}
|
||||
m[m>{xtu}
|
||||
y[y>{xtu}
|
||||
r[r>{xtu}
|
||||
w[w>{xtu}
|
||||
*/
|
||||
|
||||
"a>{a}\n"
|
||||
|
||||
"ba>{ba}\n"
|
||||
"bi>{bi}\n"
|
||||
"bu>{bu}\n"
|
||||
"be>{be}\n"
|
||||
"bo>{bo}\n"
|
||||
"by[{vowel}>{bi}|{small}\n"
|
||||
"b[b>{xtu}\n"
|
||||
|
||||
"da>{da}\n"
|
||||
"di>{di}\n"
|
||||
"du>{du}\n"
|
||||
"de>{de}\n"
|
||||
"do>{do}\n"
|
||||
"dy[{vowel}>{di}|{small}\n"
|
||||
"dh[{vowel}>{de}|{small}\n"
|
||||
"d[d>{xtu}\n"
|
||||
|
||||
"e>{e}\n"
|
||||
|
||||
"fa>{hu}{xa}\n"
|
||||
"fi>{hu}{xi}\n"
|
||||
"fe>{hu}{xe}\n"
|
||||
"fo>{hu}{xo}\n"
|
||||
"fya>{hu}{xya}\n"
|
||||
"fyu>{hu}{xyu}\n"
|
||||
"fyo>{hu}{xyo}\n"
|
||||
"f[f>{xtu}\n"
|
||||
|
||||
"ga>{ga}\n"
|
||||
"gi>{gi}\n"
|
||||
"gu>{gu}\n"
|
||||
"ge>{ge}\n"
|
||||
"go>{go}\n"
|
||||
"gy[{vowel}>{gi}|{small}\n"
|
||||
"gwa>{gu}{xwa}\n"
|
||||
"gwi>{gu}{xi}\n"
|
||||
"gwu>{gu}{xu}\n"
|
||||
"gwe>{gu}{xe}\n"
|
||||
"gwo>{gu}{xo}\n"
|
||||
"g[g>{xtu}\n"
|
||||
|
||||
"ha>{ha}\n"
|
||||
"hi>{hi}\n"
|
||||
"hu>{hu}\n"
|
||||
"he>{he}\n"
|
||||
"ho>{ho}\n"
|
||||
"hy[{vowel}>{hi}|{small}\n"
|
||||
"h[h>{xtu}\n"
|
||||
|
||||
"i>{i}\n"
|
||||
|
||||
"ka>{ka}\n"
|
||||
"ki>{ki}\n"
|
||||
"ku>{ku}\n"
|
||||
"ke>{ke}\n"
|
||||
"ko>{ko}\n"
|
||||
"kwa>{ku}{xwa}\n"
|
||||
"kwi>{ku}{xi}\n"
|
||||
"kwu>{ku}{xu}\n"
|
||||
"kwe>{ku}{xe}\n"
|
||||
"kwo>{ku}{xo}\n"
|
||||
"ky[{vowel}>{ki}|{small}\n"
|
||||
"k[k>{xtu}\n"
|
||||
|
||||
"ma>{ma}\n"
|
||||
"mi>{mi}\n"
|
||||
"mu>{mu}\n"
|
||||
"me>{me}\n"
|
||||
"mo>{mo}\n"
|
||||
"my[{vowel}>{mi}|{small}\n"
|
||||
"m[b>{n}\n"
|
||||
"m[f>{n}\n"
|
||||
"m[m>{n}\n"
|
||||
"m[p>{n}\n"
|
||||
"m[v>{n}\n"
|
||||
"m''>{n}\n"
|
||||
|
||||
"na>{na}\n"
|
||||
"ni>{ni}\n"
|
||||
"nu>{nu}\n"
|
||||
"ne>{ne}\n"
|
||||
"no>{no}\n"
|
||||
"ny[{vowel}>{ni}|{small}\n"
|
||||
"nn>{n}\n"
|
||||
"n''>{n}\n"
|
||||
"n>{n}\n"
|
||||
|
||||
"o>{o}\n"
|
||||
|
||||
"pa>{pa}\n"
|
||||
"pi>{pi}\n"
|
||||
"pu>{pu}\n"
|
||||
"pe>{pe}\n"
|
||||
"po>{po}\n"
|
||||
"py[{vowel}>{pi}|{small}\n"
|
||||
"p[p>{xtu}\n"
|
||||
|
||||
"qa>{ku}{xa}\n"
|
||||
"qi>{ku}{xi}\n"
|
||||
"qu>{ku}{xu}\n"
|
||||
"qe>{ku}{xe}\n"
|
||||
"qo>{ku}{xo}\n"
|
||||
"qy[{vowel}>{ku}|{small}\n"
|
||||
"q[q>{xtu}\n"
|
||||
|
||||
"ra>{ra}\n"
|
||||
"ri>{ri}\n"
|
||||
"ru>{ru}\n"
|
||||
"re>{re}\n"
|
||||
"ro>{ro}\n"
|
||||
"ry[{vowel}>{ri}|{small}\n"
|
||||
"r[r>{xtu}\n"
|
||||
|
||||
"sa>{sa}\n"
|
||||
"si>{si}\n"
|
||||
"su>{su}\n"
|
||||
"se>{se}\n"
|
||||
"so>{so}\n"
|
||||
"sy[{vowel}>{si}|{small}\n"
|
||||
"s[sh>{xtu}\n"
|
||||
"s[s>{xtu}\n"
|
||||
|
||||
"ta>{ta}\n"
|
||||
"ti>{ti}\n"
|
||||
"tu>{tu}\n"
|
||||
"te>{te}\n"
|
||||
"to>{to}\n"
|
||||
"th[{vowel}>{te}|{small}\n"
|
||||
"tsa>{tu}{xa}\n"
|
||||
"tsi>{tu}{xi}\n"
|
||||
"tse>{tu}{xe}\n"
|
||||
"tso>{tu}{xo}\n"
|
||||
"ty[{vowel}>{ti}|{small}\n"
|
||||
"t[ts>{xtu}\n"
|
||||
"t[ch>{xtu}\n"
|
||||
"t[t>{xtu}\n"
|
||||
|
||||
"u>{u}\n"
|
||||
|
||||
"va>{VA}\n"
|
||||
"vi>{VI}\n"
|
||||
"vu>{vu}\n"
|
||||
"ve>{VE}\n"
|
||||
"vo>{VO}\n"
|
||||
"vy[{vowel}>{VI}|{small}\n"
|
||||
"v[v>{xtu}\n"
|
||||
|
||||
"wa>{wa}\n"
|
||||
"wi>{wi}\n"
|
||||
"we>{we}\n"
|
||||
"wo>{wo}\n"
|
||||
"w[w>{xtu}\n"
|
||||
|
||||
"ya>{ya}\n"
|
||||
"yu>{yu}\n"
|
||||
"ye>{i}{xe}\n"
|
||||
"yo>{yo}\n"
|
||||
"y[y>{xtu}\n"
|
||||
|
||||
"za>{za}\n"
|
||||
"zi>{zi}\n"
|
||||
"zu>{zu}\n"
|
||||
"ze>{ze}\n"
|
||||
"zo>{zo}\n"
|
||||
"zy[{vowel}>{zi}|{small}\n"
|
||||
"z[z>{xtu}\n"
|
||||
|
||||
"xa>{xa}\n"
|
||||
"xi>{xi}\n"
|
||||
"xu>{xu}\n"
|
||||
"xe>{xe}\n"
|
||||
"xo>{xo}\n"
|
||||
"xka>{XKA}\n"
|
||||
"xke>{XKE}\n"
|
||||
"xtu>{xtu}\n"
|
||||
"xwa>{xwa}\n"
|
||||
"xya>{xya}\n"
|
||||
"xyu>{xyu}\n"
|
||||
"xyo>{xyo}\n"
|
||||
|
||||
// optional mappings
|
||||
"wu>{u}\n"
|
||||
|
||||
"ca>{ka}\n"
|
||||
"ci>{si}\n"
|
||||
"cu>{ku}\n"
|
||||
"ce>{se}\n"
|
||||
"co>{ko}\n"
|
||||
"cha>{ti}{xya}\n"
|
||||
"chi>{ti}\n"
|
||||
"chu>{ti}{xyu}\n"
|
||||
"che>{ti}{xe}\n"
|
||||
"cho>{ti}{xyo}\n"
|
||||
"cy[{vowel}>{ti}|{small}\n"
|
||||
"c[k>{xtu}\n"
|
||||
"c[c>{xtu}\n"
|
||||
|
||||
"fu>{hu}\n"
|
||||
|
||||
"ja>{zi}{xya}\n"
|
||||
"ji>{zi}\n"
|
||||
"ju>{zi}{xyu}\n"
|
||||
"je>{zi}{xe}\n"
|
||||
"jo>{zi}{xyo}\n"
|
||||
"jy[{vowel}>{zi}|{small}\n"
|
||||
"j[j>{xtu}\n"
|
||||
|
||||
"la>{ra}\n"
|
||||
"li>{ri}\n"
|
||||
"lu>{ru}\n"
|
||||
"le>{re}\n"
|
||||
"lo>{ro}\n"
|
||||
"ly[{vowel}>{ri}|{small}\n"
|
||||
"l[l>{xtu}\n"
|
||||
|
||||
"sha>{si}{xya}\n"
|
||||
"shi>{si}\n"
|
||||
"shu>{si}{xyu}\n"
|
||||
"she>{si}{xe}\n"
|
||||
"sho>{si}{xyo}\n"
|
||||
|
||||
"tsu>{tu}\n"
|
||||
|
||||
"yi>{i}\n"
|
||||
|
||||
"xtsu>{xtu}\n"
|
||||
"xyi>{xi}\n"
|
||||
"xye>{xe}\n"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Convert vowels to small form
|
||||
"{small}a>{xya}\n"
|
||||
"{small}i>{xi}\n"
|
||||
"{small}u>{xyu}\n"
|
||||
"{small}e>{xe}\n"
|
||||
"{small}o>{xyo}\n"
|
||||
|
||||
|
||||
|
||||
|
||||
"gy|{hvr}<{gi}[{hv}\n"
|
||||
"gwa<{gu}{xwa}\n"
|
||||
"gwi<{gu}{xi}\n"
|
||||
"gwu<{gu}{xu}\n"
|
||||
"gwe<{gu}{xe}\n"
|
||||
"gwo<{gu}{xo}\n"
|
||||
"ga<{ga}\n"
|
||||
"gi<{gi}\n"
|
||||
"gu<{gu}\n"
|
||||
"ge<{ge}\n"
|
||||
"go<{go}\n"
|
||||
|
||||
"ky|{hvr}<{ki}[{hv}\n"
|
||||
"kwa<{ku}{xwa}\n"
|
||||
"kwi<{ku}{xi}\n"
|
||||
"kwu<{ku}{xu}\n"
|
||||
"kwe<{ku}{xe}\n"
|
||||
"kwo<{ku}{xo}\n"
|
||||
"qa<{ku}{xa}\n"
|
||||
"qya<{ku}{xya}\n"
|
||||
"qyu<{ku}{xyu}\n"
|
||||
"qyo<{ku}{xyo}\n"
|
||||
"ka<{ka}\n"
|
||||
"ki<{ki}\n"
|
||||
"ku<{ku}\n"
|
||||
"ke<{ke}\n"
|
||||
"ko<{ko}\n"
|
||||
|
||||
"j|{hvr}<{zi}[{hv}\n" // Hepburn
|
||||
"za<{za}\n"
|
||||
"ji<{zi}\n" // Hepburn
|
||||
"zu<{zu}\n"
|
||||
"ze<{ze}\n"
|
||||
"zo<{zo}\n"
|
||||
|
||||
"sh|{hvr}<{si}[{hv}\n" // Hepburn
|
||||
"sa<{sa}\n"
|
||||
"shi<{si}\n"
|
||||
"su<{su}\n"
|
||||
"se<{se}\n"
|
||||
"so<{so}\n"
|
||||
|
||||
"j|{hvr}<{di}[{hv}\n" // Hepburn
|
||||
"dh|{hvr}<{de}[{hv}\n"
|
||||
"da<{da}\n"
|
||||
"ji<{di}\n" // Hepburn
|
||||
"de<{de}\n"
|
||||
"do<{do}\n"
|
||||
"zu<{du}\n" // Hepburn
|
||||
|
||||
"ch|{hvr}<{ti}[{hv}\n" // Hepburn
|
||||
"tsa<{tu}{xa}\n"
|
||||
"tsi<{tu}{xi}\n"
|
||||
"tse<{tu}{xe}\n"
|
||||
"tso<{tu}{xo}\n"
|
||||
"th|{hvr}<{te}[{hv}\n"
|
||||
"ta<{ta}\n"
|
||||
"chi<{ti}\n" // Hepburn
|
||||
"tsu<{tu}\n" // Hepburn
|
||||
"te<{te}\n"
|
||||
"to<{to}\n"
|
||||
|
||||
"ny|{hvr}<{ni}[{hv}\n"
|
||||
"na<{na}\n"
|
||||
"ni<{ni}\n"
|
||||
"nu<{nu}\n"
|
||||
"ne<{ne}\n"
|
||||
"no<{no}\n"
|
||||
|
||||
"by|{hvr}<{bi}[{hv}\n"
|
||||
"ba<{ba}\n"
|
||||
"bi<{bi}\n"
|
||||
"bu<{bu}\n"
|
||||
"be<{be}\n"
|
||||
"bo<{bo}\n"
|
||||
|
||||
"py|{hvr}<{pi}[{hv}\n"
|
||||
"pa<{pa}\n"
|
||||
"pi<{pi}\n"
|
||||
"pu<{pu}\n"
|
||||
"pe<{pe}\n"
|
||||
"po<{po}\n"
|
||||
|
||||
"hy|{hvr}<{hi}[{hv}\n"
|
||||
"fa<{hu}{xa}\n"
|
||||
"fi<{hu}{xi}\n"
|
||||
"fe<{hu}{xe}\n"
|
||||
"fo<{hu}{xo}\n"
|
||||
"fya<{hu}{xya}\n"
|
||||
"fyu<{hu}{xyu}\n"
|
||||
"fyo<{hu}{xyo}\n"
|
||||
"ha<{ha}\n"
|
||||
"hi<{hi}\n"
|
||||
"fu<{hu}\n" // Hepburn
|
||||
"he<{he}\n"
|
||||
"ho<{ho}\n"
|
||||
|
||||
"my|{hvr}<{mi}[{hv}\n"
|
||||
"ma<{ma}\n"
|
||||
"mi<{mi}\n"
|
||||
"mu<{mu}\n"
|
||||
"me<{me}\n"
|
||||
"mo<{mo}\n"
|
||||
|
||||
"ya<{ya}\n"
|
||||
"yu<{yu}\n"
|
||||
"ye<{i}{xe}\n"
|
||||
"yo<{yo}\n"
|
||||
"xya<{xya}\n"
|
||||
"xyu<{xyu}\n"
|
||||
"xyo<{xyo}\n"
|
||||
|
||||
"ry|{hvr}<{ri}[{hv}\n"
|
||||
"ra<{ra}\n"
|
||||
"ri<{ri}\n"
|
||||
"ru<{ru}\n"
|
||||
"re<{re}\n"
|
||||
"ro<{ro}\n"
|
||||
|
||||
"wa<{wa}\n"
|
||||
"wi<{wi}\n"
|
||||
"we<{we}\n"
|
||||
"wo<{wo}\n"
|
||||
|
||||
"vu<{vu}\n"
|
||||
"vy|{hvr}<{VI}[{hv}\n"
|
||||
"v<{xtu}[{vu}\n"
|
||||
|
||||
"xa<{xa}\n"
|
||||
"xi<{xi}\n"
|
||||
"xu<{xu}\n"
|
||||
"xe<{xe}\n"
|
||||
"xo<{xo}\n"
|
||||
|
||||
"n''<{n}[{a}\n"
|
||||
"n''<{n}[{i}\n"
|
||||
"n''<{n}[{u}\n"
|
||||
"n''<{n}[{e}\n"
|
||||
"n''<{n}[{o}\n"
|
||||
"n''<{n}[{na}\n"
|
||||
"n''<{n}[{ni}\n"
|
||||
"n''<{n}[{nu}\n"
|
||||
"n''<{n}[{ne}\n"
|
||||
"n''<{n}[{no}\n"
|
||||
"n''<{n}[{ya}\n"
|
||||
"n''<{n}[{yu}\n"
|
||||
"n''<{n}[{yo}\n"
|
||||
"n''<{n}[{n}\n"
|
||||
"n<{n}\n"
|
||||
|
||||
|
||||
"g<{xtu}[{ga}\n"
|
||||
"g<{xtu}[{gi}\n"
|
||||
"g<{xtu}[{gu}\n"
|
||||
"g<{xtu}[{ge}\n"
|
||||
"g<{xtu}[{go}\n"
|
||||
"k<{xtu}[{ka}\n"
|
||||
"k<{xtu}[{ki}\n"
|
||||
"k<{xtu}[{ku}\n"
|
||||
"k<{xtu}[{ke}\n"
|
||||
"k<{xtu}[{ko}\n"
|
||||
|
||||
"z<{xtu}[{za}\n"
|
||||
"z<{xtu}[{zi}\n"
|
||||
"z<{xtu}[{zu}\n"
|
||||
"z<{xtu}[{ze}\n"
|
||||
"z<{xtu}[{zo}\n"
|
||||
"s<{xtu}[{sa}\n"
|
||||
"s<{xtu}[{si}\n"
|
||||
"s<{xtu}[{su}\n"
|
||||
"s<{xtu}[{se}\n"
|
||||
"s<{xtu}[{so}\n"
|
||||
|
||||
"d<{xtu}[{da}\n"
|
||||
"d<{xtu}[{di}\n"
|
||||
"d<{xtu}[{du}\n"
|
||||
"d<{xtu}[{de}\n"
|
||||
"d<{xtu}[{do}\n"
|
||||
"t<{xtu}[{ta}\n"
|
||||
"t<{xtu}[{ti}\n"
|
||||
"t<{xtu}[{tu}\n"
|
||||
"t<{xtu}[{te}\n"
|
||||
"t<{xtu}[{to}\n"
|
||||
|
||||
|
||||
"b<{xtu}[{ba}\n"
|
||||
"b<{xtu}[{bi}\n"
|
||||
"b<{xtu}[{bu}\n"
|
||||
"b<{xtu}[{be}\n"
|
||||
"b<{xtu}[{bo}\n"
|
||||
"p<{xtu}[{pa}\n"
|
||||
"p<{xtu}[{pi}\n"
|
||||
"p<{xtu}[{pu}\n"
|
||||
"p<{xtu}[{pe}\n"
|
||||
"p<{xtu}[{po}\n"
|
||||
"h<{xtu}[{ha}\n"
|
||||
"h<{xtu}[{hi}\n"
|
||||
"h<{xtu}[{hu}\n"
|
||||
"h<{xtu}[{he}\n"
|
||||
"h<{xtu}[{ho}\n"
|
||||
|
||||
|
||||
"r<{xtu}[{ra}\n"
|
||||
"r<{xtu}[{ri}\n"
|
||||
"r<{xtu}[{ru}\n"
|
||||
"r<{xtu}[{re}\n"
|
||||
"r<{xtu}[{ro}\n"
|
||||
|
||||
"w<{xtu}[{wa}\n"
|
||||
"xtu<{xtu}\n"
|
||||
|
||||
"a<{a}\n"
|
||||
"i<{i}\n"
|
||||
"u<{u}\n"
|
||||
"e<{e}\n"
|
||||
"o<{o}\n"
|
||||
|
||||
|
||||
|
||||
// Convert small forms to vowels
|
||||
"a<{hvr}{xya}\n"
|
||||
"i<{hvr}{xi}\n"
|
||||
"u<{hvr}{xyu}\n"
|
||||
"e<{hvr}{xe}\n"
|
||||
"o<{hvr}{xyo}\n"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
315
icu4c/data/translit/lrussian.txt
Normal file
315
icu4c/data/translit/lrussian.txt
Normal file
|
@ -0,0 +1,315 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Russian
|
||||
|
||||
lrussian {
|
||||
Rule {
|
||||
// Russian Letters
|
||||
|
||||
"cyA=\u0410\n"
|
||||
"cyBe=\u0411\n"
|
||||
"cyVe=\u0412\n"
|
||||
"cyGe=\u0413\n"
|
||||
"cyDe=\u0414\n"
|
||||
"cyYe=\u0415\n"
|
||||
"cyYo=\u0401\n" // CYRILLIC CAPITAL LETTER IO (lowercase counterpart: cyyo=\u0451)
|
||||
"cyZhe=\u0416\n" // CYRILLIC CAPITAL LETTER ZHE (lowercase counterpart: cyzhe=\u0436)
|
||||
"cyZe=\u0417\n" // CYRILLIC CAPITAL LETTER ZE (lowercase counterpart: cyze=\u0437)
|
||||
"cyYi=\u0418\n" // CYRILLIC CAPITAL LETTER I (lowercase counterpart: cyyi=\u0438)
|
||||
"cyY=\u0419\n"
|
||||
"cyKe=\u041a\n"
|
||||
"cyLe=\u041b\n"
|
||||
"cyMe=\u041c\n"
|
||||
"cyNe=\u041d\n"
|
||||
"cyO=\u041e\n"
|
||||
"cyPe=\u041f\n"
|
||||
|
||||
"cyRe=\u0420\n"
|
||||
"cySe=\u0421\n"
|
||||
"cyTe=\u0422\n"
|
||||
"cyU=\u0423\n"
|
||||
"cyFe=\u0424\n"
|
||||
"cyKhe=\u0425\n"
|
||||
"cyTse=\u0426\n"
|
||||
"cyChe=\u0427\n"
|
||||
"cyShe=\u0428\n"
|
||||
"cyShche=\u0429\n"
|
||||
"cyHard=\u042a\n"
|
||||
"cyI=\u042b\n"
|
||||
"cySoft=\u042c\n"
|
||||
"cyE=\u042d\n"
|
||||
"cyYu=\u042e\n"
|
||||
"cyYa=\u042f\n"
|
||||
|
||||
"cya=\u0430\n"
|
||||
"cybe=\u0431\n"
|
||||
"cyve=\u0432\n"
|
||||
"cyge=\u0433\n"
|
||||
"cyde=\u0434\n"
|
||||
"cyye=\u0435\n"
|
||||
"cyzhe=\u0436\n"
|
||||
"cyze=\u0437\n"
|
||||
"cyyi=\u0438\n"
|
||||
"cyy=\u0439\n"
|
||||
"cyke=\u043a\n"
|
||||
"cyle=\u043b\n"
|
||||
"cyme=\u043c\n"
|
||||
"cyne=\u043d\n"
|
||||
"cyo=\u043e\n"
|
||||
"cype=\u043f\n"
|
||||
|
||||
"cyre=\u0440\n"
|
||||
"cyse=\u0441\n"
|
||||
"cyte=\u0442\n"
|
||||
"cyu=\u0443\n"
|
||||
"cyfe=\u0444\n"
|
||||
"cykhe=\u0445\n"
|
||||
"cytse=\u0446\n"
|
||||
"cyche=\u0447\n"
|
||||
"cyshe=\u0448\n"
|
||||
"cyshche=\u0449\n"
|
||||
"cyhard=\u044a\n"
|
||||
"cyi=\u044b\n"
|
||||
"cysoft=\u044c\n"
|
||||
"cye=\u044d\n"
|
||||
"cyyu=\u044e\n"
|
||||
"cyya=\u044f\n"
|
||||
|
||||
"cyyo=\u0451\n"
|
||||
|
||||
// convert English to Russian
|
||||
"Russian>\u041f\u0420\u0410\u0412\u0414\u0410\u00D1\u0020\u0411\u044d\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f\u002c\u0020\u043a\u044b\u0440\u0433\u044b\u0437\u002c\u0020\u041c\u043e\u043b\u0434\u043e\u0432\u044d\u043d\u044f\u0441\u043a\u044d\u002e\n"
|
||||
|
||||
//special equivs for ay, oy, ...
|
||||
"YAI>{cyYa}{cyY}\n"
|
||||
"YEI>{cyYe}{cyY}\n"
|
||||
"YII>{cyYi}{cyY}\n"
|
||||
"YOI>{cyYo}{cyY}\n"
|
||||
"YUI>{cyYu}{cyY}\n"
|
||||
"AI>{cyA}{cyY}\n"
|
||||
"EI>{cyE}{cyY}\n"
|
||||
//skip II, since it is the soft sign
|
||||
"OI>{cyO}{cyY}\n"
|
||||
"UI>{cyU}{cyY}\n"
|
||||
|
||||
"A>{cyA}\n"
|
||||
"B>{cyBe}\n"
|
||||
"CH>{cyChe}\n"
|
||||
"C[I>{cySe}\n"
|
||||
"C[E>{cySe}\n"
|
||||
"C[Y>{cySe}\n"
|
||||
"C>{cyKe}\n"
|
||||
"D>{cyDe}\n"
|
||||
"E>{cyE}\n"
|
||||
"F>{cyFe}\n"
|
||||
"G>{cyGe}\n"
|
||||
"H>{cyHard}\n"
|
||||
"II>{cySoft}\n"
|
||||
"I>{cyI}\n"
|
||||
"J>{cyDe}{cyZhe}\n"
|
||||
"KH>{cyKhe}\n"
|
||||
"K>{cyKe}\n"
|
||||
"L>{cyLe}\n"
|
||||
"M>{cyMe}\n"
|
||||
"N>{cyNe}\n"
|
||||
"O>{cyO}\n"
|
||||
"P>{cyPe}\n"
|
||||
"QU>{cyKe}{cyVe}\n"
|
||||
"R>{cyRe}\n"
|
||||
"SHTCH>{cyShche}\n"
|
||||
"SHCH>{cyShche}\n"
|
||||
"SH>{cyShe}\n"
|
||||
"S>{cySe}\n"
|
||||
"TCH>{cyChe}\n"
|
||||
"TH>{cyZe}\n"
|
||||
"TS>{cyTse}\n"
|
||||
"T>{cyTe}\n"
|
||||
"U>{cyU}\n"
|
||||
"V>{cyVe}\n"
|
||||
"WH>{cyVe}\n"
|
||||
"W>{cyVe}\n"
|
||||
"X>{cyKe}{cySe}\n"
|
||||
"YE>{cyYe}\n"
|
||||
"YO>{cyYo}\n"
|
||||
"YU>{cyYu}\n"
|
||||
"YA>{cyYa}\n"
|
||||
"YI>{cyYi}\n"
|
||||
"Y>{cyY}\n"
|
||||
"ZH>{cyZhe}\n"
|
||||
"Z>{cyZe}\n"
|
||||
"X>{cyKe}{cySe}\n"
|
||||
|
||||
//lower case: doesn't solve join bug
|
||||
"yai>{cyya}{cyy}\n"
|
||||
"yei>{cyye}{cyy}\n"
|
||||
"yii>{cyyi}{cyy}\n"
|
||||
"yoi>{cyyo}{cyy}\n"
|
||||
"yui>{cyyu}{cyy}\n"
|
||||
"ai>{cya}{cyy}\n"
|
||||
"ei>{cye}{cyy}\n"
|
||||
//skip ii, since it is the soft sign
|
||||
"oi>{cyo}{cyy}\n"
|
||||
"ui>{cyu}{cyy}\n"
|
||||
|
||||
"a>{cya}\n"
|
||||
"b>{cybe}\n"
|
||||
"ch>{cyche}\n"
|
||||
"c[i>{cyse}\n"
|
||||
"c[e>{cyse}\n"
|
||||
"c[y>{cyse}\n"
|
||||
"c>{cyke}\n"
|
||||
"d>{cyde}\n"
|
||||
"e>{cye}\n"
|
||||
"f>{cyfe}\n"
|
||||
"g>{cyge}\n"
|
||||
"h>{cyhard}\n"
|
||||
"ii>{cysoft}\n"
|
||||
"i>{cyi}\n"
|
||||
"j>{cyde}{cyzhe}\n"
|
||||
"kh>{cykhe}\n"
|
||||
"k>{cyke}\n"
|
||||
"l>{cyle}\n"
|
||||
"m>{cyme}\n"
|
||||
"n>{cyne}\n"
|
||||
"o>{cyo}\n"
|
||||
"p>{cype}\n"
|
||||
"qu>{cyke}{cyve}\n"
|
||||
"r>{cyre}\n"
|
||||
"shtch>{cyshche}\n"
|
||||
"shch>{cyshche}\n"
|
||||
"sh>{cyshe}\n"
|
||||
"s>{cyse}\n"
|
||||
"tch>{cyche}\n"
|
||||
"th>{cyze}\n"
|
||||
"ts>{cytse}\n"
|
||||
"t>{cyte}\n"
|
||||
"u>{cyu}\n"
|
||||
"v>{cyve}\n"
|
||||
"wh>{cyve}\n"
|
||||
"w>{cyve}\n"
|
||||
"x>{cyke}{cyse}\n"
|
||||
"ye>{cyye}\n"
|
||||
"yo>{cyyo}\n"
|
||||
"yu>{cyyu}\n"
|
||||
"ya>{cyya}\n"
|
||||
"yi>{cyyi}\n"
|
||||
"y>{cyy}\n"
|
||||
"zh>{cyzhe}\n"
|
||||
"z>{cyze}\n"
|
||||
"x>{cyke}{cyse}\n"
|
||||
|
||||
//generally the last rule
|
||||
"''>\n"
|
||||
|
||||
//now Russian to English
|
||||
|
||||
"Y''<{cyY}[{cyA}\n"
|
||||
"Y''<{cyY}[{cyE}\n"
|
||||
"Y''<{cyY}[{cyI}\n"
|
||||
"Y''<{cyY}[{cyO}\n"
|
||||
"Y''<{cyY}[{cyU}\n"
|
||||
"A<{cyA}\n"
|
||||
"B<{cyBe}\n"
|
||||
"J<{cyDe}{cyZhe}\n"
|
||||
"D<{cyDe}\n"
|
||||
"V<{cyVe}\n"
|
||||
"G<{cyGe}\n"
|
||||
"ZH<{cyZhe}\n"
|
||||
"Z''<{cyZe}[{cyHard}\n"
|
||||
"Z<{cyZe}\n"
|
||||
"YE<{cyYe}\n"
|
||||
"YO<{cyYo}\n"
|
||||
"YU<{cyYu}\n"
|
||||
"YA<{cyYa}\n"
|
||||
"YI<{cyYi}\n"
|
||||
"Y<{cyY}\n"
|
||||
"KH<{cyKhe}\n"
|
||||
"K''<{cyKe}[{cyHard}\n"
|
||||
"X<{cyKe}{cySe}\n"
|
||||
"K<{cyKe}\n"
|
||||
"L<{cyLe}\n"
|
||||
"M<{cyMe}\n"
|
||||
"N<{cyNe}\n"
|
||||
"O<{cyO}\n"
|
||||
"P<{cyPe}\n"
|
||||
|
||||
"R<{cyRe}\n"
|
||||
"SHCH<{cyShche}\n"
|
||||
"SH''<{cyShe}[{cyChe}\n"
|
||||
"SH<{cyShe}\n"
|
||||
"S''<{cySe}[{cyHard}\n"
|
||||
"S<{cySe}\n"
|
||||
"TS<{cyTse}\n"
|
||||
"T''<{cyTe}[{cySe}\n"
|
||||
"T''<{cyTe}[{cyHard}\n"
|
||||
"T<{cyTe}\n"
|
||||
"U<{cyU}\n"
|
||||
"F<{cyFe}\n"
|
||||
"CH<{cyChe}\n"
|
||||
"H<{cyHard}\n"
|
||||
"I''<{cyI}[{cyI}\n"
|
||||
"I<{cyI}\n"
|
||||
"II<{cySoft}\n"
|
||||
"E<{cyE}\n"
|
||||
|
||||
//lowercase
|
||||
"y''<{cyy}[{cya}\n"
|
||||
"y''<{cyy}[{cye}\n"
|
||||
"y''<{cyy}[{cyi}\n"
|
||||
"y''<{cyy}[{cyo}\n"
|
||||
"y''<{cyy}[{cyu}\n"
|
||||
"a<{cya}\n"
|
||||
"b<{cybe}\n"
|
||||
"j<{cyde}{cyzhe}\n"
|
||||
"d<{cyde}\n"
|
||||
"v<{cyve}\n"
|
||||
"g<{cyge}\n"
|
||||
"zh<{cyzhe}\n"
|
||||
"z''<{cyze}[{cyhard}\n"
|
||||
"z<{cyze}\n"
|
||||
"ye<{cyye}\n"
|
||||
"yo<{cyyo}\n"
|
||||
"yu<{cyyu}\n"
|
||||
"ya<{cyya}\n"
|
||||
"yi<{cyyi}\n"
|
||||
"y<{cyy}\n"
|
||||
"kh<{cykhe}\n"
|
||||
"k''<{cyke}[{cyhard}\n"
|
||||
"x<{cyke}{cyse}\n"
|
||||
"k<{cyke}\n"
|
||||
"l<{cyle}\n"
|
||||
"m<{cyme}\n"
|
||||
"n<{cyne}\n"
|
||||
"o<{cyo}\n"
|
||||
"p<{cype}\n"
|
||||
|
||||
"r<{cyre}\n"
|
||||
"shch<{cyshche}\n"
|
||||
"sh''<{cyshe}[{cyche}\n"
|
||||
"sh<{cyshe}\n"
|
||||
"s''<{cyse}[{cyhard}\n"
|
||||
"s<{cyse}\n"
|
||||
"ts<{cytse}\n"
|
||||
"t''<{cyte}[{cyse}\n"
|
||||
"t''<{cyte}[{cyhard}\n"
|
||||
"t<{cyte}\n"
|
||||
"u<{cyu}\n"
|
||||
"f<{cyfe}\n"
|
||||
"ch<{cyche}\n"
|
||||
"h<{cyhard}\n"
|
||||
"i''<{cyi}[{cyi}\n"
|
||||
"i<{cyi}\n"
|
||||
"ii<{cysoft}\n"
|
||||
"e<{cye}\n"
|
||||
|
||||
//generally the last rule
|
||||
"''>\n"
|
||||
//the end
|
||||
}
|
||||
}
|
83
icu4c/data/translit/quotes.txt
Normal file
83
icu4c/data/translit/quotes.txt
Normal file
|
@ -0,0 +1,83 @@
|
|||
//--------------------------------------------------------------------
|
||||
// Copyright (C) 1999, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// Date Name Description
|
||||
// 11/17/99 aliu Creation.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// StraightQuotes-CurlyQuotes
|
||||
|
||||
quotes {
|
||||
Rule {
|
||||
// Rewritten using character codes [LIU]
|
||||
"white=[[:Zs:][:Zl:][:Zp:]]\n"
|
||||
"black=[^[:Zs:][:Zl:][:Zp:]]\n"
|
||||
"open=[[:Ps:]]\n"
|
||||
"dquote=\"\n"
|
||||
|
||||
"lAng=\u3008\n"
|
||||
"ldAng=\u300A\n"
|
||||
"lBrk='['\n"
|
||||
"lBrc='{'\n"
|
||||
|
||||
"lquote=\u2018\n"
|
||||
"rquote=\u2019\n"
|
||||
"ldquote=\u201C\n"
|
||||
"rdquote=\u201D\n"
|
||||
|
||||
"ldguill=\u00AB\n"
|
||||
"rdguill=\u00BB\n"
|
||||
"lguill=\u2039\n"
|
||||
"rguill=\u203A\n"
|
||||
|
||||
"mdash=\u2014\n"
|
||||
|
||||
//#######################################
|
||||
// Conversions from input
|
||||
//#######################################
|
||||
|
||||
// join single quotes
|
||||
"{lquote}''>{ldquote}\n"
|
||||
"{lquote}{lquote}>{ldquote}\n"
|
||||
"{rquote}''>{rdquote}\n"
|
||||
"{rquote}{rquote}>{rdquote}\n"
|
||||
|
||||
//smart single quotes
|
||||
"{white}]''>{lquote}\n"
|
||||
"{open}]''>{lquote}\n"
|
||||
"{black}]''>{rquote}\n"
|
||||
"''>{lquote}\n"
|
||||
|
||||
//smart doubles
|
||||
"{white}]{dquote}>{ldquote}\n"
|
||||
"{open}]{dquote}>{ldquote}\n"
|
||||
"{black}]{dquote}>{rdquote}\n"
|
||||
"{dquote}>{ldquote}\n"
|
||||
|
||||
// join single guillemets
|
||||
"{rguill}{rguill}>{rdguill}\n"
|
||||
"'>>'>{rdguill}\n"
|
||||
"{lguill}{lguill}>{ldguill}\n"
|
||||
"'<<'>{ldguill}\n"
|
||||
|
||||
// prevent double spaces
|
||||
" ] >\n"
|
||||
|
||||
// join hyphens into dash
|
||||
"-->{mdash}\n"
|
||||
|
||||
//#######################################
|
||||
// Conversions back to input
|
||||
//#######################################
|
||||
|
||||
//smart quotes
|
||||
"''<{lquote}\n"
|
||||
"''<{rquote}\n"
|
||||
"{dquote}<{ldquote}\n"
|
||||
"{dquote}<{rdquote}\n"
|
||||
|
||||
//hyphens
|
||||
"--<{mdash}\n"
|
||||
}
|
||||
}
|
1532
icu4c/data/translit/ucname.txt
Normal file
1532
icu4c/data/translit/ucname.txt
Normal file
File diff suppressed because it is too large
Load diff
277
icu4c/source/i18n/cpdtrans.cpp
Normal file
277
icu4c/source/i18n/cpdtrans.cpp
Normal file
|
@ -0,0 +1,277 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "cpdtrans.h"
|
||||
#include "unifilt.h"
|
||||
#include "unifltlg.h"
|
||||
|
||||
/**
 * Constructs a compound transliterator whose chain is initially empty
 * (no component array, zero components).  The ID and the adopted filter
 * are forwarded unchanged to the Transliterator base class.
 */
CompoundTransliterator::CompoundTransliterator(
        const UnicodeString& ID,
        UnicodeFilter* adoptedFilter)
    : Transliterator(ID, adoptedFilter),
      trans(0),
      count(0)
{
}
|
||||
|
||||
/**
|
||||
* Constructs a new compound transliterator given an array of
|
||||
* transliterators. The array of transliterators may be of any
|
||||
* length, including zero or one, however, useful compound
|
||||
* transliterators have at least two components.
|
||||
* @param transliterators array of <code>Transliterator</code>
|
||||
* objects
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
CompoundTransliterator::CompoundTransliterator(const UnicodeString& ID,
|
||||
Transliterator* const transliterators[],
|
||||
int32_t transCount,
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(ID,adoptedFilter),
|
||||
trans(0), count(0) {
|
||||
setTransliterators(transliterators, transCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :
|
||||
Transliterator(t), trans(0), count(0) {
|
||||
*this = t;
|
||||
}
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
CompoundTransliterator::~CompoundTransliterator() {
|
||||
freeTransliterators();
|
||||
}
|
||||
|
||||
void CompoundTransliterator::freeTransliterators() {
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
delete trans[i];
|
||||
}
|
||||
delete[] trans;
|
||||
trans = 0;
|
||||
count = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
CompoundTransliterator& CompoundTransliterator::operator=(
|
||||
const CompoundTransliterator& t) {
|
||||
Transliterator::operator=(t);
|
||||
int32_t i;
|
||||
for (i=0; i<count; ++i) {
|
||||
delete trans[i];
|
||||
trans[i] = 0;
|
||||
}
|
||||
if (t.count > count) {
|
||||
delete[] trans;
|
||||
trans = new Transliterator*[t.count];
|
||||
}
|
||||
count = t.count;
|
||||
for (i=0; i<count; ++i) {
|
||||
trans[i] = t.trans[i]->clone();
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* CompoundTransliterator::clone() const {
|
||||
return new CompoundTransliterator(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of transliterators in this chain.
|
||||
* @return number of transliterators in this chain.
|
||||
*/
|
||||
int32_t CompoundTransliterator::getCount() const {
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the transliterator at the given index in this chain.
|
||||
* @param index index into chain, from 0 to <code>getCount() - 1</code>
|
||||
* @return transliterator at the given index
|
||||
*/
|
||||
const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const {
|
||||
return *trans[index];
|
||||
}
|
||||
|
||||
|
||||
/**
 * Replaces the chain with clones of the given transliterators.  The
 * caller keeps ownership of the objects passed in.
 * @param transliterators array of transliterators to clone
 * @param transCount number of entries in <code>transliterators</code>
 */
void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[],
                                                int32_t transCount)
{
    // Build the cloned array first; the old chain is only released when
    // the new array is adopted.
    Transliterator** copies = new Transliterator*[transCount];
    int32_t n = 0;
    while (n < transCount) {
        copies[n] = transliterators[n]->clone();
        ++n;
    }
    adoptTransliterators(copies, transCount);
}
|
||||
|
||||
/**
 * Replaces the chain with the given array, taking ownership of both the
 * array and the transliterators it holds.  The previous chain is freed
 * first.
 * @param adoptedTransliterators array to adopt
 * @param transCount number of entries in the adopted array
 */
void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[],
                                                  int32_t transCount)
{
    this->freeTransliterators();
    count = transCount;
    trans = adoptedTransliterators;
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
int32_t CompoundTransliterator::transliterate(Replaceable& text,
|
||||
int32_t start, int32_t limit) const {
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
limit = trans[i]->transliterate(text, start, limit);
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
void CompoundTransliterator::handleKeyboardTransliterate(Replaceable& text,
|
||||
int32_t index[3]) const {
|
||||
/* Call each transliterator with the same start value and
|
||||
* initial cursor index, but with the limit index as modified
|
||||
* by preceding transliterators. The cursor index must be
|
||||
* reset for each transliterator to give each a chance to
|
||||
* transliterate the text. The initial cursor index is known
|
||||
* to still point to the same place after each transliterator
|
||||
* is called because each transliterator will not change the
|
||||
* text between start and the initial value of cursor.
|
||||
*
|
||||
* IMPORTANT: After the first transliterator, each subsequent
|
||||
* transliterator only gets to transliterate text committed by
|
||||
* preceding transliterators; that is, the cursor (output
|
||||
* value) of transliterator i becomes the limit (input value)
|
||||
* of transliterator i+1. Finally, the overall limit is fixed
|
||||
* up before we return.
|
||||
*
|
||||
* Assumptions we make here:
|
||||
* (1) start <= cursor <= limit ;cursor valid on entry
|
||||
* (2) cursor <= cursor' <= limit' ;cursor doesn't move back
|
||||
* (3) cursor <= limit' ;text before cursor unchanged
|
||||
* - cursor' is the value of cursor after calling handleKT
|
||||
* - limit' is the value of limit after calling handleKT
|
||||
*/
|
||||
|
||||
/**
|
||||
* Example: 3 transliterators. This example illustrates the
|
||||
* mechanics we need to implement. S, C, and L are the start,
|
||||
* cursor, and limit. gl is the globalLimit.
|
||||
*
|
||||
* 1. h-u, changes hex to Unicode
|
||||
*
|
||||
* 4 7 a d 0 4 7 a
|
||||
* abc/u0061/u => abca/u
|
||||
* S C L S C L gl=f->a
|
||||
*
|
||||
* 2. upup, changes "x" to "XX"
|
||||
*
|
||||
* 4 7 a 4 7 a
|
||||
* abca/u => abcAA/u
|
||||
* S CL S C
|
||||
* L gl=a->b
|
||||
* 3. u-h, changes Unicode to hex
|
||||
*
|
||||
* 4 7 a 4 7 a d 0 3
|
||||
* abcAA/u => abc/u0041/u0041/u
|
||||
* S C L S C
|
||||
* L gl=b->15
|
||||
* 4. return
|
||||
*
|
||||
* 4 7 a d 0 3
|
||||
* abc/u0041/u0041/u
|
||||
* S C L
|
||||
*/
|
||||
|
||||
if (count < 1) {
|
||||
return; // Short circuit for empty compound transliterators
|
||||
}
|
||||
|
||||
/**
|
||||
* One more wrinkle. If there is a filter F for the compound
|
||||
* transliterator as a whole, then we need to modify every
|
||||
* non-null filter f in the chain to be f' = F & f. Then,
|
||||
* when we're done, we restore the original filters.
|
||||
*
|
||||
* A possible future optimization is to change f to f' at
|
||||
* construction time, but then if anyone else is using the
|
||||
* transliterators in the chain outside of this context, they
|
||||
* will get unexpected results.
|
||||
*/
|
||||
const UnicodeFilter* F = getFilter();
|
||||
UnicodeFilter** f = 0;
|
||||
if (F != 0) {
|
||||
f = new UnicodeFilter*[count];
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
f[i] = trans[i]->getFilter()->clone();
|
||||
trans[i]->adoptFilter(UnicodeFilterLogic::createAnd(*F, *f[i]));
|
||||
}
|
||||
}
|
||||
|
||||
int32_t cursor = index[CURSOR];
|
||||
int32_t limit = index[LIMIT];
|
||||
int32_t globalLimit = limit;
|
||||
/* globalLimit is the overall limit. We keep track of this
|
||||
* since we overwrite index[LIMIT] with the previous
|
||||
* index[CURSOR]. After each transliteration, we update
|
||||
* globalLimit for insertions or deletions that have happened.
|
||||
*/
|
||||
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
index[CURSOR] = cursor; // Reset cursor
|
||||
index[LIMIT] = limit;
|
||||
|
||||
trans[i]->handleKeyboardTransliterate(text, index);
|
||||
|
||||
// Adjust overall limit for insertions/deletions
|
||||
globalLimit += index[LIMIT] - limit;
|
||||
limit = index[CURSOR]; // Move limit to end of committed text
|
||||
}
|
||||
// Cursor is good where it is -- where the last
|
||||
// transliterator left it. Limit needs to be put back
|
||||
// where it was, modulo adjustments for deletions/insertions.
|
||||
index[LIMIT] = globalLimit;
|
||||
|
||||
// Fixup the transliterator filters, if we had to modify them.
|
||||
if (f != 0) {
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
trans[i]->adoptFilter(f[i]);
|
||||
}
|
||||
delete[] f;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
int32_t CompoundTransliterator::getMaximumContextLength() const {
|
||||
int32_t max = 0;
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
int32_t len = trans[i]->getMaximumContextLength();
|
||||
if (len > max) {
|
||||
max = len;
|
||||
}
|
||||
}
|
||||
return max;
|
||||
}
|
133
icu4c/source/i18n/cpdtrans.h
Normal file
133
icu4c/source/i18n/cpdtrans.h
Normal file
|
@ -0,0 +1,133 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef CPDTRANS_H
|
||||
#define CPDTRANS_H
|
||||
|
||||
#include "translit.h"
|
||||
|
||||
/**
|
||||
* A transliterator that is composed of two or more other
|
||||
* transliterator objects linked together. For example, if one
|
||||
* transliterator transliterates from script A to script B, and
|
||||
* another transliterates from script B to script C, the two may be
|
||||
* combined to form a new transliterator from A to C.
|
||||
*
|
||||
* <p>Composed transliterators may not behave as expected. For
|
||||
* example, inverses may not combine to form the identity
|
||||
* transliterator. See the class documentation for {@link
|
||||
* Transliterator} for details.
|
||||
*
|
||||
* <p>If a non-<tt>null</tt> <tt>UnicodeFilter</tt> is applied to a
|
||||
* <tt>CompoundTransliterator</tt>, it has the effect of being
|
||||
* logically <b>and</b>ed with the filter of each transliterator in
|
||||
* the chain.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: cpdtrans.h,v $ $Revision: 1.1 $ $Date: 1999/11/20 00:36:43 $
|
||||
*/
|
||||
class U_I18N_API CompoundTransliterator : public Transliterator {

public:

    /**
     * Constructs a new compound transliterator given an array of
     * transliterators.  The array of transliterators may be of any
     * length, including zero or one, however, useful compound
     * transliterators have at least two components.  The given
     * transliterators are cloned; the caller keeps ownership of them.
     * @param ID identifier passed to the Transliterator base class
     * @param transliterators array of <code>Transliterator</code>
     * objects
     * @param count number of entries in <code>transliterators</code>
     * @param adoptedFilter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If it is null then no
     * filtering is applied.
     */
    CompoundTransliterator(const UnicodeString& ID,
                           Transliterator* const transliterators[],
                           int32_t count,
                           UnicodeFilter* adoptedFilter = 0);

    /**
     * Constructs a compound transliterator with an empty chain.
     * @param ID identifier passed to the Transliterator base class
     * @param adoptedFilter the filter, or null for no filtering
     */
    CompoundTransliterator(const UnicodeString& ID,
                           UnicodeFilter* adoptedFilter = 0);

    /**
     * Destructor.  Deletes the owned component transliterators.
     */
    virtual ~CompoundTransliterator();

    /**
     * Copy constructor.  The components of the source are cloned.
     */
    CompoundTransliterator(const CompoundTransliterator&);

    /**
     * Assignment operator.  The components of the source are cloned.
     */
    CompoundTransliterator& operator=(const CompoundTransliterator&);

    /**
     * Transliterator API.  Returns a newly allocated copy of this object.
     */
    Transliterator* clone() const;

    /**
     * Returns the number of transliterators in this chain.
     * @return number of transliterators in this chain.
     */
    virtual int32_t getCount() const;

    /**
     * Returns the transliterator at the given index in this chain.
     * @param index index into chain, from 0 to <code>getCount() - 1</code>
     * @return transliterator at the given index
     */
    virtual const Transliterator& getTransliterator(int32_t index) const;

    /**
     * Replaces the chain with clones of the given transliterators.
     * The caller keeps ownership of the objects passed in.
     * @param transliterators array of transliterators to clone
     * @param count number of entries in <code>transliterators</code>
     */
    void setTransliterators(Transliterator* const transliterators[],
                            int32_t count);

    /**
     * Replaces the chain with the given array.  This object takes
     * ownership of the array itself (it is released with
     * <code>delete[]</code>) and of every transliterator in it.
     * @param adoptedTransliterators array to adopt
     * @param count number of entries in the adopted array
     */
    void adoptTransliterators(Transliterator* adoptedTransliterators[],
                              int32_t count);

    /**
     * Transliterates a segment of a string.  <code>Transliterator</code> API.
     * @param text the string to be transliterated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @return the new limit index
     */
    virtual int32_t transliterate(Replaceable& text, int32_t start, int32_t limit) const;

    /**
     * Implements {@link Transliterator#handleKeyboardTransliterate}.
     */
    virtual void handleKeyboardTransliterate(Replaceable& text,
                                             int32_t index[3]) const;

    /**
     * Returns the length of the longest context required by this transliterator.
     * This is <em>preceding</em> context.
     * @return maximum number of preceding context characters this
     * transliterator needs to examine
     */
    virtual int32_t getMaximumContextLength() const;

private:

    /**
     * Deletes all owned transliterators and the array holding them,
     * leaving the chain empty.
     */
    void freeTransliterators();

    /** Owned array of owned component transliterators; 0 when empty. */
    Transliterator** trans;

    /** Number of transliterators in the chain. */
    int32_t count;
};
|
||||
#endif
|
155
icu4c/source/i18n/hextouni.cpp
Normal file
155
icu4c/source/i18n/hextouni.cpp
Normal file
|
@ -0,0 +1,155 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "hextouni.h"
|
||||
#include "rep.h"
|
||||
#include "unifilt.h"
|
||||
#include "uniset.h" // For UnicodeSet::digit REMOVE LATER
|
||||
|
||||
/**
|
||||
* ID for this transliterator.
|
||||
*/
|
||||
// Passed to the Transliterator base-class constructor as this transliterator's ID.
const char* HexToUnicodeTransliterator::_ID = "Hex-Unicode";
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
HexToUnicodeTransliterator::HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(_ID, adoptedFilter) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
HexToUnicodeTransliterator::HexToUnicodeTransliterator(const HexToUnicodeTransliterator& o) :
|
||||
Transliterator(o) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
HexToUnicodeTransliterator& HexToUnicodeTransliterator::operator=(
|
||||
const HexToUnicodeTransliterator& o) {
|
||||
Transliterator::operator=(o);
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* HexToUnicodeTransliterator::clone() const {
|
||||
return new HexToUnicodeTransliterator(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
int32_t HexToUnicodeTransliterator::transliterate(Replaceable& text,
|
||||
int32_t start, int32_t limit) const {
|
||||
int32_t offsets[3] = { start, limit, start };
|
||||
handleKeyboardTransliterate(text, offsets);
|
||||
return offsets[LIMIT];
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
void HexToUnicodeTransliterator::handleKeyboardTransliterate(Replaceable& text,
|
||||
int32_t offsets[3]) const {
|
||||
/**
|
||||
* Performs transliteration changing Unicode hexadecimal
|
||||
* escapes to characters. For example, "U+0040" -> '@'. A fixed
|
||||
* set of prefixes is recognized: "\u", "\U", "u+", "U+".
|
||||
*/
|
||||
int32_t cursor = offsets[CURSOR];
|
||||
int32_t limit = offsets[LIMIT];
|
||||
|
||||
int32_t maxCursor = limit - 6;
|
||||
|
||||
while (cursor <= maxCursor) {
|
||||
UChar c = filteredCharAt(text, cursor + 5);
|
||||
int32_t digit0 = UnicodeSet::digit(c, 16);
|
||||
if (digit0 < 0) {
|
||||
if (c == '\\') {
|
||||
cursor += 5;
|
||||
} else if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += 4;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
int32_t u = digit0;
|
||||
bool_t toTop = FALSE;
|
||||
|
||||
for (int32_t i=4; i>=2; --i) {
|
||||
c = filteredCharAt(text, cursor + i);
|
||||
int32_t digit = UnicodeSet::digit(c, 16);
|
||||
if (digit < 0) {
|
||||
if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += i-1;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
toTop = TRUE; // This is a little awkward -- it was a "continue loop:"
|
||||
break; // statement in Java, where loop marked the while().
|
||||
} else {
|
||||
u |= digit << (4 * (5-i));
|
||||
}
|
||||
}
|
||||
|
||||
if (toTop) {
|
||||
continue;
|
||||
}
|
||||
|
||||
c = filteredCharAt(text, cursor);
|
||||
UChar d = filteredCharAt(text, cursor + 1);
|
||||
if (((c == 'U' || c == 'u') && d == '+')
|
||||
|| (c == '\\' && (d == 'U' || d == 'u'))) {
|
||||
|
||||
// At this point, we have a match; replace cursor..cursor+5
|
||||
// with u.
|
||||
text.handleReplaceBetween(cursor, cursor+6, UnicodeString((UChar)u));
|
||||
limit -= 5;
|
||||
maxCursor -= 5;
|
||||
|
||||
++cursor;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
}
|
||||
|
||||
offsets[LIMIT] = limit;
|
||||
offsets[CURSOR] = cursor;
|
||||
}
|
||||
|
||||
/**
 * Returns the character of <code>text</code> at index <code>i</code>,
 * or 0xFFFF when this transliterator has a filter and the filter's
 * isIn() rejects that character.
 */
UChar HexToUnicodeTransliterator::filteredCharAt(Replaceable& text, int32_t i) const
{
    const UnicodeFilter* flt = getFilter();
    const UChar ch = text.charAt(i);
    if (flt == 0) {
        return ch;
    }
    if (flt->isIn(ch)) {
        return ch;
    }
    return (UChar)0xFFFF;
}
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
int32_t HexToUnicodeTransliterator::getMaximumContextLength() const {
|
||||
return 0;
|
||||
}
|
95
icu4c/source/i18n/hextouni.h
Normal file
95
icu4c/source/i18n/hextouni.h
Normal file
|
@ -0,0 +1,95 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef HEXTOUNI_H
|
||||
#define HEXTOUNI_H
|
||||
|
||||
#include "translit.h"
|
||||
|
||||
/**
|
||||
* A transliterator that converts from hexadecimal Unicode
|
||||
* escape sequences to the characters they represent. For example, "U+0040"
|
||||
* and "\u0040" are both converted to '@'. It recognizes the
|
||||
* prefixes "U+", "u+", "\U", and "\u". Hex values may be
|
||||
* upper- or lowercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: hextouni.h,v $ $Revision: 1.1 $ $Date: 1999/11/20 00:36:43 $
|
||||
*/
|
||||
class U_I18N_API HexToUnicodeTransliterator : public Transliterator {
|
||||
|
||||
/**
|
||||
* ID for this transliterator.
|
||||
*/
|
||||
static const char* _ID;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~HexToUnicodeTransliterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
HexToUnicodeTransliterator(const HexToUnicodeTransliterator&);
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
HexToUnicodeTransliterator& operator=(const HexToUnicodeTransliterator&);
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* clone() const;
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
virtual int32_t transliterate(Replaceable &text,
|
||||
int32_t start, int32_t limit) const;
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
virtual void handleKeyboardTransliterate(Replaceable& text,
|
||||
int32_t offsets[3]) const;
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
virtual int32_t getMaximumContextLength() const;
|
||||
|
||||
private:
|
||||
|
||||
UChar filteredCharAt(Replaceable& text, int32_t i) const;
|
||||
};
|
||||
|
||||
inline HexToUnicodeTransliterator::~HexToUnicodeTransliterator() {}
|
||||
|
||||
#endif
|
|
@ -69,7 +69,7 @@ LINK32=link.exe
|
|||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /FR /YX /FD /GZ /c
|
||||
# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
|
||||
# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
|
@ -124,6 +124,10 @@ SOURCE=.\colrules.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\cpdtrans.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\datefmt.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -153,6 +157,10 @@ SOURCE=.\gregocal.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\hextouni.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\lnbkdat.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -173,6 +181,26 @@ SOURCE=.\ptnentry.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_data.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_pars.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_rule.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_set.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\simpletz.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -209,6 +237,10 @@ SOURCE=.\timezone.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\translit.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\txtbdat.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -241,10 +273,22 @@ SOURCE=.\unicdcm.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unifltlg.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unirange.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\uniset.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unitohex.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unum.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -404,6 +448,33 @@ SOURCE=.\colrules.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\cpdtrans.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\cpdtrans.h
|
||||
|
||||
"..\..\include\cpdtrans.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy cpdtrans.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\cpdtrans.h
|
||||
|
||||
"..\..\include\cpdtrans.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy cpdtrans.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\datefmt.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
@ -620,6 +691,33 @@ InputPath=.\gregocal.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\hextouni.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\hextouni.h
|
||||
|
||||
"..\..\include\hextouni.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy hextouni.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\hextouni.h
|
||||
|
||||
"..\..\include\hextouni.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy hextouni.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\mergecol.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -709,6 +807,57 @@ SOURCE=.\ptnentry.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi_bld.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\rbt.h
|
||||
|
||||
"..\..\include\rbt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy rbt.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\rbt.h
|
||||
|
||||
"..\..\include\rbt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy rbt.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_data.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_pars.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_rule.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbt_set.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\simpletz.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
@ -860,6 +1009,33 @@ InputPath=.\timezone.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\translit.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\translit.h
|
||||
|
||||
"..\..\include\translit.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy translit.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\translit.h
|
||||
|
||||
"..\..\include\translit.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy translit.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\txtbdat.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -1007,6 +1183,64 @@ SOURCE=.\unicdcm.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unifilt.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unifilt.h
|
||||
|
||||
"..\..\include\unifilt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy unifilt.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unifilt.h
|
||||
|
||||
"..\..\include\unifilt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy unifilt.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unifltlg.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unifltlg.h
|
||||
|
||||
"..\..\include\unifltlg.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy unifltlg.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unifltlg.h
|
||||
|
||||
"..\..\include\unifltlg.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy unifltlg.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unirange.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\uniset.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
@ -1034,6 +1268,33 @@ InputPath=.\uniset.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unitohex.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unitohex.h
|
||||
|
||||
"..\..\include\unitohex.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy unitohex.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unitohex.h
|
||||
|
||||
"..\..\include\unitohex.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy unitohex.h ..\..\include
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unum.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
|
227
icu4c/source/i18n/rbt.cpp
Normal file
227
icu4c/source/i18n/rbt.cpp
Normal file
|
@ -0,0 +1,227 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "rbt.h"
|
||||
#include "rbt_pars.h"
|
||||
#include "rbt_data.h"
|
||||
#include "rbt_rule.h"
|
||||
#include "rep.h"
|
||||
|
||||
void RuleBasedTransliterator::_construct(const UnicodeString& rules,
|
||||
Direction direction,
|
||||
UErrorCode& status) {
|
||||
data = 0;
|
||||
isDataOwned = TRUE;
|
||||
if (U_SUCCESS(status)) {
|
||||
data = TransliterationRuleParser::parse(rules, direction);
|
||||
if (data == 0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Constructs a transliterator around an existing rule data object that
 * is supplied by the caller.  The data object is recorded as NOT owned
 * by this instance.
 * @param ID identifier passed to the Transliterator base class
 * @param theData the rule data to use; only the pointer is stored
 * @param adoptedFilter filter passed to the Transliterator base class
 */
RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& ID,
                                 const TransliterationRuleData* theData,
                                 UnicodeFilter* adoptedFilter) :
    Transliterator(ID, adoptedFilter),
    // The data member is declared as a pointer to non-const, so a
    // pointer-to-const cannot initialize it implicitly; cast the
    // qualifier away explicitly.  This constructor itself never writes
    // through the pointer.
    data((TransliterationRuleData*) theData),
    isDataOwned(FALSE) {}
|
||||
|
||||
/**
|
||||
* Copy constructor. Since the data object is immutable, we can share
|
||||
* it with other objects -- no need to clone it.
|
||||
*/
|
||||
RuleBasedTransliterator::RuleBasedTransliterator(
|
||||
const RuleBasedTransliterator& other) :
|
||||
Transliterator(other), data(other.data) {}
|
||||
|
||||
/**
|
||||
* Destructor. We do NOT own the data object, so we do not delete it.
|
||||
*/
|
||||
RuleBasedTransliterator::~RuleBasedTransliterator() {}
|
||||
|
||||
/**
 * Implements the Transliterator clone API by copy-constructing a new
 * heap-allocated RuleBasedTransliterator from this object.
 * @return the newly allocated copy, typed as the base class
 */
Transliterator* // Covariant return NOT ALLOWED (for portability)
RuleBasedTransliterator::clone() const {
    Transliterator* duplicate = new RuleBasedTransliterator(*this);
    return duplicate;
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
void RuleBasedTransliterator::transliterate(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
UnicodeString& result) const {
|
||||
/* In the following loop there is a virtual buffer consisting of the
|
||||
* text transliterated so far followed by the untransliterated text. There is
|
||||
* also a cursor, which may be in the already transliterated buffer or just
|
||||
* before the untransliterated text.
|
||||
*
|
||||
* Example: rules 1. ab>x|y
|
||||
* 2. yc>z
|
||||
*
|
||||
* []|eabcd start - no match, copy e to tranlated buffer
|
||||
* [e]|abcd match rule 1 - copy output & adjust cursor
|
||||
* [ex|y]cd match rule 2 - copy output & adjust cursor
|
||||
* [exz]|d no match, copy d to transliterated buffer
|
||||
* [exzd]| done
|
||||
*
|
||||
* cursor: an index into the virtual buffer, 0..result.length()-1.
|
||||
* Matches take place at the cursor. If there is no match, the cursor
|
||||
* is advanced, and one character is moved from the source text to the
|
||||
* result buffer.
|
||||
*
|
||||
* start, limit: these designate the substring of the source text which
|
||||
* has not been processed yet. The range of offsets is start..limit-1.
|
||||
* At any moment the virtual buffer consists of result +
|
||||
* text.substring(start, limit).
|
||||
*/
|
||||
int32_t cursor = 0;
|
||||
result.remove();
|
||||
while (start < limit || cursor < result.length()) {
|
||||
TransliterationRule* r = data->ruleSet.findMatch(text, start, limit,
|
||||
result,
|
||||
cursor,
|
||||
*data,
|
||||
getFilter());
|
||||
if (r == 0) {
|
||||
if (cursor == result.length()) {
|
||||
result.append(text.charAt(start++));
|
||||
}
|
||||
++cursor;
|
||||
} else {
|
||||
// At this point we have a match of one or more
|
||||
// characters. The characters cover the range [cursor,
|
||||
// cursor + r->getKeyLength()) - a half-open interval.
|
||||
// The index values refer to a virtual buffer with result
|
||||
// holding [0, result.length()) and text holding
|
||||
// [result.length(),...).
|
||||
|
||||
// First, figure out the range of result being replaced.
|
||||
int32_t rfirst = cursor;
|
||||
int32_t rlimit = icu_min(result.length(),
|
||||
cursor + r->getKeyLength());
|
||||
|
||||
// resultPad is length of result to right of cursor; >= 0
|
||||
int32_t resultPad = result.length() - cursor;
|
||||
|
||||
if (r->getKeyLength() > resultPad) {
|
||||
start += r->getKeyLength() - resultPad;
|
||||
}
|
||||
|
||||
result.replaceBetween(rfirst, rlimit,
|
||||
r->getOutput());
|
||||
|
||||
cursor += r->getCursorPos();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return The new limit index
|
||||
*/
|
||||
int32_t RuleBasedTransliterator::transliterate(Replaceable& text,
|
||||
int32_t start,
|
||||
int32_t limit) const {
|
||||
/* When using Replaceable, the algorithm is simpler, since we don't have
|
||||
* two separate buffers. We keep start and limit fixed the entire time,
|
||||
* relative to the text -- limit may move numerically if text is
|
||||
* inserted or removed. The cursor moves from start to limit, with
|
||||
* replacements happening under it.
|
||||
*
|
||||
* Example: rules 1. ab>x|y
|
||||
* 2. yc>z
|
||||
*
|
||||
* |eabcd start - no match, advance cursor
|
||||
* e|abcd match rule 1 - change text & adjust cursor
|
||||
* ex|ycd match rule 2 - change text & adjust cursor
|
||||
* exz|d no match, advance cursor
|
||||
* exzd| done
|
||||
*/
|
||||
int32_t cursor = start;
|
||||
while (cursor < limit) {
|
||||
TransliterationRule* r =
|
||||
data->ruleSet.findMatch(text, start, limit,
|
||||
cursor, *data,
|
||||
getFilter());
|
||||
if (r == 0) {
|
||||
++cursor;
|
||||
} else {
|
||||
text.handleReplaceBetween(cursor, cursor + r->getKeyLength(),
|
||||
r->getOutput());
|
||||
limit += r->getOutput().length() - r->getKeyLength();
|
||||
cursor += r->getCursorPos();
|
||||
}
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
void
|
||||
RuleBasedTransliterator::handleKeyboardTransliterate(Replaceable& text,
|
||||
int32_t index[3]) const {
|
||||
int32_t start = index[START];
|
||||
int32_t limit = index[LIMIT];
|
||||
int32_t cursor = index[CURSOR];
|
||||
|
||||
bool_t isPartial;
|
||||
|
||||
while (cursor < limit) {
|
||||
TransliterationRule* r = data->ruleSet.findIncrementalMatch(
|
||||
text, start, limit, cursor,
|
||||
*data, isPartial,
|
||||
getFilter());
|
||||
/* If we match a rule then apply it by replacing the key
|
||||
* with the rule output and repositioning the cursor
|
||||
* appropriately. If we get a partial match, then we
|
||||
* can't do anything without more text; return with the
|
||||
* cursor at the current position. If we get null, then
|
||||
* there is no match at this position, and we can advance
|
||||
* the cursor.
|
||||
*/
|
||||
if (r == 0) {
|
||||
if (isPartial) {
|
||||
break;
|
||||
} else {
|
||||
++cursor;
|
||||
}
|
||||
} else {
|
||||
text.handleReplaceBetween(cursor, cursor + r->getKeyLength(),
|
||||
r->getOutput());
|
||||
limit += r->getOutput().length() - r->getKeyLength();
|
||||
cursor += r->getCursorPos();
|
||||
}
|
||||
}
|
||||
|
||||
index[LIMIT] = limit;
|
||||
index[CURSOR] = cursor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @return Maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
int32_t RuleBasedTransliterator::getMaximumContextLength() const {
|
||||
return data->ruleSet.getMaximumContextLength();
|
||||
}
|
377
icu4c/source/i18n/rbt.h
Normal file
377
icu4c/source/i18n/rbt.h
Normal file
|
@ -0,0 +1,377 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef RBT_H
|
||||
#define RBT_H
|
||||
|
||||
#include "translit.h"
|
||||
#include "uhash.h"
|
||||
#include "utypes.h"
|
||||
|
||||
class TransliterationRuleData;
|
||||
|
||||
/**
|
||||
* A transliterator that reads a set of rules in order to determine how to
|
||||
* perform translations. Rules are stored in resource bundles indexed by name.
|
||||
* Rules are separated by newline characters ('\n'); to include a literal
|
||||
* newline, prefix it with a backslash ('\\\n'). Whitespace is significant. If
|
||||
* the first character on a line is '#', the entire line is ignored as a
|
||||
* comment.
|
||||
*
|
||||
* <p>Each set of rules consists of two groups, one forward, and one reverse.
|
||||
* This is a convention that is not enforced; rules for one direction may be
|
||||
* omitted, with the result that translations in that direction will not modify
|
||||
* the source text.
|
||||
*
|
||||
* <p><b>Rule syntax</b>
|
||||
*
|
||||
* <p>Rule statements take one of the following forms:
|
||||
* <dl>
|
||||
* <dt><code>alefmadda=\u0622</code></dt>
|
||||
*
|
||||
* <dd><strong>Variable definition.</strong> The name on the left is
|
||||
* assigned the character or expression on the right. Names may not
|
||||
* contain any special characters (see list below). Duplicate names
|
||||
* (including duplicates of simple variables or category names)
|
||||
* cause an exception to be thrown. If the right hand side consists
|
||||
* of one character, then the variable stands for that character.
|
||||
* In this example, after this statement, instances of the left hand
|
||||
* name surrounded by braces, "<code>{alefmadda}</code>",
|
||||
* will be replaced by the Unicode character U+0622. If the
|
||||
* right hand side is longer than one character, then it is
|
||||
* interpreted as a character category expression; see below for
|
||||
* details.</dd>
|
||||
*
|
||||
* <dt><code>softvowel=[eiyEIY]</code></dt>
|
||||
*
|
||||
* <dd><strong>Category definition.</strong> The name on the left is assigned
|
||||
* to stand for a set of characters. The same rules for names of simple
|
||||
* variables apply. After this statement, the left hand variable will be
|
||||
* interpreted as indicating a set of characters in appropriate contexts. The
|
||||
* pattern syntax defining sets of characters is defined by {@link UnicodeSet}.
|
||||
* Examples of valid patterns are:<table>
|
||||
*
|
||||
* <tr valign=top>
|
||||
* <td nowrap><code>[abc]</code></td>
|
||||
* <td>The set containing the characters 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr valign=top>
|
||||
* <td nowrap><code>[^abc]</code></td>
|
||||
* <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr valign=top>
|
||||
* <td nowrap><code>[A-Z]</code></td>
|
||||
* <td>The set of all characters from 'A' to 'Z' in Unicode order.</td>
|
||||
* </tr>
|
||||
* <tr valign=top>
|
||||
* <td nowrap><code>[:Lu:]</code></td>
|
||||
* <td>The set of Unicode uppercase letters. See
|
||||
* <a href="http://www.unicode.org">www.unicode.org</a>
|
||||
* for a complete list of categories and their two-letter codes.</td>
|
||||
* </tr>
|
||||
* <tr valign=top>
|
||||
* <td nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
|
||||
* <td>The set of all characters <em>except</em> 'a' through 'z' and
|
||||
* uppercase or lowercase letters.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
*
|
||||
* See {@link UnicodeSet} for more documentation and examples.
|
||||
* </dd>
|
||||
*
|
||||
* <dt><code>ai>{alefmadda}</code></dt>
|
||||
*
|
||||
* <dd><strong>Forward translation rule.</strong> This rule states that the
|
||||
* string on the left will be changed to the string on the right when
|
||||
* performing forward transliteration.</dd>
|
||||
*
|
||||
* <dt><code>ai<{alefmadda}</code></dt>
|
||||
*
|
||||
* <dd><strong>Reverse translation rule.</strong> This rule states that the
|
||||
* string on the right will be changed to the string on the left when
|
||||
* performing reverse transliteration.</dd>
|
||||
*
|
||||
* </dl>
|
||||
*
|
||||
* <p>Forward and reverse translation rules consist of a <em>match
|
||||
* pattern</em> and an <em>output string</em>. The match pattern consists
|
||||
* of literal characters, optionally preceded by context, and optionally
|
||||
* followed by context. Context characters, like literal pattern characters,
|
||||
* must be matched in the text being transliterated. However, unlike literal
|
||||
* pattern characters, they are not replaced by the output text. For example,
|
||||
* the pattern "<code>[abc]def</code>" indicates the characters
|
||||
* "<code>def</code>" must be preceded by "<code>abc</code>" for a successful
|
||||
* match. If there is a successful match, "<code>def</code>" will be replaced,
|
||||
* but not "<code>abc</code>". The initial '<code>[</code>' is optional, so
|
||||
* "<code>abc]def</code>" is equivalent to "<code>[abc]def</code>". Another
|
||||
* example is "<code>123[456]</code>" (or "<code>123[456</code>") in which the
|
||||
* literal pattern "<code>123</code>" must be followed by "<code>456</code>".
|
||||
*
|
||||
* <p>The output string of a forward or reverse rule consists of characters to
|
||||
* replace the literal pattern characters. If the output string contains the
|
||||
* character '<code>|</code>', this is taken to indicate the location of the
|
||||
* <em>cursor</em> after replacement. The cursor is the point in the text
|
||||
* at which the next replacement, if any, will be applied.
|
||||
*
|
||||
* <p><b>Example</b>
|
||||
*
|
||||
* <p>The following example rules illustrate many of the features of the rule
|
||||
* language.
|
||||
* <table cellpadding="4">
|
||||
* <tr valign=top><td>Rule 1.</td>
|
||||
* <td nowrap><code>abc]def>x|y</code></td></tr>
|
||||
* <tr valign=top><td>Rule 2.</td>
|
||||
* <td nowrap><code>xyz>r</code></td></tr>
|
||||
* <tr valign=top><td>Rule 3.</td>
|
||||
* <td nowrap><code>yz>q</code></td></tr>
|
||||
* </table>
|
||||
*
|
||||
* <p>Applying these rules to the string "<code>adefabcdefz</code>" yields the
|
||||
* following results:
|
||||
*
|
||||
* <table cellpadding="4">
|
||||
* <tr valign=top><td nowrap><code>|adefabcdefz</code></td>
|
||||
* <td>Initial state, no rules match. Advance cursor.</td></tr>
|
||||
* <tr valign=top><td nowrap><code>a|defabcdefz</code></td>
|
||||
* <td>Still no match. Rule 1 does not match because the preceding
|
||||
* context is not present.</td></tr>
|
||||
* <tr valign=top><td nowrap><code>ad|efabcdefz</code></td>
|
||||
* <td>Still no match. Keep advancing until there is a match...</td></tr>
|
||||
* <tr valign=top><td nowrap><code>ade|fabcdefz</code></td>
|
||||
* <td>...</td></tr>
|
||||
* <tr valign=top><td nowrap><code>adef|abcdefz</code></td>
|
||||
* <td>...</td></tr>
|
||||
* <tr valign=top><td nowrap><code>adefa|bcdefz</code></td>
|
||||
* <td>...</td></tr>
|
||||
* <tr valign=top><td nowrap><code>adefab|cdefz</code></td>
|
||||
* <td>...</td></tr>
|
||||
* <tr valign=top><td nowrap><code>adefabc|defz</code></td>
|
||||
* <td>Rule 1 matches; replace "<code>def</code>" with "<code>xy</code>"
|
||||
* and back up the cursor to before the '<code>y</code>'.</td></tr>
|
||||
* <tr valign=top><td nowrap><code>adefabcx|yz</code></td>
|
||||
* <td>Although "<code>xyz</code>" is present, rule 2 does not match
|
||||
* because the cursor is before the '<code>y</code>', not before the
|
||||
* '<code>x</code>'. Rule 3 does match. Replace "<code>yz</code>" with
|
||||
* "<code>q</code>".</td></tr>
|
||||
* <tr valign=top><td nowrap><code>adefabcxq|</code></td>
|
||||
* <td>The cursor is at the end; transliteration is complete.</td></tr>
|
||||
* </table>
|
||||
*
|
||||
* <p>The order of rules is significant. If multiple rules may match at some
|
||||
* point, the first matching rule is applied.
|
||||
*
|
||||
* <p>Forward and reverse rules may have an empty output string. Otherwise, an
|
||||
* empty left or right hand side of any statement is a syntax error.
|
||||
*
|
||||
* <p>Single quotes are used to quote the special characters
|
||||
* <code>=><{}[]|</code>. To specify a single quote itself, inside or
|
||||
* outside of quotes, use two single quotes in a row. For example, the rule
|
||||
* "<code>'>'>o''clock</code>" changes the string "<code>></code>" to
|
||||
* the string "<code>o'clock</code>".
|
||||
*
|
||||
* <p><b>Notes</b>
|
||||
*
|
||||
* <p>While a RuleBasedTransliterator is being built, it checks that the rules
|
||||
* are added in proper order. For example, if the rule "a>x" is followed by the
|
||||
* rule "ab>y", then the second rule will throw an exception. The reason is
|
||||
* that the second rule can never be triggered, since the first rule always
|
||||
* matches anything it matches. In other words, the first rule <em>masks</em>
|
||||
* the second rule. There is a cost of O(n^2) to make this check; in real-world
|
||||
* tests it appears to approximately double build time.
|
||||
*
|
||||
* <p>One optimization that can be made is to add a pragma to the rule language,
|
||||
* "#pragma order", that turns off ordering checking. This pragma can then be
|
||||
* added to all of our resource-based rules (after we build these once and
|
||||
* determine that there are no ordering errors). I haven't made this change yet
|
||||
* in the interests of keeping the code from getting too byzantine.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class U_I18N_API RuleBasedTransliterator : public Transliterator {

    /**
     * The parsed rule data used by this transliterator.  The data
     * object is treated as immutable, so it can be shared freely with
     * other instances of RBT (the copy constructor shares the pointer).
     */
    TransliterationRuleData* data;

    /**
     * TRUE if the data object was obtained by this object from a rule
     * string (see _construct()); FALSE if it was supplied by the caller.
     * The name matches the one used by the member definitions in
     * rbt.cpp.
     */
    bool_t isDataOwned;

public:

    /**
     * Direction constant passed to constructor to specify whether forward
     * or reverse rules are parsed.  The other rules are ignored.
     */
    enum Direction {
        /**
         * Direction constant passed to constructor to create a
         * transliterator using the forward rules.
         */
        FORWARD,

        /**
         * Direction constant passed to constructor to create a
         * transliterator using the reverse rules.
         */
        REVERSE
    };

    /**
     * Constructs a new transliterator from the given rules.
     * @param ID identifier passed to the Transliterator base class
     * @param rules rules, separated by '\n'
     * @param direction either FORWARD or REVERSE.
     * @param adoptedFilter filter passed to the Transliterator base class
     * @param status error code; set to U_ILLEGAL_ARGUMENT_ERROR if the
     * rule parser returns no data for the given rules
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            Direction direction,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status);

    /**
     * Convenience constructor with no filter.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            Direction direction,
                            UErrorCode& status);

    /**
     * Convenience constructor with no filter and FORWARD direction.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            UErrorCode& status);

    /**
     * Convenience constructor with FORWARD direction.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status);

    /**
     * Constructs a transliterator that uses an existing rule data
     * object supplied by the caller.  The new object does not take
     * ownership of the data.
     * @param ID identifier passed to the Transliterator base class
     * @param theData the rule data to use
     * @param adoptedFilter filter passed to the Transliterator base
     * class; defaults to none
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const TransliterationRuleData* theData,
                            UnicodeFilter* adoptedFilter = 0);

    /**
     * Copy constructor.  The copy shares the data object of the
     * original.
     */
    RuleBasedTransliterator(const RuleBasedTransliterator&);

    virtual ~RuleBasedTransliterator();

    /**
     * Implement Transliterator API.
     */
    Transliterator* clone() const;

    /**
     * Transliterates a segment of a string.  <code>Transliterator</code> API.
     * @param text the string to be transliterated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param result buffer to receive the transliterated text; previous
     * contents are discarded
     */
    virtual void transliterate(const UnicodeString& text,
                               int32_t start, int32_t limit,
                               UnicodeString& result) const;

    /**
     * Transliterates a segment of a string.  <code>Transliterator</code> API.
     * @param text the string to be transliterated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @return The new limit index
     */
    virtual int32_t transliterate(Replaceable& text,
                                  int32_t start, int32_t limit) const;

    /**
     * Implements {@link Transliterator#handleKeyboardTransliterate}.
     */
    virtual void handleKeyboardTransliterate(Replaceable& text,
                                             int32_t index[3]) const;

    /**
     * Returns the length of the longest context required by this
     * transliterator.  This is <em>preceding</em> context.
     * @return Maximum number of preceding context characters this
     * transliterator needs to examine
     */
    virtual int32_t getMaximumContextLength() const;

private:

    /**
     * Shared initialization for the constructors that take a rule
     * string: parses the rules in the given direction and stores the
     * resulting data object.
     */
    void _construct(const UnicodeString& rules,
                    Direction direction,
                    UErrorCode& status);
};
|
||||
|
||||
/**
|
||||
* Constructs a new transliterator from the given rules.
|
||||
* @param rules rules, separated by '\n'
|
||||
* @param direction either FORWARD or REVERSE.
|
||||
* @exception IllegalArgumentException if rules are malformed
|
||||
* or direction is invalid.
|
||||
*/
|
||||
inline RuleBasedTransliterator::RuleBasedTransliterator(
|
||||
const UnicodeString& ID,
|
||||
const UnicodeString& rules,
|
||||
Direction direction,
|
||||
UnicodeFilter* adoptedFilter,
|
||||
UErrorCode& status) :
|
||||
Transliterator(ID, adoptedFilter) {
|
||||
_construct(rules, direction, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Covenience constructor with no filter.
|
||||
*/
|
||||
inline RuleBasedTransliterator::RuleBasedTransliterator(
|
||||
const UnicodeString& ID,
|
||||
const UnicodeString& rules,
|
||||
Direction direction,
|
||||
UErrorCode& status) :
|
||||
Transliterator(ID, 0) {
|
||||
_construct(rules, direction, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Covenience constructor with no filter and FORWARD direction.
|
||||
*/
|
||||
inline RuleBasedTransliterator::RuleBasedTransliterator(
|
||||
const UnicodeString& ID,
|
||||
const UnicodeString& rules,
|
||||
UErrorCode& status) :
|
||||
Transliterator(ID, 0) {
|
||||
_construct(rules, FORWARD, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Covenience constructor with FORWARD direction.
|
||||
*/
|
||||
inline RuleBasedTransliterator::RuleBasedTransliterator(
|
||||
const UnicodeString& ID,
|
||||
const UnicodeString& rules,
|
||||
UnicodeFilter* adoptedFilter,
|
||||
UErrorCode& status) :
|
||||
Transliterator(ID, adoptedFilter) {
|
||||
_construct(rules, FORWARD, status);
|
||||
}
|
||||
|
||||
#endif
|
83
icu4c/source/i18n/rbt_data.cpp
Normal file
83
icu4c/source/i18n/rbt_data.cpp
Normal file
|
@ -0,0 +1,83 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "rbt_data.h"
|
||||
#include "uhash.h"
|
||||
#include "unistr.h"
|
||||
|
||||
/**
 * Creates an empty rule data object.  Both hashtable pointers start out
 * null; they are only opened when status does not already report an
 * error.
 * @param status checked on entry and passed to both uhash_open() calls
 */
TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
    variableNames(0), setVariables(0) {

    if (!U_FAILURE(status)) {
        // Variable table is opened with the UString hash function ...
        variableNames = uhash_open(uhash_hashUString, &status);
        // ... the set table with no hash function argument.
        setVariables = uhash_open(0, &status);
    }
}
|
||||
|
||||
/**
 * Closes whichever of the two hashtables were opened.  A pointer that
 * is still null (the constructor returned early) is skipped.
 */
TransliterationRuleData::~TransliterationRuleData() {
    if (variableNames) {
        uhash_close(variableNames);
    }

    if (setVariables) {
        uhash_close(setVariables);
    }
}
|
||||
|
||||
void
|
||||
TransliterationRuleData::defineVariable(const UnicodeString& name,
|
||||
UChar value,
|
||||
UErrorCode& status) {
|
||||
uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF,
|
||||
(void*) value,
|
||||
&status);
|
||||
}
|
||||
|
||||
void
|
||||
TransliterationRuleData::defineVariable(const UnicodeString& name,
|
||||
UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (adoptedSet == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF,
|
||||
(void*) standIn,
|
||||
&status);
|
||||
uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
|
||||
adoptedSet,
|
||||
&status);
|
||||
}
|
||||
|
||||
/**
 * Looks up the character stored for a variable name.
 * @param name the variable name; its masked hash code is the lookup key
 * @param status if it already reports an error, 0 is returned at once;
 * set to U_ILLEGAL_ARGUMENT_ERROR when the table yields a null value
 * @return the stored value converted back to a UChar (0 when the lookup
 * yields null)
 */
UChar
TransliterationRuleData::lookupVariable(const UnicodeString& name,
                                        UErrorCode& status) const {
    if (U_FAILURE(status)) {
        return 0;
    }

    void* stored = uhash_get(variableNames, name.hashCode() & 0x7FFFFFFF);
    if (stored == 0) {
        // Nothing is recorded under this key.
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }

    // The character was stored directly in the pointer-sized value slot.
    return (UChar) (int32_t) stored;
}
|
||||
|
||||
/**
 * Looks up the set stored in setVariables for a stand-in character.
 * @param standIn the stand-in character used as the lookup key
 * @return the value found in setVariables, cast to UnicodeSet*; whatever
 * uhash_get() returns for a missing key is passed through
 */
UnicodeSet*
TransliterationRuleData::lookupSet(UChar standIn) const {
    void* stored = uhash_get(setVariables, (int32_t) (standIn & 0x7FFFFFFF));
    return (UnicodeSet*) stored;
}
|
||||
|
||||
/**
 * Reports whether variableNames holds a non-null value under the masked
 * hash code of the given name.
 * @param name the variable name to test
 * @return true if the lookup yields a non-null value
 */
bool_t
TransliterationRuleData::isVariableDefined(const UnicodeString& name) const {
    void* stored = uhash_get(variableNames, name.hashCode() & 0x7FFFFFFF);
    return stored != 0;
}
|
85
icu4c/source/i18n/rbt_data.h
Normal file
85
icu4c/source/i18n/rbt_data.h
Normal file
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef RBT_DATA_H
|
||||
#define RBT_DATA_H
|
||||
|
||||
#include "rbt_set.h"
|
||||
|
||||
class UnicodeString;
|
||||
class UnicodeSet;
|
||||
struct UHashtable;
|
||||
|
||||
/**
|
||||
* The rule data for a RuleBasedTransliterators. RBT objects hold
|
||||
* a const pointer to a TRD object that they do not own. TRD objects
|
||||
* are essentially the parsed rules in compact, usable form. The
|
||||
* TRD objects themselves are held for the life of the process in
|
||||
* a static cache owned by Transliterator.
|
||||
*/
|
||||
class TransliterationRuleData {
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Rule table. May be empty.
|
||||
*
|
||||
* PUBLIC DATA MEMBER for internal use by RBT
|
||||
*/
|
||||
TransliterationRuleSet ruleSet;
|
||||
|
||||
/**
|
||||
* Map variable name (UnicodeString) to variable (Character).
|
||||
* A variable name may correspond to a single literal
|
||||
* character, in which case the character is stored in this
|
||||
* hash. It may also correspond to a UnicodeSet, in which
|
||||
* case a character is again stored in this hash, but the
|
||||
* character is a stand-in: it is a key for a secondary lookup
|
||||
* in data.setVariables. The stand-in also represents the
|
||||
* UnicodeSet in the stored rules.
|
||||
*
|
||||
* PUBLIC DATA MEMBER for internal use by RBT
|
||||
*/
|
||||
UHashtable* variableNames;
|
||||
|
||||
/**
|
||||
* Map category variable (UChar) to set (UnicodeSet).
|
||||
* Variables that correspond to a set of characters are mapped
|
||||
* from variable name to a stand-in character in
|
||||
* data.variableNames. The stand-in then serves as a key in
|
||||
* this hash to lookup the actual UnicodeSet object. In
|
||||
* addition, the stand-in is stored in the rule text to
|
||||
* represent the set of characters.
|
||||
*
|
||||
* PUBLIC DATA MEMBER for internal use by RBT
|
||||
*/
|
||||
UHashtable* setVariables;
|
||||
|
||||
TransliterationRuleData(UErrorCode& status);
|
||||
|
||||
~TransliterationRuleData();
|
||||
|
||||
void defineVariable(const UnicodeString& name,
|
||||
UChar value,
|
||||
UErrorCode& status);
|
||||
|
||||
void defineVariable(const UnicodeString& name,
|
||||
UChar standIn,
|
||||
UnicodeSet* adoptedSet,
|
||||
UErrorCode& status);
|
||||
|
||||
UChar lookupVariable(const UnicodeString& name,
|
||||
UErrorCode& status) const;
|
||||
|
||||
UnicodeSet* lookupSet(UChar standIn) const;
|
||||
|
||||
bool_t isVariableDefined(const UnicodeString& name) const;
|
||||
};
|
||||
|
||||
#endif
|
640
icu4c/source/i18n/rbt_pars.cpp
Normal file
640
icu4c/source/i18n/rbt_pars.cpp
Normal file
|
@ -0,0 +1,640 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "rbt_pars.h"
|
||||
#include "rbt.h"
|
||||
#include "rbt_rule.h"
|
||||
#include "unirange.h"
|
||||
#include "rbt_data.h"
|
||||
#include "uniset.h"
|
||||
|
||||
// Operators
|
||||
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
|
||||
const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
|
||||
const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
|
||||
const char* TransliterationRuleParser::OPERATORS = "=><";
|
||||
|
||||
// Other special characters
|
||||
const UChar TransliterationRuleParser::QUOTE = '\'';
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
|
||||
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
|
||||
const UChar TransliterationRuleParser::CONTEXT_OPEN = '[';
|
||||
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ']';
|
||||
const UChar TransliterationRuleParser::CURSOR_POS = '|';
|
||||
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';
|
||||
|
||||
|
||||
/**
|
||||
* Specials must be quoted in rules to be used as literals.
|
||||
* Specials may not occur in variable names.
|
||||
*
|
||||
* This string is a superset of OPERATORS.
|
||||
*/
|
||||
const char* TransliterationRuleParser::SPECIALS = "'{}[]|#=><";
|
||||
|
||||
/**
|
||||
* Specials that must be quoted in variable definitions.
|
||||
*/
|
||||
const char* TransliterationRuleParser::DEF_SPECIALS = "'{}";
|
||||
|
||||
/**
 * Parse a complete rule string for one direction.
 * @param rules the rule text
 * @param direction the direction whose rules are to be kept
 * @return the newly created rule data, or 0 if parsing failed; on
 * failure any partially built data has already been deleted.
 */
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
                                 RuleBasedTransliterator::Direction direction) {
    TransliterationRuleParser parser(rules, direction);
    parser.parseRules();

    TransliterationRuleData* result = parser.data;
    if (U_FAILURE(parser.status)) {
        // Do not hand out half-parsed data
        delete result;
        result = 0;
        parser.data = 0;
    }
    return result;
}
|
||||
|
||||
/**
|
||||
* @param rules list of rules, separated by newline characters
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
TransliterationRuleParser::TransliterationRuleParser(
|
||||
const UnicodeString& theRules,
|
||||
RuleBasedTransliterator::Direction theDirection) :
|
||||
rules(theRules), direction(theDirection), data(0) {}
|
||||
|
||||
/**
|
||||
* Parse the given string as a sequence of rules, separated by newline
|
||||
* characters ('\n'), and cause this object to implement those rules. Any
|
||||
* previous rules are discarded. Typically this method is called exactly
|
||||
* once, during construction.
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
void TransliterationRuleParser::parseRules() {
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
delete data;
|
||||
data = new TransliterationRuleData(status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
determineVariableRange();
|
||||
|
||||
int32_t n = rules.length();
|
||||
int32_t i = 0;
|
||||
while (i<n && U_SUCCESS(status)) {
|
||||
int32_t limit = rules.indexOf('\n', i);
|
||||
|
||||
// Recognize "\\\n" as an escaped "\n"
|
||||
while (limit>0 && rules.charAt(limit-1) == '\\') {
|
||||
limit = rules.indexOf('\n', limit+1);
|
||||
}
|
||||
|
||||
if (limit == -1) {
|
||||
limit = n;
|
||||
}
|
||||
// Skip over empty lines and line starting with #
|
||||
if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
|
||||
applyRule(i, limit);
|
||||
}
|
||||
i = limit + 1;
|
||||
}
|
||||
|
||||
data->ruleSet.freeze();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the given substring as a rule, and append it to the rules currently
|
||||
* represented in this object.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
void TransliterationRuleParser::applyRule(int32_t start, int32_t limit) {
|
||||
/* General description of parsing: Initially, rules contain two types of
|
||||
* quoted characters. First, there are variable references, such as
|
||||
* "{alpha}". Second, there are quotes, such as "'<'" or "''". One of
|
||||
* the first steps in parsing a rule is to resolve such quoted matter.
|
||||
* Quotes are removed early, leaving unquoted literal matter. Variable
|
||||
* references are resolved and replaced by single characters. In some
|
||||
* instances these characters represent themselves; in others, they
|
||||
* stand for categories of characters. Character categories are either
|
||||
* predefined (e.g., "{Lu}"), or are defined by the user using a
|
||||
* statement (e.g., "vowels:aeiouAEIOU").
|
||||
*
|
||||
* Another early step in parsing is to split each rule into component
|
||||
* pieces. These pieces are, for every rule, a left-hand side, a right-
|
||||
* hand side, and an operator. The left- and right-hand sides may not
|
||||
* be empty, except for the output patterns of forward and reverse
|
||||
* rules. In addition to this partitioning, the match patterns of
|
||||
* forward and reverse rules must be partitioned into antecontext,
|
||||
* postcontext, and literal pattern, where the context portions may or
|
||||
* may not be present. Finally, output patterns must have the cursor
|
||||
* indicator '|' detected and removed, with its position recorded.
|
||||
*
|
||||
* Quote removal, variable resolution, and sub-pattern splitting must
|
||||
* all happen at once. This is due chiefly to the quoting mechanism,
|
||||
* which allows special characters to appear at arbitrary positions in
|
||||
* the final unquoted text. (For this reason, alteration of the rule
|
||||
* language is somewhat clumsy; it entails reassessment and revision of
|
||||
* the parsing methods as a whole.)
|
||||
*
|
||||
* After this processing of rules is complete, the final end products
|
||||
* are unquoted pieces of text of various types, and an integer cursor
|
||||
* position, if one is specified. These processed raw materials are now
|
||||
* easy to deal with; other classes such as UnicodeSet and
|
||||
* TransliterationRule need know nothing of quoting or variables.
|
||||
*/
|
||||
UnicodeString left;
|
||||
UnicodeString right;
|
||||
UnicodeString anteContext;
|
||||
UnicodeString postContext;
|
||||
int32_t cursorPos;
|
||||
|
||||
UChar op = parseRule(start, limit, left, right,
|
||||
anteContext, postContext, cursorPos);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
switch (op) {
|
||||
case VARIABLE_DEF_OP:
|
||||
applyVariableDef(left, right);
|
||||
break;
|
||||
case FORWARD_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::FORWARD) {
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
left, right,
|
||||
anteContext, postContext,
|
||||
cursorPos, status),
|
||||
status);
|
||||
} // otherwise ignore the rule; it's not the direction we want
|
||||
break;
|
||||
case REVERSE_RULE_OP:
|
||||
if (direction == RuleBasedTransliterator::REVERSE) {
|
||||
data->ruleSet.addRule(new TransliterationRule(
|
||||
right, left,
|
||||
anteContext, postContext,
|
||||
cursorPos, status),
|
||||
status);
|
||||
} // otherwise ignore the rule; it's not the direction we want
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a variable definition.
|
||||
* @param name the name of the variable. It must not already be defined.
|
||||
* @param pattern the value of the variable. It may be a single character
|
||||
* or a pattern describing a character set.
|
||||
* @exception IllegalArgumentException if there is a syntax error
|
||||
*/
|
||||
void TransliterationRuleParser::applyVariableDef(const UnicodeString& name,
|
||||
const UnicodeString& pattern) {
|
||||
validateVariableName(name);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (data->isVariableDefined(name)) {
|
||||
// throw new IllegalArgumentException("Duplicate variable definition: "
|
||||
// + name + '=' + pattern);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
//! if (UnicodeSet.getCategoryID(name) >= 0) {
|
||||
//! throw new IllegalArgumentException("Reserved variable name: "
|
||||
//! + name);
|
||||
//! }
|
||||
if (pattern.length() < 1) {
|
||||
// throw new IllegalArgumentException("Variable definition missing: "
|
||||
// + name);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if (pattern.length() == 1) {
|
||||
// Got a single character variable definition
|
||||
//$ data->variableNames.put(name, new Character(pattern.charAt(0)));
|
||||
data->defineVariable(name, pattern.charAt(0), status);
|
||||
} else {
|
||||
// Got more than one character; parse it as a category
|
||||
if (variableNext >= variableLimit) {
|
||||
//$ throw new RuntimeException("Private use variables exhausted");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
//$ Character c = new Character(variableNext++);
|
||||
//$ data->variableNames.put(name, c);
|
||||
//$ data->setVariables.put(c, new UnicodeSet(pattern));
|
||||
data->defineVariable(name, variableNext++,
|
||||
new UnicodeSet(pattern, status),
|
||||
status);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a rule, parses it into three pieces: The left side, the right side,
|
||||
* and the operator. Returns the operator. Quotes and variable references
|
||||
* are resolved; the otuput text in all <code>StringBuffer</code> parameters
|
||||
* is literal text. This method delegates to other parsing methods to
|
||||
* handle the match pattern, output pattern, and other sub-patterns in the
|
||||
* rule.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param left left side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param right right side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param anteContext the preceding context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param postContext the following context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param cursorPos if there is a cursor in the output pattern, its
|
||||
* offset is stored in <code>cursorPos</code>
|
||||
* @return The operator character, one of the characters in OPERATORS.
|
||||
*/
|
||||
UChar TransliterationRuleParser::parseRule(int32_t start, int32_t limit,
|
||||
UnicodeString& left,
|
||||
UnicodeString& right,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext,
|
||||
int32_t& cursorPos) {
|
||||
/* Parse the rule into three pieces -- left, operator, and right,
|
||||
* parsing out quotes. The result is that left and right will have
|
||||
* unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted
|
||||
* operators throw an exception. Two quotes inside or outside
|
||||
* quotes indicates a quote literal. E.g., "o''clock" -> "o'clock".
|
||||
*/
|
||||
int32_t i = quotedIndexOf(rules, start, limit, OPERATORS);
|
||||
if (i < 0) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Syntax error: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
UChar c = rules.charAt(i);
|
||||
switch (c) {
|
||||
case FORWARD_RULE_OP:
|
||||
if (i == start) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty left side: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
parseMatchPattern(start, i, left, anteContext, postContext);
|
||||
if (i != (limit-1)) {
|
||||
parseOutputPattern(i+1, limit, right, cursorPos);
|
||||
}
|
||||
break;
|
||||
case REVERSE_RULE_OP:
|
||||
if (i == (limit-1)) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty right side: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if (i != start) {
|
||||
parseOutputPattern(start, i, left, cursorPos);
|
||||
}
|
||||
parseMatchPattern(i+1, limit, right, anteContext, postContext);
|
||||
break;
|
||||
default:
|
||||
if (i == start || i == (limit-1)) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty left or right side: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
parseSubPattern(start, i, left);
|
||||
parseDefPattern(i+1, limit, right);
|
||||
break;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the match pattern of a forward or reverse rule. Given the raw
|
||||
* match pattern, return the match text and the context on both sides, if
|
||||
* any. Resolves all quotes and variables.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the key to be matched will be appended to this buffer
|
||||
* @param anteContext the preceding context, if any, will be appended
|
||||
* to this buffer.
|
||||
* @param postContext the following context, if any, will be appended
|
||||
* to this buffer.
|
||||
*/
|
||||
void TransliterationRuleParser::parseMatchPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext) {
|
||||
if (start >= limit) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Empty expression in rule: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
//$ if (anteContext != 0) {
|
||||
// Ignore optional opening and closing context characters
|
||||
if (rules.charAt(start) == CONTEXT_OPEN) {
|
||||
++start;
|
||||
}
|
||||
if (rules.charAt(limit-1) == CONTEXT_CLOSE) {
|
||||
--limit;
|
||||
}
|
||||
// The four possibilities are:
|
||||
// key
|
||||
// anteContext]key
|
||||
// anteContext]key[postContext
|
||||
// key[postContext
|
||||
int32_t ante = quotedIndexOf(rules, start, limit, CONTEXT_CLOSE);
|
||||
int32_t post = quotedIndexOf(rules, start, limit, CONTEXT_OPEN);
|
||||
if (ante >= 0 && post >= 0 && ante > post) {
|
||||
//$ throw new IllegalArgumentException(
|
||||
//$ "Syntax error in context specifier: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (ante >= 0) {
|
||||
parseSubPattern(start, ante, anteContext);
|
||||
start = ante+1;
|
||||
}
|
||||
if (post >= 0) {
|
||||
parseSubPattern(post+1, limit, postContext);
|
||||
limit = post;
|
||||
}
|
||||
//$ }
|
||||
parseSubPattern(start, limit, text);
|
||||
}
|
||||
|
||||
/**
 * Parse a sub-pattern in which no cursor is allowed and in which every
 * character of SPECIALS has to be quoted.
 * @param start the beginning index, inclusive
 * @param limit the ending index, exclusive
 * @param text the resolved text will be appended to this buffer
 */
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
                                                UnicodeString& text) {
    int32_t* const noCursor = 0;
    parseSubPattern(start, limit, text, noCursor, SPECIALS);
}
|
||||
|
||||
/**
|
||||
* Parse a variable definition sub pattern. This kind of sub
|
||||
* pattern differs in the set of characters that are considered
|
||||
* special. In particular, the '[' and ']' characters are not
|
||||
* special, since these are used in UnicodeSet patterns.
|
||||
*/
|
||||
void TransliterationRuleParser::parseDefPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text) {
|
||||
parseSubPattern(start, limit, text, 0, DEF_SPECIALS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the output pattern of a forward or reverse rule. Given the
|
||||
* output pattern, return the output text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then cursorPos
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
*/
|
||||
void TransliterationRuleParser::parseOutputPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t& cursorPos) {
|
||||
parseSubPattern(start, limit, text, &cursorPos, SPECIALS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a sub-pattern of a rule. Return the text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then cursorPos
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
* @param specials characters that must be quoted; typically either
|
||||
* SPECIALS or DEF_SPECIALS.
|
||||
*/
|
||||
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t* cursorPos,
|
||||
const UnicodeString& specials) {
|
||||
bool_t inQuote = FALSE;
|
||||
|
||||
if (start >= limit) {
|
||||
//$ throw new IllegalArgumentException("Empty expression in rule");
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
if (cursorPos != 0) {
|
||||
*cursorPos = -1;
|
||||
}
|
||||
for (int32_t i=start; i<limit; ++i) {
|
||||
UChar c = rules.charAt(i);
|
||||
if (c == QUOTE) {
|
||||
// Check for double quote
|
||||
if ((i+1) < limit
|
||||
&& rules.charAt(i+1) == QUOTE) {
|
||||
text.append(QUOTE);
|
||||
++i; // Skip over both quotes
|
||||
} else {
|
||||
inQuote = !inQuote;
|
||||
}
|
||||
} else if (inQuote) {
|
||||
text.append(c);
|
||||
} else if (c == VARIABLE_REF_OPEN) {
|
||||
++i;
|
||||
int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, i);
|
||||
if (i == j || j < 0) { // empty or unterminated
|
||||
//$ throw new IllegalArgumentException("Illegal variable reference: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
UnicodeString name;
|
||||
rules.extractBetween(i, j, name);
|
||||
validateVariableName(name);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
UChar ch = data->lookupVariable(name, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
text.append(ch);
|
||||
i = j;
|
||||
} else if (c == CURSOR_POS && cursorPos != 0) {
|
||||
if (*cursorPos >= 0) {
|
||||
//$ throw new IllegalArgumentException("Multiple cursors: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
*cursorPos = text.length();
|
||||
} else if (specials.indexOf(c) >= 0) {
|
||||
//$ throw new IllegalArgumentException("Unquoted special character: "
|
||||
//$ + rules.substring(start, limit));
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
} else {
|
||||
text.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TransliterationRuleParser::validateVariableName(const UnicodeString& name) {
|
||||
if (indexOf(name, SPECIALS) >= 0) {
|
||||
//throw new IllegalArgumentException(
|
||||
// "Special character in variable name: "
|
||||
// + name);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the single character value of the given variable name. Defined
|
||||
* names are recognized.
|
||||
*
|
||||
* NO LONGER SUPPORTED:
|
||||
* If a Unicode category name is given, a standard character variable
|
||||
* in the range firstCategoryVariable to lastCategoryVariable is returned,
|
||||
* with value firstCategoryVariable + n, where n is the category
|
||||
* number.
|
||||
* @exception IllegalArgumentException if the name is unknown.
|
||||
*/
|
||||
//$ UChar TransliterationRuleParser::getVariableDef(const UnicodeString& name) {
|
||||
//$ UChar ch = data->lookupVariable(name, status);
|
||||
//$ //! if (ch == null) {
|
||||
//$ //! int id = UnicodeSet.getCategoryID(name);
|
||||
//$ //! if (id >= 0) {
|
||||
//$ //! ch = new Character((char) (firstCategoryVariable + id));
|
||||
//$ //! data->variableNames.put(name, ch);
|
||||
//$ //! data->setVariables.put(ch, new UnicodeSet(id));
|
||||
//$ //! }
|
||||
//$ //! }
|
||||
//$ if (ch == 0) {
|
||||
//$ throw new IllegalArgumentException("Undefined variable: "
|
||||
//$ + name);
|
||||
//$ }
|
||||
//$ return ch;
|
||||
//$ }
|
||||
|
||||
/**
|
||||
* Determines what part of the private use region of Unicode we can use for
|
||||
* variable stand-ins. The correct way to do this is as follows: Parse each
|
||||
* rule, and for forward and reverse rules, take the FROM expression, and
|
||||
* make a hash of all characters used. The TO expression should be ignored.
|
||||
* When done, everything not in the hash is available for use. In practice,
|
||||
* this method may employ some other algorithm for improved speed.
|
||||
*/
|
||||
void TransliterationRuleParser::determineVariableRange() {
|
||||
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
|
||||
|
||||
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
|
||||
|
||||
variableNext = variableLimit = (UChar) 0;
|
||||
|
||||
if (r != 0) {
|
||||
variableNext = r->start;
|
||||
variableLimit = (UChar) (r->start + r->length);
|
||||
delete r;
|
||||
}
|
||||
|
||||
if (variableNext >= variableLimit) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set, ignoring quoted text.
|
||||
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
||||
* found by a search for "h". Unlike String.indexOf(), this method searches
|
||||
* not for a single character, but for any character of the string
|
||||
* <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #indexOf
|
||||
*/
|
||||
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& setOfChars) {
|
||||
for (int32_t i=start; i<limit; ++i) {
|
||||
UChar c = text.charAt(i);
|
||||
if (c == QUOTE) {
|
||||
while (++i < limit
|
||||
&& text.charAt(i) != QUOTE) {}
|
||||
} else if (setOfChars.indexOf(c) >= 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& setOfChars) {
|
||||
for (int32_t i=start; i<limit; ++i) {
|
||||
if (setOfChars.indexOf(text.charAt(i)) >= 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
|
||||
const UnicodeString& setOfChars) {
|
||||
return indexOf(text, 0, text.length(), setOfChars);
|
||||
}
|
302
icu4c/source/i18n/rbt_pars.h
Normal file
302
icu4c/source/i18n/rbt_pars.h
Normal file
|
@ -0,0 +1,302 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef RBT_PARS_H
|
||||
#define RBT_PARS_H
|
||||
|
||||
#include "rbt.h"
|
||||
|
||||
class TransliterationRuleData;
|
||||
|
||||
class TransliterationRuleParser {
|
||||
|
||||
/**
|
||||
* This is a reference to external data we don't own. This works because
|
||||
* we only hold this for the duration of the call to parse().
|
||||
*/
|
||||
const UnicodeString& rules;
|
||||
|
||||
RuleBasedTransliterator::Direction direction;
|
||||
|
||||
TransliterationRuleData* data;
|
||||
|
||||
/**
|
||||
* We use a single error code during parsing. Rather than pass it
|
||||
* through each API, we keep it here.
|
||||
*/
|
||||
UErrorCode status;
|
||||
|
||||
/**
|
||||
* The next available stand-in for variables. This starts at some point in
|
||||
* the private use area (discovered dynamically) and increments up toward
|
||||
* <code>variableLimit</code>. At any point during parsing, available
|
||||
* variables are <code>variableNext..variableLimit-1</code>.
|
||||
*/
|
||||
UChar variableNext;
|
||||
|
||||
/**
|
||||
* The last available stand-in for variables. This is discovered
|
||||
* dynamically. At any point during parsing, available variables are
|
||||
* <code>variableNext..variableLimit-1</code>.
|
||||
*/
|
||||
UChar variableLimit;
|
||||
|
||||
// Operators
|
||||
static const UChar VARIABLE_DEF_OP;
|
||||
static const UChar FORWARD_RULE_OP;
|
||||
static const UChar REVERSE_RULE_OP;
|
||||
static const char* OPERATORS;
|
||||
|
||||
|
||||
// Other special characters
|
||||
static const UChar QUOTE;
|
||||
static const UChar VARIABLE_REF_OPEN;
|
||||
static const UChar VARIABLE_REF_CLOSE;
|
||||
static const UChar CONTEXT_OPEN;
|
||||
static const UChar CONTEXT_CLOSE;
|
||||
static const UChar CURSOR_POS;
|
||||
static const UChar RULE_COMMENT_CHAR;
|
||||
|
||||
|
||||
/**
|
||||
* Specials must be quoted in rules to be used as literals.
|
||||
* Specials may not occur in variable names.
|
||||
*/
|
||||
static const char* SPECIALS;
|
||||
|
||||
/**
|
||||
* Specials that must be quoted in variable definitions.
|
||||
*/
|
||||
static const char* DEF_SPECIALS;
|
||||
|
||||
public:
|
||||
|
||||
static TransliterationRuleData*
|
||||
parse(const UnicodeString& rules,
|
||||
RuleBasedTransliterator::Direction direction);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* @param rules list of rules, separated by newline characters
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
TransliterationRuleParser(const UnicodeString& rules,
|
||||
RuleBasedTransliterator::Direction direction);
|
||||
|
||||
/**
|
||||
* Parse the given string as a sequence of rules, separated by newline
|
||||
* characters ('\n'), and cause this object to implement those rules. Any
|
||||
* previous rules are discarded. Typically this method is called exactly
|
||||
* once, during construction.
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
void parseRules();
|
||||
|
||||
/**
|
||||
* Parse the given substring as a rule, and append it to the rules currently
|
||||
* represented in this object.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @exception IllegalArgumentException if there is a syntax error in the
|
||||
* rules
|
||||
*/
|
||||
void applyRule(int32_t start, int32_t limit);
|
||||
|
||||
/**
|
||||
* Add a variable definition.
|
||||
* @param name the name of the variable. It must not already be defined.
|
||||
* @param pattern the value of the variable. It may be a single character
|
||||
* or a pattern describing a character set.
|
||||
* @exception IllegalArgumentException if there is a syntax error
|
||||
*/
|
||||
void applyVariableDef(const UnicodeString& name,
|
||||
const UnicodeString& pattern);
|
||||
|
||||
/**
|
||||
* Given a rule, parses it into three pieces: The left side, the right side,
|
||||
* and the operator. Returns the operator. Quotes and variable references
|
||||
* are resolved; the output text in all <code>UnicodeString</code> parameters
|
||||
* is literal text. This method delegates to other parsing methods to
|
||||
* handle the match pattern, output pattern, and other sub-patterns in the
|
||||
* rule.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param left left side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param right right side of rule is appended to this buffer
|
||||
* with the quotes removed and variables resolved
|
||||
* @param anteContext the preceding context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param postContext the following context of the match pattern,
|
||||
* if there is one, is appended to this buffer
|
||||
* @param cursorPos if there is a cursor in the output pattern, its
|
||||
* offset is stored in <code>cursorPos</code>
|
||||
* @return The operator character, one of the characters in OPERATORS.
|
||||
*/
|
||||
UChar parseRule(int32_t start, int32_t limit,
|
||||
UnicodeString& left, UnicodeString& right,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext,
|
||||
int32_t& cursorPos);
|
||||
|
||||
/**
|
||||
* Parses the match pattern of a forward or reverse rule. Given the raw
|
||||
* match pattern, return the match text and the context on both sides, if
|
||||
* any. Resolves all quotes and variables.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the key to be matched will be appended to this buffer
|
||||
* @param anteContext the preceding context, if any, will be appended
|
||||
* to this buffer.
|
||||
* @param postContext the following context, if any, will be appended
|
||||
* to this buffer.
|
||||
*/
|
||||
void parseMatchPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
UnicodeString& anteContext,
|
||||
UnicodeString& postContext);
|
||||
|
||||
void parseSubPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text);
|
||||
|
||||
/**
|
||||
* Parse a variable definition sub pattern. This kind of sub
|
||||
* pattern differs in the set of characters that are considered
|
||||
* special. In particular, the '[' and ']' characters are not
|
||||
* special, since these are used in UnicodeSet patterns.
|
||||
*/
|
||||
void parseDefPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text);
|
||||
|
||||
/**
|
||||
* Parses the output pattern of a forward or reverse rule. Given the
|
||||
* output pattern, return the output text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then cursorPos[0]
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
*/
|
||||
void parseOutputPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t& cursorPos);
|
||||
|
||||
/**
|
||||
* Parses a sub-pattern of a rule. Return the text and the position of the cursor,
|
||||
* if any. Resolves all quotes and variables.
|
||||
* @param rules the string to be parsed
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= rules.length()</code>.
|
||||
* @param text the output text will be appended to this buffer
|
||||
* @param cursorPos if this parameter is not null, then *cursorPos
|
||||
* will be set to the cursor position, or -1 if there is none. If this
|
||||
* parameter is null, then cursors will be disallowed.
|
||||
* @param specials characters that must be quoted; typically either
|
||||
* SPECIALS or DEF_SPECIALS.
|
||||
*/
|
||||
void parseSubPattern(int32_t start, int32_t limit,
|
||||
UnicodeString& text,
|
||||
int32_t* cursorPos,
|
||||
const UnicodeString& specials);
|
||||
|
||||
void validateVariableName(const UnicodeString& name);
|
||||
|
||||
/**
|
||||
* Returns the single character value of the given variable name. Defined
|
||||
* names are recognized.
|
||||
*
|
||||
* NO LONGER SUPPORTED:
|
||||
* If a Unicode category name is given, a standard character variable
|
||||
* in the range firstCategoryVariable to lastCategoryVariable is returned,
|
||||
* with value firstCategoryVariable + n, where n is the category
|
||||
* number.
|
||||
* @exception IllegalArgumentException if the name is unknown.
|
||||
*/
|
||||
//$ Character getVariableDef(const UnicodeString& name);
|
||||
|
||||
/**
|
||||
* Determines what part of the private use region of Unicode we can use for
|
||||
* variable stand-ins. The correct way to do this is as follows: Parse each
|
||||
* rule, and for forward and reverse rules, take the FROM expression, and
|
||||
* make a hash of all characters used. The TO expression should be ignored.
|
||||
* When done, everything not in the hash is available for use. In practice,
|
||||
* this method may employ some other algorithm for improved speed.
|
||||
*/
|
||||
void determineVariableRange();
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set, ignoring quoted text.
|
||||
* For example, in the string "abc'hide'h", the 'h' in "hide" will not be
|
||||
* found by a search for "h". Unlike String.indexOf(), this method searches
|
||||
* not for a single character, but for any character of the string
|
||||
* <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #indexOf
|
||||
*/
|
||||
static int32_t quotedIndexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& setOfChars);
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
static int32_t indexOf(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& setOfChars);
|
||||
|
||||
/**
|
||||
* Returns the index of the first character in a set. Unlike
|
||||
* String.indexOf(), this method searches not for a single character, but
|
||||
* for any character of the string <code>setOfChars</code>.
|
||||
* @param text text to be searched
|
||||
* @param setOfChars string with one or more distinct characters
|
||||
* @return Offset of the first character in <code>setOfChars</code>
|
||||
* found, or -1 if not found.
|
||||
* @see #quotedIndexOf
|
||||
*/
|
||||
static int32_t indexOf(const UnicodeString& text,
|
||||
const UnicodeString& setOfChars);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
436
icu4c/source/i18n/rbt_rule.cpp
Normal file
436
icu4c/source/i18n/rbt_rule.cpp
Normal file
|
@ -0,0 +1,436 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "rbt_rule.h"
|
||||
#include "rep.h"
|
||||
#include "rbt_data.h"
|
||||
#include "unifilt.h"
|
||||
#include "uniset.h"
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given key, output text, and other
|
||||
* attributes. Zero, one, or two context strings may be specified. A
|
||||
* cursor position may be specified for the output text.
|
||||
* @param key the string to match
|
||||
* @param output the string to produce when the <code>key</code> is seen
|
||||
* @param anteContext if not null and not empty, then it must be matched
|
||||
* before the <code>key</code>
|
||||
* @param postContext if not null and not empty, then it must be matched
|
||||
* after the <code>key</code>
|
||||
* @param cursorPos a position for the cursor after the <code>output</code>
|
||||
* is emitted. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
* @exception IllegalArgumentException if the cursor position is out of
|
||||
* range.
|
||||
*/
|
||||
TransliterationRule::TransliterationRule(const UnicodeString& theKey,
|
||||
const UnicodeString& theOutput,
|
||||
const UnicodeString& theAnteContext,
|
||||
const UnicodeString& thePostContext,
|
||||
int32_t theCursorPos,
|
||||
UErrorCode &status) :
|
||||
key(theKey), output(theOutput),
|
||||
anteContext(theAnteContext),
|
||||
postContext(thePostContext),
|
||||
cursorPos(theCursorPos),
|
||||
maskKey(0) {
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (cursorPos < 0) {
|
||||
cursorPos = output.length();
|
||||
}
|
||||
if (cursorPos > output.length()) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
/* The mask key is needed when we are adding individual rules to a rule
|
||||
* set, for performance. Here are the numbers: Without mask key, 13.0
|
||||
* seconds. With mask key, 6.2 seconds. However, once the rules have
|
||||
* been added to the set, then they can be discarded to free up space.
|
||||
* This is what the freeze() method does. After freeze() has been
|
||||
* called, the method masks() must NOT be called.
|
||||
*/
|
||||
maskKey = new UnicodeString(key);
|
||||
if (maskKey == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
maskKey->append(postContext);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Destructor.  Releases the mask key if it is still allocated; the
 * pointer is null when freeze() has already released it.
 */
TransliterationRule::~TransliterationRule()
{
    delete maskKey;
}
|
||||
|
||||
/**
|
||||
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
|
||||
* @return the length of the match key.
|
||||
*/
|
||||
int32_t TransliterationRule::getKeyLength() const {
|
||||
return key.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the key.
|
||||
* @return the match key.
|
||||
*/
|
||||
const UnicodeString& TransliterationRule::getKey() const {
|
||||
return key;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the output string.
|
||||
* @return the output string.
|
||||
*/
|
||||
const UnicodeString& TransliterationRule::getOutput() const {
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the position of the cursor within the output string.
|
||||
* @return a value from 0 to <code>getOutput().length()</code>, inclusive.
|
||||
*/
|
||||
int32_t TransliterationRule::getCursorPos() const {
|
||||
return cursorPos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the preceding context length. This method is needed to
|
||||
* support the <code>Transliterator</code> method
|
||||
* <code>getMaximumContextLength()</code>.
|
||||
*/
|
||||
int32_t TransliterationRule::getAnteContextLength() const {
|
||||
return anteContext.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
|
||||
* "[c]a>x" masks "[dc]a>y".
|
||||
*
|
||||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
bool_t TransliterationRule::masks(const TransliterationRule& r2) const {
|
||||
/* There are three cases of masking. In each instance, rule1
|
||||
* masks rule2.
|
||||
*
|
||||
* 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
|
||||
*
|
||||
* 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*
|
||||
* 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*/
|
||||
|
||||
/* LIMITATION of the current mask algorithm: Some rule
|
||||
* maskings are currently not detected. For example,
|
||||
* "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking,
|
||||
* we need a subset operator on UnicodeSet objects, which we
|
||||
* currently do not have. This can be added later.
|
||||
*/
|
||||
return ((maskKey->length() < r2.maskKey->length() &&
|
||||
r2.maskKey->startsWith(*maskKey)) ||
|
||||
(r2.anteContext.length() != 0 && *maskKey == *r2.maskKey &&
|
||||
((anteContext.length() == 0) ||
|
||||
(anteContext.length() < r2.anteContext.length() &&
|
||||
r2.anteContext.endsWith(anteContext)))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, masks() must NOT be called.
|
||||
* If it is called, an exception will be thrown.
|
||||
*/
|
||||
void TransliterationRule::freeze() {
|
||||
delete maskKey;
|
||||
maskKey = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text. The text being matched
|
||||
* occupies a virtual buffer consisting of the contents of
|
||||
* <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
bool_t TransliterationRule::matches(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& result,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
return
|
||||
(anteContext.length() == 0
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor - anteContext.length(),
|
||||
anteContext, data, filter)) &&
|
||||
regionMatches(text, start, limit, result, cursor,
|
||||
key, data, filter) &&
|
||||
(postContext.length() == 0
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor + key.length(),
|
||||
postContext, data, filter));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
bool_t TransliterationRule::matches(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
return
|
||||
(anteContext.length() == 0
|
||||
|| regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, data, filter)) &&
|
||||
regionMatches(text, start, limit, cursor,
|
||||
key, data, filter) &&
|
||||
(postContext.length() == 0
|
||||
|| regionMatches(text, start, limit, cursor + key.length(),
|
||||
postContext, data, filter));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full match. A
|
||||
* mismatch means at least one character of the text does not match the
|
||||
* context or key. A partial match means some context and key characters
|
||||
* match, but the text is not long enough to match all of them. A full
|
||||
* match means all context and key characters match.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
|
||||
* <code>FULL_MATCH</code>.
|
||||
* @see #MISMATCH
|
||||
* @see #PARTIAL_MATCH
|
||||
* @see #FULL_MATCH
|
||||
*/
|
||||
int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
if (anteContext.length() != 0
|
||||
&& !regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, data, filter)) {
|
||||
return MISMATCH;
|
||||
}
|
||||
int32_t len = getRegionMatchLength(text, start, limit, cursor,
|
||||
key, data, filter);
|
||||
if (len < 0) {
|
||||
return MISMATCH;
|
||||
}
|
||||
if (len < key.length()) {
|
||||
return PARTIAL_MATCH;
|
||||
}
|
||||
if (postContext.length() == 0) {
|
||||
return FULL_MATCH;
|
||||
}
|
||||
len = getRegionMatchLength(text, start, limit,
|
||||
cursor + key.length(),
|
||||
postContext, data, filter);
|
||||
return (len < 0) ? MISMATCH
|
||||
: ((len == postContext.length()) ? FULL_MATCH
|
||||
: PARTIAL_MATCH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor. As in
|
||||
* <code>matches()</code>, the text being matched occupies a virtual buffer
|
||||
* consisting of the contents of <code>result</code> concatenated to a
|
||||
* substring of <code>text</code>. See <code>matches()</code> for details.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param templ the text to match against. All characters must match.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
bool_t TransliterationRule::regionMatches(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& result,
|
||||
int32_t cursor,
|
||||
const UnicodeString& templ,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
int32_t rlen = result.length();
|
||||
if (cursor < 0
|
||||
|| (cursor + templ.length()) > (rlen + limit - start)) {
|
||||
return FALSE;
|
||||
}
|
||||
for (int32_t i=0; i<templ.length(); ++i, ++cursor) {
|
||||
if (!charMatches(templ.charAt(i),
|
||||
cursor < rlen ? result.charAt(cursor)
|
||||
: text.charAt(cursor - rlen + start),
|
||||
data, filter)) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param templ the text to match against. All characters must match.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
bool_t TransliterationRule::regionMatches(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const UnicodeString& templ,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
if (cursor < start
|
||||
|| (cursor + templ.length()) > limit) {
|
||||
return FALSE;
|
||||
}
|
||||
for (int32_t i=0; i<templ.length(); ++i, ++cursor) {
|
||||
if (!charMatches(templ.charAt(i), text.charAt(cursor),
|
||||
data, filter)) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters of the text that match this rule. If
|
||||
* there is a mismatch, return -1. If the text is not long enough to match
|
||||
* any characters, return 0.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param templ the text to match against. All characters must match.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return -1 if there is a mismatch, 0 if the text is not long enough to
|
||||
* match any characters, otherwise the number of characters of text that
|
||||
* match this rule.
|
||||
*/
|
||||
int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
|
||||
int32_t start,
|
||||
int32_t limit, int32_t cursor,
|
||||
const UnicodeString& templ,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
if (cursor < start) {
|
||||
return -1;
|
||||
}
|
||||
int32_t i;
|
||||
for (i=0; i<templ.length() && cursor<limit; ++i, ++cursor) {
|
||||
if (!charMatches(templ.charAt(i), text.charAt(cursor),
|
||||
data, filter)) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the given key matches the given text. This method
|
||||
* accounts for the fact that the key character may represent a character
|
||||
* set. Note that the key and text characters may not be interchanged
|
||||
* without altering the results.
|
||||
* @param keyChar a character in the match key
|
||||
* @param textChar a character in the text being transliterated
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
UnicodeSet* set = 0;
|
||||
return (filter == 0 || filter->isIn(textChar)) &&
|
||||
((set = data.lookupSet(keyChar)) == 0) ?
|
||||
keyChar == textChar : set->contains(textChar);
|
||||
}
|
380
icu4c/source/i18n/rbt_rule.h
Normal file
380
icu4c/source/i18n/rbt_rule.h
Normal file
|
@ -0,0 +1,380 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef RBT_RULE_H
|
||||
#define RBT_RULE_H
|
||||
|
||||
#include "unistr.h"
|
||||
|
||||
class Replaceable;
|
||||
class TransliterationRuleData;
|
||||
class UnicodeFilter;
|
||||
|
||||
/**
|
||||
* A transliteration rule used by
|
||||
* <code>RuleBasedTransliterator</code>.
|
||||
* <code>TransliterationRule</code> is an immutable object.
|
||||
*
|
||||
* <p>A rule consists of an input pattern and an output string. When
|
||||
* the input pattern is matched, the output string is emitted. The
|
||||
* input pattern consists of zero or more characters which are matched
|
||||
* exactly (the key) and optional context. Context must match if it
|
||||
* is specified. Context may be specified before the key, after the
|
||||
* key, or both. The key, preceding context, and following context
|
||||
* may contain variables. Variables represent a set of Unicode
|
||||
* characters, such as the letters <i>a</i> through <i>z</i>.
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class TransliterationRule {
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constants returned by <code>getMatchDegree()</code> indicating
|
||||
* the degree of match between the text and this rule.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
enum {
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code>
|
||||
* indicating a mismatch between the text and this rule. One
|
||||
* or more characters of the context or key do not match the
|
||||
* text.
|
||||
*/
|
||||
MISMATCH,
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code>
|
||||
* indicating a partial match between the text and this rule.
|
||||
* All characters of the text match the corresponding context
|
||||
* or key, but more characters are required for a complete
|
||||
* match. There are some key or context characters at the end
|
||||
* of the pattern that remain unmatched because the text isn't
|
||||
* long enough.
|
||||
*/
|
||||
PARTIAL_MATCH,
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code>
|
||||
* indicating a complete match between the text and this rule.
|
||||
* The text matches all context and key characters.
|
||||
*/
|
||||
FULL_MATCH
|
||||
};
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* The string that must be matched.
|
||||
*/
|
||||
UnicodeString key;
|
||||
|
||||
/**
|
||||
* The string that is emitted if the key, anteContext, and postContext
|
||||
* are matched.
|
||||
*/
|
||||
UnicodeString output;
|
||||
|
||||
/**
|
||||
* The string that must match before the key. If empty, then
|
||||
* there is no matching requirement before the key.
|
||||
*/
|
||||
UnicodeString anteContext;
|
||||
|
||||
/**
|
||||
* The string that must match after the key. If empty, then there
|
||||
* is no matching requirement after the key.
|
||||
*/
|
||||
UnicodeString postContext;
|
||||
|
||||
/**
|
||||
* The position of the cursor after emitting the output string, from 0 to
|
||||
* output.length(). For most rules with no special cursor specification,
|
||||
* the cursorPos is output.length().
|
||||
*/
|
||||
int32_t cursorPos;
|
||||
|
||||
/**
|
||||
* A string used to implement masks().
|
||||
* @see #freeze
|
||||
*/
|
||||
UnicodeString* maskKey;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given key, output text, and other
|
||||
* attributes. Zero, one, or two context strings may be specified. A
|
||||
* cursor position may be specified for the output text.
|
||||
* @param key the string to match
|
||||
* @param output the string to produce when the <code>key</code> is seen
|
||||
* @param anteContext if not null and not empty, then it must be matched
|
||||
* before the <code>key</code>
|
||||
* @param postContext if not null and not empty, then it must be matched
|
||||
* after the <code>key</code>
|
||||
* @param cursorPos a position for the cursor after the <code>output</code>
|
||||
* is emitted. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
* @exception IllegalArgumentException if the cursor position is out of
|
||||
* range.
|
||||
*/
|
||||
TransliterationRule(const UnicodeString& theKey,
|
||||
const UnicodeString& theOutput,
|
||||
const UnicodeString& theAnteContext,
|
||||
const UnicodeString& thePostContext,
|
||||
int32_t theCursorPos,
|
||||
UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~TransliterationRule();
|
||||
|
||||
/**
|
||||
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
|
||||
* @return the length of the match key.
|
||||
*/
|
||||
virtual int32_t getKeyLength() const;
|
||||
|
||||
/**
|
||||
* Return the key.
|
||||
* @return the match key.
|
||||
*/
|
||||
virtual const UnicodeString& getKey() const;
|
||||
|
||||
/**
|
||||
* Return the output string.
|
||||
* @return the output string.
|
||||
*/
|
||||
virtual const UnicodeString& getOutput() const;
|
||||
|
||||
/**
|
||||
* Return the position of the cursor within the output string.
|
||||
* @return a value from 0 to <code>getOutput().length()</code>, inclusive.
|
||||
*/
|
||||
virtual int32_t getCursorPos() const;
|
||||
|
||||
/**
|
||||
* Return the preceding context length. This method is needed to
|
||||
* support the <code>Transliterator</code> method
|
||||
* <code>getMaximumContextLength()</code>.
|
||||
*/
|
||||
virtual int32_t getAnteContextLength() const;
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
|
||||
* "[c]a>x" masks "[dc]a>y".
|
||||
*
|
||||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
virtual bool_t masks(const TransliterationRule& r2) const;
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, masks() must NOT be called.
|
||||
* If it is called, an exception will be thrown.
|
||||
*/
|
||||
virtual void freeze();
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text. The text being matched
|
||||
* occupies a virtual buffer consisting of the contents of
|
||||
* <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
virtual bool_t matches(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& result,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
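* @param data a dictionary of variables mapping <code>Character</code>
* to <code>UnicodeSet</code>
* @param filter the filter. Any character for which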
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
virtual bool_t matches(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Return the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full match. A
|
||||
* mismatch means at least one character of the text does not match the
|
||||
* context or key. A partial match means some context and key characters
|
||||
* match, but the text is not long enough to match all of them. A full
|
||||
* match means all context and key characters match.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
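* @param data a dictionary of variables mapping <code>Character</code>
* to <code>UnicodeSet</code>
* @param filter the filter. Any character for which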
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
|
||||
* <code>FULL_MATCH</code>.
|
||||
* @see #MISMATCH
|
||||
* @see #PARTIAL_MATCH
|
||||
* @see #FULL_MATCH
|
||||
*/
|
||||
virtual int32_t getMatchDegree(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor. As in
|
||||
* <code>matches()</code>, the text being matched occupies a virtual buffer
|
||||
* consisting of the contents of <code>result</code> concatenated to a
|
||||
* substring of <code>text</code>. See <code>matches()</code> for details.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param templ the text to match against. All characters must match.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
virtual bool_t regionMatches(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& result,
|
||||
int32_t cursor,
|
||||
const UnicodeString& templ,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param templ the text to match against. All characters must match.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
virtual bool_t regionMatches(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const UnicodeString& templ,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Return the number of characters of the text that match this rule. If
|
||||
* there is a mismatch, return -1. If the text is not long enough to match
|
||||
* any characters, return 0.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param templ the text to match against. All characters must match.
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return -1 if there is a mismatch, 0 if the text is not long enough to
|
||||
* match any characters, otherwise the number of characters of text that
|
||||
* match this rule.
|
||||
*/
|
||||
virtual int32_t getRegionMatchLength(const Replaceable& text, int32_t start,
|
||||
int32_t limit, int32_t cursor,
|
||||
const UnicodeString& templ,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Return true if the given key matches the given text. This method
|
||||
* accounts for the fact that the key character may represent a character
|
||||
* set. Note that the key and text characters may not be interchanged
|
||||
* without altering the results.
|
||||
* @param keyChar a character in the match key
|
||||
* @param textChar a character in the text being transliterated
|
||||
* @param data a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
virtual bool_t charMatches(UChar keyChar, UChar textChar,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
};
|
||||
|
||||
#endif
|
217
icu4c/source/i18n/rbt_set.cpp
Normal file
217
icu4c/source/i18n/rbt_set.cpp
Normal file
|
@ -0,0 +1,217 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "rbt_set.h"
|
||||
#include "rbt_rule.h"
|
||||
#include "unistr.h"
|
||||
|
||||
/* Note: There was an old implementation that indexed by first letter of
|
||||
* key. Problem with this is that key may not have a meaningful first
|
||||
* letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
|
||||
* rules whose initial key letter is a category variable. However, the
|
||||
* problem is that they must be kept in order with respect to other rules.
|
||||
* One solution -- add a sequence number to each rule. Do the usual
|
||||
* first-letter lookup, and also a lookup from the spare bin with rules like
|
||||
* {Lu}>*. Take the lower sequence number. This seems complex and not
|
||||
* worth the trouble, but we may revisit this later. For documentation (or
|
||||
* possible resurrection) the old code is included below, commented out
|
||||
* with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
|
||||
* implementation, <code>rules</code> is a Hashtable, not a Vector.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Construct a new empty rule set.
|
||||
*/
|
||||
TransliterationRuleSet::TransliterationRuleSet() {
|
||||
maxContextLength = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the maximum context length.
|
||||
* @return the length of the longest preceding context.
|
||||
*/
|
||||
int32_t TransliterationRuleSet::getMaximumContextLength() const {
|
||||
return maxContextLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a rule to this set. Rules are added in order, and order is
|
||||
* significant.
|
||||
*
|
||||
* <p>Once freeze() is called, this method must not be called.
|
||||
* @param rule the rule to add
|
||||
*/
|
||||
void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
|
||||
UErrorCode& status) {
|
||||
|
||||
// Build time, no checking : 3562 ms
|
||||
// Build time, with checking: 6234 ms
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete adoptedRule;
|
||||
return;
|
||||
}
|
||||
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* r = (TransliterationRule*) rules.elementAt(i);
|
||||
if (r->masks(*adoptedRule)) {
|
||||
//throw new IllegalArgumentException("Rule " + rule +
|
||||
// " must precede " + r);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
delete adoptedRule;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
rules.addElement(adoptedRule);
|
||||
int32_t len;
|
||||
if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
|
||||
maxContextLength = len;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
*/
|
||||
void TransliterationRuleSet::freeze() {
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
((TransliterationRule*) rules.elementAt(i))->freeze();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text. The
|
||||
* text being matched occupies a virtual buffer consisting of the contents
|
||||
* of <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
TransliterationRule*
|
||||
TransliterationRuleSet::findMatch(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& result,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* rule =
|
||||
(TransliterationRule*) rules.elementAt(i);
|
||||
if (rule->matches(text, start, limit, result,
|
||||
cursor, data, filter)) {
|
||||
return rule;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
TransliterationRule*
|
||||
TransliterationRuleSet::findMatch(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const {
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* rule =
|
||||
(TransliterationRule*) rules.elementAt(i);
|
||||
if (rule->matches(text, start, limit, cursor,
|
||||
data, filter)) {
|
||||
return rule;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* Unlike <code>findMatch()</code>, this method does an incremental match.
|
||||
* An incremental match requires that there be no partial matches that might
|
||||
* pre-empt the full match that is found. If there are partial matches,
|
||||
* then null is returned. A non-null result indicates that a full match has
|
||||
* been found, and that it cannot be pre-empted by a partial match
|
||||
* regardless of what additional text is added to the translation buffer.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param partial output parameter. <code>partial[0]</code> is set to
|
||||
* true if a partial match is returned.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found, or if the text buffer
|
||||
* does not have enough text yet to unambiguously match a rule.
|
||||
*/
|
||||
TransliterationRule*
|
||||
TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
|
||||
int32_t start,
|
||||
int32_t limit, int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
bool_t& isPartial,
|
||||
const UnicodeFilter* filter) const {
|
||||
isPartial = FALSE;
|
||||
for (int32_t i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule* rule =
|
||||
(TransliterationRule*) rules.elementAt(i);
|
||||
int32_t match = rule->getMatchDegree(text, start, limit, cursor,
|
||||
data, filter);
|
||||
switch (match) {
|
||||
case TransliterationRule::FULL_MATCH:
|
||||
return rule;
|
||||
case TransliterationRule::PARTIAL_MATCH:
|
||||
isPartial = TRUE;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
164
icu4c/source/i18n/rbt_set.h
Normal file
164
icu4c/source/i18n/rbt_set.h
Normal file
|
@ -0,0 +1,164 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef RBT_SET_H
|
||||
#define RBT_SET_H
|
||||
|
||||
#include "uvector.h"
|
||||
|
||||
class Replaceable;
|
||||
class TransliterationRule;
|
||||
class TransliterationRuleData;
|
||||
class UnicodeFilter;
|
||||
class UnicodeString;
|
||||
|
||||
/**
|
||||
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
|
||||
* the transliteration in one direction from one set of characters or short
|
||||
* strings to another. A <code>RuleBasedTransliterator</code> consists of up to
|
||||
* two such sets, one for the forward direction, and one for the reverse.
|
||||
*
|
||||
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of
|
||||
* finding a matching rule at a given point in the text. This is accomplished
|
||||
* by the <code>findMatch()</code> method.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class TransliterationRuleSet {
|
||||
/**
|
||||
* Vector of rules, in the order added.
|
||||
*/
|
||||
UVector rules;
|
||||
|
||||
/**
|
||||
* Length of the longest preceding context
|
||||
*/
|
||||
int32_t maxContextLength;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Construct a new empty rule set.
|
||||
*/
|
||||
TransliterationRuleSet();
|
||||
|
||||
/**
|
||||
* Return the maximum context length.
|
||||
* @return the length of the longest preceding context.
|
||||
*/
|
||||
virtual int32_t getMaximumContextLength() const;
|
||||
|
||||
/**
|
||||
* Add a rule to this set. Rules are added in order, and order is
|
||||
* significant.
|
||||
*
|
||||
* <p>Once freeze() is called, this method must not be called.
|
||||
* @param rule the rule to add
|
||||
*/
|
||||
virtual void addRule(TransliterationRule* adoptedRule,
|
||||
UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
*/
|
||||
virtual void freeze();
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text. The
|
||||
* text being matched occupies a virtual buffer consisting of the contents
|
||||
* of <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result tranlated text
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
virtual TransliterationRule* findMatch(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
const UnicodeString& result,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
virtual TransliterationRule* findMatch(const Replaceable& text,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
const UnicodeFilter* filter) const;
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* Unlike <code>findMatch()</code>, this method does an incremental match.
|
||||
* An incremental match requires that there be no partial matches that might
|
||||
* pre-empt the full match that is found. If there are partial matches,
|
||||
* then null is returned. A non-null result indicates that a full match has
|
||||
* been found, and that it cannot be pre-empted by a partial match
|
||||
* regardless of what additional text is added to the translation buffer.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param data a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param partial output parameter. <code>partial[0]</code> is set to
|
||||
* true if a partial match is returned.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found, or if the text buffer
|
||||
* does not have enough text yet to unambiguously match a rule.
|
||||
*/
|
||||
virtual TransliterationRule* findIncrementalMatch(const Replaceable& text,
|
||||
int32_t start,
|
||||
int32_t limit, int32_t cursor,
|
||||
const TransliterationRuleData& data,
|
||||
bool_t& isPartial,
|
||||
const UnicodeFilter* filter) const;
|
||||
};
|
||||
#endif
|
879
icu4c/source/i18n/translit.cpp
Normal file
879
icu4c/source/i18n/translit.cpp
Normal file
|
@ -0,0 +1,879 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "translit.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "hextouni.h"
|
||||
#include "locid.h"
|
||||
#include "msgfmt.h"
|
||||
#include "mutex.h"
|
||||
#include "rbt_data.h"
|
||||
#include "rbt_pars.h"
|
||||
#include "rep.h"
|
||||
#include "resbund.h"
|
||||
#include "uhash.h"
|
||||
#include "unifilt.h"
|
||||
#include "unitohex.h"
|
||||
|
||||
/**
|
||||
* Dictionary of known transliterators. Keys are <code>String</code>
|
||||
* names, values are one of the following:
|
||||
*
|
||||
* <ul><li><code>Transliterator</code> objects
|
||||
*
|
||||
* <li><code>RULE_BASED_PLACEHOLDER</code>, in which case the ID
|
||||
* will have its first '-' removed and be appended to
|
||||
* RB_RULE_BASED_PREFIX to form a resource bundle name from which
|
||||
* the RB_RULE key is looked up to obtain the rule.
|
||||
*
|
||||
* <li><code>REVERSE_RULE_BASED_PLACEHOLDER</code>. Like
|
||||
* <code>RULE_BASED_PLACEHOLDER</code>, except the entity names in
|
||||
* the ID are reversed, and the argument
|
||||
* RuleBasedTransliterator.REVERSE is passed to the
|
||||
* RuleBasedTransliterator constructor.
|
||||
* </ul>
|
||||
*/
|
||||
UHashtable* Transliterator::cache = 0;
|
||||
|
||||
/**
|
||||
* The mutex controlling access to the cache.
|
||||
*/
|
||||
UMTX Transliterator::cacheMutex = NULL;
|
||||
|
||||
/**
|
||||
* When set to TRUE, the cache has been initialized. Any code must
|
||||
* check this boolean before accessing the cache, and if the boolean
|
||||
* is FALSE, it must call initializeCache(). We do this form of lazy
|
||||
* evaluation for two reasons: (1) so we don't initialize if we don't
|
||||
* have to (i.e., if no one is using Transliterator, but has included
|
||||
* the code as part of a shared library), and (2) to avoid static
|
||||
* initialization problems.
|
||||
*/
|
||||
bool_t Transliterator::cacheInitialized = FALSE;
|
||||
|
||||
/**
|
||||
* Prefix for resource bundle key for the display name for a
|
||||
* transliterator. The ID is appended to this to form the key.
|
||||
* The resource bundle value should be a String.
|
||||
*/
|
||||
const char* Transliterator::RB_DISPLAY_NAME_PREFIX = "T:";
|
||||
|
||||
/**
|
||||
* Resource bundle key for display name pattern.
|
||||
* The resource bundle value should be a String forming a
|
||||
* MessageFormat pattern, e.g.:
|
||||
* "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
|
||||
*/
|
||||
const char* Transliterator::RB_DISPLAY_NAME_PATTERN =
|
||||
"TransliteratorNamePattern";
|
||||
|
||||
/**
|
||||
* Resource bundle key for the list of RuleBasedTransliterator IDs.
|
||||
* The resource bundle value should be a String[] with each element
|
||||
* being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
|
||||
* to obtain the class name in which the RB_RULE key will be sought.
|
||||
*/
|
||||
const char* Transliterator::RB_RULE_BASED_IDS =
|
||||
"RuleBasedTransliteratorIDs";
|
||||
|
||||
/**
|
||||
* Resource bundle key for the RuleBasedTransliterator rule.
|
||||
*/
|
||||
const char* Transliterator::RB_RULE = "Rule";
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
* @param theID the string identifier for this transliterator
|
||||
* @param theFilter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
Transliterator::Transliterator(const UnicodeString& theID,
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
ID(theID), filter(adoptedFilter) {}
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
Transliterator::~Transliterator() {
|
||||
delete filter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
Transliterator::Transliterator(const Transliterator& other) :
|
||||
ID(other.ID), filter(0) {
|
||||
if (other.filter != 0) {
|
||||
// We own the filter, so we must have our own copy
|
||||
filter = other.filter->clone();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
Transliterator& Transliterator::operator=(const Transliterator& other) {
|
||||
ID = other.ID;
|
||||
filter = (other.filter == 0) ?
|
||||
0 : other.filter->clone();
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates the segment of a string that begins at the character
|
||||
* at offset <code>start</code> and extends to the character at offset
|
||||
* <code>limit - 1</code>. A default implementation is provided here;
|
||||
* subclasses should provide a more efficient implementation if
|
||||
* possible.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
void Transliterator::transliterate(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
UnicodeString& result) const {
|
||||
/* This is a default implementation that should be replaced by
|
||||
* a more efficient subclass implementation if possible.
|
||||
*/
|
||||
text.extractBetween(start, limit, result);
|
||||
transliterate(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates an entire string. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
void Transliterator::transliterate(const UnicodeString& text,
|
||||
UnicodeString& result) const {
|
||||
transliterate(text, 0, text.length(), result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates an entire string in place. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
*/
|
||||
void Transliterator::transliterate(Replaceable& text) const {
    // In-place convenience form: delegate with the full range of the text.
    const int32_t wholeLength = text.length();
    transliterate(text, 0, wholeLength);
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after new text has been inserted,
|
||||
* typically as a result of a keyboard event. The new text in
|
||||
* <code>insertion</code> will be inserted into <code>text</code>
|
||||
* at <code>index[LIMIT]</code>, advancing
|
||||
* <code>index[LIMIT]</code> by <code>insertion.length()</code>.
|
||||
* Then the transliterator will try to transliterate characters of
|
||||
* <code>text</code> between <code>index[CURSOR]</code> and
|
||||
* <code>index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed.
|
||||
*
|
||||
* <p>Upon return, values in <code>index[]</code> will be updated.
|
||||
* <code>index[START]</code> will be advanced to the first
|
||||
* character that future calls to this method will read.
|
||||
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code> will
|
||||
* be adjusted to delimit the range of text that future calls to
|
||||
* this method may change.
|
||||
*
|
||||
* <p>Typical usage of this method begins with an initial call
|
||||
* with <code>index[START]</code> and <code>index[LIMIT]</code>
|
||||
* set to indicate the portion of <code>text</code> to be
|
||||
* transliterated, and <code>index[CURSOR] == index[START]</code>.
|
||||
* Thereafter, <code>index[]</code> can be used without
|
||||
* modification in future calls, provided that all changes to
|
||||
* <code>text</code> are made via this method.
|
||||
*
|
||||
* <p>This method assumes that future calls may be made that will
|
||||
* insert new text into the buffer. As a result, it only performs
|
||||
* unambiguous transliterations. After the last call to this
|
||||
* method, there may be untransliterated text that is waiting for
|
||||
* more input to resolve an ambiguity. In order to perform these
|
||||
* pending transliterations, clients should call {@link
|
||||
* #finishKeyboardTransliteration} after the last call to this
|
||||
* method has been made.
|
||||
*
|
||||
* @param text the buffer holding transliterated and untransliterated text
|
||||
* @param index an array of three integers.
|
||||
*
|
||||
* <ul><li><code>index[START]</code>: the beginning index,
|
||||
* inclusive; <code>0 <= index[START] <= index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[LIMIT]</code>: the ending index, exclusive;
|
||||
* <code>index[START] <= index[LIMIT] <= text.length()</code>.
|
||||
* <code>insertion</code> is inserted at
|
||||
* <code>index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[CURSOR]</code>: the next character to be
|
||||
* considered for transliteration; <code>index[START] <=
|
||||
* index[CURSOR] <= index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed by future calls
|
||||
* to this method.</ul>
|
||||
*
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>. If <code>null</code> then no text
|
||||
* is inserted.
|
||||
* @see #START
|
||||
* @see #LIMIT
|
||||
* @see #CURSOR
|
||||
* @see #handleKeyboardTransliterate
|
||||
* @exception IllegalArgumentException if <code>index[]</code>
|
||||
* is invalid
|
||||
*/
|
||||
void Transliterator::keyboardTransliterate(Replaceable& text,
                                           int32_t index[3],
                                           const UnicodeString& insertion,
                                           UErrorCode &status) const {
    // The shared implementation takes the insertion by pointer so that
    // "no insertion" can be expressed as a null pointer.
    const UnicodeString* toInsert = &insertion;
    _keyboardTransliterate(text, index, toInsert, status);
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after a new character has been
|
||||
* inserted, typically as a result of a keyboard event. This is a
|
||||
* convenience method; see {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)} for details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
void Transliterator::keyboardTransliterate(Replaceable& text,
                                           int32_t index[3],
                                           UChar insertion,
                                           UErrorCode& status) const {
    // Wrap the single code unit in a string and reuse the shared
    // implementation.
    UnicodeString singleChar(insertion);
    _keyboardTransliterate(text, index, &singleChar, status);
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously.  This is a convenience method; see
|
||||
* {@link #keyboardTransliterate(Replaceable, int[], String)} for
|
||||
* details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
void Transliterator::keyboardTransliterate(Replaceable& text,
                                           int32_t index[3],
                                           UErrorCode& status) const {
    // A null insertion pointer tells the shared implementation that no
    // new text is to be inserted.
    const UnicodeString* noInsertion = 0;
    _keyboardTransliterate(text, index, noInsertion, status);
}
|
||||
|
||||
/**
|
||||
* Finishes any pending transliterations that were waiting for
|
||||
* more characters. Clients should call this method as the last
|
||||
* call after a sequence of one or more calls to
|
||||
* <code>keyboardTransliterate()</code>.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text.
|
||||
* @param index the array of indices previously passed to {@link
|
||||
* #keyboardTransliterate}
|
||||
*/
|
||||
void Transliterator::finishKeyboardTransliteration(Replaceable& text,
                                                   int32_t index[3]) const {
    // Run an ordinary transliteration over [START, LIMIT) to flush
    // whatever was left pending by the keyboard calls.
    const int32_t from = index[START];
    const int32_t to = index[LIMIT];
    transliterate(text, from, to);
}
|
||||
|
||||
/**
|
||||
* This internal method does keyboard transliteration. If the
|
||||
* 'insertion' is non-null then we append it to 'text' before
|
||||
* proceeding. This method calls through to the pure virtual
|
||||
* framework method handleKeyboardTransliterate() to do the actual
|
||||
* work.
|
||||
*/
|
||||
void Transliterator::_keyboardTransliterate(Replaceable& text,
                                            int32_t index[3],
                                            const UnicodeString* insertion,
                                            UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return;
    }

    // The indices are acceptable only when
    //   0 <= START <= CURSOR <= LIMIT <= text.length()
    if (!(index[START] >= 0 &&
          index[LIMIT] <= text.length() &&
          index[CURSOR] >= index[START] &&
          index[CURSOR] <= index[LIMIT])) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Remember where the caller started; START never moves before this.
    const int32_t startOnEntry = index[START];

    if (insertion != 0) {
        // Splice the new text in at LIMIT and grow LIMIT to cover it.
        const int32_t insertAt = index[LIMIT];
        text.handleReplaceBetween(insertAt, insertAt, *insertion);
        index[LIMIT] = insertAt + insertion->length();
    }

    // The subclass does the actual work and updates CURSOR/LIMIT.
    handleKeyboardTransliterate(text, index);

    // Keep enough preceding context before CURSOR for the next call, but
    // do not back up past the original START.
    index[START] = icu_max(index[CURSOR] - getMaximumContextLength(),
                           startOnEntry);
}
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context. The default implementation supplied
|
||||
* by <code>Transliterator</code> returns zero; subclasses
|
||||
* that use preceding context should override this method to return the
|
||||
* correct value. For example, if a transliterator translates "ddd" (where
|
||||
* d is any digit) to "555" when preceded by "(ddd)", then the preceding
|
||||
* context length is 5, the length of "(ddd)".
|
||||
*
|
||||
* @return The maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
int32_t Transliterator::getMaximumContextLength() const {
    // Base-class default: no preceding context is required.
    const int32_t noContext = 0;
    return noContext;
}
|
||||
|
||||
/**
|
||||
* Returns a programmatic identifier for this transliterator.
|
||||
* If this identifier is passed to <code>getInstance()</code>, it
|
||||
* will return this object, if it has been registered.
|
||||
* @see #registerInstance
|
||||
* @see #getAvailableIDs
|
||||
*/
|
||||
const UnicodeString& Transliterator::getID() const {
    // Hands out a reference to the ID data member; no copy is made.
    return this->ID;
}
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the default locale. See {@link
|
||||
* #getDisplayName(Locale)} for details.
|
||||
*/
|
||||
UnicodeString& Transliterator::getDisplayName(UnicodeString& result) const {
    // Default-locale convenience form of getDisplayName(Locale, result).
    const Locale& defaultLocale = Locale::getDefault();
    return getDisplayName(defaultLocale, result);
}
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the given locale. This name is taken
|
||||
* from the locale resource data in the standard manner of the
|
||||
* <code>java.text</code> package.
|
||||
*
|
||||
* <p>If no localized names exist in the system resource bundles,
|
||||
* a name is synthesized using a localized
|
||||
* <code>MessageFormat</code> pattern from the resource data. The
|
||||
* arguments to this pattern are an integer followed by one or two
|
||||
* strings. The integer is the number of strings, either 1 or 2.
|
||||
* The strings are formed by splitting the ID for this
|
||||
* transliterator at the first '-'. If there is no '-', then the
|
||||
* entire ID forms the only string.
|
||||
* @param inLocale the Locale in which the display name should be
|
||||
* localized.
|
||||
* @see java.text.MessageFormat
|
||||
*/
|
||||
UnicodeString& Transliterator::getDisplayName(const Locale& inLocale,
                                              UnicodeString& result) const {
    UErrorCode status = U_ZERO_ERROR;
    ResourceBundle bundle(Locale::getDataDirectory(), inLocale, status);
    // The status is deliberately not checked yet; the getString() calls
    // below receive it and their outcome is checked instead.

    UnicodeString key(RB_DISPLAY_NAME_PREFIX);
    key.append(ID);

    // Step 1: look for an explicit localized name for this ID.  The
    // returned pointer, if any, must NOT be deleted here.
    const UnicodeString* resString = bundle.getString(key, status);

    if (U_SUCCESS(status) && resString != 0) {
        result = *resString;
        return result;
    }

    // Step 2: no localized name (the usual case, since most
    // transliterators have none).  Fetch the MessageFormat pattern from
    // the locale data and synthesize a name from the ID.
    status = U_ZERO_ERROR;
    resString = bundle.getString(RB_DISPLAY_NAME_PATTERN, status);

    if (U_SUCCESS(status) && resString != 0) {
        MessageFormat msg(*resString, inLocale, status);
        // Status is checked after format() below.

        // The pattern receives 2 or 3 arguments: a count, followed by
        // either the whole ID or the two halves of the ID split at the
        // first '-'.
        Formattable args[3];
        int32_t i = ID.indexOf((UChar)'-');
        int32_t nargs;
        if (i < 0) {
            args[0].setLong(1); // # of args to follow
            args[1].setString(ID);
            nargs = 2;
        } else {
            UnicodeString left, right;
            ID.extractBetween(0, i, left);
            ID.extractBetween(i+1, ID.length(), right);
            args[0].setLong(2); // # of args to follow
            args[1].setString(left);
            args[2].setString(right);
            nargs = 3;
        }

        // Start from an empty buffer so that whatever the caller left in
        // 'result' cannot end up in front of the formatted name.
        // NOTE(review): this assumes MessageFormat::format() appends to its
        // output argument, as the ICU Format API documents -- confirm
        // against format.h/msgfmt.h in this tree.
        result.truncate(0);

        FieldPosition pos; // ignored by msg
        msg.format(args, nargs, result, pos, status);
        if (U_SUCCESS(status)) {
            return result;
        }
    }

    // Step 3: last resort.  We should not reach this point unless there
    // is something wrong with the build or the RB_DISPLAY_NAME_PATTERN
    // has been deleted from the root RB_LOCALE_ELEMENTS resource.
    result = ID;
    return result;
}
|
||||
|
||||
/**
|
||||
* Returns the filter used by this transliterator, or <tt>null</tt>
|
||||
* if this transliterator uses no filter.  Caller mustn't delete
|
||||
* the result!
|
||||
*/
|
||||
const UnicodeFilter* Transliterator::getFilter() const {
    // Returns the filter pointer as stored; it may be null.
    return this->filter;
}
|
||||
|
||||
/**
|
||||
* Changes the filter used by this transliterator. If the filter
|
||||
* is set to <tt>null</tt> then no filtering will occur.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The filter should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
*/
|
||||
void Transliterator::adoptFilter(UnicodeFilter* filterToAdopt) {
    // Adopting the filter we already hold must be a no-op: deleting it
    // and then storing the same pointer would leave 'filter' dangling.
    if (filterToAdopt == filter) {
        return;
    }
    // We own the current filter, so release it before taking the new one.
    // A null filterToAdopt simply turns filtering off.
    delete filter;
    filter = filterToAdopt;
}
|
||||
|
||||
/**
|
||||
* Returns this transliterator's inverse. See the class
|
||||
* documentation for details. This implementation simply inverts
|
||||
* the two entities in the ID and attempts to retrieve the
|
||||
* resulting transliterator. That is, if <code>getID()</code>
|
||||
* returns "A-B", then this method will return the result of
|
||||
* <code>getInstance("B-A")</code>, or <code>null</code> if that
|
||||
* call fails.
|
||||
*
|
||||
* <p>This method does not take filtering into account. The
|
||||
* returned transliterator will have no filter.
|
||||
*
|
||||
* <p>Subclasses with knowledge of their inverse may wish to
|
||||
* override this method.
|
||||
*
|
||||
* @return a transliterator that is an inverse, not necessarily
|
||||
* exact, of this transliterator, or <code>null</code> if no such
|
||||
* transliterator is registered.
|
||||
* @see #registerInstance
|
||||
*/
|
||||
Transliterator* Transliterator::createInverse() const {
    const int32_t dash = ID.indexOf((UChar)'-');
    if (dash < 0) {
        // An ID without a '-' cannot be inverted by swapping its halves.
        return 0;
    }

    // Split "A-B" at the first '-' and reassemble it as "B-A".
    UnicodeString inverseID, source;
    ID.extractBetween(0, dash, source);
    ID.extractBetween(dash + 1, ID.length(), inverseID);
    inverseID.append((UChar)'-').append(source);

    // _createInstance() yields null when nothing usable is registered
    // under the swapped ID.
    return _createInstance(inverseID);
}
|
||||
|
||||
/**
|
||||
* Returns a <code>Transliterator</code> object given its ID.
|
||||
* The ID must be either a system transliterator ID or a ID registered
|
||||
* using <code>registerInstance()</code>.
|
||||
*
|
||||
* @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
|
||||
* @return A <code>Transliterator</code> object with the given ID
|
||||
* @exception IllegalArgumentException if the given ID is invalid.
|
||||
* @see #registerInstance
|
||||
* @see #getAvailableIDs
|
||||
* @see #getID
|
||||
*/
|
||||
Transliterator* Transliterator::createInstance(const UnicodeString& ID) {
    // Thin public wrapper around the internal factory; the result is
    // passed through unchanged (including null).
    return _createInstance(ID);
}
|
||||
|
||||
/**
|
||||
* This is the path to the subdirectory within the locale data
|
||||
* directory that contains the rule-based transliterator resource
|
||||
* bundle files. This is constructed dynamically the first time
|
||||
* Transliterator::getDataDirectory() is called.
|
||||
*/
|
||||
char* Transliterator::DATA_DIR = 0;
|
||||
|
||||
/**
|
||||
* This is the name of a subdirectory within the locale data directory
|
||||
* that contains the rule-based transliterator resource bundle files.
|
||||
*/
|
||||
const char* Transliterator::RESOURCE_SUB_DIR = "translit";
|
||||
|
||||
/**
|
||||
* Returns the directory in which the transliterator resource bundle
|
||||
* files are located. This is a subdirectory, named RESOURCE_SUB_DIR,
|
||||
* under Locale::getDataDirectory(). It ends in a path separator.
|
||||
*/
|
||||
const char* Transliterator::getDataDirectory() {
|
||||
if (DATA_DIR == 0) {
|
||||
Mutex lock; // Okay to use the global mutex here
|
||||
if (DATA_DIR == 0) {
|
||||
/* Construct the transliterator data directory path. This
|
||||
* is a subdirectory of the locale data directory. For
|
||||
* now, we get the separator from the data directory
|
||||
* assuming a path separator of one character. In the
|
||||
* future we might add API to get the separator.
|
||||
*
|
||||
* TODO: Fix this to get the path separator in some better
|
||||
* way. File an rfe for this.
|
||||
*/
|
||||
const char* data = Locale::getDataDirectory();
|
||||
int32_t len = icu_strlen(data);
|
||||
char sep[2];
|
||||
sep[0] = data[len-1];
|
||||
sep[1] = 0;
|
||||
DATA_DIR = (char*) icu_malloc(
|
||||
len + icu_strlen(RESOURCE_SUB_DIR) + 2);
|
||||
if (DATA_DIR == 0) {
|
||||
// This is a fatal unrecoverable error -- what should we do?
|
||||
}
|
||||
icu_strcpy(DATA_DIR, data);
|
||||
icu_strcat(DATA_DIR, RESOURCE_SUB_DIR);
|
||||
icu_strcat(DATA_DIR, sep);
|
||||
}
|
||||
}
|
||||
return DATA_DIR;
|
||||
}
|
||||
|
||||
inline int32_t Transliterator::hash(const UnicodeString& str) {
    // Mask off the sign bit so the resulting cache key is never negative.
    return 0x7FFFFFFF & str.hashCode();
}
|
||||
|
||||
/**
|
||||
* Returns a transliterator object given its ID. Unlike getInstance(),
|
||||
* this method returns null if it cannot make use of the given ID.
|
||||
*/
|
||||
Transliterator* Transliterator::_createInstance(const UnicodeString& ID) {
    UErrorCode status = U_ZERO_ERROR;

    if (!cacheInitialized) {
        initializeCache();
    }

    // The cache is read and possibly modified below, so hold its mutex
    // for the rest of the function.
    Mutex lock(&cacheMutex);

    CacheEntry* entry = (CacheEntry*) uhash_get(cache, hash(ID));
    TransliterationRuleData* data = 0;

    if (entry == 0) {
        // Nothing is registered under this ID.
        return 0;
    }

    if (entry->entryType == CacheEntry::RBT_DATA) {
        data = entry->u.data;
        // Fall through to construct transliterator from cached Data object.
    } else if (entry->entryType == CacheEntry::PROTOTYPE) {
        return entry->u.prototype->clone();
    } else {
        // At this point entry type must be either RULE_BASED_PLACEHOLDER
        // or REVERSE_RULE_BASED_PLACEHOLDER.
        bool_t isReverse =
            (entry->entryType ==
             CacheEntry::REVERSE_RULE_BASED_PLACEHOLDER);

        // We use the file name, taken from another resource bundle
        // 2-d array at static init time, as a locale language.  We're
        // just using the locale mechanism to map through to a file
        // name; this in no way represents an actual locale.
        Locale fakeLocale(entry->rbFile);

        ResourceBundle bundle(Transliterator::getDataDirectory(),
                              fakeLocale, status);

        // Call RBT to parse the rules from the resource bundle

        // We don't own the rules - 'rules' is an alias pointer to
        // a string in the RB cache.
        const UnicodeString* rules = bundle.getString(RB_RULE, status);

        // If rules == 0 at this point, or if the status indicates a
        // failure, then we don't have any rules -- there is probably
        // an installation error.  The list in the root locale should
        // correspond to all the installed transliterators; if it
        // lists something that's not installed, we'll get a null
        // pointer here.
        if (rules != 0 && U_SUCCESS(status)) {

            // The direction constants are members of the
            // RuleBasedTransliterator class, so they are named with the
            // scope operator.
            data = TransliterationRuleParser::parse(*rules, isReverse
                                    ? RuleBasedTransliterator::REVERSE
                                    : RuleBasedTransliterator::FORWARD);

            // Double check to see if someone has modified the entry
            // since we last looked at it.
            if (entry->entryType != CacheEntry::RBT_DATA) {
                entry->entryType = CacheEntry::RBT_DATA;
                entry->u.data = data;
            } else {
                // Oops!  Another thread has updated this cache entry
                // already to point to a data object.  Discard the
                // one we just created and use the one in the cache
                // instead.
                delete data;
                data = entry->u.data;
            }
        }
    }

    if (data != 0) {
        return new RuleBasedTransliterator(ID, data);
    } else {
        // We have a failure of some kind.  Remove the ID from the
        // cache so we don't keep trying.  NOTE: This will throw off
        // anyone who is, at the moment, trying to iterate over the
        // available IDs.  That's acceptable since we should never
        // really get here except under installation, configuration,
        // or unrecoverable run time memory failures.
        _unregister(ID);
        return 0;
    }
}
|
||||
|
||||
/**
|
||||
* Registers a instance <tt>obj</tt> of a subclass of
|
||||
* <code>Transliterator</code> with the system. This object must
|
||||
* implement the <tt>clone()</tt> method. When
|
||||
* <tt>getInstance()</tt> is called with an ID string that is
|
||||
* equal to <tt>obj.getID()</tt>, then <tt>obj.clone()</tt> is
|
||||
* returned.
|
||||
*
|
||||
* @param obj an instance of subclass of
|
||||
* <code>Transliterator</code> that defines <tt>clone()</tt>
|
||||
* @see #getInstance
|
||||
* @see #unregister
|
||||
*/
|
||||
void Transliterator::registerInstance(Transliterator* adoptedPrototype,
                                      UErrorCode &status) {
    // Make sure the cache exists before touching it.
    if (!cacheInitialized) {
        initializeCache();
    }

    // _registerInstance() requires its caller to hold cacheMutex.
    Mutex cacheLock(&cacheMutex);
    _registerInstance(adoptedPrototype, status);
}
|
||||
|
||||
/**
|
||||
* This internal method registers a prototype instance in the cache.
|
||||
* The CALLER MUST MUTEX using cacheMutex before calling this method.
|
||||
*/
|
||||
void Transliterator::_registerInstance(Transliterator* adoptedPrototype,
                                       UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (adoptedPrototype == 0) {
        // There is nothing to register, and the ID cannot be read from a
        // null prototype.
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // This needs explaining: The string reference that getID returns
    // is to the ID data member of Transliterator.  As long as the
    // Transliterator object exists, this reference is valid, and in
    // fact we can take its address and store it in IDS.  No problem
    // there.  The only thing we have to be sure of is that before we
    // remove the prototype (via unregister()), we remove the ID
    // entry.
    const UnicodeString& id = adoptedPrototype->getID();
    int32_t hashCode = hash(id);

    CacheEntry* entry = (CacheEntry*) uhash_get(cache, hashCode);
    if (entry == 0) {
        entry = new CacheEntry();
    } else {
        // Something is already registered here.  Drop its ID pointer from
        // the ID vector NOW, while the old prototype (which may own that
        // string) is still alive: adoptPrototype() below deletes the old
        // prototype, which would otherwise leave a dangling pointer --
        // and a duplicate ID -- in cacheIDs.
        // NOTE(review): relies on cacheIDs comparing elements by string
        // value (compareIDs, installed by initializeCache) -- confirm.
        cacheIDs.removeElement((void*) &id);
    }

    cacheIDs.addElement((void*) &id);

    // The entry takes ownership of the prototype, releasing any
    // prototype it held before.
    entry->adoptPrototype(adoptedPrototype);

    uhash_putKey(cache, hashCode, entry, &status);
}
|
||||
|
||||
/**
|
||||
* Unregisters a transliterator or class. This may be either
|
||||
* a system transliterator or a user transliterator or class.
|
||||
*
|
||||
* @param ID the ID of the transliterator or class
|
||||
* @see #registerInstance
|
||||
*/
|
||||
void Transliterator::unregister(const UnicodeString& ID) {
|
||||
if (!cacheInitialized) {
|
||||
initializeCache();
|
||||
}
|
||||
Mutex lock(&cacheMutex);
|
||||
_unregister(ID);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unregisters a transliterator or class. Internal method.
|
||||
* Prerequisites: The cache must be initialized, and the
|
||||
* caller must own the cacheMutex.
|
||||
*/
|
||||
void Transliterator::_unregister(const UnicodeString& ID) {
|
||||
cacheIDs.removeElement((void*) &ID);
|
||||
int32_t hc = hash(ID);
|
||||
CacheEntry* entry = (CacheEntry*) uhash_get(cache, hc);
|
||||
if (entry != 0) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
uhash_remove(cache, hc, &status);
|
||||
delete entry;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Vector of registered IDs.
|
||||
*/
|
||||
UVector Transliterator::cacheIDs;
|
||||
|
||||
/**
|
||||
* Return the number of IDs currently registered with the system.
|
||||
* To retrieve the actual IDs, call getAvailableID(i) with
|
||||
* i from 0 to countAvailableIDs() - 1.
|
||||
*/
|
||||
int32_t Transliterator::countAvailableIDs() {
    // Make sure the cache exists before touching it.
    if (!cacheInitialized) {
        initializeCache();
    }

    // Read the size of the ID vector while holding the cache mutex.
    Mutex cacheLock(&cacheMutex);
    const int32_t count = cacheIDs.size();
    return count;
}
|
||||
|
||||
/**
|
||||
* Return the index-th available ID. index must be between 0
|
||||
* and countAvailableIDs() - 1, inclusive. If index is out of
|
||||
* range, the result of getAvailableID(0) is returned.
|
||||
*/
|
||||
const UnicodeString& Transliterator::getAvailableID(int32_t index) {
    // Initialize and lock BEFORE looking at cacheIDs.  Validating the
    // index first would test it against an empty vector on the very first
    // call (turning every index into 0) and would read the vector without
    // holding the mutex.
    if (!cacheInitialized) {
        initializeCache();
    }
    Mutex lock(&cacheMutex);

    // Out-of-range indices fall back to the first ID, as documented.
    if (index < 0 || index >= cacheIDs.size()) {
        index = 0;
    }
    // NOTE(review): if no IDs are registered at all, element 0 does not
    // exist either; callers are expected to consult countAvailableIDs()
    // first.
    return *(const UnicodeString*) cacheIDs[index];
}
|
||||
|
||||
/**
|
||||
* Comparison function for UVector. Compares two UnicodeString
|
||||
* objects given void* pointers to them.
|
||||
*/
|
||||
bool_t Transliterator::compareIDs(void* a, void* b) {
    // Both arguments are really UnicodeString pointers; compare the
    // strings they point to, not the pointers.
    const UnicodeString& lhs = *(const UnicodeString*) a;
    const UnicodeString& rhs = *(const UnicodeString*) b;
    return lhs == rhs;
}
|
||||
|
||||
void Transliterator::initializeCache() {
|
||||
// Lock first, check init boolean second
|
||||
Mutex lock(&cacheMutex);
|
||||
if (cacheInitialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// Before looking for the resource, construct our cache.
|
||||
// That way if the resource is absent, we will at least
|
||||
// have a valid cache object.
|
||||
cache = uhash_open(uhash_hashUString, &status);
|
||||
cacheIDs.setComparer(compareIDs);
|
||||
|
||||
/* The following code is assuming an n x 3 table
|
||||
* that looks like this:
|
||||
*
|
||||
* RuleBasedTransliteratorIDs {
|
||||
* { "Latin-Arabic", "Arabic-Latin", "larabic" }
|
||||
* { "KeyboardEscape-Latin1", "", "keyescl1" }
|
||||
* ...
|
||||
* }
|
||||
*/
|
||||
|
||||
ResourceBundle bundle(Locale::getDataDirectory(),
|
||||
Locale::getDefault(),
|
||||
status);
|
||||
int32_t rows, cols;
|
||||
const UnicodeString** ruleBasedIDs =
|
||||
bundle.get2dArray(RB_RULE_BASED_IDS, rows, cols, status);
|
||||
|
||||
if (U_SUCCESS(status) && (cols == 3)) {
|
||||
for (int32_t i=0; i<rows; ++i) {
|
||||
const UnicodeString* row = ruleBasedIDs[i];
|
||||
for (int32_t col=0; col<2; ++col) {
|
||||
|
||||
if (row[col].length() > 0) {
|
||||
CacheEntry* entry = new CacheEntry();
|
||||
entry->entryType = (col == 0) ?
|
||||
CacheEntry::RULE_BASED_PLACEHOLDER :
|
||||
CacheEntry::REVERSE_RULE_BASED_PLACEHOLDER;
|
||||
entry->rbFile = row[2];
|
||||
uhash_putKey(cache, hash(row[col]), entry, &status);
|
||||
|
||||
/* It's okay to take the address of the string
|
||||
* from the resource bundle under the assumption
|
||||
* that the RB is caching these, and that they
|
||||
* stay around forever. If this changes, what we
|
||||
* need to do is change the id vector so that it
|
||||
* owns its strings and create a copy here.
|
||||
*/
|
||||
cacheIDs.addElement((void*) &row[col]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Manually add prototypes that the system knows about to the
|
||||
// cache. This is how new non-rule-based transliterators are
|
||||
// added to the system.
|
||||
|
||||
status = U_ZERO_ERROR; // Reset status for following calls
|
||||
_registerInstance(new HexToUnicodeTransliterator(), status);
|
||||
_registerInstance(new UnicodeToHexTransliterator(), status);
|
||||
|
||||
cacheInitialized = TRUE;
|
||||
}
|
||||
|
||||
Transliterator::CacheEntry::CacheEntry() {
    // A fresh entry has no type and no payload.
    entryType = NONE;
    u.prototype = 0;
}
|
||||
|
||||
Transliterator::CacheEntry::~CacheEntry() {
    // Only a prototype payload is deleted here; payloads of every other
    // entry type are left alone.
    if (entryType != PROTOTYPE) {
        return;
    }
    delete u.prototype;
}
|
||||
|
||||
void Transliterator::CacheEntry::adoptPrototype(Transliterator* adopted) {
    // Release the prototype currently held, if this entry holds one.
    const bool_t holdsPrototype = (entryType == PROTOTYPE);
    if (holdsPrototype) {
        delete u.prototype;
    }

    // From here on this entry is a prototype entry owning 'adopted'.
    u.prototype = adopted;
    entryType = PROTOTYPE;
}
|
860
icu4c/source/i18n/translit.h
Normal file
860
icu4c/source/i18n/translit.h
Normal file
|
@ -0,0 +1,860 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef TRANSLIT_H
|
||||
#define TRANSLIT_H
|
||||
|
||||
#include "unistr.h"
|
||||
#include "umutex.h"
|
||||
#include "uvector.h"
|
||||
|
||||
class Replaceable;
|
||||
class UnicodeFilter;
|
||||
class TransliterationRuleData;
|
||||
struct UHashtable;
|
||||
|
||||
/**
|
||||
* <code>Transliterator</code> is an abstract class that
|
||||
* transliterates text from one format to another. The most common
|
||||
* kind of transliterator is a script, or alphabet, transliterator.
|
||||
* For example, a Russian to Latin transliterator changes Russian text
|
||||
* written in Cyrillic characters to phonetically equivalent Latin
|
||||
* characters. It does not <em>translate</em> Russian to English!
|
||||
* Transliteration, unlike translation, operates on characters, without
|
||||
* reference to the meanings of words and sentences.
|
||||
*
|
||||
* <p>Although script conversion is its most common use, a
|
||||
* transliterator can actually perform a more general class of tasks.
|
||||
* In fact, <code>Transliterator</code> defines a very general API
|
||||
* which specifies only that a segment of the input text is replaced
|
||||
* by new text. The particulars of this conversion are determined
|
||||
* entirely by subclasses of <code>Transliterator</code>.
|
||||
*
|
||||
* <p><b>Transliterators are stateless</b>
|
||||
*
|
||||
* <p><code>Transliterator</code> objects are <em>stateless</em>; they
|
||||
* retain no information between calls to
|
||||
* <code>transliterate()</code>. (However, this does <em>not</em>
|
||||
* mean that threads may share transliterators without synchronizing
|
||||
* them. Transliterators are not immutable, so they must be
|
||||
* synchronized when shared between threads.) This1 might seem to
|
||||
* limit the complexity of the transliteration operation. In
|
||||
* practice, subclasses perform complex transliterations by delaying
|
||||
* the replacement of text until it is known that no other
|
||||
* replacements are possible. In other words, although the
|
||||
* <code>Transliterator</code> objects are stateless, the source text
|
||||
* itself embodies all the needed information, and delayed operation
|
||||
* allows arbitrary complexity.
|
||||
*
|
||||
* <p><b>Batch transliteration</b>
|
||||
*
|
||||
* <p>The simplest way to perform transliteration is all at once, on a
|
||||
* string of existing text. This is referred to as <em>batch</em>
|
||||
* transliteration. For example, given a string <code>input</code>
|
||||
* and a transliterator <code>t</code>, the call
|
||||
*
|
||||
* <blockquote><code>String result = t.transliterate(input);
|
||||
* </code></blockquote>
|
||||
*
|
||||
* will transliterate it and return the result. Other methods allow
|
||||
* the client to specify a substring to be transliterated and to use
|
||||
* {@link Replaceable} objects instead of strings, in order to
|
||||
* preserve out-of-band information (such as text styles).
|
||||
*
|
||||
* <p><b>Keyboard transliteration</b>
|
||||
*
|
||||
* <p>Somewhat more involved is <em>keyboard</em>, or incremental
|
||||
* transliteration. This is the transliteration of text that is
|
||||
* arriving from some source (typically the user's keyboard) one
|
||||
* character at a time, or in some other piecemeal fashion.
|
||||
*
|
||||
* <p>In keyboard transliteration, a <code>Replaceable</code> buffer
|
||||
* stores the text. As text is inserted, as much as possible is
|
||||
* transliterated on the fly. This means a GUI that displays the
|
||||
* contents of the buffer may show text being modified as each new
|
||||
* character arrives.
|
||||
*
|
||||
* <p>Consider the simple <code>RuleBasedTransliterator</code>:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* th>{theta}<br>
|
||||
* t>{tau}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* When the user types 't', nothing will happen, since the
|
||||
* transliterator is waiting to see if the next character is 'h'. To
|
||||
* remedy this, we introduce the notion of a cursor, marked by a '|'
|
||||
* in the output string:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* t>|{tau}<br>
|
||||
* {tau}h>{theta}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* Now when the user types 't', tau appears, and if the next character
|
||||
* is 'h', the tau changes to a theta. This is accomplished by
|
||||
* maintaining a cursor position (independent of the insertion point,
|
||||
* and invisible in the GUI) across calls to
|
||||
* <code>keyboardTransliterate()</code>. Typically, the cursor will
|
||||
* be coincident with the insertion point, but in a case like the one
|
||||
* above, it will precede the insertion point.
|
||||
*
|
||||
* <p>Keyboard transliteration methods maintain a set of three indices
|
||||
* that are updated with each call to
|
||||
* <code>keyboardTransliterate()</code>, including the cursor, start,
|
||||
* and limit. Since these indices are changed by the method, they are
|
||||
* passed in an <code>int[]</code> array. The <code>START</code> index
|
||||
* marks the beginning of the substring that the transliterator will
|
||||
* look at. It is advanced as text becomes committed (but it is not
|
||||
* the committed index; that's the <code>CURSOR</code>). The
|
||||
* <code>CURSOR</code> index, described above, marks the point at
|
||||
* which the transliterator last stopped, either because it reached
|
||||
* the end, or because it required more characters to disambiguate
|
||||
* between possible inputs. The <code>CURSOR</code> can also be
|
||||
* explicitly set by rules in a <code>RuleBasedTransliterator</code>.
|
||||
* Any characters before the <code>CURSOR</code> index are frozen;
|
||||
* future keyboard transliteration calls within this input sequence
|
||||
* will not change them. New text is inserted at the
|
||||
* <code>LIMIT</code> index, which marks the end of the substring that
|
||||
* the transliterator looks at.
|
||||
*
|
||||
* <p>Because keyboard transliteration assumes that more characters
|
||||
* are to arrive, it is conservative in its operation. It only
|
||||
* transliterates when it can do so unambiguously. Otherwise it waits
|
||||
* for more characters to arrive. When the client code knows that no
|
||||
* more characters are forthcoming, perhaps because the user has
|
||||
* performed some input termination operation, then it should call
|
||||
* <code>finishKeyboardTransliteration()</code> to complete any
|
||||
* pending transliterations.
|
||||
*
|
||||
* <p><b>Inverses</b>
|
||||
*
|
||||
* <p>Pairs of transliterators may be inverses of one another. For
|
||||
* example, if transliterator <b>A</b> transliterates characters by
|
||||
* incrementing their Unicode value (so "abc" -> "def"), and
|
||||
* transliterator <b>B</b> decrements character values, then <b>A</b>
|
||||
* is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
|
||||
* with <b>B</b> in a compound transliterator, the result is the
|
||||
* identity transliterator, that is, a transliterator that does not
|
||||
* change its input text.
|
||||
*
|
||||
* The <code>Transliterator</code> method <code>getInverse()</code>
|
||||
* returns a transliterator's inverse, if one exists, or
|
||||
* <code>null</code> otherwise. However, the result of
|
||||
* <code>getInverse()</code> usually will <em>not</em> be a true
|
||||
* mathematical inverse. This is because true inverse transliterators
|
||||
* are difficult to formulate. For example, consider two
|
||||
* transliterators: <b>AB</b>, which transliterates the character 'A'
|
||||
* to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
|
||||
* seem that these are exact inverses, since
|
||||
*
|
||||
* <blockquote>"A" x <b>AB</b> -> "B"<br>
|
||||
* "B" x <b>BA</b> -> "A"</blockquote>
|
||||
*
|
||||
* where 'x' represents transliteration. However,
|
||||
*
|
||||
* <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
|
||||
* "BBCD" x <b>BA</b> -> "AACD"</blockquote>
|
||||
*
|
||||
* so <b>AB</b> composed with <b>BA</b> is not the
|
||||
* identity. Nonetheless, <b>BA</b> may be usefully considered to be
|
||||
* <b>AB</b>'s inverse, and it is on this basis that
|
||||
* <b>AB</b><code>.getInverse()</code> could legitimately return
|
||||
* <b>BA</b>.
|
||||
*
|
||||
* <p><b>IDs and display names</b>
|
||||
*
|
||||
* <p>A transliterator is designated by a short identifier string or
|
||||
* <em>ID</em>. IDs follow the format <em>source-destination</em>,
|
||||
* where <em>source</em> describes the entity being replaced, and
|
||||
* <em>destination</em> describes the entity replacing
|
||||
* <em>source</em>. The entities may be the names of scripts,
|
||||
* particular sequences of characters, or whatever else it is that the
|
||||
* transliterator converts to or from. For example, a transliterator
|
||||
* from Russian to Latin might be named "Russian-Latin". A
|
||||
* transliterator from keyboard escape sequences to Latin-1 characters
|
||||
* might be named "KeyboardEscape-Latin1". By convention, system
|
||||
* entity names are in English, with the initial letters of words
|
||||
* capitalized; user entity names may follow any format so long as
|
||||
* they do not contain dashes.
|
||||
*
|
||||
* <p>In addition to programmatic IDs, transliterator objects have
|
||||
* display names for presentation in user interfaces, returned by
|
||||
* {@link #getDisplayName}.
|
||||
*
|
||||
* <p><b>Factory methods and registration</b>
|
||||
*
|
||||
* <p>In general, client code should use the factory method
|
||||
* <code>getInstance()</code> to obtain an instance of a
|
||||
* transliterator given its ID. Valid IDs may be enumerated using
|
||||
* <code>getAvailableIDs()</code>. Since transliterators are mutable,
|
||||
* multiple calls to <code>getInstance()</code> with the same ID will
|
||||
* return distinct objects.
|
||||
*
|
||||
* <p>In addition to the system transliterators registered at startup,
|
||||
* user transliterators may be registered by calling
|
||||
* <code>registerInstance()</code> at run time. A registered instance
|
||||
* acts as a template; future calls to <tt>getInstance()</tt> with the ID
|
||||
* of the registered object return clones of that object. Thus any
|
||||
* object passed to <tt>registerInstance()</tt> must implement
|
||||
* <tt>clone()</tt> properly. To register a transliterator subclass
|
||||
* without instantiating it (until it is needed), users may call
|
||||
* <code>registerClass()</code>. In this case, the objects are
|
||||
* instantiated by invoking the zero-argument public constructor of
|
||||
* the class.
|
||||
*
|
||||
* <p><b>Subclassing</b>
|
||||
*
|
||||
* <p>Subclasses must implement the abstract
|
||||
* <code>transliterate()</code> method. They should also override the
|
||||
* <code>transliterate()</code> method taking a <code>String</code>
|
||||
* and <code>StringBuffer</code> if the performance of these methods
|
||||
* can be improved over the performance obtained by the default
|
||||
* implementations in this class. Subclasses must also implement
|
||||
* <code>handleKeyboardTransliterate()</code>.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class U_I18N_API Transliterator {
|
||||
|
||||
public:
|
||||
|
||||
enum {
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the beginning index, inclusive
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
START = 0,
|
||||
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the ending index, exclusive
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
LIMIT = 1,
|
||||
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the next character to be considered
|
||||
* for transliteration
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
CURSOR = 2
|
||||
};
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Programmatic name, e.g., "Latin-Arabic".
|
||||
*/
|
||||
UnicodeString ID;
|
||||
|
||||
/**
|
||||
* This transliterator's filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
UnicodeFilter* filter;
|
||||
|
||||
/**
|
||||
* Dictionary of known transliterators. Keys are <code>String</code>
|
||||
* names, values are one of the following:
|
||||
*
|
||||
* <ul><li><code>Transliterator</code> objects
|
||||
*
|
||||
* <li><code>Class</code> objects. Such objects must represent
|
||||
* subclasses of <code>Transliterator</code>, and must satisfy the
|
||||
* constraints described in <code>registerClass()</code>
|
||||
*
|
||||
* <li><code>RULE_BASED_PLACEHOLDER</code>, in which case the ID
|
||||
* will have its first '-' removed and be appended to
|
||||
* RB_RULE_BASED_PREFIX to form a resource bundle name from which
|
||||
* the RB_RULE key is looked up to obtain the rule.
|
||||
*
|
||||
* <li><code>REVERSE_RULE_BASED_PLACEHOLDER</code>. Like
|
||||
* <code>RULE_BASED_PLACEHOLDER</code>, except the entity names in
|
||||
* the ID are reversed, and the argument
|
||||
* RuleBasedTransliterator.REVERSE is passed to the
|
||||
* RuleBasedTransliterator constructor.
|
||||
* </ul>
|
||||
*/
|
||||
static UHashtable* cache;
|
||||
|
||||
/**
|
||||
* The mutex controlling access to the cache.
|
||||
*/
|
||||
static UMTX cacheMutex;
|
||||
|
||||
/**
|
||||
* When set to TRUE, the cache has been initialized. Any code must
|
||||
* check this boolean before accessing the cache, and if the boolean
|
||||
* is FALSE, it must call initializeCache(). We do this form of lazy
|
||||
* evaluation for two reasons: (1) so we don't initialize if we don't
|
||||
* have to (i.e., if no one is using Transliterator, but has included
|
||||
* the code as part of a shared library), and (2) to avoid static
|
||||
* initialization problems.
|
||||
*/
|
||||
static bool_t cacheInitialized;
|
||||
|
||||
/**
|
||||
* In Java, the cache stores objects of different types and
|
||||
* singleton objects as placeholders for rule-based
|
||||
* transliterators to be built as needed. In C++ we use the
|
||||
* following struct to achieve the same purpose. Instances of
|
||||
* this struct can be placeholders, can represent prototype
|
||||
* transliterators to be cloned, or can represent
|
||||
* RuleBasedTransliterator::Data objects. We don't support
|
||||
* storing classes in the cache because we don't have the rtti
|
||||
* infrastructure for it. We could easily add this if there is a
|
||||
* need for it in the future. The rbFile is the resource bundle
|
||||
* file name for rule-based transliterators.
|
||||
*/
|
||||
struct CacheEntry {
|
||||
enum Type {
|
||||
RULE_BASED_PLACEHOLDER,
|
||||
REVERSE_RULE_BASED_PLACEHOLDER,
|
||||
PROTOTYPE,
|
||||
RBT_DATA,
|
||||
NONE // Only used for uninitialized entries
|
||||
} entryType;
|
||||
UnicodeString rbFile; // For *PLACEHOLDER
|
||||
union {
|
||||
Transliterator* prototype; // For PROTOTYPE
|
||||
TransliterationRuleData* data; // For RBT_DATA
|
||||
} u;
|
||||
CacheEntry();
|
||||
~CacheEntry();
|
||||
void adoptPrototype(Transliterator* adopted);
|
||||
};
|
||||
|
||||
/**
|
||||
* Prefix for resource bundle key for the display name for a
|
||||
* transliterator. The ID is appended to this to form the key.
|
||||
* The resource bundle value should be a String.
|
||||
*/
|
||||
static const char* RB_DISPLAY_NAME_PREFIX;
|
||||
|
||||
/**
|
||||
* Resource bundle key for display name pattern.
|
||||
* The resource bundle value should be a String forming a
|
||||
* MessageFormat pattern, e.g.:
|
||||
* "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
|
||||
*/
|
||||
static const char* RB_DISPLAY_NAME_PATTERN;
|
||||
|
||||
/**
|
||||
* Resource bundle key for the list of RuleBasedTransliterator IDs.
|
||||
* The resource bundle value should be a String[] with each element
|
||||
* being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
|
||||
* to obtain the class name in which the RB_RULE key will be sought.
|
||||
*/
|
||||
static const char* RB_RULE_BASED_IDS;
|
||||
|
||||
/**
|
||||
* Resource bundle key for the RuleBasedTransliterator rule.
|
||||
*/
|
||||
static const char* RB_RULE;
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
* @param ID the string identifier for this transliterator
|
||||
* @param adoptedFilter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
Transliterator(const UnicodeString& ID, UnicodeFilter* adoptedFilter);
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
Transliterator(const Transliterator&);
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
Transliterator& operator=(const Transliterator&);
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~Transliterator();
|
||||
|
||||
/**
|
||||
* Implements Cloneable.
|
||||
* All subclasses are encouraged to implement this method if it is
|
||||
* possible and reasonable to do so. Subclasses that are to be
|
||||
* registered with the system using <tt>registerInstance()</tt>
|
||||
* are required to implement this method. If a subclass does not
|
||||
* implement clone() properly and is registered with the system
|
||||
* using registerInstance(), then the default clone() implementation
|
||||
* will return null, and calls to createInstance() will fail.
|
||||
*
|
||||
* @see #registerInstance
|
||||
*/
|
||||
virtual Transliterator* clone() const { return 0; }
|
||||
|
||||
/**
|
||||
* Transliterates the segment of a string that begins at the
|
||||
* character at offset <code>start</code> and extends to the
|
||||
* character at offset <code>limit - 1</code>, with optional
|
||||
* filtering. A default implementation is provided here;
|
||||
* subclasses should provide a more efficient implementation if
|
||||
* possible.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
virtual void transliterate(const UnicodeString& text,
|
||||
int32_t start, int32_t limit,
|
||||
UnicodeString& result) const;
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string, with optional filtering.
|
||||
* Subclasses must override this abstract method.
|
||||
*
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return The new limit index. The text previously occupying <code>[start,
|
||||
* limit)</code> has been transliterated, possibly to a string of a different
|
||||
* length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
|
||||
* <em>new-limit</em> is the return value.
|
||||
*/
|
||||
virtual int32_t transliterate(Replaceable& text,
|
||||
int32_t start, int32_t limit) const = 0;
|
||||
|
||||
/**
|
||||
* Transliterates an entire string. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
virtual void transliterate(const UnicodeString& text,
|
||||
UnicodeString& result) const;
|
||||
|
||||
/**
|
||||
* Transliterates an entire string in place. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
*/
|
||||
virtual void transliterate(Replaceable& text) const;
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after new text has been inserted,
|
||||
* typically as a result of a keyboard event. The new text in
|
||||
* <code>insertion</code> will be inserted into <code>text</code>
|
||||
* at <code>index[LIMIT]</code>, advancing
|
||||
* <code>index[LIMIT]</code> by <code>insertion.length()</code>.
|
||||
* Then the transliterator will try to transliterate characters of
|
||||
* <code>text</code> between <code>index[CURSOR]</code> and
|
||||
* <code>index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed.
|
||||
*
|
||||
* <p>Upon return, values in <code>index[]</code> will be updated.
|
||||
* <code>index[START]</code> will be advanced to the first
|
||||
* character that future calls to this method will read.
|
||||
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code> will
|
||||
* be adjusted to delimit the range of text that future calls to
|
||||
* this method may change.
|
||||
*
|
||||
* <p>Typical usage of this method begins with an initial call
|
||||
* with <code>index[START]</code> and <code>index[LIMIT]</code>
|
||||
* set to indicate the portion of <code>text</code> to be
|
||||
* transliterated, and <code>index[CURSOR] == index[START]</code>.
|
||||
* Thereafter, <code>index[]</code> can be used without
|
||||
* modification in future calls, provided that all changes to
|
||||
* <code>text</code> are made via this method.
|
||||
*
|
||||
* <p>This method assumes that future calls may be made that will
|
||||
* insert new text into the buffer. As a result, it only performs
|
||||
* unambiguous transliterations. After the last call to this
|
||||
* method, there may be untransliterated text that is waiting for
|
||||
* more input to resolve an ambiguity. In order to perform these
|
||||
* pending transliterations, clients should call {@link
|
||||
* #finishKeyboardTransliteration} after the last call to this
|
||||
* method has been made.
|
||||
*
|
||||
* @param text the buffer holding transliterated and untransliterated text
|
||||
* @param index an array of three integers.
|
||||
*
|
||||
* <ul><li><code>index[START]</code>: the beginning index,
|
||||
* inclusive; <code>0 <= index[START] <= index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[LIMIT]</code>: the ending index, exclusive;
|
||||
* <code>index[START] <= index[LIMIT] <= text.length()</code>.
|
||||
* <code>insertion</code> is inserted at
|
||||
* <code>index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[CURSOR]</code>: the next character to be
|
||||
* considered for transliteration; <code>index[START] <=
|
||||
* index[CURSOR] <= index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed by future calls
|
||||
* to this method.</ul>
|
||||
*
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>. If <code>null</code> then no text
|
||||
* is inserted.
|
||||
* @see #START
|
||||
* @see #LIMIT
|
||||
* @see #CURSOR
|
||||
* @see #handleKeyboardTransliterate
|
||||
* @exception IllegalArgumentException if <code>index[]</code>
|
||||
* is invalid
|
||||
*/
|
||||
virtual void keyboardTransliterate(Replaceable& text,
|
||||
int32_t index[3],
|
||||
const UnicodeString& insertion,
|
||||
UErrorCode& status) const;
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after a new character has been
|
||||
* inserted, typically as a result of a keyboard event. This is a
|
||||
* convenience method; see {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)} for details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
virtual void keyboardTransliterate(Replaceable& text, int32_t index[3],
|
||||
UChar insertion,
|
||||
UErrorCode& status) const;
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously. This is a convenience method; see
|
||||
* {@link #keyboardTransliterate(Replaceable, int[], String)} for
|
||||
* details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
virtual void keyboardTransliterate(Replaceable& text, int32_t index[3],
|
||||
UErrorCode& status) const;
|
||||
|
||||
/**
|
||||
* Finishes any pending transliterations that were waiting for
|
||||
* more characters. Clients should call this method as the last
|
||||
* call after a sequence of one or more calls to
|
||||
* <code>keyboardTransliterate()</code>.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text.
|
||||
* @param index the array of indices previously passed to {@link
|
||||
* #keyboardTransliterate}
|
||||
*/
|
||||
virtual void finishKeyboardTransliteration(Replaceable& text,
|
||||
int32_t index[3]) const;
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* This internal method does keyboard transliteration. If the
|
||||
* 'insertion' is non-null then we append it to 'text' before
|
||||
* proceeding. This method calls through to the pure virtual
|
||||
* framework method handleKeyboardTransliterate() to do the actual
|
||||
* work.
|
||||
*/
|
||||
void _keyboardTransliterate(Replaceable& text,
|
||||
int32_t index[3],
|
||||
const UnicodeString* insertion,
|
||||
UErrorCode &status) const;
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* Abstract method that concrete subclasses define to implement
|
||||
* keyboard transliteration. This method should transliterate all
|
||||
* characters between <code>index[CURSOR]</code> and
|
||||
* <code>index[LIMIT]</code> that can be unambiguously
|
||||
* transliterated, regardless of future insertions of text at
|
||||
* <code>index[LIMIT]</code>. <code>index[CURSOR]</code> should
|
||||
* be advanced past committed characters (those that will not
|
||||
* change in future calls to this method).
|
||||
* <code>index[LIMIT]</code> should be updated to reflect text
|
||||
* replacements that shorten or lengthen the text between
|
||||
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code>. Upon
|
||||
* return, neither <code>index[CURSOR]</code> nor
|
||||
* <code>index[LIMIT]</code> should be less than the initial value
|
||||
* of <code>index[CURSOR]</code>. <code>index[START]</code>
|
||||
* should <em>not</em> be changed.
|
||||
*
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
virtual void handleKeyboardTransliterate(Replaceable& text,
|
||||
int32_t index[3]) const = 0;
|
||||
|
||||
// C++ requires this friend declaration so CompoundTransliterator
|
||||
// can access handleKeyboardTransliterate. Alternatively, we could
|
||||
// make handleKeyboardTransliterate public.
|
||||
friend class CompoundTransliterator;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context. The default implementation supplied
|
||||
* by <code>Transliterator</code> returns zero; subclasses
|
||||
* that use preceding context should override this method to return the
|
||||
* correct value. For example, if a transliterator translates "ddd" (where
|
||||
* d is any digit) to "555" when preceded by "(ddd)", then the preceding
|
||||
* context length is 5, the length of "(ddd)".
|
||||
*
|
||||
* @return The maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
virtual int32_t getMaximumContextLength() const;
|
||||
|
||||
/**
|
||||
* Returns a programmatic identifier for this transliterator.
|
||||
* If this identifier is passed to <code>getInstance()</code>, it
|
||||
* will return this object, if it has been registered.
|
||||
* @see #registerInstance
|
||||
* @see #registerClass
|
||||
* @see #getAvailableIDs
|
||||
*/
|
||||
virtual const UnicodeString& getID() const;
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the default locale. See {@link
|
||||
* #getDisplayName(Locale)} for details.
|
||||
*/
|
||||
virtual UnicodeString& getDisplayName(UnicodeString& result) const;
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the given locale. This name is taken
|
||||
* from the locale resource data in the standard manner of the
|
||||
* <code>java.text</code> package.
|
||||
*
|
||||
* <p>If no localized names exist in the system resource bundles,
|
||||
* a name is synthesized using a localized
|
||||
* <code>MessageFormat</code> pattern from the resource data. The
|
||||
* arguments to this pattern are an integer followed by one or two
|
||||
* strings. The integer is the number of strings, either 1 or 2.
|
||||
* The strings are formed by splitting the ID for this
|
||||
* transliterator at the first '-'. If there is no '-', then the
|
||||
* entire ID forms the only string.
|
||||
* @param inLocale the Locale in which the display name should be
|
||||
* localized.
|
||||
* @see java.text.MessageFormat
|
||||
*/
|
||||
virtual UnicodeString& getDisplayName(const Locale& inLocale,
|
||||
UnicodeString& result) const;
|
||||
|
||||
/**
|
||||
* Returns the filter used by this transliterator, or <tt>null</tt>
|
||||
* if this transliterator uses no filter.
|
||||
*/
|
||||
virtual const UnicodeFilter* getFilter() const;
|
||||
|
||||
/**
|
||||
* Changes the filter used by this transliterator. If the filter
|
||||
* is set to <tt>null</tt> then no filtering will occur.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The filter should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
*/
|
||||
virtual void adoptFilter(UnicodeFilter* adoptedFilter);
|
||||
|
||||
/**
|
||||
* Returns this transliterator's inverse. See the class
|
||||
* documentation for details. This implementation simply inverts
|
||||
* the two entities in the ID and attempts to retrieve the
|
||||
* resulting transliterator. That is, if <code>getID()</code>
|
||||
* returns "A-B", then this method will return the result of
|
||||
* <code>getInstance("B-A")</code>, or <code>null</code> if that
|
||||
* call fails.
|
||||
*
|
||||
* <p>This method does not take filtering into account. The
|
||||
* returned transliterator will have no filter.
|
||||
*
|
||||
* <p>Subclasses with knowledge of their inverse may wish to
|
||||
* override this method.
|
||||
*
|
||||
* @return a transliterator that is an inverse, not necessarily
|
||||
* exact, of this transliterator, or <code>null</code> if no such
|
||||
* transliterator is registered.
|
||||
* @see #registerInstance
|
||||
*/
|
||||
virtual Transliterator* createInverse() const;
|
||||
|
||||
/**
|
||||
* Returns a <code>Transliterator</code> object given its ID.
|
||||
* The ID must be either a system transliterator ID or an ID registered
|
||||
* using <code>registerInstance()</code>.
|
||||
*
|
||||
* @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
|
||||
* @return A <code>Transliterator</code> object with the given ID
|
||||
* @exception IllegalArgumentException if the given ID is invalid.
|
||||
* @see #registerInstance
|
||||
* @see #getAvailableIDs
|
||||
* @see #getID
|
||||
*/
|
||||
static Transliterator* createInstance(const UnicodeString& ID);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* This is the path to the subdirectory within the locale data
|
||||
* directory that contains the rule-based transliterator resource
|
||||
* bundle files. This is constructed dynamically the first time
|
||||
* Transliterator::getDataDirectory() is called.
|
||||
*/
|
||||
static char* DATA_DIR;
|
||||
|
||||
/**
|
||||
* This is the name of a subdirectory within the locale data directory
|
||||
* that contains the rule-based transliterator resource bundle files.
|
||||
*/
|
||||
static const char* RESOURCE_SUB_DIR;
|
||||
|
||||
/**
|
||||
* Returns the directory in which the transliterator resource bundle
|
||||
* files are located. This is a subdirectory, named RESOURCE_SUB_DIR,
|
||||
* under Locale::getDataDirectory(). It ends in a path separator.
|
||||
*/
|
||||
static const char* getDataDirectory();
|
||||
|
||||
static int32_t hash(const UnicodeString& str);
|
||||
|
||||
/**
|
||||
* Returns a transliterator object given its ID. Unlike getInstance(),
|
||||
* this method returns null if it cannot make use of the given ID.
|
||||
*/
|
||||
static Transliterator* _createInstance(const UnicodeString& ID);
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Registers an instance <tt>adoptedObj</tt> of a subclass of
|
||||
* <code>Transliterator</code> with the system. When
|
||||
* <tt>createInstance()</tt> is called with an ID string that is
|
||||
* equal to <tt>adoptedObj->getID()</tt>, then
|
||||
* <tt>adoptedObj->clone()</tt> is returned.
|
||||
*
|
||||
* After this call the Transliterator class owns the adoptedObj
|
||||
* and will delete it.
|
||||
*
|
||||
* @param adoptedObj an instance of a subclass of
|
||||
* <code>Transliterator</code> that defines <tt>clone()</tt>
|
||||
* @see #getInstance
|
||||
* @see #registerClass
|
||||
* @see #unregister
|
||||
*/
|
||||
static void registerInstance(Transliterator* adoptedObj,
|
||||
UErrorCode& status);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* This internal method registers a prototype instance in the cache.
|
||||
* The CALLER MUST MUTEX using cacheMutex before calling this method.
|
||||
*/
|
||||
static void _registerInstance(Transliterator* adoptedPrototype,
|
||||
UErrorCode &status);
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Unregisters a transliterator or class. This may be either
|
||||
* a system transliterator or a user transliterator or class.
|
||||
*
|
||||
* @param ID the ID of the transliterator or class
|
||||
* <p>This method is declared <code>void</code>: whatever was registered
|
||||
* under <code>ID</code>, if anything, is not returned to the caller.
|
||||
* @see #registerInstance
|
||||
* @see #registerClass
|
||||
*/
|
||||
static void unregister(const UnicodeString& ID);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Unregisters a transliterator or class. Internal method.
|
||||
* Prerequisites: The cache must be initialized, and the
|
||||
* caller must own the cacheMutex.
|
||||
*/
|
||||
static void _unregister(const UnicodeString& ID);
|
||||
|
||||
/**
|
||||
* Returns an enumeration over the programmatic names of registered
|
||||
* <code>Transliterator</code> objects. This includes both system
|
||||
* transliterators and user transliterators registered using
|
||||
* <code>registerInstance()</code>. The enumerated names may be
|
||||
* passed to <code>getInstance()</code>.
|
||||
*
|
||||
* @return An <code>Enumeration</code> over <code>String</code> objects
|
||||
* @see #getInstance
|
||||
* @see #registerInstance
|
||||
*/
|
||||
// virtual Enumeration getAvailableIDs();
|
||||
|
||||
/**
|
||||
* Vector of registered IDs.
|
||||
*/
|
||||
static UVector cacheIDs;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Return the number of IDs currently registered with the system.
|
||||
* To retrieve the actual IDs, call getAvailableID(i) with
|
||||
* i from 0 to countAvailableIDs() - 1.
|
||||
*/
|
||||
static int32_t countAvailableIDs();
|
||||
|
||||
/**
|
||||
* Return the index-th available ID. index must be between 0
|
||||
* and countAvailableIDs() - 1, inclusive. If index is out of
|
||||
* range, the result of getAvailableID(0) is returned.
|
||||
*/
|
||||
static const UnicodeString& getAvailableID(int32_t index);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Comparison function for UVector. Compares two UnicodeString
|
||||
* objects given void* pointers to them.
|
||||
*/
|
||||
static bool_t compareIDs(void* a, void* b);
|
||||
|
||||
static void initializeCache();
|
||||
};
|
||||
|
||||
#endif
|
51
icu4c/source/i18n/unifilt.h
Normal file
51
icu4c/source/i18n/unifilt.h
Normal file
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef UNIFILT_H
|
||||
#define UNIFILT_H
|
||||
|
||||
/**
|
||||
* <code>UnicodeFilter</code> defines a protocol for selecting a
|
||||
* subset of the full range (U+0000 to U+FFFF) of Unicode characters.
|
||||
* Currently, filters are used in conjunction with classes like {@link
|
||||
* Transliterator} to only process selected characters through a
|
||||
* transformation.
|
||||
*
|
||||
* @see UnicodeFilterLogic
|
||||
*/
|
||||
class U_I18N_API UnicodeFilter {
|
||||
|
||||
public:
|
||||
|
||||
virtual ~UnicodeFilter();
|
||||
|
||||
/**
|
||||
* Returns <tt>true</tt> for characters that are in the selected
|
||||
* subset. In other words, if a character is <b>to be
|
||||
* filtered</b>, then <tt>isIn()</tt> returns
|
||||
* <b><tt>false</tt></b>.
|
||||
*/
|
||||
virtual bool_t isIn(UChar c) const = 0;
|
||||
|
||||
/**
|
||||
* Returns a copy of this object. All UnicodeFilter objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeFilters, such as Transliterator, to implement cloning.
|
||||
*/
|
||||
virtual UnicodeFilter* clone() const = 0;
|
||||
|
||||
protected:
|
||||
|
||||
UnicodeFilter();
|
||||
};
|
||||
|
||||
inline UnicodeFilter::UnicodeFilter() {}
|
||||
inline UnicodeFilter::~UnicodeFilter() {}
|
||||
|
||||
#endif
|
139
icu4c/source/i18n/unifltlg.cpp
Normal file
139
icu4c/source/i18n/unifltlg.cpp
Normal file
|
@ -0,0 +1,139 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "unifltlg.h"
|
||||
#include "unifilt.h"
|
||||
|
||||
class UnicodeNotFilter : public UnicodeFilter {
|
||||
UnicodeFilter* filt;
|
||||
public:
|
||||
UnicodeNotFilter(UnicodeFilter* adopted);
|
||||
UnicodeNotFilter(const UnicodeNotFilter&);
|
||||
virtual ~UnicodeNotFilter();
|
||||
virtual bool_t isIn(UChar c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
};
|
||||
|
||||
UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}
|
||||
UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f) : filt(f.filt->clone()) {}
|
||||
UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }
|
||||
bool_t UnicodeNotFilter::isIn(UChar c) const { return !filt->isIn(c); }
|
||||
UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements the inverse of
|
||||
* the given filter.
|
||||
*/
|
||||
UnicodeFilter* UnicodeFilterLogic::createNot(const UnicodeFilter& f) {
|
||||
return new UnicodeNotFilter(f.clone());
|
||||
}
|
||||
|
||||
class UnicodeAndFilter : public UnicodeFilter {
|
||||
UnicodeFilter* filt1;
|
||||
UnicodeFilter* filt2;
|
||||
public:
|
||||
UnicodeAndFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
|
||||
UnicodeAndFilter(const UnicodeAndFilter&);
|
||||
virtual ~UnicodeAndFilter();
|
||||
virtual bool_t isIn(UChar c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
};
|
||||
|
||||
UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}
|
||||
UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f) :
|
||||
filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
|
||||
UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; }
|
||||
bool_t UnicodeAndFilter::isIn(UChar c) const { return filt1->isIn(c) && filt2->isIn(c); }
|
||||
UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the two given filters. That is,
|
||||
* if <tt>f.isIn()</tt> is <tt>false</tt>, then <tt>g.isIn()</tt>
|
||||
* is not called, and <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
UnicodeFilter* UnicodeFilterLogic::createAnd(const UnicodeFilter& f,
|
||||
const UnicodeFilter& g) {
|
||||
return new UnicodeAndFilter(f.clone(), g.clone());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*/
|
||||
//!UnicodeFilter* UnicodeFilterLogic::and(const UnicodeFilter** f) {
|
||||
//! return new UnicodeFilter() {
|
||||
//! public bool_t isIn(UChar c) {
|
||||
//! for (int32_t i=0; i<f.length; ++i) {
|
||||
//! if (!f[i].isIn(c)) {
|
||||
//! return FALSE;
|
||||
//! }
|
||||
//! }
|
||||
//! return TRUE;
|
||||
//! }
|
||||
//! };
|
||||
//!}
|
||||
|
||||
class UnicodeOrFilter : public UnicodeFilter {
|
||||
UnicodeFilter* filt1;
|
||||
UnicodeFilter* filt2;
|
||||
public:
|
||||
UnicodeOrFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
|
||||
UnicodeOrFilter(const UnicodeOrFilter&);
|
||||
virtual ~UnicodeOrFilter();
|
||||
virtual bool_t isIn(UChar c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
};
|
||||
|
||||
UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}
|
||||
UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f) :
|
||||
filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
|
||||
UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; }
|
||||
bool_t UnicodeOrFilter::isIn(UChar c) const { return filt1->isIn(c) || filt2->isIn(c); }
|
||||
UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the two given filters. That is, if
|
||||
* <tt>f.isIn()</tt> is <tt>true</tt>, then <tt>g.isIn()</tt> is
|
||||
* not called, and <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
UnicodeFilter* UnicodeFilterLogic::createOr(const UnicodeFilter& f,
|
||||
const UnicodeFilter& g) {
|
||||
return new UnicodeOrFilter(f.clone(), g.clone());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>true</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*/
|
||||
//!UnicodeFilter* UnicodeFilterLogic::or(const UnicodeFilter** f) {
|
||||
//! return new UnicodeFilter() {
|
||||
//! public bool_t isIn(UChar c) {
|
||||
//! for (int32_t i=0; i<f.length; ++i) {
|
||||
//! if (f[i].isIn(c)) {
|
||||
//! return TRUE;
|
||||
//! }
|
||||
//! }
|
||||
//! return FALSE;
|
||||
//! }
|
||||
//! };
|
||||
//!}
|
||||
|
||||
// TODO: Add nand() & nor() for convenience, if needed.
|
84
icu4c/source/i18n/unifltlg.h
Normal file
84
icu4c/source/i18n/unifltlg.h
Normal file
|
@ -0,0 +1,84 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef UNIFLTLG_H
|
||||
#define UNIFLTLG_H
|
||||
|
||||
#include "utypes.h"
|
||||
|
||||
class UnicodeFilter;
|
||||
|
||||
/**
|
||||
* <code>UnicodeFilterLogic</code> provides logical operators on
|
||||
* {@link UnicodeFilter} objects. This class cannot be instantiated;
|
||||
* it consists only of static methods. The static methods return
|
||||
* filter objects that perform logical inversion (<tt>not</tt>),
|
||||
* intersection (<tt>and</tt>), or union (<tt>or</tt>) of the given
|
||||
* filter objects.
|
||||
*/
|
||||
class U_I18N_API UnicodeFilterLogic {
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements the inverse of
|
||||
* the given filter.
|
||||
*/
|
||||
static UnicodeFilter* createNot(const UnicodeFilter& f);
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the two given filters. That is,
|
||||
* if <tt>f.isIn()</tt> is <tt>false</tt>, then <tt>g.isIn()</tt>
|
||||
* is not called, and <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
static UnicodeFilter* createAnd(const UnicodeFilter& f,
|
||||
const UnicodeFilter& g);
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*/
|
||||
// static UnicodeFilter* and(const UnicodeFilter** f);
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the two given filters. That is, if
|
||||
* <tt>f.isIn()</tt> is <tt>true</tt>, then <tt>g.isIn()</tt> is
|
||||
* not called, and <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
static UnicodeFilter* createOr(const UnicodeFilter& f,
|
||||
const UnicodeFilter& g);
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*/
|
||||
// static UnicodeFilter* or(const UnicodeFilter** f);
|
||||
|
||||
// TODO: Add nand() & nor() for convenience, if needed.
|
||||
|
||||
private:
|
||||
// Disallow instantiation
|
||||
UnicodeFilterLogic();
|
||||
};
|
||||
|
||||
inline UnicodeFilterLogic::UnicodeFilterLogic() {}
|
||||
|
||||
#endif
|
108
icu4c/source/i18n/unirange.cpp
Normal file
108
icu4c/source/i18n/unirange.cpp
Normal file
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "unirange.h"
|
||||
#include "uvector.h"
|
||||
#include "unistr.h"
|
||||
|
||||
/**
 * Constructs a range that begins at theStart and spans theLength
 * characters.
 */
UnicodeRange::UnicodeRange(UChar theStart, int32_t theLength)
    : start(theStart), length(theLength) {
}
|
||||
|
||||
/**
 * Returns a newly allocated range with the same start and length.
 *
 * CALLER OWNS RESULT.
 */
UnicodeRange* UnicodeRange::clone() const {
    UnicodeRange* copy = new UnicodeRange(start, length);
    return copy;
}
|
||||
|
||||
/**
|
||||
* CALLER OWNS RESULT.
|
||||
*/
|
||||
bool_t UnicodeRange::contains(UChar c) const {
|
||||
return c >= start && (c - start) < length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assume that contains(c) is true. Split this range into two new
|
||||
* ranges around the character c. Make this range one of the new ranges
|
||||
* (modify it in place) and return the other new range. The character
|
||||
* itself is not included in either range. If the split results in an
|
||||
* empty range (that is, if c == start or c == start + length - 1) then
|
||||
* return null.
|
||||
*
|
||||
* MODIFIES THIS RANGE IN PLACE.
|
||||
*
|
||||
* CALLER OWNS RESULT.
|
||||
*/
|
||||
UnicodeRange* UnicodeRange::split(UChar c) {
|
||||
if (c == start) {
|
||||
++start;
|
||||
--length;
|
||||
return 0;
|
||||
} else if (c - start == length - 1) {
|
||||
--length;
|
||||
return 0;
|
||||
} else {
|
||||
++c;
|
||||
UnicodeRange* r = new UnicodeRange(c, start + length - c);
|
||||
length = --c - start;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the largest unused subrange by the given string. A
|
||||
* subrange is unused by a string if the string contains no
|
||||
* characters in that range. If the given string contains no
|
||||
* characters in this range, then this range itself is
|
||||
* returned.
|
||||
*
|
||||
* CALLER OWNS RESULT.
|
||||
*/
|
||||
UnicodeRange*
|
||||
UnicodeRange::largestUnusedSubrange(const UnicodeString& str) const {
|
||||
int32_t n = str.length();
|
||||
|
||||
UVector v;
|
||||
v.setDeleter(UnicodeRange::deleter);
|
||||
v.addElement(clone());
|
||||
for (int32_t i=0; i<n; ++i) {
|
||||
UChar c = str.charAt(i);
|
||||
if (contains(c)) {
|
||||
for (int32_t j=0; j<v.size(); ++j) {
|
||||
UnicodeRange* r = (UnicodeRange*) v.elementAt(j);
|
||||
if (r->contains(c)) {
|
||||
r = r->split(c);
|
||||
if (r != 0) {
|
||||
v.addElement(r);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeRange* bestRange = 0;
|
||||
int32_t ibest = -1;
|
||||
for (int32_t j=0; j<v.size(); ++j) {
|
||||
UnicodeRange* r = (UnicodeRange*) v.elementAt(j);
|
||||
if (bestRange == 0 || r->length > bestRange->length) {
|
||||
bestRange = r;
|
||||
ibest = j;
|
||||
}
|
||||
}
|
||||
|
||||
v.orphanElementAt(ibest); // So bestRange doesn't get deleted
|
||||
|
||||
return bestRange;
|
||||
}
|
||||
|
||||
// For UVector of UnicodeRange* objects
|
||||
void UnicodeRange::deleter(void* e) {
|
||||
delete (UnicodeRange*) e;
|
||||
}
|
79
icu4c/source/i18n/unirange.h
Normal file
79
icu4c/source/i18n/unirange.h
Normal file
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef UNIRANGE_H
|
||||
#define UNIRANGE_H
|
||||
|
||||
#include "utypes.h"
|
||||
|
||||
class UnicodeString;
|
||||
|
||||
/**
|
||||
* %%% INTERNAL CLASS USED BY RuleBasedTransliterator %%%
|
||||
*
|
||||
* A range of Unicode characters. Support the operations of testing for
|
||||
* inclusion (does this range contain this character?) and splitting.
|
||||
* Splitting involves breaking a range into two smaller ranges around a
|
||||
* character inside the original range. The split character is not included
|
||||
* in either range. If the split character is at either extreme end of the
|
||||
* range, one of the split products is an empty range.
|
||||
*
|
||||
* This class is used internally to determine the largest available private
|
||||
* use character range for variable stand-ins.
|
||||
*/
|
||||
class UnicodeRange {
|
||||
|
||||
public:
|
||||
|
||||
UChar start;
|
||||
|
||||
int32_t length;
|
||||
|
||||
UnicodeRange(UChar start, int32_t length);
|
||||
|
||||
/**
|
||||
* CALLER OWNS RESULT.
|
||||
*/
|
||||
UnicodeRange* clone() const;
|
||||
|
||||
bool_t contains(UChar c) const;
|
||||
|
||||
/**
|
||||
* Assume that contains(c) is true. Split this range into two new
|
||||
* ranges around the character c. Make this range one of the new ranges
|
||||
* (modify it in place) and return the other new range. The character
|
||||
* itself is not included in either range. If the split results in an
|
||||
* empty range (that is, if c == start or c == start + length - 1) then
|
||||
* return null.
|
||||
*
|
||||
* MODIFIES THIS RANGE IN PLACE.
|
||||
*
|
||||
* CALLER OWNS RESULT.
|
||||
*/
|
||||
UnicodeRange* split(UChar c);
|
||||
|
||||
/**
|
||||
* Finds the largest subrange of this range that is unused by the
|
||||
* given string. A subrange is unused by a string if the string
|
||||
* contains no characters in that range. If the given string
|
||||
* contains no characters in this range, then this range itself is
|
||||
* returned.
|
||||
*
|
||||
* CALLER OWNS RESULT.
|
||||
*/
|
||||
UnicodeRange* largestUnusedSubrange(const UnicodeString& str) const;
|
||||
|
||||
private:
|
||||
|
||||
// For UVector of UnicodeRange* objects
|
||||
static void deleter(void*);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
204
icu4c/source/i18n/unitohex.cpp
Normal file
204
icu4c/source/i18n/unitohex.cpp
Normal file
|
@ -0,0 +1,204 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "unitohex.h"
|
||||
#include "rep.h"
|
||||
#include "unifilt.h"
|
||||
|
||||
/**
|
||||
* ID for this transliterator.
|
||||
*/
|
||||
const char* UnicodeToHexTransliterator::_ID = "Unicode-Hex";
|
||||
|
||||
const char* UnicodeToHexTransliterator::DEFAULT_PREFIX = "\\u";
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
* @param prefix the string that will precede the four hex
|
||||
* digits for UNICODE_HEX transliterators. Ignored
|
||||
* if direction is HEX_UNICODE.
|
||||
* @param uppercase if true, the four hex digits will be
|
||||
* converted to uppercase; otherwise they will be lowercase.
|
||||
* Ignored if direction is HEX_UNICODE.
|
||||
*/
|
||||
UnicodeToHexTransliterator::UnicodeToHexTransliterator(
|
||||
const UnicodeString& hexPrefix,
|
||||
bool_t isUppercase,
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(_ID, adoptedFilter),
|
||||
prefix(hexPrefix),
|
||||
uppercase(isUppercase) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default prefix "\u"
|
||||
* that outputs uppercase hex digits.
|
||||
*/
|
||||
UnicodeToHexTransliterator::UnicodeToHexTransliterator(
|
||||
UnicodeFilter* adoptedFilter) :
|
||||
Transliterator(_ID, adoptedFilter),
|
||||
prefix(DEFAULT_PREFIX),
|
||||
uppercase(TRUE) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
UnicodeToHexTransliterator::UnicodeToHexTransliterator(
|
||||
const UnicodeToHexTransliterator& other) :
|
||||
Transliterator(other), prefix(other.prefix),
|
||||
uppercase(other.uppercase) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
UnicodeToHexTransliterator&
|
||||
UnicodeToHexTransliterator::operator=(const UnicodeToHexTransliterator& other) {
|
||||
Transliterator::operator=(other);
|
||||
prefix = other.prefix;
|
||||
uppercase = other.uppercase;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
 * Transliterator API.  Returns a newly allocated copy of this object,
 * made with the copy constructor.
 */
Transliterator*
UnicodeToHexTransliterator::clone() const {
    UnicodeToHexTransliterator* copy = new UnicodeToHexTransliterator(*this);
    return copy;
}
|
||||
|
||||
/**
|
||||
* Returns the string that precedes the four hex digits.
|
||||
* @return prefix string
|
||||
*/
|
||||
const UnicodeString& UnicodeToHexTransliterator::getPrefix() const {
|
||||
return prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the string that precedes the four hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The prefix should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
* @param prefix prefix string
|
||||
*/
|
||||
void UnicodeToHexTransliterator::setPrefix(const UnicodeString& hexPrefix) {
|
||||
prefix = hexPrefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this transliterator outputs uppercase hex digits.
|
||||
*/
|
||||
bool_t UnicodeToHexTransliterator::isUppercase() const {
|
||||
return uppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets if this transliterator outputs uppercase hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The uppercase mode should not be changed by
|
||||
* one thread while another thread may be transliterating.
|
||||
* @param outputUppercase if true, then this transliterator
|
||||
* outputs uppercase hex digits.
|
||||
*/
|
||||
void UnicodeToHexTransliterator::setUppercase(bool_t outputUppercase) {
|
||||
uppercase = outputUppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
int32_t UnicodeToHexTransliterator::transliterate(Replaceable& text,
|
||||
int32_t start,
|
||||
int32_t limit) const {
|
||||
int32_t offsets[3] = { start, limit, start };
|
||||
handleKeyboardTransliterate(text, offsets);
|
||||
return offsets[LIMIT];
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
void UnicodeToHexTransliterator::handleKeyboardTransliterate(Replaceable& text,
|
||||
int32_t offsets[3]) const {
|
||||
/**
|
||||
* Performs transliteration changing all characters to
|
||||
* Unicode hexadecimal escapes. For example, '@' -> "U+0040",
|
||||
* assuming the prefix is "U+".
|
||||
*/
|
||||
int32_t cursor = offsets[CURSOR];
|
||||
int32_t limit = offsets[LIMIT];
|
||||
|
||||
const UnicodeFilter* filter = getFilter();
|
||||
UnicodeString hex;
|
||||
|
||||
while (cursor < limit) {
|
||||
UChar c = text.charAt(cursor);
|
||||
if (filter != 0 && !filter->isIn(c)) {
|
||||
++cursor;
|
||||
continue;
|
||||
}
|
||||
toHex(hex, c);
|
||||
text.handleReplaceBetween(cursor, cursor+1, hex);
|
||||
int32_t len = hex.length();
|
||||
cursor += len; // Advance cursor by 1 and adjust for new text
|
||||
--len;
|
||||
limit += len;
|
||||
}
|
||||
|
||||
offsets[LIMIT] = limit;
|
||||
offsets[CURSOR] = cursor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
int32_t UnicodeToHexTransliterator::getMaximumContextLength() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Hex digit lookup table used by itoh().  Entries 0..15 are the
 * lowercase digits, entries 16..31 the uppercase digits.
 *
 * The entries are spelled as Unicode code points, not as character
 * literals.  A character literal takes its value from the compiler's
 * execution character set, so on a non-ASCII host (e.g. EBCDIC) '0'
 * would not be U+0030.  On ASCII hosts the values are unchanged.
 */
UChar UnicodeToHexTransliterator::HEX_DIGITS[32] = {
    // '0'..'7'
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    // '8', '9', 'a'..'f'
    0x0038, 0x0039, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066,
    // '0'..'7'
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    // '8', '9', 'A'..'F'
    0x0038, 0x0039, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046
};
|
||||
|
||||
/**
|
||||
* Given an integer, return its least significant hex digit.
|
||||
*/
|
||||
UChar UnicodeToHexTransliterator::itoh(int32_t i) const {
|
||||
i &= 0xF;
|
||||
return HEX_DIGITS[uppercase ? (i|16) : i];
|
||||
}
|
||||
|
||||
/**
|
||||
* Form escape sequence.
|
||||
*/
|
||||
UnicodeString& UnicodeToHexTransliterator::toHex(UnicodeString& result,
|
||||
UChar c) const {
|
||||
result = prefix;
|
||||
result.append(itoh(c >> 12));
|
||||
result.append(itoh(c >> 8));
|
||||
result.append(itoh(c >> 4));
|
||||
result.append(itoh(c));
|
||||
return result;
|
||||
}
|
157
icu4c/source/i18n/unitohex.h
Normal file
157
icu4c/source/i18n/unitohex.h
Normal file
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef UNITOHEX_H
|
||||
#define UNITOHEX_H
|
||||
|
||||
#include "translit.h"
|
||||
#include "unistr.h"
|
||||
|
||||
class UnicodeFilter;
|
||||
|
||||
/**
|
||||
* A transliterator that converts from Unicode characters to
|
||||
* hexadecimal Unicode escape sequences. It outputs a
|
||||
* prefix specified in the constructor and optionally converts the hex
|
||||
* digits to uppercase.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class U_I18N_API UnicodeToHexTransliterator : public Transliterator {
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* ID for this transliterator.
|
||||
*/
|
||||
static const char* _ID;
|
||||
|
||||
static const char* DEFAULT_PREFIX;
|
||||
|
||||
UnicodeString prefix;
|
||||
|
||||
bool_t uppercase;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
* @param prefix the string that will precede the four hex
|
||||
* digits for UNICODE_HEX transliterators. Ignored
|
||||
* if direction is HEX_UNICODE.
|
||||
* @param uppercase if true, the four hex digits will be
|
||||
* converted to uppercase; otherwise they will be lowercase.
|
||||
* Ignored if direction is HEX_UNICODE.
|
||||
*/
|
||||
UnicodeToHexTransliterator(const UnicodeString& hexPrefix,
|
||||
bool_t isUppercase,
|
||||
UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default prefix "\u"
|
||||
* that outputs uppercase hex digits.
|
||||
*/
|
||||
UnicodeToHexTransliterator(UnicodeFilter* adoptedFilter = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~UnicodeToHexTransliterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
UnicodeToHexTransliterator(const UnicodeToHexTransliterator&);
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
UnicodeToHexTransliterator& operator=(const UnicodeToHexTransliterator&);
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
virtual Transliterator* clone() const;
|
||||
|
||||
/**
|
||||
* Returns the string that precedes the four hex digits.
|
||||
* @return prefix string
|
||||
*/
|
||||
virtual const UnicodeString& getPrefix() const;
|
||||
|
||||
/**
|
||||
* Sets the string that precedes the four hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The prefix should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
* @param prefix prefix string
|
||||
*/
|
||||
virtual void setPrefix(const UnicodeString& prefix);
|
||||
|
||||
/**
|
||||
* Returns true if this transliterator outputs uppercase hex digits.
|
||||
*/
|
||||
virtual bool_t isUppercase() const;
|
||||
|
||||
/**
|
||||
* Sets if this transliterator outputs uppercase hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The uppercase mode should not be changed by
|
||||
* one thread while another thread may be transliterating.
|
||||
* @param outputUppercase if true, then this transliterator
|
||||
* outputs uppercase hex digits.
|
||||
*/
|
||||
virtual void setUppercase(bool_t outputUppercase);
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
virtual int32_t transliterate(Replaceable& text, int32_t start, int32_t limit) const;
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
virtual void handleKeyboardTransliterate(Replaceable& text,
|
||||
int32_t offsets[3]) const;
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
virtual int32_t getMaximumContextLength();
|
||||
|
||||
private:
|
||||
|
||||
static UChar HEX_DIGITS[32];
|
||||
|
||||
/**
|
||||
* Given an integer, return its least significant hex digit.
|
||||
*/
|
||||
UChar itoh(int32_t i) const;
|
||||
|
||||
/**
|
||||
* Form escape sequence.
|
||||
*/
|
||||
UnicodeString& toHex(UnicodeString& result, UChar c) const;
|
||||
};
|
||||
|
||||
inline UnicodeToHexTransliterator::~UnicodeToHexTransliterator() {}
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue