ICU-114 Transliterator framework first working version

X-SVN-Rev: 194
This commit is contained in:
Alan Liu 1999-11-20 00:40:50 +00:00
parent a2f31432aa
commit bd14077b79
35 changed files with 14712 additions and 1 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,128 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// KeyboardEscape-Latin1
kbdescl1 {
Rule {
// Keyboard-escape table for the Latin-1 supplement (U+00A0..U+00FF).
// The user types an ASCII mnemonic followed by the escape key {esc};
// the whole key sequence is replaced by a single Latin-1 character.
//
// Variable definitions: the ASCII key standing for each accent/modifier.
// {esc} and {acute} are both the apostrophe (written '' here; single
// apostrophes are used further down to quote literals such as '|').
"esc=''\n"
"grave=`\n"
"acute=''\n"
"hat=^\n"
"tilde=~\n"
"umlaut=:\n"
"ring=.\n"
"cedilla=,\n"
"slash=/\n"
"super=^\n"
// Make keyboard entry of {esc} possible
// and of backslash
"'\\'{esc}>{esc}\n"
"'\\\\'>'\\'\n"
// Long keys
"cur{esc}>\u00A4\n"
"sec{esc}>\u00A7\n"
"not{esc}>\u00AC\n"
"mul{esc}>\u00D7\n"
"div{esc}>\u00F7\n"
// Symbols and punctuation
" {esc}>\u00A0\n" // non-breaking space
"!{esc}>\u00A1\n" // inverted exclamation
"c/{esc}>\u00A2\n" // cent sign
"lb{esc}>\u00A3\n" // pound sign
"'|'{esc}>\u00A6\n" // broken vertical bar
":{esc}>\u00A8\n" // umlaut
"{super}a{esc}>\u00AA\n" // feminine ordinal
"'<<'{esc}>\u00AB\n"
"r{esc}>\u00AE\n"
"--{esc}>\u00AF\n"
"-{esc}>\u00AD\n"
"+-{esc}>\u00B1\n"
"{super}2{esc}>\u00B2\n"
"{super}3{esc}>\u00B3\n"
"{acute}{esc}>\u00B4\n"
"m{esc}>\u00B5\n"
"para{esc}>\u00B6\n"
"dot{esc}>\u00B7\n"
"{cedilla}{esc}>\u00B8\n"
"{super}1{esc}>\u00B9\n"
"{super}o{esc}>\u00BA\n" // masculine ordinal
"'>>'{esc}>\u00BB\n"
"1/4{esc}>\u00BC\n"
"1/2{esc}>\u00BD\n"
"3/4{esc}>\u00BE\n"
"?{esc}>\u00BF\n"
// Uppercase accented letters (U+00C0..U+00DE) and sharp s
"A{grave}{esc}>\u00C0\n"
"A{acute}{esc}>\u00C1\n"
"A{hat}{esc}>\u00C2\n"
"A{tilde}{esc}>\u00C3\n"
"A{umlaut}{esc}>\u00C4\n"
"A{ring}{esc}>\u00C5\n"
"AE{esc}>\u00C6\n"
"C{cedilla}{esc}>\u00C7\n"
"E{grave}{esc}>\u00C8\n"
"E{acute}{esc}>\u00C9\n"
"E{hat}{esc}>\u00CA\n"
"E{umlaut}{esc}>\u00CB\n"
"I{grave}{esc}>\u00CC\n"
"I{acute}{esc}>\u00CD\n"
"I{hat}{esc}>\u00CE\n"
"I{umlaut}{esc}>\u00CF\n"
"D-{esc}>\u00D0\n"
"N{tilde}{esc}>\u00D1\n"
"O{grave}{esc}>\u00D2\n"
"O{acute}{esc}>\u00D3\n"
"O{hat}{esc}>\u00D4\n"
"O{tilde}{esc}>\u00D5\n"
"O{umlaut}{esc}>\u00D6\n"
"O{slash}{esc}>\u00D8\n"
"U{grave}{esc}>\u00D9\n"
"U{acute}{esc}>\u00DA\n"
"U{hat}{esc}>\u00DB\n"
"U{umlaut}{esc}>\u00DC\n"
"Y{acute}{esc}>\u00DD\n"
"TH{esc}>\u00DE\n"
"ss{esc}>\u00DF\n"
// Lowercase accented letters (U+00E0..U+00FF)
"a{grave}{esc}>\u00E0\n"
"a{acute}{esc}>\u00E1\n"
"a{hat}{esc}>\u00E2\n"
"a{tilde}{esc}>\u00E3\n"
"a{umlaut}{esc}>\u00E4\n"
"a{ring}{esc}>\u00E5\n"
"ae{esc}>\u00E6\n"
"c{cedilla}{esc}>\u00E7\n"
"c{esc}>\u00A9\n" // copyright - after c{cedilla}
"e{grave}{esc}>\u00E8\n"
"e{acute}{esc}>\u00E9\n"
"e{hat}{esc}>\u00EA\n"
"e{umlaut}{esc}>\u00EB\n"
"i{grave}{esc}>\u00EC\n"
"i{acute}{esc}>\u00ED\n"
"i{hat}{esc}>\u00EE\n"
"i{umlaut}{esc}>\u00EF\n"
"d-{esc}>\u00F0\n"
"n{tilde}{esc}>\u00F1\n"
"o{grave}{esc}>\u00F2\n"
"o{acute}{esc}>\u00F3\n"
"o{hat}{esc}>\u00F4\n"
"o{tilde}{esc}>\u00F5\n"
"o{umlaut}{esc}>\u00F6\n"
"o{slash}{esc}>\u00F8\n"
"o{esc}>\u00B0\n"
"u{grave}{esc}>\u00F9\n"
"u{acute}{esc}>\u00FA\n"
"u{hat}{esc}>\u00FB\n"
"u{umlaut}{esc}>\u00FC\n"
"y{acute}{esc}>\u00FD\n"
"y{esc}>\u00A5\n" // yen sign
"th{esc}>\u00FE\n"
// U+00FF is y with diaeresis; it needs its own key sequence because
// "ss{esc}" is already taken by U+00DF (sharp s) above.
"y{umlaut}{esc}>\u00FF\n"
}
}

View file

@ -0,0 +1,240 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// Latin-Arabic
larabic {
Rule {
// Latin <-> Arabic transliteration table.
// Layout: variable definitions (Arabic letter names -> code points),
// then the Latin-to-Arabic rules ("source>target"), then the
// Arabic-to-Latin rules ("target<source").
// As used below, "X]y" restricts a match of y to positions preceded by
// X, and '' stands for a literal apostrophe used as a letter separator.
// To Do: finish adding shadda, add sokoon
"alefmadda=\u0622\n"
"alefuhamza=\u0623\n"
"wauuhamza=\u0624\n"
"alefhamza=\u0625\n"
"yehuhamza=\u0626\n"
"alef=\u0627\n"
"beh=\u0628\n"
"tehmarbuta=\u0629\n"
"teh=\u062A\n"
"theh=\u062B\n"
"geem=\u062C\n"
"hah=\u062D\n"
"kha=\u062E\n"
"dal=\u062F\n"
"dhal=\u0630\n"
"reh=\u0631\n"
"zain=\u0632\n"
"seen=\u0633\n"
"sheen=\u0634\n"
"sad=\u0635\n"
"dad=\u0636\n"
"tah=\u0637\n"
"zah=\u0638\n"
"ein=\u0639\n"
"ghein=\u063A\n"
"feh=\u0641\n"
"qaaf=\u0642\n"
"kaf=\u0643\n"
"lam=\u0644\n"
"meem=\u0645\n"
"noon=\u0646\n"
"heh=\u0647\n"
"wau=\u0648\n"
"yehmaqsura=\u0649\n"
"yeh=\u064A\n"
// NOTE(review): U+06A4 is ARABIC LETTER VEH in the Unicode chart; PEH is
// U+067E. Left unchanged here -- confirm which letter is intended.
"peh=\u06A4\n"
"hamza=\u0621\n"
"fathatein=\u064B\n"
"dammatein=\u064C\n"
"kasratein=\u064D\n"
"fatha=\u064E\n"
"damma=\u064F\n"
"kasra=\u0650\n"
"shadda=\u0651\n"
"sokoon=\u0652\n"
// convert English to Arabic
// The word "Arabic" expands to a fixed sample phrase (the strings below
// are concatenated into a single rule).
"Arabic>"
"\u062a\u062a\u0645\u062a\u0639\u0020"
"\u0627\u0644\u0644\u063a\u0629\u0020"
"\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"
"\u0628\u0628\u0646\u0638\u0645\u0020"
"\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"
"\u062c\u0645\u064a\u0644\u0629\n"
// Letters. Longer Latin sequences are listed before their prefixes
// (e.g. "ai" before "a") so that the longer match is tried first.
"ai>{alefmadda}\n"
"ae>{alefuhamza}\n"
"ao>{alefhamza}\n"
"aa>{alef}\n"
"an>{fathatein}\n"
"a>{fatha}\n"
"b>{beh}\n"
"c>{kaf}\n"
// A repeated d/dd/dh directly after its own Arabic letter becomes shadda.
"{dhal}]dh>{shadda}\n"
"dh>{dhal}\n"
"{dad}]dd>{shadda}\n"
"dd>{dad}\n"
"{dal}]d>{shadda}\n"
"d>{dal}\n"
"e>{ein}\n"
"f>{feh}\n"
"gh>{ghein}\n"
"g>{geem}\n"
"hh>{hah}\n"
"h>{heh}\n"
"ii>{kasratein}\n"
"i>{kasra}\n"
"j>{geem}\n"
"kh>{kha}\n"
"k>{kaf}\n"
"l>{lam}\n"
"m>{meem}\n"
"n>{noon}\n"
"o>{hamza}\n"
"p>{peh}\n"
"q>{qaaf}\n"
"r>{reh}\n"
"sh>{sheen}\n"
"ss>{sad}\n"
"s>{seen}\n"
"th>{theh}\n"
"tm>{tehmarbuta}\n"
"tt>{tah}\n"
"t>{teh}\n"
"uu>{dammatein}\n"
"u>{damma}\n"
"v>{beh}\n"
"we>{wauuhamza}\n"
"w>{wau}\n"
"x>{kaf}{shadda}{seen}\n"
"ye>{yehuhamza}\n"
"ym>{yehmaqsura}\n"
"y>{yeh}\n"
"zz>{zah}\n"
"z>{zain}\n"
// Digits and punctuation. (The stray "+" string-concatenation operators
// that used to trail these lines are not part of the resource-bundle
// syntax and have been dropped; adjacent strings concatenate implicitly.)
"0>\u0660\n" // Arabic digit 0
"1>\u0661\n" // Arabic digit 1
"2>\u0662\n" // Arabic digit 2
"3>\u0663\n" // Arabic digit 3
"4>\u0664\n" // Arabic digit 4
"5>\u0665\n" // Arabic digit 5
"6>\u0666\n" // Arabic digit 6
"7>\u0667\n" // Arabic digit 7
"8>\u0668\n" // Arabic digit 8
"9>\u0669\n" // Arabic digit 9
"%>\u066A\n" // Arabic %
".>\u066B\n" // Arabic decimal separator
",>\u066C\n" // Arabic thousands separator
"*>\u066D\n" // Arabic five-pointed star
"`0>0\n" // Escaped forms of the above
"`1>1\n"
"`2>2\n"
"`3>3\n"
"`4>4\n"
"`5>5\n"
"`6>6\n"
"`7>7\n"
"`8>8\n"
"`9>9\n"
"`%>%\n"
"`.>.\n"
"`,>,\n"
"`*>*\n"
"``>`\n"
// Drop any apostrophe that was only used as a separator.
"''>\n"
// now Arabic to English
// Where the Latin output so far ends in a letter that would merge with
// the next output into a different digraph, an apostrophe is emitted
// first (the "''xx<c]{letter}" forms).
"''ai<a]{alefmadda}\n"
"ai<{alefmadda}\n"
"''ae<a]{alefuhamza}\n"
"ae<{alefuhamza}\n"
"''ao<a]{alefhamza}\n"
"ao<{alefhamza}\n"
"''aa<a]{alef}\n"
"aa<{alef}\n"
"''an<a]{fathatein}\n"
"an<{fathatein}\n"
"''a<a]{fatha}\n"
"a<{fatha}\n"
"b<{beh}\n"
"''dh<d]{dhal}\n"
"dh<{dhal}\n"
"''dd<d]{dad}\n"
"dd<{dad}\n"
"''d<d]{dal}\n"
"d<{dal}\n"
"''e<a]{ein}\n"
"''e<w]{ein}\n"
"''e<y]{ein}\n"
"e<{ein}\n"
"f<{feh}\n"
"gh<{ghein}\n"
"''hh<d]{hah}\n"
"''hh<t]{hah}\n"
"''hh<k]{hah}\n"
"''hh<s]{hah}\n"
"hh<{hah}\n"
"''h<d]{heh}\n"
"''h<t]{heh}\n"
"''h<k]{heh}\n"
"''h<s]{heh}\n"
"h<{heh}\n"
"''ii<i]{kasratein}\n"
"ii<{kasratein}\n"
"''i<i]{kasra}\n"
"i<{kasra}\n"
"j<{geem}\n"
"kh<{kha}\n"
"x<{kaf}{shadda}{seen}\n"
"k<{kaf}\n"
"l<{lam}\n"
"''m<y]{meem}\n"
"''m<t]{meem}\n"
"m<{meem}\n"
"n<{noon}\n"
"''o<a]{hamza}\n"
"o<{hamza}\n"
"p<{peh}\n"
"q<{qaaf}\n"
"r<{reh}\n"
"sh<{sheen}\n"
"''ss<s]{sad}\n"
"ss<{sad}\n"
"''s<s]{seen}\n"
"s<{seen}\n"
"th<{theh}\n"
"tm<{tehmarbuta}\n"
"''tt<t]{tah}\n"
"tt<{tah}\n"
"''t<t]{teh}\n"
"t<{teh}\n"
"''uu<u]{dammatein}\n"
"uu<{dammatein}\n"
"''u<u]{damma}\n"
"u<{damma}\n"
"we<{wauuhamza}\n"
"w<{wau}\n"
"ye<{yehuhamza}\n"
"ym<{yehmaqsura}\n"
"''y<y]{yeh}\n"
"y<{yeh}\n"
"''zz<z]{zah}\n"
"zz<{zah}\n"
"''z<z]{zain}\n"
"z<{zain}\n"
// Shadda after a d-family letter maps back to the repeated Latin letters.
"dh<dh]{shadda}\n"
"dd<dd]{shadda}\n"
"''d<d]{shadda}\n"
}
}

View file

@ -0,0 +1,411 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// Latin-Devanagari
ldevan {
Rule {
//#####################################################################
// Keyboard Transliteration Table
//#####################################################################
// Conversions should be:
// 1. complete
// * convert every sequence of Latin letters (a to z plus apostrophe)
// to a sequence of Native letters
// * convert every sequence of Native letters to Latin letters
// 2. reversible
// * any string of Native converted to Latin and back should be the same
// * this is not true for English converted to Native & back, e.g.:
// k -> {kaf} -> k
// c -> {kaf} -> k
//#####################################################################
// Sequences of Latin letters may convert to a single Native letter.
// When this is the case, an apostrophe can be used to indicate separate
// letters.
// E.g. sh -> {shin}
// s'h -> {sin}{heh}
// ss -> {sad}
// s's -> {sin}{shadda}
// (The examples above use Arabic letter names such as {kaf} and
// {shadda}, which are not defined in this table; the same principle
// applies to the Devanagari names defined below.)
//#####################################################################
// To Do:
// finish adding shadda, add sokoon, fix uppercase
// make two transliteration tables: one with vowels, one without
//#####################################################################
// Modifications
// Devanagari Transliterator: broken up with consonants/vowels
//#####################################################################
// Unicode character name definitions
//#####################################################################
// signs
"candrabindu=\u0901\n"
"bindu=\u0902\n"
"visarga=\u0903\n"
// w<vowel> represents the stand-alone form
"wa=\u0905\n"
"waa=\u0906\n"
"wi=\u0907\n"
"wii=\u0908\n"
"wu=\u0909\n"
"wuu=\u090A\n"
"wr=\u090B\n"
"wl=\u090C\n"
"we=\u090F\n"
"wai=\u0910\n"
"wo=\u0913\n"
"wau=\u0914\n"
// consonants (each name includes the inherent vowel "a")
"ka=\u0915\n"
"kha=\u0916\n"
"ga=\u0917\n"
"gha=\u0918\n"
"nga=\u0919\n"
"ca=\u091A\n"
"cha=\u091B\n"
"ja=\u091C\n"
"jha=\u091D\n"
"nya=\u091E\n"
"tta=\u091F\n"
"ttha=\u0920\n"
"dda=\u0921\n"
"ddha=\u0922\n"
"nna=\u0923\n"
"ta=\u0924\n"
"tha=\u0925\n"
"da=\u0926\n"
"dha=\u0927\n"
"na=\u0928\n"
"pa=\u092A\n"
"pha=\u092B\n"
"ba=\u092C\n"
"bha=\u092D\n"
"ma=\u092E\n"
"ya=\u092F\n"
"ra=\u0930\n"
"rra=\u0931\n"
// DEVANAGARI LETTER LA is U+0932 (U+0933 is LLA, which has no name in
// this table).
"la=\u0932\n"
"va=\u0935\n"
"sha=\u0936\n"
"ssa=\u0937\n"
"sa=\u0938\n"
"ha=\u0939\n"
// <vowel> represents the dependent form
"aa=\u093E\n"
"i=\u093F\n"
"ii=\u0940\n"
"u=\u0941\n"
"uu=\u0942\n"
// Vocalic signs, matched to their stand-alone counterparts above and
// below: rh <-> wr (vocalic R), lh <-> wl (vocalic L), rrh <-> wrr
// (vocalic RR).
"rh=\u0943\n"
"lh=\u0962\n"
"e=\u0947\n"
"ai=\u0948\n"
"o=\u094B\n"
"au=\u094C\n"
"virama=\u094D\n"
"wrr=\u0960\n"
"rrh=\u0944\n"
"danda=\u0964\n"
"doubleDanda=\u0965\n"
"depVowelAbove=[\u093E-\u0940\u0945-\u094C]\n"
"depVowelBelow=[\u0941-\u0944]\n"
// Ech: Double escape U+0000, so UnicodeString doesn't consider it
// to be the end of the string. This is only necessary for U+0000
// right now. [liu]
"endThing=[{danda}{doubleDanda}\\u0000-\u08FF\u0980-\uFFFF]\n"
// & = virama or any dependent vowel sign; % = any Latin consonant.
"&=[{virama}{aa}{ai}{au}{ii}{i}{uu}{u}{rrh}{rh}{lh}{e}{o}]\n"
"%=[bcdfghjklmnpqrstvwxyz]\n"
//#####################################################################
// convert from Latin letters to Native letters
//#####################################################################
//Hindi>\u092d\u093e\u0930\u0924--\u0020\u0926\u0947\u0936\u0020\u092c\u0928\u094d\u0927\u0941\u002e
// special forms with no good conversion
"mm>{bindu}\n"
"x>{visarga}\n"
// convert to independent forms at start of word or syllable:
// e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
// Moved up [LIU]
"aa>{waa}\n"
"ai>{wai}\n"
"au>{wau}\n"
"ii>{wii}\n"
"i>{wi}\n"
"uu>{wuu}\n"
"u>{wu}\n"
"rrh>{wrr}\n"
"rh>{wr}\n"
"lh>{wl}\n"
"e>{we}\n"
"o>{wo}\n"
"a>{wa}\n"
// normal consonants
// Each consonant is emitted followed by a virama; the "|" sits between
// the consonant and the virama so that the {virama}<vowel> rules
// further down can rewrite the virama when a vowel is typed next.
"kh>{kha}|{virama}\n"
"k>{ka}|{virama}\n"
"q>{ka}|{virama}\n"
"gh>{gha}|{virama}\n"
"g>{ga}|{virama}\n"
"ng>{nga}|{virama}\n"
"ch>{cha}|{virama}\n"
"c>{ca}|{virama}\n"
"jh>{jha}|{virama}\n"
"j>{ja}|{virama}\n"
"ny>{nya}|{virama}\n"
"tth>{ttha}|{virama}\n"
"tt>{tta}|{virama}\n"
"ddh>{ddha}|{virama}\n"
"dd>{dda}|{virama}\n"
"nn>{nna}|{virama}\n"
"th>{tha}|{virama}\n"
"t>{ta}|{virama}\n"
"dh>{dha}|{virama}\n"
"d>{da}|{virama}\n"
"n>{na}|{virama}\n"
"ph>{pha}|{virama}\n"
"p>{pa}|{virama}\n"
"bh>{bha}|{virama}\n"
"b>{ba}|{virama}\n"
"m>{ma}|{virama}\n"
"y>{ya}|{virama}\n"
"r>{ra}|{virama}\n"
"l>{la}|{virama}\n"
"v>{va}|{virama}\n"
"f>{va}|{virama}\n"
"w>{va}|{virama}\n"
"sh>{sha}|{virama}\n"
"ss>{ssa}|{virama}\n"
"s>{sa}|{virama}\n"
"z>{sa}|{virama}\n"
"h>{ha}|{virama}\n"
".>{danda}\n"
"{danda}.>{doubleDanda}\n"
"{depVowelAbove}]~>{bindu}\n"
"{depVowelBelow}]~>{candrabindu}\n"
// convert to dependent forms after consonant with no vowel:
// e.g. kai -> {ka}{virama}ai -> {ka}{ai}
"{virama}aa>{aa}\n"
"{virama}ai>{ai}\n"
"{virama}au>{au}\n"
"{virama}ii>{ii}\n"
"{virama}i>{i}\n"
"{virama}uu>{uu}\n"
"{virama}u>{u}\n"
"{virama}rrh>{rrh}\n"
"{virama}rh>{rh}\n"
"{virama}lh>{lh}\n"
"{virama}e>{e}\n"
"{virama}o>{o}\n"
"{virama}a>\n"
// otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
"{virama}''aa>{waa}\n"
"{virama}''ai>{wai}\n"
"{virama}''au>{wau}\n"
"{virama}''ii>{wii}\n"
"{virama}''i>{wi}\n"
"{virama}''uu>{wuu}\n"
"{virama}''u>{wu}\n"
"{virama}''rrh>{wrr}\n"
"{virama}''rh>{wr}\n"
"{virama}''lh>{wl}\n"
"{virama}''e>{we}\n"
"{virama}''o>{wo}\n"
"{virama}''a>{wa}\n"
"{virama}[{endThing}>\n"
// convert any left-over apostrophes used for separation
"''>\n"
//#####################################################################
// convert from Native letters to Latin letters
//#####################################################################
// special forms with no good conversion
"mm<{bindu}\n"
"x<{visarga}\n"
// normal consonants
// Pattern per consonant C:
// C'' <{C}{virama}[{next} -- apostrophe where C plus the next letter
// would otherwise read as a digraph
// C <{C}[& -- bare consonant before virama/vowel sign
// Ca <{C} -- otherwise spell out the inherent "a"
"kh<{kha}[&\n"
"kha<{kha}\n"
"k''<{ka}{virama}[{ha}\n"
"k<{ka}[&\n"
"ka<{ka}\n"
"gh<{gha}[&\n"
"gha<{gha}\n"
"g''<{ga}{virama}[{ha}\n"
"g<{ga}[&\n"
"ga<{ga}\n"
"ng<{nga}[&\n"
"nga<{nga}\n"
"ch<{cha}[&\n"
"cha<{cha}\n"
"c''<{ca}{virama}[{ha}\n"
"c<{ca}[&\n"
"ca<{ca}\n"
"jh<{jha}[&\n"
"jha<{jha}\n"
"j''<{ja}{virama}[{ha}\n"
"j<{ja}[&\n"
"ja<{ja}\n"
"ny<{nya}[&\n"
"nya<{nya}\n"
"tth<{ttha}[&\n"
"ttha<{ttha}\n"
"tt''<{tta}{virama}[{ha}\n"
"tt<{tta}[&\n"
"tta<{tta}\n"
"ddh<{ddha}[&\n"
"ddha<{ddha}\n"
// Same shape as the tt''/d''/t'' rules: the virama is part of the match.
"dd''<{dda}{virama}[{ha}\n"
"dd<{dda}[&\n"
"dda<{dda}\n"
"dh<{dha}[&\n"
"dha<{dha}\n"
"d''<{da}{virama}[{ha}\n"
"d''<{da}{virama}[{ddha}\n"
"d''<{da}{virama}[{dda}\n"
"d''<{da}{virama}[{dha}\n"
"d''<{da}{virama}[{da}\n"
"d<{da}[&\n"
"da<{da}\n"
"th<{tha}[&\n"
"tha<{tha}\n"
"t''<{ta}{virama}[{ha}\n"
"t''<{ta}{virama}[{ttha}\n"
"t''<{ta}{virama}[{tta}\n"
"t''<{ta}{virama}[{tha}\n"
"t''<{ta}{virama}[{ta}\n"
"t<{ta}[&\n"
"ta<{ta}\n"
"n''<{na}{virama}[{ga}\n"
"n''<{na}{virama}[{ya}\n"
"n<{na}[&\n"
"na<{na}\n"
"ph<{pha}[&\n"
"pha<{pha}\n"
"p''<{pa}{virama}[{ha}\n"
"p<{pa}[&\n"
"pa<{pa}\n"
"bh<{bha}[&\n"
"bha<{bha}\n"
"b''<{ba}{virama}[{ha}\n"
"b<{ba}[&\n"
"ba<{ba}\n"
"m''<{ma}{virama}[{ma}\n"
"m''<{ma}{virama}[{bindu}\n"
"m<{ma}[&\n"
"ma<{ma}\n"
"y<{ya}[&\n"
"ya<{ya}\n"
"r''<{ra}{virama}[{ha}\n"
"r<{ra}[&\n"
"ra<{ra}\n"
"l''<{la}{virama}[{ha}\n"
"l<{la}[&\n"
"la<{la}\n"
"v<{va}[&\n"
"va<{va}\n"
"sh<{sha}[&\n"
"sha<{sha}\n"
"ss<{ssa}[&\n"
"ssa<{ssa}\n"
"s''<{sa}{virama}[{ha}\n"
"s''<{sa}{virama}[{sha}\n"
"s''<{sa}{virama}[{ssa}\n"
"s''<{sa}{virama}[{sa}\n"
"s<{sa}[&\n"
"sa<{sa}\n"
"h<{ha}[&\n"
"ha<{ha}\n"
// dependent vowels (should never occur except following consonants)
"aa<{aa}\n"
"ai<{ai}\n"
"au<{au}\n"
"ii<{ii}\n"
"i<{i}\n"
"uu<{uu}\n"
"u<{u}\n"
"rrh<{rrh}\n"
"rh<{rh}\n"
"lh<{lh}\n"
"e<{e}\n"
"o<{o}\n"
// independent vowels (when following consonants)
"''aa<a]{waa}\n"
"''aa<%]{waa}\n"
"''ai<a]{wai}\n"
"''ai<%]{wai}\n"
"''au<a]{wau}\n"
"''au<%]{wau}\n"
"''ii<a]{wii}\n"
"''ii<%]{wii}\n"
"''i<a]{wi}\n"
"''i<%]{wi}\n"
"''uu<a]{wuu}\n"
"''uu<%]{wuu}\n"
"''u<a]{wu}\n"
"''u<%]{wu}\n"
"''rrh<%]{wrr}\n"
"''rh<%]{wr}\n"
"''lh<%]{wl}\n"
"''e<%]{we}\n"
"''o<%]{wo}\n"
"''a<a]{wa}\n"
"''a<%]{wa}\n"
// independent vowels (otherwise)
"aa<{waa}\n"
"ai<{wai}\n"
"au<{wau}\n"
"ii<{wii}\n"
"i<{wi}\n"
"uu<{wuu}\n"
"u<{wu}\n"
"rrh<{wrr}\n"
"rh<{wr}\n"
"lh<{wl}\n"
"e<{we}\n"
"o<{wo}\n"
"a<{wa}\n"
// blow away any remaining viramas
"<{virama}\n"
}
}

View file

@ -0,0 +1,380 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// Latin-Greek
lgreek {
Rule {
// Latin <-> Greek transliteration table.
// Layout: variable definitions, then for uppercase and again for
// lowercase a block of Latin-to-Greek rules ("source>target") followed
// by a block of Greek-to-Latin rules ("target<source").
// Naming: "gr" + first letters of the Greek letter name; a capitalised
// suffix (grAl) is the uppercase letter, an all-lowercase suffix (gral)
// the lowercase one. "Ac"/"ac" marks the accented (tonos) form and
// "Di"/"di" the dialytika form.
// Greek Letters
"grAl=\u0391\n"
"grBe=\u0392\n"
"grGa=\u0393\n"
"grDe=\u0394\n"
"grEp=\u0395\n"
"grZe=\u0396\n"
"grEt=\u0397\n"
"grTh=\u0398\n"
"grIo=\u0399\n"
"grKa=\u039A\n"
"grLa=\u039B\n"
"grMu=\u039C\n"
"grNu=\u039D\n"
"grKs=\u039E\n"
"grOm=\u039F\n"
"grPi=\u03A0\n"
"grRh=\u03A1\n"
"grSi=\u03A3\n"
"grTa=\u03A4\n"
"grUp=\u03A5\n"
"grPh=\u03A6\n"
"grKh=\u03A7\n"
"grPs=\u03A8\n"
"grOme=\u03A9\n"
"gral=\u03B1\n"
"grbe=\u03B2\n"
"grga=\u03B3\n"
"grde=\u03B4\n"
"grep=\u03B5\n"
"grze=\u03B6\n"
"gret=\u03B7\n"
"grth=\u03B8\n"
"grio=\u03B9\n"
"grka=\u03BA\n"
"grla=\u03BB\n"
"grmu=\u03BC\n"
"grnu=\u03BD\n"
"grks=\u03BE\n"
"grom=\u03BF\n"
"grpi=\u03C0\n"
"grrh=\u03C1\n"
"grsi=\u03C3\n"
"grta=\u03C4\n"
"grup=\u03C5\n"
"grph=\u03C6\n"
"grkh=\u03C7\n"
"grps=\u03C8\n"
"grome=\u03C9\n"
//forms
// grfinal is the word-final lowercase sigma.
"grfinal=\u03C2\n"
"grAcAl=\u0386\n"
"grAcEp=\u0388\n"
"grAcEt=\u0389\n"
"grAcIo=\u038A\n"
"grAcOm=\u038C\n"
"grAcUp=\u038E\n"
"grAcOme=\u038F\n"
"grDiIo=\u03AA\n"
"grDiUp=\u03AB\n"
"gracal=\u03AC\n"
"gracep=\u03AD\n"
"gracet=\u03AE\n"
"gracio=\u03AF\n"
"gracom=\u03CC\n"
"gracup=\u03CD\n"
"gracome=\u03CE\n"
"grdiio=\u03CA\n"
"grdiup=\u03CB\n"
//gracdiio=\u00FD
//gracdiup=\u00FE
// Any uppercase or lowercase letter; used below to decide between
// medial and final sigma.
"letter=[[:Lu:][:Ll:]]\n"
// convert Roman to Native
// The word "Greek" expands to a fixed sample phrase.
"Greek>\u039c\u0397\u039d\u0399\u039d\u0020\u0391\u0395\u0399\u0394\u0395\u002c\u0020\u0398\u0395\u0391\u002c\u0020--\u0397\u039b\u0397\u0399\u0391\u0394\u0395\u03a9\u0020\u0391\u03a7\u0399\u039b\u0397\u039f\u03a3\n"
// Uppercase, Latin to Greek. Multi-letter sources come before their
// single-letter prefixes; a trailing ` selects the accented form.
"AV`>{grAl}{grAcUp}\n"
"EV`>{grEp}{grAcUp}\n"
"AV>{grAl}{grUp}\n"
"EV>{grEp}{grUp}\n"
"NG>{grGa}{grGa}\n"
"NK>{grGa}{grKa}\n"
"NX>{grGa}{grKs}\n"
"NCH>{grGa}{grKh}\n"
//+ "final = [ .;]\n" // Syntax error, unused anyway - Liu
"A`>{grAcAl}\n"
"EE`>{grAcEt}\n"
"E`>{grAcEp}\n"
"I`>{grAcIo}\n"
"U`>{grAcUp}\n"
"OO`>{grAcOme}\n"
"O`>{grAcOm}\n"
"''I>{grDiIo}\n"
"''U>{grDiUp}\n"
"A>{grAl}\n"
"B>{grBe}\n"
// C before I, E or Y becomes sigma; CH becomes chi; otherwise kappa.
"C[I>{grSi}\n"
"C[E>{grSi}\n"
"C[Y>{grSi}\n"
"CH>{grKh}\n"
"C>{grKa}\n"
"D>{grDe}\n"
"EE>{grEt}\n"
"E>{grEp}\n"
"F>{grPh}\n"
"G>{grGa}\n"
"H>{grKh}\n"
"I>{grIo}\n"
"J>{grIo}\n"
"KS>{grKs}\n"
"KH>{grKh}\n"
"K>{grKa}\n"
"L>{grLa}\n"
"M>{grMu}\n"
"N>{grNu}\n"
"OO>{grOme}\n"
"O>{grOm}\n"
"PS>{grPs}\n"
"PH>{grPh}\n"
"P>{grPi}\n"
"Q>{grKa}\n"
"R>{grRh}\n"
"S>{grSi}\n"
"TH>{grTh}\n"
"T>{grTa}\n"
"W>{grUp}{grUp}\n"
"U>{grUp}\n"
"V>{grUp}\n"
"X>{grKs}\n"
"Y>{grUp}\n"
"Z>{grZe}\n"
//now Native to Roman
// Uppercase, Greek to Latin. The "X''<{a}[{b}" rules emit an apostrophe
// after X when the following Greek letter would otherwise make the
// Latin output read as a digraph (e.g. N' before gamma, so it is not
// confused with NG = gamma gamma).
"AV<{grAl}{grUp}\n"
"EV<{grEp}{grUp}\n"
"AV`<{grAl}{grAcUp}\n"
"EV`<{grEp}{grAcUp}\n"
"N''<{grNu}[{grGa}\n"
"NG<{grGa}{grGa}\n"
"N''<{grNu}[{grKa}\n"
"NK<{grGa}{grKa}\n"
"N''<{grNu}[{grKs}\n"
"NX<{grGa}{grKs}\n"
"N''<{grNu}[{grKh}\n"
"NCH<{grGa}{grKh}\n"
"A<{grAl}\n"
"B<{grBe}\n"
"G<{grGa}\n"
"D<{grDe}\n"
"E''<{grEp}[{grEp}\n"
"E''<{grEp}[{grEt}\n"
"E''<{grEp}[{grAcEp}\n"
"E''<{grEp}[{grAcEt}\n"
"E<{grEp}\n"
"Z<{grZe}\n"
"EE<{grEt}\n"
"TH<{grTh}\n"
"I<{grIo}\n"
"K<{grKa}\n"
"L<{grLa}\n"
"M<{grMu}\n"
"N<{grNu}\n"
"X<{grKs}\n"
"O''<{grOm}[{grOm}\n"
"O''<{grOm}[{grOme}\n"
"O''<{grOm}[{grAcOm}\n"
"O''<{grOm}[{grAcOme}\n"
"O<{grOm}\n"
"P''<{grPi}[{grSi}\n"
"P''<{grPi}[{grfinal}\n"
"P<{grPi}\n"
"R<{grRh}\n"
"S<{grSi}\n"
"T<{grTa}\n"
"W<{grUp}{grUp}\n"
// Upsilon is written V before a vowel and U otherwise.
"V<{grUp}[{grAcAl}\n"
"V<{grUp}[{grAcEp}\n"
"V<{grUp}[{grAcEt}\n"
"V<{grUp}[{grAcIo}\n"
"V<{grUp}[{grAcOm}\n"
"V<{grUp}[{grAcUp}\n"
"V<{grUp}[{grAcOme}\n"
"V<{grUp}[{grAl}\n"
"V<{grUp}[{grEp}\n"
"V<{grUp}[{grEt}\n"
"V<{grUp}[{grIo}\n"
"V<{grUp}[{grOm}\n"
//{grUp}[{grUp}<V
"V<{grUp}[{grOme}\n"
"U<{grUp}\n"
"PH<{grPh}\n"
"CH<{grKh}\n"
"PS<{grPs}\n"
"OO<{grOme}\n"
//forms
"A`<{grAcAl}\n"
"E`<{grAcEp}\n"
"EE`<{grAcEt}\n"
"I`<{grAcIo}\n"
"O`<{grAcOm}\n"
"U`<{grAcUp}\n"
"OO`<{grAcOme}\n"
"''I<{grDiIo}\n"
"''U<{grDiUp}\n"
//{gracdiio}<XX
//{gracdiup}<XX
//{grfinal}<XX
// Lowercase, Latin to Greek (mirrors the uppercase rules above).
"av`>{gral}{gracup}\n"
"ev`>{grep}{gracup}\n"
"av>{gral}{grup}\n"
"ev>{grep}{grup}\n"
"ng>{grga}{grga}\n"
"nk>{grga}{grka}\n"
"nx>{grga}{grks}\n"
"nch>{grga}{grkh}\n"
"a`>{gracal}\n"
"ee`>{gracet}\n"
"e`>{gracep}\n"
"i`>{gracio}\n"
"u`>{gracup}\n"
"oo`>{gracome}\n"
"o`>{gracom}\n"
"''i>{grdiio}\n"
"''u>{grdiup}\n"
"a>{gral}\n"
"b>{grbe}\n"
"c[i>{grsi}\n"
"c[e>{grsi}\n"
"c[y>{grsi}\n"
"ch>{grkh}\n"
"c>{grka}\n"
"d>{grde}\n"
"ee>{gret}\n"
"e>{grep}\n"
"f>{grph}\n"
"g>{grga}\n"
"h>{grkh}\n"
"i>{grio}\n"
"j>{grio}\n"
"ks>{grks}\n"
"kh>{grkh}\n"
"k>{grka}\n"
"l>{grla}\n"
"m>{grmu}\n"
"n>{grnu}\n"
"oo>{grome}\n"
"o>{grom}\n"
"ps>{grps}\n"
"ph>{grph}\n"
"p>{grpi}\n"
"q>{grka}\n"
"r>{grrh}\n"
// s first becomes final sigma; the next rule turns a final sigma that
// is followed by a letter into the medial sigma.
"s>|{grfinal}\n"
"{grfinal}[{letter}>{grsi}\n"
"th>{grth}\n"
"t>{grta}\n"
"w>{grup}{grup}\n"
"u>{grup}\n"
"v>{grup}\n"
"x>{grks}\n"
"y>{grup}\n"
"z>{grze}\n"
//forms
// Drop any apostrophe that was only used as a separator.
"''>\n"
//now native to roman
// Lowercase, Greek to Latin (mirrors the uppercase rules above).
"av<{gral}{grup}\n"
"ev<{grep}{grup}\n"
"av`<{gral}{gracup}\n"
"ev`<{grep}{gracup}\n"
"n''<{grnu}[{grga}\n"
"ng<{grga}{grga}\n"
"n''<{grnu}[{grka}\n"
"nk<{grga}{grka}\n"
"n''<{grnu}[{grks}\n"
"nx<{grga}{grks}\n"
"n''<{grnu}[{grkh}\n"
"nch<{grga}{grkh}\n"
"a<{gral}\n"
"b<{grbe}\n"
"g<{grga}\n"
"d<{grde}\n"
"e''<{grep}[{grep}\n"
"e''<{grep}[{gret}\n"
"e''<{grep}[{gracep}\n"
"e''<{grep}[{gracet}\n"
"e<{grep}\n"
"z<{grze}\n"
"ee<{gret}\n"
"th<{grth}\n"
"i<{grio}\n"
"k<{grka}\n"
"l<{grla}\n"
"m<{grmu}\n"
"n<{grnu}\n"
"x<{grks}\n"
"o''<{grom}[{grom}\n"
"o''<{grom}[{grome}\n"
"o''<{grom}[{gracom}\n"
"o''<{grom}[{gracome}\n"
"o<{grom}\n"
"p''<{grpi}[{grsi}\n"
"p''<{grpi}[{grfinal}\n"
"p<{grpi}\n"
"r<{grrh}\n"
// Both sigma forms map back to s.
"s<{grsi}\n"
"s<{grfinal}\n"
"t<{grta}\n"
"w<{grup}{grup}\n"
"v<{grup}[{gracal}\n"
"v<{grup}[{gracep}\n"
"v<{grup}[{gracet}\n"
"v<{grup}[{gracio}\n"
"v<{grup}[{gracom}\n"
"v<{grup}[{gracup}\n"
"v<{grup}[{gracome}\n"
"v<{grup}[{gral}\n"
"v<{grup}[{grep}\n"
"v<{grup}[{gret}\n"
"v<{grup}[{grio}\n"
"v<{grup}[{grom}\n"
//{grup}[{grup}<v
"v<{grup}[{grome}\n"
"u<{grup}\n"
"ph<{grph}\n"
"ch<{grkh}\n"
"ps<{grps}\n"
"oo<{grome}\n"
//forms
"a`<{gracal}\n"
"e`<{gracep}\n"
"ee`<{gracet}\n"
"i`<{gracio}\n"
"o`<{gracom}\n"
"u`<{gracup}\n"
"oo`<{gracome}\n"
"''i<{grdiio}\n"
"''u<{grdiup}\n"
// Reverse-direction counterpart of the "''>" rule: drop apostrophes.
"<''\n"
//{gracdiio}<xx
//{gracdiup}<xx
//{grfinal}<xx
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,279 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// Latin-Hebrew
lhebrew {
Rule {
// Latin <-> Hebrew transliteration table.
// Layout: variable definitions, Latin-to-Hebrew rules ("source>target"),
// then Hebrew-to-Latin rules ("target<source"). The commented-out
// "//?>..." and "//...>@" lines are placeholders for characters that
// have no mapping yet.
//variable names, derived from the Unicode names.
"POINT_SHEVA=\u05B0\n"
"POINT_HATAF_SEGOL=\u05B1\n"
"POINT_HATAF_PATAH=\u05B2\n"
"POINT_HATAF_QAMATS=\u05B3\n"
"POINT_HIRIQ=\u05B4\n"
"POINT_TSERE=\u05B5\n"
"POINT_SEGOL=\u05B6\n"
"POINT_PATAH=\u05B7\n"
"POINT_QAMATS=\u05B8\n"
"POINT_HOLAM=\u05B9\n"
"POINT_QUBUTS=\u05BB\n"
"POINT_DAGESH_OR_MAPIQ=\u05BC\n"
"POINT_METEG=\u05BD\n"
"PUNCTUATION_MAQAF=\u05BE\n"
"POINT_RAFE=\u05BF\n"
"PUNCTUATION_PASEQ=\u05C0\n"
"POINT_SHIN_DOT=\u05C1\n"
"POINT_SIN_DOT=\u05C2\n"
"PUNCTUATION_SOF_PASUQ=\u05C3\n"
"ALEF=\u05D0\n"
"BET=\u05D1\n"
"GIMEL=\u05D2\n"
"DALET=\u05D3\n"
"HE=\u05D4\n"
"VAV=\u05D5\n"
"ZAYIN=\u05D6\n"
"HET=\u05D7\n"
"TET=\u05D8\n"
"YOD=\u05D9\n"
"FINAL_KAF=\u05DA\n"
"KAF=\u05DB\n"
"LAMED=\u05DC\n"
"FINAL_MEM=\u05DD\n"
"MEM=\u05DE\n"
"FINAL_NUN=\u05DF\n"
"NUN=\u05E0\n"
"SAMEKH=\u05E1\n"
"AYIN=\u05E2\n"
"FINAL_PE=\u05E3\n"
"PE=\u05E4\n"
"FINAL_TSADI=\u05E5\n"
"TSADI=\u05E6\n"
"QOF=\u05E7\n"
"RESH=\u05E8\n"
"SHIN=\u05E9\n"
"TAV=\u05EA\n"
"YIDDISH_DOUBLE_VAV=\u05F0\n"
"YIDDISH_VAV_YOD=\u05F1\n"
"YIDDISH_DOUBLE_YOD=\u05F2\n"
"PUNCTUATION_GERESH=\u05F3\n"
"PUNCTUATION_GERSHAYIM=\u05F4\n"
//wildcards
// Character sets used as context in the rules below.
"letter=[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]\n"
"softvowel=[eiyEIY]\n"
"vowellike=[{ALEF}{AYIN}{YOD}{VAV}]\n"
//?>{POINT_SHEVA}
//?>{POINT_HATAF_SEGOL}
//?>{POINT_HATAF_PATAH}
//?>{POINT_HATAF_QAMATS}
//?>{POINT_HIRIQ}
//?>{POINT_TSERE}
//?>{POINT_SEGOL}
//?>{POINT_PATAH}
//?>{POINT_QAMATS}
//?>{POINT_HOLAM}
//?>{POINT_QUBUTS}
//?>{POINT_DAGESH_OR_MAPIQ}
//?>{POINT_METEG}
//?>{PUNCTUATION_MAQAF}
//?>{POINT_RAFE}
//?>{PUNCTUATION_PASEQ}
//?>{POINT_SHIN_DOT}
//?>{POINT_SIN_DOT}
//?>{PUNCTUATION_SOF_PASUQ}
// Latin to Hebrew. For letters that have a final form, the rule with
// "[{letter}" (another Latin letter follows) yields the regular form and
// the unconditional rule after it yields the final form.
"a>{ALEF}\n"
"A>{ALEF}\n"
"b>{BET}\n"
"B>{BET}\n"
"c[{softvowel}>{SAMEKH}\n"
"C[{softvowel}>{SAMEKH}\n"
"c[{letter}>{KAF}\n"
"C[{letter}>{KAF}\n"
"c>{FINAL_KAF}\n"
"C>{FINAL_KAF}\n"
"d>{DALET}\n"
"D>{DALET}\n"
"e>{AYIN}\n"
"E>{AYIN}\n"
"f[{letter}>{PE}\n"
"f>{FINAL_PE}\n"
"F[{letter}>{PE}\n"
"F>{FINAL_PE}\n"
"g>{GIMEL}\n"
"G>{GIMEL}\n"
"h>{HE}\n"
"H>{HE}\n"
"i>{YOD}\n"
"I>{YOD}\n"
"j>{DALET}{SHIN}\n"
"J>{DALET}{SHIN}\n"
"kH>{HET}\n"
"kh>{HET}\n"
"KH>{HET}\n"
"Kh>{HET}\n"
"k[{letter}>{KAF}\n"
"K[{letter}>{KAF}\n"
"k>{FINAL_KAF}\n"
"K>{FINAL_KAF}\n"
"l>{LAMED}\n"
"L>{LAMED}\n"
"m[{letter}>{MEM}\n"
"m>{FINAL_MEM}\n"
"M[{letter}>{MEM}\n"
"M>{FINAL_MEM}\n"
"n[{letter}>{NUN}\n"
"n>{FINAL_NUN}\n"
"N[{letter}>{NUN}\n"
"N>{FINAL_NUN}\n"
"o>{VAV}\n"
"O>{VAV}\n"
"p[{letter}>{PE}\n"
"p>{FINAL_PE}\n"
"P[{letter}>{PE}\n"
"P>{FINAL_PE}\n"
"q>{QOF}\n"
"Q>{QOF}\n"
"r>{RESH}\n"
"R>{RESH}\n"
"sH>{SHIN}\n"
"sh>{SHIN}\n"
"SH>{SHIN}\n"
"Sh>{SHIN}\n"
"s>{SAMEKH}\n"
"S>{SAMEKH}\n"
"th>{TAV}\n"
"tH>{TAV}\n"
"TH>{TAV}\n"
"Th>{TAV}\n"
"tS[{letter}>{TSADI}\n"
"ts[{letter}>{TSADI}\n"
"Ts[{letter}>{TSADI}\n"
"TS[{letter}>{TSADI}\n"
"tS>{FINAL_TSADI}\n"
"ts>{FINAL_TSADI}\n"
"Ts>{FINAL_TSADI}\n"
"TS>{FINAL_TSADI}\n"
"t>{TET}\n"
"T>{TET}\n"
"u>{VAV}\n"
"U>{VAV}\n"
"v>{VAV}\n"
"V>{VAV}\n"
"w>{VAV}\n"
"W>{VAV}\n"
"x>{KAF}{SAMEKH}\n"
"X>{KAF}{SAMEKH}\n"
"y>{YOD}\n"
"Y>{YOD}\n"
"z>{ZAYIN}\n"
"Z>{ZAYIN}\n"
//#?>{YIDDISH_DOUBLE_VAV}
//?>{YIDDISH_VAV_YOD}
//?>{YIDDISH_DOUBLE_YOD}
//?>{PUNCTUATION_GERESH}
//?>{PUNCTUATION_GERSHAYIM}
// Drop any apostrophe that was only used as a separator.
"''>\n"
//{POINT_SHEVA}>@
//{POINT_HATAF_SEGOL}>@
//{POINT_HATAF_PATAH}>@
//{POINT_HATAF_QAMATS}>@
//{POINT_HIRIQ}>@
//{POINT_TSERE}>@
//{POINT_SEGOL}>@
//{POINT_PATAH}>@
//{POINT_QAMATS}>@
//{POINT_HOLAM}>@
//{POINT_QUBUTS}>@
//{POINT_DAGESH_OR_MAPIQ}>@
//{POINT_METEG}>@
//{PUNCTUATION_MAQAF}>@
//{POINT_RAFE}>@
//{PUNCTUATION_PASEQ}>@
//{POINT_SHIN_DOT}>@
//{POINT_SIN_DOT}>@
//{PUNCTUATION_SOF_PASUQ}>@
// Hebrew to Latin. The "x''<{X}[{Y}" rules emit an apostrophe after x
// when the following Hebrew letter would otherwise make the Latin output
// read as a digraph (kh, sh, th, ts).
"a<{ALEF}\n"
"e<{AYIN}\n"
"b<{BET}\n"
"d<{DALET}\n"
"k<{FINAL_KAF}\n"
"m<{FINAL_MEM}\n"
"n<{FINAL_NUN}\n"
"p<{FINAL_PE}\n"
"ts<{FINAL_TSADI}\n"
"g<{GIMEL}\n"
"kh<{HET}\n"
"h<{HE}\n"
"k''<{KAF}[{HE}\n"
"k<{KAF}\n"
"l<{LAMED}\n"
"m<{MEM}\n"
"n<{NUN}\n"
"p<{PE}\n"
"q<{QOF}\n"
"r<{RESH}\n"
"s''<{SAMEKH}[{HE}\n"
"s<{SAMEKH}\n"
"sh<{SHIN}\n"
"th<{TAV}\n"
// One rule per following letter (the TET-before-HE rule used to be
// listed twice; the redundant copy has been removed).
"t''<{TET}[{HE}\n"
"t''<{TET}[{SAMEKH}\n"
"t''<{TET}[{SHIN}\n"
"t<{TET}\n"
"ts<{TSADI}\n"
// VAV is written v before a vowel-like letter and u otherwise.
"v<{VAV}[{vowellike}\n"
"u<{VAV}\n"
"y<{YOD}\n"
"z<{ZAYIN}\n"
//{YIDDISH_DOUBLE_VAV}>@
//{YIDDISH_VAV_YOD}>@
//{YIDDISH_DOUBLE_YOD}>@
//{PUNCTUATION_GERESH}>@
//{PUNCTUATION_GERSHAYIM}>@
// Reverse-direction counterpart of the "''>" rule: drop apostrophes.
"<''\n"
}
}

View file

@ -0,0 +1,877 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// Latin-Kana
// Rewritten April 1999 to implement Hepburn (Hebon-shiki)
// transliteration. Reference: CJKV Information Processing, Lunde,
// 1999, pp. 30-35.
// @author Alan Liu
lkana {
Rule {
//------------------------------------------------------------
// Variables
//------------------------------------------------------------
// Hiragana. These are named according to the
// regularized Nippon romanization (the naming system
// used by Unicode). Thus \u3062 is called "di", not
// "ji". "x_" is the small form of "_", e.g. "xa" is
// small "a".
"xa=\u3041\n"
"a=\u3042\n"
"xi=\u3043\n"
"i=\u3044\n"
"xu=\u3045\n"
"u=\u3046\n"
"xe=\u3047\n"
"e=\u3048\n"
"xo=\u3049\n"
"o=\u304A\n"
"ka=\u304B\n"
"ga=\u304C\n"
"ki=\u304D\n"
"gi=\u304E\n"
"ku=\u304F\n"
"gu=\u3050\n"
"ke=\u3051\n"
"ge=\u3052\n"
"ko=\u3053\n"
"go=\u3054\n"
"sa=\u3055\n"
"za=\u3056\n"
"si=\u3057\n"
"zi=\u3058\n"
"su=\u3059\n"
"zu=\u305A\n"
"se=\u305B\n"
"ze=\u305C\n"
"so=\u305D\n"
"zo=\u305E\n"
"ta=\u305F\n"
"da=\u3060\n"
"ti=\u3061\n"
"di=\u3062\n"
"xtu=\u3063\n"
"tu=\u3064\n"
"du=\u3065\n"
"te=\u3066\n"
"de=\u3067\n"
"to=\u3068\n"
"do=\u3069\n"
"na=\u306A\n"
"ni=\u306B\n"
"nu=\u306C\n"
"ne=\u306D\n"
"no=\u306E\n"
"ha=\u306F\n"
"ba=\u3070\n"
"pa=\u3071\n"
"hi=\u3072\n"
"bi=\u3073\n"
"pi=\u3074\n"
"hu=\u3075\n"
"bu=\u3076\n"
"pu=\u3077\n"
"he=\u3078\n"
"be=\u3079\n"
"pe=\u307A\n"
"ho=\u307B\n"
"bo=\u307C\n"
"po=\u307D\n"
"ma=\u307E\n"
"mi=\u307F\n"
"mu=\u3080\n"
"me=\u3081\n"
"mo=\u3082\n"
"xya=\u3083\n"
"ya=\u3084\n"
"xyu=\u3085\n"
"yu=\u3086\n"
"xyo=\u3087\n"
"yo=\u3088\n"
"ra=\u3089\n"
"ri=\u308A\n"
"ru=\u308B\n"
"re=\u308C\n"
"ro=\u308D\n"
"xwa=\u308E\n"
"wa=\u308F\n"
"wi=\u3090\n"
"we=\u3091\n"
"wo=\u3092\n"
"n=\u3093\n"
"vu=\u3094\n"
// Katakana. "X_" is the small form of "_", e.g. "XA"
// is small "A".
"XA=\u30A1\n"
"A=\u30A2\n"
"XI=\u30A3\n"
"I=\u30A4\n"
"XU=\u30A5\n"
"U=\u30A6\n"
"XE=\u30A7\n"
"E=\u30A8\n"
"XO=\u30A9\n"
"O=\u30AA\n"
"KA=\u30AB\n"
"GA=\u30AC\n"
"KI=\u30AD\n"
"GI=\u30AE\n"
"KU=\u30AF\n"
"GU=\u30B0\n"
"KE=\u30B1\n"
"GE=\u30B2\n"
"KO=\u30B3\n"
"GO=\u30B4\n"
"SA=\u30B5\n"
"ZA=\u30B6\n"
"SI=\u30B7\n"
"ZI=\u30B8\n"
"SU=\u30B9\n"
"ZU=\u30BA\n"
"SE=\u30BB\n"
"ZE=\u30BC\n"
"SO=\u30BD\n"
"ZO=\u30BE\n"
"TA=\u30BF\n"
"DA=\u30C0\n"
"TI=\u30C1\n"
"DI=\u30C2\n"
"XTU=\u30C3\n"
"TU=\u30C4\n"
"DU=\u30C5\n"
"TE=\u30C6\n"
"DE=\u30C7\n"
"TO=\u30C8\n"
"DO=\u30C9\n"
"NA=\u30CA\n"
"NI=\u30CB\n"
"NU=\u30CC\n"
"NE=\u30CD\n"
"NO=\u30CE\n"
"HA=\u30CF\n"
"BA=\u30D0\n"
"PA=\u30D1\n"
"HI=\u30D2\n"
"BI=\u30D3\n"
"PI=\u30D4\n"
"HU=\u30D5\n"
"BU=\u30D6\n"
"PU=\u30D7\n"
"HE=\u30D8\n"
"BE=\u30D9\n"
"PE=\u30DA\n"
"HO=\u30DB\n"
"BO=\u30DC\n"
"PO=\u30DD\n"
"MA=\u30DE\n"
"MI=\u30DF\n"
"MU=\u30E0\n"
"ME=\u30E1\n"
"MO=\u30E2\n"
"XYA=\u30E3\n"
"YA=\u30E4\n"
"XYU=\u30E5\n"
"YU=\u30E6\n"
"XYO=\u30E7\n"
"YO=\u30E8\n"
"RA=\u30E9\n"
"RI=\u30EA\n"
"RU=\u30EB\n"
"RE=\u30EC\n"
"RO=\u30ED\n"
"XWA=\u30EE\n"
"WA=\u30EF\n"
"WI=\u30F0\n"
"WE=\u30F1\n"
"WO=\u30F2\n"
"N=\u30F3\n"
"VU=\u30F4\n"
"XKA=\u30F5\n"
"XKE=\u30F6\n"
"VA=\u30F7\n"
"VI=\u30F8\n"
"VE=\u30F9\n"
"VO=\u30FA\n"
"DOT=\u30FB\n" // Middle dot
"LONG=\u30FC\n" // Prolonged sound mark
// Categories and programmatic variables
"vowel=[aiueo]\n"
"small=\uE000\n"
"hvr=\uE001\n"
"hv=[{xya}{xi}{xyu}{xe}{xyo}]\n"
//------------------------------------------------------------
// Rules
//------------------------------------------------------------
/*
// Hepburn equivalents
shi>|si
ji>|zi
chi>|ti
// ji>|di // By default we use the ji-zi mapping
tsu>|tu
fu>|hu
sh[{vowel}>|sy
ja>|zya
// ji = zi
ju>|zyu
je>|zye
jo>|zyo
cha>|tya
// chi = ti
chu>|tyu
che>|tye
cho>|tyo
// j[{vowel} = dy{vowel}, but we use zy{vowel} by default
// Historically, m preceded b, p, or m; now n is used
// in all cases
m[b>n
m[p>n
m[m>n
// Compatibility
// 'f' group
fa>{fu}{xa}
fi>{fu}{xi}
// fu = hu
fe>{fu}{xe}
fo>{fu}{xo}
// 'jy' group; these will not round-trip, except for "jyi"
// See also the 'j' group.
jya>|zya
jyi>{zi}{xyi}
jyu>|zyu
jye>|zye
jyo>|zyo
// Nippon romanized forms
a>{a}
i>{i}
u>{u}
e>{e}
o>{o}
ka>{ka}
ki>{ki}
ku>{ku}
ke>{ke}
ko>{ko}
ga>{ga}
gi>{gi}
gu>{gu}
ge>{ge}
go>{go}
sa>{sa}
si>{si}
su>{su}
se>{se}
so>{so}
za>{za}
zi>{zi}
zu>{zu}
ze>{ze}
zo>{zo}
ta>{ta}
ti>{ti}
tu>{tu}
te>{te}
to>{to}
da>{da}
di>{di}
du>{du}
de>{de}
do>{do}
na>{na}
ni>{ni}
nu>{nu}
ne>{ne}
no>{no}
ha>{ha}
hi>{hi}
hu>{hu}
he>{he}
ho>{ho}
ba>{ba}
bi>{bi}
bu>{bu}
be>{be}
bo>{bo}
pa>{pa}
pi>{pi}
pu>{pu}
pe>{pe}
po>{po}
ma>{ma}
mi>{mi}
mu>{mu}
me>{me}
mo>{mo}
ya>{ya}
yu>{yu}
yo>{yo}
ra>{ra}
ri>{ri}
ru>{ru}
re>{re}
ro>{ro}
wa>{wa}
wi>{wi}
// No "wu"
we>{we}
wo>{wo} // Reverse {wo} to "o", not "wo"
n''>{n}
n>{n}
// Palatized Nippon romanized syllables
ky[{vowel}>{ki}|{small}
gy[{vowel}>{gi}|{small}
sy[{vowel}>{si}|{small}
zy[{vowel}>{zi}|{small}
ty[{vowel}>{ti}|{small}
dy[{vowel}>{di}|{small}
ny[{vowel}>{ni}|{small}
my[{vowel}>{mi}|{small}
hy[{vowel}>{hi}|{small}
by[{vowel}>{bi}|{small}
py[{vowel}>{pi}|{small}
ry[{vowel}>{ri}|{small}
// Doubled consonants
c[c>{xtu}
k[k>{xtu}
g[g>{xtu}
s[s>{xtu}
z[z>{xtu}
j[j>{xtu}
t[t>{xtu}
d[d>{xtu}
h[h>{xtu}
f[f>{xtu}
p[p>{xtu}
b[b>{xtu}
m[m>{xtu}
y[y>{xtu}
r[r>{xtu}
w[w>{xtu}
*/
"a>{a}\n"
"ba>{ba}\n"
"bi>{bi}\n"
"bu>{bu}\n"
"be>{be}\n"
"bo>{bo}\n"
"by[{vowel}>{bi}|{small}\n"
"b[b>{xtu}\n"
"da>{da}\n"
"di>{di}\n"
"du>{du}\n"
"de>{de}\n"
"do>{do}\n"
"dy[{vowel}>{di}|{small}\n"
"dh[{vowel}>{de}|{small}\n"
"d[d>{xtu}\n"
"e>{e}\n"
"fa>{hu}{xa}\n"
"fi>{hu}{xi}\n"
"fe>{hu}{xe}\n"
"fo>{hu}{xo}\n"
"fya>{hu}{xya}\n"
"fyu>{hu}{xyu}\n"
"fyo>{hu}{xyo}\n"
"f[f>{xtu}\n"
"ga>{ga}\n"
"gi>{gi}\n"
"gu>{gu}\n"
"ge>{ge}\n"
"go>{go}\n"
"gy[{vowel}>{gi}|{small}\n"
"gwa>{gu}{xwa}\n"
"gwi>{gu}{xi}\n"
"gwu>{gu}{xu}\n"
"gwe>{gu}{xe}\n"
"gwo>{gu}{xo}\n"
"g[g>{xtu}\n"
"ha>{ha}\n"
"hi>{hi}\n"
"hu>{hu}\n"
"he>{he}\n"
"ho>{ho}\n"
"hy[{vowel}>{hi}|{small}\n"
"h[h>{xtu}\n"
"i>{i}\n"
"ka>{ka}\n"
"ki>{ki}\n"
"ku>{ku}\n"
"ke>{ke}\n"
"ko>{ko}\n"
"kwa>{ku}{xwa}\n"
"kwi>{ku}{xi}\n"
"kwu>{ku}{xu}\n"
"kwe>{ku}{xe}\n"
"kwo>{ku}{xo}\n"
"ky[{vowel}>{ki}|{small}\n"
"k[k>{xtu}\n"
"ma>{ma}\n"
"mi>{mi}\n"
"mu>{mu}\n"
"me>{me}\n"
"mo>{mo}\n"
"my[{vowel}>{mi}|{small}\n"
"m[b>{n}\n"
"m[f>{n}\n"
"m[m>{n}\n"
"m[p>{n}\n"
"m[v>{n}\n"
"m''>{n}\n"
"na>{na}\n"
"ni>{ni}\n"
"nu>{nu}\n"
"ne>{ne}\n"
"no>{no}\n"
"ny[{vowel}>{ni}|{small}\n"
"nn>{n}\n"
"n''>{n}\n"
"n>{n}\n"
"o>{o}\n"
"pa>{pa}\n"
"pi>{pi}\n"
"pu>{pu}\n"
"pe>{pe}\n"
"po>{po}\n"
"py[{vowel}>{pi}|{small}\n"
"p[p>{xtu}\n"
"qa>{ku}{xa}\n"
"qi>{ku}{xi}\n"
"qu>{ku}{xu}\n"
"qe>{ku}{xe}\n"
"qo>{ku}{xo}\n"
"qy[{vowel}>{ku}|{small}\n"
"q[q>{xtu}\n"
"ra>{ra}\n"
"ri>{ri}\n"
"ru>{ru}\n"
"re>{re}\n"
"ro>{ro}\n"
"ry[{vowel}>{ri}|{small}\n"
"r[r>{xtu}\n"
"sa>{sa}\n"
"si>{si}\n"
"su>{su}\n"
"se>{se}\n"
"so>{so}\n"
"sy[{vowel}>{si}|{small}\n"
"s[sh>{xtu}\n"
"s[s>{xtu}\n"
"ta>{ta}\n"
"ti>{ti}\n"
"tu>{tu}\n"
"te>{te}\n"
"to>{to}\n"
"th[{vowel}>{te}|{small}\n"
"tsa>{tu}{xa}\n"
"tsi>{tu}{xi}\n"
"tse>{tu}{xe}\n"
"tso>{tu}{xo}\n"
"ty[{vowel}>{ti}|{small}\n"
"t[ts>{xtu}\n"
"t[ch>{xtu}\n"
"t[t>{xtu}\n"
"u>{u}\n"
"va>{VA}\n"
"vi>{VI}\n"
"vu>{vu}\n"
"ve>{VE}\n"
"vo>{VO}\n"
"vy[{vowel}>{VI}|{small}\n"
"v[v>{xtu}\n"
"wa>{wa}\n"
"wi>{wi}\n"
"we>{we}\n"
"wo>{wo}\n"
"w[w>{xtu}\n"
"ya>{ya}\n"
"yu>{yu}\n"
"ye>{i}{xe}\n"
"yo>{yo}\n"
"y[y>{xtu}\n"
"za>{za}\n"
"zi>{zi}\n"
"zu>{zu}\n"
"ze>{ze}\n"
"zo>{zo}\n"
"zy[{vowel}>{zi}|{small}\n"
"z[z>{xtu}\n"
"xa>{xa}\n"
"xi>{xi}\n"
"xu>{xu}\n"
"xe>{xe}\n"
"xo>{xo}\n"
"xka>{XKA}\n"
"xke>{XKE}\n"
"xtu>{xtu}\n"
"xwa>{xwa}\n"
"xya>{xya}\n"
"xyu>{xyu}\n"
"xyo>{xyo}\n"
// optional mappings
"wu>{u}\n"
"ca>{ka}\n"
"ci>{si}\n"
"cu>{ku}\n"
"ce>{se}\n"
"co>{ko}\n"
"cha>{ti}{xya}\n"
"chi>{ti}\n"
"chu>{ti}{xyu}\n"
"che>{ti}{xe}\n"
"cho>{ti}{xyo}\n"
"cy[{vowel}>{ti}|{small}\n"
"c[k>{xtu}\n"
"c[c>{xtu}\n"
"fu>{hu}\n"
"ja>{zi}{xya}\n"
"ji>{zi}\n"
"ju>{zi}{xyu}\n"
"je>{zi}{xe}\n"
"jo>{zi}{xyo}\n"
"jy[{vowel}>{zi}|{small}\n"
"j[j>{xtu}\n"
"la>{ra}\n"
"li>{ri}\n"
"lu>{ru}\n"
"le>{re}\n"
"lo>{ro}\n"
"ly[{vowel}>{ri}|{small}\n"
"l[l>{xtu}\n"
"sha>{si}{xya}\n"
"shi>{si}\n"
"shu>{si}{xyu}\n"
"she>{si}{xe}\n"
"sho>{si}{xyo}\n"
"tsu>{tu}\n"
"yi>{i}\n"
"xtsu>{xtu}\n"
"xyi>{xi}\n"
"xye>{xe}\n"
// Convert vowels to small form
"{small}a>{xya}\n"
"{small}i>{xi}\n"
"{small}u>{xyu}\n"
"{small}e>{xe}\n"
"{small}o>{xyo}\n"
"gy|{hvr}<{gi}[{hv}\n"
"gwa<{gu}{xwa}\n"
"gwi<{gu}{xi}\n"
"gwu<{gu}{xu}\n"
"gwe<{gu}{xe}\n"
"gwo<{gu}{xo}\n"
"ga<{ga}\n"
"gi<{gi}\n"
"gu<{gu}\n"
"ge<{ge}\n"
"go<{go}\n"
"ky|{hvr}<{ki}[{hv}\n"
"kwa<{ku}{xwa}\n"
"kwi<{ku}{xi}\n"
"kwu<{ku}{xu}\n"
"kwe<{ku}{xe}\n"
"kwo<{ku}{xo}\n"
"qa<{ku}{xa}\n"
"qya<{ku}{xya}\n"
"qyu<{ku}{xyu}\n"
"qyo<{ku}{xyo}\n"
"ka<{ka}\n"
"ki<{ki}\n"
"ku<{ku}\n"
"ke<{ke}\n"
"ko<{ko}\n"
"j|{hvr}<{zi}[{hv}\n" // Hepburn
"za<{za}\n"
"ji<{zi}\n" // Hepburn
"zu<{zu}\n"
"ze<{ze}\n"
"zo<{zo}\n"
"sh|{hvr}<{si}[{hv}\n" // Hepburn
"sa<{sa}\n"
"shi<{si}\n"
"su<{su}\n"
"se<{se}\n"
"so<{so}\n"
"j|{hvr}<{di}[{hv}\n" // Hepburn
"dh|{hvr}<{de}[{hv}\n"
"da<{da}\n"
"ji<{di}\n" // Hepburn
"de<{de}\n"
"do<{do}\n"
"zu<{du}\n" // Hepburn
"ch|{hvr}<{ti}[{hv}\n" // Hepburn
"tsa<{tu}{xa}\n"
"tsi<{tu}{xi}\n"
"tse<{tu}{xe}\n"
"tso<{tu}{xo}\n"
"th|{hvr}<{te}[{hv}\n"
"ta<{ta}\n"
"chi<{ti}\n" // Hepburn
"tsu<{tu}\n" // Hepburn
"te<{te}\n"
"to<{to}\n"
"ny|{hvr}<{ni}[{hv}\n"
"na<{na}\n"
"ni<{ni}\n"
"nu<{nu}\n"
"ne<{ne}\n"
"no<{no}\n"
"by|{hvr}<{bi}[{hv}\n"
"ba<{ba}\n"
"bi<{bi}\n"
"bu<{bu}\n"
"be<{be}\n"
"bo<{bo}\n"
"py|{hvr}<{pi}[{hv}\n"
"pa<{pa}\n"
"pi<{pi}\n"
"pu<{pu}\n"
"pe<{pe}\n"
"po<{po}\n"
"hy|{hvr}<{hi}[{hv}\n"
"fa<{hu}{xa}\n"
"fi<{hu}{xi}\n"
"fe<{hu}{xe}\n"
"fo<{hu}{xo}\n"
"fya<{hu}{xya}\n"
"fyu<{hu}{xyu}\n"
"fyo<{hu}{xyo}\n"
"ha<{ha}\n"
"hi<{hi}\n"
"fu<{hu}\n" // Hepburn
"he<{he}\n"
"ho<{ho}\n"
"my|{hvr}<{mi}[{hv}\n"
"ma<{ma}\n"
"mi<{mi}\n"
"mu<{mu}\n"
"me<{me}\n"
"mo<{mo}\n"
"ya<{ya}\n"
"yu<{yu}\n"
"ye<{i}{xe}\n"
"yo<{yo}\n"
"xya<{xya}\n"
"xyu<{xyu}\n"
"xyo<{xyo}\n"
"ry|{hvr}<{ri}[{hv}\n"
"ra<{ra}\n"
"ri<{ri}\n"
"ru<{ru}\n"
"re<{re}\n"
"ro<{ro}\n"
"wa<{wa}\n"
"wi<{wi}\n"
"we<{we}\n"
"wo<{wo}\n"
"vu<{vu}\n"
"vy|{hvr}<{VI}[{hv}\n"
"v<{xtu}[{vu}\n"
"xa<{xa}\n"
"xi<{xi}\n"
"xu<{xu}\n"
"xe<{xe}\n"
"xo<{xo}\n"
"n''<{n}[{a}\n"
"n''<{n}[{i}\n"
"n''<{n}[{u}\n"
"n''<{n}[{e}\n"
"n''<{n}[{o}\n"
"n''<{n}[{na}\n"
"n''<{n}[{ni}\n"
"n''<{n}[{nu}\n"
"n''<{n}[{ne}\n"
"n''<{n}[{no}\n"
"n''<{n}[{ya}\n"
"n''<{n}[{yu}\n"
"n''<{n}[{yo}\n"
"n''<{n}[{n}\n"
"n<{n}\n"
"g<{xtu}[{ga}\n"
"g<{xtu}[{gi}\n"
"g<{xtu}[{gu}\n"
"g<{xtu}[{ge}\n"
"g<{xtu}[{go}\n"
"k<{xtu}[{ka}\n"
"k<{xtu}[{ki}\n"
"k<{xtu}[{ku}\n"
"k<{xtu}[{ke}\n"
"k<{xtu}[{ko}\n"
"z<{xtu}[{za}\n"
"z<{xtu}[{zi}\n"
"z<{xtu}[{zu}\n"
"z<{xtu}[{ze}\n"
"z<{xtu}[{zo}\n"
"s<{xtu}[{sa}\n"
"s<{xtu}[{si}\n"
"s<{xtu}[{su}\n"
"s<{xtu}[{se}\n"
"s<{xtu}[{so}\n"
"d<{xtu}[{da}\n"
"d<{xtu}[{di}\n"
"d<{xtu}[{du}\n"
"d<{xtu}[{de}\n"
"d<{xtu}[{do}\n"
"t<{xtu}[{ta}\n"
"t<{xtu}[{ti}\n"
"t<{xtu}[{tu}\n"
"t<{xtu}[{te}\n"
"t<{xtu}[{to}\n"
"b<{xtu}[{ba}\n"
"b<{xtu}[{bi}\n"
"b<{xtu}[{bu}\n"
"b<{xtu}[{be}\n"
"b<{xtu}[{bo}\n"
"p<{xtu}[{pa}\n"
"p<{xtu}[{pi}\n"
"p<{xtu}[{pu}\n"
"p<{xtu}[{pe}\n"
"p<{xtu}[{po}\n"
"h<{xtu}[{ha}\n"
"h<{xtu}[{hi}\n"
"h<{xtu}[{hu}\n"
"h<{xtu}[{he}\n"
"h<{xtu}[{ho}\n"
"r<{xtu}[{ra}\n"
"r<{xtu}[{ri}\n"
"r<{xtu}[{ru}\n"
"r<{xtu}[{re}\n"
"r<{xtu}[{ro}\n"
"w<{xtu}[{wa}\n"
"xtu<{xtu}\n"
"a<{a}\n"
"i<{i}\n"
"u<{u}\n"
"e<{e}\n"
"o<{o}\n"
// Convert small forms to vowels
"a<{hvr}{xya}\n"
"i<{hvr}{xi}\n"
"u<{hvr}{xyu}\n"
"e<{hvr}{xe}\n"
"o<{hvr}{xyo}\n"
}
}

View file

@ -0,0 +1,315 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// Latin-Russian
lrussian {
Rule {
// Russian Letters
// Each line defines a variable (name=value) that the rules below reference
// as {name}.  The uppercase table mirrors the lowercase one: every uppercase
// value is the lowercase value minus 0x20, except Yo/yo, which live outside
// the contiguous alphabet run (U+0401 / U+0451).
"cyA=\u0410\n"
"cyBe=\u0411\n"
"cyVe=\u0412\n"
"cyGe=\u0413\n"
"cyDe=\u0414\n"
"cyYe=\u0415\n"
"cyYo=\u0401\n"   // CYRILLIC CAPITAL LETTER IO (was U+0416, which is ZHE)
"cyZhe=\u0416\n"  // was U+0417 (ZE)
"cyZe=\u0417\n"   // was U+0418 (I)
"cyYi=\u0418\n"   // was U+0419, colliding with cyY below
"cyY=\u0419\n"
"cyKe=\u041a\n"
"cyLe=\u041b\n"
"cyMe=\u041c\n"
"cyNe=\u041d\n"
"cyO=\u041e\n"
"cyPe=\u041f\n"
"cyRe=\u0420\n"
"cySe=\u0421\n"
"cyTe=\u0422\n"
"cyU=\u0423\n"
"cyFe=\u0424\n"
"cyKhe=\u0425\n"
"cyTse=\u0426\n"
"cyChe=\u0427\n"
"cyShe=\u0428\n"
"cyShche=\u0429\n"
"cyHard=\u042a\n"
"cyI=\u042b\n"
"cySoft=\u042c\n"
"cyE=\u042d\n"
"cyYu=\u042e\n"
"cyYa=\u042f\n"
// Lowercase letters
"cya=\u0430\n"
"cybe=\u0431\n"
"cyve=\u0432\n"
"cyge=\u0433\n"
"cyde=\u0434\n"
"cyye=\u0435\n"
"cyzhe=\u0436\n"
"cyze=\u0437\n"
"cyyi=\u0438\n"
"cyy=\u0439\n"
"cyke=\u043a\n"
"cyle=\u043b\n"
"cyme=\u043c\n"
"cyne=\u043d\n"
"cyo=\u043e\n"
"cype=\u043f\n"
"cyre=\u0440\n"
"cyse=\u0441\n"
"cyte=\u0442\n"
"cyu=\u0443\n"
"cyfe=\u0444\n"
"cykhe=\u0445\n"
"cytse=\u0446\n"
"cyche=\u0447\n"
"cyshe=\u0448\n"
"cyshche=\u0449\n"
"cyhard=\u044a\n"
"cyi=\u044b\n"
"cysoft=\u044c\n"
"cye=\u044d\n"
"cyyu=\u044e\n"
"cyya=\u044f\n"
"cyyo=\u0451\n"
// convert English to Russian
"Russian>\u041f\u0420\u0410\u0412\u0414\u0410\u00D1\u0020\u0411\u044d\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f\u002c\u0020\u043a\u044b\u0440\u0433\u044b\u0437\u002c\u0020\u041c\u043e\u043b\u0434\u043e\u0432\u044d\u043d\u044f\u0441\u043a\u044d\u002e\n"
//special equivs for ay, oy, ...
"YAI>{cyYa}{cyY}\n"
"YEI>{cyYe}{cyY}\n"
"YII>{cyYi}{cyY}\n"
"YOI>{cyYo}{cyY}\n"
"YUI>{cyYu}{cyY}\n"
"AI>{cyA}{cyY}\n"
"EI>{cyE}{cyY}\n"
//skip II, since it is the soft sign
"OI>{cyO}{cyY}\n"
"UI>{cyU}{cyY}\n"
"A>{cyA}\n"
"B>{cyBe}\n"
"CH>{cyChe}\n"
"C[I>{cySe}\n"
"C[E>{cySe}\n"
"C[Y>{cySe}\n"
"C>{cyKe}\n"
"D>{cyDe}\n"
"E>{cyE}\n"
"F>{cyFe}\n"
"G>{cyGe}\n"
"H>{cyHard}\n"
"II>{cySoft}\n"
"I>{cyI}\n"
"J>{cyDe}{cyZhe}\n"
"KH>{cyKhe}\n"
"K>{cyKe}\n"
"L>{cyLe}\n"
"M>{cyMe}\n"
"N>{cyNe}\n"
"O>{cyO}\n"
"P>{cyPe}\n"
"QU>{cyKe}{cyVe}\n"
"R>{cyRe}\n"
"SHTCH>{cyShche}\n"
"SHCH>{cyShche}\n"
"SH>{cyShe}\n"
"S>{cySe}\n"
"TCH>{cyChe}\n"
"TH>{cyZe}\n"
"TS>{cyTse}\n"
"T>{cyTe}\n"
"U>{cyU}\n"
"V>{cyVe}\n"
"WH>{cyVe}\n"
"W>{cyVe}\n"
"X>{cyKe}{cySe}\n"
"YE>{cyYe}\n"
"YO>{cyYo}\n"
"YU>{cyYu}\n"
"YA>{cyYa}\n"
"YI>{cyYi}\n"
"Y>{cyY}\n"
"ZH>{cyZhe}\n"
"Z>{cyZe}\n"
"X>{cyKe}{cySe}\n"
//lower case: doesn''t solve join bug
"yai>{cyya}{cyy}\n"
"yei>{cyye}{cyy}\n"
"yii>{cyyi}{cyy}\n"
"yoi>{cyyo}{cyy}\n"
"yui>{cyyu}{cyy}\n"
"ai>{cya}{cyy}\n"
"ei>{cye}{cyy}\n"
//skip ii, since it is the soft sign
"oi>{cyo}{cyy}\n"
"ui>{cyu}{cyy}\n"
"a>{cya}\n"
"b>{cybe}\n"
"ch>{cyche}\n"
"c[i>{cyse}\n"
"c[e>{cyse}\n"
"c[y>{cyse}\n"
"c>{cyke}\n"
"d>{cyde}\n"
"e>{cye}\n"
"f>{cyfe}\n"
"g>{cyge}\n"
"h>{cyhard}\n"
"ii>{cysoft}\n"
"i>{cyi}\n"
"j>{cyde}{cyzhe}\n"
"kh>{cykhe}\n"
"k>{cyke}\n"
"l>{cyle}\n"
"m>{cyme}\n"
"n>{cyne}\n"
"o>{cyo}\n"
"p>{cype}\n"
"qu>{cyke}{cyve}\n"
"r>{cyre}\n"
"shtch>{cyshche}\n"
"shch>{cyshche}\n"
"sh>{cyshe}\n"
"s>{cyse}\n"
"tch>{cyche}\n"
"th>{cyze}\n"
"ts>{cytse}\n"
"t>{cyte}\n"
"u>{cyu}\n"
"v>{cyve}\n"
"wh>{cyve}\n"
"w>{cyve}\n"
"x>{cyke}{cyse}\n"
"ye>{cyye}\n"
"yo>{cyyo}\n"
"yu>{cyyu}\n"
"ya>{cyya}\n"
"yi>{cyyi}\n"
"y>{cyy}\n"
"zh>{cyzhe}\n"
"z>{cyze}\n"
"x>{cyke}{cyse}\n"
//generally the last rule
"''>\n"
//now Russian to English
"Y''<{cyY}[{cyA}\n"
"Y''<{cyY}[{cyE}\n"
"Y''<{cyY}[{cyI}\n"
"Y''<{cyY}[{cyO}\n"
"Y''<{cyY}[{cyU}\n"
"A<{cyA}\n"
"B<{cyBe}\n"
"J<{cyDe}{cyZhe}\n"
"D<{cyDe}\n"
"V<{cyVe}\n"
"G<{cyGe}\n"
"ZH<{cyZhe}\n"
"Z''<{cyZe}[{cyHard}\n"
"Z<{cyZe}\n"
"YE<{cyYe}\n"
"YO<{cyYo}\n"
"YU<{cyYu}\n"
"YA<{cyYa}\n"
"YI<{cyYi}\n"
"Y<{cyY}\n"
"KH<{cyKhe}\n"
"K''<{cyKe}[{cyHard}\n"
"X<{cyKe}{cySe}\n"
"K<{cyKe}\n"
"L<{cyLe}\n"
"M<{cyMe}\n"
"N<{cyNe}\n"
"O<{cyO}\n"
"P<{cyPe}\n"
"R<{cyRe}\n"
"SHCH<{cyShche}\n"
"SH''<{cyShe}[{cyChe}\n"
"SH<{cyShe}\n"
"S''<{cySe}[{cyHard}\n"
"S<{cySe}\n"
"TS<{cyTse}\n"
"T''<{cyTe}[{cySe}\n"
"T''<{cyTe}[{cyHard}\n"
"T<{cyTe}\n"
"U<{cyU}\n"
"F<{cyFe}\n"
"CH<{cyChe}\n"
"H<{cyHard}\n"
"I''<{cyI}[{cyI}\n"
"I<{cyI}\n"
"II<{cySoft}\n"
"E<{cyE}\n"
//lowercase
"y''<{cyy}[{cya}\n"
"y''<{cyy}[{cye}\n"
"y''<{cyy}[{cyi}\n"
"y''<{cyy}[{cyo}\n"
"y''<{cyy}[{cyu}\n"
"a<{cya}\n"
"b<{cybe}\n"
"j<{cyde}{cyzhe}\n"
"d<{cyde}\n"
"v<{cyve}\n"
"g<{cyge}\n"
"zh<{cyzhe}\n"
"z''<{cyze}[{cyhard}\n"
"z<{cyze}\n"
"ye<{cyye}\n"
"yo<{cyyo}\n"
"yu<{cyyu}\n"
"ya<{cyya}\n"
"yi<{cyyi}\n"
"y<{cyy}\n"
"kh<{cykhe}\n"
"k''<{cyke}[{cyhard}\n"
"x<{cyke}{cyse}\n"
"k<{cyke}\n"
"l<{cyle}\n"
"m<{cyme}\n"
"n<{cyne}\n"
"o<{cyo}\n"
"p<{cype}\n"
"r<{cyre}\n"
"shch<{cyshche}\n"
"sh''<{cyshe}[{cyche}\n"
"sh<{cyshe}\n"
"s''<{cyse}[{cyhard}\n"
"s<{cyse}\n"
"ts<{cytse}\n"
"t''<{cyte}[{cyse}\n"
"t''<{cyte}[{cyhard}\n"
"t<{cyte}\n"
"u<{cyu}\n"
"f<{cyfe}\n"
"ch<{cyche}\n"
"h<{cyhard}\n"
"i''<{cyi}[{cyi}\n"
"i<{cyi}\n"
"ii<{cysoft}\n"
"e<{cye}\n"
//generally the last rule
"''>\n"
//the end
}
}

View file

@ -0,0 +1,83 @@
//--------------------------------------------------------------------
// Copyright (C) 1999, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 11/17/99 aliu Creation.
//--------------------------------------------------------------------
// StraightQuotes-CurlyQuotes
// Rule data for the StraightQuotes-CurlyQuotes transliterator.
// Lines of the form name=value define variables that later rules reference
// as {name}.  Rules using '>' are listed under "Conversions from input" and
// rules using '<' under "Conversions back to input".
// NOTE(review): two consecutive apostrophes ('') appear to denote one literal
// apostrophe, and text before ']' appears to be preceding context that is
// matched but not replaced -- confirm both against the rule parser.
quotes {
Rule {
// Rewritten using character codes [LIU]
// Character classes: separators (space, line, paragraph), everything else,
// and opening punctuation.
"white=[[:Zs:][:Zl:][:Zp:]]\n"
"black=[^[:Zs:][:Zl:][:Zp:]]\n"
"open=[[:Ps:]]\n"
// Individual characters.  lAng, ldAng, lBrk, lBrc and the guillemet
// variables other than those used in the "join single guillemets" rules
// are defined here but not referenced by any rule in this bundle.
"dquote=\"\n"
"lAng=\u3008\n"
"ldAng=\u300A\n"
"lBrk='['\n"
"lBrc='{'\n"
"lquote=\u2018\n"
"rquote=\u2019\n"
"ldquote=\u201C\n"
"rdquote=\u201D\n"
"ldguill=\u00AB\n"
"rdguill=\u00BB\n"
"lguill=\u2039\n"
"rguill=\u203A\n"
"mdash=\u2014\n"
//#######################################
// Conversions from input
//#######################################
// join single quotes
"{lquote}''>{ldquote}\n"
"{lquote}{lquote}>{ldquote}\n"
"{rquote}''>{rdquote}\n"
"{rquote}{rquote}>{rdquote}\n"
//smart single quotes
// After whitespace or an opening bracket the quote opens; after any other
// character it closes; the last rule is the fallback when none of the
// contexts apply.
"{white}]''>{lquote}\n"
"{open}]''>{lquote}\n"
"{black}]''>{rquote}\n"
"''>{lquote}\n"
//smart doubles
// Same scheme as the single quotes above, applied to the straight double quote.
"{white}]{dquote}>{ldquote}\n"
"{open}]{dquote}>{ldquote}\n"
"{black}]{dquote}>{rdquote}\n"
"{dquote}>{ldquote}\n"
// join single guillemets
"{rguill}{rguill}>{rdguill}\n"
"'>>'>{rdguill}\n"
"{lguill}{lguill}>{ldguill}\n"
"'<<'>{ldguill}\n"
// prevent double spaces
" ] >\n"
// join hyphens into dash
"-->{mdash}\n"
//#######################################
// Conversions back to input
//#######################################
//smart quotes
// Both curly variants collapse to the same straight quote.
"''<{lquote}\n"
"''<{rquote}\n"
"{dquote}<{ldquote}\n"
"{dquote}<{rdquote}\n"
//hyphens
"--<{mdash}\n"
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,277 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "cpdtrans.h"
#include "unifilt.h"
#include "unifltlg.h"
/**
 * Constructs a compound transliterator with an empty chain.  Components
 * are supplied afterwards through setTransliterators() or
 * adoptTransliterators().
 * @param ID            identifier forwarded to the Transliterator base
 * @param adoptedFilter filter forwarded to the Transliterator base; may be 0
 */
CompoundTransliterator::CompoundTransliterator(
        const UnicodeString& ID,
        UnicodeFilter* adoptedFilter)
    : Transliterator(ID, adoptedFilter),
      trans(0),
      count(0)
{
}
/**
 * Builds a compound transliterator from an array of component
 * transliterators.  Any array length is accepted (zero and one
 * included), although a chain is only useful with two or more
 * components.  Each component is cloned; the caller keeps ownership of
 * the array and of the objects it points to.
 * @param ID              identifier forwarded to the Transliterator base
 * @param transliterators array of <code>Transliterator</code> objects
 * @param transCount      number of entries in <tt>transliterators</tt>
 * @param adoptedFilter   the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be altered by
 * this transliterator.  If it is <tt>null</tt> no filtering is applied.
 */
CompoundTransliterator::CompoundTransliterator(
        const UnicodeString& ID,
        Transliterator* const transliterators[],
        int32_t transCount,
        UnicodeFilter* adoptedFilter)
    : Transliterator(ID, adoptedFilter),
      trans(0),
      count(0)
{
    // Start from the empty state so that setTransliterators() has
    // nothing to release.
    setTransliterators(transliterators, transCount);
}
/**
 * Copy constructor.  Initializes an empty chain and then delegates to
 * the assignment operator, which clones every component of the source.
 */
CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t)
    : Transliterator(t),
      trans(0),
      count(0)
{
    operator=(t);
}
/**
 * Destructor.  Releases every component transliterator and the array
 * that holds them.
 */
CompoundTransliterator::~CompoundTransliterator()
{
    freeTransliterators();
}
void CompoundTransliterator::freeTransliterators() {
for (int32_t i=0; i<count; ++i) {
delete trans[i];
}
delete[] trans;
trans = 0;
count = 0;
}
/**
 * Assignment operator.  Replaces this chain with clones of the
 * components of <tt>t</tt>.  The existing array is reused when it is
 * already large enough; otherwise a new one is allocated.
 * @param t the compound transliterator to copy
 * @return a reference to this object
 */
CompoundTransliterator& CompoundTransliterator::operator=(
        const CompoundTransliterator& t) {
    if (this == &t) {
        // Self-assignment must be a no-op: the code below deletes our
        // components before cloning from t, which would otherwise clone
        // from pointers that were just deleted.
        return *this;
    }
    Transliterator::operator=(t);
    int32_t i;
    // Release the components we currently own.
    for (i=0; i<count; ++i) {
        delete trans[i];
        trans[i] = 0;
    }
    // Grow the array only when the source chain is longer than ours.
    if (t.count > count) {
        delete[] trans;
        trans = new Transliterator*[t.count];
    }
    count = t.count;
    for (i=0; i<count; ++i) {
        trans[i] = t.trans[i]->clone();
    }
    return *this;
}
/**
 * Transliterator API.  Returns a heap-allocated copy of this object,
 * produced with the copy constructor.
 */
Transliterator* CompoundTransliterator::clone() const
{
    CompoundTransliterator* copy = new CompoundTransliterator(*this);
    return copy;
}
/**
 * Reports how many transliterators make up this chain.
 * @return number of transliterators in this chain.
 */
int32_t CompoundTransliterator::getCount() const
{
    return this->count;
}
/**
 * Gives read-only access to one component of the chain.  The index is
 * not range-checked.
 * @param index index into chain, from 0 to <code>getCount() - 1</code>
 * @return transliterator at the given index
 */
const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const
{
    const Transliterator* component = trans[index];
    return *component;
}
/**
 * Replaces the chain with clones of the given transliterators.  The
 * caller retains ownership of the array and of its elements.
 * @param transliterators array of components to clone
 * @param transCount      number of entries in <tt>transliterators</tt>
 */
void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[],
                                                int32_t transCount)
{
    Transliterator** clones = new Transliterator*[transCount];
    int32_t k = 0;
    while (k < transCount) {
        clones[k] = transliterators[k]->clone();
        ++k;
    }
    // Hand the freshly built array over; the previous chain is released there.
    adoptTransliterators(clones, transCount);
}
/**
 * Replaces the chain with the given array, taking ownership of both
 * the array and the transliterators it points to.  They are later
 * released with <tt>delete</tt> / <tt>delete[]</tt>.
 * @param adoptedTransliterators array of components to adopt
 * @param transCount             number of entries in the array
 */
void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[],
                                                  int32_t transCount)
{
    // Drop whatever chain we held before installing the new one.
    freeTransliterators();
    count = transCount;
    trans = adoptedTransliterators;
}
/**
 * Transliterates a segment of a string.  <code>Transliterator</code> API.
 * Every component is applied in chain order to the same start index;
 * the limit returned by one component is the limit passed to the next.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @return the new limit index
 */
int32_t CompoundTransliterator::transliterate(Replaceable& text,
                                              int32_t start, int32_t limit) const
{
    int32_t newLimit = limit;
    for (int32_t k = 0; k < count; ++k) {
        newLimit = trans[k]->transliterate(text, start, newLimit);
    }
    return newLimit;
}
/**
 * Implements {@link Transliterator#handleKeyboardTransliterate}.
 * Runs every component in chain order over the text and leaves the
 * resulting cursor and limit in <tt>index</tt>.
 * @param text  the text being edited
 * @param index positions indexed by CURSOR and LIMIT (and a third slot
 * this function does not touch); updated in place
 */
void CompoundTransliterator::handleKeyboardTransliterate(Replaceable& text,
                                                         int32_t index[3]) const {
    /* Call each transliterator with the same start value and
     * initial cursor index, but with the limit index as modified
     * by preceding transliterators.  The cursor index must be
     * reset for each transliterator to give each a chance to
     * transliterate the text.  The initial cursor index is known
     * to still point to the same place after each transliterator
     * is called because each transliterator will not change the
     * text between start and the initial value of cursor.
     *
     * IMPORTANT: After the first transliterator, each subsequent
     * transliterator only gets to transliterate text committed by
     * preceding transliterators; that is, the cursor (output
     * value) of transliterator i becomes the limit (input value)
     * of transliterator i+1.  Finally, the overall limit is fixed
     * up before we return.
     *
     * Assumptions we make here:
     * (1) start <= cursor <= limit    ;cursor valid on entry
     * (2) cursor <= cursor' <= limit' ;cursor doesn't move back
     * (3) cursor <= limit'            ;text before cursor unchanged
     * - cursor' is the value of cursor after calling handleKT
     * - limit' is the value of limit after calling handleKT
     */
    /**
     * Example: 3 transliterators.  This example illustrates the
     * mechanics we need to implement.  S, C, and L are the start,
     * cursor, and limit.  gl is the globalLimit.
     *
     * 1. h-u, changes hex to Unicode
     *
     * 4 7 a d 0 4 7 a
     * abc/u0061/u => abca/u
     * S C L S C L gl=f->a
     *
     * 2. upup, changes "x" to "XX"
     *
     * 4 7 a 4 7 a
     * abca/u => abcAA/u
     * S CL S C
     * L gl=a->b
     * 3. u-h, changes Unicode to hex
     *
     * 4 7 a 4 7 a d 0 3
     * abcAA/u => abc/u0041/u0041/u
     * S C L S C
     * L gl=b->15
     * 4. return
     *
     * 4 7 a d 0 3
     * abc/u0041/u0041/u
     * S C L
     */
    if (count < 1) {
        return; // Short circuit for empty compound transliterators
    }
    /**
     * One more wrinkle.  If there is a filter F for the compound
     * transliterator as a whole, then we need to modify every
     * filter f in the chain to be f' = F & f.  A component with no
     * filter of its own simply gets a copy of F.  Then, when we're
     * done, we restore the original filters.
     *
     * A possible future optimization is to change f to f' at
     * construction time, but then if anyone else is using the
     * transliterators in the chain outside of this context, they
     * will get unexpected results.
     */
    const UnicodeFilter* F = getFilter();
    UnicodeFilter** f = 0;
    if (F != 0) {
        f = new UnicodeFilter*[count];
        for (int32_t i=0; i<count; ++i) {
            const UnicodeFilter* own = trans[i]->getFilter();
            if (own == 0) {
                // Component is unfiltered: there is nothing to clone and
                // nothing to AND with, so F alone is the effective filter.
                // Dereferencing the null filter here used to crash.
                f[i] = 0;
                trans[i]->adoptFilter(F->clone());
            } else {
                // Keep a private copy so the original filter can be put
                // back after the temporary one has been adopted.
                f[i] = own->clone();
                trans[i]->adoptFilter(UnicodeFilterLogic::createAnd(*F, *f[i]));
            }
        }
    }
    int32_t cursor = index[CURSOR];
    int32_t limit = index[LIMIT];
    int32_t globalLimit = limit;
    /* globalLimit is the overall limit.  We keep track of this
     * since we overwrite index[LIMIT] with the previous
     * index[CURSOR].  After each transliteration, we update
     * globalLimit for insertions or deletions that have happened.
     */
    for (int32_t i=0; i<count; ++i) {
        index[CURSOR] = cursor; // Reset cursor
        index[LIMIT] = limit;
        trans[i]->handleKeyboardTransliterate(text, index);
        // Adjust overall limit for insertions/deletions
        globalLimit += index[LIMIT] - limit;
        limit = index[CURSOR]; // Move limit to end of committed text
    }
    // Cursor is good where it is -- where the last
    // transliterator left it.  Limit needs to be put back
    // where it was, modulo adjustments for deletions/insertions.
    index[LIMIT] = globalLimit;
    // Fixup the transliterator filters, if we had to modify them.
    if (f != 0) {
        for (int32_t i=0; i<count; ++i) {
            // f[i] is 0 for components that had no filter.
            // NOTE(review): assumes adoptFilter(0) restores the unfiltered
            // state, as the constructors' default argument suggests.
            trans[i]->adoptFilter(f[i]);
        }
        delete[] f;
    }
}
/**
 * Returns the length of the longest context required by this transliterator.
 * This is <em>preceding</em> context.  For a compound transliterator it
 * is the largest value reported by any component, or zero for an empty
 * chain.
 * @return maximum number of preceding context characters this
 * transliterator needs to examine
 */
int32_t CompoundTransliterator::getMaximumContextLength() const
{
    int32_t longest = 0;
    for (int32_t k = 0; k < count; ++k) {
        const int32_t needed = trans[k]->getMaximumContextLength();
        longest = (needed > longest) ? needed : longest;
    }
    return longest;
}

View file

@ -0,0 +1,133 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef CPDTRANS_H
#define CPDTRANS_H
#include "translit.h"
/**
* A transliterator that is composed of two or more other
* transliterator objects linked together. For example, if one
* transliterator transliterates from script A to script B, and
* another transliterates from script B to script C, the two may be
* combined to form a new transliterator from A to C.
*
* <p>Composed transliterators may not behave as expected. For
* example, inverses may not combine to form the identity
* transliterator. See the class documentation for {@link
* Transliterator} for details.
*
* <p>If a non-<tt>null</tt> <tt>UnicodeFilter</tt> is applied to a
* <tt>CompoundTransliterator</tt>, it has the effect of being
* logically <b>and</b>ed with the filter of each transliterator in
* the chain.
*
* <p>The object owns its component transliterators: they are deleted
* when the chain is replaced or the object is destroyed.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: cpdtrans.h,v $ $Revision: 1.1 $ $Date: 1999/11/20 00:36:43 $
*/
class U_I18N_API CompoundTransliterator : public Transliterator {
/**
* Owned array of owned component transliterators, applied in index
* order; 0 when the chain is empty.
*/
Transliterator** trans;
/**
* Number of valid entries in <tt>trans</tt>.
*/
int32_t count;
public:
/**
* Constructs a new compound transliterator given an array of
* transliterators. The array of transliterators may be of any
* length, including zero or one, however, useful compound
* transliterators have at least two components. The components
* are cloned; the caller keeps ownership of the array passed in.
* @param ID identifier passed to the <tt>Transliterator</tt> base
* @param transliterators array of <code>Transliterator</code>
* objects
* @param count number of entries in <tt>transliterators</tt>
* @param adoptedFilter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
*/
CompoundTransliterator(const UnicodeString& ID,
Transliterator* const transliterators[],
int32_t count,
UnicodeFilter* adoptedFilter = 0);
/**
* Constructs a compound transliterator with an empty chain; use
* setTransliterators() or adoptTransliterators() to populate it.
* @param ID identifier passed to the <tt>Transliterator</tt> base
* @param adoptedFilter the filter, or <tt>null</tt> for none
*/
CompoundTransliterator(const UnicodeString& ID,
UnicodeFilter* adoptedFilter = 0);
/**
* Destructor. Deletes the component transliterators.
*/
virtual ~CompoundTransliterator();
/**
* Copy constructor. The components of the source are cloned.
*/
CompoundTransliterator(const CompoundTransliterator&);
/**
* Assignment operator. The current components are deleted and
* replaced by clones of the source's components.
*/
CompoundTransliterator& operator=(const CompoundTransliterator&);
/**
* Transliterator API. Returns a newly allocated copy of this object.
*/
Transliterator* clone() const;
/**
* Returns the number of transliterators in this chain.
* @return number of transliterators in this chain.
*/
virtual int32_t getCount() const;
/**
* Returns the transliterator at the given index in this chain.
* The index is not range-checked.
* @param index index into chain, from 0 to <code>getCount() - 1</code>
* @return transliterator at the given index
*/
virtual const Transliterator& getTransliterator(int32_t index) const;
/**
* Replaces the chain with clones of the given transliterators.
* The caller keeps ownership of the array and its elements.
* @param transliterators array of components to clone
* @param count number of entries in <tt>transliterators</tt>
*/
void setTransliterators(Transliterator* const transliterators[],
int32_t count);
/**
* Replaces the chain with the given array, taking ownership of the
* array itself (released with <tt>delete[]</tt>) and of every
* transliterator in it (released with <tt>delete</tt>).
* @param adoptedTransliterators array of components to adopt
* @param count number of entries in <tt>adoptedTransliterators</tt>
*/
void adoptTransliterators(Transliterator* adoptedTransliterators[],
int32_t count);
/**
* Transliterates a segment of a string. <code>Transliterator</code> API.
* @param text the string to be transliterated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @return the new limit index
*/
virtual int32_t transliterate(Replaceable& text, int32_t start, int32_t limit) const;
/**
* Implements {@link Transliterator#handleKeyboardTransliterate}.
* Each component is run in turn; the cursor produced by one
* component becomes the limit seen by the next.
*/
virtual void handleKeyboardTransliterate(Replaceable& text,
int32_t index[3]) const;
/**
* Returns the length of the longest context required by this transliterator.
* This is <em>preceding</em> context.
* @return maximum number of preceding context characters this
* transliterator needs to examine
*/
virtual int32_t getMaximumContextLength() const;
private:
/**
* Deletes every component and the array holding them, leaving the
* chain empty.
*/
void freeTransliterators();
};
#endif

View file

@ -0,0 +1,155 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "hextouni.h"
#include "rep.h"
#include "unifilt.h"
#include "uniset.h" // For UnicodeSet::digit REMOVE LATER
/**
* ID for this transliterator. Passed to the <tt>Transliterator</tt>
* base class by the constructor.
*/
const char* HexToUnicodeTransliterator::_ID = "Hex-Unicode";
/**
 * Constructs a transliterator registered under the class-wide ID.
 * @param adoptedFilter filter forwarded to the Transliterator base; may be 0
 */
HexToUnicodeTransliterator::HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter)
{
}
/**
 * Copy constructor.  This class adds no state of its own, so copying
 * the Transliterator base is all that is required.
 */
HexToUnicodeTransliterator::HexToUnicodeTransliterator(
        const HexToUnicodeTransliterator& o)
    : Transliterator(o)
{
}
/**
 * Assignment operator.  Delegates entirely to the Transliterator base,
 * since this class declares no data members.
 * @return a reference to this object
 */
HexToUnicodeTransliterator&
HexToUnicodeTransliterator::operator=(const HexToUnicodeTransliterator& o)
{
    Transliterator::operator=(o);
    return *this;
}
/**
 * Transliterator API.  Returns a heap-allocated copy of this object,
 * produced with the copy constructor.
 */
Transliterator* HexToUnicodeTransliterator::clone() const
{
    HexToUnicodeTransliterator* copy = new HexToUnicodeTransliterator(*this);
    return copy;
}
/**
 * Transliterates a segment of a string.  <code>Transliterator</code> API.
 * Implemented by running the keyboard-transliteration routine over the
 * whole segment and reporting the limit it leaves behind.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @return the new limit index
 */
int32_t HexToUnicodeTransliterator::transliterate(Replaceable& text,
                                                  int32_t start, int32_t limit) const
{
    // Slots hold start, limit and an initial cursor equal to start, in
    // that order.
    int32_t positions[3] = { start, limit, start };
    handleKeyboardTransliterate(text, positions);
    const int32_t newLimit = positions[LIMIT];
    return newLimit;
}
/**
* Implements {@link Transliterator#handleKeyboardTransliterate}.
* Scans the text from offsets[CURSOR] up to offsets[LIMIT], replacing
* each six-character escape (two-character prefix plus four hex digits)
* by the single character it denotes. On return offsets[LIMIT] has
* shrunk by five for every replacement and offsets[CURSOR] is where
* the scan stopped.
*/
void HexToUnicodeTransliterator::handleKeyboardTransliterate(Replaceable& text,
int32_t offsets[3]) const {
/**
* Performs transliteration changing Unicode hexadecimal
* escapes to characters. For example, "U+0040" -> '@'. A fixed
* set of prefixes is recognized: "&#92;u", "&#92;U", "u+", "U+".
*/
int32_t cursor = offsets[CURSOR];
int32_t limit = offsets[LIMIT];
// Last cursor position at which a complete six-character escape still fits.
int32_t maxCursor = limit - 6;
while (cursor <= maxCursor) {
// The candidate window is cursor..cursor+5. It is checked from the
// back so that a mismatch tells us how far the cursor can safely jump.
UChar c = filteredCharAt(text, cursor + 5);
int32_t digit0 = UnicodeSet::digit(c, 16);
if (digit0 < 0) {
// The last window character is not a hex digit. Jump to the nearest
// position at which an escape could still begin: at this character if
// it is a backslash, one before it if it could be the second prefix
// character, otherwise just past it.
if (c == '\\') {
cursor += 5;
} else if (c == 'U' || c == 'u' || c == '+') {
cursor += 4;
} else {
cursor += 6;
}
continue;
}
// u accumulates the 16-bit value; digit0 is the least significant nibble.
int32_t u = digit0;
bool_t toTop = FALSE;
// Examine the remaining three digits, right to left.
for (int32_t i=4; i>=2; --i) {
c = filteredCharAt(text, cursor + i);
int32_t digit = UnicodeSet::digit(c, 16);
if (digit < 0) {
// c is followed by a hex digit, so it can only be the second
// character of a prefix; realign so the prefix would start at cursor.
// Any other character rules out every start up to cursor+5.
if (c == 'U' || c == 'u' || c == '+') {
cursor += i-1;
} else {
cursor += 6;
}
toTop = TRUE; // This is a little awkward -- it was a "continue loop:"
break; // statement in Java, where loop marked the while().
} else {
// Position cursor+i carries nibble number (5-i), counting from the right.
u |= digit << (4 * (5-i));
}
}
if (toTop) {
continue;
}
// Four hex digits found; now verify the two-character prefix.
c = filteredCharAt(text, cursor);
UChar d = filteredCharAt(text, cursor + 1);
if (((c == 'U' || c == 'u') && d == '+')
|| (c == '\\' && (d == 'U' || d == 'u'))) {
// At this point, we have a match; replace cursor..cursor+5
// with u.
text.handleReplaceBetween(cursor, cursor+6, UnicodeString((UChar)u));
// Six characters became one, so everything after moves left by five.
limit -= 5;
maxCursor -= 5;
++cursor;
} else {
// Digits without a valid prefix: none of these six positions can
// start an escape, because the last four are hex digits.
cursor += 6;
}
}
offsets[LIMIT] = limit;
offsets[CURSOR] = cursor;
}
/**
 * Reads the character at index <tt>i</tt> of <tt>text</tt>, hiding it
 * from the caller when this transliterator's filter rejects it.
 * @param text the text to read from
 * @param i    index of the character to read
 * @return the character itself when there is no filter or the filter
 * accepts it; otherwise 0xFFFF
 */
UChar HexToUnicodeTransliterator::filteredCharAt(Replaceable& text, int32_t i) const
{
    const UnicodeFilter* activeFilter = getFilter();
    const UChar ch = text.charAt(i);
    if (activeFilter == 0 || activeFilter->isIn(ch)) {
        return ch;
    }
    return (UChar)0xFFFF;
}
/**
 * Return the length of the longest context required by this transliterator.
 * This is <em>preceding</em> context.  This implementation always
 * reports zero.
 * @return maximum number of preceding context characters this
 * transliterator needs to examine
 */
int32_t HexToUnicodeTransliterator::getMaximumContextLength() const
{
    const int32_t precedingContext = 0;
    return precedingContext;
}

View file

@ -0,0 +1,95 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef HEXTOUNI_H
#define HEXTOUNI_H
#include "translit.h"
/**
* A transliterator that converts from hexadecimal Unicode
* escape sequences to the characters they represent. For example,
* "U+0040" becomes '@'. It recognizes the
* prefixes "U+", "u+", "&#92;U", and "&#92;u", each followed by exactly
* four hex digits. Hex values may be upper- or lowercase.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: hextouni.h,v $ $Revision: 1.1 $ $Date: 1999/11/20 00:36:43 $
*/
class U_I18N_API HexToUnicodeTransliterator : public Transliterator {
/**
* ID for this transliterator.
*/
static const char* _ID;
public:
/**
* Constructs a transliterator.
* @param adoptedFilter filter passed to the <tt>Transliterator</tt>
* base; <tt>null</tt> for none
*/
HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter = 0);
/**
* Destructor.
*/
virtual ~HexToUnicodeTransliterator();
/**
* Copy constructor.
*/
HexToUnicodeTransliterator(const HexToUnicodeTransliterator&);
/**
* Assignment operator.
*/
HexToUnicodeTransliterator& operator=(const HexToUnicodeTransliterator&);
/**
* Transliterator API. Returns a newly allocated copy of this object.
*/
Transliterator* clone() const;
/**
* Transliterates a segment of a string. <code>Transliterator</code> API.
* @param text the string to be transliterated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @return the new limit index
*/
virtual int32_t transliterate(Replaceable &text,
int32_t start, int32_t limit) const;
/**
* Implements {@link Transliterator#handleKeyboardTransliterate}.
* @param text the text being edited
* @param offsets positions indexed by CURSOR and LIMIT; both are
* updated in place
*/
virtual void handleKeyboardTransliterate(Replaceable& text,
int32_t offsets[3]) const;
/**
* Return the length of the longest context required by this transliterator.
* This is <em>preceding</em> context.
* @return maximum number of preceding context characters this
* transliterator needs to examine
*/
virtual int32_t getMaximumContextLength() const;
private:
/**
* Returns the character at index <tt>i</tt> of <tt>text</tt>, or
* 0xFFFF when this transliterator has a filter that rejects it.
*/
UChar filteredCharAt(Replaceable& text, int32_t i) const;
};
// Nothing to release: the class declares no non-static data members.
inline HexToUnicodeTransliterator::~HexToUnicodeTransliterator() {}
#endif

View file

@ -69,7 +69,7 @@ LINK32=link.exe
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /YX /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /YX /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\include" /I "..\..\source\common" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "I18N_EXPORTS" /D "U_I18N_IMPLEMENTATION" /FR /YX /FD /GZ /c
# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
# ADD BASE RSC /l 0x409 /d "_DEBUG"
@ -124,6 +124,10 @@ SOURCE=.\colrules.cpp
# End Source File
# Begin Source File
SOURCE=.\cpdtrans.cpp
# End Source File
# Begin Source File
SOURCE=.\datefmt.cpp
# End Source File
# Begin Source File
@ -153,6 +157,10 @@ SOURCE=.\gregocal.cpp
# End Source File
# Begin Source File
SOURCE=.\hextouni.cpp
# End Source File
# Begin Source File
SOURCE=.\lnbkdat.cpp
# End Source File
# Begin Source File
@ -173,6 +181,26 @@ SOURCE=.\ptnentry.cpp
# End Source File
# Begin Source File
SOURCE=.\rbt.cpp
# End Source File
# Begin Source File
SOURCE=.\rbt_data.cpp
# End Source File
# Begin Source File
SOURCE=.\rbt_pars.cpp
# End Source File
# Begin Source File
SOURCE=.\rbt_rule.cpp
# End Source File
# Begin Source File
SOURCE=.\rbt_set.cpp
# End Source File
# Begin Source File
SOURCE=.\simpletz.cpp
# End Source File
# Begin Source File
@ -209,6 +237,10 @@ SOURCE=.\timezone.cpp
# End Source File
# Begin Source File
SOURCE=.\translit.cpp
# End Source File
# Begin Source File
SOURCE=.\txtbdat.cpp
# End Source File
# Begin Source File
@ -241,10 +273,22 @@ SOURCE=.\unicdcm.cpp
# End Source File
# Begin Source File
SOURCE=.\unifltlg.cpp
# End Source File
# Begin Source File
SOURCE=.\unirange.cpp
# End Source File
# Begin Source File
SOURCE=.\uniset.cpp
# End Source File
# Begin Source File
SOURCE=.\unitohex.cpp
# End Source File
# Begin Source File
SOURCE=.\unum.cpp
# End Source File
# Begin Source File
@ -404,6 +448,33 @@ SOURCE=.\colrules.h
# End Source File
# Begin Source File
SOURCE=.\cpdtrans.h
!IF "$(CFG)" == "i18n - Win32 Release"
# Begin Custom Build
InputPath=.\cpdtrans.h
"..\..\include\cpdtrans.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy cpdtrans.h ..\..\include
# End Custom Build
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\cpdtrans.h
"..\..\include\cpdtrans.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy cpdtrans.h ..\..\include
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\datefmt.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -620,6 +691,33 @@ InputPath=.\gregocal.h
# End Source File
# Begin Source File
SOURCE=.\hextouni.h
!IF "$(CFG)" == "i18n - Win32 Release"
# Begin Custom Build
InputPath=.\hextouni.h
"..\..\include\hextouni.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy hextouni.h ..\..\include
# End Custom Build
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\hextouni.h
"..\..\include\hextouni.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy hextouni.h ..\..\include
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\mergecol.h
# End Source File
# Begin Source File
@ -709,6 +807,57 @@ SOURCE=.\ptnentry.h
# End Source File
# Begin Source File
SOURCE=.\rbbi.h
# End Source File
# Begin Source File
SOURCE=.\rbbi_bld.h
# End Source File
# Begin Source File
SOURCE=.\rbt.h
!IF "$(CFG)" == "i18n - Win32 Release"
# Begin Custom Build
InputPath=.\rbt.h
"..\..\include\rbt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy rbt.h ..\..\include
# End Custom Build
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\rbt.h
"..\..\include\rbt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy rbt.h ..\..\include
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\rbt_data.h
# End Source File
# Begin Source File
SOURCE=.\rbt_pars.h
# End Source File
# Begin Source File
SOURCE=.\rbt_rule.h
# End Source File
# Begin Source File
SOURCE=.\rbt_set.h
# End Source File
# Begin Source File
SOURCE=.\simpletz.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -860,6 +1009,33 @@ InputPath=.\timezone.h
# End Source File
# Begin Source File
SOURCE=.\translit.h
!IF "$(CFG)" == "i18n - Win32 Release"
# Begin Custom Build
InputPath=.\translit.h
"..\..\include\translit.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy translit.h ..\..\include
# End Custom Build
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\translit.h
"..\..\include\translit.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy translit.h ..\..\include
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\txtbdat.h
# End Source File
# Begin Source File
@ -1007,6 +1183,64 @@ SOURCE=.\unicdcm.h
# End Source File
# Begin Source File
SOURCE=.\unifilt.h
!IF "$(CFG)" == "i18n - Win32 Release"
# Begin Custom Build
InputPath=.\unifilt.h
"..\..\include\unifilt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unifilt.h ..\..\include
# End Custom Build
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\unifilt.h
"..\..\include\unifilt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unifilt.h ..\..\include
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\unifltlg.h
!IF "$(CFG)" == "i18n - Win32 Release"
# Begin Custom Build
InputPath=.\unifltlg.h
"..\..\include\unifltlg.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unifltlg.h ..\..\include
# End Custom Build
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\unifltlg.h
"..\..\include\unifltlg.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unifltlg.h ..\..\include
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\unirange.h
# End Source File
# Begin Source File
SOURCE=.\uniset.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -1034,6 +1268,33 @@ InputPath=.\uniset.h
# End Source File
# Begin Source File
SOURCE=.\unitohex.h
!IF "$(CFG)" == "i18n - Win32 Release"
# Begin Custom Build
InputPath=.\unitohex.h
"..\..\include\unitohex.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unitohex.h ..\..\include
# End Custom Build
!ELSEIF "$(CFG)" == "i18n - Win32 Debug"
# Begin Custom Build
InputPath=.\unitohex.h
"..\..\include\unitohex.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy unitohex.h ..\..\include
# End Custom Build
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\unum.h
!IF "$(CFG)" == "i18n - Win32 Release"

227
icu4c/source/i18n/rbt.cpp Normal file
View file

@ -0,0 +1,227 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt.h"
#include "rbt_pars.h"
#include "rbt_data.h"
#include "rbt_rule.h"
#include "rep.h"
void RuleBasedTransliterator::_construct(const UnicodeString& rules,
Direction direction,
UErrorCode& status) {
data = 0;
isDataOwned = TRUE;
if (U_SUCCESS(status)) {
data = TransliterationRuleParser::parse(rules, direction);
if (data == 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
}
/**
 * Constructs a transliterator around an existing, pre-parsed rule data
 * object.  The data object is recorded as NOT owned by this instance.
 * @param ID identifier passed to the Transliterator base class
 * @param theData the parsed rules to use
 * @param adoptedFilter filter passed to the Transliterator base class
 */
RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& ID,
                                                 const TransliterationRuleData* theData,
                                                 UnicodeFilter* adoptedFilter) :
    Transliterator(ID, adoptedFilter),
    data(0), isDataOwned(FALSE) {
    data = theData;
}
/**
 * Copy constructor.  Since the data object is immutable, we can share
 * it with other objects -- no need to clone it.
 *
 * The copy only borrows the data pointer, so its ownership flag is
 * explicitly set to FALSE; previously the flag was left uninitialized.
 */
RuleBasedTransliterator::RuleBasedTransliterator(
        const RuleBasedTransliterator& other) :
    Transliterator(other), data(other.data), isDataOwned(FALSE) {}
/**
* Destructor. Releases nothing: the data object is never deleted here,
* regardless of the ownership flag.
* NOTE(review): _construct() marks the data it obtains from the parser
* as owned, yet nothing in this file frees it -- this looks like a leak
* unless the data is released elsewhere. Deleting it here is only safe
* once copies stop sharing the same pointer; confirm before changing.
*/
RuleBasedTransliterator::~RuleBasedTransliterator() {}
/**
 * Implements the Transliterator API: returns a heap-allocated copy of
 * this object made with the copy constructor.
 */
Transliterator* // Covariant return NOT ALLOWED (for portability)
RuleBasedTransliterator::clone() const {
    Transliterator* copy = new RuleBasedTransliterator(*this);
    return copy;
}
/**
* Transliterates a segment of a string. <code>Transliterator</code> API.
* The source string is not modified; output is accumulated in
* <code>result</code>.
* @param text the string to be transliterated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param result buffer to receive the transliterated text; previous
* contents are discarded
*/
void RuleBasedTransliterator::transliterate(const UnicodeString& text,
int32_t start, int32_t limit,
UnicodeString& result) const {
/* In the following loop there is a virtual buffer consisting of the
* text transliterated so far followed by the untransliterated text. There is
* also a cursor, which may be in the already transliterated buffer or just
* before the untransliterated text.
*
* Example: rules 1. ab>x|y
* 2. yc>z
*
* []|eabcd start - no match, copy e to translated buffer
* [e]|abcd match rule 1 - copy output & adjust cursor
* [ex|y]cd match rule 2 - copy output & adjust cursor
* [exz]|d no match, copy d to transliterated buffer
* [exzd]| done
*
* cursor: an index into the virtual buffer, 0..result.length().
* When it equals result.length() it sits just before the unprocessed
* source text. Matches take place at the cursor. If there is no match,
* the cursor is advanced, and (when the cursor is at the end of result)
* one character is moved from the source text to the result buffer.
*
* start, limit: these designate the substring of the source text which
* has not been processed yet. The range of offsets is start..limit-1.
* At any moment the virtual buffer consists of result +
* text.substring(start, limit).
*/
int32_t cursor = 0;
result.remove();
// Continue while source text remains or the cursor is still inside result.
while (start < limit || cursor < result.length()) {
TransliterationRule* r = data->ruleSet.findMatch(text, start, limit,
result,
cursor,
*data,
getFilter());
if (r == 0) {
// No rule matches here. If the cursor is at the end of result,
// pull the next source character across before stepping over it.
if (cursor == result.length()) {
result.append(text.charAt(start++));
}
++cursor;
} else {
// At this point we have a match of one or more
// characters. The characters cover the range [cursor,
// cursor + r->getKeyLength()) - a half-open interval.
// The index values refer to a virtual buffer with result
// holding [0, result.length()) and text holding
// [result.length(),...).
// First, figure out the range of result being replaced.
int32_t rfirst = cursor;
int32_t rlimit = icu_min(result.length(),
cursor + r->getKeyLength());
// resultPad is length of result to right of cursor; >= 0
int32_t resultPad = result.length() - cursor;
// Any part of the key that extends past the end of result was
// matched against source text; consume that much of the source.
if (r->getKeyLength() > resultPad) {
start += r->getKeyLength() - resultPad;
}
result.replaceBetween(rfirst, rlimit,
r->getOutput());
// Reposition the cursor as the rule specifies.
cursor += r->getCursorPos();
}
}
}
/**
 * Transliterates a segment of a string in place.
 * <code>Transliterator</code> API.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @return The new limit index
 */
int32_t RuleBasedTransliterator::transliterate(Replaceable& text,
                                               int32_t start,
                                               int32_t limit) const {
    /* With a Replaceable there is only one buffer, so a single position
     * walks from start towards limit and replacements happen beneath
     * it.  start stays fixed; limit is adjusted whenever a replacement
     * changes the length of the text.
     *
     * Example: rules 1. ab>x|y
     *                2. yc>z
     *
     * |eabcd   start - no match, advance
     * e|abcd   match rule 1 - change text & adjust position
     * ex|ycd   match rule 2 - change text & adjust position
     * exz|d    no match, advance
     * exzd|    done
     */
    int32_t pos = start;
    while (pos < limit) {
        TransliterationRule* match =
            data->ruleSet.findMatch(text, start, limit,
                                    pos, *data,
                                    getFilter());
        if (match == 0) {
            // Nothing applies here; step over one character.
            ++pos;
            continue;
        }

        // Swap the matched key for the rule output, then account for
        // the change in length and move to where the rule says.
        text.handleReplaceBetween(pos, pos + match->getKeyLength(),
                                  match->getOutput());
        limit += match->getOutput().length() - match->getKeyLength();
        pos += match->getCursorPos();
    }
    return limit;
}
/**
 * Implements {@link Transliterator#handleKeyboardTransliterate}.
 * Applies rules incrementally between index[CURSOR] and index[LIMIT],
 * stopping early when only a partial match is available.  On return
 * index[LIMIT] and index[CURSOR] hold the updated positions;
 * index[START] is not modified.
 */
void
RuleBasedTransliterator::handleKeyboardTransliterate(Replaceable& text,
                                                     int32_t index[3]) const {
    int32_t begin = index[START];
    int32_t end = index[LIMIT];
    int32_t pos = index[CURSOR];
    bool_t partial;

    while (pos < end) {
        TransliterationRule* match = data->ruleSet.findIncrementalMatch(
            text, begin, end, pos,
            *data, partial,
            getFilter());

        if (match != 0) {
            // Full match: replace the key with the rule output and
            // reposition as the rule directs.
            text.handleReplaceBetween(pos, pos + match->getKeyLength(),
                                      match->getOutput());
            end += match->getOutput().length() - match->getKeyLength();
            pos += match->getCursorPos();
            continue;
        }

        // No full match.  A partial match means more text is needed
        // before anything can be decided, so stop where we are.
        if (partial) {
            break;
        }

        // No match of any kind at this position; move on.
        ++pos;
    }

    index[LIMIT] = end;
    index[CURSOR] = pos;
}
/**
 * Reports how much <em>preceding</em> context this transliterator may
 * need to look at; the value comes straight from the rule set.
 * @return Maximum number of preceding context characters this
 * transliterator needs to examine
 */
int32_t RuleBasedTransliterator::getMaximumContextLength() const {
    int32_t maxContext = data->ruleSet.getMaximumContextLength();
    return maxContext;
}

377
icu4c/source/i18n/rbt.h Normal file
View file

@ -0,0 +1,377 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef RBT_H
#define RBT_H
#include "translit.h"
#include "uhash.h"
#include "utypes.h"
class TransliterationRuleData;
/**
* A transliterator that reads a set of rules in order to determine how to
* perform translations. Rules are stored in resource bundles indexed by name.
* Rules are separated by newline characters ('\n'); to include a literal
* newline, prefix it with a backslash ('\\\n'). Whitespace is significant. If
* the first character on a line is '#', the entire line is ignored as a
* comment.
*
* <p>Each set of rules consists of two groups, one forward, and one reverse.
* This is a convention that is not enforced; rules for one direction may be
* omitted, with the result that translations in that direction will not modify
* the source text.
*
* <p><b>Rule syntax</b>
*
* <p>Rule statements take one of the following forms:
* <dl>
* <dt><code>alefmadda=&#092;u0622</code></dt>
*
* <dd><strong>Variable definition.</strong> The name on the left is
* assigned the character or expression on the right. Names may not
* contain any special characters (see list below). Duplicate names
* (including duplicates of simple variables or category names)
* cause an exception to be thrown. If the right hand side consists
* of one character, then the variable stands for that character.
* In this example, after this statement, instances of the left hand
* name surrounded by braces, &quot;<code>{alefmadda}</code>&quot;,
* will be replaced by the Unicode character U+0622. If the
* right hand side is longer than one character, then it is
* interpreted as a character category expression; see below for
* details.</dd>
*
* <dt><code>softvowel=[eiyEIY]</code></dt>
*
* <dd><strong>Category definition.</strong> The name on the left is assigned
* to stand for a set of characters. The same rules for names of simple
* variables apply. After this statement, the left hand variable will be
* interpreted as indicating a set of characters in appropriate contexts. The
* pattern syntax defining sets of characters is defined by {@link UnicodeSet}.
* Examples of valid patterns are:<table>
*
* <tr valign=top>
* <td nowrap><code>[abc]</code></td>
* <td>The set containing the characters 'a', 'b', and 'c'.</td>
* </tr>
* <tr valign=top>
* <td nowrap><code>[^abc]</code></td>
* <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td>
* </tr>
* <tr valign=top>
* <td nowrap><code>[A-Z]</code></td>
* <td>The set of all characters from 'A' to 'Z' in Unicode order.</td>
* </tr>
* <tr valign=top>
* <td nowrap><code>[:Lu:]</code></td>
* <td>The set of Unicode uppercase letters. See
* <a href="http://www.unicode.org">www.unicode.org</a>
* for a complete list of categories and their two-letter codes.</td>
* </tr>
* <tr valign=top>
* <td nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
* <td>The set of all characters <em>except</em> 'a' through 'z' and
* uppercase or lowercase letters.</td>
* </tr>
* </table>
*
* See {@link UnicodeSet} for more documentation and examples.
* </dd>
*
* <dt><code>ai&gt;{alefmadda}</code></dt>
*
* <dd><strong>Forward translation rule.</strong> This rule states that the
* string on the left will be changed to the string on the right when
* performing forward transliteration.</dd>
*
* <dt><code>ai&lt;{alefmadda}</code></dt>
*
* <dd><strong>Reverse translation rule.</strong> This rule states that the
* string on the right will be changed to the string on the left when
* performing reverse transliteration.</dd>
*
* </dl>
*
* <p>Forward and reverse translation rules consist of a <em>match
* pattern</em> and an <em>output string</em>. The match pattern consists
* of literal characters, optionally preceded by context, and optionally
* followed by context. Context characters, like literal pattern characters,
* must be matched in the text being transliterated. However, unlike literal
* pattern characters, they are not replaced by the output text. For example,
* the pattern "<code>[abc]def</code>" indicates the characters
* "<code>def</code>" must be preceded by "<code>abc</code>" for a successful
* match. If there is a successful match, "<code>def</code>" will be replaced,
* but not "<code>abc</code>". The initial '<code>[</code>' is optional, so
* "<code>abc]def</code>" is equivalent to "<code>[abc]def</code>". Another
* example is "<code>123[456]</code>" (or "<code>123[456</code>") in which the
* literal pattern "<code>123</code>" must be followed by "<code>456</code>".
*
* <p>The output string of a forward or reverse rule consists of characters to
* replace the literal pattern characters. If the output string contains the
* character '<code>|</code>', this is taken to indicate the location of the
* <em>cursor</em> after replacement. The cursor is the point in the text
* at which the next replacement, if any, will be applied.
*
* <p><b>Example</b>
*
* <p>The following example rules illustrate many of the features of the rule
* language.
* <table cellpadding="4">
* <tr valign=top><td>Rule 1.</td>
* <td nowrap><code>abc]def&gt;x|y</code></td></tr>
* <tr valign=top><td>Rule 2.</td>
* <td nowrap><code>xyz&gt;r</code></td></tr>
* <tr valign=top><td>Rule 3.</td>
* <td nowrap><code>yz&gt;q</code></td></tr>
* </table>
*
* <p>Applying these rules to the string "<code>adefabcdefz</code>" yields the
* following results:
*
* <table cellpadding="4">
* <tr valign=top><td nowrap><code>|adefabcdefz</code></td>
* <td>Initial state, no rules match. Advance cursor.</td></tr>
* <tr valign=top><td nowrap><code>a|defabcdefz</code></td>
* <td>Still no match. Rule 1 does not match because the preceding
* context is not present.</td></tr>
* <tr valign=top><td nowrap><code>ad|efabcdefz</code></td>
* <td>Still no match. Keep advancing until there is a match...</td></tr>
* <tr valign=top><td nowrap><code>ade|fabcdefz</code></td>
* <td>...</td></tr>
* <tr valign=top><td nowrap><code>adef|abcdefz</code></td>
* <td>...</td></tr>
* <tr valign=top><td nowrap><code>adefa|bcdefz</code></td>
* <td>...</td></tr>
* <tr valign=top><td nowrap><code>adefab|cdefz</code></td>
* <td>...</td></tr>
* <tr valign=top><td nowrap><code>adefabc|defz</code></td>
* <td>Rule 1 matches; replace "<code>def</code>" with "<code>xy</code>"
* and back up the cursor to before the '<code>y</code>'.</td></tr>
* <tr valign=top><td nowrap><code>adefabcx|yz</code></td>
* <td>Although "<code>xyz</code>" is present, rule 2 does not match
* because the cursor is before the '<code>y</code>', not before the
* '<code>x</code>'. Rule 3 does match. Replace "<code>yz</code>" with
* "<code>q</code>".</td></tr>
* <tr valign=top><td nowrap><code>adefabcxq|</code></td>
* <td>The cursor is at the end; transliteration is complete.</td></tr>
* </table>
*
* <p>The order of rules is significant. If multiple rules may match at some
* point, the first matching rule is applied.
*
* <p>Forward and reverse rules may have an empty output string. Otherwise, an
* empty left or right hand side of any statement is a syntax error.
*
* <p>Single quotes are used to quote the special characters
* <code>=&gt;&lt;{}[]|</code>. To specify a single quote itself, inside or
* outside of quotes, use two single quotes in a row. For example, the rule
* "<code>'&gt;'&gt;o''clock</code>" changes the string "<code>&gt;</code>" to
* the string "<code>o'clock</code>".
*
* <p><b>Notes</b>
*
* <p>While a RuleBasedTransliterator is being built, it checks that the rules
* are added in proper order. For example, if the rule "a>x" is followed by the
* rule "ab>y", then the second rule will throw an exception. The reason is
* that the second rule can never be triggered, since the first rule always
* matches anything it matches. In other words, the first rule <em>masks</em>
* the second rule. There is a cost of O(n^2) to make this check; in real-world
* tests it appears to approximately double build time.
*
* <p>One optimization that can be made is to add a pragma to the rule language,
* "#pragma order", that turns off ordering checking. This pragma can then be
* added to all of our resource-based rules (after we build these once and
* determine that there are no ordering errors). I haven't made this change yet
* in the interests of keeping the code from getting too byzantine.
*
* @author Alan Liu
*/
class U_I18N_API RuleBasedTransliterator : public Transliterator {

    /**
     * The data object is immutable, so we can freely share it with
     * other instances of RBT, as long as we do NOT own this object.
     */
    TransliterationRuleData* data;

    /**
     * If true, we own the data object and must delete it.
     * (Named isDataOwned to match its uses in the implementation file.)
     */
    bool_t isDataOwned;

public:

    /**
     * Direction constant passed to constructor to specify whether forward
     * or reverse rules are parsed.  The other rules are ignored.
     */
    enum Direction {
        /**
         * Direction constant passed to constructor to create a transliterator
         * using the forward rules.
         */
        FORWARD,

        /**
         * Direction constant passed to constructor to create a transliterator
         * using the reverse rules.
         */
        REVERSE
    };

    /**
     * Constructs a new transliterator from the given rules.
     * @param ID identifier passed to the Transliterator base class
     * @param rules rules, separated by '\n'
     * @param direction either FORWARD or REVERSE.
     * @param adoptedFilter filter passed to the Transliterator base class
     * @param status in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR
     * if the rules cannot be parsed.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            Direction direction,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status);

    /**
     * Convenience constructor with no filter.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            Direction direction,
                            UErrorCode& status);

    /**
     * Convenience constructor with no filter and FORWARD direction.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            UErrorCode& status);

    /**
     * Convenience constructor with FORWARD direction.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const UnicodeString& rules,
                            UnicodeFilter* adoptedFilter,
                            UErrorCode& status);

    /**
     * Constructs a transliterator that uses an existing, pre-parsed
     * rule data object.  The data object is not owned by the new
     * transliterator.
     */
    RuleBasedTransliterator(const UnicodeString& ID,
                            const TransliterationRuleData* theData,
                            UnicodeFilter* adoptedFilter = 0);

    /**
     * Copy constructor.  The copy shares the source object's data.
     */
    RuleBasedTransliterator(const RuleBasedTransliterator&);

    /**
     * Destructor.
     */
    virtual ~RuleBasedTransliterator();

    /**
     * Implement Transliterator API.
     */
    Transliterator* clone() const;

    /**
     * Transliterates a segment of a string.  <code>Transliterator</code> API.
     * @param text the string to be transliterated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param result buffer to receive the transliterated text; previous
     * contents are discarded
     */
    virtual void transliterate(const UnicodeString& text,
                               int32_t start, int32_t limit,
                               UnicodeString& result) const;

    /**
     * Transliterates a segment of a string.  <code>Transliterator</code> API.
     * @param text the string to be transliterated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @return The new limit index
     */
    virtual int32_t transliterate(Replaceable& text,
                                  int32_t start, int32_t limit) const;

    /**
     * Implements {@link Transliterator#handleKeyboardTransliterate}.
     */
    virtual void handleKeyboardTransliterate(Replaceable& text,
                                             int32_t index[3]) const;

    /**
     * Returns the length of the longest context required by this transliterator.
     * This is <em>preceding</em> context.
     * @return Maximum number of preceding context characters this
     * transliterator needs to examine
     */
    virtual int32_t getMaximumContextLength() const;

private:

    /**
     * Shared body of the rule-string constructors: parses the rules
     * and stores the resulting data object.
     */
    void _construct(const UnicodeString& rules,
                    Direction direction,
                    UErrorCode& status);
};
/**
* Constructs a new transliterator from the given rules.
* @param ID identifier passed to the Transliterator base class
* @param rules rules, separated by '\n'
* @param direction either FORWARD or REVERSE.
* @param adoptedFilter filter passed to the Transliterator base class
* @param status in/out error code, handed on to _construct(), which
* reports malformed rules through it
*/
inline RuleBasedTransliterator::RuleBasedTransliterator(
const UnicodeString& ID,
const UnicodeString& rules,
Direction direction,
UnicodeFilter* adoptedFilter,
UErrorCode& status) :
Transliterator(ID, adoptedFilter) {
_construct(rules, direction, status);
}
/**
* Convenience constructor with no filter: passes 0 as the filter to
* the Transliterator base class.
*/
inline RuleBasedTransliterator::RuleBasedTransliterator(
const UnicodeString& ID,
const UnicodeString& rules,
Direction direction,
UErrorCode& status) :
Transliterator(ID, 0) {
_construct(rules, direction, status);
}
/**
* Convenience constructor with no filter and FORWARD direction.
*/
inline RuleBasedTransliterator::RuleBasedTransliterator(
const UnicodeString& ID,
const UnicodeString& rules,
UErrorCode& status) :
Transliterator(ID, 0) {
_construct(rules, FORWARD, status);
}
/**
* Convenience constructor with FORWARD direction.
*/
inline RuleBasedTransliterator::RuleBasedTransliterator(
const UnicodeString& ID,
const UnicodeString& rules,
UnicodeFilter* adoptedFilter,
UErrorCode& status) :
Transliterator(ID, adoptedFilter) {
_construct(rules, FORWARD, status);
}
#endif

View file

@ -0,0 +1,83 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_data.h"
#include "uhash.h"
#include "unistr.h"
/**
 * Creates an empty rule data object and opens its two hash tables.
 * If status already indicates failure on entry, both table pointers
 * are left null.
 * @param status in/out error code, also passed to uhash_open
 */
TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
    variableNames(0), setVariables(0) {
    if (U_SUCCESS(status)) {
        variableNames = uhash_open(uhash_hashUString, &status);
        setVariables = uhash_open(0, &status);
    }
}
/**
 * Closes whichever of the two hash tables were successfully opened.
 */
TransliterationRuleData::~TransliterationRuleData() {
    if (variableNames) {
        uhash_close(variableNames);
    }
    if (setVariables) {
        uhash_close(setVariables);
    }
}
void
TransliterationRuleData::defineVariable(const UnicodeString& name,
UChar value,
UErrorCode& status) {
uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF,
(void*) value,
&status);
}
void
TransliterationRuleData::defineVariable(const UnicodeString& name,
UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
if (adoptedSet == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
uhash_putKey(variableNames, name.hashCode() & 0x7FFFFFFF,
(void*) standIn,
&status);
uhash_putKey(setVariables, (int32_t) (standIn & 0x7FFFFFFF),
adoptedSet,
&status);
}
/**
 * Looks up the character stored for a variable name.
 * @param name the variable name
 * @param status in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR if
 * no entry is found.  If it already indicates failure, no lookup is
 * performed.
 * @return the stored character, or 0 on failure
 */
UChar
TransliterationRuleData::lookupVariable(const UnicodeString& name,
                                        UErrorCode& status) const {
    if (U_SUCCESS(status)) {
        void* stored = uhash_get(variableNames, name.hashCode() & 0x7FFFFFFF);
        if (stored != 0) {
            return (UChar) (int32_t) stored;
        }
        // A null entry is treated as "not defined".
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
    return 0;
}
/**
 * Returns the set registered for the given stand-in character, as
 * found in setVariables (null if the table has no such entry).
 */
UnicodeSet*
TransliterationRuleData::lookupSet(UChar standIn) const {
    return (UnicodeSet*) uhash_get(setVariables,
                                   (int32_t) (standIn & 0x7FFFFFFF));
}
/**
 * Reports whether variableNames holds a non-null entry for the given
 * name (looked up by the name's masked hash code).
 */
bool_t
TransliterationRuleData::isVariableDefined(const UnicodeString& name) const {
    void* entry = uhash_get(variableNames, name.hashCode() & 0x7FFFFFFF);
    return entry != 0;
}

View file

@ -0,0 +1,85 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef RBT_DATA_H
#define RBT_DATA_H
#include "rbt_set.h"
class UnicodeString;
class UnicodeSet;
struct UHashtable;
/**
* The rule data for a RuleBasedTransliterator. RBT objects hold
* a const pointer to a TRD object that they do not own. TRD objects
* are essentially the parsed rules in compact, usable form. The
* TRD objects themselves are held for the life of the process in
* a static cache owned by Transliterator.
*/
class TransliterationRuleData {
public:
/**
* Rule table. May be empty.
*
* PUBLIC DATA MEMBER for internal use by RBT
*/
TransliterationRuleSet ruleSet;
/**
* Map variable name (UnicodeString) to variable (UChar).
* A variable name may correspond to a single literal
* character, in which case the character is stored in this
* hash. It may also correspond to a UnicodeSet, in which
* case a character is again stored in this hash, but the
* character is a stand-in: it is a key for a secondary lookup
* in data.setVariables. The stand-in also represents the
* UnicodeSet in the stored rules.
*
* PUBLIC DATA MEMBER for internal use by RBT
*/
UHashtable* variableNames;
/**
* Map category variable (UChar) to set (UnicodeSet).
* Variables that correspond to a set of characters are mapped
* from variable name to a stand-in character in
* data.variableNames. The stand-in then serves as a key in
* this hash to lookup the actual UnicodeSet object. In
* addition, the stand-in is stored in the rule text to
* represent the set of characters.
*
* PUBLIC DATA MEMBER for internal use by RBT
*/
UHashtable* setVariables;
/**
* Constructs an empty rule data object and opens both hash tables.
* @param status in/out error code
*/
TransliterationRuleData(UErrorCode& status);
/**
* Destructor. Closes the hash tables that were opened.
*/
~TransliterationRuleData();
/**
* Defines a variable that stands for a single character.
* @param name the variable name
* @param value the character the variable stands for
* @param status in/out error code
*/
void defineVariable(const UnicodeString& name,
UChar value,
UErrorCode& status);
/**
* Defines a variable that stands for a set of characters.
* @param name the variable name
* @param standIn the character that represents the set
* @param adoptedSet the set; a null pointer is reported through
* status as a memory allocation error
* @param status in/out error code
*/
void defineVariable(const UnicodeString& name,
UChar standIn,
UnicodeSet* adoptedSet,
UErrorCode& status);
/**
* Looks up the character stored for a variable name.
* @param name the variable name
* @param status in/out error code; set to an illegal-argument error
* if the name is not found
* @return the stored character, or 0 on failure
*/
UChar lookupVariable(const UnicodeString& name,
UErrorCode& status) const;
/**
* Returns the set registered for the given stand-in character.
*/
UnicodeSet* lookupSet(UChar standIn) const;
/**
* Returns whether a variable with the given name has been defined.
*/
bool_t isVariableDefined(const UnicodeString& name) const;
};
#endif

View file

@ -0,0 +1,640 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_pars.h"
#include "rbt.h"
#include "rbt_rule.h"
#include "unirange.h"
#include "rbt_data.h"
#include "uniset.h"
// Operators: the characters that separate the two sides of a rule
// statement. OPERATORS lists all three of them in one string.
const UChar TransliterationRuleParser::VARIABLE_DEF_OP = '=';
const UChar TransliterationRuleParser::FORWARD_RULE_OP = '>';
const UChar TransliterationRuleParser::REVERSE_RULE_OP = '<';
const char* TransliterationRuleParser::OPERATORS = "=><";
// Other special characters: quoting, variable references, context
// delimiters, the cursor marker, and the comment introducer.
const UChar TransliterationRuleParser::QUOTE = '\'';
const UChar TransliterationRuleParser::VARIABLE_REF_OPEN = '{';
const UChar TransliterationRuleParser::VARIABLE_REF_CLOSE = '}';
const UChar TransliterationRuleParser::CONTEXT_OPEN = '[';
const UChar TransliterationRuleParser::CONTEXT_CLOSE = ']';
const UChar TransliterationRuleParser::CURSOR_POS = '|';
const UChar TransliterationRuleParser::RULE_COMMENT_CHAR = '#';
/**
* Specials must be quoted in rules to be used as literals.
* Specials may not occur in variable names.
*
* This string is a superset of OPERATORS: it contains every character
* defined above.
*/
const char* TransliterationRuleParser::SPECIALS = "'{}[]|#=><";
/**
* Specials that must be quoted in variable definitions: the quote
* character and the two variable-reference braces.
*/
const char* TransliterationRuleParser::DEF_SPECIALS = "'{}";
/**
 * Parse a complete rule string in one shot.
 *
 * A temporary parser is built over <code>rules</code>, run once, and then
 * discarded.  The parser class declares no destructor, so the data object it
 * built is not freed with it: on success the returned object is the only
 * reference and the caller is responsible for deleting it.
 *
 * @param rules     the rule text, one rule per line
 * @param direction which rules to keep (forward or reverse)
 * @return the parsed rule data, or 0 if parsing ended with a failure status
 */
TransliterationRuleData*
TransliterationRuleParser::parse(const UnicodeString& rules,
                                 RuleBasedTransliterator::Direction direction) {
    TransliterationRuleParser parser(rules, direction);
    parser.parseRules();

    if (U_SUCCESS(parser.status)) {
        return parser.data;
    }

    // Parsing failed: throw away whatever was built so far.
    delete parser.data;
    parser.data = 0;
    return 0;
}
/**
 * Construct a parser over the given rule text.  Nothing is parsed here and
 * no error can be raised; parseRules() does the actual work.
 *
 * Every member is given a defined value so that the object never carries
 * indeterminate state between construction and the call to parseRules().
 *
 * @param theRules     list of rules, separated by newline characters.  Only
 *                     a reference is kept, so the string must outlive the
 *                     parser.
 * @param theDirection which rules (forward or reverse) are to be kept
 */
TransliterationRuleParser::TransliterationRuleParser(
        const UnicodeString& theRules,
        RuleBasedTransliterator::Direction theDirection) :
    rules(theRules), direction(theDirection), data(0),
    status(U_ZERO_ERROR), variableNext(0), variableLimit(0) {}
/**
* Parse the given string as a sequence of rules, separated by newline
* characters ('\n'), and cause this object to implement those rules. Any
* previous rules are discarded. Typically this method is called exactly
* once, during construction.
* @exception IllegalArgumentException if there is a syntax error in the
* rules
*/
void TransliterationRuleParser::parseRules() {
status = U_ZERO_ERROR;
delete data;
data = new TransliterationRuleData(status);
if (U_FAILURE(status)) {
return;
}
determineVariableRange();
int32_t n = rules.length();
int32_t i = 0;
while (i<n && U_SUCCESS(status)) {
int32_t limit = rules.indexOf('\n', i);
// Recognize "\\\n" as an escaped "\n"
while (limit>0 && rules.charAt(limit-1) == '\\') {
limit = rules.indexOf('\n', limit+1);
}
if (limit == -1) {
limit = n;
}
// Skip over empty lines and line starting with #
if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
applyRule(i, limit);
}
i = limit + 1;
}
data->ruleSet.freeze();
}
/**
 * Parse the given substring of the rule text as one rule and apply it: a
 * variable definition is recorded in <code>data</code>; a forward or reverse
 * rule is added to the rule set if it matches the direction being parsed
 * for, and is ignored otherwise.
 *
 * Errors are reported by setting <code>status</code> to a failure code.
 *
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 */
void TransliterationRuleParser::applyRule(int32_t start, int32_t limit) {
    /* General description of parsing: Initially, rules contain two types of
     * quoted characters.  First, there are variable references, such as
     * "{alpha}".  Second, there are quotes, such as "'<'" or "''".  One of
     * the first steps in parsing a rule is to resolve such quoted matter.
     * Quotes are removed early, leaving unquoted literal matter.  Variable
     * references are resolved and replaced by single characters.  In some
     * instances these characters represent themselves; in others, they
     * stand for categories of characters.  Character categories are defined
     * by the user using a definition statement (e.g.,
     * "vowels=[aeiouAEIOU]").
     *
     * Another early step in parsing is to split each rule into component
     * pieces.  These pieces are, for every rule, a left-hand side, a right-
     * hand side, and an operator.  The left- and right-hand sides may not
     * be empty, except for the output patterns of forward and reverse
     * rules.  In addition to this partitioning, the match patterns of
     * forward and reverse rules must be partitioned into antecontext,
     * postcontext, and literal pattern, where the context portions may or
     * may not be present.  Finally, output patterns must have the cursor
     * indicator '|' detected and removed, with its position recorded.
     *
     * Quote removal, variable resolution, and sub-pattern splitting must
     * all happen at once.  This is due chiefly to the quoting mechanism,
     * which allows special characters to appear at arbitrary positions in
     * the final unquoted text.  (For this reason, alteration of the rule
     * language is somewhat clumsy; it entails reassessment and revision of
     * the parsing methods as a whole.)
     *
     * After this processing of rules is complete, the final end products
     * are unquoted pieces of text of various types, and an integer cursor
     * position, if one is specified.  These processed raw materials are now
     * easy to deal with; other classes such as UnicodeSet and
     * TransliterationRule need know nothing of quoting or variables.
     */
    UnicodeString left;
    UnicodeString right;
    UnicodeString anteContext;
    UnicodeString postContext;
    // Start from -1 ("no cursor given") so that a rule with an empty output
    // side, for which no output pattern is parsed, never hands an
    // indeterminate value to TransliterationRule.  A negative position is
    // interpreted there as "after the output".
    int32_t cursorPos = -1;

    UChar op = parseRule(start, limit, left, right,
                         anteContext, postContext, cursorPos);
    if (U_FAILURE(status)) {
        return;
    }

    // For a rule in the wanted direction, pick which side is the match key
    // and which is the output; everything else returns from the switch.
    const UnicodeString* key = 0;
    const UnicodeString* output = 0;
    switch (op) {
    case VARIABLE_DEF_OP:
        applyVariableDef(left, right);
        return;
    case FORWARD_RULE_OP:
        if (direction != RuleBasedTransliterator::FORWARD) {
            return; // ignore the rule; it's not the direction we want
        }
        key = &left;
        output = &right;
        break;
    case REVERSE_RULE_OP:
        if (direction != RuleBasedTransliterator::REVERSE) {
            return; // ignore the rule; it's not the direction we want
        }
        key = &right;
        output = &left;
        break;
    default:
        return;
    }

    TransliterationRule* rule =
        new TransliterationRule(*key, *output,
                                anteContext, postContext,
                                cursorPos, status);
    if (rule == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // NOTE(review): addRule() presumably takes ownership of the rule (no
    // other reference is kept here) -- confirm against the rule set class.
    data->ruleSet.addRule(rule, status);
}
/**
* Add a variable definition.
* @param name the name of the variable. It must not already be defined.
* @param pattern the value of the variable. It may be a single character
* or a pattern describing a character set.
* @exception IllegalArgumentException if there is a syntax error
*/
void TransliterationRuleParser::applyVariableDef(const UnicodeString& name,
const UnicodeString& pattern) {
validateVariableName(name);
if (U_FAILURE(status)) {
return;
}
if (data->isVariableDefined(name)) {
// throw new IllegalArgumentException("Duplicate variable definition: "
// + name + '=' + pattern);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
//! if (UnicodeSet.getCategoryID(name) >= 0) {
//! throw new IllegalArgumentException("Reserved variable name: "
//! + name);
//! }
if (pattern.length() < 1) {
// throw new IllegalArgumentException("Variable definition missing: "
// + name);
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (pattern.length() == 1) {
// Got a single character variable definition
//$ data->variableNames.put(name, new Character(pattern.charAt(0)));
data->defineVariable(name, pattern.charAt(0), status);
} else {
// Got more than one character; parse it as a category
if (variableNext >= variableLimit) {
//$ throw new RuntimeException("Private use variables exhausted");
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
//$ Character c = new Character(variableNext++);
//$ data->variableNames.put(name, c);
//$ data->setVariables.put(c, new UnicodeSet(pattern));
data->defineVariable(name, variableNext++,
new UnicodeSet(pattern, status),
status);
}
}
/**
 * Given a rule, parses it into three pieces: The left side, the right side,
 * and the operator.  Returns the operator.  Quotes and variable references
 * are resolved; the output text in all <code>UnicodeString</code>
 * parameters is literal text.  This method delegates to other parsing
 * methods to handle the match pattern, output pattern, and other
 * sub-patterns in the rule.
 *
 * On a syntax error <code>status</code> is set to
 * U_ILLEGAL_ARGUMENT_ERROR.  Callers must check <code>status</code>; the
 * output parameters are not meaningful after a failure.
 *
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param left left side of rule is appended to this buffer
 * with the quotes removed and variables resolved
 * @param right right side of rule is appended to this buffer
 * with the quotes removed and variables resolved
 * @param anteContext the preceding context of the match pattern,
 * if there is one, is appended to this buffer
 * @param postContext the following context of the match pattern,
 * if there is one, is appended to this buffer
 * @param cursorPos receives the offset of the cursor in the output
 * pattern, or -1 if the output pattern has no cursor or is empty
 * @return The operator character, one of the characters in OPERATORS, or 0
 * if no operator was found or a side that must not be empty is empty.
 */
UChar TransliterationRuleParser::parseRule(int32_t start, int32_t limit,
                                           UnicodeString& left,
                                           UnicodeString& right,
                                           UnicodeString& anteContext,
                                           UnicodeString& postContext,
                                           int32_t& cursorPos) {
    // Default to "no cursor" so the out-parameter is always defined, even
    // when the output side is empty and parseOutputPattern() is not called.
    cursorPos = -1;

    /* Parse the rule into three pieces -- left, operator, and right,
     * parsing out quotes.  The result is that left and right will have
     * unquoted text.  E.g., "gt<'>'" will have right = ">".  Unquoted
     * specials in a sub-pattern are an error.  Two quotes inside or outside
     * quotes indicates a quote literal.  E.g., "o''clock" -> "o'clock".
     */
    int32_t i = quotedIndexOf(rules, start, limit, OPERATORS);
    if (i < 0) {
        // Syntax error: no operator in this rule
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UChar c = rules.charAt(i);
    switch (c) {
    case FORWARD_RULE_OP:
        if (i == start) {
            // Empty left side
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        parseMatchPattern(start, i, left, anteContext, postContext);
        // The output (right) side of a forward rule may be empty
        if (i != (limit-1)) {
            parseOutputPattern(i+1, limit, right, cursorPos);
        }
        break;
    case REVERSE_RULE_OP:
        if (i == (limit-1)) {
            // Empty right side
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        // The output (left) side of a reverse rule may be empty
        if (i != start) {
            parseOutputPattern(start, i, left, cursorPos);
        }
        parseMatchPattern(i+1, limit, right, anteContext, postContext);
        break;
    default:
        // Variable definition: neither the name nor the value may be empty
        if (i == start || i == (limit-1)) {
            // Empty left or right side
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        parseSubPattern(start, i, left);
        parseDefPattern(i+1, limit, right);
        break;
    }
    return c;
}
/**
 * Split the match pattern of a forward or reverse rule into its key and its
 * optional surrounding contexts, resolving all quotes and variables.
 *
 * A leading CONTEXT_OPEN and a trailing CONTEXT_CLOSE are optional and are
 * ignored.  What remains has one of these four shapes:
 * <pre>
 *   key
 *   anteContext]key
 *   anteContext]key[postContext
 *   key[postContext
 * </pre>
 * An empty range, or an unquoted ']' that comes after an unquoted '[', sets
 * <code>status</code> to U_ILLEGAL_ARGUMENT_ERROR.
 *
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param text the key to be matched will be appended to this buffer
 * @param anteContext the preceding context, if any, will be appended
 * to this buffer.
 * @param postContext the following context, if any, will be appended
 * to this buffer.
 */
void TransliterationRuleParser::parseMatchPattern(int32_t start, int32_t limit,
                                                  UnicodeString& text,
                                                  UnicodeString& anteContext,
                                                  UnicodeString& postContext) {
    if (limit <= start) {
        // Empty expression in rule
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Drop the optional outermost context brackets.
    const int32_t innerStart =
        (rules.charAt(start) == CONTEXT_OPEN) ? start + 1 : start;
    const int32_t innerLimit =
        (rules.charAt(limit-1) == CONTEXT_CLOSE) ? limit - 1 : limit;

    // First unquoted ']' ends the ante context; first unquoted '[' starts
    // the post context.
    const int32_t closePos =
        quotedIndexOf(rules, innerStart, innerLimit, CONTEXT_CLOSE);
    const int32_t openPos =
        quotedIndexOf(rules, innerStart, innerLimit, CONTEXT_OPEN);
    const bool_t hasAnte = (closePos >= 0);
    const bool_t hasPost = (openPos >= 0);

    if (hasAnte && hasPost && closePos > openPos) {
        // Syntax error in context specifier
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t keyStart = innerStart;
    int32_t keyLimit = innerLimit;
    if (hasAnte) {
        parseSubPattern(innerStart, closePos, anteContext);
        keyStart = closePos + 1;
    }
    if (hasPost) {
        parseSubPattern(openPos + 1, innerLimit, postContext);
        keyLimit = openPos;
    }

    // Whatever lies between the two contexts is the key itself.
    parseSubPattern(keyStart, keyLimit, text);
}
/**
 * Parse an ordinary sub-pattern (a key, a context, or a variable name):
 * every character of SPECIALS must be quoted and no cursor is accepted.
 * @param start the beginning index, inclusive
 * @param limit the ending index, exclusive
 * @param text the resolved text is appended to this buffer
 */
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
                                                UnicodeString& text) {
    // A null cursor slot means an unquoted '|' is rejected like any other
    // special character.
    int32_t* const noCursor = 0;
    parseSubPattern(start, limit, text, noCursor, SPECIALS);
}
/**
 * Parse the value side of a variable definition.  The only difference from
 * an ordinary sub-pattern is the set of characters that must be quoted:
 * DEF_SPECIALS is used instead of SPECIALS, so in particular '[' and ']'
 * may appear unquoted, since these are used in UnicodeSet patterns.
 * @param start the beginning index, inclusive
 * @param limit the ending index, exclusive
 * @param text the resolved text is appended to this buffer
 */
void TransliterationRuleParser::parseDefPattern(int32_t start, int32_t limit,
                                                UnicodeString& text) {
    int32_t* const noCursor = 0; // no cursor position is recorded here
    parseSubPattern(start, limit, text, noCursor, DEF_SPECIALS);
}
/**
 * Parses the output pattern of a forward or reverse rule.  Produces the
 * output text and the position of the cursor, if any.  Resolves all quotes
 * and variables.  The text parsed is the member <code>rules</code>.
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param text the output text will be appended to this buffer
 * @param cursorPos for a non-empty range, set to the cursor position, or to
 * -1 if the pattern contains no cursor
 */
void TransliterationRuleParser::parseOutputPattern(int32_t start, int32_t limit,
                                                   UnicodeString& text,
                                                   int32_t& cursorPos) {
    // Passing a non-null slot is what allows '|' in this sub-pattern.
    int32_t* const cursorSlot = &cursorPos;
    parseSubPattern(start, limit, text, cursorSlot, SPECIALS);
}
/**
 * Parses a sub-pattern of a rule.  Produces the text and the position of
 * the cursor, if any.  Resolves all quotes and variables.  The text parsed
 * is the member <code>rules</code>.
 *
 * Quoted characters are copied literally, a doubled quote yields one quote
 * character, and each variable reference {name} is replaced by the single
 * character the variable stands for.  A variable reference must be closed
 * inside the range being parsed.
 *
 * On error <code>status</code> is set to U_ILLEGAL_ARGUMENT_ERROR (empty
 * range, empty or unterminated variable reference, invalid variable name,
 * more than one cursor, unquoted special character) or to whatever the
 * variable lookup reports, and parsing stops.
 *
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param text the output text will be appended to this buffer
 * @param cursorPos if this parameter is not null, then *cursorPos
 * will be set to the cursor position, or -1 if there is none.  If this
 * parameter is null, then cursors will be disallowed.
 * @param specials characters that must be quoted; typically either
 * SPECIALS or DEF_SPECIALS.
 */
void TransliterationRuleParser::parseSubPattern(int32_t start, int32_t limit,
                                                UnicodeString& text,
                                                int32_t* cursorPos,
                                                const UnicodeString& specials) {
    bool_t inQuote = FALSE;

    if (start >= limit) {
        // Empty expression in rule
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (cursorPos != 0) {
        *cursorPos = -1;
    }
    for (int32_t i=start; i<limit; ++i) {
        UChar c = rules.charAt(i);
        if (c == QUOTE) {
            // Check for double quote
            if ((i+1) < limit
                && rules.charAt(i+1) == QUOTE) {
                text.append(QUOTE);
                ++i; // Skip over both quotes
            } else {
                inQuote = !inQuote;
            }
        } else if (inQuote) {
            text.append(c);
        } else if (c == VARIABLE_REF_OPEN) {
            ++i;
            int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, i);
            // The search above runs to the end of the whole rule text, so a
            // '}' found at or beyond limit belongs to some other part of
            // the rules and leaves this reference unterminated.
            if (i == j || j < 0 || j >= limit) { // empty or unterminated
                // Illegal variable reference
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            UnicodeString name;
            rules.extractBetween(i, j, name);
            validateVariableName(name);
            if (U_FAILURE(status)) {
                return;
            }
            UChar ch = data->lookupVariable(name, status);
            if (U_FAILURE(status)) {
                return;
            }
            text.append(ch);
            i = j; // resume after the closing brace
        } else if (c == CURSOR_POS && cursorPos != 0) {
            if (*cursorPos >= 0) {
                // Multiple cursors
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            // The cursor sits after the text emitted so far
            *cursorPos = text.length();
        } else if (specials.indexOf(c) >= 0) {
            // Unquoted special character
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        } else {
            text.append(c);
        }
    }
}
void TransliterationRuleParser::validateVariableName(const UnicodeString& name) {
if (indexOf(name, SPECIALS) >= 0) {
//throw new IllegalArgumentException(
// "Special character in variable name: "
// + name);
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
/**
* Returns the single character value of the given variable name. Defined
* names are recognized.
*
* NO LONGER SUPPORTED:
* If a Unicode category name is given, a standard character variable
* in the range firstCategoryVariable to lastCategoryVariable is returned,
* with value firstCategoryVariable + n, where n is the category
* number.
* @exception IllegalArgumentException if the name is unknown.
*/
//$ UChar TransliterationRuleParser::getVariableDef(const UnicodeString& name) {
//$ UChar ch = data->lookupVariable(name, status);
//$ //! if (ch == null) {
//$ //! int id = UnicodeSet.getCategoryID(name);
//$ //! if (id >= 0) {
//$ //! ch = new Character((char) (firstCategoryVariable + id));
//$ //! data->variableNames.put(name, ch);
//$ //! data->setVariables.put(ch, new UnicodeSet(id));
//$ //! }
//$ //! }
//$ if (ch == 0) {
//$ throw new IllegalArgumentException("Undefined variable: "
//$ + name);
//$ }
//$ return ch;
//$ }
/**
* Determines what part of the private use region of Unicode we can use for
* variable stand-ins. The correct way to do this is as follows: Parse each
* rule, and for forward and reverse rules, take the FROM expression, and
* make a hash of all characters used. The TO expression should be ignored.
* When done, everything not in the hash is available for use. In practice,
* this method may employ some other algorithm for improved speed.
*/
void TransliterationRuleParser::determineVariableRange() {
UnicodeRange privateUse(0xE000, 0x1900); // Private use area
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
variableNext = variableLimit = (UChar) 0;
if (r != 0) {
variableNext = r->start;
variableLimit = (UChar) (r->start + r->length);
delete r;
}
if (variableNext >= variableLimit) {
status = U_ILLEGAL_ARGUMENT_ERROR;
}
}
/**
 * Returns the index of the first character in a set, ignoring quoted text.
 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
 * found by a search for "h".  This method searches not for a single
 * character, but for any character of the string <code>setOfChars</code>.
 * Text after an opening quote that is never closed is treated as quoted up
 * to <code>limit</code>.
 * @param text text to be searched
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #indexOf
 */
int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
                                                 int32_t start, int32_t limit,
                                                 const UnicodeString& setOfChars) {
    int32_t pos = start;
    while (pos < limit) {
        const UChar ch = text.charAt(pos);
        if (ch == QUOTE) {
            // Advance to the matching closing quote, or to limit if the
            // quote is never closed.
            for (++pos; pos < limit; ++pos) {
                if (text.charAt(pos) == QUOTE) {
                    break;
                }
            }
        } else if (setOfChars.indexOf(ch) >= 0) {
            return pos;
        }
        ++pos; // step past the current (or the closing-quote) character
    }
    return -1;
}
/**
 * Returns the index of the first character in a set.  This method searches
 * not for a single character, but for any character of the string
 * <code>setOfChars</code>.  Quoting is not taken into account.
 * @param text text to be searched
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #quotedIndexOf
 */
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
                                           int32_t start, int32_t limit,
                                           const UnicodeString& setOfChars) {
    int32_t pos = start;
    // Walk forward while the current character is not a member of the set.
    while (pos < limit && setOfChars.indexOf(text.charAt(pos)) < 0) {
        ++pos;
    }
    return (pos < limit) ? pos : -1;
}
/**
 * Returns the index of the first character in a set, searching the whole of
 * <code>text</code>.  This method searches not for a single character, but
 * for any character of the string <code>setOfChars</code>.
 * @param text text to be searched
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #quotedIndexOf
 */
int32_t TransliterationRuleParser::indexOf(const UnicodeString& text,
                                           const UnicodeString& setOfChars) {
    const int32_t textLength = text.length();
    return indexOf(text, 0, textLength, setOfChars);
}

View file

@ -0,0 +1,302 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H
#include "rbt.h"
class TransliterationRuleData;
class TransliterationRuleParser {
/**
 * The rule text being parsed.  This is a reference to external data we
 * don't own.  This works because we only hold this for the duration of the
 * call to parse().
 */
const UnicodeString& rules;
/**
 * The direction being parsed for.  Rules of the other direction are parsed
 * but not added to the rule set.
 */
RuleBasedTransliterator::Direction direction;
/**
 * The rule data being built.  Created by parseRules() and returned to the
 * caller of parse() on success.
 */
TransliterationRuleData* data;
/**
 * We use a single error code during parsing.  Rather than pass it
 * through each API, we keep it here.
 */
UErrorCode status;
/**
 * The next available stand-in for variables.  This starts at some point in
 * the private use area (discovered dynamically) and increments up toward
 * <code>variableLimit</code>.  At any point during parsing, available
 * variables are <code>variableNext..variableLimit-1</code>.
 */
UChar variableNext;
/**
 * The limit (exclusive) of the stand-ins available for variables; that is,
 * one past the last usable stand-in.  This is discovered dynamically.  At
 * any point during parsing, available variables are
 * <code>variableNext..variableLimit-1</code>.
 */
UChar variableLimit;
// Operators
static const UChar VARIABLE_DEF_OP;
static const UChar FORWARD_RULE_OP;
static const UChar REVERSE_RULE_OP;
static const char* OPERATORS;
// Other special characters
static const UChar QUOTE;
static const UChar VARIABLE_REF_OPEN;
static const UChar VARIABLE_REF_CLOSE;
static const UChar CONTEXT_OPEN;
static const UChar CONTEXT_CLOSE;
static const UChar CURSOR_POS;
static const UChar RULE_COMMENT_CHAR;
/**
 * Specials must be quoted in rules to be used as literals.
 * Specials may not occur in variable names.
 */
static const char* SPECIALS;
/**
 * Specials that must be quoted in variable definitions.
 */
static const char* DEF_SPECIALS;
public:
/**
 * Parse a complete rule string.  This is the only public entry point.
 * @param rules list of rules, separated by newline characters
 * @param direction which rules (forward or reverse) to keep
 * @return the parsed rule data, or 0 if parsing failed.  The parser does
 * not keep or free the returned object.
 */
static TransliterationRuleData*
parse(const UnicodeString& rules,
RuleBasedTransliterator::Direction direction);
private:
/**
 * Construct a parser over the given rules.  No parsing is done here.
 * @param rules list of rules, separated by newline characters; only a
 * reference is kept
 * @param direction which rules (forward or reverse) to keep
 */
TransliterationRuleParser(const UnicodeString& rules,
RuleBasedTransliterator::Direction direction);
/**
 * Parse the rule text as a sequence of rules, separated by newline
 * characters ('\n'), and build the corresponding rule data.  Any
 * previously built data is discarded.  Typically this method is called
 * exactly once, from parse().  On a syntax error <code>status</code> is
 * set to a failure code.
 */
void parseRules();
/**
 * Parse the given substring as a rule, and append it to the rules currently
 * represented in this object.  On a syntax error <code>status</code> is
 * set to a failure code.
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 */
void applyRule(int32_t start, int32_t limit);
/**
 * Add a variable definition.  On error <code>status</code> is set to a
 * failure code.
 * @param name the name of the variable.  It must not already be defined.
 * @param pattern the value of the variable.  It may be a single character
 * or a pattern describing a character set.
 */
void applyVariableDef(const UnicodeString& name,
const UnicodeString& pattern);
/**
 * Given a rule, parses it into three pieces: The left side, the right side,
 * and the operator.  Returns the operator.  Quotes and variable references
 * are resolved; the output text in all <code>UnicodeString</code>
 * parameters is literal text.  This method delegates to other parsing
 * methods to handle the match pattern, output pattern, and other
 * sub-patterns in the rule.
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param left left side of rule is appended to this buffer
 * with the quotes removed and variables resolved
 * @param right right side of rule is appended to this buffer
 * with the quotes removed and variables resolved
 * @param anteContext the preceding context of the match pattern,
 * if there is one, is appended to this buffer
 * @param postContext the following context of the match pattern,
 * if there is one, is appended to this buffer
 * @param cursorPos if there is a cursor in the output pattern, its
 * offset is stored in <code>cursorPos</code>
 * @return The operator character, one of the characters in OPERATORS, or 0
 * on a syntax error.
 */
UChar parseRule(int32_t start, int32_t limit,
UnicodeString& left, UnicodeString& right,
UnicodeString& anteContext,
UnicodeString& postContext,
int32_t& cursorPos);
/**
 * Parses the match pattern of a forward or reverse rule.  Given the raw
 * match pattern, produce the match text and the context on both sides, if
 * any.  Resolves all quotes and variables.
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param text the key to be matched will be appended to this buffer
 * @param anteContext the preceding context, if any, will be appended
 * to this buffer.
 * @param postContext the following context, if any, will be appended
 * to this buffer.
 */
void parseMatchPattern(int32_t start, int32_t limit,
UnicodeString& text,
UnicodeString& anteContext,
UnicodeString& postContext);
/**
 * Parse an ordinary sub-pattern: all characters in SPECIALS must be quoted
 * and no cursor is allowed.
 * @param start the beginning index, inclusive
 * @param limit the ending index, exclusive
 * @param text the resolved text is appended to this buffer
 */
void parseSubPattern(int32_t start, int32_t limit,
UnicodeString& text);
/**
 * Parse a variable definition sub pattern.  This kind of sub
 * pattern differs in the set of characters that are considered
 * special.  In particular, the '[' and ']' characters are not
 * special, since these are used in UnicodeSet patterns.
 */
void parseDefPattern(int32_t start, int32_t limit,
UnicodeString& text);
/**
 * Parses the output pattern of a forward or reverse rule.  Given the
 * output pattern, produce the output text and the position of the cursor,
 * if any.  Resolves all quotes and variables.  The text parsed is the
 * member <code>rules</code>.
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param text the output text will be appended to this buffer
 * @param cursorPos for a non-empty range, set to the cursor position, or
 * to -1 if there is none
 */
void parseOutputPattern(int32_t start, int32_t limit,
UnicodeString& text,
int32_t& cursorPos);
/**
 * Parses a sub-pattern of a rule.  Produces the text and the position of
 * the cursor, if any.  Resolves all quotes and variables.  The text parsed
 * is the member <code>rules</code>.
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= rules.length()</code>.
 * @param text the output text will be appended to this buffer
 * @param cursorPos if this parameter is not null, then *cursorPos
 * will be set to the cursor position, or -1 if there is none.  If this
 * parameter is null, then cursors will be disallowed.
 * @param specials characters that must be quoted; typically either
 * SPECIALS or DEF_SPECIALS.
 */
void parseSubPattern(int32_t start, int32_t limit,
UnicodeString& text,
int32_t* cursorPos,
const UnicodeString& specials);
/**
 * Check that a variable name contains no character from SPECIALS; if it
 * does, <code>status</code> is set to a failure code.
 * @param name the variable name to check
 */
void validateVariableName(const UnicodeString& name);
/**
 * Returns the single character value of the given variable name.  Defined
 * names are recognized.
 *
 * NO LONGER SUPPORTED (the declaration below is commented out):
 * If a Unicode category name is given, a standard character variable
 * in the range firstCategoryVariable to lastCategoryVariable is returned,
 * with value firstCategoryVariable + n, where n is the category
 * number.
 */
//$ Character getVariableDef(const UnicodeString& name);
/**
 * Determines what part of the private use region of Unicode we can use for
 * variable stand-ins.  The correct way to do this is as follows: Parse each
 * rule, and for forward and reverse rules, take the FROM expression, and
 * make a hash of all characters used.  The TO expression should be ignored.
 * When done, everything not in the hash is available for use.  In practice,
 * this method may employ some other algorithm for improved speed.
 */
void determineVariableRange();
/**
 * Returns the index of the first character in a set, ignoring quoted text.
 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
 * found by a search for "h".  This method searches not for a single
 * character, but for any character of the string
 * <code>setOfChars</code>.
 * @param text text to be searched
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #indexOf
 */
static int32_t quotedIndexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars);
/**
 * Returns the index of the first character in a set.  This method searches
 * not for a single character, but for any character of the string
 * <code>setOfChars</code>.
 * @param text text to be searched
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #quotedIndexOf
 */
static int32_t indexOf(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& setOfChars);
/**
 * Returns the index of the first character in a set, searching all of
 * <code>text</code>.  This method searches not for a single character, but
 * for any character of the string <code>setOfChars</code>.
 * @param text text to be searched
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #quotedIndexOf
 */
static int32_t indexOf(const UnicodeString& text,
const UnicodeString& setOfChars);
};
#endif

View file

@ -0,0 +1,436 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_rule.h"
#include "rep.h"
#include "rbt_data.h"
#include "unifilt.h"
#include "uniset.h"
/**
 * Construct a new rule with the given key, output text, and other
 * attributes.  Either context string may be empty.  A cursor position may
 * be specified for the output text.
 *
 * If <code>status</code> already indicates a failure on entry, the members
 * are initialized from the arguments but nothing else is done.
 *
 * @param theKey the string to match
 * @param theOutput the string to produce when the key is seen
 * @param theAnteContext if not empty, then it must be matched
 * before the key
 * @param thePostContext if not empty, then it must be matched
 * after the key
 * @param theCursorPos a position for the cursor after the output
 * is emitted.  If less than zero, then the cursor is placed after the
 * output; that is, -1 is equivalent to <code>output.length()</code>.
 * @param status set to U_ILLEGAL_ARGUMENT_ERROR if the cursor position is
 * greater than <code>output.length()</code>, and to
 * U_MEMORY_ALLOCATION_ERROR if the mask key cannot be allocated.
 */
TransliterationRule::TransliterationRule(const UnicodeString& theKey,
                                         const UnicodeString& theOutput,
                                         const UnicodeString& theAnteContext,
                                         const UnicodeString& thePostContext,
                                         int32_t theCursorPos,
                                         UErrorCode &status) :
    key(theKey), output(theOutput),
    anteContext(theAnteContext),
    postContext(thePostContext),
    cursorPos(theCursorPos),
    maskKey(0) {

    if (!U_SUCCESS(status)) {
        return;
    }

    const int32_t outputLength = output.length();
    if (cursorPos < 0) {
        // Negative means "after the output"
        cursorPos = outputLength;
    } else if (cursorPos > outputLength) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }

    /* The mask key (key followed by post context) is needed when we are
     * adding individual rules to a rule set, for performance.  Here are the
     * numbers: Without mask key, 13.0 seconds.  With mask key, 6.2 seconds.
     * However, once the rules have been added to the set, then they can be
     * discarded to free up space.  This is what the freeze() method does.
     * After freeze() has been called, the method masks() must NOT be
     * called.
     */
    maskKey = new UnicodeString(key);
    if (maskKey != 0) {
        maskKey->append(postContext);
        return;
    }
    status = U_MEMORY_ALLOCATION_ERROR;
}
/**
 * Destructor.  Releases the mask key if it is still allocated.
 */
TransliterationRule::~TransliterationRule() {
    // maskKey is null after freeze(); deleting a null pointer is a no-op.
    delete this->maskKey;
}
/**
 * Return the number of characters in the match key.
 * @return the length of the match key.
 */
int32_t TransliterationRule::getKeyLength() const {
    const int32_t keyLength = key.length();
    return keyLength;
}
/**
 * Return the match key.
 * @return a reference to the key string stored in this rule.
 */
const UnicodeString& TransliterationRule::getKey() const {
    return this->key;
}
/**
 * Return the string emitted when this rule matches.
 * @return a reference to the output string stored in this rule.
 */
const UnicodeString& TransliterationRule::getOutput() const {
    return this->output;
}
/**
 * Return the cursor position within the output string.
 * @return the stored cursor position; for a successfully constructed
 * rule this is a value from 0 to <code>getOutput().length()</code>,
 * inclusive.
 */
int32_t TransliterationRule::getCursorPos() const {
    return this->cursorPos;
}
/**
 * Return the length of the preceding context.  This is needed to
 * support the <code>Transliterator</code> method
 * <code>getMaximumContextLength()</code>.
 * @return the number of characters in the ante context; 0 if none.
 */
int32_t TransliterationRule::getAnteContextLength() const {
    const int32_t anteLength = anteContext.length();
    return anteLength;
}
/**
 * Return true if this rule masks another rule.  If r1 masks r2 then
 * r1 matches any input string that r2 matches.  If r1 masks r2 and r2
 * masks r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks
 * "a[b]>y".  "[c]a>x" masks "[dc]a>y".
 *
 * <p>This method must not be called after freeze() has been called on
 * either rule: freeze() releases the mask key, and this method
 * dereferences the mask keys of both rules without checking them.
 * @param r2 the rule that may be masked by this rule
 * @return true if this rule masks <code>r2</code>
 */
bool_t TransliterationRule::masks(const TransliterationRule& r2) const {
    /* There are three cases of masking.  In each instance, rule1
     * masks rule2.
     *
     * 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
     *
     * 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
     *    prefix2 ends with prefix1, suffix2 starts with suffix1.
     *
     * 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
     *    prefix2 ends with prefix1, suffix2 starts with suffix1.
     *
     * The mask key is the key followed by the post context, so the
     * "starts with" test below handles the key and suffix comparisons
     * together.
     */

    /* LIMITATION of the current mask algorithm: Some rule
     * maskings are currently not detected.  For example,
     * "{Lu}]a>x" masks "A]a>y".  To detect these sorts of masking,
     * we need a subset operator on UnicodeSet objects, which we
     * currently do not have.  This can be added later.
     */

    // Our mask key is a proper prefix of the other rule's mask key.
    if (maskKey->length() < r2.maskKey->length() &&
        r2.maskKey->startsWith(*maskKey)) {
        return TRUE;
    }

    // Otherwise masking is only possible when the mask keys are equal
    // and the other rule has a preceding context.
    if (r2.anteContext.length() == 0 || !(*maskKey == *r2.maskKey)) {
        return FALSE;
    }

    // No preceding context of our own: we match wherever r2 does.
    if (anteContext.length() == 0) {
        return TRUE;
    }

    // Our preceding context must be a proper suffix of r2's.
    if (anteContext.length() < r2.anteContext.length() &&
        r2.anteContext.endsWith(anteContext)) {
        return TRUE;
    }
    return FALSE;
}
/**
* Free up space. Once this method is called, masks() must NOT be called.
* If it is called, an exception will be thrown.
*/
void TransliterationRule::freeze() {
delete maskKey;
maskKey = 0;
}
/**
 * Return true if this rule matches the given text.  The text being
 * matched occupies a virtual buffer consisting of the contents of
 * <code>result</code> concatenated to a substring of <code>text</code>.
 * The substring is specified by <code>start</code> and
 * <code>limit</code>.  The value of <code>cursor</code> is an index into
 * this virtual buffer, from 0 to the length of the buffer.  In terms of
 * the parameters, <code>cursor</code> must be between 0 and
 * <code>result.length() + limit - start</code>.
 *
 * <p>The ante context (if any) is tested just before the cursor, the key
 * at the cursor, and the post context (if any) just after the key.
 * @param text the untranslated text
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param result translated text so far
 * @param cursor position at which to translate next, an offset into
 * result.  If greater than or equal to result.length(), represents
 * offset start + cursor - result.length() into text.
 * @param data a dictionary of variables mapping a character to the
 * set it stands for
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return true if the contexts and the key all match
 */
bool_t TransliterationRule::matches(const UnicodeString& text,
                                    int32_t start, int32_t limit,
                                    const UnicodeString& result,
                                    int32_t cursor,
                                    const TransliterationRuleData& data,
                                    const UnicodeFilter* filter) const {
    // Preceding context, if any, must match immediately before the cursor.
    const int32_t anteLength = anteContext.length();
    if (anteLength != 0 &&
        !regionMatches(text, start, limit, result,
                       cursor - anteLength,
                       anteContext, data, filter)) {
        return FALSE;
    }

    // The key must match at the cursor.
    if (!regionMatches(text, start, limit, result, cursor,
                       key, data, filter)) {
        return FALSE;
    }

    // Following context, if any, must match immediately after the key.
    if (postContext.length() == 0) {
        return TRUE;
    }
    return regionMatches(text, start, limit, result,
                         cursor + key.length(),
                         postContext, data, filter) ? TRUE : FALSE;
}
/**
 * Return true if this rule matches the given text.  The ante context
 * (if any) is tested just before the cursor, the key at the cursor, and
 * the post context (if any) just after the key.
 * @param text the text, both translated and untranslated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param cursor position at which to translate next, representing
 * offset into text.  This value must be between <code>start</code> and
 * <code>limit</code>.
 * @param data a dictionary of variables mapping a character to the
 * set it stands for
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return true if the contexts and the key all match
 */
bool_t TransliterationRule::matches(const Replaceable& text,
                                    int32_t start, int32_t limit,
                                    int32_t cursor,
                                    const TransliterationRuleData& data,
                                    const UnicodeFilter* filter) const {
    // Preceding context, if any, must match immediately before the cursor.
    const int32_t anteLength = anteContext.length();
    if (anteLength != 0 &&
        !regionMatches(text, start, limit, cursor - anteLength,
                       anteContext, data, filter)) {
        return FALSE;
    }

    // The key must match at the cursor.
    if (!regionMatches(text, start, limit, cursor,
                       key, data, filter)) {
        return FALSE;
    }

    // Following context, if any, must match immediately after the key.
    if (postContext.length() == 0) {
        return TRUE;
    }
    return regionMatches(text, start, limit, cursor + key.length(),
                         postContext, data, filter) ? TRUE : FALSE;
}
/**
 * Return the degree of match between this rule and the given text.  The
 * degree of match may be mismatch, a partial match, or a full match.  A
 * mismatch means at least one character of the text does not match the
 * context or key.  A partial match means some context and key characters
 * match, but the text is not long enough to match all of them.  A full
 * match means all context and key characters match.
 *
 * <p>The ante context must match in full; only the key and the post
 * context may be partially matched.
 * @param text the text, both translated and untranslated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param cursor position at which to translate next, representing
 * offset into text.  This value must be between <code>start</code> and
 * <code>limit</code>.
 * @param data a dictionary of variables mapping a character to the
 * set it stands for
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
 * <code>FULL_MATCH</code>.
 * @see #MISMATCH
 * @see #PARTIAL_MATCH
 * @see #FULL_MATCH
 */
int32_t TransliterationRule::getMatchDegree(const Replaceable& text,
                                            int32_t start, int32_t limit,
                                            int32_t cursor,
                                            const TransliterationRuleData& data,
                                            const UnicodeFilter* filter) const {
    // The preceding context is all-or-nothing.
    const int32_t anteLength = anteContext.length();
    if (anteLength != 0 &&
        !regionMatches(text, start, limit, cursor - anteLength,
                       anteContext, data, filter)) {
        return MISMATCH;
    }

    // The key may match fully, partially (text ran out), or not at all.
    const int32_t keyLength = key.length();
    const int32_t keyMatched = getRegionMatchLength(text, start, limit,
                                                    cursor,
                                                    key, data, filter);
    if (keyMatched < 0) {
        return MISMATCH;
    }
    if (keyMatched < keyLength) {
        return PARTIAL_MATCH;
    }

    // The key matched in full; without a following context we are done.
    const int32_t postLength = postContext.length();
    if (postLength == 0) {
        return FULL_MATCH;
    }

    // The following context may likewise match fully, partially, or not
    // at all.
    const int32_t postMatched = getRegionMatchLength(text, start, limit,
                                                     cursor + keyLength,
                                                     postContext, data, filter);
    if (postMatched < 0) {
        return MISMATCH;
    }
    if (postMatched == postLength) {
        return FULL_MATCH;
    }
    return PARTIAL_MATCH;
}
/**
 * Return true if a template matches the text.  The entire length of the
 * template is compared to the text at the cursor.  As in
 * <code>matches()</code>, the text being matched occupies a virtual
 * buffer consisting of the contents of <code>result</code> concatenated
 * to a substring of <code>text</code>.  See <code>matches()</code> for
 * details.
 * @param text the untranslated text
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param result translated text so far
 * @param cursor position at which to translate next, an offset into
 * result.  If greater than or equal to result.length(), represents
 * offset start + cursor - result.length() into text.
 * @param templ the text to match against.  All characters must match.
 * @param data a dictionary of variables mapping a character to the
 * set it stands for
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return true if there is a match; false if any character fails to
 * match or the template does not fit inside the virtual buffer
 */
bool_t TransliterationRule::regionMatches(const UnicodeString& text,
                                          int32_t start, int32_t limit,
                                          const UnicodeString& result,
                                          int32_t cursor,
                                          const UnicodeString& templ,
                                          const TransliterationRuleData& data,
                                          const UnicodeFilter* filter) const {
    const int32_t resultLength = result.length();
    const int32_t templLength = templ.length();

    // The whole template must fit inside the virtual buffer.
    if (cursor < 0) {
        return FALSE;
    }
    if ((cursor + templLength) > (resultLength + limit - start)) {
        return FALSE;
    }

    for (int32_t offset = 0; offset < templLength; ++offset) {
        const int32_t pos = cursor + offset;
        // Positions below resultLength live in result; the remainder map
        // onto text starting at index start.
        const UChar textChar = (pos < resultLength)
            ? result.charAt(pos)
            : text.charAt(pos - resultLength + start);
        if (!charMatches(templ.charAt(offset), textChar, data, filter)) {
            return FALSE;
        }
    }
    return TRUE;
}
/**
 * Return true if a template matches the text.  The entire length of the
 * template is compared to the text at the cursor.
 * @param text the text, both translated and untranslated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param cursor position at which to translate next, representing
 * offset into text.  This value must be between <code>start</code> and
 * <code>limit</code>.
 * @param templ the text to match against.  All characters must match.
 * @param data a dictionary of variables mapping a character to the
 * set it stands for
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return true if there is a match; false if any character fails to
 * match or the template does not fit between <code>start</code> and
 * <code>limit</code>
 */
bool_t TransliterationRule::regionMatches(const Replaceable& text,
                                          int32_t start, int32_t limit,
                                          int32_t cursor,
                                          const UnicodeString& templ,
                                          const TransliterationRuleData& data,
                                          const UnicodeFilter* filter) const {
    const int32_t templLength = templ.length();

    // The whole template must fit inside [start, limit).
    if (cursor < start) {
        return FALSE;
    }
    if ((cursor + templLength) > limit) {
        return FALSE;
    }

    for (int32_t offset = 0; offset < templLength; ++offset) {
        if (!charMatches(templ.charAt(offset), text.charAt(cursor + offset),
                         data, filter)) {
            return FALSE;
        }
    }
    return TRUE;
}
/**
 * Return the number of characters of the text that match a template.
 * If there is a mismatch, return -1.  If the text is not long enough to
 * match any characters, return 0.  Comparison stops at the end of the
 * template or at <code>limit</code>, whichever comes first.
 * @param text the text, both translated and untranslated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param cursor position at which to translate next, representing
 * offset into text.  This value must be between <code>start</code> and
 * <code>limit</code>; a value below <code>start</code> is reported as a
 * mismatch.
 * @param templ the text to match against.  All characters must match.
 * @param data a dictionary of variables mapping a character to the
 * set it stands for
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return -1 if there is a mismatch, 0 if the text is not long enough to
 * match any characters, otherwise the number of characters of text that
 * match the template.
 */
int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text,
                                                  int32_t start,
                                                  int32_t limit, int32_t cursor,
                                                  const UnicodeString& templ,
                                                  const TransliterationRuleData& data,
                                                  const UnicodeFilter* filter) const {
    if (cursor < start) {
        return -1;
    }

    int32_t matched = 0;
    while (matched < templ.length() && (cursor + matched) < limit) {
        if (!charMatches(templ.charAt(matched), text.charAt(cursor + matched),
                         data, filter)) {
            return -1;
        }
        ++matched;
    }
    return matched;
}
/**
 * Return true if the given key character matches the given text
 * character.  This method accounts for the fact that the key character
 * may represent a character set.  Note that the key and text characters
 * may not be interchanged without altering the results.
 *
 * <p>A text character rejected by the filter never matches.  Otherwise,
 * if <code>data</code> maps <code>keyChar</code> to a set, the result is
 * whether that set contains <code>textChar</code>; if there is no such
 * set, the two characters must be equal.
 * @param keyChar a character in the match key
 * @param textChar a character in the text being transliterated
 * @param data a dictionary of variables mapping a character to the
 * set it stands for
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return true if <code>textChar</code> passes the filter and matches
 * <code>keyChar</code>
 */
bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar,
                                        const TransliterationRuleData& data,
                                        const UnicodeFilter* filter) const {
    // Apply the filter first and on its own.  The previous single
    // expression "filterOk && (set = lookup) == 0 ? eq : set->contains()"
    // parsed as "(filterOk && ...) ? eq : set->contains()", because ?: binds
    // more loosely than &&.  A character rejected by the filter therefore
    // fell through to set->contains() with set still null.
    if (filter != 0 && !filter->isIn(textChar)) {
        return FALSE;
    }

    // A key character that names a variable matches any member of the
    // variable's set; any other key character must match literally.
    UnicodeSet* set = data.lookupSet(keyChar);
    return (set == 0) ? (keyChar == textChar) : set->contains(textChar);
}

View file

@ -0,0 +1,380 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef RBT_RULE_H
#define RBT_RULE_H
#include "unistr.h"
class Replaceable;
class TransliterationRuleData;
class UnicodeFilter;
/**
 * A transliteration rule used by
 * <code>RuleBasedTransliterator</code>.
 * Apart from <code>freeze()</code>, which discards the internal mask
 * key, a <code>TransliterationRule</code> is not modified after
 * construction.
 *
 * <p>A rule consists of an input pattern and an output string.  When
 * the input pattern is matched, the output string is emitted.  The
 * input pattern consists of zero or more characters which are matched
 * exactly (the key) and optional context.  Context must match if it
 * is specified.  Context may be specified before the key, after the
 * key, or both.  The key, preceding context, and following context
 * may contain variables.  Variables represent a set of Unicode
 * characters, such as the letters <i>a</i> through <i>z</i>.
 * Variables are detected by looking up each character in a supplied
 * variable list to see if it has been so defined.
 *
 * <p>A rule owns a heap-allocated mask key, so rules cannot be copied
 * or assigned.
 *
 * @author Alan Liu
 */
class TransliterationRule {

public:

    /**
     * Constants returned by <code>getMatchDegree()</code> indicating
     * the degree of match between the text and this rule.
     * @see #getMatchDegree
     */
    enum {
        /**
         * Constant returned by <code>getMatchDegree()</code>
         * indicating a mismatch between the text and this rule.  One
         * or more characters of the context or key do not match the
         * text.
         */
        MISMATCH,

        /**
         * Constant returned by <code>getMatchDegree()</code>
         * indicating a partial match between the text and this rule.
         * All characters of the text match the corresponding context
         * or key, but more characters are required for a complete
         * match.  There are some key or context characters at the end
         * of the pattern that remain unmatched because the text isn't
         * long enough.
         */
        PARTIAL_MATCH,

        /**
         * Constant returned by <code>getMatchDegree()</code>
         * indicating a complete match between the text and this rule.
         * The text matches all context and key characters.
         */
        FULL_MATCH
    };

private:

    /**
     * The string that must be matched.
     */
    UnicodeString key;

    /**
     * The string that is emitted if the key, anteContext, and postContext
     * are matched.
     */
    UnicodeString output;

    /**
     * The string that must match before the key.  If empty, then
     * there is no matching requirement before the key.
     */
    UnicodeString anteContext;

    /**
     * The string that must match after the key.  If empty, then there
     * is no matching requirement after the key.
     */
    UnicodeString postContext;

    /**
     * The position of the cursor after emitting the output string, from 0 to
     * output.length().  For most rules with no special cursor specification,
     * the cursorPos is output.length().
     */
    int32_t cursorPos;

    /**
     * A string used to implement masks().  Owned by this object; null
     * once freeze() has been called.
     * @see #freeze
     */
    UnicodeString* maskKey;

    /**
     * Copying is forbidden.  This object owns <code>maskKey</code> and
     * deletes it in its destructor, so a member-wise copy would lead to
     * the same string being deleted twice.  Declared private and
     * intentionally left undefined.
     */
    TransliterationRule(const TransliterationRule& other);

    /**
     * Assignment is forbidden for the same reason as copying.  Declared
     * private and intentionally left undefined.
     */
    TransliterationRule& operator=(const TransliterationRule& other);

public:

    /**
     * Construct a new rule with the given key, output text, and other
     * attributes.  Zero, one, or two context strings may be specified.  A
     * cursor position may be specified for the output text.
     * @param theKey the string to match
     * @param theOutput the string to produce when the key is seen
     * @param theAnteContext if not empty, then it must be matched
     * before the key
     * @param thePostContext if not empty, then it must be matched
     * after the key
     * @param theCursorPos a position for the cursor after the output
     * is emitted.  If less than zero, then the cursor is placed after the
     * output; that is, -1 is equivalent to
     * <code>theOutput.length()</code>.
     * @param status in/out error code.  Set to
     * <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the cursor position is
     * greater than <code>theOutput.length()</code>, and to
     * <code>U_MEMORY_ALLOCATION_ERROR</code> if the internal mask key
     * cannot be allocated.
     */
    TransliterationRule(const UnicodeString& theKey,
                        const UnicodeString& theOutput,
                        const UnicodeString& theAnteContext,
                        const UnicodeString& thePostContext,
                        int32_t theCursorPos,
                        UErrorCode &status);

    /**
     * Destructor.
     */
    virtual ~TransliterationRule();

    /**
     * Return the length of the key.  Equivalent to <code>getKey().length()</code>.
     * @return the length of the match key.
     */
    virtual int32_t getKeyLength() const;

    /**
     * Return the key.
     * @return the match key.
     */
    virtual const UnicodeString& getKey() const;

    /**
     * Return the output string.
     * @return the output string.
     */
    virtual const UnicodeString& getOutput() const;

    /**
     * Return the position of the cursor within the output string.
     * @return a value from 0 to <code>getOutput().length()</code>, inclusive.
     */
    virtual int32_t getCursorPos() const;

    /**
     * Return the preceding context length.  This method is needed to
     * support the <code>Transliterator</code> method
     * <code>getMaximumContextLength()</code>.
     * @return the length of the preceding context; 0 if there is none.
     */
    virtual int32_t getAnteContextLength() const;

    /**
     * Return true if this rule masks another rule.  If r1 masks r2 then
     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
     * "[c]a>x" masks "[dc]a>y".
     *
     * <p>This method must not be called after freeze() is called.
     * @param r2 the rule that may be masked by this rule
     * @return true if this rule masks <code>r2</code>
     */
    virtual bool_t masks(const TransliterationRule& r2) const;

    /**
     * Free up space.  Once this method is called, masks() must NOT be
     * called, because the mask key it relies on has been released.
     */
    virtual void freeze();

    /**
     * Return true if this rule matches the given text.  The text being matched
     * occupies a virtual buffer consisting of the contents of
     * <code>result</code> concatenated to a substring of <code>text</code>.
     * The substring is specified by <code>start</code> and <code>limit</code>.
     * The value of <code>cursor</code> is an index into this virtual buffer,
     * from 0 to the length of the buffer.  In terms of the parameters,
     * <code>cursor</code> must be between 0 and <code>result.length() + limit -
     * start</code>.
     * @param text the untranslated text
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param result translated text so far
     * @param cursor position at which to translate next, an offset into result.
     * If greater than or equal to result.length(), represents offset start +
     * cursor - result.length() into text.
     * @param data a dictionary of variables mapping a character to the
     * set it stands for
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return true if the contexts and the key all match
     */
    virtual bool_t matches(const UnicodeString& text,
                           int32_t start, int32_t limit,
                           const UnicodeString& result,
                           int32_t cursor,
                           const TransliterationRuleData& data,
                           const UnicodeFilter* filter) const;

    /**
     * Return true if this rule matches the given text.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param data a dictionary of variables mapping a character to the
     * set it stands for
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return true if the contexts and the key all match
     */
    virtual bool_t matches(const Replaceable& text,
                           int32_t start, int32_t limit,
                           int32_t cursor,
                           const TransliterationRuleData& data,
                           const UnicodeFilter* filter) const;

    /**
     * Return the degree of match between this rule and the given text.  The
     * degree of match may be mismatch, a partial match, or a full match.  A
     * mismatch means at least one character of the text does not match the
     * context or key.  A partial match means some context and key characters
     * match, but the text is not long enough to match all of them.  A full
     * match means all context and key characters match.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param data a dictionary of variables mapping a character to the
     * set it stands for
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
     * <code>FULL_MATCH</code>.
     * @see #MISMATCH
     * @see #PARTIAL_MATCH
     * @see #FULL_MATCH
     */
    virtual int32_t getMatchDegree(const Replaceable& text,
                                   int32_t start, int32_t limit,
                                   int32_t cursor,
                                   const TransliterationRuleData& data,
                                   const UnicodeFilter* filter) const;

    /**
     * Return true if a template matches the text.  The entire length of the
     * template is compared to the text at the cursor.  As in
     * <code>matches()</code>, the text being matched occupies a virtual buffer
     * consisting of the contents of <code>result</code> concatenated to a
     * substring of <code>text</code>.  See <code>matches()</code> for details.
     * @param text the untranslated text
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param result translated text so far
     * @param cursor position at which to translate next, an offset into result.
     * If greater than or equal to result.length(), represents offset start +
     * cursor - result.length() into text.
     * @param templ the text to match against.  All characters must match.
     * @param data a dictionary of variables mapping a character to the
     * set it stands for
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return true if there is a match
     */
    virtual bool_t regionMatches(const UnicodeString& text,
                                 int32_t start, int32_t limit,
                                 const UnicodeString& result,
                                 int32_t cursor,
                                 const UnicodeString& templ,
                                 const TransliterationRuleData& data,
                                 const UnicodeFilter* filter) const;

    /**
     * Return true if a template matches the text.  The entire length of the
     * template is compared to the text at the cursor.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param templ the text to match against.  All characters must match.
     * @param data a dictionary of variables mapping a character to the
     * set it stands for
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return true if there is a match
     */
    virtual bool_t regionMatches(const Replaceable& text,
                                 int32_t start, int32_t limit,
                                 int32_t cursor,
                                 const UnicodeString& templ,
                                 const TransliterationRuleData& data,
                                 const UnicodeFilter* filter) const;

    /**
     * Return the number of characters of the text that match a template.  If
     * there is a mismatch, return -1.  If the text is not long enough to match
     * any characters, return 0.
     * @param text the text, both translated and untranslated
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
     * @param cursor position at which to translate next, representing offset
     * into text.  This value must be between <code>start</code> and
     * <code>limit</code>.
     * @param templ the text to match against.  All characters must match.
     * @param data a dictionary of variables mapping a character to the
     * set it stands for
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return -1 if there is a mismatch, 0 if the text is not long enough to
     * match any characters, otherwise the number of characters of text that
     * match the template.
     */
    virtual int32_t getRegionMatchLength(const Replaceable& text, int32_t start,
                                         int32_t limit, int32_t cursor,
                                         const UnicodeString& templ,
                                         const TransliterationRuleData& data,
                                         const UnicodeFilter* filter) const;

    /**
     * Return true if the given key matches the given text.  This method
     * accounts for the fact that the key character may represent a character
     * set.  Note that the key and text characters may not be interchanged
     * without altering the results.
     * @param keyChar a character in the match key
     * @param textChar a character in the text being transliterated
     * @param data a dictionary of variables mapping a character to the
     * set it stands for
     * @param filter the filter.  Any character for which
     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return true if the text character matches the key character
     */
    virtual bool_t charMatches(UChar keyChar, UChar textChar,
                               const TransliterationRuleData& data,
                               const UnicodeFilter* filter) const;
};
#endif

View file

@ -0,0 +1,217 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_set.h"
#include "rbt_rule.h"
#include "unistr.h"
/* Note: There was an old implementation that indexed by first letter of
* key. Problem with this is that key may not have a meaningful first
* letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
* rules whose initial key letter is a category variable. However, the
* problem is that they must be kept in order with respect to other rules.
* One solution -- add a sequence number to each rule. Do the usual
* first-letter lookup, and also a lookup from the spare bin with rules like
* {Lu}>*. Take the lower sequence number. This seems complex and not
* worth the trouble, but we may revisit this later. For documentation (or
* possible resurrection) the old code is included below, commented out
* with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
* implementation, <code>rules</code> is a Hashtable, not a Vector.
*/
/**
 * Construct a new empty rule set.  The maximum context length starts
 * at zero and grows as rules are added.
 */
TransliterationRuleSet::TransliterationRuleSet() :
    maxContextLength(0) {
}
/**
 * Return the maximum context length.
 * @return the length of the longest preceding context among the rules
 * added so far; 0 if no rule has a preceding context.
 */
int32_t TransliterationRuleSet::getMaximumContextLength() const {
    return this->maxContextLength;
}
/**
 * Add a rule to this set.  Rules are added in order, and order is
 * significant.  A rule is rejected if any rule already in the set masks
 * it, since the new rule could then never be reached.
 *
 * <p>Once freeze() is called, this method must not be called.
 * @param adoptedRule the rule to add.  This set takes ownership: the
 * rule is either stored in the set or deleted before this method
 * returns.
 * @param status in/out error code.  If it already indicates failure on
 * entry, the rule is deleted and nothing is added.  Set to
 * <code>U_ILLEGAL_ARGUMENT_ERROR</code> if <code>adoptedRule</code> is
 * null or is masked by a rule already in the set.
 */
void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
                                     UErrorCode& status) {

    // Build time, no checking  : 3562 ms
    // Build time, with checking: 6234 ms

    if (U_FAILURE(status)) {
        delete adoptedRule;
        return;
    }

    // A null rule (for example, the result of a failed allocation by the
    // caller) cannot be stored or compared; report it instead of
    // dereferencing it below.
    if (adoptedRule == 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    for (int32_t i=0; i<rules.size(); ++i) {
        TransliterationRule* r = (TransliterationRule*) rules.elementAt(i);
        if (r->masks(*adoptedRule)) {
            // The new rule would have to precede r in order to ever match.
            //throw new IllegalArgumentException("Rule " + rule +
            //                                   " must precede " + r);
            status = U_ILLEGAL_ARGUMENT_ERROR;
            delete adoptedRule;
            return;
        }
    }

    rules.addElement(adoptedRule);

    // Track the longest preceding context seen so far.
    int32_t len;
    if ((len = adoptedRule->getAnteContextLength()) > maxContextLength) {
        maxContextLength = len;
    }
}
/**
* Free up space. Once this method is called, addRule() must NOT
* be called again.
*/
void TransliterationRuleSet::freeze() {
for (int32_t i=0; i<rules.size(); ++i) {
((TransliterationRule*) rules.elementAt(i))->freeze();
}
}
/**
 * Attempt to find a matching rule at the specified point in the text.
 * Rules are tried in the order they were added and the first one that
 * matches is returned.  The text being matched occupies a virtual buffer
 * consisting of the contents of <code>result</code> concatenated to a
 * substring of <code>text</code>.  The substring is specified by
 * <code>start</code> and <code>limit</code>.  The value of
 * <code>cursor</code> is an index into this virtual buffer, from 0 to
 * the length of the buffer.  In terms of the parameters,
 * <code>cursor</code> must be between 0 and <code>result.length() +
 * limit - start</code>.
 * @param text the untranslated text
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @param result translated text
 * @param cursor position at which to translate next, an offset into
 * result.  If greater than or equal to result.length(), represents
 * offset start + cursor - result.length() into text.
 * @param data a dictionary mapping variables to the sets they
 * represent
 * @param filter the filter.  Any character for which
 * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
 * altered by this transliterator.  If <tt>filter</tt> is
 * <tt>null</tt> then no filtering is applied.
 * @return the matching rule, or null if none found.
 */
TransliterationRule*
TransliterationRuleSet::findMatch(const UnicodeString& text,
                                  int32_t start, int32_t limit,
                                  const UnicodeString& result,
                                  int32_t cursor,
                                  const TransliterationRuleData& data,
                                  const UnicodeFilter* filter) const {
    const int32_t count = rules.size();
    for (int32_t index = 0; index < count; ++index) {
        TransliterationRule* candidate =
            (TransliterationRule*) rules.elementAt(index);
        if (!candidate->matches(text, start, limit, result,
                                cursor, data, filter)) {
            continue;
        }
        return candidate;
    }
    return 0;
}
/**
* Attempt to find a matching rule at the specified point in the text.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @param filter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
* @return the matching rule, or null if none found.
*/
TransliterationRule*
TransliterationRuleSet::findMatch(const Replaceable& text,
                                  int32_t start, int32_t limit,
                                  int32_t cursor,
                                  const TransliterationRuleData& data,
                                  const UnicodeFilter* filter) const {
    // Walk the rules in vector order and stop at the first one that
    // reports a match at 'cursor'.
    int32_t idx = 0;
    while (idx < rules.size()) {
        TransliterationRule* candidate =
            (TransliterationRule*) rules.elementAt(idx);
        if (candidate->matches(text, start, limit, cursor,
                               data, filter)) {
            return candidate;
        }
        ++idx;
    }
    // No rule matched.
    return 0;
}
/**
* Attempt to find a matching rule at the specified point in the text.
* Unlike <code>findMatch()</code>, this method does an incremental match.
* An incremental match requires that there be no partial matches that might
* pre-empt the full match that is found. If there are partial matches,
* then null is returned. A non-null result indicates that a full match has
* been found, and that it cannot be pre-empted by a partial match
* regardless of what additional text is added to the translation buffer.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @param isPartial output parameter.  Set to TRUE if a partial match
* is encountered, in which case null is returned; FALSE otherwise.
* @param filter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
* @return the matching rule, or null if none found, or if the text buffer
* does not have enough text yet to unambiguously match a rule.
*/
TransliterationRule*
TransliterationRuleSet::findIncrementalMatch(const Replaceable& text,
                                             int32_t start,
                                             int32_t limit, int32_t cursor,
                                             const TransliterationRuleData& data,
                                             bool_t& isPartial,
                                             const UnicodeFilter* filter) const {
    // Assume no partial match until one is actually seen.
    isPartial = FALSE;
    for (int32_t idx = 0; idx < rules.size(); ++idx) {
        TransliterationRule* candidate =
            (TransliterationRule*) rules.elementAt(idx);
        int32_t degree = candidate->getMatchDegree(text, start, limit,
                                                   cursor, data, filter);
        if (degree == TransliterationRule::FULL_MATCH) {
            // First full match in rule order wins.
            return candidate;
        }
        if (degree == TransliterationRule::PARTIAL_MATCH) {
            // A partial match stops the scan: report it and return
            // no rule.
            isPartial = TRUE;
            return 0;
        }
        // Any other degree: this rule does not apply; try the next one.
    }
    return 0;
}

164
icu4c/source/i18n/rbt_set.h Normal file
View file

@ -0,0 +1,164 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef RBT_SET_H
#define RBT_SET_H
#include "uvector.h"
class Replaceable;
class TransliterationRule;
class TransliterationRuleData;
class UnicodeFilter;
class UnicodeString;
/**
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
* the transliteration in one direction from one set of characters or short
* strings to another. A <code>RuleBasedTransliterator</code> consists of up to
* two such sets, one for the forward direction, and one for the reverse.
*
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of
* finding a matching rule at a given point in the text. This is accomplished
* by the <code>findMatch()</code> method.
*
* @author Alan Liu
*/
class TransliterationRuleSet {
// NOTE(review): this class declares virtual methods, but no destructor
// (virtual or otherwise) is declared in this class body -- confirm that
// instances are never deleted through a base-class pointer.
/**
* Vector of rules, in the order added.  Private (class default access).
* Elements are retrieved with elementAt() and cast to
* TransliterationRule*.
*/
UVector rules;
/**
* Length of the longest preceding context
*/
int32_t maxContextLength;
public:
/**
* Construct a new empty rule set.
*/
TransliterationRuleSet();
/**
* Return the maximum context length.
* @return the length of the longest preceding context.
*/
virtual int32_t getMaximumContextLength() const;
/**
* Add a rule to this set. Rules are added in order, and order is
* significant.
*
* <p>Once freeze() is called, this method must not be called.
* @param adoptedRule the rule to add.  The name indicates that the set
* takes ownership of the rule.
* @param status receives an error code if the rule cannot be added.
*/
virtual void addRule(TransliterationRule* adoptedRule,
UErrorCode& status);
/**
* Free up space. Once this method is called, addRule() must NOT
* be called again.
*/
virtual void freeze();
/**
* Attempt to find a matching rule at the specified point in the text. The
* text being matched occupies a virtual buffer consisting of the contents
* of <code>result</code> concatenated to a substring of <code>text</code>.
* The substring is specified by <code>start</code> and <code>limit</code>.
* The value of <code>cursor</code> is an index into this virtual buffer,
* from 0 to the length of the buffer. In terms of the parameters,
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
* start</code>.
* @param text the untranslated text
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param result translated text
* @param cursor position at which to translate next, an offset into result.
* If greater than or equal to result.length(), represents offset start +
* cursor - result.length() into text.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @param filter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
* @return the matching rule, or null if none found.
*/
virtual TransliterationRule* findMatch(const UnicodeString& text,
int32_t start, int32_t limit,
const UnicodeString& result,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const;
/**
* Attempt to find a matching rule at the specified point in the text.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @param filter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
* @return the matching rule, or null if none found.
*/
virtual TransliterationRule* findMatch(const Replaceable& text,
int32_t start, int32_t limit,
int32_t cursor,
const TransliterationRuleData& data,
const UnicodeFilter* filter) const;
/**
* Attempt to find a matching rule at the specified point in the text.
* Unlike <code>findMatch()</code>, this method does an incremental match.
* An incremental match requires that there be no partial matches that might
* pre-empt the full match that is found. If there are partial matches,
* then null is returned. A non-null result indicates that a full match has
* been found, and that it cannot be pre-empted by a partial match
* regardless of what additional text is added to the translation buffer.
* @param text the text, both translated and untranslated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param cursor position at which to translate next, representing offset
* into text. This value must be between <code>start</code> and
* <code>limit</code>.
* @param data a dictionary mapping variables to the sets they
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
* @param isPartial output parameter.  Set to TRUE if a partial match
* is encountered (null is returned in that case); FALSE otherwise.
* @param filter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
* @return the matching rule, or null if none found, or if the text buffer
* does not have enough text yet to unambiguously match a rule.
*/
virtual TransliterationRule* findIncrementalMatch(const Replaceable& text,
int32_t start,
int32_t limit, int32_t cursor,
const TransliterationRuleData& data,
bool_t& isPartial,
const UnicodeFilter* filter) const;
};
#endif

View file

@ -0,0 +1,879 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "translit.h"
#include "cmemory.h"
#include "cstring.h"
#include "hextouni.h"
#include "locid.h"
#include "msgfmt.h"
#include "mutex.h"
#include "rbt_data.h"
#include "rbt_pars.h"
#include "rep.h"
#include "resbund.h"
#include "uhash.h"
#include "unifilt.h"
#include "unitohex.h"
/**
* Dictionary of known transliterators.  Entries are looked up by the
* hash code of the transliterator ID (see hash()); each value is a
* CacheEntry whose entry type is one of the following:
*
* <ul><li>a prototype <code>Transliterator</code> object, which is
* cloned on request
*
* <li>cached rule data, from which a RuleBasedTransliterator is
* constructed on request
*
* <li><code>RULE_BASED_PLACEHOLDER</code>, in which case the resource
* bundle file recorded in the entry is opened and the RB_RULE key is
* looked up to obtain the rule, which is parsed in the forward
* direction.
*
* <li><code>REVERSE_RULE_BASED_PLACEHOLDER</code>. Like
* <code>RULE_BASED_PLACEHOLDER</code>, except that the argument
* RuleBasedTransliterator::REVERSE is passed to the rule parser.
* </ul>
*
* NOTE(review): how the resource bundle file name is derived from the
* ID is decided where the cache is populated -- confirm there.
*/
UHashtable* Transliterator::cache = 0;
/**
* The mutex controlling access to the cache.
*/
UMTX Transliterator::cacheMutex = NULL;
/**
* When set to TRUE, the cache has been initialized. Any code must
* check this boolean before accessing the cache, and if the boolean
* is FALSE, it must call initializeCache(). We do this form of lazy
* evaluation for two reasons: (1) so we don't initialize if we don't
* have to (i.e., if no one is using Transliterator, but has included
* the code as part of a shared library), and (2) to avoid static
* initialization problems.
*/
bool_t Transliterator::cacheInitialized = FALSE;
/**
* Prefix for resource bundle key for the display name for a
* transliterator. The ID is appended to this to form the key.
* The resource bundle value should be a String.
*/
const char* Transliterator::RB_DISPLAY_NAME_PREFIX = "T:";
/**
* Resource bundle key for display name pattern.
* The resource bundle value should be a String forming a
* MessageFormat pattern, e.g.:
* "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
*/
const char* Transliterator::RB_DISPLAY_NAME_PATTERN =
"TransliteratorNamePattern";
/**
* Resource bundle key for the list of RuleBasedTransliterator IDs.
* The resource bundle value should be a String[] with each element
* being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
* to obtain the class name in which the RB_RULE key will be sought.
*/
const char* Transliterator::RB_RULE_BASED_IDS =
"RuleBasedTransliteratorIDs";
/**
* Resource bundle key for the RuleBasedTransliterator rule.
*/
const char* Transliterator::RB_RULE = "Rule";
/**
 * Constructs a transliterator with the given ID and optional filter.
 * @param theID the string identifier for this transliterator
 * @param adoptedFilter the filter, or <tt>null</tt> if no filtering is
 * to be applied.  Any character for which <tt>filter.isIn()</tt>
 * returns <tt>false</tt> will not be altered by this transliterator.
 * This object takes ownership of the filter and deletes it in its
 * destructor.
 */
Transliterator::Transliterator(const UnicodeString& theID,
                               UnicodeFilter* adoptedFilter)
    : ID(theID),
      filter(adoptedFilter)
{
}
/**
 * Destructor.  Releases the filter owned by this transliterator,
 * if there is one.
 */
Transliterator::~Transliterator()
{
    // Deleting a null pointer is a no-op, so no check is needed.
    delete filter;
}
/**
 * Copy constructor.  The new object gets the same ID and, because each
 * transliterator owns its filter, a private clone of the other
 * object's filter (or no filter if the other object has none).
 */
Transliterator::Transliterator(const Transliterator& other)
    : ID(other.ID),
      filter((other.filter == 0) ? 0 : other.filter->clone())
{
}
/**
 * Assignment operator.  Copies the ID and replaces this object's
 * filter with a clone of the other object's filter (or with no filter
 * if the other object has none).
 *
 * The filter previously owned by this object is deleted; without that
 * every assignment would leak it.  Self-assignment is a no-op, which
 * also prevents cloning from a filter that has just been deleted.
 */
Transliterator& Transliterator::operator=(const Transliterator& other) {
    if (this != &other) {
        ID = other.ID;
        // We own our filter, so release the old one before taking a
        // private copy of the other object's filter.
        delete filter;
        filter = (other.filter == 0) ?
            0 : other.filter->clone();
    }
    return *this;
}
/**
* Transliterates the segment of a string that begins at the character
* at offset <code>start</code> and extends to the character at offset
* <code>limit - 1</code>. A default implementation is provided here;
* subclasses should provide a more efficient implementation if
* possible.
* @param text the string to be transliterated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param result buffer to receive the transliterated text; previous
* contents are discarded
*/
void Transliterator::transliterate(const UnicodeString& text,
                                   int32_t start, int32_t limit,
                                   UnicodeString& result) const
{
    // Generic fallback: copy the requested segment into the caller's
    // buffer, then transliterate that buffer in place.  Subclasses
    // should supply something more efficient where they can.
    text.extractBetween(start, limit, result);
    transliterate(result);
}
/**
 * Convenience method: transliterates a whole string by delegating to
 * the segment-based overload with the range [0, text.length()).
 * @param text the string to be transliterated
 * @param result buffer to receive the transliterated text; previous
 * contents are discarded
 */
void Transliterator::transliterate(const UnicodeString& text,
                                   UnicodeString& result) const
{
    transliterate(text, 0, text.length(), result);
}
/**
 * Convenience method: transliterates an entire text buffer in place
 * by delegating to the ranged overload with [0, text.length()).
 * @param text the string to be transliterated
 */
void Transliterator::transliterate(Replaceable& text) const
{
    transliterate(text, 0, text.length());
}
/**
* Transliterates the portion of the text buffer that can be
* transliterated unambiguously after new text has been inserted,
* typically as a result of a keyboard event. The new text in
* <code>insertion</code> will be inserted into <code>text</code>
* at <code>index[LIMIT]</code>, advancing
* <code>index[LIMIT]</code> by <code>insertion.length()</code>.
* Then the transliterator will try to transliterate characters of
* <code>text</code> between <code>index[CURSOR]</code> and
* <code>index[LIMIT]</code>. Characters before
* <code>index[CURSOR]</code> will not be changed.
*
* <p>Upon return, values in <code>index[]</code> will be updated.
* <code>index[START]</code> will be advanced to the first
* character that future calls to this method will read.
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code> will
* be adjusted to delimit the range of text that future calls to
* this method may change.
*
* <p>Typical usage of this method begins with an initial call
* with <code>index[START]</code> and <code>index[LIMIT]</code>
* set to indicate the portion of <code>text</code> to be
* transliterated, and <code>index[CURSOR] == index[START]</code>.
* Thereafter, <code>index[]</code> can be used without
* modification in future calls, provided that all changes to
* <code>text</code> are made via this method.
*
* <p>This method assumes that future calls may be made that will
* insert new text into the buffer. As a result, it only performs
* unambiguous transliterations. After the last call to this
* method, there may be untransliterated text that is waiting for
* more input to resolve an ambiguity. In order to perform these
* pending transliterations, clients should call {@link
* #finishKeyboardTransliteration} after the last call to this
* method has been made.
*
* @param text the buffer holding transliterated and untransliterated text
* @param index an array of three integers.
*
* <ul><li><code>index[START]</code>: the beginning index,
* inclusive; <code>0 <= index[START] <= index[LIMIT]</code>.
*
* <li><code>index[LIMIT]</code>: the ending index, exclusive;
* <code>index[START] <= index[LIMIT] <= text.length()</code>.
* <code>insertion</code> is inserted at
* <code>index[LIMIT]</code>.
*
* <li><code>index[CURSOR]</code>: the next character to be
* considered for transliteration; <code>index[START] <=
* index[CURSOR] <= index[LIMIT]</code>. Characters before
* <code>index[CURSOR]</code> will not be changed by future calls
* to this method.</ul>
*
* @param insertion text to be inserted and possibly
* transliterated into the translation buffer at
* <code>index[LIMIT]</code>. If <code>null</code> then no text
* is inserted.
* @see #START
* @see #LIMIT
* @see #CURSOR
* @see #handleKeyboardTransliterate
* @exception IllegalArgumentException if <code>index[]</code>
* is invalid
*/
void Transliterator::keyboardTransliterate(Replaceable& text,
                                           int32_t index[3],
                                           const UnicodeString& insertion,
                                           UErrorCode &status) const
{
    // The internal worker takes the insertion by pointer so that it
    // can also represent "nothing to insert".
    _keyboardTransliterate(text, index, &insertion, status);
}
/**
* Transliterates the portion of the text buffer that can be
* transliterated unambiguously after a new character has been
* inserted, typically as a result of a keyboard event. This is a
* convenience method; see {@link
* #keyboardTransliterate(Replaceable, int[], String)} for details.
* @param text the buffer holding transliterated and
* untransliterated text
* @param index an array of three integers. See {@link
* #keyboardTransliterate(Replaceable, int[], String)}.
* @param insertion text to be inserted and possibly
* transliterated into the translation buffer at
* <code>index[LIMIT]</code>.
* @see #keyboardTransliterate(Replaceable, int[], String)
*/
void Transliterator::keyboardTransliterate(Replaceable& text,
                                           int32_t index[3],
                                           UChar insertion,
                                           UErrorCode& status) const
{
    // Wrap the single character in a string so the common worker,
    // which takes a string pointer, can be reused.
    UnicodeString oneChar(insertion);
    _keyboardTransliterate(text, index, &oneChar, status);
}
/**
* Transliterates the portion of the text buffer that can be
* transliterated unambiguously. This is a convenience method; see
* {@link #keyboardTransliterate(Replaceable, int[], String)} for
* details.
* @param text the buffer holding transliterated and
* untransliterated text
* @param index an array of three integers. See {@link
* #keyboardTransliterate(Replaceable, int[], String)}.
* @see #keyboardTransliterate(Replaceable, int[], String)
*/
void Transliterator::keyboardTransliterate(Replaceable& text,
                                           int32_t index[3],
                                           UErrorCode& status) const
{
    // A null insertion pointer tells the worker there is no new text.
    _keyboardTransliterate(text, index, 0, status);
}
/**
 * Finishes any pending transliterations that were waiting for more
 * characters.  Clients should call this method as the last call after
 * a sequence of one or more calls to
 * <code>keyboardTransliterate()</code>.
 * @param text the buffer holding transliterated and
 * untransliterated text.
 * @param index the array of indices previously passed to {@link
 * #keyboardTransliterate}
 */
void Transliterator::finishKeyboardTransliteration(Replaceable& text,
                                                   int32_t index[3]) const
{
    // Run an ordinary transliteration over the range from
    // index[START] to index[LIMIT].
    transliterate(text, index[START], index[LIMIT]);
}
/**
 * Internal worker shared by the keyboardTransliterate() overloads.
 *
 * If 'insertion' is non-null, its text is inserted into 'text' at
 * index[LIMIT] and index[LIMIT] is advanced by its length.  The actual
 * transliteration is then delegated to the framework method
 * handleKeyboardTransliterate().  Finally index[START] is set to
 * index[CURSOR] minus the maximum context length, but never to less
 * than the value index[START] had on entry.
 *
 * If 'status' already indicates failure nothing is done.  If the
 * indices are inconsistent, 'status' is set to
 * U_ILLEGAL_ARGUMENT_ERROR and nothing else is changed.
 */
void Transliterator::_keyboardTransliterate(Replaceable& text,
                                            int32_t index[3],
                                            const UnicodeString* insertion,
                                            UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return;
    }

    // Required: 0 <= START <= CURSOR <= LIMIT <= text.length()
    if (!(index[START] >= 0 &&
          index[LIMIT] <= text.length() &&
          index[CURSOR] >= index[START] &&
          index[CURSOR] <= index[LIMIT])) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Remember where we began; START must never move backwards.
    int32_t originalStart = index[START];

    if (insertion != 0) {
        // Replacing the empty range [LIMIT, LIMIT) is an insertion.
        text.handleReplaceBetween(index[LIMIT], index[LIMIT], *insertion);
        index[LIMIT] += insertion->length();
    }

    handleKeyboardTransliterate(text, index);

    // Keep only as much already-processed text before the cursor as
    // the longest preceding context may need.
    index[START] = icu_max(index[CURSOR] - getMaximumContextLength(),
                           originalStart);
}
/**
* Returns the length of the longest context required by this transliterator.
* This is <em>preceding</em> context. The default implementation supplied
* by <code>Transliterator</code> returns zero; subclasses
* that use preceding context should override this method to return the
* correct value. For example, if a transliterator translates "ddd" (where
* d is any digit) to "555" when preceded by "(ddd)", then the preceding
* context length is 5, the length of "(ddd)".
*
* @return The maximum number of preceding context characters this
* transliterator needs to examine
*/
int32_t Transliterator::getMaximumContextLength() const
{
    // Base-class default: no preceding context is examined.
    return 0;
}
/**
 * Returns the programmatic identifier of this transliterator.  The
 * returned reference refers to this object's own ID member.
 * @see #registerInstance
 * @see #getAvailableIDs
 */
const UnicodeString& Transliterator::getID() const
{
    return ID;
}
/**
 * Returns a name for this transliterator that is appropriate for
 * display to the user in the default locale.  Delegates to the
 * locale-taking overload with Locale::getDefault().
 * @param result buffer that receives the display name
 * @return a reference to <code>result</code>
 */
UnicodeString& Transliterator::getDisplayName(UnicodeString& result) const
{
    return getDisplayName(Locale::getDefault(),
                          result);
}
/**
* Returns a name for this transliterator that is appropriate for
* display to the user in the given locale. This name is taken
* from the locale resource data in the standard manner of the
* <code>java.text</code> package.
*
* <p>If no localized names exist in the system resource bundles,
* a name is synthesized using a localized
* <code>MessageFormat</code> pattern from the resource data. The
* arguments to this pattern are an integer followed by one or two
* strings. The integer is the number of strings, either 1 or 2.
* The strings are formed by splitting the ID for this
* transliterator at the first '-'. If there is no '-', then the
* entire ID forms the only string.
* @param inLocale the Locale in which the display name should be
* localized.
* @see java.text.MessageFormat
*/
UnicodeString& Transliterator::getDisplayName(const Locale& inLocale,
                                              UnicodeString& result) const
{
    UErrorCode status = U_ZERO_ERROR;
    ResourceBundle bundle(Locale::getDataDirectory(), inLocale, status);
    // 'status' is deliberately not inspected here; it is checked after
    // the lookup below.

    // Step 1: look for an explicit localized name stored under the
    // key formed from the display-name prefix followed by the ID.
    UnicodeString key(RB_DISPLAY_NAME_PREFIX);
    key.append(ID);
    // The bundle returns an alias pointer; it must NOT be deleted.
    const UnicodeString* resString = bundle.getString(key, status);
    if (U_SUCCESS(status) && resString != 0) {
        result = *resString;
        return result;
    }

    // Step 2: no explicit name (the usual case).  Fetch the
    // MessageFormat pattern from the locale data and synthesize a
    // name from the ID.
    status = U_ZERO_ERROR;
    resString = bundle.getString(RB_DISPLAY_NAME_PATTERN, status);
    if (U_SUCCESS(status) && resString != 0) {
        MessageFormat msg(*resString, inLocale, status);
        // Again, 'status' is checked only after formatting.

        // args[0] is the number of name parts that follow (1 or 2);
        // args[1] and args[2] are the parts themselves.
        Formattable args[3];
        int32_t nargs;
        int32_t dash = ID.indexOf((UChar)'-');
        if (dash >= 0) {
            // Split the ID at its first '-'.
            UnicodeString left, right;
            ID.extractBetween(0, dash, left);
            ID.extractBetween(dash+1, ID.length(), right);
            args[0].setLong(2);
            args[1].setString(left);
            args[2].setString(right);
            nargs = 3;
        } else {
            // No '-': the whole ID is the only part.
            args[0].setLong(1);
            args[1].setString(ID);
            nargs = 2;
        }

        FieldPosition pos; // ignored by msg
        msg.format(args, nargs, result, pos, status);
        if (U_SUCCESS(status)) {
            return result;
        }
    }

    // Step 3: last resort.  We should only get here if something is
    // wrong with the build or the RB_DISPLAY_NAME_PATTERN has been
    // deleted from the root RB_LOCALE_ELEMENTS resource.
    result = ID;
    return result;
}
/**
 * Returns the filter used by this transliterator, or <tt>null</tt>
 * if this transliterator uses no filter.  The filter remains owned by
 * this object: the caller must not delete the result.
 */
const UnicodeFilter* Transliterator::getFilter() const
{
    return filter;
}
/**
 * Changes the filter used by this transliterator.  The current filter,
 * if any, is deleted and this object takes ownership of the new one.
 * If the filter is set to <tt>null</tt> then no filtering will occur.
 *
 * <p>Callers must take care if a transliterator is in use by
 * multiple threads.  The filter should not be changed by one
 * thread while another thread may be transliterating.
 */
void Transliterator::adoptFilter(UnicodeFilter* filterToAdopt)
{
    delete filter;          // release the filter we currently own
    filter = filterToAdopt; // and take ownership of the replacement
}
/**
* Returns this transliterator's inverse. See the class
* documentation for details. This implementation simply inverts
* the two entities in the ID and attempts to retrieve the
* resulting transliterator. That is, if <code>getID()</code>
* returns "A-B", then this method will return the result of
* <code>getInstance("B-A")</code>, or <code>null</code> if that
* call fails.
*
* <p>This method does not take filtering into account. The
* returned transliterator will have no filter.
*
* <p>Subclasses with knowledge of their inverse may wish to
* override this method.
*
* @return a transliterator that is an inverse, not necessarily
* exact, of this transliterator, or <code>null</code> if no such
* transliterator is registered.
* @see #registerInstance
*/
Transliterator* Transliterator::createInverse() const
{
    int32_t dash = ID.indexOf((UChar)'-');
    if (dash < 0) {
        // An ID without '-' has no two parts to swap.
        return 0;
    }

    // For an ID of the form "A-B", build "B-A": start with the part
    // after the dash, then append '-' and the part before it.
    UnicodeString swappedID, source;
    ID.extractBetween(dash+1, ID.length(), swappedID);
    ID.extractBetween(0, dash, source);
    swappedID.append((UChar)'-').append(source);
    return _createInstance(swappedID);
}
/**
 * Returns a <code>Transliterator</code> object given its ID.
 * The ID must be either a system transliterator ID or an ID registered
 * using <code>registerInstance()</code>.
 *
 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
 * @return whatever <code>_createInstance()</code> produces for the ID:
 * a <code>Transliterator</code> object, or null if the ID cannot be
 * used.
 * @see #registerInstance
 * @see #getAvailableIDs
 * @see #getID
 */
Transliterator* Transliterator::createInstance(const UnicodeString& ID)
{
    return _createInstance(ID);
}
/**
* This is the path to the subdirectory within the locale data
* directory that contains the rule-based transliterator resource
* bundle files. It starts out null and is constructed dynamically
* the first time Transliterator::getDataDirectory() is called.
*/
char* Transliterator::DATA_DIR = 0;
/**
* This is the name of a subdirectory within the locale data directory
* that contains the rule-based transliterator resource bundle files.
*/
const char* Transliterator::RESOURCE_SUB_DIR = "translit";
/**
 * Returns the directory in which the transliterator resource bundle
 * files are located.  This is a subdirectory, named RESOURCE_SUB_DIR,
 * under Locale::getDataDirectory().  It ends in a path separator.
 *
 * The path is built once, on first use, and cached in DATA_DIR.
 *
 * @return the cached path, or null (0) if the locale data directory
 * is null or empty (so no separator can be inferred from it) or if
 * memory for the path cannot be allocated.  In those cases nothing is
 * cached, so a later call will try again.
 */
const char* Transliterator::getDataDirectory() {
    if (DATA_DIR == 0) {
        Mutex lock; // Okay to use the global mutex here
        // Check again now that we hold the lock: another thread may
        // have built the path in the meantime.
        if (DATA_DIR == 0) {
            /* Construct the transliterator data directory path.  This
             * is a subdirectory of the locale data directory.  For
             * now, we get the separator from the data directory
             * assuming a path separator of one character.  In the
             * future we might add API to get the separator.
             *
             * TODO: Fix this to get the path separator in some better
             * way.  File an rfe for this.
             */
            const char* data = Locale::getDataDirectory();
            if (data == 0) {
                return 0;
            }
            int32_t len = icu_strlen(data);
            if (len == 0) {
                // data[len-1] would read before the start of the
                // string; there is no separator to copy.
                return 0;
            }
            char sep[2];
            sep[0] = data[len-1];
            sep[1] = 0;
            // Room for: data + RESOURCE_SUB_DIR + separator + NUL
            char* dir = (char*) icu_malloc(
                len + icu_strlen(RESOURCE_SUB_DIR) + 2);
            if (dir == 0) {
                // Out of memory: report failure instead of writing
                // through a null pointer.
                return 0;
            }
            icu_strcpy(dir, data);
            icu_strcat(dir, RESOURCE_SUB_DIR);
            icu_strcat(dir, sep);
            // Store the pointer only after the string is complete, so
            // the unlocked check above never sees a half-built path.
            DATA_DIR = dir;
        }
    }
    return DATA_DIR;
}
inline int32_t Transliterator::hash(const UnicodeString& str)
{
    // Clear the sign bit so the resulting value is never negative.
    const int32_t nonNegativeMask = 0x7FFFFFFF;
    return str.hashCode() & nonNegativeMask;
}
/**
 * Internal worker for createInstance() and createInverse().  Returns a
 * newly created transliterator for the given ID, or null (0) if the ID
 * is not in the cache or its rules cannot be loaded.
 *
 * The cache entry found for the ID determines what happens:
 * <ul><li>cached rule data: a RuleBasedTransliterator is built from it
 * <li>a prototype: a clone of the prototype is returned
 * <li>a (reverse) rule-based placeholder: the rules are read from the
 * entry's resource bundle file and parsed, the entry is updated to
 * hold the parsed data, and a RuleBasedTransliterator is built from it
 * </ul>
 * If no rule data can be obtained, the ID is removed from the cache.
 */
Transliterator* Transliterator::_createInstance(const UnicodeString& ID) {
    UErrorCode status = U_ZERO_ERROR;
    if (!cacheInitialized) {
        initializeCache();
    }
    // Held until this function returns; _unregister() below relies on
    // the caller owning this mutex.
    Mutex lock(&cacheMutex);
    CacheEntry* entry = (CacheEntry*) uhash_get(cache, hash(ID));
    TransliterationRuleData* data = 0;
    if (entry == 0) {
        return 0;
    }
    if (entry->entryType == CacheEntry::RBT_DATA) {
        data = entry->u.data;
        // Fall through to construct transliterator from cached Data object.
    } else if (entry->entryType == CacheEntry::PROTOTYPE) {
        return entry->u.prototype->clone();
    } else {
        // At this point entry type must be either RULE_BASED_PLACEHOLDER
        // or REVERSE_RULE_BASED_PLACEHOLDER.
        bool_t isReverse =
            (entry->entryType ==
             CacheEntry::REVERSE_RULE_BASED_PLACEHOLDER);
        // We use the file name, taken from another resource bundle
        // 2-d array at static init time, as a locale language.  We're
        // just using the locale mechanism to map through to a file
        // name; this in no way represents an actual locale.
        Locale fakeLocale(entry->rbFile);
        ResourceBundle bundle(Transliterator::getDataDirectory(),
                              fakeLocale, status);
        // Call RBT to parse the rules from the resource bundle.
        // We don't own the rules - 'rules' is an alias pointer to
        // a string in the RB cache.
        const UnicodeString* rules = bundle.getString(RB_RULE, status);
        // If rules == 0 at this point, or if the status indicates a
        // failure, then we don't have any rules -- there is probably
        // an installation error.  The list in the root locale should
        // correspond to all the installed transliterators; if it
        // lists something that's not installed, we'll get a null
        // pointer here.
        if (rules != 0 && U_SUCCESS(status)) {
            // The direction constants are members of
            // RuleBasedTransliterator and must be named with the
            // scope operator '::' (the '.' form is Java syntax and
            // does not compile as C++).
            data = TransliterationRuleParser::parse(*rules, isReverse
                                ? RuleBasedTransliterator::REVERSE
                                : RuleBasedTransliterator::FORWARD);
            // Double check to see if someone has modified the entry
            // since we last looked at it.
            if (entry->entryType != CacheEntry::RBT_DATA) {
                entry->entryType = CacheEntry::RBT_DATA;
                entry->u.data = data;
            } else {
                // Oops!  Another thread has updated this cache entry
                // already to point to a data object.  Discard the
                // one we just created and use the one in the cache
                // instead.
                delete data;
                data = entry->u.data;
            }
        }
    }
    if (data != 0) {
        return new RuleBasedTransliterator(ID, data);
    } else {
        // We have a failure of some kind.  Remove the ID from the
        // cache so we don't keep trying.  NOTE: This will throw off
        // anyone who is, at the moment, trying to iterate over the
        // available IDs.  That's acceptable since we should never
        // really get here except under installation, configuration,
        // or unrecoverable run time memory failures.
        _unregister(ID);
        return 0;
    }
}
/**
 * Registers a prototype transliterator with the system.  Once
 * registered, a request for a transliterator whose ID equals
 * <tt>adoptedPrototype->getID()</tt> is answered with
 * <tt>adoptedPrototype->clone()</tt>, so the object must implement
 * <tt>clone()</tt>.
 *
 * <p>This is the public entry point: it makes sure the cache has
 * been built, takes the cache mutex, and delegates the actual work
 * to _registerInstance().
 *
 * @param adoptedPrototype an instance of a subclass of
 * <code>Transliterator</code> that defines <tt>clone()</tt>; it is
 * handed over to the cache
 * @param status error code, passed through to _registerInstance()
 * @see #unregister
 */
void Transliterator::registerInstance(Transliterator* adoptedPrototype,
                                      UErrorCode &status) {
    // The cache is built lazily; initializeCache() takes the mutex
    // itself and re-checks the flag, so it must run before we lock.
    if (!cacheInitialized) {
        initializeCache();
    }

    // Hold the cache mutex for the duration of the update.
    Mutex lock(&cacheMutex);
    _registerInstance(adoptedPrototype, status);
}
/**
 * This internal method registers a prototype instance in the cache.
 * The CALLER MUST MUTEX using cacheMutex before calling this method.
 *
 * <p>Ownership of <tt>adoptedPrototype</tt> passes to this method
 * unconditionally: if <tt>status</tt> already indicates a failure
 * the object is deleted rather than leaked.  A null prototype is
 * rejected with U_ILLEGAL_ARGUMENT_ERROR.
 *
 * <p>If an entry already exists for the ID, it is converted into a
 * prototype entry (a previously held prototype is deleted by
 * CacheEntry::adoptPrototype) and the ID list is updated so that it
 * holds exactly one pointer for this ID, referring to the string
 * owned by the new prototype.
 *
 * @param adoptedPrototype the prototype to store; may not be null
 * @param status in/out error code; nothing is registered if it
 * indicates a failure on entry
 */
void Transliterator::_registerInstance(Transliterator* adoptedPrototype,
                                       UErrorCode &status) {
    if (adoptedPrototype == 0) {
        // Nothing to adopt; report it unless an earlier error is
        // already pending.
        if (U_SUCCESS(status)) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return;
    }
    if (U_FAILURE(status)) {
        // We own the object even when we cannot register it.
        delete adoptedPrototype;
        return;
    }

    // The string reference that getID returns is to the ID data
    // member of Transliterator.  As long as the Transliterator
    // object exists, this reference is valid, and we can take its
    // address and store it in cacheIDs.  The ID entry must be removed
    // from cacheIDs before the prototype that owns the string is
    // deleted.
    const UnicodeString& id = adoptedPrototype->getID();
    int32_t hashCode = hash(id);

    CacheEntry* entry = (CacheEntry*) uhash_get(cache, hashCode);
    if (entry == 0) {
        entry = new CacheEntry();
    } else if (entry->entryType == CacheEntry::PROTOTYPE &&
               entry->u.prototype == adoptedPrototype) {
        // This very object is already registered; adopting it again
        // would delete it and leave dangling pointers behind.
        return;
    }

    // Drop any ID pointer previously listed for this ID.  This must
    // happen BEFORE adoptPrototype(), which may delete the old
    // prototype whose ID string that pointer refers to.  The removal
    // relies on cacheIDs matching by string value (compareIDs), the
    // same way _unregister() uses it.
    cacheIDs.removeElement((void*) &id);

    entry->adoptPrototype(adoptedPrototype);
    cacheIDs.addElement((void*) &id);
    uhash_putKey(cache, hashCode, entry, &status);
}
/**
 * Removes the transliterator registered under the given ID, whether
 * it is a system transliterator or one registered by the user.
 *
 * <p>Public entry point: makes sure the cache exists, takes the
 * cache mutex, and hands off to _unregister().
 *
 * @param ID the ID of the transliterator to remove
 * @see #registerInstance
 */
void Transliterator::unregister(const UnicodeString& ID) {
    // Lazy construction of the cache; initializeCache() does its own
    // locking, so call it before acquiring the mutex here.
    if (!cacheInitialized) {
        initializeCache();
    }

    Mutex lock(&cacheMutex);
    _unregister(ID);
}
/**
* Unregisters a transliterator or class. Internal method.
* Prerequisites: The cache must be initialized, and the
* caller must own the cacheMutex.
*/
void Transliterator::_unregister(const UnicodeString& ID) {
cacheIDs.removeElement((void*) &ID);
int32_t hc = hash(ID);
CacheEntry* entry = (CacheEntry*) uhash_get(cache, hc);
if (entry != 0) {
UErrorCode status = U_ZERO_ERROR;
uhash_remove(cache, hc, &status);
delete entry;
}
}
/**
 * Vector of registered IDs.  Each element is a pointer to a
 * UnicodeString that the vector does NOT own: either the ID string
 * held by a registered prototype (see _registerInstance) or a string
 * obtained from the resource bundle (see initializeCache).
 * initializeCache() installs compareIDs as the comparer.
 */
UVector Transliterator::cacheIDs;
/**
 * Returns how many IDs are currently registered with the system.
 * The IDs themselves are obtained by calling getAvailableID(i) for
 * each i in the range 0 .. countAvailableIDs() - 1.
 *
 * @return the number of entries in the ID list
 */
int32_t Transliterator::countAvailableIDs() {
    // Make sure the ID list has been populated before reading it.
    if (!cacheInitialized) {
        initializeCache();
    }

    Mutex lock(&cacheMutex);
    int32_t count = cacheIDs.size();
    return count;
}
/**
 * Return the index-th available ID.  index must be between 0
 * and countAvailableIDs() - 1, inclusive.  If index is out of
 * range, the result of getAvailableID(0) is returned.  If no IDs
 * are registered at all, a reference to an empty string is returned.
 *
 * @param index zero-based position in the list of registered IDs
 * @return a reference to the ID string; the string is not owned by
 * the caller
 */
const UnicodeString& Transliterator::getAvailableID(int32_t index) {
    // The cache must exist BEFORE the index is validated: until it
    // has been built the ID list is empty, and validating against it
    // would clamp every index to 0 on the first call.
    if (!cacheInitialized) {
        initializeCache();
    }

    // Validate and read under the mutex so that the size used for the
    // range check is the size of the vector we actually index.
    Mutex lock(&cacheMutex);
    int32_t count = cacheIDs.size();
    if (count <= 0) {
        // Nothing registered; there is no element 0 to fall back to.
        static const UnicodeString NO_ID;
        return NO_ID;
    }
    if (index < 0 || index >= count) {
        index = 0;
    }
    return *(const UnicodeString*) cacheIDs[index];
}
/**
 * Equality callback installed on the cacheIDs vector.  Both
 * arguments are untyped pointers to UnicodeString objects; the
 * strings they point to are compared with operator==.
 *
 * @param a pointer to the first UnicodeString
 * @param b pointer to the second UnicodeString
 * @return the result of comparing the two strings for equality
 */
bool_t Transliterator::compareIDs(void* a, void* b) {
    const UnicodeString& lhs = *(const UnicodeString*) a;
    const UnicodeString& rhs = *(const UnicodeString*) b;
    return lhs == rhs;
}
void Transliterator::initializeCache() {
// Lock first, check init boolean second
Mutex lock(&cacheMutex);
if (cacheInitialized) {
return;
}
UErrorCode status = U_ZERO_ERROR;
// Before looking for the resource, construct our cache.
// That way if the resource is absent, we will at least
// have a valid cache object.
cache = uhash_open(uhash_hashUString, &status);
cacheIDs.setComparer(compareIDs);
/* The following code is assuming an n x 3 table
* that looks like this:
*
* RuleBasedTransliteratorIDs {
* { "Latin-Arabic", "Arabic-Latin", "larabic" }
* { "KeyboardEscape-Latin1", "", "keyescl1" }
* ...
* }
*/
ResourceBundle bundle(Locale::getDataDirectory(),
Locale::getDefault(),
status);
int32_t rows, cols;
const UnicodeString** ruleBasedIDs =
bundle.get2dArray(RB_RULE_BASED_IDS, rows, cols, status);
if (U_SUCCESS(status) && (cols == 3)) {
for (int32_t i=0; i<rows; ++i) {
const UnicodeString* row = ruleBasedIDs[i];
for (int32_t col=0; col<2; ++col) {
if (row[col].length() > 0) {
CacheEntry* entry = new CacheEntry();
entry->entryType = (col == 0) ?
CacheEntry::RULE_BASED_PLACEHOLDER :
CacheEntry::REVERSE_RULE_BASED_PLACEHOLDER;
entry->rbFile = row[2];
uhash_putKey(cache, hash(row[col]), entry, &status);
/* It's okay to take the address of the string
* from the resource bundle under the assumption
* that the RB is caching these, and that they
* stay around forever. If this changes, what we
* need to do is change the id vector so that it
* owns its strings and create a copy here.
*/
cacheIDs.addElement((void*) &row[col]);
}
}
}
}
// Manually add prototypes that the system knows about to the
// cache. This is how new non-rule-based transliterators are
// added to the system.
status = U_ZERO_ERROR; // Reset status for following calls
_registerInstance(new HexToUnicodeTransliterator(), status);
_registerInstance(new UnicodeToHexTransliterator(), status);
cacheInitialized = TRUE;
}
/**
 * Creates an uninitialized entry: the type is NONE and the
 * prototype pointer in the union is null.
 */
Transliterator::CacheEntry::CacheEntry()
    : entryType(NONE) {
    u.prototype = 0;
}
/**
 * Destroys the entry.  Only a PROTOTYPE entry deletes what its union
 * points to; for every other entry type the union is left alone.
 */
Transliterator::CacheEntry::~CacheEntry() {
    switch (entryType) {
    case PROTOTYPE:
        delete u.prototype;
        break;
    default:
        // The union is not deleted for any other entry type.
        break;
    }
}
/**
 * Turns this entry into a PROTOTYPE entry that owns <tt>adopted</tt>.
 * If the entry already owns a different prototype, that one is
 * deleted first.  Adopting the prototype the entry already owns is a
 * no-op rather than a delete followed by storing a dangling pointer.
 *
 * @param adopted the prototype to take ownership of
 */
void Transliterator::CacheEntry::adoptPrototype(Transliterator* adopted) {
    // Guard against self-adoption: deleting u.prototype when it is
    // the same object as 'adopted' would leave the entry pointing at
    // freed memory.
    if (entryType == PROTOTYPE && u.prototype != adopted) {
        delete u.prototype;
    }
    entryType = PROTOTYPE;
    u.prototype = adopted;
}

View file

@ -0,0 +1,860 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef TRANSLIT_H
#define TRANSLIT_H
#include "unistr.h"
#include "umutex.h"
#include "uvector.h"
class Replaceable;
class UnicodeFilter;
class TransliterationRuleData;
struct UHashtable;
/**
* <code>Transliterator</code> is an abstract class that
* transliterates text from one format to another. The most common
* kind of transliterator is a script, or alphabet, transliterator.
* For example, a Russian to Latin transliterator changes Russian text
* written in Cyrillic characters to phonetically equivalent Latin
* characters. It does not <em>translate</em> Russian to English!
* Transliteration, unlike translation, operates on characters, without
* reference to the meanings of words and sentences.
*
* <p>Although script conversion is its most common use, a
* transliterator can actually perform a more general class of tasks.
* In fact, <code>Transliterator</code> defines a very general API
* which specifies only that a segment of the input text is replaced
* by new text. The particulars of this conversion are determined
* entirely by subclasses of <code>Transliterator</code>.
*
* <p><b>Transliterators are stateless</b>
*
* <p><code>Transliterator</code> objects are <em>stateless</em>; they
* retain no information between calls to
* <code>transliterate()</code>. (However, this does <em>not</em>
* mean that threads may share transliterators without synchronizing
* them. Transliterators are not immutable, so they must be
* synchronized when shared between threads.) This might seem to
* limit the complexity of the transliteration operation. In
* practice, subclasses perform complex transliterations by delaying
* the replacement of text until it is known that no other
* replacements are possible. In other words, although the
* <code>Transliterator</code> objects are stateless, the source text
* itself embodies all the needed information, and delayed operation
* allows arbitrary complexity.
*
* <p><b>Batch transliteration</b>
*
* <p>The simplest way to perform transliteration is all at once, on a
* string of existing text. This is referred to as <em>batch</em>
* transliteration. For example, given a string <code>input</code>
* and a transliterator <code>t</code>, the call
*
* <blockquote><code>String result = t.transliterate(input);
* </code></blockquote>
*
* will transliterate it and return the result. Other methods allow
* the client to specify a substring to be transliterated and to use
* {@link Replaceable} objects instead of strings, in order to
* preserve out-of-band information (such as text styles).
*
* <p><b>Keyboard transliteration</b>
*
* <p>Somewhat more involved is <em>keyboard</em>, or incremental
* transliteration. This is the transliteration of text that is
* arriving from some source (typically the user's keyboard) one
* character at a time, or in some other piecemeal fashion.
*
* <p>In keyboard transliteration, a <code>Replaceable</code> buffer
* stores the text. As text is inserted, as much as possible is
* transliterated on the fly. This means a GUI that displays the
* contents of the buffer may show text being modified as each new
* character arrives.
*
* <p>Consider the simple <code>RuleBasedTransliterator</code>:
*
* <blockquote><code>
* th&gt;{theta}<br>
* t&gt;{tau}
* </code></blockquote>
*
* When the user types 't', nothing will happen, since the
* transliterator is waiting to see if the next character is 'h'. To
* remedy this, we introduce the notion of a cursor, marked by a '|'
* in the output string:
*
* <blockquote><code>
* t&gt;|{tau}<br>
* {tau}h&gt;{theta}
* </code></blockquote>
*
* Now when the user types 't', tau appears, and if the next character
* is 'h', the tau changes to a theta. This is accomplished by
* maintaining a cursor position (independent of the insertion point,
* and invisible in the GUI) across calls to
* <code>keyboardTransliterate()</code>. Typically, the cursor will
* be coincident with the insertion point, but in a case like the one
* above, it will precede the insertion point.
*
* <p>Keyboard transliteration methods maintain a set of three indices
* that are updated with each call to
* <code>keyboardTransliterate()</code>, including the cursor, start,
* and limit. Since these indices are changed by the method, they are
* passed in an <code>int[]</code> array. The <code>START</code> index
* marks the beginning of the substring that the transliterator will
* look at. It is advanced as text becomes committed (but it is not
* the committed index; that's the <code>CURSOR</code>). The
* <code>CURSOR</code> index, described above, marks the point at
* which the transliterator last stopped, either because it reached
* the end, or because it required more characters to disambiguate
* between possible inputs. The <code>CURSOR</code> can also be
* explicitly set by rules in a <code>RuleBasedTransliterator</code>.
* Any characters before the <code>CURSOR</code> index are frozen;
* future keyboard transliteration calls within this input sequence
* will not change them. New text is inserted at the
* <code>LIMIT</code> index, which marks the end of the substring that
* the transliterator looks at.
*
* <p>Because keyboard transliteration assumes that more characters
* are to arrive, it is conservative in its operation. It only
* transliterates when it can do so unambiguously. Otherwise it waits
* for more characters to arrive. When the client code knows that no
* more characters are forthcoming, perhaps because the user has
* performed some input termination operation, then it should call
* <code>finishKeyboardTransliteration()</code> to complete any
* pending transliterations.
*
* <p><b>Inverses</b>
*
* <p>Pairs of transliterators may be inverses of one another. For
* example, if transliterator <b>A</b> transliterates characters by
* incrementing their Unicode value (so "abc" -> "def"), and
* transliterator <b>B</b> decrements character values, then <b>A</b>
* is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
* with <b>B</b> in a compound transliterator, the result is the
* identity transliterator, that is, a transliterator that does not
* change its input text.
*
* The <code>Transliterator</code> method <code>getInverse()</code>
* returns a transliterator's inverse, if one exists, or
* <code>null</code> otherwise. However, the result of
* <code>getInverse()</code> usually will <em>not</em> be a true
* mathematical inverse. This is because true inverse transliterators
* are difficult to formulate. For example, consider two
* transliterators: <b>AB</b>, which transliterates the character 'A'
* to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
* seem that these are exact inverses, since
*
* <blockquote>"A" x <b>AB</b> -> "B"<br>
* "B" x <b>BA</b> -> "A"</blockquote>
*
* where 'x' represents transliteration. However,
*
* <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
* "BBCD" x <b>BA</b> -> "AACD"</blockquote>
*
* so <b>AB</b> composed with <b>BA</b> is not the
* identity. Nonetheless, <b>BA</b> may be usefully considered to be
* <b>AB</b>'s inverse, and it is on this basis that
* <b>AB</b><code>.getInverse()</code> could legitimately return
* <b>BA</b>.
*
* <p><b>IDs and display names</b>
*
* <p>A transliterator is designated by a short identifier string or
* <em>ID</em>. IDs follow the format <em>source-destination</em>,
* where <em>source</em> describes the entity being replaced, and
* <em>destination</em> describes the entity replacing
* <em>source</em>. The entities may be the names of scripts,
* particular sequences of characters, or whatever else it is that the
* transliterator converts to or from. For example, a transliterator
* from Russian to Latin might be named "Russian-Latin". A
* transliterator from keyboard escape sequences to Latin-1 characters
* might be named "KeyboardEscape-Latin1". By convention, system
* entity names are in English, with the initial letters of words
* capitalized; user entity names may follow any format so long as
* they do not contain dashes.
*
* <p>In addition to programmatic IDs, transliterator objects have
* display names for presentation in user interfaces, returned by
* {@link #getDisplayName}.
*
* <p><b>Factory methods and registration</b>
*
* <p>In general, client code should use the factory method
* <code>getInstance()</code> to obtain an instance of a
* transliterator given its ID. Valid IDs may be enumerated using
* <code>getAvailableIDs()</code>. Since transliterators are mutable,
* multiple calls to <code>getInstance()</code> with the same ID will
* return distinct objects.
*
* <p>In addition to the system transliterators registered at startup,
* user transliterators may be registered by calling
* <code>registerInstance()</code> at run time. A registered instance
* acts as a template; future calls to <tt>getInstance()</tt> with the ID
* of the registered object return clones of that object. Thus any
* object passed to <tt>registerInstance()</tt> must implement
* <tt>clone()</tt> properly. To register a transliterator subclass
* without instantiating it (until it is needed), users may call
* <code>registerClass()</code>. In this case, the objects are
* instantiated by invoking the zero-argument public constructor of
* the class.
*
* <p><b>Subclassing</b>
*
* <p>Subclasses must implement the abstract
* <code>transliterate()</code> method. They should also override the
* <code>transliterate()</code> method taking a <code>String</code>
* and <code>StringBuffer</code> if the performance of these methods
* can be improved over the performance obtained by the default
* implementations in this class. Subclasses must also implement
* <code>handleKeyboardTransliterate()</code>.
*
* @author Alan Liu
*/
class U_I18N_API Transliterator {
public:
enum {
/**
* In the <code>keyboardTransliterate()</code>
* <code>index[]</code> array, the beginning index, inclusive
* @see #keyboardTransliterate
*/
START = 0,
/**
* In the <code>keyboardTransliterate()</code>
* <code>index[]</code> array, the ending index, exclusive
* @see #keyboardTransliterate
*/
LIMIT = 1,
/**
* In the <code>keyboardTransliterate()</code>
* <code>index[]</code> array, the next character to be considered
* for transliteration
* @see #keyboardTransliterate
*/
CURSOR = 2
};
private:
/**
* Programmatic name, e.g., "Latin-Arabic".
*/
UnicodeString ID;
/**
* This transliterator's filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
*/
UnicodeFilter* filter;
/**
* Dictionary of known transliterators. Keys are <code>String</code>
* names, values are one of the following:
*
* <ul><li><code>Transliterator</code> objects
*
* <li><code>Class</code> objects. Such objects must represent
* subclasses of <code>Transliterator</code>, and must satisfy the
* constraints described in <code>registerClass()</code>
*
* <li><code>RULE_BASED_PLACEHOLDER</code>, in which case the ID
* will have its first '-' removed and be appended to
* RB_RULE_BASED_PREFIX to form a resource bundle name from which
* the RB_RULE key is looked up to obtain the rule.
*
* <li><code>REVERSE_RULE_BASED_PLACEHOLDER</code>. Like
* <code>RULE_BASED_PLACEHOLDER</code>, except the entity names in
* the ID are reversed, and the argument
* RuleBasedTransliterator.REVERSE is passed to the
* RuleBasedTransliterator constructor.
* </ul>
*/
static UHashtable* cache;
/**
* The mutex controlling access to the cache.
*/
static UMTX cacheMutex;
/**
* When set to TRUE, the cache has been initialized. Any code must
* check this boolean before accessing the cache, and if the boolean
* is FALSE, it must call initializeCache(). We do this form of lazy
* evaluation for two reasons: (1) so we don't initialize if we don't
* have to (i.e., if no one is using Transliterator, but has included
* the code as part of a shared library), and (2) to avoid static
* initialization problems.
*/
static bool_t cacheInitialized;
/**
* In Java, the cache stores objects of different types and
* singleton objects as placeholders for rule-based
* transliterators to be built as needed. In C++ we use the
* following struct to achieve the same purpose. Instances of
* this struct can be placeholders, can represent prototype
* transliterators to be cloned, or can represent
* RuleBasedTransliterator::Data objects. We don't support
* storing classes in the cache because we don't have the rtti
* infrastructure for it. We could easily add this if there is a
* need for it in the future. The rbFile is the resource bundle
* file name for rule-based transliterators.
*/
struct CacheEntry {
enum Type {
RULE_BASED_PLACEHOLDER,
REVERSE_RULE_BASED_PLACEHOLDER,
PROTOTYPE,
RBT_DATA,
NONE // Only used for uninitialized entries
} entryType;
UnicodeString rbFile; // For *PLACEHOLDER
union {
Transliterator* prototype; // For PROTOTYPE
TransliterationRuleData* data; // For RBT_DATA
} u;
CacheEntry();
~CacheEntry();
void adoptPrototype(Transliterator* adopted);
};
/**
* Prefix for resource bundle key for the display name for a
* transliterator. The ID is appended to this to form the key.
* The resource bundle value should be a String.
*/
static const char* RB_DISPLAY_NAME_PREFIX;
/**
* Resource bundle key for display name pattern.
* The resource bundle value should be a String forming a
* MessageFormat pattern, e.g.:
* "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
*/
static const char* RB_DISPLAY_NAME_PATTERN;
/**
* Resource bundle key for the list of RuleBasedTransliterator IDs.
* The resource bundle value should be a String[] with each element
* being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
* to obtain the class name in which the RB_RULE key will be sought.
*/
static const char* RB_RULE_BASED_IDS;
/**
* Resource bundle key for the RuleBasedTransliterator rule.
*/
static const char* RB_RULE;
protected:
/**
* Default constructor.
* @param ID the string identifier for this transliterator
* @param adoptedFilter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
*/
Transliterator(const UnicodeString& ID, UnicodeFilter* adoptedFilter);
/**
* Copy constructor.
*/
Transliterator(const Transliterator&);
/**
* Assignment operator.
*/
Transliterator& operator=(const Transliterator&);
public:
/**
* Destructor.
*/
virtual ~Transliterator();
/**
* Implements Cloneable.
* All subclasses are encouraged to implement this method if it is
* possible and reasonable to do so. Subclasses that are to be
* registered with the system using <tt>registerInstance()</tt>
* are required to implement this method. If a subclass does not
* implement clone() properly and is registered with the system
* using registerInstance(), then the default clone() implementation
* will return null, and calls to createInstance() will fail.
*
* @see #registerInstance
*/
virtual Transliterator* clone() const { return 0; }
/**
* Transliterates the segment of a string that begins at the
* character at offset <code>start</code> and extends to the
* character at offset <code>limit - 1</code>, with optional
* filtering. A default implementation is provided here;
* subclasses should provide a more efficient implementation if
* possible.
* @param text the string to be transliterated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param result buffer to receive the transliterated text; previous
* contents are discarded
*/
virtual void transliterate(const UnicodeString& text,
int32_t start, int32_t limit,
UnicodeString& result) const;
/**
* Transliterates a segment of a string, with optional filtering.
* Subclasses must override this abstract method.
*
* @param text the string to be transliterated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @param filter the filter. Any character for which
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
* altered by this transliterator. If <tt>filter</tt> is
* <tt>null</tt> then no filtering is applied.
* @return The new limit index. The text previously occupying <code>[start,
* limit)</code> has been transliterated, possibly to a string of a different
* length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
* <em>new-limit</em> is the return value.
*/
virtual int32_t transliterate(Replaceable& text,
int32_t start, int32_t limit) const = 0;
/**
* Transliterates an entire string. Convenience method.
* @param text the string to be transliterated
* @param result buffer to receive the transliterated text; previous
* contents are discarded
*/
virtual void transliterate(const UnicodeString& text,
UnicodeString& result) const;
/**
* Transliterates an entire string in place. Convenience method.
* @param text the string to be transliterated
*/
virtual void transliterate(Replaceable& text) const;
/**
* Transliterates the portion of the text buffer that can be
* transliterated unambiguously after new text has been inserted,
* typically as a result of a keyboard event. The new text in
* <code>insertion</code> will be inserted into <code>text</code>
* at <code>index[LIMIT]</code>, advancing
* <code>index[LIMIT]</code> by <code>insertion.length()</code>.
* Then the transliterator will try to transliterate characters of
* <code>text</code> between <code>index[CURSOR]</code> and
* <code>index[LIMIT]</code>. Characters before
* <code>index[CURSOR]</code> will not be changed.
*
* <p>Upon return, values in <code>index[]</code> will be updated.
* <code>index[START]</code> will be advanced to the first
* character that future calls to this method will read.
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code> will
* be adjusted to delimit the range of text that future calls to
* this method may change.
*
* <p>Typical usage of this method begins with an initial call
* with <code>index[START]</code> and <code>index[LIMIT]</code>
* set to indicate the portion of <code>text</code> to be
* transliterated, and <code>index[CURSOR] == index[START]</code>.
* Thereafter, <code>index[]</code> can be used without
* modification in future calls, provided that all changes to
* <code>text</code> are made via this method.
*
* <p>This method assumes that future calls may be made that will
* insert new text into the buffer. As a result, it only performs
* unambiguous transliterations. After the last call to this
* method, there may be untransliterated text that is waiting for
* more input to resolve an ambiguity. In order to perform these
* pending transliterations, clients should call {@link
* #finishKeyboardTransliteration} after the last call to this
* method has been made.
*
* @param text the buffer holding transliterated and untransliterated text
* @param index an array of three integers.
*
* <ul><li><code>index[START]</code>: the beginning index,
* inclusive; <code>0 <= index[START] <= index[LIMIT]</code>.
*
* <li><code>index[LIMIT]</code>: the ending index, exclusive;
* <code>index[START] <= index[LIMIT] <= text.length()</code>.
* <code>insertion</code> is inserted at
* <code>index[LIMIT]</code>.
*
* <li><code>index[CURSOR]</code>: the next character to be
* considered for transliteration; <code>index[START] <=
* index[CURSOR] <= index[LIMIT]</code>. Characters before
* <code>index[CURSOR]</code> will not be changed by future calls
* to this method.</ul>
*
* @param insertion text to be inserted and possibly
* transliterated into the translation buffer at
* <code>index[LIMIT]</code>. If <code>null</code> then no text
* is inserted.
* @see #START
* @see #LIMIT
* @see #CURSOR
* @see #handleKeyboardTransliterate
* @exception IllegalArgumentException if <code>index[]</code>
* is invalid
*/
virtual void keyboardTransliterate(Replaceable& text,
int32_t index[3],
const UnicodeString& insertion,
UErrorCode& status) const;
/**
* Transliterates the portion of the text buffer that can be
* transliterated unambiguously after a new character has been
* inserted, typically as a result of a keyboard event. This is a
* convenience method; see {@link
* #keyboardTransliterate(Replaceable, int[], String)} for details.
* @param text the buffer holding transliterated and
* untransliterated text
* @param index an array of three integers. See {@link
* #keyboardTransliterate(Replaceable, int[], String)}.
* @param insertion text to be inserted and possibly
* transliterated into the translation buffer at
* <code>index[LIMIT]</code>.
* @see #keyboardTransliterate(Replaceable, int[], String)
*/
virtual void keyboardTransliterate(Replaceable& text, int32_t index[3],
UChar insertion,
UErrorCode& status) const;
/**
* Transliterates the portion of the text buffer that can be
* transliterated unambiguously. This is a convenience method; see
* {@link #keyboardTransliterate(Replaceable, int[], String)} for
* details.
* @param text the buffer holding transliterated and
* untransliterated text
* @param index an array of three integers. See {@link
* #keyboardTransliterate(Replaceable, int[], String)}.
* @see #keyboardTransliterate(Replaceable, int[], String)
*/
virtual void keyboardTransliterate(Replaceable& text, int32_t index[3],
UErrorCode& status) const;
/**
* Finishes any pending transliterations that were waiting for
* more characters. Clients should call this method as the last
* call after a sequence of one or more calls to
* <code>keyboardTransliterate()</code>.
* @param text the buffer holding transliterated and
* untransliterated text.
* @param index the array of indices previously passed to {@link
* #keyboardTransliterate}
*/
virtual void finishKeyboardTransliteration(Replaceable& text,
int32_t index[3]) const;
private:
/**
* This internal method does keyboard transliteration. If the
* 'insertion' is non-null then we append it to 'text' before
* proceeding. This method calls through to the pure virtual
* framework method handleKeyboardTransliterate() to do the actual
* work.
*/
void _keyboardTransliterate(Replaceable& text,
int32_t index[3],
const UnicodeString* insertion,
UErrorCode &status) const;
protected:
/**
* Abstract method that concrete subclasses define to implement
* keyboard transliteration. This method should transliterate all
* characters between <code>index[CURSOR]</code> and
* <code>index[LIMIT]</code> that can be unambiguously
* transliterated, regardless of future insertions of text at
* <code>index[LIMIT]</code>. <code>index[CURSOR]</code> should
* be advanced past committed characters (those that will not
* change in future calls to this method).
* <code>index[LIMIT]</code> should be updated to reflect text
* replacements that shorten or lengthen the text between
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code>. Upon
* return, neither <code>index[CURSOR]</code> nor
* <code>index[LIMIT]</code> should be less than the initial value
* of <code>index[CURSOR]</code>. <code>index[START]</code>
* should <em>not</em> be changed.
*
* @param text the buffer holding transliterated and
* untransliterated text
* @param index an array of three integers. See {@link
* #keyboardTransliterate(Replaceable, int[], String)}.
* @see #keyboardTransliterate
*/
virtual void handleKeyboardTransliterate(Replaceable& text,
int32_t index[3]) const = 0;
// C++ requires this friend declaration so CompoundTransliterator
// can access handleKeyboardTransliterate. Alternatively, we could
// make handleKeyboardTransliterate public.
friend class CompoundTransliterator;
public:
/**
* Returns the length of the longest context required by this transliterator.
* This is <em>preceding</em> context. The default implementation supplied
* by <code>Transliterator</code> returns zero; subclasses
* that use preceding context should override this method to return the
* correct value. For example, if a transliterator translates "ddd" (where
* d is any digit) to "555" when preceded by "(ddd)", then the preceding
* context length is 5, the length of "(ddd)".
*
* @return The maximum number of preceding context characters this
* transliterator needs to examine
*/
virtual int32_t getMaximumContextLength() const;
/**
* Returns a programmatic identifier for this transliterator.
* If this identifier is passed to <code>getInstance()</code>, it
* will return this object, if it has been registered.
* @see #registerInstance
* @see #registerClass
* @see #getAvailableIDs
*/
virtual const UnicodeString& getID() const;
/**
* Returns a name for this transliterator that is appropriate for
* display to the user in the default locale. See {@link
* #getDisplayName(Locale)} for details.
*/
virtual UnicodeString& getDisplayName(UnicodeString& result) const;
/**
* Returns a name for this transliterator that is appropriate for
* display to the user in the given locale. This name is taken
* from the locale resource data in the standard manner of the
* <code>java.text</code> package.
*
* <p>If no localized names exist in the system resource bundles,
* a name is synthesized using a localized
* <code>MessageFormat</code> pattern from the resource data. The
* arguments to this pattern are an integer followed by one or two
* strings. The integer is the number of strings, either 1 or 2.
* The strings are formed by splitting the ID for this
* transliterator at the first '-'. If there is no '-', then the
* entire ID forms the only string.
* @param inLocale the Locale in which the display name should be
* localized.
* @see java.text.MessageFormat
*/
virtual UnicodeString& getDisplayName(const Locale& inLocale,
UnicodeString& result) const;
/**
* Returns the filter used by this transliterator, or <tt>null</tt>
* if this transliterator uses no filter.
*/
virtual const UnicodeFilter* getFilter() const;
/**
* Changes the filter used by this transliterator. If the filter
* is set to <tt>null</tt> then no filtering will occur.
*
* <p>Callers must take care if a transliterator is in use by
* multiple threads. The filter should not be changed by one
* thread while another thread may be transliterating.
*/
virtual void adoptFilter(UnicodeFilter* adoptedFilter);
/**
* Returns this transliterator's inverse. See the class
* documentation for details. This implementation simply inverts
* the two entities in the ID and attempts to retrieve the
* resulting transliterator. That is, if <code>getID()</code>
* returns "A-B", then this method will return the result of
* <code>getInstance("B-A")</code>, or <code>null</code> if that
* call fails.
*
* <p>This method does not take filtering into account. The
* returned transliterator will have no filter.
*
* <p>Subclasses with knowledge of their inverse may wish to
* override this method.
*
* @return a transliterator that is an inverse, not necessarily
* exact, of this transliterator, or <code>null</code> if no such
* transliterator is registered.
* @see #registerInstance
*/
virtual Transliterator* createInverse() const;
/**
* Returns a <code>Transliterator</code> object given its ID.
* The ID must be either a system transliterator ID or an ID registered
* using <code>registerInstance()</code>.
*
* @param ID a valid ID, as enumerated by <code>getAvailableID()</code>
* @return A <code>Transliterator</code> object with the given ID.
* NOTE(review): the Java version of this API throws
* IllegalArgumentException for an invalid ID. This C++ signature has
* no status argument, so the result for an invalid ID (presumably a
* null pointer) must be confirmed against the implementation, as must
* ownership of the returned object (presumably the caller's).
* @see #registerInstance
* @see #countAvailableIDs
* @see #getAvailableID
* @see #getID
*/
static Transliterator* createInstance(const UnicodeString& ID);
private:
/**
* This is the path to the subdirectory within the locale data
* directory that contains the rule-based transliterator resource
* bundle files. It is not known up front: it is constructed
* dynamically the first time Transliterator::getDataDirectory() is
* called, so code should obtain the path through getDataDirectory()
* rather than reading this member directly.
*/
static char* DATA_DIR;
/**
* This is the name of a subdirectory within the locale data directory
* that contains the rule-based transliterator resource bundle files.
* It is the name component only; the full path is what
* getDataDirectory() returns.
*/
static const char* RESOURCE_SUB_DIR;
/**
* Returns the directory in which the transliterator resource bundle
* files are located. This is a subdirectory, named RESOURCE_SUB_DIR,
* under Locale::getDataDirectory(). It ends in a path separator.
*
* @return the directory path as a C string; per the DATA_DIR
* documentation the path is built on the first call.
* NOTE(review): presumably the returned pointer is DATA_DIR itself and
* must not be freed by the caller -- confirm.
*/
static const char* getDataDirectory();
/**
* Computes an integer hash code for the given string.
* NOTE(review): presumably used to key the cache of registered
* transliterators by ID -- confirm against the implementation.
*
* @param str the string to hash
* @return the hash code
*/
static int32_t hash(const UnicodeString& str);
/**
* Returns a transliterator object given its ID. Internal counterpart
* of createInstance(): this method returns a null pointer if it cannot
* make use of the given ID.
* NOTE(review): the original text contrasted this with "getInstance()",
* the Java name of the public lookup method; how createInstance()
* itself reports an unusable ID should be confirmed.
*
* @param ID the ID to look up
* @return the transliterator, or a null pointer if the ID is unusable
*/
static Transliterator* _createInstance(const UnicodeString& ID);
public:
/**
* Registers an instance <tt>adoptedObj</tt> of a subclass of
* <code>Transliterator</code> with the system. When
* <tt>createInstance()</tt> is called with an ID string that is
* equal to <tt>adoptedObj->getID()</tt>, then
* <tt>adoptedObj->clone()</tt> is returned.
*
* After this call the Transliterator class owns the adoptedObj
* and will delete it.
*
* @param adoptedObj an instance of a subclass of
* <code>Transliterator</code> that defines <tt>clone()</tt>
* @param status error code in/out argument.
* NOTE(review): presumably set to a failure code if registration
* fails -- confirm the exact codes against the implementation.
* @see #createInstance
* @see #unregister
*/
static void registerInstance(Transliterator* adoptedObj,
UErrorCode& status);
private:
/**
* This internal method registers a prototype instance in the cache.
* The CALLER MUST MUTEX using cacheMutex before calling this method;
* no locking is described for the method itself.
*
* @param adoptedPrototype the prototype instance to register.
* NOTE(review): the "adopted" naming implies that ownership passes to
* the cache -- confirm.
* @param status error code in/out argument
*/
static void _registerInstance(Transliterator* adoptedPrototype,
UErrorCode &status);
public:
/**
* Unregisters a transliterator or class. This may be either
* a system transliterator or a user transliterator or class.
*
* <p>This function returns nothing. The Java version returns the
* object that was registered under <code>ID</code>; this C++
* declaration is <code>void</code>.
* NOTE(review): presumably the registered object is deleted by the
* registry, and an unknown ID is ignored -- confirm both.
*
* @param ID the ID of the transliterator or class
* @see #registerInstance
*/
static void unregister(const UnicodeString& ID);
private:
/**
* Unregisters a transliterator or class. Internal method.
* Prerequisites: The cache must be initialized, and the
* caller must own the cacheMutex.
*
* @param ID the ID of the transliterator or class to remove
*/
static void _unregister(const UnicodeString& ID);
/**
* (Not available in this C++ version; the declaration below is
* commented out. Use countAvailableIDs() and getAvailableID()
* instead.)
*
* Returns an enumeration over the programmatic names of registered
* <code>Transliterator</code> objects. This includes both system
* transliterators and user transliterators registered using
* <code>registerInstance()</code>. The enumerated names may be
* passed to <code>createInstance()</code>.
*
* @return An <code>Enumeration</code> over <code>String</code> objects
* @see #createInstance
* @see #registerInstance
*/
// virtual Enumeration getAvailableIDs();
/**
* Vector of registered IDs.
* NOTE(review): presumably this is the list that countAvailableIDs()
* and getAvailableID() report from, and access to it is guarded by
* cacheMutex -- confirm.
*/
static UVector cacheIDs;
public:
/**
* Return the number of IDs currently registered with the system.
* To retrieve the actual IDs, call getAvailableID(i) with
* i from 0 to countAvailableIDs() - 1.
*
* @return the number of registered IDs
* @see #getAvailableID
*/
static int32_t countAvailableIDs();
/**
* Return the index-th available ID. index must be between 0
* and countAvailableIDs() - 1, inclusive. If index is out of
* range, the result of getAvailableID(0) is returned.
*
* @param index zero-based position of the wanted ID
* @return a reference to the ID string.
* NOTE(review): the reference presumably points into the registry, so
* it may not remain valid across later register/unregister calls --
* confirm.
* @see #countAvailableIDs
*/
static const UnicodeString& getAvailableID(int32_t index);
private:
/**
* Comparison function for UVector. Compares two UnicodeString
* objects given void* pointers to them.
*
* @param a pointer to the first UnicodeString
* @param b pointer to the second UnicodeString
* @return the result of the comparison.
* NOTE(review): presumably TRUE when the two strings are equal --
* confirm against the implementation.
*/
static bool_t compareIDs(void* a, void* b);
/**
* Initializes the cache of registered transliterators.
* NOTE(review): presumably this must have run before _unregister() is
* called (that method lists "the cache must be initialized" as a
* prerequisite) -- confirm when and by whom it is invoked.
*/
static void initializeCache();
};
#endif

View file

@ -0,0 +1,51 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef UNIFILT_H
#define UNIFILT_H
/**
* <code>UnicodeFilter</code> defines a protocol for selecting a
* subset of the full range (U+0000 to U+FFFF) of Unicode characters.
* Currently, filters are used in conjunction with classes like {@link
* Transliterator} to only process selected characters through a
* transformation.
*
* <p>This is an abstract base class: subclasses implement isIn() and
* clone(). The constructor is protected, so a filter can only be
* created through a concrete subclass.
*
* @see UnicodeFilterLogic
*/
class U_I18N_API UnicodeFilter {
public:
/**
* Virtual destructor, so that a filter can be deleted through a
* pointer to this base class.
*/
virtual ~UnicodeFilter();
/**
* Returns <tt>true</tt> for characters that are in the selected
* subset. In other words, if a character is <b>to be
* filtered</b>, then <tt>isIn()</tt> returns
* <b><tt>false</tt></b>.
*
* @param c the character to test
* @return true if <tt>c</tt> belongs to the selected subset
*/
virtual bool_t isIn(UChar c) const = 0;
/**
* Returns a copy of this object. All UnicodeFilter objects have
* to support cloning in order to allow classes using
* UnicodeFilters, such as Transliterator, to implement cloning.
*
* @return a copy of this filter.
* NOTE(review): presumably heap-allocated and owned by the caller --
* confirm for each subclass.
*/
virtual UnicodeFilter* clone() const = 0;
protected:
/**
* Default constructor; protected because only subclasses are
* instantiated.
*/
UnicodeFilter();
};
inline UnicodeFilter::UnicodeFilter() {}
inline UnicodeFilter::~UnicodeFilter() {}
#endif

View file

@ -0,0 +1,139 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "unifltlg.h"
#include "unifilt.h"
/**
 * File-local filter that inverts another filter: isIn(c) is true exactly
 * when the wrapped filter's isIn(c) is false.
 *
 * The wrapped filter is owned by this object: it is adopted by the
 * constructor, deep-copied (via clone()) by the copy constructor, and
 * deleted by the destructor.
 */
class UnicodeNotFilter : public UnicodeFilter {
    UnicodeFilter* filt; // owned; never shared between two instances

    // Declared but intentionally NOT defined. The compiler-generated
    // copy assignment would copy the raw pointer, leaking the old filter
    // and causing a double delete when both objects are destroyed.
    // Making it private and undefined turns any such use into a compile
    // or link error.
    UnicodeNotFilter& operator=(const UnicodeNotFilter&);
public:
    /** Adopts (takes ownership of) the given filter, which must be non-null. */
    UnicodeNotFilter(UnicodeFilter* adopted);
    /** Deep copy: clones the wrapped filter of the source object. */
    UnicodeNotFilter(const UnicodeNotFilter&);
    /** Deletes the wrapped filter. */
    virtual ~UnicodeNotFilter();
    /** Returns the logical negation of the wrapped filter's isIn(c). */
    virtual bool_t isIn(UChar c) const;
    /** Returns a new, independent copy of this filter. */
    virtual UnicodeFilter* clone() const;
};

UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}

UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f) : filt(f.filt->clone()) {}

UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }

bool_t UnicodeNotFilter::isIn(UChar c) const { return !filt->isIn(c); }

UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
/**
 * Builds a filter that accepts exactly the characters that the given
 * filter rejects.
 *
 * The argument is not retained: a clone of <tt>f</tt> is made and handed
 * to the new filter. The result is allocated with <tt>new</tt>.
 */
UnicodeFilter* UnicodeFilterLogic::createNot(const UnicodeFilter& f) {
    UnicodeFilter* operand = f.clone();
    return new UnicodeNotFilter(operand);
}
/**
 * File-local filter that is the intersection of two filters: isIn(c) is
 * true only when both wrapped filters accept c. The second filter is not
 * consulted when the first one rejects c (short-circuit &&).
 *
 * Both wrapped filters are owned by this object: they are adopted by the
 * constructor, deep-copied (via clone()) by the copy constructor, and
 * deleted by the destructor.
 */
class UnicodeAndFilter : public UnicodeFilter {
    UnicodeFilter* filt1; // owned; evaluated first
    UnicodeFilter* filt2; // owned; evaluated only if filt1 accepts

    // Declared but intentionally NOT defined. The compiler-generated
    // copy assignment would copy the raw pointers, leaking the old
    // filters and causing double deletes when both objects are
    // destroyed. Making it private and undefined turns any such use into
    // a compile or link error.
    UnicodeAndFilter& operator=(const UnicodeAndFilter&);
public:
    /** Adopts (takes ownership of) both filters, which must be non-null. */
    UnicodeAndFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
    /** Deep copy: clones both wrapped filters of the source object. */
    UnicodeAndFilter(const UnicodeAndFilter&);
    /** Deletes both wrapped filters. */
    virtual ~UnicodeAndFilter();
    /** Returns true if both wrapped filters accept c. */
    virtual bool_t isIn(UChar c) const;
    /** Returns a new, independent copy of this filter. */
    virtual UnicodeFilter* clone() const;
};

UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}

UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f) :
    filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}

UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; }

bool_t UnicodeAndFilter::isIn(UChar c) const { return filt1->isIn(c) && filt2->isIn(c); }

UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
/**
 * Builds a filter that is the short-circuit AND of two filters: the
 * result accepts a character only if both <tt>f</tt> and <tt>g</tt>
 * accept it, and <tt>g.isIn()</tt> is not called when <tt>f.isIn()</tt>
 * is <tt>false</tt>.
 *
 * Neither argument is retained: each is cloned and the clones are handed
 * to the new filter. The result is allocated with <tt>new</tt>.
 */
UnicodeFilter* UnicodeFilterLogic::createAnd(const UnicodeFilter& f,
                                             const UnicodeFilter& g) {
    UnicodeFilter* first = f.clone();
    UnicodeFilter* second = g.clone();
    return new UnicodeAndFilter(first, second);
}
/**
* Returns a <tt>UnicodeFilter</tt> that implements a short
* circuit AND of the result of the given filters. That is, if
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
* <tt>isIn()</tt> returns <tt>false</tt>.
*/
//!UnicodeFilter* UnicodeFilterLogic::and(const UnicodeFilter** f) {
//! return new UnicodeFilter() {
//! public bool_t isIn(UChar c) {
//! for (int32_t i=0; i<f.length; ++i) {
//! if (!f[i].isIn(c)) {
//! return FALSE;
//! }
//! }
//! return TRUE;
//! }
//! };
//!}
/**
 * File-local filter that is the union of two filters: isIn(c) is true
 * when either wrapped filter accepts c. The second filter is not
 * consulted when the first one accepts c (short-circuit ||).
 *
 * Both wrapped filters are owned by this object: they are adopted by the
 * constructor, deep-copied (via clone()) by the copy constructor, and
 * deleted by the destructor.
 */
class UnicodeOrFilter : public UnicodeFilter {
    UnicodeFilter* filt1; // owned; evaluated first
    UnicodeFilter* filt2; // owned; evaluated only if filt1 rejects

    // Declared but intentionally NOT defined. The compiler-generated
    // copy assignment would copy the raw pointers, leaking the old
    // filters and causing double deletes when both objects are
    // destroyed. Making it private and undefined turns any such use into
    // a compile or link error.
    UnicodeOrFilter& operator=(const UnicodeOrFilter&);
public:
    /** Adopts (takes ownership of) both filters, which must be non-null. */
    UnicodeOrFilter(UnicodeFilter* adopted1, UnicodeFilter* adopted2);
    /** Deep copy: clones both wrapped filters of the source object. */
    UnicodeOrFilter(const UnicodeOrFilter&);
    /** Deletes both wrapped filters. */
    virtual ~UnicodeOrFilter();
    /** Returns true if at least one wrapped filter accepts c. */
    virtual bool_t isIn(UChar c) const;
    /** Returns a new, independent copy of this filter. */
    virtual UnicodeFilter* clone() const;
};

UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}

UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f) :
    filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}

UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; }

bool_t UnicodeOrFilter::isIn(UChar c) const { return filt1->isIn(c) || filt2->isIn(c); }

UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
/**
 * Builds a filter that is the short-circuit OR of two filters: the
 * result accepts a character if either <tt>f</tt> or <tt>g</tt> accepts
 * it, and <tt>g.isIn()</tt> is not called when <tt>f.isIn()</tt> is
 * <tt>true</tt>.
 *
 * Neither argument is retained: each is cloned and the clones are handed
 * to the new filter. The result is allocated with <tt>new</tt>.
 */
UnicodeFilter* UnicodeFilterLogic::createOr(const UnicodeFilter& f,
                                            const UnicodeFilter& g) {
    UnicodeFilter* first = f.clone();
    UnicodeFilter* second = g.clone();
    return new UnicodeOrFilter(first, second);
}
/**
* Returns a <tt>UnicodeFilter</tt> that implements a short
* circuit OR of the result of the given filters. That is, if
* <tt>f[i].isIn()</tt> is <tt>true</tt>, then
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
* <tt>isIn()</tt> returns <tt>true</tt>.
*/
//!UnicodeFilter* UnicodeFilterLogic::or(const UnicodeFilter** f) {
//! return new UnicodeFilter() {
//! public bool_t isIn(UChar c) {
//! for (int32_t i=0; i<f.length; ++i) {
//! if (f[i].isIn(c)) {
//! return TRUE;
//! }
//! }
//! return FALSE;
//! }
//! };
//!}
// TODO: Add nand() & nor() for convenience, if needed.

View file

@ -0,0 +1,84 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef UNIFLTLG_H
#define UNIFLTLG_H
#include "utypes.h"
class UnicodeFilter;
/**
* <code>UnicodeFilterLogic</code> provides logical operators on
* {@link UnicodeFilter} objects. This class cannot be instantiated;
* it consists only of static methods. The static methods return
* filter objects that perform logical inversion (<tt>not</tt>),
* intersection (<tt>and</tt>), or union (<tt>or</tt>) of the given
* filter objects.
*
* <p>The filters passed in are taken by reference and are cloned by
* the implementation, so the returned filter does not depend on the
* lifetime of the arguments. Each returned filter is allocated with
* <tt>new</tt>.
*/
class U_I18N_API UnicodeFilterLogic {
public:
/**
* Returns a <tt>UnicodeFilter</tt> that implements the inverse of
* the given filter.
*
* @param f the filter to invert
* @return a newly allocated filter that accepts exactly the
* characters <tt>f</tt> rejects
*/
static UnicodeFilter* createNot(const UnicodeFilter& f);
/**
* Returns a <tt>UnicodeFilter</tt> that implements a short
* circuit AND of the result of the two given filters. That is,
* if <tt>f.isIn()</tt> is <tt>false</tt>, then <tt>g.isIn()</tt>
* is not called, and <tt>isIn()</tt> returns <tt>false</tt>.
*
* <p>Both arguments are references and therefore must refer to
* valid filters. (The Java version of this API allows one of the
* two arguments to be null; that does not apply here.)
*
* @param f the filter evaluated first
* @param g the filter evaluated only if <tt>f</tt> accepts
* @return a newly allocated filter
*/
static UnicodeFilter* createAnd(const UnicodeFilter& f,
const UnicodeFilter& g);
/**
* (Not available in this C++ version; the declaration below is
* commented out.)
*
* Returns a <tt>UnicodeFilter</tt> that implements a short
* circuit AND of the result of the given filters. That is, if
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
* <tt>isIn()</tt> returns <tt>false</tt>.
*/
// static UnicodeFilter* and(const UnicodeFilter** f);
/**
* Returns a <tt>UnicodeFilter</tt> that implements a short
* circuit OR of the result of the two given filters. That is, if
* <tt>f.isIn()</tt> is <tt>true</tt>, then <tt>g.isIn()</tt> is
* not called, and <tt>isIn()</tt> returns <tt>true</tt>.
*
* <p>Both arguments are references and therefore must refer to
* valid filters. (The Java version of this API allows one of the
* two arguments to be null; that does not apply here.)
*
* @param f the filter evaluated first
* @param g the filter evaluated only if <tt>f</tt> rejects
* @return a newly allocated filter
*/
static UnicodeFilter* createOr(const UnicodeFilter& f,
const UnicodeFilter& g);
/**
* (Not available in this C++ version; the declaration below is
* commented out.)
*
* Returns a <tt>UnicodeFilter</tt> that implements a short
* circuit OR of the result of the given filters. That is, if
* <tt>f[i].isIn()</tt> is <tt>true</tt>, then
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
* <tt>isIn()</tt> returns <tt>true</tt>.
*/
// static UnicodeFilter* or(const UnicodeFilter** f);
// TODO: Add nand() & nor() for convenience, if needed.
private:
// Disallow instantiation: the only constructor is private and the
// class has no instance members.
UnicodeFilterLogic();
};
inline UnicodeFilterLogic::UnicodeFilterLogic() {}
#endif

View file

@ -0,0 +1,108 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "unirange.h"
#include "uvector.h"
#include "unistr.h"
/**
 * Constructs the range of <code>theLength</code> consecutive characters
 * whose first character is <code>theStart</code>.
 */
UnicodeRange::UnicodeRange(UChar theStart, int32_t theLength)
    : start(theStart), length(theLength) {
}
/**
 * Returns a newly allocated range with the same start and length as
 * this one.
 *
 * CALLER OWNS RESULT.
 */
UnicodeRange* UnicodeRange::clone() const {
    UnicodeRange* copy = new UnicodeRange(start, length);
    return copy;
}
/**
 * Returns true if the character <code>c</code> lies inside this range,
 * that is, if <code>start <= c < start + length</code>.
 */
bool_t UnicodeRange::contains(UChar c) const {
    // Distance of c from the first character of the range; negative when
    // c lies below the range.
    int32_t offset = c - start;
    return offset >= 0 && offset < length;
}
/**
 * Removes the character <code>c</code> from this range. The caller must
 * ensure that contains(c) is true.
 *
 * If <code>c</code> is the first or the last character of the range, the
 * range is simply shortened by one and null is returned. Otherwise the
 * range is cut in two around <code>c</code>: this object becomes the
 * lower piece (the characters before <code>c</code>) and the upper piece
 * (the characters after <code>c</code>) is returned as a new object.
 * <code>c</code> itself belongs to neither piece.
 *
 * MODIFIES THIS RANGE IN PLACE.
 *
 * CALLER OWNS RESULT.
 */
UnicodeRange* UnicodeRange::split(UChar c) {
    // Position of c relative to the first character of the range.
    int32_t offset = c - start;

    if (offset == 0) {
        // c is the first character: drop it from the front.
        ++start;
        --length;
        return 0;
    }
    if (offset == length - 1) {
        // c is the last character: drop it from the back.
        --length;
        return 0;
    }

    // c is strictly inside. The upper piece starts right after c and
    // runs to the old end of the range; the lower piece keeps the
    // `offset` characters that precede c.
    UChar upperStart = static_cast<UChar>(c + 1);
    UnicodeRange* upper = new UnicodeRange(upperStart, start + length - upperStart);
    length = offset;
    return upper;
}
/**
 * Finds the largest subrange of this range that is unused by the given
 * string. A subrange is unused by a string if the string contains no
 * characters in that subrange. If the string contains no characters of
 * this range at all, the result is a copy of this whole range.
 *
 * This range itself is not modified: the work is done on a clone, which
 * is split once for every character of the string that still falls
 * inside one of the pieces. When several pieces share the greatest
 * length, the one created earliest is returned.
 *
 * CALLER OWNS RESULT.
 */
UnicodeRange*
UnicodeRange::largestUnusedSubrange(const UnicodeString& str) const {
    // Working list of disjoint pieces. The vector owns its elements and
    // releases them through the deleter.
    UVector pieces;
    pieces.setDeleter(UnicodeRange::deleter);
    pieces.addElement(clone());

    int32_t strLength = str.length();
    for (int32_t pos = 0; pos < strLength; ++pos) {
        UChar ch = str.charAt(pos);
        if (!contains(ch)) {
            // Outside the original range, so it cannot be in any piece.
            continue;
        }
        for (int32_t k = 0; k < pieces.size(); ++k) {
            UnicodeRange* piece = (UnicodeRange*) pieces.elementAt(k);
            if (!piece->contains(ch)) {
                continue;
            }
            // Pieces are disjoint, so at most one of them holds ch.
            UnicodeRange* upper = piece->split(ch);
            if (upper != 0) {
                pieces.addElement(upper);
            }
            break;
        }
    }

    // Pick the longest piece; the first one wins on a tie.
    int32_t winnerIndex = -1;
    UnicodeRange* winner = 0;
    for (int32_t k = 0; k < pieces.size(); ++k) {
        UnicodeRange* candidate = (UnicodeRange*) pieces.elementAt(k);
        if (winner == 0 || candidate->length > winner->length) {
            winner = candidate;
            winnerIndex = k;
        }
    }

    // Detach the winner from the vector so that it is not deleted along
    // with the other pieces.
    pieces.orphanElementAt(winnerIndex);
    return winner;
}
// For UVector of UnicodeRange* objects
void UnicodeRange::deleter(void* e) {
delete (UnicodeRange*) e;
}

View file

@ -0,0 +1,79 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef UNIRANGE_H
#define UNIRANGE_H
#include "utypes.h"
class UnicodeString;
/**
* %%% INTERNAL CLASS USED BY RuleBasedTransliterator %%%
*
* A range of Unicode characters. Support the operations of testing for
* inclusion (does this range contain this character?) and splitting.
* Splitting involves breaking a range into two smaller ranges around a
* character inside the original range. The split character is not included
* in either range. If the split character is at either extreme end of the
* range, one of the split products is an empty range.
*
* This class is used internally to determine the largest available private
* use character range for variable stand-ins.
*/
class UnicodeRange {
public:
// First character of the range.
UChar start;
// Number of consecutive characters in the range, beginning at start.
int32_t length;
/**
* Constructs the range of <code>length</code> characters beginning
* at <code>start</code>.
*/
UnicodeRange(UChar start, int32_t length);
/**
* Returns a newly allocated copy of this range.
*
* CALLER OWNS RESULT.
*/
UnicodeRange* clone() const;
/**
* Returns true if <code>start <= c < start + length</code>.
*/
bool_t contains(UChar c) const;
/**
* Assume that contains(c) is true. Split this range into two new
* ranges around the character c. Make this range one of the new ranges
* (modify it in place) and return the other new range. The character
* itself is not included in either range. If the split results in an
* empty range (that is, if c == start or c == start + length - 1) then
* return null; in that case this range is just shortened by one
* character.
*
* When a new range is returned, this range keeps the characters
* below c and the returned range holds the characters above c.
*
* MODIFIES THIS RANGE IN PLACE.
*
* CALLER OWNS RESULT.
*/
UnicodeRange* split(UChar c);
/**
* Finds the largest subrange of this range that is unused by the
* given string. A subrange is unused by a string if the string
* contains no characters in that range. If the given string
* contains no characters in this range, then a copy of this whole
* range is returned. This range itself is not modified; the
* implementation works on a clone.
*
* CALLER OWNS RESULT.
*/
UnicodeRange* largestUnusedSubrange(const UnicodeString& str) const;
private:
// For UVector of UnicodeRange* objects: deletes the element as a
// UnicodeRange.
static void deleter(void*);
};
#endif

View file

@ -0,0 +1,204 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "unitohex.h"
#include "rep.h"
#include "unifilt.h"
/**
* ID for this transliterator. Both constructors that take a filter
* pass it to the Transliterator base class.
*/
const char* UnicodeToHexTransliterator::_ID = "Unicode-Hex";
/**
* Prefix used by the constructor that takes no prefix argument: a
* backslash followed by 'u' (the backslash is doubled here only
* because this is a C++ string literal).
*/
const char* UnicodeToHexTransliterator::DEFAULT_PREFIX = "\\u";
/**
 * Constructs a transliterator with an explicit prefix and digit case.
 * @param hexPrefix the string that will precede the four hex digits
 * of every escape produced
 * @param isUppercase if true, the four hex digits are output in
 * uppercase; otherwise they are output in lowercase
 * @param adoptedFilter filter handed to the Transliterator base class
 * together with this transliterator's ID
 */
UnicodeToHexTransliterator::UnicodeToHexTransliterator(const UnicodeString& hexPrefix,
                                                       bool_t isUppercase,
                                                       UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter),
      prefix(hexPrefix),
      uppercase(isUppercase)
{
}
/**
 * Constructs a transliterator that uses the default prefix "&#092;u"
 * (DEFAULT_PREFIX) and outputs uppercase hex digits.
 * @param adoptedFilter filter handed to the Transliterator base class
 * together with this transliterator's ID
 */
UnicodeToHexTransliterator::UnicodeToHexTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter),
      prefix(DEFAULT_PREFIX),
      uppercase(TRUE)
{
}
/**
 * Copy constructor. Copies the Transliterator base part, the prefix and
 * the uppercase flag from <code>other</code>.
 */
UnicodeToHexTransliterator::UnicodeToHexTransliterator(const UnicodeToHexTransliterator& other)
    : Transliterator(other),
      prefix(other.prefix),
      uppercase(other.uppercase)
{
}
/**
 * Assignment operator. Assigns the Transliterator base part first, then
 * the prefix and the uppercase flag.
 * @return a reference to this object
 */
UnicodeToHexTransliterator&
UnicodeToHexTransliterator::operator=(const UnicodeToHexTransliterator& other) {
    // Let the base class copy its own state.
    Transliterator::operator=(other);

    // Then copy the members declared by this class.
    this->prefix = other.prefix;
    this->uppercase = other.uppercase;

    return *this;
}
/**
 * Transliterator API. Returns a newly allocated copy of this object,
 * made with the copy constructor.
 */
Transliterator*
UnicodeToHexTransliterator::clone() const {
    UnicodeToHexTransliterator* copy = new UnicodeToHexTransliterator(*this);
    return copy;
}
/**
 * Returns the string that is output before the four hex digits of each
 * escape.
 * @return a reference to the prefix string held by this object
 */
const UnicodeString& UnicodeToHexTransliterator::getPrefix() const {
    return this->prefix;
}
/**
* Sets the string that precedes the four hex digits.
*
* <p>Callers must take care if a transliterator is in use by
* multiple threads. The prefix should not be changed by one
* thread while another thread may be transliterating.
* @param prefix prefix string
*/
void UnicodeToHexTransliterator::setPrefix(const UnicodeString& hexPrefix) {
prefix = hexPrefix;
}
/**
 * Tells whether the hex digits are output in uppercase.
 * @return true if this transliterator outputs uppercase hex digits
 */
bool_t UnicodeToHexTransliterator::isUppercase() const {
    return this->uppercase;
}
/**
 * Selects whether the hex digits are output in uppercase or lowercase.
 *
 * <p>Callers must take care if a transliterator is in use by
 * multiple threads. The uppercase mode should not be changed by
 * one thread while another thread may be transliterating.
 * @param outputUppercase if true, then this transliterator
 * outputs uppercase hex digits; otherwise lowercase
 */
void UnicodeToHexTransliterator::setUppercase(bool_t outputUppercase) {
    this->uppercase = outputUppercase;
}
/**
 * Transliterates a segment of a string. <code>Transliterator</code> API.
 * The work is delegated to handleKeyboardTransliterate(), with the
 * cursor placed at <code>start</code>.
 * @param text the string to be transliterated
 * @param start the beginning index, inclusive; <code>0 <= start
 * <= limit</code>.
 * @param limit the ending index, exclusive; <code>start <= limit
 * <= text.length()</code>.
 * @return the new limit index
 */
int32_t UnicodeToHexTransliterator::transliterate(Replaceable& text,
                                                  int32_t start,
                                                  int32_t limit) const {
    // Three-slot position array expected by handleKeyboardTransliterate().
    // NOTE(review): the positional initializer assumes the slot order
    // start, limit, cursor -- confirm against the index constants.
    int32_t positions[3] = { start, limit, start };
    handleKeyboardTransliterate(text, positions);
    const int32_t newLimit = positions[LIMIT];
    return newLimit;
}
/**
 * Implements {@link Transliterator#handleKeyboardTransliterate}.
 *
 * Replaces every character between offsets[CURSOR] and offsets[LIMIT]
 * that passes the filter with its Unicode hexadecimal escape. For
 * example, '@' -> "U+0040", assuming the prefix is "U+". Characters
 * rejected by the filter are left unchanged.
 *
 * On return offsets[LIMIT] has been adjusted for the growth of the text
 * and offsets[CURSOR] is the position at which processing stopped.
 */
void UnicodeToHexTransliterator::handleKeyboardTransliterate(Replaceable& text,
                                                             int32_t offsets[3]) const {
    int32_t pos = offsets[CURSOR];
    int32_t end = offsets[LIMIT];

    const UnicodeFilter* activeFilter = getFilter();
    UnicodeString escape; // reused for every character

    while (pos < end) {
        UChar ch = text.charAt(pos);
        if (activeFilter == 0 || activeFilter->isIn(ch)) {
            toHex(escape, ch);
            text.handleReplaceBetween(pos, pos + 1, escape);
            int32_t escapeLength = escape.length();
            // Step over the whole escape that was just inserted.
            pos += escapeLength;
            // One character was replaced by escapeLength characters, so
            // the limit moves by the difference.
            end += escapeLength - 1;
        } else {
            // Filtered out: leave the character as it is.
            ++pos;
        }
    }

    offsets[LIMIT] = end;
    offsets[CURSOR] = pos;
}
/**
 * Return the length of the longest context required by this
 * transliterator. This is <em>preceding</em> context.
 * @return maximum number of preceding context characters this
 * transliterator needs to examine; always 0 for this class
 */
int32_t UnicodeToHexTransliterator::getMaximumContextLength() {
    const int32_t contextLength = 0;
    return contextLength;
}
/**
* Digit table used by itoh(). Entries 0-15 are the hex digits with
* lowercase letters; entries 16-31 are the same digits with uppercase
* letters, so that itoh() can select the uppercase form by setting
* bit 4 of the index (i|16).
*/
UChar UnicodeToHexTransliterator::HEX_DIGITS[32] = {
// If necessary, replace these character constants with their hex values
// NOTE(review): presumably for compilers whose execution character set
// is not ASCII-based -- confirm.
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
/**
 * Given an integer, return the hex digit for its least significant four
 * bits, in uppercase or lowercase according to the uppercase setting of
 * this transliterator.
 */
UChar UnicodeToHexTransliterator::itoh(int32_t i) const {
    // Keep only the low nibble: index 0-15 selects a lowercase digit.
    int32_t slot = i & 0xF;
    if (uppercase) {
        // The uppercase digits occupy entries 16-31 of the table.
        slot |= 16;
    }
    return HEX_DIGITS[slot];
}
/**
 * Form escape sequence: overwrites <code>result</code> with the prefix
 * followed by the four hex digits of <code>c</code>, most significant
 * digit first.
 * @return <code>result</code>
 */
UnicodeString& UnicodeToHexTransliterator::toHex(UnicodeString& result,
                                                 UChar c) const {
    result = prefix;
    // Emit the four nibbles from the highest to the lowest; itoh() uses
    // only the low four bits of its argument.
    for (int32_t shift = 12; shift >= 0; shift -= 4) {
        result.append(itoh(c >> shift));
    }
    return result;
}

View file

@ -0,0 +1,157 @@
/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef UNITOHEX_H
#define UNITOHEX_H
#include "translit.h"
#include "unistr.h"
class UnicodeFilter;
/**
* A transliterator that converts from Unicode characters to
* hexadecimal Unicode escape sequences. It outputs a
* prefix specified in the constructor and optionally converts the hex
* digits to uppercase.
*
* @author Alan Liu
*/
class U_I18N_API UnicodeToHexTransliterator : public Transliterator {
private:
/**
* ID for this transliterator.
*/
static const char* _ID;
/**
* Prefix used when none is given to the constructor.
*/
static const char* DEFAULT_PREFIX;
// String output before the four hex digits of each escape.
UnicodeString prefix;
// If true, hex digits are output in uppercase; otherwise lowercase.
bool_t uppercase;
public:
/**
* Constructs a transliterator.
* @param hexPrefix the string that will precede the four hex
* digits of every escape produced.
* @param isUppercase if true, the four hex digits will be
* converted to uppercase; otherwise they will be lowercase.
* @param adoptedFilter the filter to use, or 0 (the default) for
* none.
* NOTE(review): the "adopted" naming implies that ownership of the
* filter passes to the transliterator -- confirm in Transliterator.
*/
UnicodeToHexTransliterator(const UnicodeString& hexPrefix,
bool_t isUppercase,
UnicodeFilter* adoptedFilter = 0);
/**
* Constructs a transliterator with the default prefix "\u"
* that outputs uppercase hex digits.
* @param adoptedFilter the filter to use, or 0 (the default) for
* none.
*/
UnicodeToHexTransliterator(UnicodeFilter* adoptedFilter = 0);
/**
* Destructor.
*/
virtual ~UnicodeToHexTransliterator();
/**
* Copy constructor.
*/
UnicodeToHexTransliterator(const UnicodeToHexTransliterator&);
/**
* Assignment operator.
*/
UnicodeToHexTransliterator& operator=(const UnicodeToHexTransliterator&);
/**
* Transliterator API. Returns a copy of this object.
*/
virtual Transliterator* clone() const;
/**
* Returns the string that precedes the four hex digits.
* @return prefix string
*/
virtual const UnicodeString& getPrefix() const;
/**
* Sets the string that precedes the four hex digits.
*
* <p>Callers must take care if a transliterator is in use by
* multiple threads. The prefix should not be changed by one
* thread while another thread may be transliterating.
* @param prefix prefix string
*/
virtual void setPrefix(const UnicodeString& prefix);
/**
* Returns true if this transliterator outputs uppercase hex digits.
*/
virtual bool_t isUppercase() const;
/**
* Sets if this transliterator outputs uppercase hex digits.
*
* <p>Callers must take care if a transliterator is in use by
* multiple threads. The uppercase mode should not be changed by
* one thread while another thread may be transliterating.
* @param outputUppercase if true, then this transliterator
* outputs uppercase hex digits.
*/
virtual void setUppercase(bool_t outputUppercase);
/**
* Transliterates a segment of a string. <code>Transliterator</code> API.
* @param text the string to be transliterated
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= text.length()</code>.
* @return the new limit index
*/
virtual int32_t transliterate(Replaceable& text, int32_t start, int32_t limit) const;
/**
* Implements {@link Transliterator#handleKeyboardTransliterate}.
* Converts the characters between offsets[CURSOR] and
* offsets[LIMIT] and updates both entries.
*/
virtual void handleKeyboardTransliterate(Replaceable& text,
int32_t offsets[3]) const;
/**
* Return the length of the longest context required by this transliterator.
* This is <em>preceding</em> context.
* @return maximum number of preceding context characters this
* transliterator needs to examine
*/
virtual int32_t getMaximumContextLength();
private:
/**
* Hex digit table: entries 0-15 use lowercase letters, entries
* 16-31 use uppercase letters.
*/
static UChar HEX_DIGITS[32];
/**
* Given an integer, return its least significant hex digit.
*/
UChar itoh(int32_t i) const;
/**
* Form escape sequence: the prefix followed by the four hex digits
* of c. The sequence is stored in result, which is also returned.
*/
UnicodeString& toHex(UnicodeString& result, UChar c) const;
};
inline UnicodeToHexTransliterator::~UnicodeToHexTransliterator() {}
#endif